From c329ef9684de9517d82af5b4758c9e1b64a8a11a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Nov 2019 12:16:39 -0800
Subject: net/tls: take into account that bpf_exec_tx_verdict() may free the
 record

bpf_exec_tx_verdict() may free the record if tls_push_record()
fails, or if the entire record got consumed by BPF. Re-check
ctx->open_rec before touching the data.

Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index da9f9ce51e7b..70e3c0c1af50 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -984,7 +984,7 @@ alloc_encrypted:
 					num_async++;
 				else if (ret == -ENOMEM)
 					goto wait_for_memory;
-				else if (ret == -ENOSPC)
+				else if (ctx->open_rec && ret == -ENOSPC)
 					goto rollback_iter;
 				else if (ret != -EAGAIN)
 					goto send_end;
@@ -1053,11 +1053,12 @@ wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
 trim_sgl:
-			tls_trim_both_msgs(sk, orig_size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, orig_size);
 			goto send_end;
 		}
 
-		if (msg_en->sg.size < required_size)
+		if (ctx->open_rec && msg_en->sg.size < required_size)
 			goto alloc_encrypted;
 	}
 
@@ -1190,11 +1191,13 @@ wait_for_sndbuf:
 wait_for_memory:
 		ret = sk_stream_wait_memory(sk, &timeo);
 		if (ret) {
-			tls_trim_both_msgs(sk, msg_pl->sg.size);
+			if (ctx->open_rec)
+				tls_trim_both_msgs(sk, msg_pl->sg.size);
 			goto sendpage_end;
 		}
 
-		goto alloc_payload;
+		if (ctx->open_rec)
+			goto alloc_payload;
 	}
 
 	if (num_async) {
-- 
cgit v1.2.3


From d10523d0b3d78153ee58d19853ced26c9004c8c4 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Nov 2019 12:16:40 -0800
Subject: net/tls: free the record on encryption error

When tls_do_encryption() fails the SG lists are left with the
SG_END and SG_CHAIN marks in place. One could hope that once
encryption fails we will never see the record again, but that
is in fact not true. Commit d3b18ad31f93 ("tls: add bpf support
to sk_msg handling") added special handling to ENOMEM and ENOSPC
errors which mean we may see the same record re-submitted.

As suggested by John free the record, the BPF code is already
doing just that.

Reported-by: syzbot+df0d4ec12332661dd1f9@syzkaller.appspotmail.com
Fixes: d3b18ad31f93 ("tls: add bpf support to sk_msg handling")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 70e3c0c1af50..dbba51b69d21 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -771,8 +771,14 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
 
 	policy = !(flags & MSG_SENDPAGE_NOPOLICY);
 	psock = sk_psock_get(sk);
-	if (!psock || !policy)
-		return tls_push_record(sk, flags, record_type);
+	if (!psock || !policy) {
+		err = tls_push_record(sk, flags, record_type);
+		if (err) {
+			*copied -= sk_msg_free(sk, msg);
+			tls_free_open_rec(sk);
+		}
+		return err;
+	}
 more_data:
 	enospc = sk_msg_full(msg);
 	if (psock->eval == __SK_NONE) {
-- 
cgit v1.2.3


From 031097d9e079e40dce401031d1012e83d80eaf01 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Nov 2019 12:16:41 -0800
Subject: net: skmsg: fix TLS 1.3 crash with full sk_msg

TLS 1.3 started using the entry at the end of the SG array
for chaining-in the single byte content type entry. This mostly
works:

[ E E E E E E . . ]
  ^           ^
   start       end

                 E < content type
               /
[ E E E E E E C . ]
  ^           ^
   start       end

(Where E denotes a populated SG entry; C denotes a chaining entry.)

If the array is full, however, the end will point to the start:

[ E E E E E E E E ]
  ^
   start
   end

And we end up overwriting the start:

    E < content type
   /
[ C E E E E E E E ]
  ^
   start
   end

The sg array is supposed to be a circular buffer with start and
end markers pointing anywhere. In case where start > end
(i.e. the circular buffer has "wrapped") there is an extra entry
reserved at the end to chain the two halves together.

[ E E E E E E . . l ]

(Where l is the reserved entry for "looping" back to front.

As suggested by John, let's reserve another entry for chaining
SG entries after the main circular buffer. Note that this entry
has to be pointed to by the end entry so its position is not fixed.

Examples of full messages:

[ E E E E E E E E . l ]
  ^               ^
   start           end

   <---------------.
[ E E . E E E E E E l ]
      ^ ^
   end   start

Now the end will always point to an unused entry, so TLS 1.3
can always use it.

Fixes: 130b392c6cd6 ("net: tls: Add tls 1.3 support")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/filter.c  | 8 ++++----
 net/core/skmsg.c   | 2 +-
 net/ipv4/tcp_bpf.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net')

diff --git a/net/core/filter.c b/net/core/filter.c
index b0ed048585ba..f1e703eed3d2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2299,7 +2299,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	WARN_ON_ONCE(last_sge == first_sge);
 	shift = last_sge > first_sge ?
 		last_sge - first_sge - 1 :
-		MAX_SKB_FRAGS - first_sge + last_sge - 1;
+		NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
 	if (!shift)
 		goto out;
 
@@ -2308,8 +2308,8 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	do {
 		u32 move_from;
 
-		if (i + shift >= MAX_MSG_FRAGS)
-			move_from = i + shift - MAX_MSG_FRAGS;
+		if (i + shift >= NR_MSG_FRAG_IDS)
+			move_from = i + shift - NR_MSG_FRAG_IDS;
 		else
 			move_from = i + shift;
 		if (move_from == msg->sg.end)
@@ -2323,7 +2323,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
 	} while (1);
 
 	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
-		      msg->sg.end - shift + MAX_MSG_FRAGS :
+		      msg->sg.end - shift + NR_MSG_FRAG_IDS :
 		      msg->sg.end - shift;
 out:
 	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index a469d2124f3f..ded2d5227678 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -421,7 +421,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
 	copied = skb->len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
-	msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+	msg->sg.end = num_sge;
 	msg->skb = skb;
 
 	sk_psock_queue_msg(psock, msg);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 8a56e09cfb0e..e38705165ac9 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -301,7 +301,7 @@ EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
 static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 				struct sk_msg *msg, int *copied, int flags)
 {
-	bool cork = false, enospc = msg->sg.start == msg->sg.end;
+	bool cork = false, enospc = sk_msg_full(msg);
 	struct sock *sk_redir;
 	u32 tosend, delta = 0;
 	int ret;
-- 
cgit v1.2.3


From 9e5ffed37df68d0ccfb2fdc528609e23a1e70ebe Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Nov 2019 12:16:43 -0800
Subject: net/tls: remove the dead inplace_crypto code

Looks like when BPF support was added by commit d3b18ad31f93
("tls: add bpf support to sk_msg handling") and
commit d829e9c4112b ("tls: convert to generic sk_msg interface")
it broke/removed the support for in-place crypto as added by
commit 4e6d47206c32 ("tls: Add support for inplace records
encryption").

The inplace_crypto member of struct tls_rec is dead, inited
to zero, and sometimes set to zero again. It used to be
set to 1 when record was allocated, but the skmsg code doesn't
seem to have been written with the idea of in-place crypto
in mind.

Since non trivial effort is required to bring the feature back
and we don't really have the HW to measure the benefit just
remove the left over support for now to avoid confusing readers.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_sw.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index dbba51b69d21..5989dfe5c443 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -710,8 +710,7 @@ static int tls_push_record(struct sock *sk, int flags,
 	}
 
 	i = msg_pl->sg.start;
-	sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ?
-		 &msg_en->sg.data[i] : &msg_pl->sg.data[i]);
+	sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);
 
 	i = msg_en->sg.end;
 	sk_msg_iter_var_prev(i);
@@ -976,8 +975,6 @@ alloc_encrypted:
 			if (ret)
 				goto fallback_to_reg_send;
 
-			rec->inplace_crypto = 0;
-
 			num_zc++;
 			copied += try_to_copy;
 
@@ -1176,7 +1173,6 @@ alloc_payload:
 
 		tls_ctx->pending_open_record_frags = true;
 		if (full_record || eor || sk_msg_full(msg_pl)) {
-			rec->inplace_crypto = 0;
 			ret = bpf_exec_tx_verdict(msg_pl, sk, full_record,
 						  record_type, &copied, flags);
 			if (ret) {
-- 
cgit v1.2.3


From c5daa6cccdc2f94aca2c9b3fa5f94e4469997293 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Wed, 27 Nov 2019 12:16:44 -0800
Subject: net/tls: use sg_next() to walk sg entries

Partially sent record cleanup path increments an SG entry
directly instead of using sg_next(). This should not be a
problem today, as encrypted messages should be always
allocated as arrays. But given this is a cleanup path it's
easy to miss was this ever to change. Use sg_next(), and
simplify the code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tls/tls_main.c | 13 ++-----------
 net/tls/tls_sw.c   |  3 ++-
 2 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'net')

diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index bdca31ffe6da..b3da6c5ab999 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -209,24 +209,15 @@ int tls_push_partial_record(struct sock *sk, struct tls_context *ctx,
 	return tls_push_sg(sk, ctx, sg, offset, flags);
 }
 
-bool tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
+void tls_free_partial_record(struct sock *sk, struct tls_context *ctx)
 {
 	struct scatterlist *sg;
 
-	sg = ctx->partially_sent_record;
-	if (!sg)
-		return false;
-
-	while (1) {
+	for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) {
 		put_page(sg_page(sg));
 		sk_mem_uncharge(sk, sg->length);
-
-		if (sg_is_last(sg))
-			break;
-		sg++;
 	}
 	ctx->partially_sent_record = NULL;
-	return true;
 }
 
 static void tls_write_space(struct sock *sk)
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 5989dfe5c443..2b2d0bae14a9 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -2089,7 +2089,8 @@ void tls_sw_release_resources_tx(struct sock *sk)
 	/* Free up un-sent records in tx_list. First, free
 	 * the partially sent record if any at head of tx_list.
 	 */
-	if (tls_free_partial_record(sk, tls_ctx)) {
+	if (tls_ctx->partially_sent_record) {
+		tls_free_partial_record(sk, tls_ctx);
 		rec = list_first_entry(&ctx->tx_list,
 				       struct tls_rec, list);
 		list_del(&rec->list);
-- 
cgit v1.2.3


From 2fe97a578d7bad3116a89dc8a6692a51e6fc1d9c Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Thu, 28 Nov 2019 10:10:05 +0700
Subject: tipc: fix potential memory leak in __tipc_sendmsg()

When initiating a connection message to a server side, the connection
message is cloned and added to the socket write queue. However, if the
cloning is failed, only the socket write queue is purged. It causes
memory leak because the original connection message is not freed.

This commit fixes it by purging the list of connection message when
it cannot be cloned.

Fixes: 6787927475e5 ("tipc: buffer overflow handling in listener socket")
Reported-by: Hoang Le <hoang.h.le@dektech.com.au>
Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index a1c8d722ca20..7baed2c2c93d 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1447,8 +1447,10 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
 	rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
 	if (unlikely(rc != dlen))
 		return rc;
-	if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue)))
+	if (unlikely(syn && !tipc_msg_skb_clone(&pkts, &sk->sk_write_queue))) {
+		__skb_queue_purge(&pkts);
 		return -ENOMEM;
+	}
 
 	trace_tipc_sk_sendmsg(sk, skb_peek(&pkts), TIPC_DUMP_SK_SNDQ, " ");
 	rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid);
-- 
cgit v1.2.3


From 91a4a3eb433e4d786420c41f3c08d1d16c605962 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Thu, 28 Nov 2019 10:10:06 +0700
Subject: tipc: fix wrong socket reference counter after tipc_sk_timeout()
 returns

When tipc_sk_timeout() is executed but user space is grabbing
ownership, this function rearms itself and returns. However, the
socket reference counter is not reduced. This causes potential
unexpected behavior.

This commit fixes it by calling sock_put() before tipc_sk_timeout()
returns in the above-mentioned case.

Fixes: afe8792fec69 ("tipc: refactor function tipc_sk_timeout()")
Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Acked-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 7baed2c2c93d..fb5595081a05 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2759,6 +2759,7 @@ static void tipc_sk_timeout(struct timer_list *t)
 	if (sock_owned_by_user(sk)) {
 		sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20);
 		bh_unlock_sock(sk);
+		sock_put(sk);
 		return;
 	}
 
-- 
cgit v1.2.3


From 12db3c8083fcab4270866a88191933f2d9f24f89 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Thu, 28 Nov 2019 10:10:07 +0700
Subject: tipc: fix wrong timeout input for tipc_wait_for_cond()

In function __tipc_shutdown(), the timeout value passed to
tipc_wait_for_cond() is not jiffies.

This commit fixes it by converting that value from milliseconds
to jiffies.

Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index fb5595081a05..da5fb84852a6 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	struct sock *sk = sock->sk;
 	struct tipc_sock *tsk = tipc_sk(sk);
 	struct net *net = sock_net(sk);
-	long timeout = CONN_TIMEOUT_DEFAULT;
+	long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);
 	u32 dnode = tsk_peer_node(tsk);
 	struct sk_buff *skb;
 
-- 
cgit v1.2.3


From d34910e1751be79672db9cc61ca8892fbb4763f2 Mon Sep 17 00:00:00 2001
From: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Date: Thu, 28 Nov 2019 10:10:08 +0700
Subject: tipc: fix duplicate SYN messages under link congestion

Scenario:
1. A client socket initiates a SYN message to a listening socket.
2. The send link is congested, the SYN message is put in the
send link and a wakeup message is put in wakeup queue.
3. The congestion situation is abated, the wakeup message is
pulled out of the wakeup queue. Function tipc_sk_push_backlog()
is called to send out delayed messages by Nagle. However,
the client socket is still in CONNECTING state. So, it sends
the SYN message in the socket write queue to the listening socket
again.
4. The listening socket receives the first SYN message and creates
first server socket. The client socket receives ACK- and establishes
a connection to the first server socket. The client socket closes
its connection with the first server socket.
5. The listening socket receives the second SYN message and creates
second server socket. The second server socket sends ACK- to the
client socket, but it has been closed. It results in connection
reset error when reading from the server socket in user space.

Solution: return from function tipc_sk_push_backlog() immediately
if there is pending SYN message in the socket write queue.

Fixes: c0bceb97db9e ("tipc: add smart nagle feature")
Signed-off-by: Tung Nguyen <tung.q.nguyen@dektech.com.au>
Acked-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/tipc/socket.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index da5fb84852a6..41688da233ab 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error)
 	tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt &&
 					    !tsk_conn_cong(tsk)));
 
-	/* Push out unsent messages or remove if pending SYN */
-	skb = skb_peek(&sk->sk_write_queue);
-	if (skb && !msg_is_syn(buf_msg(skb)))
-		tipc_sk_push_backlog(tsk);
-	else
-		__skb_queue_purge(&sk->sk_write_queue);
+	/* Push out delayed messages if in Nagle mode */
+	tipc_sk_push_backlog(tsk);
+	/* Remove pending SYN */
+	__skb_queue_purge(&sk->sk_write_queue);
 
 	/* Reject all unreceived messages, except on an active connection
 	 * (which disconnects locally & sends a 'FIN+' to peer).
@@ -1248,9 +1246,14 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk)
 	struct sk_buff_head *txq = &tsk->sk.sk_write_queue;
 	struct net *net = sock_net(&tsk->sk);
 	u32 dnode = tsk_peer_node(tsk);
+	struct sk_buff *skb = skb_peek(txq);
 	int rc;
 
-	if (skb_queue_empty(txq) || tsk->cong_link_cnt)
+	if (!skb || tsk->cong_link_cnt)
+		return;
+
+	/* Do not send SYN again after congestion */
+	if (msg_is_syn(buf_msg(skb)))
 		return;
 
 	tsk->snt_unacked += tsk->snd_backlog;
-- 
cgit v1.2.3


From 14e54ab9143fa60794d13ea0a66c792a2046a8f3 Mon Sep 17 00:00:00 2001
From: Dust Li <dust.li@linux.alibaba.com>
Date: Thu, 28 Nov 2019 14:29:09 +0800
Subject: net: sched: fix `tc -s class show` no bstats on class with nolock
 subqueues

When a classful qdisc's child qdisc has set the flag
TCQ_F_CPUSTATS (pfifo_fast for example), the child qdisc's
cpu_bstats should be passed to gnet_stats_copy_basic(),
but many classful qdisc didn't do that. As a result,
`tc -s class show dev DEV` always return 0 for bytes and
packets in this case.

Pass the child qdisc's cpu_bstats to gnet_stats_copy_basic()
to fix this issue.

The qstats also has this problem, but it has been fixed
in 5dd431b6b9 ("net: sched: introduce and use qstats read...")
and bstats still remains buggy.

Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe")
Signed-off-by: Dust Li <dust.li@linux.alibaba.com>
Signed-off-by: Tony Lu <tonylu@linux.alibaba.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/sch_mq.c     | 3 ++-
 net/sched/sch_mqprio.c | 4 ++--
 net/sched/sch_multiq.c | 2 +-
 net/sched/sch_prio.c   | 2 +-
 4 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index 0d578333e967..278c0b2dc523 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -245,7 +245,8 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 	struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
 
 	sch = dev_queue->qdisc_sleeping;
-	if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 ||
+	if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats,
+				  &sch->bstats) < 0 ||
 	    qdisc_qstats_copy(d, sch) < 0)
 		return -1;
 	return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 46980b8d66c5..0d0113a24962 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -557,8 +557,8 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
 
 		sch = dev_queue->qdisc_sleeping;
-		if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-					  d, NULL, &sch->bstats) < 0 ||
+		if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d,
+					  sch->cpu_bstats, &sch->bstats) < 0 ||
 		    qdisc_qstats_copy(d, sch) < 0)
 			return -1;
 	}
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index b2b7fdb06fc6..1330ad224931 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -339,7 +339,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	cl_q = q->queues[cl - 1];
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-				  d, NULL, &cl_q->bstats) < 0 ||
+				  d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
 	    qdisc_qstats_copy(d, cl_q) < 0)
 		return -1;
 
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index 0f8fedb8809a..18b884cfdfe8 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -356,7 +356,7 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 
 	cl_q = q->queues[cl - 1];
 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
-				  d, NULL, &cl_q->bstats) < 0 ||
+				  d, cl_q->cpu_bstats, &cl_q->bstats) < 0 ||
 	    qdisc_qstats_copy(d, cl_q) < 0)
 		return -1;
 
-- 
cgit v1.2.3


From 8ffeb03fbba3b599690b361467bfd2373e8c450f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sun, 1 Dec 2019 18:41:24 +0100
Subject: openvswitch: drop unneeded BUG_ON() in ovs_flow_cmd_build_info()

All the callers of ovs_flow_cmd_build_info() already deal with
error return code correctly, so we can handle the error condition
in a more gracefull way. Still dump a warning to preserve
debuggability.

v1 -> v2:
 - clarify the commit message
 - clean the skb and report the error (DaveM)

Fixes: ccb1352e76cf ("net: Add Open vSwitch kernel components.")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 293d5289c4a1..8cab3435d8da 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -905,7 +905,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow,
 	retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb,
 					info->snd_portid, info->snd_seq, 0,
 					cmd, ufid_flags);
-	BUG_ON(retval < 0);
+	if (WARN_ON_ONCE(retval < 0)) {
+		kfree_skb(skb);
+		skb = ERR_PTR(retval);
+	}
 	return skb;
 }
 
-- 
cgit v1.2.3


From 8a574f86652a4540a2433946ba826ccb87f398cc Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Sun, 1 Dec 2019 18:41:25 +0100
Subject: openvswitch: remove another BUG_ON()

If we can't build the flow del notification, we can simply delete
the flow, no need to crash the kernel. Still keep a WARN_ON to
preserve debuggability.

Note: the BUG_ON() predates the Fixes tag, but this change
can be applied only after the mentioned commit.

v1 -> v2:
 - do not leak an skb on error

Fixes: aed067783e50 ("openvswitch: Minimize ovs_flow_cmd_del critical section.")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/openvswitch/datapath.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 8cab3435d8da..1047e8043084 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1372,7 +1372,10 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 						     OVS_FLOW_CMD_DEL,
 						     ufid_flags);
 			rcu_read_unlock();
-			BUG_ON(err < 0);
+			if (WARN_ON_ONCE(err < 0)) {
+				kfree_skb(reply);
+				goto out_free;
+			}
 
 			ovs_notify(&dp_flow_genl_family, reply, info);
 		} else {
@@ -1380,6 +1383,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+out_free:
 	ovs_flow_free(flow, true);
 	return 0;
 unlock:
-- 
cgit v1.2.3