From 9f00d9776bc5beb92e8bfc884a7e96ddc5589e2e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sat, 8 Sep 2012 02:53:54 +0000 Subject: netlink: hide struct module parameter in netlink_kernel_create This patch defines netlink_kernel_create as a wrapper function of __netlink_kernel_create to hide the struct module *me parameter (which seems to be THIS_MODULE in all existing netlink subsystems). Suggested by David S. Miller. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- kernel/audit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ea3b7b6191c..a24aafa850a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -971,8 +971,7 @@ static int __init audit_init(void) printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, - THIS_MODULE, &cfg); + audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); if (!audit_sock) audit_panic("cannot initialize netlink socket"); else -- cgit v1.2.3 From 15e473046cb6e5d18a4d0057e61d76315230382b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 7 Sep 2012 20:12:54 +0000 Subject: netlink: Rename pid to portid to avoid confusion It is a frequent mistake to confuse the netlink port identifier with a process identifier. Try to reduce this confusion by renaming fields that hold port identifiers portid instead of pid. I have carefully avoided changing the structures exported to userspace to avoid changing the userspace API. I have successfully built an allyesconfig kernel with this change. Signed-off-by: "Eric W. Biederman" Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- kernel/audit.c | 20 ++++++++++---------- kernel/taskstats.c | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index a24aafa850a..e0cf64a0ae2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -87,11 +87,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; /* * If audit records are to be written to the netlink socket, audit_pid - * contains the pid of the auditd process and audit_nlk_pid contains - * the pid to use to send netlink messages to that process. + * contains the pid of the auditd process and audit_nlk_portid contains + * the portid to use to send netlink messages to that process. */ int audit_pid; -static int audit_nlk_pid; +static int audit_nlk_portid; /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -401,7 +401,7 @@ static void kauditd_send_skb(struct sk_buff *skb) int err; /* take a reference in case we can't send it and we want to hold it */ skb_get(skb); - err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); + err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); @@ -692,7 +692,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) status_set.backlog_limit = audit_backlog_limit; status_set.lost = atomic_read(&audit_lost); status_set.backlog = skb_queue_len(&audit_skb_queue); - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, &status_set, sizeof(status_set)); break; case AUDIT_SET: @@ -720,7 +720,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) sessionid, sid, 1); audit_pid = new_pid; - audit_nlk_pid = NETLINK_CB(skb).pid; + audit_nlk_portid = NETLINK_CB(skb).portid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, @@ -782,7 +782,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -801,7 +801,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } /* fallthrough */ case AUDIT_LIST_RULES: - err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, + err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, uid, seq, data, nlmsg_len(nlh), loginuid, sessionid, sid); break; @@ -872,7 +872,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; @@ -891,7 +891,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) rcu_read_unlock(); if (!err) - audit_send_reply(NETLINK_CB(skb).pid, seq, + audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550..123793cd06f 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -467,7 +467,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, REGISTER); + rc = add_del_listener(info->snd_portid, mask, REGISTER); out: free_cpumask_var(mask); return rc; @@ -483,7 +483,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) goto out; - rc = add_del_listener(info->snd_pid, mask, DEREGISTER); + rc = add_del_listener(info->snd_portid, mask, DEREGISTER); out: free_cpumask_var(mask); return rc; -- cgit v1.2.3 From 5640f7685831e088fe6c2e1f863a6805962f8e81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 23 Sep 2012 23:04:42 +0000 Subject: net: use a per task frag allocator We currently use a per socket order-0 page cache for tcp_sendmsg() operations. This page is used to build fragments for skbs. Its done to increase probability of coalescing small write() into single segments in skbs still in write queue (not yet sent) But it wastes a lot of memory for applications handling many mostly idle sockets, since each socket holds one page in sk->sk_sndmsg_page Its also quite inefficient to build TSO 64KB packets, because we need about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit page allocator more than wanted. This patch adds a per task frag allocator and uses bigger pages, if available. An automatic fallback is done in case of memory pressure. (up to 32768 bytes per frag, thats order-3 pages on x86) This increases TCP stream performance by 20% on loopback device, but also benefits on other network devices, since 8x less frags are mapped on transmit and unmapped on tx completion. Alexander Duyck mentioned a probable performance win on systems with IOMMU enabled. Its possible some SG enabled hardware cant cope with bigger fragments, but their ndo_start_xmit() should already handle this, splitting a fragment in sub fragments, since some arches have PAGE_SIZE=65536 Successfully tested on various ethernet devices. (ixgbe, igb, bnx2x, tg3, mellanox mlx4) Signed-off-by: Eric Dumazet Cc: Ben Hutchings Cc: Vijay Subramanian Cc: Alexander Duyck Tested-by: Vijay Subramanian Signed-off-by: David S. Miller --- kernel/exit.c | 3 +++ kernel/fork.c | 1 + 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5b..42f25952edd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1046,6 +1046,9 @@ void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + if (tsk->task_frag.page) + put_page(tsk->task_frag.page); + validate_creds_for_do_exit(tsk); preempt_disable(); diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e1285..01565b9ce0f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + tsk->task_frag.page = NULL; account_kernel_stack(ti, 1); -- cgit v1.2.3