aboutsummaryrefslogtreecommitdiff
path: root/net/netfilter
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter')
-rw-r--r--net/netfilter/core.c32
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h277
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c411
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c624
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c414
-rw-r--r--net/netfilter/ipset/ip_set_core.c41
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h1100
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c344
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c362
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c368
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c469
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c402
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c478
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c456
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c622
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c31
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c312
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c123
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c702
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c86
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c115
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c190
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c55
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c10
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c38
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c40
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c33
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c64
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c63
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c86
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c56
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c176
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c1050
-rw-r--r--net/netfilter/nf_conntrack_amanda.c1
-rw-r--r--net/netfilter/nf_conntrack_core.c60
-rw-r--r--net/netfilter/nf_conntrack_ecache.c8
-rw-r--r--net/netfilter/nf_conntrack_expect.c9
-rw-r--r--net/netfilter/nf_conntrack_ftp.c1
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c1
-rw-r--r--net/netfilter/nf_conntrack_helper.c3
-rw-r--r--net/netfilter/nf_conntrack_irc.c1
-rw-r--r--net/netfilter/nf_conntrack_netlink.c100
-rw-r--r--net/netfilter/nf_conntrack_pptp.c2
-rw-r--r--net/netfilter/nf_conntrack_proto.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c9
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c1
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c3
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c20
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c7
-rw-r--r--net/netfilter/nf_conntrack_proto_udplite.c8
-rw-r--r--net/netfilter/nf_conntrack_standalone.c17
-rw-r--r--net/netfilter/nf_conntrack_tftp.c2
-rw-r--r--net/netfilter/nf_log.c211
-rw-r--r--net/netfilter/nf_nat_amanda.c1
-rw-r--r--net/netfilter/nf_nat_core.c10
-rw-r--r--net/netfilter/nf_nat_helper.c1
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c5
-rw-r--r--net/netfilter/nf_queue.c148
-rw-r--r--net/netfilter/nfnetlink.c27
-rw-r--r--net/netfilter/nfnetlink_log.c203
-rw-r--r--net/netfilter/nfnetlink_queue_core.c441
-rw-r--r--net/netfilter/x_tables.c7
-rw-r--r--net/netfilter/xt_LOG.c63
-rw-r--r--net/netfilter/xt_NFLOG.c3
-rw-r--r--net/netfilter/xt_NFQUEUE.c63
-rw-r--r--net/netfilter/xt_TCPMSS.c1
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c17
-rw-r--r--net/netfilter/xt_addrtype.c27
-rw-r--r--net/netfilter/xt_conntrack.c1
-rw-r--r--net/netfilter/xt_hashlimit.c17
-rw-r--r--net/netfilter/xt_limit.c1
-rw-r--r--net/netfilter/xt_osf.c6
-rw-r--r--net/netfilter/xt_recent.c9
-rw-r--r--net/netfilter/xt_set.c94
80 files changed, 5939 insertions, 5323 deletions
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index a9c488b6c50..857ca9f3517 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -5,6 +5,7 @@
* way.
*
* Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
*/
#include <linux/kernel.h>
#include <linux/netfilter.h>
@@ -29,6 +30,8 @@ static DEFINE_MUTEX(afinfo_mutex);
const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
EXPORT_SYMBOL(nf_afinfo);
+const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly;
+EXPORT_SYMBOL_GPL(nf_ipv6_ops);
int nf_register_afinfo(const struct nf_afinfo *afinfo)
{
@@ -276,10 +279,30 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *);
EXPORT_SYMBOL(nf_nat_decode_session_hook);
#endif
+/* Per-network-namespace setup.
+ *
+ * With CONFIG_PROC_FS, creates the "netfilter" directory below
+ * net->proc_net and stores it in net->nf.proc_netfilter.
+ *
+ * Returns 0 on success, -ENOMEM if the directory cannot be created.
+ * The failure is logged here only for namespaces other than init_net.
+ */
+static int __net_init netfilter_net_init(struct net *net)
+{
#ifdef CONFIG_PROC_FS
-struct proc_dir_entry *proc_net_netfilter;
-EXPORT_SYMBOL(proc_net_netfilter);
+ net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter",
+ net->proc_net);
+ if (!net->nf.proc_netfilter) {
+ if (!net_eq(net, &init_net))
+ /* terminate the log line explicitly */
+ pr_err("cannot create netfilter proc entry\n");
+
+ return -ENOMEM;
+ }
#endif
+ return 0;
+}
+
+/* Per-network-namespace teardown: removes the "netfilter" entry from
+ * net->proc_net.
+ * NOTE(review): unlike the init side this call is not guarded by
+ * CONFIG_PROC_FS - presumably remove_proc_entry() is stubbed out when
+ * procfs is disabled; confirm.
+ */
+static void __net_exit netfilter_net_exit(struct net *net)
+{
+ remove_proc_entry("netfilter", net->proc_net);
+}
+
+/* Per-namespace init/exit callbacks; registered from netfilter_init()
+ * through register_pernet_subsys().
+ */
+static struct pernet_operations netfilter_net_ops = {
+ .init = netfilter_net_init,
+ .exit = netfilter_net_exit,
+};
void __init netfilter_init(void)
{
@@ -289,11 +312,8 @@ void __init netfilter_init(void)
INIT_LIST_HEAD(&nf_hooks[i][h]);
}
-#ifdef CONFIG_PROC_FS
- proc_net_netfilter = proc_mkdir("netfilter", init_net.proc_net);
- if (!proc_net_netfilter)
+ if (register_pernet_subsys(&netfilter_net_ops) < 0)
panic("cannot create netfilter proc entry");
-#endif
if (netfilter_log_init() < 0)
panic("cannot initialize nf_log");
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
new file mode 100644
index 00000000000..25243379b88
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -0,0 +1,277 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __IP_SET_BITMAP_IP_GEN_H
+#define __IP_SET_BITMAP_IP_GEN_H
+
+/* Name mangling for the generic bitmap code.
+ *
+ * TOKEN() goes through CONCAT() so that its arguments are macro-expanded
+ * before being pasted: TOKEN(MTYPE, _add) yields <value of MTYPE>_add.
+ * The including .c file must define MTYPE before including this header;
+ * every mtype_* identifier below then resolves to a type-specific name.
+ */
+#define CONCAT(a, b) a##b
+#define TOKEN(a,b) CONCAT(a, b)
+
+#define mtype_do_test TOKEN(MTYPE, _do_test)
+#define mtype_gc_test TOKEN(MTYPE, _gc_test)
+#define mtype_is_filled TOKEN(MTYPE, _is_filled)
+#define mtype_do_add TOKEN(MTYPE, _do_add)
+#define mtype_do_del TOKEN(MTYPE, _do_del)
+#define mtype_do_list TOKEN(MTYPE, _do_list)
+#define mtype_do_head TOKEN(MTYPE, _do_head)
+#define mtype_adt_elem TOKEN(MTYPE, _adt_elem)
+#define mtype_add_timeout TOKEN(MTYPE, _add_timeout)
+#define mtype_gc_init TOKEN(MTYPE, _gc_init)
+#define mtype_kadt TOKEN(MTYPE, _kadt)
+#define mtype_uadt TOKEN(MTYPE, _uadt)
+#define mtype_destroy TOKEN(MTYPE, _destroy)
+#define mtype_flush TOKEN(MTYPE, _flush)
+#define mtype_head TOKEN(MTYPE, _head)
+#define mtype_same_set TOKEN(MTYPE, _same_set)
+#define mtype_elem TOKEN(MTYPE, _elem)
+#define mtype_test TOKEN(MTYPE, _test)
+#define mtype_add TOKEN(MTYPE, _add)
+#define mtype_del TOKEN(MTYPE, _del)
+#define mtype_list TOKEN(MTYPE, _list)
+#define mtype_gc TOKEN(MTYPE, _gc)
+#define mtype MTYPE
+
+/* Accessors for the per-element extension area.
+ *
+ * get_ext(map, id) - address of element @id's slot: the slots are
+ * map->dsize bytes each, starting at map->extensions.
+ * ext_timeout(e, m) - timeout field inside slot @e, located through
+ * m->offset[IPSET_OFFSET_TIMEOUT].
+ * ext_counter(e, m) - counter field inside slot @e, located through
+ * m->offset[IPSET_OFFSET_COUNTER].
+ *
+ * When @e or map->extensions is a void pointer, the additions rely on
+ * byte-granular void * arithmetic (a GNU C extension).
+ */
+#define ext_timeout(e, m) \
+ (unsigned long *)((e) + (m)->offset[IPSET_OFFSET_TIMEOUT])
+#define ext_counter(e, m) \
+ (struct ip_set_counter *)((e) + (m)->offset[IPSET_OFFSET_COUNTER])
+#define get_ext(map, id) ((map)->extensions + (map)->dsize * (id))
+
+/* Arm the garbage-collector timer of @set.
+ *
+ * @gc becomes the timer callback and is handed @set cast to
+ * unsigned long.  The first expiry is set to
+ * IPSET_GC_PERIOD(map->timeout) * HZ jiffies from now.
+ */
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+	struct mtype *map = set->data;
+	struct timer_list *timer = &map->gc;
+
+	init_timer(timer);
+	timer->function = gc;
+	timer->data = (unsigned long) set;
+	timer->expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(timer);
+}
+
+/* Tear down a bitmap set.
+ *
+ * Stops the gc timer when the set uses timeouts, frees the member
+ * bitmap, frees the extension area when one was allocated
+ * (map->dsize != 0), frees the map itself and clears set->data.
+ */
+static void
+mtype_destroy(struct ip_set *set)
+{
+	struct mtype *map = set->data;
+
+	if (SET_WITH_TIMEOUT(set))
+		del_timer_sync(&map->gc);
+
+	ip_set_free(map->members);
+	if (map->dsize != 0)
+		ip_set_free(map->extensions);
+
+	kfree(map);
+	set->data = NULL;
+}
+
+/* Empty the set by zeroing the member bitmap (map->memsize bytes).
+ * Only map->members is touched here.
+ */
+static void
+mtype_flush(struct ip_set *set)
+{
+	struct mtype *bitmap = set->data;
+
+	memset(bitmap->members, 0, bitmap->memsize);
+}
+
+/* Emit the set header into @skb as a nested IPSET_ATTR_DATA attribute.
+ *
+ * Attributes are added in this order: the type-specific ones from
+ * mtype_do_head(), the reference count (set->ref - 1), the memory size
+ * (map struct + member bitmap + extension area), then the timeout and
+ * the counter flag when the set has those extensions.
+ *
+ * Returns 0 on success, -EMSGSIZE as soon as one attribute cannot be
+ * added.
+ */
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+	const struct mtype *map = set->data;
+	struct nlattr *nested;
+
+	nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+	if (!nested)
+		return -EMSGSIZE;
+
+	if (mtype_do_head(skb, map))
+		return -EMSGSIZE;
+
+	if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)))
+		return -EMSGSIZE;
+
+	if (nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
+			  htonl(sizeof(*map) +
+				map->memsize +
+				map->dsize * map->elements)))
+		return -EMSGSIZE;
+
+	if (SET_WITH_TIMEOUT(set) &&
+	    nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)))
+		return -EMSGSIZE;
+
+	if (SET_WITH_COUNTER(set) &&
+	    nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS,
+			  htonl(IPSET_FLAG_WITH_COUNTERS)))
+		return -EMSGSIZE;
+
+	ipset_nest_end(skb, nested);
+	return 0;
+}
+
+/* Test whether the element described by @value is in the set.
+ *
+ * Returns the result of mtype_do_test() unchanged when it is <= 0,
+ * 0 when the element is present but its timeout has expired, and 1
+ * otherwise.  On a match the element counter is updated if the set
+ * carries counters.
+ */
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	   struct ip_set_ext *mext, u32 flags)
+{
+	struct mtype *map = set->data;
+	const struct mtype_adt_elem *elem = value;
+	void *slot;
+	int found;
+
+	found = mtype_do_test(elem, map);
+	if (found <= 0)
+		return found;
+
+	/* Extension slot of this element (plain address computation) */
+	slot = get_ext(map, elem->id);
+
+	if (SET_WITH_TIMEOUT(set) &&
+	    ip_set_timeout_expired(ext_timeout(slot, map)))
+		return 0;
+
+	if (SET_WITH_COUNTER(set))
+		ip_set_update_counter(ext_counter(slot, map), ext, mext, flags);
+
+	return 1;
+}
+
+/* Add the element described by @value to the set.
+ *
+ * Returns -IPSET_ERR_EXIST when mtype_do_add() reports IPSET_ADD_FAILED,
+ * the existing entry has not timed out (or the set has no timeouts) and
+ * IPSET_FLAG_EXIST is not set in @flags.  Returns 0 in every other case,
+ * after (re)initialising the timeout and counter extensions.
+ */
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct mtype *map = set->data;
+ const struct mtype_adt_elem *e = value;
+ void *x = get_ext(map, e->id);
+ int ret = mtype_do_add(e, map, flags);
+
+ if (ret == IPSET_ADD_FAILED) {
+ /* An expired entry is treated as absent: ret is reset to 0 */
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(x, map)))
+ ret = 0;
+ else if (!(flags & IPSET_FLAG_EXIST))
+ return -IPSET_ERR_EXIST;
+ }
+
+ /* The preprocessor picks the single statement controlled by this
+ * brace-less if; ret is forwarded only in the stored-timeout case.
+ */
+ if (SET_WITH_TIMEOUT(set))
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_add_timeout(ext_timeout(x, map), e, ext, map, ret);
+#else
+ ip_set_timeout_set(ext_timeout(x, map), ext->timeout);
+#endif
+
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(x, map), ext);
+ return 0;
+}
+
+/* Delete the element described by @value from the set.
+ *
+ * Returns -IPSET_ERR_EXIST when mtype_do_del() reports failure, or when
+ * it succeeds but the set uses timeouts and the entry had already
+ * expired; returns 0 otherwise.  @ext, @mext and @flags are unused.
+ */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags)
+{
+	struct mtype *map = set->data;
+	const struct mtype_adt_elem *elem = value;
+	const void *slot = get_ext(map, elem->id);
+
+	if (mtype_do_del(elem, map))
+		return -IPSET_ERR_EXIST;
+
+	/* Only reached when the type-specific delete succeeded */
+	if (SET_WITH_TIMEOUT(set) &&
+	    ip_set_timeout_expired(ext_timeout(slot, map)))
+		return -IPSET_ERR_EXIST;
+
+	return 0;
+}
+
+/* Dump the set members into @skb as a nested IPSET_ATTR_ADT attribute,
+ * one nested IPSET_ATTR_DATA per listed element.
+ *
+ * cb->args[2] is the element cursor: listing starts at its current
+ * value and it is reset to 0 once every element has been visited.
+ *
+ * Returns 0 when the listing finished, or when it stopped early after
+ * at least one element position was processed (the cursor then stays
+ * at the element that did not fit - presumably so a later call resumes
+ * there; confirm against the dump caller).  Returns -EMSGSIZE when not
+ * even the first element of this call could be stored.
+ */
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct mtype *map = set->data;
+ struct nlattr *adt, *nested;
+ void *x;
+ u32 id, first = cb->args[2];
+
+ adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!adt)
+ return -EMSGSIZE;
+ for (; cb->args[2] < map->elements; cb->args[2]++) {
+ id = cb->args[2];
+ x = get_ext(map, id);
+ /* Skip unset ids and timed-out entries; with stored timeouts
+ * the expiry check applies only if mtype_is_filled() is true.
+ */
+ if (!test_bit(id, map->members) ||
+ (SET_WITH_TIMEOUT(set) &&
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ mtype_is_filled((const struct mtype_elem *) x) &&
+#endif
+ ip_set_timeout_expired(ext_timeout(x, map))))
+ continue;
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ /* Nothing emitted by this call yet: drop the ADT nest */
+ if (id == first) {
+ nla_nest_cancel(skb, adt);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_do_list(skb, map, id))
+ goto nla_put_failure;
+ if (SET_WITH_TIMEOUT(set)) {
+#ifdef IP_SET_BITMAP_STORED_TIMEOUT
+ if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_stored(map, id,
+ ext_timeout(x, map)))))
+ goto nla_put_failure;
+#else
+ if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(
+ ext_timeout(x, map)))))
+ goto nla_put_failure;
+#endif
+ }
+ if (SET_WITH_COUNTER(set) &&
+ ip_set_put_counter(skb, ext_counter(x, map)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ ipset_nest_end(skb, adt);
+
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ /* Drop the partially written element but keep the finished ones;
+ * nested may be NULL here when the jump came from the failed
+ * ipset_nest_start() above.
+ */
+ nla_nest_cancel(skb, nested);
+ ipset_nest_end(skb, adt);
+ if (unlikely(id == first)) {
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+/* Timer callback: drop expired elements and re-arm the timer.
+ *
+ * @ul_set is the struct ip_set pointer stored in the timer data.
+ * Every id accepted by mtype_gc_test() whose timeout has expired gets
+ * its bit cleared in map->members.
+ */
+static void
+mtype_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct mtype *map = set->data;
+	const void *slot;
+	u32 id;
+
+	/* We run parallel with other readers (test element)
+	 * but adding/deleting new entries is locked out */
+	read_lock_bh(&set->lock);
+	for (id = 0; id < map->elements; id++) {
+		if (!mtype_gc_test(id, map))
+			continue;
+		slot = get_ext(map, id);
+		if (ip_set_timeout_expired(ext_timeout(slot, map)))
+			clear_bit(id, map->members);
+	}
+	read_unlock_bh(&set->lock);
+
+	/* Schedule the next collection run */
+	map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
+	add_timer(&map->gc);
+}
+
+/* Method table of the set type.  The symbol name expands to MTYPE
+ * itself.  kadt, uadt and same_set are not defined in this header and
+ * must be supplied by the including file under their <MTYPE>_* names;
+ * the remaining callbacks are the generic functions defined above.
+ */
+static const struct ip_set_type_variant mtype = {
+ .kadt = mtype_kadt,
+ .uadt = mtype_uadt,
+ .adt = {
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head,
+ .list = mtype_list,
+ .same_set = mtype_same_set,
+};
+
+#endif /* __IP_SET_BITMAP_IP_GEN_H */
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 4a92fd47bd4..f1a8128bef0 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -1,6 +1,6 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -24,31 +24,37 @@
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_bitmap.h>
-#define IP_SET_BITMAP_TIMEOUT
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#define REVISION_MIN 0
-#define REVISION_MAX 0
+#define REVISION_MAX 1 /* Counter support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("bitmap:ip", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_bitmap:ip");
+#define MTYPE bitmap_ip
+
/* Type structure */
struct bitmap_ip {
void *members; /* the set members */
+ void *extensions; /* data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
u32 hosts; /* number of hosts in a subnet */
size_t memsize; /* members size */
+ size_t dsize; /* extensions struct size */
+ size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
u8 netmask; /* subnet netmask */
u32 timeout; /* timeout parameter */
struct timer_list gc; /* garbage collection */
};
-/* Base variant */
+/* ADT structure for generic function args */
+struct bitmap_ip_adt_elem {
+ u16 id;
+};
static inline u32
ip_to_id(const struct bitmap_ip *m, u32 ip)
@@ -56,188 +62,67 @@ ip_to_id(const struct bitmap_ip *m, u32 ip)
return ((ip & ip_set_hostmask(m->netmask)) - m->first_ip)/m->hosts;
}
-static int
-bitmap_ip_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
-{
- const struct bitmap_ip *map = set->data;
- u16 id = *(u16 *)value;
-
- return !!test_bit(id, map->members);
-}
+/* Common functions */
-static int
-bitmap_ip_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ip_do_test(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
{
- struct bitmap_ip *map = set->data;
- u16 id = *(u16 *)value;
-
- if (test_and_set_bit(id, map->members))
- return -IPSET_ERR_EXIST;
-
- return 0;
+ return !!test_bit(e->id, map->members);
}
-static int
-bitmap_ip_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ip_gc_test(u16 id, const struct bitmap_ip *map)
{
- struct bitmap_ip *map = set->data;
- u16 id = *(u16 *)value;
-
- if (!test_and_clear_bit(id, map->members))
- return -IPSET_ERR_EXIST;
-
- return 0;
-}
-
-static int
-bitmap_ip_list(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
-{
- const struct bitmap_ip *map = set->data;
- struct nlattr *atd, *nested;
- u32 id, first = cb->args[2];
-
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!atd)
- return -EMSGSIZE;
- for (; cb->args[2] < map->elements; cb->args[2]++) {
- id = cb->args[2];
- if (!test_bit(id, map->members))
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP,
- htonl(map->first_ip + id * map->hosts)))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, atd);
- /* Set listing finished */
- cb->args[2] = 0;
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, atd);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+ return !!test_bit(id, map->members);
}
-/* Timeout variant */
-
-static int
-bitmap_ip_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ip_do_add(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map,
+ u32 flags)
{
- const struct bitmap_ip *map = set->data;
- const unsigned long *members = map->members;
- u16 id = *(u16 *)value;
-
- return ip_set_timeout_test(members[id]);
+ return !!test_and_set_bit(e->id, map->members);
}
-static int
-bitmap_ip_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ip_do_del(const struct bitmap_ip_adt_elem *e, struct bitmap_ip *map)
{
- struct bitmap_ip *map = set->data;
- unsigned long *members = map->members;
- u16 id = *(u16 *)value;
-
- if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST))
- return -IPSET_ERR_EXIST;
-
- members[id] = ip_set_timeout_set(timeout);
-
- return 0;
+ return !test_and_clear_bit(e->id, map->members);
}
-static int
-bitmap_ip_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ip_do_list(struct sk_buff *skb, const struct bitmap_ip *map, u32 id)
{
- struct bitmap_ip *map = set->data;
- unsigned long *members = map->members;
- u16 id = *(u16 *)value;
- int ret = -IPSET_ERR_EXIST;
-
- if (ip_set_timeout_test(members[id]))
- ret = 0;
-
- members[id] = IPSET_ELEM_UNSET;
- return ret;
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id * map->hosts));
}
-static int
-bitmap_ip_tlist(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
+static inline int
+bitmap_ip_do_head(struct sk_buff *skb, const struct bitmap_ip *map)
{
- const struct bitmap_ip *map = set->data;
- struct nlattr *adt, *nested;
- u32 id, first = cb->args[2];
- const unsigned long *members = map->members;
-
- adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!adt)
- return -EMSGSIZE;
- for (; cb->args[2] < map->elements; cb->args[2]++) {
- id = cb->args[2];
- if (!ip_set_timeout_test(members[id]))
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, adt);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP,
- htonl(map->first_ip + id * map->hosts)) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(members[id]))))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, adt);
-
- /* Set listing finished */
- cb->args[2] = 0;
-
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, adt);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) ||
+ (map->netmask != 32 &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask));
}
static int
bitmap_ip_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
struct bitmap_ip *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_ip_adt_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map);
u32 ip;
ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
if (ip < map->first_ip || ip > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
- ip = ip_to_id(map, ip);
+ e.id = ip_to_id(map, ip);
- return adtfn(set, &ip, opt_timeout(opt, map), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
@@ -246,33 +131,31 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
{
struct bitmap_ip *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- u32 timeout = map->timeout;
- u32 ip, ip_to, id;
+ u32 ip, ip_to;
+ struct bitmap_ip_adt_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
int ret = 0;
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ /* Check the two calls separately: chaining them with || would
+ * collapse any non-zero error code into 1 before it is returned.
+ */
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (ip < map->first_ip || ip > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(map->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
if (adt == IPSET_TEST) {
- id = ip_to_id(map, ip);
- return adtfn(set, &id, timeout, flags);
+ e.id = ip_to_id(map, ip);
+ return adtfn(set, &e, &ext, &ext, flags);
}
if (tb[IPSET_ATTR_IP_TO]) {
@@ -297,8 +180,8 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
for (; !before(ip_to, ip); ip += map->hosts) {
- id = ip_to_id(map, ip);
- ret = adtfn(set, &id, timeout, flags);
+ e.id = ip_to_id(map, ip);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -308,54 +191,6 @@ bitmap_ip_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static void
-bitmap_ip_destroy(struct ip_set *set)
-{
- struct bitmap_ip *map = set->data;
-
- if (with_timeout(map->timeout))
- del_timer_sync(&map->gc);
-
- ip_set_free(map->members);
- kfree(map);
-
- set->data = NULL;
-}
-
-static void
-bitmap_ip_flush(struct ip_set *set)
-{
- struct bitmap_ip *map = set->data;
-
- memset(map->members, 0, map->memsize);
-}
-
-static int
-bitmap_ip_head(struct ip_set *set, struct sk_buff *skb)
-{
- const struct bitmap_ip *map = set->data;
- struct nlattr *nested;
-
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested)
- goto nla_put_failure;
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
- nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) ||
- (map->netmask != 32 &&
- nla_put_u8(skb, IPSET_ATTR_NETMASK, map->netmask)) ||
- nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + map->memsize)) ||
- (with_timeout(map->timeout) &&
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
-
- return 0;
-nla_put_failure:
- return -EMSGSIZE;
-}
-
static bool
bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -365,70 +200,35 @@ bitmap_ip_same_set(const struct ip_set *a, const struct ip_set *b)
return x->first_ip == y->first_ip &&
x->last_ip == y->last_ip &&
x->netmask == y->netmask &&
- x->timeout == y->timeout;
+ x->timeout == y->timeout &&
+ a->extensions == b->extensions;
}
-static const struct ip_set_type_variant bitmap_ip = {
- .kadt = bitmap_ip_kadt,
- .uadt = bitmap_ip_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_ip_add,
- [IPSET_DEL] = bitmap_ip_del,
- [IPSET_TEST] = bitmap_ip_test,
- },
- .destroy = bitmap_ip_destroy,
- .flush = bitmap_ip_flush,
- .head = bitmap_ip_head,
- .list = bitmap_ip_list,
- .same_set = bitmap_ip_same_set,
+/* Plain variant */
+
+struct bitmap_ip_elem {
};
-static const struct ip_set_type_variant bitmap_tip = {
- .kadt = bitmap_ip_kadt,
- .uadt = bitmap_ip_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_ip_tadd,
- [IPSET_DEL] = bitmap_ip_tdel,
- [IPSET_TEST] = bitmap_ip_ttest,
- },
- .destroy = bitmap_ip_destroy,
- .flush = bitmap_ip_flush,
- .head = bitmap_ip_head,
- .list = bitmap_ip_tlist,
- .same_set = bitmap_ip_same_set,
+/* Timeout variant */
+
+struct bitmap_ipt_elem {
+ unsigned long timeout;
};
-static void
-bitmap_ip_gc(unsigned long ul_set)
-{
- struct ip_set *set = (struct ip_set *) ul_set;
- struct bitmap_ip *map = set->data;
- unsigned long *table = map->members;
- u32 id;
-
- /* We run parallel with other readers (test element)
- * but adding/deleting new entries is locked out */
- read_lock_bh(&set->lock);
- for (id = 0; id < map->elements; id++)
- if (ip_set_timeout_expired(table[id]))
- table[id] = IPSET_ELEM_UNSET;
- read_unlock_bh(&set->lock);
-
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+/* Plain variant with counter */
-static void
-bitmap_ip_gc_init(struct ip_set *set)
-{
- struct bitmap_ip *map = set->data;
+struct bitmap_ipc_elem {
+ struct ip_set_counter counter;
+};
- init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
- map->gc.function = bitmap_ip_gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+/* Timeout variant with counter */
+
+struct bitmap_ipct_elem {
+ unsigned long timeout;
+ struct ip_set_counter counter;
+};
+
+#include "ip_set_bitmap_gen.h"
/* Create bitmap:ip type of sets */
@@ -440,6 +240,13 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
+ if (map->dsize) {
+ map->extensions = ip_set_alloc(map->dsize * elements);
+ if (!map->extensions) {
+ /* members was obtained from ip_set_alloc(), so it
+ * must be released with the matching ip_set_free()
+ */
+ ip_set_free(map->members);
+ return false;
+ }
+ }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -457,13 +264,14 @@ static int
bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
{
struct bitmap_ip *map;
- u32 first_ip, last_ip, hosts;
+ u32 first_ip, last_ip, hosts, cadt_flags = 0;
u64 elements;
u8 netmask = 32;
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
@@ -526,8 +334,45 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
if (!map)
return -ENOMEM;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- map->memsize = elements * sizeof(unsigned long);
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ip;
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
+ set->extensions |= IPSET_EXT_COUNTER;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_ipct_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_ipct_elem, timeout);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_ipct_elem, counter);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout = ip_set_timeout_uget(
+ tb[IPSET_ATTR_TIMEOUT]);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+
+ bitmap_ip_gc_init(set, bitmap_ip_gc);
+ } else {
+ map->dsize = sizeof(struct bitmap_ipc_elem);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_ipc_elem, counter);
+
+ if (!init_map_ip(set, map, first_ip, last_ip,
+ elements, hosts, netmask)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ }
+ } else if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_ipt_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_ipt_elem, timeout);
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
@@ -536,19 +381,16 @@ bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
}
map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- set->variant = &bitmap_tip;
+ set->extensions |= IPSET_EXT_TIMEOUT;
- bitmap_ip_gc_init(set);
+ bitmap_ip_gc_init(set, bitmap_ip_gc);
} else {
- map->memsize = bitmap_bytes(0, elements - 1);
-
+ map->dsize = 0;
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
kfree(map);
return -ENOMEM;
}
-
- set->variant = &bitmap_ip;
}
return 0;
}
@@ -568,6 +410,7 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -575,6 +418,8 @@ static struct ip_set_type bitmap_ip_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index d7df6ac2c6f..3b30e0bef89 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -1,7 +1,7 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -23,344 +23,208 @@
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_bitmap.h>
#define REVISION_MIN 0
-#define REVISION_MAX 0
+#define REVISION_MAX 1 /* Counter support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("bitmap:ip,mac", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_bitmap:ip,mac");
+#define MTYPE bitmap_ipmac
+#define IP_SET_BITMAP_STORED_TIMEOUT
+
enum {
- MAC_EMPTY, /* element is not set */
- MAC_FILLED, /* element is set with MAC */
MAC_UNSET, /* element is set, without MAC */
+ MAC_FILLED, /* element is set with MAC */
};
/* Type structure */
struct bitmap_ipmac {
void *members; /* the set members */
+ void *extensions; /* MAC + data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
+ u32 elements; /* number of max elements in the set */
u32 timeout; /* timeout value */
struct timer_list gc; /* garbage collector */
+ size_t memsize; /* members size */
size_t dsize; /* size of element */
+ size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
};
/* ADT structure for generic function args */
-struct ipmac {
- u32 id; /* id in array */
- unsigned char *ether; /* ethernet address */
+struct bitmap_ipmac_adt_elem {
+ u16 id;
+ unsigned char *ether;
};
-/* Member element without and with timeout */
-
-struct ipmac_elem {
+struct bitmap_ipmac_elem {
unsigned char ether[ETH_ALEN];
- unsigned char match;
+ unsigned char filled;
} __attribute__ ((aligned));
-struct ipmac_telem {
- unsigned char ether[ETH_ALEN];
- unsigned char match;
- unsigned long timeout;
-} __attribute__ ((aligned));
-
-static inline void *
-bitmap_ipmac_elem(const struct bitmap_ipmac *map, u32 id)
+/* Convert an IP address (host byte order) into its zero-based index
+ * relative to first_ip. The callers in this file range-check ip against
+ * first_ip/last_ip before calling.
+ */
+static inline u32
+ip_to_id(const struct bitmap_ipmac *m, u32 ip)
{
- return (void *)((char *)map->members + id * map->dsize);
+ return ip - m->first_ip;
}
-static inline bool
-bitmap_timeout(const struct bitmap_ipmac *map, u32 id)
+/* Return the record of element id inside the extensions area; records are
+ * laid out back to back, dsize bytes apart. The timeout/counter element
+ * variants begin with the same ether/filled members, so the result is
+ * accessed as a struct bitmap_ipmac_elem.
+ */
+static inline struct bitmap_ipmac_elem *
+get_elem(void *extensions, u16 id, size_t dsize)
{
- const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
-
- return ip_set_timeout_test(elem->timeout);
+ return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
}
-static inline bool
-bitmap_expired(const struct bitmap_ipmac *map, u32 id)
-{
- const struct ipmac_telem *elem = bitmap_ipmac_elem(map, id);
-
- return ip_set_timeout_expired(elem->timeout);
-}
+/* Common functions */
static inline int
-bitmap_ipmac_exist(const struct ipmac_telem *elem)
-{
- return elem->match == MAC_UNSET ||
- (elem->match == MAC_FILLED &&
- !ip_set_timeout_expired(elem->timeout));
-}
-
-/* Base variant */
-
-static int
-bitmap_ipmac_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
-{
- const struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
-
- switch (elem->match) {
- case MAC_UNSET:
- /* Trigger kernel to fill out the ethernet address */
- return -EAGAIN;
- case MAC_FILLED:
- return data->ether == NULL ||
- ether_addr_equal(data->ether, elem->ether);
- }
- return 0;
-}
-
-static int
-bitmap_ipmac_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
-{
- struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
-
- switch (elem->match) {
- case MAC_UNSET:
- if (!data->ether)
- /* Already added without ethernet address */
- return -IPSET_ERR_EXIST;
- /* Fill the MAC address */
- memcpy(elem->ether, data->ether, ETH_ALEN);
- elem->match = MAC_FILLED;
- break;
- case MAC_FILLED:
- return -IPSET_ERR_EXIST;
- case MAC_EMPTY:
- if (data->ether) {
- memcpy(elem->ether, data->ether, ETH_ALEN);
- elem->match = MAC_FILLED;
- } else
- elem->match = MAC_UNSET;
- }
-
- return 0;
-}
-
-static int
-bitmap_ipmac_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
+bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
+ const struct bitmap_ipmac *map)
{
- struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
-
- if (elem->match == MAC_EMPTY)
- return -IPSET_ERR_EXIST;
+ const struct bitmap_ipmac_elem *elem;
- elem->match = MAC_EMPTY;
-
- return 0;
+ if (!test_bit(e->id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, e->id, map->dsize);
+ if (elem->filled == MAC_FILLED)
+ return e->ether == NULL ||
+ ether_addr_equal(e->ether, elem->ether);
+ /* Trigger kernel to fill out the ethernet address */
+ return -EAGAIN;
}
-static int
-bitmap_ipmac_list(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
+static inline int
+bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map)
{
- const struct bitmap_ipmac *map = set->data;
- const struct ipmac_elem *elem;
- struct nlattr *atd, *nested;
- u32 id, first = cb->args[2];
- u32 last = map->last_ip - map->first_ip;
-
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!atd)
- return -EMSGSIZE;
- for (; cb->args[2] <= last; cb->args[2]++) {
- id = cb->args[2];
- elem = bitmap_ipmac_elem(map, id);
- if (elem->match == MAC_EMPTY)
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP,
- htonl(map->first_ip + id)) ||
- (elem->match == MAC_FILLED &&
- nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN,
- elem->ether)))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, atd);
- /* Set listing finished */
- cb->args[2] = 0;
-
- return 0;
+ const struct bitmap_ipmac_elem *elem;
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, atd);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+ if (!test_bit(id, map->members))
+ return 0;
+ elem = get_elem(map->extensions, id, map->dsize);
+ /* Timer not started for the incomplete elements */
+ return elem->filled == MAC_FILLED;
}
-/* Timeout variant */
-
-static int
-bitmap_ipmac_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Return non-zero when the element already has its MAC address stored
+ * (filled == MAC_FILLED), zero when it was added without a MAC.
+ */
+static inline int
+bitmap_ipmac_is_filled(const struct bitmap_ipmac_elem *elem)
{
- const struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- const struct ipmac_elem *elem = bitmap_ipmac_elem(map, data->id);
-
- switch (elem->match) {
- case MAC_UNSET:
- /* Trigger kernel to fill out the ethernet address */
- return -EAGAIN;
- case MAC_FILLED:
- return (data->ether == NULL ||
- ether_addr_equal(data->ether, elem->ether)) &&
- !bitmap_expired(map, data->id);
- }
- return 0;
+ return elem->filled == MAC_FILLED;
}
-static int
-bitmap_ipmac_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_ipmac_add_timeout(unsigned long *timeout,
+ const struct bitmap_ipmac_adt_elem *e,
+ const struct ip_set_ext *ext,
+ struct bitmap_ipmac *map, int mode)
{
- struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
- bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 t = ext->timeout;
- switch (elem->match) {
- case MAC_UNSET:
- if (!(data->ether || flag_exist))
- /* Already added without ethernet address */
- return -IPSET_ERR_EXIST;
- /* Fill the MAC address and activate the timer */
- memcpy(elem->ether, data->ether, ETH_ALEN);
- elem->match = MAC_FILLED;
- if (timeout == map->timeout)
+ if (mode == IPSET_ADD_START_STORED_TIMEOUT) {
+ if (t == map->timeout)
/* Timeout was not specified, get stored one */
- timeout = elem->timeout;
- elem->timeout = ip_set_timeout_set(timeout);
- break;
- case MAC_FILLED:
- if (!(bitmap_expired(map, data->id) || flag_exist))
- return -IPSET_ERR_EXIST;
- /* Fall through */
- case MAC_EMPTY:
- if (data->ether) {
- memcpy(elem->ether, data->ether, ETH_ALEN);
- elem->match = MAC_FILLED;
- } else
- elem->match = MAC_UNSET;
+ t = *timeout;
+ ip_set_timeout_set(timeout, t);
+ } else {
/* If MAC is unset yet, we store plain timeout value
* because the timer is not activated yet
* and we can reuse it later when MAC is filled out,
* possibly by the kernel */
- elem->timeout = data->ether ? ip_set_timeout_set(timeout)
- : timeout;
- break;
+ if (e->ether)
+ ip_set_timeout_set(timeout, t);
+ else
+ *timeout = t;
}
-
return 0;
}
-static int
-bitmap_ipmac_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Add element e to the set and report how its timeout has to be handled.
+ *
+ * Return values:
+ * IPSET_ADD_FAILED - the element already exists: either it has a MAC
+ * (which is overwritten when a MAC is supplied together with
+ * IPSET_FLAG_EXIST), or it has none and none was supplied;
+ * IPSET_ADD_START_STORED_TIMEOUT - the element existed without a MAC
+ * and the MAC has just been filled in;
+ * 0 - a new element was stored together with its MAC;
+ * IPSET_ADD_STORE_PLAIN_TIMEOUT - a new element was stored without MAC.
+ */
+static inline int
+bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map, u32 flags)
{
- struct bitmap_ipmac *map = set->data;
- const struct ipmac *data = value;
- struct ipmac_telem *elem = bitmap_ipmac_elem(map, data->id);
+ struct bitmap_ipmac_elem *elem;
+
+ elem = get_elem(map->extensions, e->id, map->dsize);
+ /* A set bit means the element is already a member of the set */
+ if (test_and_set_bit(e->id, map->members)) {
+ if (elem->filled == MAC_FILLED) {
+ if (e->ether && (flags & IPSET_FLAG_EXIST))
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ return IPSET_ADD_FAILED;
+ } else if (!e->ether)
+ /* Already added without ethernet address */
+ return IPSET_ADD_FAILED;
+ /* Fill the MAC address and trigger the timer activation */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return IPSET_ADD_START_STORED_TIMEOUT;
+ } else if (e->ether) {
+ /* We can store MAC too */
+ memcpy(elem->ether, e->ether, ETH_ALEN);
+ elem->filled = MAC_FILLED;
+ return 0;
+ } else {
+ elem->filled = MAC_UNSET;
+ /* MAC is not stored yet, don't start timer */
+ return IPSET_ADD_STORE_PLAIN_TIMEOUT;
+ }
+}
- if (elem->match == MAC_EMPTY || bitmap_expired(map, data->id))
- return -IPSET_ERR_EXIST;
+/* Delete element e by clearing its bit in the members bitmap.
+ * Returns 0 when the bit was set (element removed) and 1 when it was
+ * already clear (there was nothing to delete).
+ */
+static inline int
+bitmap_ipmac_do_del(const struct bitmap_ipmac_adt_elem *e,
+ struct bitmap_ipmac *map)
+{
+ return !test_and_clear_bit(e->id, map->members);
+}
- elem->match = MAC_EMPTY;
+/* Return the timeout value of element id.
+ * For an element with a stored MAC the value is converted through
+ * ip_set_timeout_get(); for an element without MAC the plain value
+ * kept in *timeout is returned unchanged.
+ */
+static inline unsigned long
+ip_set_timeout_stored(struct bitmap_ipmac *map, u32 id, unsigned long *timeout)
+{
+ const struct bitmap_ipmac_elem *elem =
+ get_elem(map->extensions, id, map->dsize);
- return 0;
+ return elem->filled == MAC_FILLED ? ip_set_timeout_get(timeout) :
+ *timeout;
}
-static int
-bitmap_ipmac_tlist(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
+static inline int
+bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
+ u32 id)
{
- const struct bitmap_ipmac *map = set->data;
- const struct ipmac_telem *elem;
- struct nlattr *atd, *nested;
- u32 id, first = cb->args[2];
- u32 timeout, last = map->last_ip - map->first_ip;
-
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!atd)
- return -EMSGSIZE;
- for (; cb->args[2] <= last; cb->args[2]++) {
- id = cb->args[2];
- elem = bitmap_ipmac_elem(map, id);
- if (!bitmap_ipmac_exist(elem))
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP,
- htonl(map->first_ip + id)) ||
- (elem->match == MAC_FILLED &&
- nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN,
- elem->ether)))
- goto nla_put_failure;
- timeout = elem->match == MAC_UNSET ? elem->timeout
- : ip_set_timeout_get(elem->timeout);
- if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(timeout)))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, atd);
- /* Set listing finished */
- cb->args[2] = 0;
+ const struct bitmap_ipmac_elem *elem =
+ get_elem(map->extensions, id, map->dsize);
- return 0;
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
+ htonl(map->first_ip + id)) ||
+ (elem->filled == MAC_FILLED &&
+ nla_put(skb, IPSET_ATTR_ETHER, ETH_ALEN, elem->ether));
+}
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, atd);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+/* Put the type specific part of the set header into skb: the first and
+ * last IP address of the range, converted to network byte order.
+ * Returns non-zero when one of the attributes could not be added.
+ */
+static inline int
+bitmap_ipmac_do_head(struct sk_buff *skb, const struct bitmap_ipmac *map)
+{
+ return nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
+ nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
}
static int
bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
struct bitmap_ipmac *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct ipmac data;
+ struct bitmap_ipmac_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map);
+ u32 ip;
/* MAC can be src only */
if (!(opt->flags & IPSET_DIM_TWO_SRC))
return 0;
- data.id = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
- if (data.id < map->first_ip || data.id > map->last_ip)
+ ip = ntohl(ip4addr(skb, opt->flags & IPSET_DIM_ONE_SRC));
+ if (ip < map->first_ip || ip > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
/* Backward compatibility: we don't check the second flag */
@@ -368,10 +232,10 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
(skb_mac_header(skb) + ETH_HLEN) > skb->data)
return -EINVAL;
- data.id -= map->first_ip;
- data.ether = eth_hdr(skb)->h_source;
+ e.id = ip_to_id(map, ip);
+ e.ether = eth_hdr(skb)->h_source;
- return adtfn(set, &data, opt_timeout(opt, map), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
+/* Userspace add/del/test entry point: validate the netlink attributes,
+ * translate the IP address (and the optional MAC) into an element and
+ * pass it to the adt function of the set variant.
+ * Returns 0 on success or when ip_set_eexist() accepts the result,
+ * otherwise the (negative) error code.
+ */
static int
@@ -380,91 +244,39 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
{
const struct bitmap_ipmac *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct ipmac data;
- u32 timeout = map->timeout;
+ struct bitmap_ipmac_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
+ u32 ip;
int ret = 0;
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &data.id);
+ /* Check the helpers one by one so that the exact error code of the
+ * failing call is handed back to the caller.
+ */
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ if (ret)
+ return ret;
+ ret = ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- if (data.id < map->first_ip || data.id > map->last_ip)
+ if (ip < map->first_ip || ip > map->last_ip)
return -IPSET_ERR_BITMAP_RANGE;
+ e.id = ip_to_id(map, ip);
if (tb[IPSET_ATTR_ETHER])
- data.ether = nla_data(tb[IPSET_ATTR_ETHER]);
+ e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
else
- data.ether = NULL;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(map->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
- data.id -= map->first_ip;
+ e.ether = NULL;
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
-static void
-bitmap_ipmac_destroy(struct ip_set *set)
-{
- struct bitmap_ipmac *map = set->data;
-
- if (with_timeout(map->timeout))
- del_timer_sync(&map->gc);
-
- ip_set_free(map->members);
- kfree(map);
-
- set->data = NULL;
-}
-
-static void
-bitmap_ipmac_flush(struct ip_set *set)
-{
- struct bitmap_ipmac *map = set->data;
-
- memset(map->members, 0,
- (map->last_ip - map->first_ip + 1) * map->dsize);
-}
-
-static int
-bitmap_ipmac_head(struct ip_set *set, struct sk_buff *skb)
-{
- const struct bitmap_ipmac *map = set->data;
- struct nlattr *nested;
-
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested)
- goto nla_put_failure;
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, htonl(map->first_ip)) ||
- nla_put_ipaddr4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)) ||
- nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) +
- ((map->last_ip - map->first_ip + 1) *
- map->dsize))) ||
- (with_timeout(map->timeout) &&
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
-
- return 0;
-nla_put_failure:
- return -EMSGSIZE;
-}
-
static bool
bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -473,85 +285,64 @@ bitmap_ipmac_same_set(const struct ip_set *a, const struct ip_set *b)
return x->first_ip == y->first_ip &&
x->last_ip == y->last_ip &&
- x->timeout == y->timeout;
+ x->timeout == y->timeout &&
+ a->extensions == b->extensions;
}
-static const struct ip_set_type_variant bitmap_ipmac = {
- .kadt = bitmap_ipmac_kadt,
- .uadt = bitmap_ipmac_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_ipmac_add,
- [IPSET_DEL] = bitmap_ipmac_del,
- [IPSET_TEST] = bitmap_ipmac_test,
- },
- .destroy = bitmap_ipmac_destroy,
- .flush = bitmap_ipmac_flush,
- .head = bitmap_ipmac_head,
- .list = bitmap_ipmac_list,
- .same_set = bitmap_ipmac_same_set,
-};
+/* Plain variant */
-static const struct ip_set_type_variant bitmap_tipmac = {
- .kadt = bitmap_ipmac_kadt,
- .uadt = bitmap_ipmac_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_ipmac_tadd,
- [IPSET_DEL] = bitmap_ipmac_tdel,
- [IPSET_TEST] = bitmap_ipmac_ttest,
- },
- .destroy = bitmap_ipmac_destroy,
- .flush = bitmap_ipmac_flush,
- .head = bitmap_ipmac_head,
- .list = bitmap_ipmac_tlist,
- .same_set = bitmap_ipmac_same_set,
+/* Timeout variant */
+
+struct bitmap_ipmact_elem {
+ struct {
+ unsigned char ether[ETH_ALEN];
+ unsigned char filled;
+ } __attribute__ ((aligned));
+ unsigned long timeout;
};
-static void
-bitmap_ipmac_gc(unsigned long ul_set)
-{
- struct ip_set *set = (struct ip_set *) ul_set;
- struct bitmap_ipmac *map = set->data;
- struct ipmac_telem *elem;
- u32 id, last = map->last_ip - map->first_ip;
-
- /* We run parallel with other readers (test element)
- * but adding/deleting new entries is locked out */
- read_lock_bh(&set->lock);
- for (id = 0; id <= last; id++) {
- elem = bitmap_ipmac_elem(map, id);
- if (elem->match == MAC_FILLED &&
- ip_set_timeout_expired(elem->timeout))
- elem->match = MAC_EMPTY;
- }
- read_unlock_bh(&set->lock);
+/* Plain variant with counter */
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+struct bitmap_ipmacc_elem {
+ struct {
+ unsigned char ether[ETH_ALEN];
+ unsigned char filled;
+ } __attribute__ ((aligned));
+ struct ip_set_counter counter;
+};
-static void
-bitmap_ipmac_gc_init(struct ip_set *set)
-{
- struct bitmap_ipmac *map = set->data;
+/* Timeout variant with counter */
- init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
- map->gc.function = bitmap_ipmac_gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+struct bitmap_ipmacct_elem {
+ struct {
+ unsigned char ether[ETH_ALEN];
+ unsigned char filled;
+ } __attribute__ ((aligned));
+ unsigned long timeout;
+ struct ip_set_counter counter;
+};
+
+#include "ip_set_bitmap_gen.h"
/* Create bitmap:ip,mac type of sets */
+/* Allocate the members bitmap and the per-element extension area of the
+ * set and initialise the range fields of map. map->memsize and map->dsize
+ * must already be set by the caller. Returns false when an allocation
+ * fails; nothing allocated here is left behind in that case.
+ */
static bool
init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
- u32 first_ip, u32 last_ip)
+ u32 first_ip, u32 last_ip, u32 elements)
{
- map->members = ip_set_alloc((last_ip - first_ip + 1) * map->dsize);
+ /* members is a plain bitmap, one bit per element; the per-element
+ * data is kept in the extensions area
+ */
+ map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
+ if (map->dsize) {
+ map->extensions = ip_set_alloc(map->dsize * elements);
+ if (!map->extensions) {
+ /* ip_set_alloc() memory is released with ip_set_free() */
+ ip_set_free(map->members);
+ return false;
+ }
+ }
map->first_ip = first_ip;
map->last_ip = last_ip;
+ map->elements = elements;
map->timeout = IPSET_NO_TIMEOUT;
set->data = map;
@@ -564,13 +355,14 @@ static int
bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
u32 flags)
{
- u32 first_ip, last_ip;
+ u32 first_ip, last_ip, cadt_flags = 0;
u64 elements;
struct bitmap_ipmac *map;
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &first_ip);
@@ -605,28 +397,59 @@ bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- map->dsize = sizeof(struct ipmac_telem);
+ map->memsize = bitmap_bytes(0, elements - 1);
+ set->variant = &bitmap_ipmac;
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
+ set->extensions |= IPSET_EXT_COUNTER;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_ipmacct_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_ipmacct_elem, timeout);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_ipmacct_elem, counter);
+
+ if (!init_map_ipmac(set, map, first_ip, last_ip,
+ elements)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ map->timeout = ip_set_timeout_uget(
+ tb[IPSET_ATTR_TIMEOUT]);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
+ } else {
+ map->dsize = sizeof(struct bitmap_ipmacc_elem);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_ipmacc_elem, counter);
+
+ if (!init_map_ipmac(set, map, first_ip, last_ip,
+ elements)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ }
+ } else if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_ipmact_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_ipmact_elem, timeout);
- if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+ if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
return -ENOMEM;
}
-
map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = &bitmap_tipmac;
-
- bitmap_ipmac_gc_init(set);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ bitmap_ipmac_gc_init(set, bitmap_ipmac_gc);
} else {
- map->dsize = sizeof(struct ipmac_elem);
+ map->dsize = sizeof(struct bitmap_ipmac_elem);
- if (!init_map_ipmac(set, map, first_ip, last_ip)) {
+ if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
return -ENOMEM;
}
set->variant = &bitmap_ipmac;
-
}
return 0;
}
@@ -645,6 +468,7 @@ static struct ip_set_type bitmap_ipmac_type = {
[IPSET_ATTR_IP_TO] = { .type = NLA_NESTED },
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -652,6 +476,8 @@ static struct ip_set_type bitmap_ipmac_type = {
.len = ETH_ALEN },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index e6b2db76f4c..8207d1fda52 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -19,205 +19,94 @@
#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_bitmap.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
-#define IP_SET_BITMAP_TIMEOUT
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#define REVISION_MIN 0
-#define REVISION_MAX 0
+#define REVISION_MAX 1 /* Counter support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("bitmap:port", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_bitmap:port");
+#define MTYPE bitmap_port
+
/* Type structure */
struct bitmap_port {
void *members; /* the set members */
+ void *extensions; /* data extensions */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
+ u32 elements; /* maximum number of elements in the set */
size_t memsize; /* members size */
+ size_t dsize; /* extensions struct size */
+ size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
u32 timeout; /* timeout parameter */
struct timer_list gc; /* garbage collection */
};
-/* Base variant */
+/* ADT structure for generic function args */
+struct bitmap_port_adt_elem {
+ u16 id;
+};
-static int
-bitmap_port_test(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Convert a port number (host byte order) into its zero-based index
+ * relative to first_port.
+ */
+static inline u16
+port_to_id(const struct bitmap_port *m, u16 port)
{
- const struct bitmap_port *map = set->data;
- u16 id = *(u16 *)value;
-
- return !!test_bit(id, map->members);
+ return port - m->first_port;
}
-static int
-bitmap_port_add(struct ip_set *set, void *value, u32 timeout, u32 flags)
-{
- struct bitmap_port *map = set->data;
- u16 id = *(u16 *)value;
-
- if (test_and_set_bit(id, map->members))
- return -IPSET_ERR_EXIST;
-
- return 0;
-}
+/* Common functions */
-static int
-bitmap_port_del(struct ip_set *set, void *value, u32 timeout, u32 flags)
+static inline int
+bitmap_port_do_test(const struct bitmap_port_adt_elem *e,
+ const struct bitmap_port *map)
{
- struct bitmap_port *map = set->data;
- u16 id = *(u16 *)value;
-
- if (!test_and_clear_bit(id, map->members))
- return -IPSET_ERR_EXIST;
-
- return 0;
+ return !!test_bit(e->id, map->members);
}
-static int
-bitmap_port_list(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
+static inline int
+bitmap_port_gc_test(u16 id, const struct bitmap_port *map)
{
- const struct bitmap_port *map = set->data;
- struct nlattr *atd, *nested;
- u16 id, first = cb->args[2];
- u16 last = map->last_port - map->first_port;
-
- atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!atd)
- return -EMSGSIZE;
- for (; cb->args[2] <= last; cb->args[2]++) {
- id = cb->args[2];
- if (!test_bit(id, map->members))
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, atd);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_net16(skb, IPSET_ATTR_PORT,
- htons(map->first_port + id)))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, atd);
- /* Set listing finished */
- cb->args[2] = 0;
-
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, atd);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+ return !!test_bit(id, map->members);
}
-/* Timeout variant */
-
-static int
-bitmap_port_ttest(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Add element e by setting its bit in the members bitmap.
+ * Returns 1 when the bit was already set (the element exists), 0 when
+ * the element is new. The flags argument is not used here.
+ */
+static inline int
+bitmap_port_do_add(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map, u32 flags)
{
- const struct bitmap_port *map = set->data;
- const unsigned long *members = map->members;
- u16 id = *(u16 *)value;
-
- return ip_set_timeout_test(members[id]);
+ return !!test_and_set_bit(e->id, map->members);
}
-static int
-bitmap_port_tadd(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Delete element e by clearing its bit in the members bitmap.
+ * Returns 0 when the bit was set (element removed) and 1 when it was
+ * already clear (there was nothing to delete).
+ */
+static inline int
+bitmap_port_do_del(const struct bitmap_port_adt_elem *e,
+ struct bitmap_port *map)
{
- struct bitmap_port *map = set->data;
- unsigned long *members = map->members;
- u16 id = *(u16 *)value;
-
- if (ip_set_timeout_test(members[id]) && !(flags & IPSET_FLAG_EXIST))
- return -IPSET_ERR_EXIST;
-
- members[id] = ip_set_timeout_set(timeout);
-
- return 0;
+ return !test_and_clear_bit(e->id, map->members);
}
-static int
-bitmap_port_tdel(struct ip_set *set, void *value, u32 timeout, u32 flags)
+/* Put the port of element id (first_port + id, converted to network byte
+ * order) into skb as IPSET_ATTR_PORT; returns the result of
+ * nla_put_net16().
+ */
+static inline int
+bitmap_port_do_list(struct sk_buff *skb, const struct bitmap_port *map, u32 id)
{
- struct bitmap_port *map = set->data;
- unsigned long *members = map->members;
- u16 id = *(u16 *)value;
- int ret = -IPSET_ERR_EXIST;
-
- if (ip_set_timeout_test(members[id]))
- ret = 0;
-
- members[id] = IPSET_ELEM_UNSET;
- return ret;
+ return nla_put_net16(skb, IPSET_ATTR_PORT,
+ htons(map->first_port + id));
}
-static int
-bitmap_port_tlist(const struct ip_set *set,
- struct sk_buff *skb, struct netlink_callback *cb)
+static inline int
+bitmap_port_do_head(struct sk_buff *skb, const struct bitmap_port *map)
{
- const struct bitmap_port *map = set->data;
- struct nlattr *adt, *nested;
- u16 id, first = cb->args[2];
- u16 last = map->last_port - map->first_port;
- const unsigned long *members = map->members;
-
- adt = ipset_nest_start(skb, IPSET_ATTR_ADT);
- if (!adt)
- return -EMSGSIZE;
- for (; cb->args[2] <= last; cb->args[2]++) {
- id = cb->args[2];
- if (!ip_set_timeout_test(members[id]))
- continue;
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (id == first) {
- nla_nest_cancel(skb, adt);
- return -EMSGSIZE;
- } else
- goto nla_put_failure;
- }
- if (nla_put_net16(skb, IPSET_ATTR_PORT,
- htons(map->first_port + id)) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(members[id]))))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
- }
- ipset_nest_end(skb, adt);
-
- /* Set listing finished */
- cb->args[2] = 0;
-
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, nested);
- ipset_nest_end(skb, adt);
- if (unlikely(id == first)) {
- cb->args[2] = 0;
- return -EMSGSIZE;
- }
- return 0;
+ return nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
+ nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
}
static int
bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
struct bitmap_port *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
+ struct bitmap_port_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map);
__be16 __port;
u16 port = 0;
@@ -230,9 +119,9 @@ bitmap_port_kadt(struct ip_set *set, const struct sk_buff *skb,
if (port < map->first_port || port > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
- port -= map->first_port;
+ e.id = port_to_id(map, port);
- return adtfn(set, &port, opt_timeout(opt, map), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
@@ -241,14 +130,17 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
{
struct bitmap_port *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- u32 timeout = map->timeout;
+ struct bitmap_port_adt_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
u32 port; /* wraparound */
- u16 id, port_to;
+ u16 port_to;
int ret = 0;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
@@ -257,16 +149,13 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
if (port < map->first_port || port > map->last_port)
return -IPSET_ERR_BITMAP_RANGE;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(map->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
if (adt == IPSET_TEST) {
- id = port - map->first_port;
- return adtfn(set, &id, timeout, flags);
+ e.id = port_to_id(map, port);
+ return adtfn(set, &e, &ext, &ext, flags);
}
if (tb[IPSET_ATTR_PORT_TO]) {
@@ -283,8 +172,8 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
for (; port <= port_to; port++) {
- id = port - map->first_port;
- ret = adtfn(set, &id, timeout, flags);
+ e.id = port_to_id(map, port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -294,52 +183,6 @@ bitmap_port_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static void
-bitmap_port_destroy(struct ip_set *set)
-{
- struct bitmap_port *map = set->data;
-
- if (with_timeout(map->timeout))
- del_timer_sync(&map->gc);
-
- ip_set_free(map->members);
- kfree(map);
-
- set->data = NULL;
-}
-
-static void
-bitmap_port_flush(struct ip_set *set)
-{
- struct bitmap_port *map = set->data;
-
- memset(map->members, 0, map->memsize);
-}
-
-static int
-bitmap_port_head(struct ip_set *set, struct sk_buff *skb)
-{
- const struct bitmap_port *map = set->data;
- struct nlattr *nested;
-
- nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested)
- goto nla_put_failure;
- if (nla_put_net16(skb, IPSET_ATTR_PORT, htons(map->first_port)) ||
- nla_put_net16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)) ||
- nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) + map->memsize)) ||
- (with_timeout(map->timeout) &&
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))))
- goto nla_put_failure;
- ipset_nest_end(skb, nested);
-
- return 0;
-nla_put_failure:
- return -EMSGSIZE;
-}
-
static bool
bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
{
@@ -348,71 +191,35 @@ bitmap_port_same_set(const struct ip_set *a, const struct ip_set *b)
return x->first_port == y->first_port &&
x->last_port == y->last_port &&
- x->timeout == y->timeout;
+ x->timeout == y->timeout &&
+ a->extensions == b->extensions;
}
-static const struct ip_set_type_variant bitmap_port = {
- .kadt = bitmap_port_kadt,
- .uadt = bitmap_port_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_port_add,
- [IPSET_DEL] = bitmap_port_del,
- [IPSET_TEST] = bitmap_port_test,
- },
- .destroy = bitmap_port_destroy,
- .flush = bitmap_port_flush,
- .head = bitmap_port_head,
- .list = bitmap_port_list,
- .same_set = bitmap_port_same_set,
+/* Plain variant */
+
+struct bitmap_port_elem {
};
-static const struct ip_set_type_variant bitmap_tport = {
- .kadt = bitmap_port_kadt,
- .uadt = bitmap_port_uadt,
- .adt = {
- [IPSET_ADD] = bitmap_port_tadd,
- [IPSET_DEL] = bitmap_port_tdel,
- [IPSET_TEST] = bitmap_port_ttest,
- },
- .destroy = bitmap_port_destroy,
- .flush = bitmap_port_flush,
- .head = bitmap_port_head,
- .list = bitmap_port_tlist,
- .same_set = bitmap_port_same_set,
+/* Timeout variant */
+
+struct bitmap_portt_elem {
+ unsigned long timeout;
};
-static void
-bitmap_port_gc(unsigned long ul_set)
-{
- struct ip_set *set = (struct ip_set *) ul_set;
- struct bitmap_port *map = set->data;
- unsigned long *table = map->members;
- u32 id; /* wraparound */
- u16 last = map->last_port - map->first_port;
-
- /* We run parallel with other readers (test element)
- * but adding/deleting new entries is locked out */
- read_lock_bh(&set->lock);
- for (id = 0; id <= last; id++)
- if (ip_set_timeout_expired(table[id]))
- table[id] = IPSET_ELEM_UNSET;
- read_unlock_bh(&set->lock);
-
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+/* Plain variant with counter */
-static void
-bitmap_port_gc_init(struct ip_set *set)
-{
- struct bitmap_port *map = set->data;
+struct bitmap_portc_elem {
+ struct ip_set_counter counter;
+};
- init_timer(&map->gc);
- map->gc.data = (unsigned long) set;
- map->gc.function = bitmap_port_gc;
- map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
- add_timer(&map->gc);
-}
+/* Timeout variant with counter */
+
+struct bitmap_portct_elem {
+ unsigned long timeout;
+ struct ip_set_counter counter;
+};
+
+#include "ip_set_bitmap_gen.h"
/* Create bitmap:ip type of sets */
@@ -423,6 +230,13 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
+ if (map->dsize) {
+ map->extensions = ip_set_alloc(map->dsize * map->elements);
+ if (!map->extensions) {
+ kfree(map->members);
+ return false;
+ }
+ }
map->first_port = first_port;
map->last_port = last_port;
map->timeout = IPSET_NO_TIMEOUT;
@@ -434,15 +248,16 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
}
static int
-bitmap_port_create(struct ip_set *set, struct nlattr *tb[],
- u32 flags)
+bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
{
struct bitmap_port *map;
u16 first_port, last_port;
+ u32 cadt_flags = 0;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
first_port = ip_set_get_h16(tb[IPSET_ATTR_PORT]);
@@ -458,28 +273,56 @@ bitmap_port_create(struct ip_set *set, struct nlattr *tb[],
if (!map)
return -ENOMEM;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- map->memsize = (last_port - first_port + 1)
- * sizeof(unsigned long);
-
+ map->elements = last_port - first_port + 1;
+ map->memsize = map->elements * sizeof(unsigned long);
+ set->variant = &bitmap_port;
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
+ set->extensions |= IPSET_EXT_COUNTER;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_portct_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_portct_elem, timeout);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_portct_elem, counter);
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+
+ map->timeout =
+ ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ bitmap_port_gc_init(set, bitmap_port_gc);
+ } else {
+ map->dsize = sizeof(struct bitmap_portc_elem);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct bitmap_portc_elem, counter);
+ if (!init_map_port(set, map, first_port, last_port)) {
+ kfree(map);
+ return -ENOMEM;
+ }
+ }
+ } else if (tb[IPSET_ATTR_TIMEOUT]) {
+ map->dsize = sizeof(struct bitmap_portt_elem);
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct bitmap_portt_elem, timeout);
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
return -ENOMEM;
}
map->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- set->variant = &bitmap_tport;
-
- bitmap_port_gc_init(set);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ bitmap_port_gc_init(set, bitmap_port_gc);
} else {
- map->memsize = bitmap_bytes(0, last_port - first_port);
- pr_debug("memsize: %zu\n", map->memsize);
+ map->dsize = 0;
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
return -ENOMEM;
}
- set->variant = &bitmap_port;
}
return 0;
}
@@ -497,12 +340,15 @@ static struct ip_set_type bitmap_port_type = {
[IPSET_ATTR_PORT] = { .type = NLA_U16 },
[IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_PORT] = { .type = NLA_U16 },
[IPSET_ATTR_PORT_TO] = { .type = NLA_U16 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 1ba9dbc0e10..f7713900798 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1,6 +1,6 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -15,7 +15,6 @@
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>
-#include <linux/netlink.h>
#include <linux/rculist.h>
#include <net/netlink.h>
@@ -316,6 +315,29 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr)
}
EXPORT_SYMBOL_GPL(ip_set_get_ipaddr6);
+/* Parse the optional extension attributes of an add/del/test request
+ * (timeout, byte and packet counters) into @ext.
+ *
+ * Returns 0 on success, -IPSET_ERR_TIMEOUT when a timeout is passed to a
+ * set without the timeout extension and -IPSET_ERR_COUNTER when a counter
+ * is passed to a set without the counter extension. The fields of @ext
+ * whose attribute is absent are left untouched.
+ */
+int
+ip_set_get_extensions(struct ip_set *set, struct nlattr *tb[],
+		      struct ip_set_ext *ext)
+{
+	struct nlattr *nla_timeout = tb[IPSET_ATTR_TIMEOUT];
+	struct nlattr *nla_bytes = tb[IPSET_ATTR_BYTES];
+	struct nlattr *nla_packets = tb[IPSET_ATTR_PACKETS];
+
+	if (nla_timeout) {
+		if (!(set->extensions & IPSET_EXT_TIMEOUT))
+			return -IPSET_ERR_TIMEOUT;
+		ext->timeout = ip_set_timeout_uget(nla_timeout);
+	}
+
+	/* Nothing more to do without counter attributes */
+	if (!nla_bytes && !nla_packets)
+		return 0;
+	if (!(set->extensions & IPSET_EXT_COUNTER))
+		return -IPSET_ERR_COUNTER;
+
+	if (nla_bytes)
+		ext->bytes = be64_to_cpu(nla_get_be64(nla_bytes));
+	if (nla_packets)
+		ext->packets = be64_to_cpu(nla_get_be64(nla_packets));
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ip_set_get_extensions);
+
/*
* Creating/destroying/renaming/swapping affect the existence and
* the properties of a set. All of these can be executed from userspace
@@ -366,8 +388,7 @@ ip_set_rcu_get(ip_set_id_t index)
int
ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
- const struct xt_action_param *par,
- const struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
struct ip_set *set = ip_set_rcu_get(index);
int ret = 0;
@@ -392,7 +413,7 @@ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
ret = 1;
} else {
/* --return-nomatch: invert matched element */
- if ((opt->flags & IPSET_RETURN_NOMATCH) &&
+ if ((opt->cmdflags & IPSET_FLAG_RETURN_NOMATCH) &&
(set->type->features & IPSET_TYPE_NOMATCH) &&
(ret > 0 || ret == -ENOTEMPTY))
ret = -ret;
@@ -405,8 +426,7 @@ EXPORT_SYMBOL_GPL(ip_set_test);
int
ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
- const struct xt_action_param *par,
- const struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
struct ip_set *set = ip_set_rcu_get(index);
int ret;
@@ -428,8 +448,7 @@ EXPORT_SYMBOL_GPL(ip_set_add);
int
ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
- const struct xt_action_param *par,
- const struct ip_set_adt_opt *opt)
+ const struct xt_action_param *par, struct ip_set_adt_opt *opt)
{
struct ip_set *set = ip_set_rcu_get(index);
int ret = 0;
@@ -1085,7 +1104,7 @@ static int
dump_init(struct netlink_callback *cb)
{
struct nlmsghdr *nlh = nlmsg_hdr(cb->skb);
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
struct nlattr *attr = (void *)nlh + min_len;
u32 dump_type;
@@ -1301,7 +1320,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
struct sk_buff *skb2;
struct nlmsgerr *errmsg;
size_t payload = sizeof(*errmsg) + nlmsg_len(nlh);
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
struct nlattr *cda[IPSET_ATTR_CMD_MAX+1];
struct nlattr *cmdattr;
u32 *errline;
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
new file mode 100644
index 00000000000..57beb1762b2
--- /dev/null
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -0,0 +1,1100 @@
+/* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef _IP_SET_HASH_GEN_H
+#define _IP_SET_HASH_GEN_H
+
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+#ifndef rcu_dereference_bh
+#define rcu_dereference_bh(p) rcu_dereference(p)
+#endif
+
+#define CONCAT(a, b) a##b
+#define TOKEN(a, b) CONCAT(a, b)
+
+/* Hashing which uses arrays to resolve clashing. The hash table is resized
+ * (doubled) when searching becomes too long.
+ * Internally jhash is used with the assumption that the size of the
+ * stored data is a multiple of sizeof(u32). If storage supports timeout,
+ * the timeout field must be the last one in the data structure - that field
+ * is ignored when computing the hash key.
+ *
+ * Readers and resizing
+ *
+ * Resizing can be triggered by userspace command only, and those
+ * are serialized by the nfnl mutex. During resizing the set is
+ * read-locked, so the only possible concurrent operations are
+ * the kernel side readers. Those must be protected by proper RCU locking.
+ */
+
+/* Number of elements to store in an initial array block */
+#define AHASH_INIT_SIZE 4
+/* Max number of elements to store in an array block */
+#define AHASH_MAX_SIZE (3*AHASH_INIT_SIZE)
+
+/* Max number of elements can be tuned */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define AHASH_MAX(h) ((h)->ahash_max)
+
+/* Return the new limit of the elements in an array block: the current
+ * limit is raised by AHASH_INIT_SIZE when multi reached it, as long as
+ * the result stays within the hard limit.
+ */
+static inline u8
+tune_ahash_max(u8 curr, u32 multi)
+{
+	u32 raised = curr + AHASH_INIT_SIZE;
+
+	if (multi < curr)
+		return curr;
+
+	/* Currently, at listing one hash bucket must fit into a message.
+	 * Therefore we have a hard limit here.
+	 */
+	if (raised > curr && raised <= 64)
+		return raised;
+
+	return curr;
+}
+#define TUNE_AHASH_MAX(h, multi) \
+ ((h)->ahash_max = tune_ahash_max((h)->ahash_max, multi))
+#else
+#define AHASH_MAX(h) AHASH_MAX_SIZE
+#define TUNE_AHASH_MAX(h, multi)
+#endif
+
+/* A hash bucket: the elements falling into the same bucket are stored
+ * in one array, which is allocated on demand.
+ */
+struct hbucket {
+ void *value; /* the array of the values */
+ u8 size; /* size of the array, in elements */
+ u8 pos; /* position of the first free entry */
+};
+
+/* The hash table: the table size stored here in order to make resizing easy.
+ * The buckets follow the header in the same allocation.
+ */
+struct htable {
+ u8 htable_bits; /* size of hash table == 2^htable_bits */
+ struct hbucket bucket[0]; /* hashtable buckets */
+};
+
+#define hbucket(h, i) (&((h)->bucket[i]))
+
+/* Book-keeping of the prefixes added to the set: one entry per distinct
+ * cidr value. The entries in use come first, ordered by decreasing cidr;
+ * the scans over the array stop at the first entry with nets == 0.
+ */
+struct net_prefixes {
+ u8 cidr; /* the different cidr values in the set */
+ u32 nets; /* number of elements per cidr */
+};
+
+/* Compute the hash table size: the number of bytes needed for a struct
+ * htable with 2^hbits buckets. Returns 0 when hbits is larger than 31 or
+ * when the result would not fit into size_t.
+ */
+static size_t
+htable_size(u8 hbits)
+{
+ size_t hsize;
+
+ /* We must fit both into u32 in jhash and size_t */
+ if (hbits > 31)
+ return 0;
+ hsize = jhash_size(hbits);
+ if ((((size_t)-1) - sizeof(struct htable))/sizeof(struct hbucket)
+ < hsize) /* the multiplication and addition below would overflow */
+ return 0;
+
+ return hsize * sizeof(struct hbucket) + sizeof(struct htable);
+}
+
+/* Compute htable_bits from the user input parameter hashsize: a hashsize
+ * which is not a power of two is rounded up to the next one.
+ * NOTE(review): hashsize == 0 would make hashsize - 1 wrap around;
+ * presumably the callers never pass it - confirm.
+ */
+static u8
+htable_bits(u32 hashsize)
+{
+ /* Assume that hashsize == 2^htable_bits */
+ u8 bits = fls(hashsize - 1);
+ if (jhash_size(bits) != hashsize)
+ /* Round up to the first 2^n value */
+ bits = fls(hashsize);
+
+ return bits;
+}
+
+/* Destroy the hashtable part of the set: release the array of every
+ * bucket which owns one, then the table itself.
+ */
+static void
+ahash_destroy(struct htable *t)
+{
+	u32 nbuckets = jhash_size(t->htable_bits);
+	u32 idx;
+
+	for (idx = 0; idx < nbuckets; idx++) {
+		struct hbucket *b = hbucket(t, idx);
+
+		if (!b->size)
+			continue;
+		/* FIXME: use slab cache */
+		kfree(b->value);
+	}
+
+	ip_set_free(t);
+}
+
+/* Make sure that the bucket has a free slot at position n->pos.
+ *
+ * A full array is grown by AHASH_INIT_SIZE elements. Returns 0 when a
+ * free slot is available, -EAGAIN when the array already holds ahash_max
+ * elements (the caller is expected to rehash) and -ENOMEM when the larger
+ * array cannot be allocated. The position itself is not advanced here.
+ */
+static int
+hbucket_elem_add(struct hbucket *n, u8 ahash_max, size_t dsize)
+{
+	void *grown;
+
+	if (n->pos < n->size)
+		return 0;
+
+	if (n->size >= ahash_max)
+		/* Trigger rehashing */
+		return -EAGAIN;
+
+	grown = kzalloc((n->size + AHASH_INIT_SIZE) * dsize, GFP_ATOMIC);
+	if (!grown)
+		return -ENOMEM;
+
+	if (n->size) {
+		/* Carry over the stored elements into the larger array */
+		memcpy(grown, n->value, n->size * dsize);
+		kfree(n->value);
+	}
+	n->value = grown;
+	n->size += AHASH_INIT_SIZE;
+
+	return 0;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+#ifdef IP_SET_HASH_WITH_NETS_PACKED
+/* When cidr is packed with nomatch, cidr - 1 is stored in the entry */
+#define CIDR(cidr) (cidr + 1)
+#else
+#define CIDR(cidr) (cidr)
+#endif
+/* The prefix length of a single host address in the given family */
+#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
+/* The number of entries in the nets[] book-keeping array of the set */
+#ifdef IP_SET_HASH_WITH_MULTI
+#define NETS_LENGTH(family) (SET_HOST_MASK(family) + 1)
+#else
+#define NETS_LENGTH(family) SET_HOST_MASK(family)
+#endif
+/* Without network support there is no prefix book-keeping at all */
+#else
+#define NETS_LENGTH(family) 0
+#endif /* IP_SET_HASH_WITH_NETS */
+/* Pointers to the extension fields of element e: the fields are located
+ * at the byte offsets stored in the offset[] array of the set data h.
+ */
+#define ext_timeout(e, h) \
+(unsigned long *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_TIMEOUT])
+#define ext_counter(e, h) \
+(struct ip_set_counter *)(((void *)(e)) + (h)->offset[IPSET_OFFSET_COUNTER])
+
+#endif /* _IP_SET_HASH_GEN_H */
+
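+/* Family dependent templates: everything below is outside of the include
+ * guard, so it is processed again at every inclusion of this file, with
+ * the names regenerated from the current value of MTYPE.
+ */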
+
+#undef ahash_data
+#undef mtype_data_equal
+#undef mtype_do_data_match
+#undef mtype_data_set_flags
+#undef mtype_data_reset_flags
+#undef mtype_data_netmask
+#undef mtype_data_list
+#undef mtype_data_next
+#undef mtype_elem
+
+#undef mtype_add_cidr
+#undef mtype_del_cidr
+#undef mtype_ahash_memsize
+#undef mtype_flush
+#undef mtype_destroy
+#undef mtype_gc_init
+#undef mtype_same_set
+#undef mtype_kadt
+#undef mtype_uadt
+#undef mtype
+
+#undef mtype_add
+#undef mtype_del
+#undef mtype_test_cidrs
+#undef mtype_test
+#undef mtype_expire
+#undef mtype_resize
+#undef mtype_head
+#undef mtype_list
+#undef mtype_gc
+#undef mtype_gc_init
+#undef mtype_variant
+#undef mtype_data_match
+
+#undef HKEY
+
+#define mtype_data_equal TOKEN(MTYPE, _data_equal)
+#ifdef IP_SET_HASH_WITH_NETS
+#define mtype_do_data_match TOKEN(MTYPE, _do_data_match)
+#else
+#define mtype_do_data_match(d) 1
+#endif
+#define mtype_data_set_flags TOKEN(MTYPE, _data_set_flags)
+#define mtype_data_reset_flags TOKEN(MTYPE, _data_reset_flags)
+#define mtype_data_netmask TOKEN(MTYPE, _data_netmask)
+#define mtype_data_list TOKEN(MTYPE, _data_list)
+#define mtype_data_next TOKEN(MTYPE, _data_next)
+#define mtype_elem TOKEN(MTYPE, _elem)
+#define mtype_add_cidr TOKEN(MTYPE, _add_cidr)
+#define mtype_del_cidr TOKEN(MTYPE, _del_cidr)
+#define mtype_ahash_memsize TOKEN(MTYPE, _ahash_memsize)
+#define mtype_flush TOKEN(MTYPE, _flush)
+#define mtype_destroy TOKEN(MTYPE, _destroy)
+#define mtype_gc_init TOKEN(MTYPE, _gc_init)
+#define mtype_same_set TOKEN(MTYPE, _same_set)
+#define mtype_kadt TOKEN(MTYPE, _kadt)
+#define mtype_uadt TOKEN(MTYPE, _uadt)
+#define mtype MTYPE
+
+#define mtype_elem TOKEN(MTYPE, _elem)
+#define mtype_add TOKEN(MTYPE, _add)
+#define mtype_del TOKEN(MTYPE, _del)
+#define mtype_test_cidrs TOKEN(MTYPE, _test_cidrs)
+#define mtype_test TOKEN(MTYPE, _test)
+#define mtype_expire TOKEN(MTYPE, _expire)
+#define mtype_resize TOKEN(MTYPE, _resize)
+#define mtype_head TOKEN(MTYPE, _head)
+#define mtype_list TOKEN(MTYPE, _list)
+#define mtype_gc TOKEN(MTYPE, _gc)
+#define mtype_variant TOKEN(MTYPE, _variant)
+#define mtype_data_match TOKEN(MTYPE, _data_match)
+/* The hash key of an element: jhash2 over the first HKEY_DATALEN bytes
+ * of the data, taken as u32 words, reduced to htable_bits bits. The
+ * including file may define HKEY_DATALEN to hash less than the whole
+ * element.
+ */
+#ifndef HKEY_DATALEN
+#define HKEY_DATALEN sizeof(struct mtype_elem)
+#endif
+
+#define HKEY(data, initval, htable_bits) \
+(jhash2((u32 *)(data), HKEY_DATALEN/sizeof(u32), initval) \
+ & jhash_mask(htable_bits))
+
+#ifndef htype
+#define htype HTYPE
+
+/* The generic hash structure: the private data (set->data) of a hash type
+ * of set. It is defined only at the first inclusion of this file, under
+ * the name the HTYPE macro expands to.
+ */
+struct htype {
+ struct htable *table; /* the hash table */
+ u32 maxelem; /* max elements in the hash */
+ u32 elements; /* current element (vs timeout) */
+ u32 initval; /* random jhash init value */
+ u32 timeout; /* timeout value, if enabled */
+ size_t dsize; /* data struct size */
+ size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
+ struct timer_list gc; /* garbage collection when timeout enabled */
+ struct mtype_elem next; /* temporary storage for uadd */
+#ifdef IP_SET_HASH_WITH_MULTI
+ u8 ahash_max; /* max elements in an array block */
+#endif
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask; /* netmask value for subnets to store */
+#endif
+#ifdef IP_SET_HASH_WITH_RBTREE
+ struct rb_root rbtree;
+#endif
+#ifdef IP_SET_HASH_WITH_NETS
+ struct net_prefixes nets[0]; /* book-keeping of prefixes; zero-length array, must stay last */
+#endif
+};
+#endif
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Network cidr size book keeping when the hash stores different
+ * sized networks: account for one more element with the prefix cidr.
+ * An entry which already tracks cidr gets its counter incremented,
+ * otherwise a new entry is inserted so that the array stays ordered by
+ * decreasing cidr. */
+static void
+mtype_add_cidr(struct htype *h, u8 cidr, u8 nets_length)
+{
+ int i, j;
+
+ /* Add in increasing prefix order, so larger cidr first */
+ for (i = 0, j = -1; i < nets_length && h->nets[i].nets; i++) {
+ if (j != -1)
+ continue; /* insertion point known: only count the used entries */
+ else if (h->nets[i].cidr < cidr)
+ j = i; /* first smaller cidr: the new entry goes here */
+ else if (h->nets[i].cidr == cidr) {
+ h->nets[i].nets++;
+ return;
+ }
+ }
+ if (j != -1) { /* make room: shift the entries from j up by one */
+ for (; i > j; i--) {
+ h->nets[i].cidr = h->nets[i - 1].cidr;
+ h->nets[i].nets = h->nets[i - 1].nets;
+ }
+ }
+ h->nets[i].cidr = cidr; /* i is either j or the first unused entry */
+ h->nets[i].nets = 1;
+}
+
+/* Network cidr size book keeping: account for one element less with the
+ * prefix cidr.
+ *
+ * When the last element with the given prefix is removed, the entry is
+ * dropped from the array and the following entries are moved down by
+ * one, so that the entries in use stay contiguous: the scans over the
+ * array stop at the first entry with nets == 0, therefore a hole would
+ * hide all the smaller prefixes behind it.
+ *
+ * A cidr value which is not tracked is ignored.
+ */
+static void
+mtype_del_cidr(struct htype *h, u8 cidr, u8 nets_length)
+{
+	u8 i, j, net_end = nets_length - 1;
+
+	/* Search among the entries in use only */
+	for (i = 0; i < nets_length && h->nets[i].nets; i++) {
+		if (h->nets[i].cidr != cidr)
+			continue;
+
+		if (--h->nets[i].nets)
+			/* There are other elements with this prefix */
+			return;
+
+		/* Close the gap: move down the entries in use behind it */
+		for (j = i; j < net_end && h->nets[j + 1].nets; j++) {
+			h->nets[j].cidr = h->nets[j + 1].cidr;
+			h->nets[j].nets = h->nets[j + 1].nets;
+		}
+		/* The last moved entry is duplicated, mark it unused */
+		h->nets[j].nets = 0;
+		return;
+	}
+}
+#endif
+
+/* Calculate the actual memory size of the set data: the set structure,
+ * the prefix book-keeping (if compiled in), the hash table with its
+ * buckets and the arrays allocated for the buckets. nets_length is used
+ * only when IP_SET_HASH_WITH_NETS is defined.
+ */
+static size_t
+mtype_ahash_memsize(const struct htype *h, u8 nets_length)
+{
+ u32 i;
+ struct htable *t = h->table;
+ size_t memsize = sizeof(*h)
+ + sizeof(*t)
+#ifdef IP_SET_HASH_WITH_NETS
+ + sizeof(struct net_prefixes) * nets_length
+#endif
+ + jhash_size(t->htable_bits) * sizeof(struct hbucket);
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++)
+ memsize += t->bucket[i].size * h->dsize; /* allocated slots, used or not */
+
+ return memsize;
+}
+
+/* Flush a hash type of set: destroy all elements by releasing the arrays
+ * of the buckets, then reset the prefix book-keeping (if compiled in) and
+ * the element counter. The hash table itself is kept.
+ */
+static void
+mtype_flush(struct ip_set *set)
+{
+	struct htype *h = set->data;
+	struct htable *table = h->table;
+	u32 nbuckets = jhash_size(table->htable_bits);
+	u32 idx;
+
+	for (idx = 0; idx < nbuckets; idx++) {
+		struct hbucket *b = hbucket(table, idx);
+
+		if (!b->size)
+			continue;
+		b->size = b->pos = 0;
+		/* FIXME: use slab cache */
+		kfree(b->value);
+	}
+#ifdef IP_SET_HASH_WITH_NETS
+	memset(h->nets, 0, sizeof(struct net_prefixes)
+			   * NETS_LENGTH(set->family));
+#endif
+	h->elements = 0;
+}
+
+/* Destroy a hash type of set: stop the garbage collector timer when the
+ * set has the timeout extension, release the hash table and the set
+ * data, and clear set->data.
+ */
+static void
+mtype_destroy(struct ip_set *set)
+{
+	struct htype *h = set->data;
+	bool gc_armed = set->extensions & IPSET_EXT_TIMEOUT;
+
+	if (gc_armed)
+		del_timer_sync(&h->gc);
+
+	ahash_destroy(h->table);
+#ifdef IP_SET_HASH_WITH_RBTREE
+	rbtree_destroy(&h->rbtree);
+#endif
+	kfree(h);
+
+	set->data = NULL;
+}
+
+/* Set up and start the garbage collector timer of the set: the timer
+ * calls the gc function with the set passed as its unsigned long
+ * argument, the first time after IPSET_GC_PERIOD(timeout) seconds.
+ */
+static void
+mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+{
+	struct htype *h = set->data;
+	struct timer_list *timer = &h->gc;
+
+	init_timer(timer);
+	timer->data = (unsigned long) set;
+	timer->function = gc;
+	timer->expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
+	add_timer(timer);
+	pr_debug("gc initialized, run in every %u\n",
+		 IPSET_GC_PERIOD(h->timeout));
+}
+
+/* Compare the parameters of two sets: maxelem, timeout, netmask (if
+ * compiled in) and the enabled extensions must all be the same.
+ */
+static bool
+mtype_same_set(const struct ip_set *a, const struct ip_set *b)
+{
+	const struct htype *ha = a->data;
+	const struct htype *hb = b->data;
+
+	/* Resizing changes htable_bits, so we ignore it */
+	if (ha->maxelem != hb->maxelem)
+		return false;
+	if (ha->timeout != hb->timeout)
+		return false;
+#ifdef IP_SET_HASH_WITH_NETMASK
+	if (ha->netmask != hb->netmask)
+		return false;
+#endif
+	return a->extensions == b->extensions;
+}
+
+/* Get the ith element from the array block n: the elements are dsize
+ * bytes apart in the array.
+ */
+#define ahash_data(n, i, dsize) \
+ ((struct mtype_elem *)((n)->value + ((i) * (dsize))))
+
+/* Delete expired elements from the hashtable: an expired element is
+ * overwritten with the last element of its bucket, and the array of the
+ * bucket is shrunk when enough slots become unused.
+ *
+ * mtype_gc() and mtype_resize() call it with the set lock held for
+ * writing; mtype_add() calls it without taking the lock itself.
+ */
+static void
+mtype_expire(struct htype *h, u8 nets_length, size_t dsize)
+{
+ struct htable *t = h->table;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ u32 i;
+ int j;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j, dsize);
+ if (ip_set_timeout_expired(ext_timeout(data, h))) {
+ pr_debug("expired %u/%u\n", i, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_del_cidr(h, CIDR(data->cidr),
+ nets_length);
+#endif
+ if (j != n->pos - 1)
+ /* Not last one */
+ memcpy(data,
+ ahash_data(n, n->pos - 1, dsize),
+ dsize);
+ n->pos--; /* NOTE(review): the element moved into slot j is not checked in this pass */
+ h->elements--;
+ }
+ }
+ if (n->pos + AHASH_INIT_SIZE < n->size) { /* more than AHASH_INIT_SIZE slots are unused */
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ /* Still try to delete expired elements */
+ continue;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ }
+}
+
+/* The garbage collector timer function: delete the expired elements
+ * with the set write-locked, then re-arm the timer for the next period.
+ * ul_set is the set, as stored in the timer data by mtype_gc_init().
+ */
+static void
+mtype_gc(unsigned long ul_set)
+{
+	struct ip_set *set = (struct ip_set *) ul_set;
+	struct htype *h = set->data;
+	struct timer_list *timer = &h->gc;
+
+	pr_debug("called\n");
+	write_lock_bh(&set->lock);
+	mtype_expire(h, NETS_LENGTH(set->family), h->dsize);
+	write_unlock_bh(&set->lock);
+
+	timer->expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
+	add_timer(timer);
+}
+
+/* Resize a hash: create a new hash table with doubling the hashsize
+ * and inserting the elements to it. Repeat until we succeed or
+ * fail due to memory pressures.
+ *
+ * When the set supports timeouts and this is the first attempt
+ * (!retried), the expired elements are deleted first: if that freed up
+ * any element, the table is not resized at all.
+ *
+ * Returns 0 on success, -IPSET_ERR_HASH_FULL when the hashsize cannot be
+ * increased further and -ENOMEM when an allocation failed. In the case
+ * of an error the original table is left in place. */
+static int
+mtype_resize(struct ip_set *set, bool retried)
+{
+	struct htype *h = set->data;
+	struct htable *t, *orig = h->table;
+	u8 htable_bits = orig->htable_bits;
+#ifdef IP_SET_HASH_WITH_NETS
+	u8 flags;
+#endif
+	struct mtype_elem *data;
+	struct mtype_elem *d;
+	struct hbucket *n, *m;
+	size_t hsize;
+	u32 i, j;
+	int ret;
+
+	/* Try to cleanup once */
+	if (SET_WITH_TIMEOUT(set) && !retried) {
+		i = h->elements;
+		write_lock_bh(&set->lock);
+		mtype_expire(set->data, NETS_LENGTH(set->family),
+			     h->dsize);
+		write_unlock_bh(&set->lock);
+		if (h->elements < i)
+			return 0;
+	}
+
+retry:
+	ret = 0;
+	htable_bits++;
+	pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+		 set->name, orig->htable_bits, htable_bits, orig);
+	/* htable_size() returns 0 when the number of the buckets does not
+	 * fit into u32 or the size of the table into size_t: computing
+	 * the size by shifting with htable_bits directly is undefined
+	 * from 32 bits on.
+	 */
+	hsize = htable_size(htable_bits);
+	if (!htable_bits || !hsize) {
+		/* In case we have plenty of memory :-) */
+		pr_warning("Cannot increase the hashsize of set %s further\n",
+			   set->name);
+		return -IPSET_ERR_HASH_FULL;
+	}
+	t = ip_set_alloc(hsize);
+	if (!t)
+		return -ENOMEM;
+	t->htable_bits = htable_bits;
+
+	read_lock_bh(&set->lock);
+	for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+		n = hbucket(orig, i);
+		for (j = 0; j < n->pos; j++) {
+			data = ahash_data(n, j, h->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+			/* The flags are taken out of the element while its
+			 * hash key is computed */
+			flags = 0;
+			mtype_data_reset_flags(data, &flags);
+#endif
+			m = hbucket(t, HKEY(data, h->initval, htable_bits));
+			ret = hbucket_elem_add(m, AHASH_MAX(h), h->dsize);
+			if (ret < 0) {
+#ifdef IP_SET_HASH_WITH_NETS
+				/* Put back the flags into the original */
+				mtype_data_reset_flags(data, &flags);
+#endif
+				read_unlock_bh(&set->lock);
+				ahash_destroy(t);
+				if (ret == -EAGAIN)
+					/* A bucket is full even in the larger
+					 * table: double the size again */
+					goto retry;
+				return ret;
+			}
+			d = ahash_data(m, m->pos++, h->dsize);
+			memcpy(d, data, h->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+			mtype_data_reset_flags(d, &flags);
+#endif
+		}
+	}
+
+	rcu_assign_pointer(h->table, t);
+	read_unlock_bh(&set->lock);
+
+	/* Give time to other readers of the set */
+	synchronize_rcu_bh();
+
+	pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+		 orig->htable_bits, orig, t->htable_bits, t);
+	ahash_destroy(orig);
+
+	return 0;
+}
+
+/* Add an element to a hash and update the internal counters when succeeded,
+ * otherwise report the proper error code.
+ *
+ * An element which is already in the set is overwritten when
+ * IPSET_FLAG_EXIST is passed in flags or the stored one timed out,
+ * otherwise -IPSET_ERR_EXIST is returned. A new element is stored in the
+ * first timed out slot of its bucket, if there is any, otherwise in a new
+ * slot.
+ *
+ * Returns 0 on success, -IPSET_ERR_HASH_FULL when the set holds maxelem
+ * elements, -EAGAIN when the bucket is full (the element is saved in
+ * h->next by mtype_data_next() for the retry) and -ENOMEM when the bucket
+ * cannot be enlarged. mext is not used here. */
+static int
+mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+	  struct ip_set_ext *mext, u32 flags)
+{
+	struct htype *h = set->data;
+	struct htable *t;
+	const struct mtype_elem *d = value;
+	struct mtype_elem *data;
+	struct hbucket *n;
+	int i, ret = 0;
+	/* The slot to reuse; AHASH_MAX(h) + 1 means that there is none */
+	int j = AHASH_MAX(h) + 1;
+	bool flag_exist = flags & IPSET_FLAG_EXIST;
+	u32 key, multi = 0;
+
+	if (SET_WITH_TIMEOUT(set) && h->elements >= h->maxelem)
+		/* FIXME: when set is full, we slow down here */
+		mtype_expire(h, NETS_LENGTH(set->family), h->dsize);
+
+	if (h->elements >= h->maxelem) {
+		if (net_ratelimit())
+			pr_warning("Set %s is full, maxelem %u reached\n",
+				   set->name, h->maxelem);
+		return -IPSET_ERR_HASH_FULL;
+	}
+
+	rcu_read_lock_bh();
+	t = rcu_dereference_bh(h->table);
+	key = HKEY(value, h->initval, t->htable_bits);
+	n = hbucket(t, key);
+	for (i = 0; i < n->pos; i++) {
+		data = ahash_data(n, i, h->dsize);
+		if (mtype_data_equal(data, d, &multi)) {
+			if (flag_exist ||
+			    (SET_WITH_TIMEOUT(set) &&
+			     ip_set_timeout_expired(ext_timeout(data, h)))) {
+				/* Just the extensions could be overwritten */
+				j = i;
+				goto reuse_slot;
+			} else {
+				ret = -IPSET_ERR_EXIST;
+				goto out;
+			}
+		}
+		/* Reuse first timed out entry: remember it only while no
+		 * slot is selected yet, i.e. j still holds its initial
+		 * value */
+		if (SET_WITH_TIMEOUT(set) &&
+		    ip_set_timeout_expired(ext_timeout(data, h)) &&
+		    j == AHASH_MAX(h) + 1)
+			j = i;
+	}
+reuse_slot:
+	if (j != AHASH_MAX(h) + 1) {
+		/* Fill out reused slot */
+		data = ahash_data(n, j, h->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+		mtype_del_cidr(h, CIDR(data->cidr), NETS_LENGTH(set->family));
+		mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family));
+#endif
+	} else {
+		/* Use/create a new slot */
+		TUNE_AHASH_MAX(h, multi);
+		ret = hbucket_elem_add(n, AHASH_MAX(h), h->dsize);
+		if (ret != 0) {
+			if (ret == -EAGAIN)
+				mtype_data_next(&h->next, d);
+			goto out;
+		}
+		data = ahash_data(n, n->pos++, h->dsize);
+#ifdef IP_SET_HASH_WITH_NETS
+		mtype_add_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family));
+#endif
+		h->elements++;
+	}
+	/* Only the element itself is copied, the extensions are filled
+	 * out from ext below */
+	memcpy(data, d, sizeof(struct mtype_elem));
+#ifdef IP_SET_HASH_WITH_NETS
+	mtype_data_set_flags(data, flags);
+#endif
+	if (SET_WITH_TIMEOUT(set))
+		ip_set_timeout_set(ext_timeout(data, h), ext->timeout);
+	if (SET_WITH_COUNTER(set))
+		ip_set_init_counter(ext_counter(data, h), ext);
+
+out:
+	rcu_read_unlock_bh();
+	return ret;
+}
+
+/* Delete an element from the hash: swap it with the last element
+ * and free up space if possible.
+ *
+ * Returns 0 when the element was deleted and -IPSET_ERR_EXIST when it is
+ * not in the set or the stored one already timed out. ext, mext and flags
+ * are not used here.
+ */
+static int
+mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = h->table;
+ const struct mtype_elem *d = value;
+ struct mtype_elem *data;
+ struct hbucket *n;
+ int i;
+ u32 key, multi = 0;
+
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, h->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, h)))
+ return -IPSET_ERR_EXIST; /* a timed out element counts as missing */
+ if (i != n->pos - 1)
+ /* Not last one */
+ memcpy(data, ahash_data(n, n->pos - 1, h->dsize),
+ h->dsize);
+
+ n->pos--;
+ h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+ mtype_del_cidr(h, CIDR(d->cidr), NETS_LENGTH(set->family));
+#endif
+ if (n->pos + AHASH_INIT_SIZE < n->size) { /* more than AHASH_INIT_SIZE slots are unused */
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * h->dsize,
+ GFP_ATOMIC);
+ if (!tmp)
+ return 0; /* the element is deleted, only the shrinking is skipped */
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value, n->size * h->dsize);
+ kfree(n->value);
+ n->value = tmp;
+ }
+ return 0;
+ }
+
+ return -IPSET_ERR_EXIST;
+}
+
+/* Called when a matching element is found: update the counters of the
+ * element if the set has the counter extension, and return the result of
+ * mtype_do_data_match() on the element.
+ */
+static inline int
+mtype_data_match(struct mtype_elem *data, const struct ip_set_ext *ext,
+		 struct ip_set_ext *mext, struct ip_set *set, u32 flags)
+{
+	if (SET_WITH_COUNTER(set)) {
+		struct htype *h = set->data;
+
+		ip_set_update_counter(ext_counter(data, h), ext, mext, flags);
+	}
+
+	return mtype_do_data_match(data);
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+/* Special test function which takes into account the different network
+ * sizes added to the set: d is masked with every prefix tracked in
+ * h->nets[], in the order stored there, and looked up in the hash.
+ * Returns the result of mtype_data_match() for the first matching
+ * element which is not timed out, and 0 when there is none.
+ * Note that d is modified by the masking. */
+static int
+mtype_test_cidrs(struct ip_set *set, struct mtype_elem *d,
+ const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = h->table;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ int i, j = 0;
+ u32 key, multi = 0;
+ u8 nets_length = NETS_LENGTH(set->family);
+
+ pr_debug("test by nets\n");
+ for (; j < nets_length && h->nets[j].nets && !multi; j++) { /* multi is set by mtype_data_equal(); a non-zero value ends the search */
+ mtype_data_netmask(d, h->nets[j].cidr);
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, h->dsize);
+ if (!mtype_data_equal(data, d, &multi))
+ continue;
+ if (SET_WITH_TIMEOUT(set)) {
+ if (!ip_set_timeout_expired(
+ ext_timeout(data, h)))
+ return mtype_data_match(data, ext,
+ mext, set,
+ flags);
+#ifdef IP_SET_HASH_WITH_MULTI
+ multi = 0; /* the equal element timed out: keep on searching */
+#endif
+ } else
+ return mtype_data_match(data, ext,
+ mext, set, flags);
+ }
+ }
+ return 0;
+}
+#endif
+
+/* Test whether the element is added to the set: returns the result of
+ * mtype_data_match() for the stored element which equals to value and is
+ * not timed out, and 0 when there is no such element. */
+static int
+mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
+{
+ struct htype *h = set->data;
+ struct htable *t = h->table;
+ struct mtype_elem *d = value;
+ struct hbucket *n;
+ struct mtype_elem *data;
+ int i;
+ u32 key, multi = 0;
+
+#ifdef IP_SET_HASH_WITH_NETS
+ /* If we test an IP address and not a network address,
+ * try all possible network sizes */
+ if (CIDR(d->cidr) == SET_HOST_MASK(set->family))
+ return mtype_test_cidrs(set, d, ext, mext, flags);
+#endif
+
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i, h->dsize);
+ if (mtype_data_equal(data, d, &multi) &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(data, h)))) /* timed out elements do not match */
+ return mtype_data_match(data, ext, mext, set, flags);
+ }
+ return 0;
+}
+
+/* Reply a HEADER request: fill out the header part of the set.
+ * The parameters of the set are put into skb, nested into an
+ * IPSET_ATTR_DATA attribute: hashsize, maxelem, netmask (if compiled in
+ * and differs from HOST_MASK), references, memsize, and the timeout and
+ * the counter flag when the set has those extensions.
+ * Returns 0 on success and -EMSGSIZE when the attributes do not fit. */
+static int
+mtype_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct htype *h = set->data;
+ struct nlattr *nested;
+ size_t memsize;
+
+ read_lock_bh(&set->lock);
+ memsize = mtype_ahash_memsize(h, NETS_LENGTH(set->family));
+ read_unlock_bh(&set->lock);
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ if (nla_put_net32(skb, IPSET_ATTR_HASHSIZE,
+ htonl(jhash_size(h->table->htable_bits))) ||
+ nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem)))
+ goto nla_put_failure;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (h->netmask != HOST_MASK &&
+ nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask))
+ goto nla_put_failure;
+#endif
+ if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || /* NOTE(review): memsize is size_t, sent as a 32 bit value */
+ ((set->extensions & IPSET_EXT_TIMEOUT) &&
+ nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout))) ||
+ ((set->extensions & IPSET_EXT_COUNTER) &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS,
+ htonl(IPSET_FLAG_WITH_COUNTERS))))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Reply a LIST/SAVE request: dump the elements of the specified set.
+ *
+ * cb->args[2] is the bucket cursor: the walk starts at the bucket it
+ * names and it is reset to 0 once every bucket has been emitted. Each
+ * element becomes an IPSET_ATTR_DATA nest inside a single IPSET_ATTR_ADT
+ * nest; when SET_WITH_TIMEOUT(set) holds, entries whose timeout has
+ * expired are skipped, and the timeout/counter extensions are appended
+ * for sets that carry them.
+ *
+ * If @skb fills up part-way through a later bucket, the message is
+ * trimmed back to the start of that bucket and 0 is returned with the
+ * cursor left on it (presumably so the next dump call resumes there -
+ * confirm against the caller).
+ *
+ * -EMSGSIZE is returned when the ADT nest cannot be opened, or when the
+ * bucket this call started with does not fit into the message.
+ */
+static int
+mtype_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct htype *h = set->data;
+ const struct htable *t = h->table;
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct mtype_elem *e;
+ u32 first = cb->args[2]; /* bucket this call started from */
+ /* We assume that one hash bucket fits into one page */
+ void *incomplete;
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ pr_debug("list hash set %s\n", set->name);
+ for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
+ incomplete = skb_tail_pointer(skb); /* rollback point for this bucket */
+ n = hbucket(t, cb->args[2]);
+ pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n);
+ for (i = 0; i < n->pos; i++) {
+ e = ahash_data(n, i, h->dsize);
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, h)))
+ continue;
+ pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+ cb->args[2], n, i, e);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[2] == first) { /* nothing emitted for the first bucket */
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (mtype_data_list(skb, e))
+ goto nla_put_failure;
+ if (SET_WITH_TIMEOUT(set) &&
+ nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(
+ ext_timeout(e, h)))))
+ goto nla_put_failure;
+ if (SET_WITH_COUNTER(set) &&
+ ip_set_put_counter(skb, ext_counter(e, h)))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete); /* drop the partially written bucket */
+ ipset_nest_end(skb, atd);
+ if (unlikely(first == cb->args[2])) {
+ pr_warning("Can't list set %s: one bucket does not fit into "
+ "a message. Please report it!\n", set->name);
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int
+TOKEN(MTYPE, _kadt)(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt); /* packet (skb) driven entry point; prototype only */
+
+static int
+TOKEN(MTYPE, _uadt)(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags, bool retried); /* netlink attribute driven entry point; prototype only */
+
+static const struct ip_set_type_variant mtype_variant = { /* operations table for the current MTYPE */
+ .kadt = mtype_kadt, /* presumably the TOKEN(MTYPE, _kadt) prototype - confirm */
+ .uadt = mtype_uadt, /* presumably the TOKEN(MTYPE, _uadt) prototype - confirm */
+ .adt = { /* indexed by enum ipset_adt command */
+ [IPSET_ADD] = mtype_add,
+ [IPSET_DEL] = mtype_del,
+ [IPSET_TEST] = mtype_test,
+ },
+ .destroy = mtype_destroy,
+ .flush = mtype_flush,
+ .head = mtype_head, /* HEADER reply */
+ .list = mtype_list, /* LIST/SAVE reply */
+ .resize = mtype_resize,
+ .same_set = mtype_same_set,
+};
+
+#ifdef IP_SET_EMIT_CREATE /* the create routine is emitted only on request
+ *
+ * TOKEN(HTYPE, _create)() builds a new set of the HTYPE hash type from the
+ * attributes in tb[]:
+ * - only NFPROTO_IPV4 and NFPROTO_IPV6 families are accepted
+ *   (-IPSET_ERR_INVALID_FAMILY otherwise);
+ * - HASHSIZE, MAXELEM, TIMEOUT and CADT_FLAGS must pass
+ *   ip_set_optattr_netorder() (-IPSET_ERR_PROTOCOL otherwise);
+ * - a hash size below IPSET_MIMINAL_HASHSIZE is raised to that value;
+ * - with IP_SET_HASH_WITH_NETMASK, a netmask of 0 or above 32/128 is
+ *   rejected with -IPSET_ERR_INVALID_NETMASK;
+ * - the set header and the hash table are allocated (-ENOMEM on failure,
+ *   the header being freed again if the table cannot be set up);
+ * - the element size and the extension offsets are picked from the
+ *   counters flag and the presence of a timeout attribute; whenever a
+ *   timeout is configured the family's gc_init routine is called.
+ * Returns 0 on success.
+ */
+static int
+TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
+{
+ u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
+ u32 cadt_flags = 0;
+ u8 hbits;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask;
+#endif
+ size_t hsize;
+ struct HTYPE *h;
+
+ if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
+ return -IPSET_ERR_INVALID_FAMILY;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ netmask = set->family == NFPROTO_IPV4 ? 32 : 128; /* default: full-length mask */
+ pr_debug("Create set %s with family %s\n",
+ set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+#endif
+
+ if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ return -IPSET_ERR_PROTOCOL;
+
+ if (tb[IPSET_ATTR_HASHSIZE]) {
+ hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
+ if (hashsize < IPSET_MIMINAL_HASHSIZE)
+ hashsize = IPSET_MIMINAL_HASHSIZE; /* clamp up to the minimum */
+ }
+
+ if (tb[IPSET_ATTR_MAXELEM])
+ maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
+
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (tb[IPSET_ATTR_NETMASK]) {
+ netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
+
+ if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
+ (set->family == NFPROTO_IPV6 && netmask > 128) ||
+ netmask == 0)
+ return -IPSET_ERR_INVALID_NETMASK;
+ }
+#endif
+
+ hsize = sizeof(*h);
+#ifdef IP_SET_HASH_WITH_NETS
+ hsize += sizeof(struct net_prefixes) * /* 32 (IPv4) or 128 (IPv6) extra entries */
+ (set->family == NFPROTO_IPV4 ? 32 : 128);
+#endif
+ h = kzalloc(hsize, GFP_KERNEL);
+ if (!h)
+ return -ENOMEM;
+
+ h->maxelem = maxelem;
+#ifdef IP_SET_HASH_WITH_NETMASK
+ h->netmask = netmask;
+#endif
+ get_random_bytes(&h->initval, sizeof(h->initval));
+ h->timeout = IPSET_NO_TIMEOUT;
+
+ hbits = htable_bits(hashsize);
+ hsize = htable_size(hbits); /* hsize is reused: now the table size */
+ if (hsize == 0) { /* a zero size is treated as an allocation failure */
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table = ip_set_alloc(hsize);
+ if (!h->table) {
+ kfree(h);
+ return -ENOMEM;
+ }
+ h->table->htable_bits = hbits;
+
+ set->data = h;
+ if (set->family == NFPROTO_IPV4)
+ set->variant = &TOKEN(HTYPE, 4_variant);
+ else
+ set->variant = &TOKEN(HTYPE, 6_variant);
+
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) { /* counters requested */
+ set->extensions |= IPSET_EXT_COUNTER;
+ if (tb[IPSET_ATTR_TIMEOUT]) { /* counters + timeout: *ct_elem layout */
+ h->timeout =
+ ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ if (set->family == NFPROTO_IPV4) {
+ h->dsize =
+ sizeof(struct TOKEN(HTYPE, 4ct_elem));
+ h->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct TOKEN(HTYPE, 4ct_elem),
+ timeout);
+ h->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct TOKEN(HTYPE, 4ct_elem),
+ counter);
+ TOKEN(HTYPE, 4_gc_init)(set,
+ TOKEN(HTYPE, 4_gc));
+ } else {
+ h->dsize =
+ sizeof(struct TOKEN(HTYPE, 6ct_elem));
+ h->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct TOKEN(HTYPE, 6ct_elem),
+ timeout);
+ h->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct TOKEN(HTYPE, 6ct_elem),
+ counter);
+ TOKEN(HTYPE, 6_gc_init)(set,
+ TOKEN(HTYPE, 6_gc));
+ }
+ } else { /* counters only: *c_elem layout */
+ if (set->family == NFPROTO_IPV4) {
+ h->dsize =
+ sizeof(struct TOKEN(HTYPE, 4c_elem));
+ h->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct TOKEN(HTYPE, 4c_elem),
+ counter);
+ } else {
+ h->dsize =
+ sizeof(struct TOKEN(HTYPE, 6c_elem));
+ h->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct TOKEN(HTYPE, 6c_elem),
+ counter);
+ }
+ }
+ } else if (tb[IPSET_ATTR_TIMEOUT]) { /* timeout only: *t_elem layout */
+ h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ if (set->family == NFPROTO_IPV4) {
+ h->dsize = sizeof(struct TOKEN(HTYPE, 4t_elem));
+ h->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct TOKEN(HTYPE, 4t_elem),
+ timeout);
+ TOKEN(HTYPE, 4_gc_init)(set, TOKEN(HTYPE, 4_gc));
+ } else {
+ h->dsize = sizeof(struct TOKEN(HTYPE, 6t_elem));
+ h->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct TOKEN(HTYPE, 6t_elem),
+ timeout);
+ TOKEN(HTYPE, 6_gc_init)(set, TOKEN(HTYPE, 6_gc));
+ }
+ } else { /* no extensions: plain *_elem layout */
+ if (set->family == NFPROTO_IPV4)
+ h->dsize = sizeof(struct TOKEN(HTYPE, 4_elem));
+ else
+ h->dsize = sizeof(struct TOKEN(HTYPE, 6_elem));
+ }
+
+ pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
+ set->name, jhash_size(h->table->htable_bits),
+ h->table->htable_bits, h->maxelem, set->data, h->table);
+
+ return 0;
+}
+#endif /* IP_SET_EMIT_CREATE */
diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c
index b7d4cb475ae..c74e6e14cd9 100644
--- a/net/netfilter/ipset/ip_set_hash_ip.c
+++ b/net/netfilter/ipset/ip_set_hash_ip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,11 +21,10 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
-#define REVISION_MAX 0
+#define REVISION_MAX 1 /* Counters support */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -33,58 +32,47 @@ IP_SET_MODULE_DESC("hash:ip", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:ip");
/* Type specific function prefix */
-#define TYPE hash_ip
-
-static bool
-hash_ip_same_set(const struct ip_set *a, const struct ip_set *b);
-
-#define hash_ip4_same_set hash_ip_same_set
-#define hash_ip6_same_set hash_ip_same_set
+#define HTYPE hash_ip
+#define IP_SET_HASH_WITH_NETMASK
-/* The type variant functions: IPv4 */
+/* IPv4 variants */
-/* Member elements without timeout */
+/* Member elements */
struct hash_ip4_elem {
+ /* Zero valued IP addresses cannot be stored */
__be32 ip;
};
-/* Member elements with timeout support */
-struct hash_ip4_telem {
+struct hash_ip4t_elem {
__be32 ip;
unsigned long timeout;
};
-static inline bool
-hash_ip4_data_equal(const struct hash_ip4_elem *ip1,
- const struct hash_ip4_elem *ip2,
- u32 *multi)
-{
- return ip1->ip == ip2->ip;
-}
+struct hash_ip4c_elem {
+ __be32 ip;
+ struct ip_set_counter counter;
+};
-static inline bool
-hash_ip4_data_isnull(const struct hash_ip4_elem *elem)
-{
- return elem->ip == 0;
-}
+struct hash_ip4ct_elem {
+ __be32 ip;
+ struct ip_set_counter counter;
+ unsigned long timeout;
+};
-static inline void
-hash_ip4_data_copy(struct hash_ip4_elem *dst, const struct hash_ip4_elem *src)
-{
- dst->ip = src->ip;
-}
+/* Common functions */
-/* Zero valued IP addresses cannot be stored */
-static inline void
-hash_ip4_data_zero_out(struct hash_ip4_elem *elem)
+static inline bool
+hash_ip4_data_equal(const struct hash_ip4_elem *e1,
+ const struct hash_ip4_elem *e2,
+ u32 *multi)
{
- elem->ip = 0;
+ return e1->ip == e2->ip;
}
static inline bool
-hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *data)
+hash_ip4_data_list(struct sk_buff *skb, const struct hash_ip4_elem *e)
{
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip))
+ if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, e->ip))
goto nla_put_failure;
return 0;
@@ -92,41 +80,26 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ip4_data_tlist(struct sk_buff *skb, const struct hash_ip4_elem *data)
+static inline void
+hash_ip4_data_next(struct hash_ip4_elem *next, const struct hash_ip4_elem *e)
{
- const struct hash_ip4_telem *tdata =
- (const struct hash_ip4_telem *)data;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = e->ip;
}
-#define IP_SET_HASH_WITH_NETMASK
+#define MTYPE hash_ip4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ip4_data_next(struct ip_set_hash *h, const struct hash_ip4_elem *d)
-{
- h->next.ip = d->ip;
-}
+#include "ip_set_hash_gen.h"
static int
hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
+ struct hash_ip4_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
__be32 ip;
ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip);
@@ -134,43 +107,42 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (ip == 0)
return -EINVAL;
- return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags);
+ e.ip = ip;
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
+/* Netlink-attribute driven add/del/test for the IPv4 hash:ip variant.
+ *
+ * Validates tb[], parses the address and the extension attributes, masks
+ * the address with the set's netmask and passes each resulting element to
+ * the adt function selected by @adt. A zero address is rejected with
+ * -IPSET_ERR_HASH_ELEM. A failing parser's return value is handed back
+ * unchanged.
+ */
static int
hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- u32 ip, ip_to, hosts, timeout = h->timeout;
- __be32 nip;
+ struct hash_ip4_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
+ u32 ip, ip_to, hosts;
int ret = 0;
if (unlikely(!tb[IPSET_ATTR_IP] ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
if (ret)
return ret;
+ /* Checked on its own: chaining the two parsers with || would
+ * reduce any non-zero error code to 1 before it is returned. */
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
ip &= ip_set_hostmask(h->netmask);
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
if (adt == IPSET_TEST) {
- nip = htonl(ip);
- if (nip == 0)
+ e.ip = htonl(ip);
+ if (e.ip == 0)
return -IPSET_ERR_HASH_ELEM;
- return adtfn(set, &nip, timeout, flags);
+ return adtfn(set, &e, &ext, &ext, flags);
}
ip_to = ip;
@@ -193,10 +165,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
for (; !before(ip_to, ip); ip += hosts) {
- nip = htonl(ip);
- if (nip == 0)
+ e.ip = htonl(ip);
+ if (e.ip == 0)
return -IPSET_ERR_HASH_ELEM;
- ret = adtfn(set, &nip, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -206,29 +178,31 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_ip_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout &&
- x->netmask == y->netmask;
-}
+/* Member elements */
+struct hash_ip6_elem {
+ union nf_inet_addr ip;
+};
-/* The type variant functions: IPv6 */
+struct hash_ip6t_elem {
+ union nf_inet_addr ip;
+ unsigned long timeout;
+};
-struct hash_ip6_elem {
+struct hash_ip6c_elem {
union nf_inet_addr ip;
+ struct ip_set_counter counter;
};
-struct hash_ip6_telem {
+struct hash_ip6ct_elem {
union nf_inet_addr ip;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
const struct hash_ip6_elem *ip2,
@@ -237,37 +211,16 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1,
return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6);
}
-static inline bool
-hash_ip6_data_isnull(const struct hash_ip6_elem *elem)
-{
- return ipv6_addr_any(&elem->ip.in6);
-}
-
static inline void
-hash_ip6_data_copy(struct hash_ip6_elem *dst, const struct hash_ip6_elem *src)
+hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix)
{
- dst->ip.in6 = src->ip.in6;
-}
-
-static inline void
-hash_ip6_data_zero_out(struct hash_ip6_elem *elem)
-{
- ipv6_addr_set(&elem->ip.in6, 0, 0, 0, 0);
-}
-
-static inline void
-ip6_netmask(union nf_inet_addr *ip, u8 prefix)
-{
- ip->ip6[0] &= ip_set_netmask6(prefix)[0];
- ip->ip6[1] &= ip_set_netmask6(prefix)[1];
- ip->ip6[2] &= ip_set_netmask6(prefix)[2];
- ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+ ip6_netmask(ip, prefix);
}
static bool
-hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *data)
+hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e)
{
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6))
+ if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6))
goto nla_put_failure;
return 0;
@@ -275,69 +228,55 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ip6_data_tlist(struct sk_buff *skb, const struct hash_ip6_elem *data)
+static inline void
+hash_ip6_data_next(struct hash_ip4_elem *next, const struct hash_ip6_elem *e)
{
- const struct hash_ip6_telem *e =
- (const struct hash_ip6_telem *)data;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#undef HKEY_DATALEN
+#define MTYPE hash_ip6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-static inline void
-hash_ip6_data_next(struct ip_set_hash *h, const struct hash_ip6_elem *d)
-{
-}
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- union nf_inet_addr ip;
+ struct hash_ip6_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip.in6);
- ip6_netmask(&ip, h->netmask);
- if (ipv6_addr_any(&ip.in6))
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
return -EINVAL;
- return adtfn(set, &ip, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
-static const struct nla_policy hash_ip6_adt_policy[IPSET_ATTR_ADT_MAX + 1] = {
- [IPSET_ATTR_IP] = { .type = NLA_NESTED },
- [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
- [IPSET_ATTR_LINENO] = { .type = NLA_U32 },
-};
-
+/* Netlink-attribute driven add/del/test for the IPv6 hash:ip variant.
+ *
+ * IP_TO and CIDR attributes are refused with -IPSET_ERR_PROTOCOL. The
+ * address and the extension attributes are parsed, the address is masked
+ * with the set's netmask, and an all-zero result is rejected with
+ * -IPSET_ERR_HASH_ELEM. A failing parser's return value is handed back
+ * unchanged; the adt result is mapped to 0 when ip_set_eexist() accepts it.
+ */
static int
hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- union nf_inet_addr ip;
- u32 timeout = h->timeout;
+ struct hash_ip6_elem e = {};
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_CIDR]))
return -IPSET_ERR_PROTOCOL;
@@ -345,110 +284,20 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
+ /* Checked on its own: chaining the two parsers with || would
+ * reduce any non-zero error code to 1 before it is returned. */
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
- ip6_netmask(&ip, h->netmask);
- if (ipv6_addr_any(&ip.in6))
+ hash_ip6_netmask(&e.ip, h->netmask);
+ if (ipv6_addr_any(&e.ip.in6))
return -IPSET_ERR_HASH_ELEM;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
- ret = adtfn(set, &ip, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
-/* Create hash:ip type of sets */
-
-static int
-hash_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 netmask, hbits;
- size_t hsize;
- struct ip_set_hash *h;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
- netmask = set->family == NFPROTO_IPV4 ? 32 : 128;
- pr_debug("Create set %s with family %s\n",
- set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- if (tb[IPSET_ATTR_NETMASK]) {
- netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]);
-
- if ((set->family == NFPROTO_IPV4 && netmask > 32) ||
- (set->family == NFPROTO_IPV6 && netmask > 128) ||
- netmask == 0)
- return -IPSET_ERR_INVALID_NETMASK;
- }
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- h->netmask = netmask;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ip4_tvariant : &hash_ip6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_ip4_gc_init(set);
- else
- hash_ip6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ip4_variant : &hash_ip6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
-}
-
static struct ip_set_type hash_ip_type __read_mostly = {
.name = "hash:ip",
.protocol = IPSET_PROTOCOL,
@@ -465,6 +314,7 @@ static struct ip_set_type hash_ip_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_NETMASK] = { .type = NLA_U8 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -472,6 +322,8 @@ static struct ip_set_type hash_ip_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index d8f77bacae8..7a2d2bd98d0 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,12 +21,12 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
-#define REVISION_MAX 1 /* SCTP and UDPLITE support added */
+/* 1 SCTP and UDPLITE support added */
+#define REVISION_MAX 2 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -34,33 +34,45 @@ IP_SET_MODULE_DESC("hash:ip,port", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:ip,port");
/* Type specific function prefix */
-#define TYPE hash_ipport
+#define HTYPE hash_ipport
-static bool
-hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b);
+/* IPv4 variants */
-#define hash_ipport4_same_set hash_ipport_same_set
-#define hash_ipport6_same_set hash_ipport_same_set
+/* Member elements */
+struct hash_ipport4_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
-/* The type variant functions: IPv4 */
+struct hash_ipport4t_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
-/* Member elements without timeout */
-struct hash_ipport4_elem {
+struct hash_ipport4c_elem {
__be32 ip;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter;
};
-/* Member elements with timeout support */
-struct hash_ipport4_telem {
+struct hash_ipport4ct_elem {
__be32 ip;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
const struct hash_ipport4_elem *ip2,
@@ -71,27 +83,6 @@ hash_ipport4_data_equal(const struct hash_ipport4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipport4_data_isnull(const struct hash_ipport4_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipport4_data_copy(struct hash_ipport4_elem *dst,
- const struct hash_ipport4_elem *src)
-{
- dst->ip = src->ip;
- dst->port = src->port;
- dst->proto = src->proto;
-}
-
-static inline void
-hash_ipport4_data_zero_out(struct hash_ipport4_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_ipport4_data_list(struct sk_buff *skb,
const struct hash_ipport4_elem *data)
@@ -106,111 +97,91 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipport4_data_tlist(struct sk_buff *skb,
- const struct hash_ipport4_elem *data)
-{
- const struct hash_ipport4_telem *tdata =
- (const struct hash_ipport4_telem *)data;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
-}
-
-#define PF 4
-#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
static inline void
-hash_ipport4_data_next(struct ip_set_hash *h,
+hash_ipport4_data_next(struct hash_ipport4_elem *next,
const struct hash_ipport4_elem *d)
{
- h->next.ip = d->ip;
- h->next.port = d->port;
+ next->ip = d->ip;
+ next->port = d->port;
}
+#define MTYPE hash_ipport4
+#define PF 4
+#define HOST_MASK 32
+#define HKEY_DATALEN sizeof(struct hash_ipport4_elem)
+#include "ip_set_hash_gen.h"
+
static int
hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipport4_elem data = { };
+ struct hash_ipport4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto))
return -EINVAL;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
-
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
+/* Netlink-attribute driven add/del/test for the IPv4 hash:ip,port variant.
+ *
+ * Validates tb[], parses the address, the extension attributes, the port
+ * and the protocol (a missing protocol yields -IPSET_ERR_MISSING_PROTO, a
+ * zero one -IPSET_ERR_INVALID_PROTO). The port is cleared unless the
+ * protocol carries ports or is ICMP. A TEST, or a request without IP_TO,
+ * CIDR and PORT_TO, results in a single adt call; otherwise elements are
+ * generated over the requested range. A failing parser's return value is
+ * handed back unchanged.
+ */
static int
hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipport4_elem data = { };
+ struct hash_ipport4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 ip, ip_to, p = 0, port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
if (ret)
return ret;
+ /* Checked on its own: chaining the two parsers with || would
+ * reduce any non-zero error code to 1 before it is returned. */
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMP))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
tb[IPSET_ATTR_PORT_TO])) {
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
- ip_to = ip = ntohl(data.ip);
+ ip_to = ip = ntohl(e.ip);
if (tb[IPSET_ATTR_IP_TO]) {
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
@@ -225,7 +196,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
ip_set_mask_from_to(ip, ip_to, cidr);
}
- port_to = port = ntohs(data.port);
+ port_to = port = ntohs(e.port);
if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
@@ -238,9 +209,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
- data.ip = htonl(ip);
- data.port = htons(p);
- ret = adtfn(set, &data, timeout, flags);
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -251,34 +222,42 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_ipport_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
+struct hash_ipport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+};
-/* The type variant functions: IPv6 */
+struct hash_ipport6t_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout;
+};
-struct hash_ipport6_elem {
+struct hash_ipport6c_elem {
union nf_inet_addr ip;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter;
};
-struct hash_ipport6_telem {
+struct hash_ipport6ct_elem {
union nf_inet_addr ip;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
const struct hash_ipport6_elem *ip2,
@@ -289,25 +268,6 @@ hash_ipport6_data_equal(const struct hash_ipport6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipport6_data_isnull(const struct hash_ipport6_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipport6_data_copy(struct hash_ipport6_elem *dst,
- const struct hash_ipport6_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_ipport6_data_zero_out(struct hash_ipport6_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_ipport6_data_list(struct sk_buff *skb,
const struct hash_ipport6_elem *data)
@@ -322,66 +282,52 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipport6_data_tlist(struct sk_buff *skb,
- const struct hash_ipport6_elem *data)
+static inline void
+hash_ipport6_data_next(struct hash_ipport4_elem *next,
+ const struct hash_ipport6_elem *d)
{
- const struct hash_ipport6_telem *e =
- (const struct hash_ipport6_telem *)data;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->port = d->port;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#undef HKEY_DATALEN
+#define MTYPE hash_ipport6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ipport6_data_next(struct ip_set_hash *h,
- const struct hash_ipport6_elem *d)
-{
- h->next.port = d->port;
-}
+#define HKEY_DATALEN sizeof(struct hash_ipport6_elem)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt) /* opt is no longer const: &opt->ext is handed to adtfn below */
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipport6_elem data = { };
+ struct hash_ipport6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); /* extension values for this call, built from skb, opt and h */
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto)) /* fills e.port and e.proto; failure ends the call with -EINVAL */
return -EINVAL;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
-
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); /* run the handler selected by adt; ext replaces the former timeout argument */
}
static int
hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipport6_elem data = { };
+ struct hash_ipport6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
int ret;
@@ -389,6 +335,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_CIDR]))
return -IPSET_ERR_PROTOCOL;
@@ -396,39 +344,34 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ ret = ret ? ret : ip_set_get_extensions(set, tb, &ext); /* keep the first error code; '||' would flatten it to 1 */
if (ret)
return ret;
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMPV6))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
- port = ntohs(data.port);
+ port = ntohs(e.port);
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
swap(port, port_to);
@@ -436,8 +379,8 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
port = ntohs(h->next.port);
for (; port <= port_to; port++) {
- data.port = htons(port);
- ret = adtfn(set, &data, timeout, flags);
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -447,78 +390,6 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-/* Create hash:ip type of sets */
-
-static int
-hash_ipport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- struct ip_set_hash *h;
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipport4_tvariant : &hash_ipport6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_ipport4_gc_init(set);
- else
- hash_ipport6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipport4_variant : &hash_ipport6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
-}
-
static struct ip_set_type hash_ipport_type __read_mostly = {
.name = "hash:ip,port",
.protocol = IPSET_PROTOCOL,
@@ -535,6 +406,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -545,6 +417,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 1da1e955f38..34e8a1acce4 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,12 +21,12 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
-#define REVISION_MAX 1 /* SCTP and UDPLITE support added */
+/* 1 SCTP and UDPLITE support added */
+#define REVISION_MAX 2 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -34,32 +34,44 @@ IP_SET_MODULE_DESC("hash:ip,port,ip", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,ip");
/* Type specific function prefix */
-#define TYPE hash_ipportip
+#define HTYPE hash_ipportip
-static bool
-hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b);
+/* IPv4 variants */
-#define hash_ipportip4_same_set hash_ipportip_same_set
-#define hash_ipportip6_same_set hash_ipportip_same_set
+/* Member elements */
+struct hash_ipportip4_elem { /* IPv4 member element without extension fields */
+ __be32 ip;
+ __be32 ip2;
+ __be16 port; /* network byte order (filled via nla_get_be16()/htons()) */
+ u8 proto;
+ u8 padding;
+};
-/* The type variant functions: IPv4 */
+struct hash_ipportip4t_elem { /* same key fields as hash_ipportip4_elem, plus a timeout */
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout; /* per-element timeout value */
+};
-/* Member elements without timeout */
-struct hash_ipportip4_elem {
+struct hash_ipportip4c_elem { /* same key fields as hash_ipportip4_elem, plus a counter */
__be32 ip;
__be32 ip2;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter; /* per-element counter */
};
-/* Member elements with timeout support */
-struct hash_ipportip4_telem {
+struct hash_ipportip4ct_elem { /* key fields plus both a counter and a timeout */
__be32 ip;
__be32 ip2;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter; /* per-element counter, placed before the timeout */
unsigned long timeout;
};
@@ -74,25 +86,6 @@ hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipportip4_data_isnull(const struct hash_ipportip4_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipportip4_data_copy(struct hash_ipportip4_elem *dst,
- const struct hash_ipportip4_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_ipportip4_data_zero_out(struct hash_ipportip4_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_ipportip4_data_list(struct sk_buff *skb,
const struct hash_ipportip4_elem *data)
@@ -108,117 +101,96 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipportip4_data_tlist(struct sk_buff *skb,
- const struct hash_ipportip4_elem *data)
+static inline void /* Record in *next the first address and port of element d */
+hash_ipportip4_data_next(struct hash_ipportip4_elem *next,
+ const struct hash_ipportip4_elem *d)
{
- const struct hash_ipportip4_telem *tdata =
- (const struct hash_ipportip4_telem *)data;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = d->ip; /* the retried uadt path compares its current ip against h->next.ip */
+ next->port = d->port; /* ...and restarts the port loop from h->next.port on a match */
}
+/* Common functions */
+#define MTYPE hash_ipportip4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ipportip4_data_next(struct ip_set_hash *h,
- const struct hash_ipportip4_elem *d)
-{
- h->next.ip = d->ip;
- h->next.port = d->port;
-}
+#include "ip_set_hash_gen.h"
static int
hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt) /* opt is no longer const: &opt->ext is handed to adtfn below */
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportip4_elem data = { };
+ struct hash_ipportip4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); /* extension values for this call, built from skb, opt and h */
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto)) /* fills e.port and e.proto; failure ends the call with -EINVAL */
return -EINVAL;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
- ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2);
-
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); /* run the handler selected by adt; ext replaces the former timeout argument */
}
static int
hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportip4_elem data = { };
+ struct hash_ipportip4_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 ip, ip_to, p = 0, port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip);
+ ret = ret ? ret : ip_set_get_extensions(set, tb, &ext); /* keep the first error code; '||' would flatten it to 1 */
if (ret)
return ret;
- ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &data.ip2);
+ ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2);
if (ret)
return ret;
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMP))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] ||
tb[IPSET_ATTR_PORT_TO])) {
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
- ip_to = ip = ntohl(data.ip);
+ ip_to = ip = ntohl(e.ip);
if (tb[IPSET_ATTR_IP_TO]) {
ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to);
if (ret)
@@ -233,7 +205,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
ip_set_mask_from_to(ip, ip_to, cidr);
}
- port_to = port = ntohs(data.port);
+ port_to = port = ntohs(e.port);
if (with_ports && tb[IPSET_ATTR_PORT_TO]) {
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
@@ -246,9 +218,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
- data.ip = htonl(ip);
- data.port = htons(p);
- ret = adtfn(set, &data, timeout, flags);
+ e.ip = htonl(ip);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -259,36 +231,46 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_ipportip_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
+struct hash_ipportip6_elem { /* IPv6 member element without extension fields */
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port; /* network byte order (filled via nla_get_be16()/htons()) */
+ u8 proto;
+ u8 padding;
+};
-/* The type variant functions: IPv6 */
+struct hash_ipportip6t_elem { /* same key fields as hash_ipportip6_elem, plus a timeout */
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 proto;
+ u8 padding;
+ unsigned long timeout; /* per-element timeout value */
+};
-struct hash_ipportip6_elem {
+struct hash_ipportip6c_elem { /* same key fields as hash_ipportip6_elem, plus a counter */
union nf_inet_addr ip;
union nf_inet_addr ip2;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter; /* per-element counter */
};
-struct hash_ipportip6_telem {
+struct hash_ipportip6ct_elem { /* key fields plus both a counter and a timeout */
union nf_inet_addr ip;
union nf_inet_addr ip2;
__be16 port;
u8 proto;
u8 padding;
+ struct ip_set_counter counter; /* per-element counter, placed before the timeout */
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
const struct hash_ipportip6_elem *ip2,
@@ -300,25 +282,6 @@ hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipportip6_data_isnull(const struct hash_ipportip6_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipportip6_data_copy(struct hash_ipportip6_elem *dst,
- const struct hash_ipportip6_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_ipportip6_data_zero_out(struct hash_ipportip6_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_ipportip6_data_list(struct sk_buff *skb,
const struct hash_ipportip6_elem *data)
@@ -334,68 +297,51 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipportip6_data_tlist(struct sk_buff *skb,
- const struct hash_ipportip6_elem *data)
+static inline void /* Record in *next the port of element d */
+hash_ipportip6_data_next(struct hash_ipportip4_elem *next, /* NOTE(review): IPv4 element type used in the IPv6 variant - confirm this is intended */
+ const struct hash_ipportip6_elem *d)
{
- const struct hash_ipportip6_telem *e =
- (const struct hash_ipportip6_telem *)data;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->port = d->port; /* only the port is saved; the retried uadt path reads h->next.port */
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#define MTYPE hash_ipportip6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ipportip6_data_next(struct ip_set_hash *h,
- const struct hash_ipportip6_elem *d)
-{
- h->next.port = d->port;
-}
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt) /* opt is no longer const: &opt->ext is handed to adtfn below */
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportip6_elem data = { };
+ struct hash_ipportip6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); /* extension values for this call, built from skb, opt and h */
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto)) /* fills e.port and e.proto; failure ends the call with -EINVAL */
return -EINVAL;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
- ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
-
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); /* run the handler selected by adt; ext replaces the former timeout argument */
}
static int
hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportip *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportip6_elem data = { };
+ struct hash_ipportip6_elem e = { };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
int ret;
@@ -403,6 +349,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_CIDR]))
return -IPSET_ERR_PROTOCOL;
@@ -410,43 +358,38 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip);
+ ret = ret ? ret : ip_set_get_extensions(set, tb, &ext); /* keep the first error code; '||' would flatten it to 1 */
if (ret)
return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
if (ret)
return ret;
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMPV6))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
- port = ntohs(data.port);
+ port = ntohs(e.port);
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
swap(port, port_to);
@@ -454,8 +397,8 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
port = ntohs(h->next.port);
for (; port <= port_to; port++) {
- data.port = htons(port);
- ret = adtfn(set, &data, timeout, flags);
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -465,78 +408,6 @@ hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-/* Create hash:ip type of sets */
-
-static int
-hash_ipportip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- struct ip_set_hash *h;
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipportip4_tvariant : &hash_ipportip6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_ipportip4_gc_init(set);
- else
- hash_ipportip6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipportip4_variant : &hash_ipportip6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
-}
-
static struct ip_set_type hash_ipportip_type __read_mostly = {
.name = "hash:ip,port,ip",
.protocol = IPSET_PROTOCOL,
@@ -552,6 +423,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
[IPSET_ATTR_PROBES] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -563,6 +435,8 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 10a30b4fc7d..c6a525373be 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,14 +21,14 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
/* 1 SCTP and UDPLITE support added */
/* 2 Range as input support for IPv4 added */
-#define REVISION_MAX 3 /* nomatch flag support added */
+/* 3 nomatch flag support added */
+#define REVISION_MAX 4 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -36,23 +36,19 @@ IP_SET_MODULE_DESC("hash:ip,port,net", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:ip,port,net");
/* Type specific function prefix */
-#define TYPE hash_ipportnet
-
-static bool
-hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b);
-
-#define hash_ipportnet4_same_set hash_ipportnet_same_set
-#define hash_ipportnet6_same_set hash_ipportnet_same_set
-
-/* The type variant functions: IPv4 */
+#define HTYPE hash_ipportnet
/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
* However this way we have to store internally cidr - 1,
* dancing back and forth.
*/
#define IP_SET_HASH_WITH_NETS_PACKED
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
+
+/* IPv4 variants */
-/* Member elements without timeout */
+/* Member elements */
struct hash_ipportnet4_elem {
__be32 ip;
__be32 ip2;
@@ -62,8 +58,7 @@ struct hash_ipportnet4_elem {
u8 proto;
};
-/* Member elements with timeout support */
-struct hash_ipportnet4_telem {
+struct hash_ipportnet4t_elem {
__be32 ip;
__be32 ip2;
__be16 port;
@@ -73,6 +68,29 @@ struct hash_ipportnet4_telem {
unsigned long timeout;
};
+struct hash_ipportnet4c_elem { /* IPv4 ip,port,net element carrying a counter */
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr:7; /* stored as cidr - 1 (data_netmask assigns cidr - 1) */
+ u8 nomatch:1; /* "nomatch" flag squeezed in next to cidr */
+ u8 proto;
+ struct ip_set_counter counter; /* per-element counter */
+};
+
+struct hash_ipportnet4ct_elem { /* IPv4 ip,port,net element carrying a counter and a timeout */
+ __be32 ip;
+ __be32 ip2;
+ __be16 port;
+ u8 cidr:7; /* stored as cidr - 1 (data_netmask assigns cidr - 1) */
+ u8 nomatch:1; /* "nomatch" flag squeezed in next to cidr */
+ u8 proto;
+ struct ip_set_counter counter; /* per-element counter, placed before the timeout */
+ unsigned long timeout; /* per-element timeout value */
+};
+
+/* Common functions */
+
static inline bool
hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
const struct hash_ipportnet4_elem *ip2,
@@ -85,38 +103,22 @@ hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipportnet4_data_isnull(const struct hash_ipportnet4_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipportnet4_data_copy(struct hash_ipportnet4_elem *dst,
- const struct hash_ipportnet4_elem *src)
+static inline int /* Match result for elem: depends only on its nomatch bit */
+hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem)
{
- memcpy(dst, src, sizeof(*dst));
+ return elem->nomatch ? -ENOTEMPTY : 1; /* -ENOTEMPTY for a nomatch element, otherwise 1 */
}
static inline void
-hash_ipportnet4_data_flags(struct hash_ipportnet4_elem *dst, u32 flags)
+hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) /* Set elem->nomatch from the flags word */
{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); /* flag is read from the upper 16 bits, where uadt ORs in IPSET_FLAG_NOMATCH << 16 */
}
static inline void
-hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *dst, u32 *flags)
+hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) /* flags narrowed from u32 * to u8 * */
{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
-static inline int
-hash_ipportnet4_data_match(const struct hash_ipportnet4_elem *elem)
-{
- return elem->nomatch ? -ENOTEMPTY : 1;
+ swap(*flags, elem->nomatch); /* exchanges *flags with the nomatch bit; NOTE(review): nomatch is 1 bit wide, so *flags is presumably 0 or 1 - confirm with callers */
}
static inline void
@@ -126,12 +128,6 @@ hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr)
elem->cidr = cidr - 1;
}
-static inline void
-hash_ipportnet4_data_zero_out(struct hash_ipportnet4_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_ipportnet4_data_list(struct sk_buff *skb,
const struct hash_ipportnet4_elem *data)
@@ -152,81 +148,56 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipportnet4_data_tlist(struct sk_buff *skb,
- const struct hash_ipportnet4_elem *data)
+static inline void /* Record in *next both addresses and the port of element d */
+hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet4_elem *d)
{
- const struct hash_ipportnet4_telem *tdata =
- (const struct hash_ipportnet4_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_ipaddr4(skb, IPSET_ATTR_IP2, tdata->ip2) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = d->ip; /* same three fields the removed helper stored in h->next */
+ next->port = d->port;
+ next->ip2 = d->ip2;
}
-#define IP_SET_HASH_WITH_PROTO
-#define IP_SET_HASH_WITH_NETS
-
+#define MTYPE hash_ipportnet4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ipportnet4_data_next(struct ip_set_hash *h,
- const struct hash_ipportnet4_elem *d)
-{
- h->next.ip = d->ip;
- h->next.port = d->port;
- h->next.ip2 = d->ip2;
-}
+#include "ip_set_hash_gen.h"
static int
hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt) /* opt is no longer const: &opt->ext is handed to adtfn below */
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportnet4_elem data = {
+ struct hash_ipportnet4_elem e = { /* cidr is kept as cidr - 1; falls back to HOST_MASK - 1 when h->nets[0].cidr is 0 */
.cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h); /* extension values for this call, built from skb, opt and h */
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK - 1;
+ e.cidr = HOST_MASK - 1; /* tests always use the full host mask */
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto)) /* fills e.port and e.proto; failure ends the call with -EINVAL */
return -EINVAL;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
- ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2);
- data.ip2 &= ip_set_netmask(data.cidr + 1);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2);
+ e.ip2 &= ip_set_netmask(e.cidr + 1); /* + 1 undoes the cidr - 1 storage before masking the second address */
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); /* run the handler selected by adt; ext replaces the former timeout argument */
}
static int
hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportnet4_elem data = { .cidr = HOST_MASK - 1 };
+ struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 ip, ip_to, p = 0, port, port_to;
u32 ip2_from, ip2_to, ip2_last, ip2;
- u32 timeout = h->timeout;
bool with_ports = false;
u8 cidr;
int ret;
@@ -235,13 +206,16 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -253,46 +227,41 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- data.cidr = cidr - 1;
+ e.cidr = cidr - 1;
}
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMP))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
if (adt == IPSET_TEST ||
!(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports ||
tb[IPSET_ATTR_IP2_TO])) {
- data.ip = htonl(ip);
- data.ip2 = htonl(ip2_from & ip_set_hostmask(data.cidr + 1));
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ e.ip = htonl(ip);
+ e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
ip_to = ip;
@@ -310,7 +279,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
ip_set_mask_from_to(ip, ip_to, cidr);
}
- port_to = port = ntohs(data.port);
+ port_to = port = ntohs(e.port);
if (tb[IPSET_ATTR_PORT_TO]) {
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
@@ -326,28 +295,27 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip2_from, ip2_to);
if (ip2_from + UINT_MAX == ip2_to)
return -IPSET_ERR_HASH_RANGE;
- } else {
- ip_set_mask_from_to(ip2_from, ip2_to, data.cidr + 1);
- }
+ } else
+ ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1);
if (retried)
ip = ntohl(h->next.ip);
for (; !before(ip_to, ip); ip++) {
- data.ip = htonl(ip);
+ e.ip = htonl(ip);
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
- data.port = htons(p);
+ e.port = htons(p);
ip2 = retried
&& ip == ntohl(h->next.ip)
&& p == ntohs(h->next.port)
? ntohl(h->next.ip2) : ip2_from;
while (!after(ip2, ip2_to)) {
- data.ip2 = htonl(ip2);
+ e.ip2 = htonl(ip2);
ip2_last = ip_set_range_to_cidr(ip2, ip2_to,
&cidr);
- data.cidr = cidr - 1;
- ret = adtfn(set, &data, timeout, flags);
+ e.cidr = cidr - 1;
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -360,38 +328,50 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_ipportnet_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
+struct hash_ipportnet6_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+};
-/* The type variant functions: IPv6 */
+struct hash_ipportnet6t_elem {
+ union nf_inet_addr ip;
+ union nf_inet_addr ip2;
+ __be16 port;
+ u8 cidr:7;
+ u8 nomatch:1;
+ u8 proto;
+ unsigned long timeout;
+};
-struct hash_ipportnet6_elem {
+struct hash_ipportnet6c_elem {
union nf_inet_addr ip;
union nf_inet_addr ip2;
__be16 port;
u8 cidr:7;
u8 nomatch:1;
u8 proto;
+ struct ip_set_counter counter;
};
-struct hash_ipportnet6_telem {
+struct hash_ipportnet6ct_elem {
union nf_inet_addr ip;
union nf_inet_addr ip2;
__be16 port;
u8 cidr:7;
u8 nomatch:1;
u8 proto;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
const struct hash_ipportnet6_elem *ip2,
@@ -404,53 +384,22 @@ hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1,
ip1->proto == ip2->proto;
}
-static inline bool
-hash_ipportnet6_data_isnull(const struct hash_ipportnet6_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_ipportnet6_data_copy(struct hash_ipportnet6_elem *dst,
- const struct hash_ipportnet6_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_ipportnet6_data_flags(struct hash_ipportnet6_elem *dst, u32 flags)
-{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
-}
-
-static inline void
-hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *dst, u32 *flags)
-{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
static inline int
-hash_ipportnet6_data_match(const struct hash_ipportnet6_elem *elem)
+hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_ipportnet6_data_zero_out(struct hash_ipportnet6_elem *elem)
+hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags)
{
- elem->proto = 0;
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
static inline void
-ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags)
{
- ip->ip6[0] &= ip_set_netmask6(prefix)[0];
- ip->ip6[1] &= ip_set_netmask6(prefix)[1];
- ip->ip6[2] &= ip_set_netmask6(prefix)[2];
- ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -480,78 +429,58 @@ nla_put_failure:
return 1;
}
-static bool
-hash_ipportnet6_data_tlist(struct sk_buff *skb,
- const struct hash_ipportnet6_elem *data)
+static inline void
+hash_ipportnet6_data_next(struct hash_ipportnet4_elem *next,
+ const struct hash_ipportnet6_elem *d)
{
- const struct hash_ipportnet6_telem *e =
- (const struct hash_ipportnet6_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->port = d->port;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#define MTYPE hash_ipportnet6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_ipportnet6_data_next(struct ip_set_hash *h,
- const struct hash_ipportnet6_elem *d)
-{
- h->next.port = d->port;
-}
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportnet6_elem data = {
+ struct hash_ipportnet6_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK - 1;
+ e.cidr = HOST_MASK - 1;
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto))
return -EINVAL;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
- ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &data.ip2.in6);
- ip6_netmask(&data.ip2, data.cidr + 1);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6);
+ ip6_netmask(&e.ip2, e.cidr + 1);
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_ipportnet *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_ipportnet6_elem data = { .cidr = HOST_MASK - 1 };
+ struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
u8 cidr;
int ret;
@@ -561,6 +490,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES) ||
tb[IPSET_ATTR_IP_TO] ||
tb[IPSET_ATTR_CIDR]))
return -IPSET_ERR_PROTOCOL;
@@ -570,11 +501,12 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &data.ip2);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2);
if (ret)
return ret;
@@ -582,46 +514,41 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]);
if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- data.cidr = cidr - 1;
+ e.cidr = cidr - 1;
}
- ip6_netmask(&data.ip2, data.cidr + 1);
+ ip6_netmask(&e.ip2, e.cidr + 1);
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMPV6))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
- port = ntohs(data.port);
+ port = ntohs(e.port);
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
swap(port, port_to);
@@ -629,8 +556,8 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
port = ntohs(h->next.port);
for (; port <= port_to; port++) {
- data.port = htons(port);
- ret = adtfn(set, &data, timeout, flags);
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -640,81 +567,6 @@ hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-/* Create hash:ip type of sets */
-
-static int
-hash_ipportnet_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- struct ip_set_hash *h;
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h)
- + sizeof(struct ip_set_hash_nets)
- * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipportnet4_tvariant
- : &hash_ipportnet6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_ipportnet4_gc_init(set);
- else
- hash_ipportnet6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_ipportnet4_variant : &hash_ipportnet6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
-}
-
static struct ip_set_type hash_ipportnet_type __read_mostly = {
.name = "hash:ip,port,net",
.protocol = IPSET_PROTOCOL,
@@ -731,6 +583,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
[IPSET_ATTR_PROBES] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -745,6 +598,8 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c
index d6a59154d71..da740ceb56a 100644
--- a/net/netfilter/ipset/ip_set_hash_net.c
+++ b/net/netfilter/ipset/ip_set_hash_net.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,12 +20,12 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
/* 1 Range as input support for IPv4 added */
-#define REVISION_MAX 2 /* nomatch flag support added */
+/* 2 nomatch flag support added */
+#define REVISION_MAX 3 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -33,33 +33,46 @@ IP_SET_MODULE_DESC("hash:net", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:net");
/* Type specific function prefix */
-#define TYPE hash_net
+#define HTYPE hash_net
+#define IP_SET_HASH_WITH_NETS
-static bool
-hash_net_same_set(const struct ip_set *a, const struct ip_set *b);
+/* IPv4 variants */
-#define hash_net4_same_set hash_net_same_set
-#define hash_net6_same_set hash_net_same_set
+/* Member elements */
+struct hash_net4_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
-/* The type variant functions: IPv4 */
+struct hash_net4t_elem {
+ __be32 ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+ unsigned long timeout;
+};
-/* Member elements without timeout */
-struct hash_net4_elem {
+struct hash_net4c_elem {
__be32 ip;
u16 padding0;
u8 nomatch;
u8 cidr;
+ struct ip_set_counter counter;
};
-/* Member elements with timeout support */
-struct hash_net4_telem {
+struct hash_net4ct_elem {
__be32 ip;
u16 padding0;
u8 nomatch;
u8 cidr;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_net4_data_equal(const struct hash_net4_elem *ip1,
const struct hash_net4_elem *ip2,
@@ -69,40 +82,22 @@ hash_net4_data_equal(const struct hash_net4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline bool
-hash_net4_data_isnull(const struct hash_net4_elem *elem)
-{
- return elem->cidr == 0;
-}
-
-static inline void
-hash_net4_data_copy(struct hash_net4_elem *dst,
- const struct hash_net4_elem *src)
+static inline int
+hash_net4_do_data_match(const struct hash_net4_elem *elem)
{
- dst->ip = src->ip;
- dst->cidr = src->cidr;
- dst->nomatch = src->nomatch;
+ return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_net4_data_flags(struct hash_net4_elem *dst, u32 flags)
+hash_net4_data_set_flags(struct hash_net4_elem *elem, u32 flags)
{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
static inline void
-hash_net4_data_reset_flags(struct hash_net4_elem *dst, u32 *flags)
+hash_net4_data_reset_flags(struct hash_net4_elem *elem, u8 *flags)
{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
-static inline int
-hash_net4_data_match(const struct hash_net4_elem *elem)
-{
- return elem->nomatch ? -ENOTEMPTY : 1;
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -112,13 +107,6 @@ hash_net4_data_netmask(struct hash_net4_elem *elem, u8 cidr)
elem->cidr = cidr;
}
-/* Zero CIDR values cannot be stored */
-static inline void
-hash_net4_data_zero_out(struct hash_net4_elem *elem)
-{
- elem->cidr = 0;
-}
-
static bool
hash_net4_data_list(struct sk_buff *skb, const struct hash_net4_elem *data)
{
@@ -135,106 +123,84 @@ nla_put_failure:
return 1;
}
-static bool
-hash_net4_data_tlist(struct sk_buff *skb, const struct hash_net4_elem *data)
+static inline void
+hash_net4_data_next(struct hash_net4_elem *next,
+ const struct hash_net4_elem *d)
{
- const struct hash_net4_telem *tdata =
- (const struct hash_net4_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, tdata->cidr) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = d->ip;
}
-#define IP_SET_HASH_WITH_NETS
-
+#define MTYPE hash_net4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_net4_data_next(struct ip_set_hash *h,
- const struct hash_net4_elem *d)
-{
- h->next.ip = d->ip;
-}
+#include "ip_set_hash_gen.h"
static int
hash_net4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_net4_elem data = {
+ struct hash_net4_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
- if (data.cidr == 0)
+ if (e.cidr == 0)
return -EINVAL;
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK;
+ e.cidr = HOST_MASK;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
- data.ip &= ip_set_netmask(data.cidr);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_net4_elem data = { .cidr = HOST_MASK };
- u32 timeout = h->timeout;
+ struct hash_net4_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 ip = 0, ip_to, last;
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!data.cidr || data.cidr > HOST_MASK)
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (!e.cidr || e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
- data.ip = htonl(ip & ip_set_hostmask(data.cidr));
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
ip_to = ip;
@@ -250,9 +216,9 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
ip = ntohl(h->next.ip);
while (!after(ip, ip_to)) {
- data.ip = htonl(ip);
- last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
- ret = adtfn(set, &data, timeout, flags);
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
else
@@ -262,34 +228,42 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_net_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
+struct hash_net6_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+};
-/* The type variant functions: IPv6 */
+struct hash_net6t_elem {
+ union nf_inet_addr ip;
+ u16 padding0;
+ u8 nomatch;
+ u8 cidr;
+ unsigned long timeout;
+};
-struct hash_net6_elem {
+struct hash_net6c_elem {
union nf_inet_addr ip;
u16 padding0;
u8 nomatch;
u8 cidr;
+ struct ip_set_counter counter;
};
-struct hash_net6_telem {
+struct hash_net6ct_elem {
union nf_inet_addr ip;
u16 padding0;
u8 nomatch;
u8 cidr;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_net6_data_equal(const struct hash_net6_elem *ip1,
const struct hash_net6_elem *ip2,
@@ -299,55 +273,22 @@ hash_net6_data_equal(const struct hash_net6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline bool
-hash_net6_data_isnull(const struct hash_net6_elem *elem)
-{
- return elem->cidr == 0;
-}
-
-static inline void
-hash_net6_data_copy(struct hash_net6_elem *dst,
- const struct hash_net6_elem *src)
-{
- dst->ip.in6 = src->ip.in6;
- dst->cidr = src->cidr;
- dst->nomatch = src->nomatch;
-}
-
-static inline void
-hash_net6_data_flags(struct hash_net6_elem *dst, u32 flags)
-{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
-}
-
-static inline void
-hash_net6_data_reset_flags(struct hash_net6_elem *dst, u32 *flags)
-{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
static inline int
-hash_net6_data_match(const struct hash_net6_elem *elem)
+hash_net6_do_data_match(const struct hash_net6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_net6_data_zero_out(struct hash_net6_elem *elem)
+hash_net6_data_set_flags(struct hash_net6_elem *elem, u32 flags)
{
- elem->cidr = 0;
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
static inline void
-ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+hash_net6_data_reset_flags(struct hash_net6_elem *elem, u8 *flags)
{
- ip->ip6[0] &= ip_set_netmask6(prefix)[0];
- ip->ip6[1] &= ip_set_netmask6(prefix)[1];
- ip->ip6[2] &= ip_set_netmask6(prefix)[2];
- ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -373,74 +314,60 @@ nla_put_failure:
return 1;
}
-static bool
-hash_net6_data_tlist(struct sk_buff *skb, const struct hash_net6_elem *data)
+static inline void
+hash_net6_data_next(struct hash_net4_elem *next,
+ const struct hash_net6_elem *d)
{
- const struct hash_net6_telem *e =
- (const struct hash_net6_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, e->cidr) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#define MTYPE hash_net6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_net6_data_next(struct ip_set_hash *h,
- const struct hash_net6_elem *d)
-{
-}
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_net6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_net6_elem data = {
+ struct hash_net6_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
- if (data.cidr == 0)
+ if (e.cidr == 0)
return -EINVAL;
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK;
+ e.cidr = HOST_MASK;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
- ip6_netmask(&data.ip, data.cidr);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_net *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_net6_elem data = { .cidr = HOST_MASK };
- u32 timeout = h->timeout;
+ struct hash_net6_elem e = { .cidr = HOST_MASK };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
@@ -448,107 +375,29 @@ hash_net6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR])
- data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (!data.cidr || data.cidr > HOST_MASK)
+ if (!e.cidr || e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- ip6_netmask(&data.ip, data.cidr);
+ ip6_netmask(&e.ip, e.cidr);
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
- ret = adtfn(set, &data, timeout, flags);
+ ret = adtfn(set, &e, &ext, &ext, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
-}
-
-/* Create hash:ip type of sets */
-
-static int
-hash_net_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- struct ip_set_hash *h;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h)
- + sizeof(struct ip_set_hash_nets)
- * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_net4_tvariant : &hash_net6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_net4_gc_init(set);
- else
- hash_net6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_net4_variant : &hash_net6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
static struct ip_set_type hash_net_type __read_mostly = {
@@ -566,6 +415,7 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_PROBES] = { .type = NLA_U8 },
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -573,6 +423,8 @@ static struct ip_set_type hash_net_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index f2b0a3c3013..84ae6f6ce62 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2011-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -21,12 +21,12 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
/* 1 nomatch flag support added */
-#define REVISION_MAX 2 /* /0 support added */
+/* 2 /0 support added */
+#define REVISION_MAX 3 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -127,17 +127,14 @@ iface_add(struct rb_root *root, const char **iface)
}
/* Type specific function prefix */
-#define TYPE hash_netiface
-
-static bool
-hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b);
-
-#define hash_netiface4_same_set hash_netiface_same_set
-#define hash_netiface6_same_set hash_netiface_same_set
+#define HTYPE hash_netiface
+#define IP_SET_HASH_WITH_NETS
+#define IP_SET_HASH_WITH_RBTREE
+#define IP_SET_HASH_WITH_MULTI
#define STREQ(a, b) (strcmp(a, b) == 0)
-/* The type variant functions: IPv4 */
+/* IPv4 variants */
struct hash_netiface4_elem_hashed {
__be32 ip;
@@ -147,8 +144,6 @@ struct hash_netiface4_elem_hashed {
u8 elem;
};
-#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
-
/* Member elements without timeout */
struct hash_netiface4_elem {
__be32 ip;
@@ -159,17 +154,39 @@ struct hash_netiface4_elem {
const char *iface;
};
-/* Member elements with timeout support */
-struct hash_netiface4_telem {
+struct hash_netiface4t_elem {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+ unsigned long timeout;
+};
+
+struct hash_netiface4c_elem {
+ __be32 ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+ struct ip_set_counter counter;
+};
+
+struct hash_netiface4ct_elem {
__be32 ip;
u8 physdev;
u8 cidr;
u8 nomatch;
u8 elem;
const char *iface;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
const struct hash_netiface4_elem *ip2,
@@ -182,38 +199,22 @@ hash_netiface4_data_equal(const struct hash_netiface4_elem *ip1,
ip1->iface == ip2->iface;
}
-static inline bool
-hash_netiface4_data_isnull(const struct hash_netiface4_elem *elem)
-{
- return elem->elem == 0;
-}
-
-static inline void
-hash_netiface4_data_copy(struct hash_netiface4_elem *dst,
- const struct hash_netiface4_elem *src)
+static inline int
+hash_netiface4_do_data_match(const struct hash_netiface4_elem *elem)
{
- memcpy(dst, src, sizeof(*dst));
+ return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_netiface4_data_flags(struct hash_netiface4_elem *dst, u32 flags)
+hash_netiface4_data_set_flags(struct hash_netiface4_elem *elem, u32 flags)
{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
static inline void
-hash_netiface4_data_reset_flags(struct hash_netiface4_elem *dst, u32 *flags)
-{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
-static inline int
-hash_netiface4_data_match(const struct hash_netiface4_elem *elem)
+hash_netiface4_data_reset_flags(struct hash_netiface4_elem *elem, u8 *flags)
{
- return elem->nomatch ? -ENOTEMPTY : 1;
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -223,12 +224,6 @@ hash_netiface4_data_netmask(struct hash_netiface4_elem *elem, u8 cidr)
elem->cidr = cidr;
}
-static inline void
-hash_netiface4_data_zero_out(struct hash_netiface4_elem *elem)
-{
- elem->elem = 0;
-}
-
static bool
hash_netiface4_data_list(struct sk_buff *skb,
const struct hash_netiface4_elem *data)
@@ -249,66 +244,40 @@ nla_put_failure:
return 1;
}
-static bool
-hash_netiface4_data_tlist(struct sk_buff *skb,
- const struct hash_netiface4_elem *data)
+static inline void
+hash_netiface4_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface4_elem *d)
{
- const struct hash_netiface4_telem *tdata =
- (const struct hash_netiface4_telem *)data;
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
-
- if (data->nomatch)
- flags |= IPSET_FLAG_NOMATCH;
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
- nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))))
- goto nla_put_failure;
-
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = d->ip;
}
-#define IP_SET_HASH_WITH_NETS
-#define IP_SET_HASH_WITH_RBTREE
-#define IP_SET_HASH_WITH_MULTI
-
+#define MTYPE hash_netiface4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_netiface4_data_next(struct ip_set_hash *h,
- const struct hash_netiface4_elem *d)
-{
- h->next.ip = d->ip;
-}
+#define HKEY_DATALEN sizeof(struct hash_netiface4_elem_hashed)
+#include "ip_set_hash_gen.h"
static int
hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct ip_set_hash *h = set->data;
+ struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netiface4_elem data = {
+ struct hash_netiface4_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK,
.elem = 1,
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
int ret;
- if (data.cidr == 0)
+ if (e.cidr == 0)
return -EINVAL;
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK;
+ e.cidr = HOST_MASK;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
- data.ip &= ip_set_netmask(data.cidr);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr);
#define IFACE(dir) (par->dir ? par->dir->name : NULL)
#define PHYSDEV(dir) (nf_bridge->dir ? nf_bridge->dir->name : NULL)
@@ -320,72 +289,69 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
if (!nf_bridge)
return -EINVAL;
- data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
- data.physdev = 1;
+ e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+ e.physdev = 1;
#else
- data.iface = NULL;
+ e.iface = NULL;
#endif
} else
- data.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
- if (!data.iface)
+ if (!e.iface)
return -EINVAL;
- ret = iface_test(&h->rbtree, &data.iface);
+ ret = iface_test(&h->rbtree, &e.iface);
if (adt == IPSET_ADD) {
if (!ret) {
- ret = iface_add(&h->rbtree, &data.iface);
+ ret = iface_add(&h->rbtree, &e.iface);
if (ret)
return ret;
}
} else if (!ret)
return ret;
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct ip_set_hash *h = set->data;
+ struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netiface4_elem data = { .cidr = HOST_MASK, .elem = 1 };
+ struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 ip = 0, ip_to, last;
- u32 timeout = h->timeout;
char iface[IFNAMSIZ];
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR]) {
- data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (data.cidr > HOST_MASK)
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
}
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- data.iface = iface;
- ret = iface_test(&h->rbtree, &data.iface);
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
if (adt == IPSET_ADD) {
if (!ret) {
- ret = iface_add(&h->rbtree, &data.iface);
+ ret = iface_add(&h->rbtree, &e.iface);
if (ret)
return ret;
}
@@ -395,14 +361,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_PHYSDEV)
- data.physdev = 1;
- if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH))
- flags |= (cadt_flags << 16);
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
if (adt == IPSET_TEST || !tb[IPSET_ATTR_IP_TO]) {
- data.ip = htonl(ip & ip_set_hostmask(data.cidr));
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
if (tb[IPSET_ATTR_IP_TO]) {
@@ -413,16 +380,15 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else {
- ip_set_mask_from_to(ip, ip_to, data.cidr);
- }
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr);
if (retried)
ip = ntohl(h->next.ip);
while (!after(ip, ip_to)) {
- data.ip = htonl(ip);
- last = ip_set_range_to_cidr(ip, ip_to, &data.cidr);
- ret = adtfn(set, &data, timeout, flags);
+ e.ip = htonl(ip);
+ last = ip_set_range_to_cidr(ip, ip_to, &e.cidr);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -433,18 +399,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_netiface_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
-
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
-
-/* The type variant functions: IPv6 */
+/* IPv6 variants */
struct hash_netiface6_elem_hashed {
union nf_inet_addr ip;
@@ -454,8 +409,6 @@ struct hash_netiface6_elem_hashed {
u8 elem;
};
-#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
-
struct hash_netiface6_elem {
union nf_inet_addr ip;
u8 physdev;
@@ -465,16 +418,39 @@ struct hash_netiface6_elem {
const char *iface;
};
-struct hash_netiface6_telem {
+struct hash_netiface6t_elem {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+ unsigned long timeout;
+};
+
+struct hash_netiface6c_elem {
union nf_inet_addr ip;
u8 physdev;
u8 cidr;
u8 nomatch;
u8 elem;
const char *iface;
+ struct ip_set_counter counter;
+};
+
+struct hash_netiface6ct_elem {
+ union nf_inet_addr ip;
+ u8 physdev;
+ u8 cidr;
+ u8 nomatch;
+ u8 elem;
+ const char *iface;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
const struct hash_netiface6_elem *ip2,
@@ -487,53 +463,22 @@ hash_netiface6_data_equal(const struct hash_netiface6_elem *ip1,
ip1->iface == ip2->iface;
}
-static inline bool
-hash_netiface6_data_isnull(const struct hash_netiface6_elem *elem)
-{
- return elem->elem == 0;
-}
-
-static inline void
-hash_netiface6_data_copy(struct hash_netiface6_elem *dst,
- const struct hash_netiface6_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_netiface6_data_flags(struct hash_netiface6_elem *dst, u32 flags)
-{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
-}
-
static inline int
-hash_netiface6_data_match(const struct hash_netiface6_elem *elem)
+hash_netiface6_do_data_match(const struct hash_netiface6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_netiface6_data_reset_flags(struct hash_netiface6_elem *dst, u32 *flags)
-{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
-static inline void
-hash_netiface6_data_zero_out(struct hash_netiface6_elem *elem)
+hash_netiface6_data_set_flags(struct hash_netiface6_elem *elem, u32 flags)
{
- elem->elem = 0;
+ elem->nomatch = (flags >> 16) & IPSET_FLAG_NOMATCH;
}
static inline void
-ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+hash_netiface6_data_reset_flags(struct hash_netiface6_elem *elem, u8 *flags)
{
- ip->ip6[0] &= ip_set_netmask6(prefix)[0];
- ip->ip6[1] &= ip_set_netmask6(prefix)[1];
- ip->ip6[2] &= ip_set_netmask6(prefix)[2];
- ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -563,63 +508,45 @@ nla_put_failure:
return 1;
}
-static bool
-hash_netiface6_data_tlist(struct sk_buff *skb,
- const struct hash_netiface6_elem *data)
+static inline void
+hash_netiface6_data_next(struct hash_netiface4_elem *next,
+ const struct hash_netiface6_elem *d)
{
- const struct hash_netiface6_telem *e =
- (const struct hash_netiface6_telem *)data;
- u32 flags = data->physdev ? IPSET_FLAG_PHYSDEV : 0;
-
- if (data->nomatch)
- flags |= IPSET_FLAG_NOMATCH;
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr) ||
- nla_put_string(skb, IPSET_ATTR_IFACE, data->iface) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#undef HKEY_DATALEN
+#define MTYPE hash_netiface6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_netiface6_data_next(struct ip_set_hash *h,
- const struct hash_netiface6_elem *d)
-{
-}
+#define HKEY_DATALEN sizeof(struct hash_netiface6_elem_hashed)
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- struct ip_set_hash *h = set->data;
+ struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netiface6_elem data = {
+ struct hash_netiface6_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr : HOST_MASK,
.elem = 1,
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
int ret;
- if (data.cidr == 0)
+ if (e.cidr == 0)
return -EINVAL;
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK;
+ e.cidr = HOST_MASK;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
- ip6_netmask(&data.ip, data.cidr);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr);
if (opt->cmdflags & IPSET_FLAG_PHYSDEV) {
#ifdef CONFIG_BRIDGE_NETFILTER
@@ -627,44 +554,46 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
if (!nf_bridge)
return -EINVAL;
- data.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
- data.physdev = 1;
+ e.iface = SRCDIR ? PHYSDEV(physindev) : PHYSDEV(physoutdev);
+ e.physdev = 1;
#else
- data.iface = NULL;
+ e.iface = NULL;
#endif
} else
- data.iface = SRCDIR ? IFACE(in) : IFACE(out);
+ e.iface = SRCDIR ? IFACE(in) : IFACE(out);
- if (!data.iface)
+ if (!e.iface)
return -EINVAL;
- ret = iface_test(&h->rbtree, &data.iface);
+ ret = iface_test(&h->rbtree, &e.iface);
if (adt == IPSET_ADD) {
if (!ret) {
- ret = iface_add(&h->rbtree, &data.iface);
+ ret = iface_add(&h->rbtree, &e.iface);
if (ret)
return ret;
}
} else if (!ret)
return ret;
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- struct ip_set_hash *h = set->data;
+ struct hash_netiface *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netiface6_elem data = { .cidr = HOST_MASK, .elem = 1 };
- u32 timeout = h->timeout;
+ struct hash_netiface6_elem e = { .cidr = HOST_MASK, .elem = 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
char iface[IFNAMSIZ];
int ret;
if (unlikely(!tb[IPSET_ATTR_IP] ||
!tb[IPSET_ATTR_IFACE] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
@@ -672,28 +601,23 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
if (tb[IPSET_ATTR_CIDR])
- data.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
- if (data.cidr > HOST_MASK)
+ e.cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
+ if (e.cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- ip6_netmask(&data.ip, data.cidr);
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ ip6_netmask(&e.ip, e.cidr);
strcpy(iface, nla_data(tb[IPSET_ATTR_IFACE]));
- data.iface = iface;
- ret = iface_test(&h->rbtree, &data.iface);
+ e.iface = iface;
+ ret = iface_test(&h->rbtree, &e.iface);
if (adt == IPSET_ADD) {
if (!ret) {
- ret = iface_add(&h->rbtree, &data.iface);
+ ret = iface_add(&h->rbtree, &e.iface);
if (ret)
return ret;
}
@@ -703,90 +627,15 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_PHYSDEV)
- data.physdev = 1;
- if (adt == IPSET_ADD && (cadt_flags & IPSET_FLAG_NOMATCH))
- flags |= (cadt_flags << 16);
- }
-
- ret = adtfn(set, &data, timeout, flags);
-
- return ip_set_eexist(ret, flags) ? 0 : ret;
-}
-
-/* Create hash:ip type of sets */
-
-static int
-hash_netiface_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- struct ip_set_hash *h;
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
+ e.physdev = 1;
+ if (cadt_flags & IPSET_FLAG_NOMATCH)
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h)
- + sizeof(struct ip_set_hash_nets)
- * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
+ ret = adtfn(set, &e, &ext, &ext, flags);
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
- h->ahash_max = AHASH_MAX_SIZE;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
- h->rbtree = RB_ROOT;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_netiface4_tvariant : &hash_netiface6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_netiface4_gc_init(set);
- else
- hash_netiface6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_netiface4_variant : &hash_netiface6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
static struct ip_set_type hash_netiface_type __read_mostly = {
@@ -806,6 +655,7 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -816,6 +666,8 @@ static struct ip_set_type hash_netiface_type __read_mostly = {
[IPSET_ATTR_CIDR] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 349deb672a2..9a0869853be 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -20,14 +20,14 @@
#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>
#define REVISION_MIN 0
/* 1 SCTP and UDPLITE support added */
/* 2 Range as input support for IPv4 added */
-#define REVISION_MAX 3 /* nomatch flag support added */
+/* 3 nomatch flag support added */
+#define REVISION_MAX 4 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
@@ -35,15 +35,9 @@ IP_SET_MODULE_DESC("hash:net,port", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_hash:net,port");
/* Type specific function prefix */
-#define TYPE hash_netport
-
-static bool
-hash_netport_same_set(const struct ip_set *a, const struct ip_set *b);
-
-#define hash_netport4_same_set hash_netport_same_set
-#define hash_netport6_same_set hash_netport_same_set
-
-/* The type variant functions: IPv4 */
+#define HTYPE hash_netport
+#define IP_SET_HASH_WITH_PROTO
+#define IP_SET_HASH_WITH_NETS
/* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0
* However this way we have to store internally cidr - 1,
@@ -51,7 +45,9 @@ hash_netport_same_set(const struct ip_set *a, const struct ip_set *b);
*/
#define IP_SET_HASH_WITH_NETS_PACKED
-/* Member elements without timeout */
+/* IPv4 variants */
+
+/* Member elements */
struct hash_netport4_elem {
__be32 ip;
__be16 port;
@@ -60,16 +56,36 @@ struct hash_netport4_elem {
u8 nomatch:1;
};
-/* Member elements with timeout support */
-struct hash_netport4_telem {
+struct hash_netport4t_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+ unsigned long timeout;
+};
+
+struct hash_netport4c_elem {
+ __be32 ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+ struct ip_set_counter counter;
+};
+
+struct hash_netport4ct_elem {
__be32 ip;
__be16 port;
u8 proto;
u8 cidr:7;
u8 nomatch:1;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
const struct hash_netport4_elem *ip2,
@@ -81,42 +97,22 @@ hash_netport4_data_equal(const struct hash_netport4_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline bool
-hash_netport4_data_isnull(const struct hash_netport4_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_netport4_data_copy(struct hash_netport4_elem *dst,
- const struct hash_netport4_elem *src)
+static inline int
+hash_netport4_do_data_match(const struct hash_netport4_elem *elem)
{
- dst->ip = src->ip;
- dst->port = src->port;
- dst->proto = src->proto;
- dst->cidr = src->cidr;
- dst->nomatch = src->nomatch;
+ return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_netport4_data_flags(struct hash_netport4_elem *dst, u32 flags)
+hash_netport4_data_set_flags(struct hash_netport4_elem *elem, u32 flags)
{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
static inline void
-hash_netport4_data_reset_flags(struct hash_netport4_elem *dst, u32 *flags)
+hash_netport4_data_reset_flags(struct hash_netport4_elem *elem, u8 *flags)
{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
-static inline int
-hash_netport4_data_match(const struct hash_netport4_elem *elem)
-{
- return elem->nomatch ? -ENOTEMPTY : 1;
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -126,12 +122,6 @@ hash_netport4_data_netmask(struct hash_netport4_elem *elem, u8 cidr)
elem->cidr = cidr - 1;
}
-static inline void
-hash_netport4_data_zero_out(struct hash_netport4_elem *elem)
-{
- elem->proto = 0;
-}
-
static bool
hash_netport4_data_list(struct sk_buff *skb,
const struct hash_netport4_elem *data)
@@ -151,77 +141,53 @@ nla_put_failure:
return 1;
}
-static bool
-hash_netport4_data_tlist(struct sk_buff *skb,
- const struct hash_netport4_elem *data)
+static inline void
+hash_netport4_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport4_elem *d)
{
- const struct hash_netport4_telem *tdata =
- (const struct hash_netport4_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, tdata->ip) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, tdata->port) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(tdata->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->ip = d->ip;
+ next->port = d->port;
}
-#define IP_SET_HASH_WITH_PROTO
-#define IP_SET_HASH_WITH_NETS
-
+#define MTYPE hash_netport4
#define PF 4
#define HOST_MASK 32
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_netport4_data_next(struct ip_set_hash *h,
- const struct hash_netport4_elem *d)
-{
- h->next.ip = d->ip;
- h->next.port = d->port;
-}
+#include "ip_set_hash_gen.h"
static int
hash_netport4_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netport4_elem data = {
+ struct hash_netport4_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK - 1;
+ e.cidr = HOST_MASK - 1;
if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto))
return -EINVAL;
- ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip);
- data.ip &= ip_set_netmask(data.cidr + 1);
+ ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip);
+ e.ip &= ip_set_netmask(e.cidr + 1);
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netport4_elem data = { .cidr = HOST_MASK - 1 };
+ struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 port, port_to, p = 0, ip = 0, ip_to, last;
- u32 timeout = h->timeout;
bool with_ports = false;
u8 cidr;
int ret;
@@ -230,13 +196,16 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip);
+ ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -244,47 +213,42 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- data.cidr = cidr - 1;
+ e.cidr = cidr - 1;
}
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMP))
- data.port = 0;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
+ if (!(with_ports || e.proto == IPPROTO_ICMP))
+ e.port = 0;
with_ports = with_ports && tb[IPSET_ATTR_PORT_TO];
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
if (adt == IPSET_TEST || !(with_ports || tb[IPSET_ATTR_IP_TO])) {
- data.ip = htonl(ip & ip_set_hostmask(data.cidr + 1));
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ e.ip = htonl(ip & ip_set_hostmask(e.cidr + 1));
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
- port = port_to = ntohs(data.port);
+ port = port_to = ntohs(e.port);
if (tb[IPSET_ATTR_PORT_TO]) {
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port_to < port)
@@ -298,21 +262,20 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
swap(ip, ip_to);
if (ip + UINT_MAX == ip_to)
return -IPSET_ERR_HASH_RANGE;
- } else {
- ip_set_mask_from_to(ip, ip_to, data.cidr + 1);
- }
+ } else
+ ip_set_mask_from_to(ip, ip_to, e.cidr + 1);
if (retried)
ip = ntohl(h->next.ip);
while (!after(ip, ip_to)) {
- data.ip = htonl(ip);
+ e.ip = htonl(ip);
last = ip_set_range_to_cidr(ip, ip_to, &cidr);
- data.cidr = cidr - 1;
+ e.cidr = cidr - 1;
p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port)
: port;
for (; p <= port_to; p++) {
- data.port = htons(p);
- ret = adtfn(set, &data, timeout, flags);
+ e.port = htons(p);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -324,36 +287,46 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-static bool
-hash_netport_same_set(const struct ip_set *a, const struct ip_set *b)
-{
- const struct ip_set_hash *x = a->data;
- const struct ip_set_hash *y = b->data;
+/* IPv6 variants */
- /* Resizing changes htable_bits, so we ignore it */
- return x->maxelem == y->maxelem &&
- x->timeout == y->timeout;
-}
+struct hash_netport6_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+};
-/* The type variant functions: IPv6 */
+struct hash_netport6t_elem {
+ union nf_inet_addr ip;
+ __be16 port;
+ u8 proto;
+ u8 cidr:7;
+ u8 nomatch:1;
+ unsigned long timeout;
+};
-struct hash_netport6_elem {
+struct hash_netport6c_elem {
union nf_inet_addr ip;
__be16 port;
u8 proto;
u8 cidr:7;
u8 nomatch:1;
+ struct ip_set_counter counter;
};
-struct hash_netport6_telem {
+struct hash_netport6ct_elem {
union nf_inet_addr ip;
__be16 port;
u8 proto;
u8 cidr:7;
u8 nomatch:1;
+ struct ip_set_counter counter;
unsigned long timeout;
};
+/* Common functions */
+
static inline bool
hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
const struct hash_netport6_elem *ip2,
@@ -365,53 +338,22 @@ hash_netport6_data_equal(const struct hash_netport6_elem *ip1,
ip1->cidr == ip2->cidr;
}
-static inline bool
-hash_netport6_data_isnull(const struct hash_netport6_elem *elem)
-{
- return elem->proto == 0;
-}
-
-static inline void
-hash_netport6_data_copy(struct hash_netport6_elem *dst,
- const struct hash_netport6_elem *src)
-{
- memcpy(dst, src, sizeof(*dst));
-}
-
-static inline void
-hash_netport6_data_flags(struct hash_netport6_elem *dst, u32 flags)
-{
- dst->nomatch = !!(flags & IPSET_FLAG_NOMATCH);
-}
-
-static inline void
-hash_netport6_data_reset_flags(struct hash_netport6_elem *dst, u32 *flags)
-{
- if (dst->nomatch) {
- *flags = IPSET_FLAG_NOMATCH;
- dst->nomatch = 0;
- }
-}
-
static inline int
-hash_netport6_data_match(const struct hash_netport6_elem *elem)
+hash_netport6_do_data_match(const struct hash_netport6_elem *elem)
{
return elem->nomatch ? -ENOTEMPTY : 1;
}
static inline void
-hash_netport6_data_zero_out(struct hash_netport6_elem *elem)
+hash_netport6_data_set_flags(struct hash_netport6_elem *elem, u32 flags)
{
- elem->proto = 0;
+ elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH);
}
static inline void
-ip6_netmask(union nf_inet_addr *ip, u8 prefix)
+hash_netport6_data_reset_flags(struct hash_netport6_elem *elem, u8 *flags)
{
- ip->ip6[0] &= ip_set_netmask6(prefix)[0];
- ip->ip6[1] &= ip_set_netmask6(prefix)[1];
- ip->ip6[2] &= ip_set_netmask6(prefix)[2];
- ip->ip6[3] &= ip_set_netmask6(prefix)[3];
+ swap(*flags, elem->nomatch);
}
static inline void
@@ -440,76 +382,57 @@ nla_put_failure:
return 1;
}
-static bool
-hash_netport6_data_tlist(struct sk_buff *skb,
- const struct hash_netport6_elem *data)
+static inline void
+hash_netport6_data_next(struct hash_netport4_elem *next,
+ const struct hash_netport6_elem *d)
{
- const struct hash_netport6_telem *e =
- (const struct hash_netport6_telem *)data;
- u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0;
-
- if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &e->ip.in6) ||
- nla_put_net16(skb, IPSET_ATTR_PORT, data->port) ||
- nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr + 1) ||
- nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) ||
- nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
- htonl(ip_set_timeout_get(e->timeout))) ||
- (flags &&
- nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags))))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return 1;
+ next->port = d->port;
}
+#undef MTYPE
#undef PF
#undef HOST_MASK
+#define MTYPE hash_netport6
#define PF 6
#define HOST_MASK 128
-#include <linux/netfilter/ipset/ip_set_ahash.h>
-
-static inline void
-hash_netport6_data_next(struct ip_set_hash *h,
- const struct hash_netport6_elem *d)
-{
- h->next.port = d->port;
-}
+#define IP_SET_EMIT_CREATE
+#include "ip_set_hash_gen.h"
static int
hash_netport6_kadt(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netport6_elem data = {
+ struct hash_netport6_elem e = {
.cidr = h->nets[0].cidr ? h->nets[0].cidr - 1 : HOST_MASK - 1,
};
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, h);
if (adt == IPSET_TEST)
- data.cidr = HOST_MASK - 1;
+ e.cidr = HOST_MASK - 1;
if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC,
- &data.port, &data.proto))
+ &e.port, &e.proto))
return -EINVAL;
- ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &data.ip.in6);
- ip6_netmask(&data.ip, data.cidr + 1);
+ ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6);
+ ip6_netmask(&e.ip, e.cidr + 1);
- return adtfn(set, &data, opt_timeout(opt, h), opt->cmdflags);
+ return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
static int
hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
- const struct ip_set_hash *h = set->data;
+ const struct hash_netport *h = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct hash_netport6_elem data = { .cidr = HOST_MASK - 1 };
+ struct hash_netport6_elem e = { .cidr = HOST_MASK - 1 };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(h);
u32 port, port_to;
- u32 timeout = h->timeout;
bool with_ports = false;
u8 cidr;
int ret;
@@ -518,7 +441,9 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (unlikely(tb[IPSET_ATTR_IP_TO]))
return -IPSET_ERR_HASH_RANGE_UNSUPPORTED;
@@ -526,7 +451,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &data.ip);
+ ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip) ||
+ ip_set_get_extensions(set, tb, &ext);
if (ret)
return ret;
@@ -534,45 +460,40 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]);
if (!cidr || cidr > HOST_MASK)
return -IPSET_ERR_INVALID_CIDR;
- data.cidr = cidr - 1;
+ e.cidr = cidr - 1;
}
- ip6_netmask(&data.ip, data.cidr + 1);
+ ip6_netmask(&e.ip, e.cidr + 1);
if (tb[IPSET_ATTR_PORT])
- data.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
+ e.port = nla_get_be16(tb[IPSET_ATTR_PORT]);
else
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_PROTO]) {
- data.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
- with_ports = ip_set_proto_with_ports(data.proto);
+ e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]);
+ with_ports = ip_set_proto_with_ports(e.proto);
- if (data.proto == 0)
+ if (e.proto == 0)
return -IPSET_ERR_INVALID_PROTO;
} else
return -IPSET_ERR_MISSING_PROTO;
- if (!(with_ports || data.proto == IPPROTO_ICMPV6))
- data.port = 0;
+ if (!(with_ports || e.proto == IPPROTO_ICMPV6))
+ e.port = 0;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout(h->timeout))
- return -IPSET_ERR_TIMEOUT;
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
- }
-
- if (tb[IPSET_ATTR_CADT_FLAGS] && adt == IPSET_ADD) {
+ if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_NOMATCH)
- flags |= (cadt_flags << 16);
+ flags |= (IPSET_FLAG_NOMATCH << 16);
}
if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) {
- ret = adtfn(set, &data, timeout, flags);
- return ip_set_eexist(ret, flags) ? 0 : ret;
+ ret = adtfn(set, &e, &ext, &ext, flags);
+ return ip_set_enomatch(ret, flags, adt) ? 1 :
+ ip_set_eexist(ret, flags) ? 0 : ret;
}
- port = ntohs(data.port);
+ port = ntohs(e.port);
port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]);
if (port > port_to)
swap(port, port_to);
@@ -580,8 +501,8 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
if (retried)
port = ntohs(h->next.port);
for (; port <= port_to; port++) {
- data.port = htons(port);
- ret = adtfn(set, &data, timeout, flags);
+ e.port = htons(port);
+ ret = adtfn(set, &e, &ext, &ext, flags);
if (ret && !ip_set_eexist(ret, flags))
return ret;
@@ -591,80 +512,6 @@ hash_netport6_uadt(struct ip_set *set, struct nlattr *tb[],
return ret;
}
-/* Create hash:ip type of sets */
-
-static int
-hash_netport_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
-{
- struct ip_set_hash *h;
- u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
- u8 hbits;
- size_t hsize;
-
- if (!(set->family == NFPROTO_IPV4 || set->family == NFPROTO_IPV6))
- return -IPSET_ERR_INVALID_FAMILY;
-
- if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_HASHSIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_MAXELEM) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
- return -IPSET_ERR_PROTOCOL;
-
- if (tb[IPSET_ATTR_HASHSIZE]) {
- hashsize = ip_set_get_h32(tb[IPSET_ATTR_HASHSIZE]);
- if (hashsize < IPSET_MIMINAL_HASHSIZE)
- hashsize = IPSET_MIMINAL_HASHSIZE;
- }
-
- if (tb[IPSET_ATTR_MAXELEM])
- maxelem = ip_set_get_h32(tb[IPSET_ATTR_MAXELEM]);
-
- h = kzalloc(sizeof(*h)
- + sizeof(struct ip_set_hash_nets)
- * (set->family == NFPROTO_IPV4 ? 32 : 128), GFP_KERNEL);
- if (!h)
- return -ENOMEM;
-
- h->maxelem = maxelem;
- get_random_bytes(&h->initval, sizeof(h->initval));
- h->timeout = IPSET_NO_TIMEOUT;
-
- hbits = htable_bits(hashsize);
- hsize = htable_size(hbits);
- if (hsize == 0) {
- kfree(h);
- return -ENOMEM;
- }
- h->table = ip_set_alloc(hsize);
- if (!h->table) {
- kfree(h);
- return -ENOMEM;
- }
- h->table->htable_bits = hbits;
-
- set->data = h;
-
- if (tb[IPSET_ATTR_TIMEOUT]) {
- h->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
-
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_netport4_tvariant : &hash_netport6_tvariant;
-
- if (set->family == NFPROTO_IPV4)
- hash_netport4_gc_init(set);
- else
- hash_netport6_gc_init(set);
- } else {
- set->variant = set->family == NFPROTO_IPV4
- ? &hash_netport4_variant : &hash_netport6_variant;
- }
-
- pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
- set->name, jhash_size(h->table->htable_bits),
- h->table->htable_bits, h->maxelem, set->data, h->table);
-
- return 0;
-}
-
static struct ip_set_type hash_netport_type __read_mostly = {
.name = "hash:net,port",
.protocol = IPSET_PROTOCOL,
@@ -681,6 +528,7 @@ static struct ip_set_type hash_netport_type __read_mostly = {
[IPSET_ATTR_RESIZE] = { .type = NLA_U8 },
[IPSET_ATTR_PROTO] = { .type = NLA_U8 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_IP] = { .type = NLA_NESTED },
@@ -692,6 +540,8 @@ static struct ip_set_type hash_netport_type __read_mostly = {
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 09c744aa898..979b8c90e42 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2008-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+/* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -13,30 +13,53 @@
#include <linux/errno.h>
#include <linux/netfilter/ipset/ip_set.h>
-#include <linux/netfilter/ipset/ip_set_timeout.h>
#include <linux/netfilter/ipset/ip_set_list.h>
#define REVISION_MIN 0
-#define REVISION_MAX 0
+#define REVISION_MAX 1 /* Counters support added */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
IP_SET_MODULE_DESC("list:set", REVISION_MIN, REVISION_MAX);
MODULE_ALIAS("ip_set_list:set");
-/* Member elements without and with timeout */
+/* Member elements */
struct set_elem {
ip_set_id_t id;
};
-struct set_telem {
- ip_set_id_t id;
+struct sett_elem {
+ struct {
+ ip_set_id_t id;
+ } __attribute__ ((aligned));
+ unsigned long timeout;
+};
+
+struct setc_elem {
+ struct {
+ ip_set_id_t id;
+ } __attribute__ ((aligned));
+ struct ip_set_counter counter;
+};
+
+struct setct_elem {
+ struct {
+ ip_set_id_t id;
+ } __attribute__ ((aligned));
+ struct ip_set_counter counter;
unsigned long timeout;
};
+struct set_adt_elem {
+ ip_set_id_t id;
+ ip_set_id_t refid;
+ int before;
+};
+
/* Type structure */
struct list_set {
size_t dsize; /* element size */
+ size_t offset[IPSET_OFFSET_MAX]; /* Offsets to extensions */
u32 size; /* size of set list array */
u32 timeout; /* timeout value */
struct timer_list gc; /* garbage collection */
@@ -49,179 +72,311 @@ list_set_elem(const struct list_set *map, u32 id)
return (struct set_elem *)((void *)map->members + id * map->dsize);
}
-static inline struct set_telem *
-list_set_telem(const struct list_set *map, u32 id)
-{
- return (struct set_telem *)((void *)map->members + id * map->dsize);
-}
+#define ext_timeout(e, m) \
+(unsigned long *)((void *)(e) + (m)->offset[IPSET_OFFSET_TIMEOUT])
+#define ext_counter(e, m) \
+(struct ip_set_counter *)((void *)(e) + (m)->offset[IPSET_OFFSET_COUNTER])
-static inline bool
-list_set_timeout(const struct list_set *map, u32 id)
+static int
+list_set_ktest(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
{
- const struct set_telem *elem = list_set_telem(map, id);
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i, cmdflags = opt->cmdflags;
+ int ret;
- return ip_set_timeout_test(elem->timeout);
+ /* Don't lookup sub-counters at all */
+ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS;
+ if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE)
+ opt->cmdflags &= ~IPSET_FLAG_SKIP_COUNTER_UPDATE;
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ continue;
+ ret = ip_set_test(e->id, skb, par, opt);
+ if (ret > 0) {
+ if (SET_WITH_COUNTER(set))
+ ip_set_update_counter(ext_counter(e, map),
+ ext, &opt->ext,
+ cmdflags);
+ return ret;
+ }
+ }
+ return 0;
}
-static inline bool
-list_set_expired(const struct list_set *map, u32 id)
+static int
+list_set_kadd(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
{
- const struct set_telem *elem = list_set_telem(map, id);
+ struct list_set *map = set->data;
+ struct set_elem *e;
+ u32 i;
+ int ret;
- return ip_set_timeout_expired(elem->timeout);
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return 0;
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ continue;
+ ret = ip_set_add(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
}
-/* Set list without and with timeout */
-
static int
-list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+list_set_kdel(struct ip_set *set, const struct sk_buff *skb,
const struct xt_action_param *par,
- enum ipset_adt adt, const struct ip_set_adt_opt *opt)
+ struct ip_set_adt_opt *opt, const struct ip_set_ext *ext)
{
struct list_set *map = set->data;
- struct set_elem *elem;
+ struct set_elem *e;
u32 i;
int ret;
for (i = 0; i < map->size; i++) {
- elem = list_set_elem(map, i);
- if (elem->id == IPSET_INVALID_ID)
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
return 0;
- if (with_timeout(map->timeout) && list_set_expired(map, i))
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
continue;
- switch (adt) {
- case IPSET_TEST:
- ret = ip_set_test(elem->id, skb, par, opt);
- if (ret > 0)
- return ret;
- break;
- case IPSET_ADD:
- ret = ip_set_add(elem->id, skb, par, opt);
- if (ret == 0)
- return ret;
- break;
- case IPSET_DEL:
- ret = ip_set_del(elem->id, skb, par, opt);
- if (ret == 0)
- return ret;
- break;
- default:
- break;
- }
+ ret = ip_set_del(e->id, skb, par, opt);
+ if (ret == 0)
+ return ret;
+ }
+ return 0;
+}
+
+static int
+list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
+ const struct xt_action_param *par,
+ enum ipset_adt adt, struct ip_set_adt_opt *opt)
+{
+ struct list_set *map = set->data;
+ struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, map);
+
+ switch (adt) {
+ case IPSET_TEST:
+ return list_set_ktest(set, skb, par, opt, &ext);
+ case IPSET_ADD:
+ return list_set_kadd(set, skb, par, opt, &ext);
+ case IPSET_DEL:
+ return list_set_kdel(set, skb, par, opt, &ext);
+ default:
+ break;
}
return -EINVAL;
}
static bool
-id_eq(const struct list_set *map, u32 i, ip_set_id_t id)
+id_eq(const struct ip_set *set, u32 i, ip_set_id_t id)
{
- const struct set_elem *elem;
+ const struct list_set *map = set->data;
+ const struct set_elem *e;
- if (i < map->size) {
- elem = list_set_elem(map, i);
- return elem->id == id;
+ if (i >= map->size)
+ return 0;
+
+ e = list_set_elem(map, i);
+ return !!(e->id == id &&
+ !(SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map))));
+}
+
+static int
+list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
+ const struct ip_set_ext *ext)
+{
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(map, i);
+
+ if (e->id != IPSET_INVALID_ID) {
+ if (i == map->size - 1)
+ /* Last element replaced: e.g. add new,before,last */
+ ip_set_put_byindex(e->id);
+ else {
+ struct set_elem *x = list_set_elem(map, map->size - 1);
+
+ /* Last element pushed off */
+ if (x->id != IPSET_INVALID_ID)
+ ip_set_put_byindex(x->id);
+ memmove(list_set_elem(map, i + 1), e,
+ map->dsize * (map->size - (i + 1)));
+ }
}
+ e->id = d->id;
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, map), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, map), ext);
return 0;
}
-static bool
-id_eq_timeout(const struct list_set *map, u32 i, ip_set_id_t id)
+static int
+list_set_del(struct ip_set *set, u32 i)
{
- const struct set_elem *elem;
+ struct list_set *map = set->data;
+ struct set_elem *e = list_set_elem(map, i);
- if (i < map->size) {
- elem = list_set_elem(map, i);
- return !!(elem->id == id &&
- !(with_timeout(map->timeout) &&
- list_set_expired(map, i)));
- }
+ ip_set_put_byindex(e->id);
+ if (i < map->size - 1)
+ memmove(e, list_set_elem(map, i + 1),
+ map->dsize * (map->size - (i + 1)));
+
+ /* Last element */
+ e = list_set_elem(map, map->size - 1);
+ e->id = IPSET_INVALID_ID;
return 0;
}
static void
-list_elem_add(struct list_set *map, u32 i, ip_set_id_t id)
+set_cleanup_entries(struct ip_set *set)
{
+ struct list_set *map = set->data;
struct set_elem *e;
+ u32 i;
- for (; i < map->size; i++) {
+ for (i = map->size; i-- > 0; ) { /* tail to head: list_set_del() shifts the tail down into slot i, a forward walk would skip the moved element */
e = list_set_elem(map, i);
- swap(e->id, id);
- if (e->id == IPSET_INVALID_ID)
- break;
+ if (e->id != IPSET_INVALID_ID &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ list_set_del(set, i);
}
}
-static void
-list_elem_tadd(struct list_set *map, u32 i, ip_set_id_t id,
- unsigned long timeout)
+static int
+list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
{
- struct set_telem *e;
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ u32 i;
+ int ret;
- for (; i < map->size; i++) {
- e = list_set_telem(map, i);
- swap(e->id, id);
- swap(e->timeout, timeout);
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(map, i);
if (e->id == IPSET_INVALID_ID)
- break;
+ return 0;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return 1;
+ else if (d->before > 0)
+ ret = id_eq(set, i + 1, d->refid);
+ else
+ ret = i > 0 && id_eq(set, i - 1, d->refid);
+ return ret;
}
+ return 0;
}
+
static int
-list_set_add(struct list_set *map, u32 i, ip_set_id_t id,
- unsigned long timeout)
+list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
{
- const struct set_elem *e = list_set_elem(map, i);
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
+ bool flag_exist = flags & IPSET_FLAG_EXIST;
+ u32 i, ret = 0;
- if (e->id != IPSET_INVALID_ID) {
- const struct set_elem *x = list_set_elem(map, map->size - 1);
+ /* Check already added element; d->before > 0 means "before refid", < 0 "after refid", 0 no reference (as in list_set_utest/udel) */
+ for (i = 0; i < map->size; i++) {
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ goto insert;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ continue;
+ else if (e->id != d->id)
+ continue;
- /* Last element replaced or pushed off */
- if (x->id != IPSET_INVALID_ID)
- ip_set_put_byindex(x->id);
+ if ((d->before > 0 && !id_eq(set, i + 1, d->refid)) ||
+ (d->before < 0 &&
+ (i == 0 || !id_eq(set, i - 1, d->refid))))
+ /* Before/after doesn't match */
+ return -IPSET_ERR_REF_EXIST;
+ if (!flag_exist)
+ /* Can't re-add */
+ return -IPSET_ERR_EXIST;
+ /* Update extensions */
+ if (SET_WITH_TIMEOUT(set))
+ ip_set_timeout_set(ext_timeout(e, map), ext->timeout);
+ if (SET_WITH_COUNTER(set))
+ ip_set_init_counter(ext_counter(e, map), ext);
+ /* Set is already added to the list */
+ ip_set_put_byindex(d->id);
+ return 0;
+ }
+insert:
+ ret = -IPSET_ERR_LIST_FULL;
+ for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ ret = d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : list_set_add(set, i, d, ext);
+ else if (e->id != d->refid)
+ continue;
+ else if (d->before > 0)
+ ret = list_set_add(set, i, d, ext);
+ else if (i + 1 < map->size)
+ ret = list_set_add(set, i + 1, d, ext);
}
- if (with_timeout(map->timeout))
- list_elem_tadd(map, i, id, ip_set_timeout_set(timeout));
- else
- list_elem_add(map, i, id);
- return 0;
+ return ret;
}
static int
-list_set_del(struct list_set *map, u32 i)
+list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext,
+ struct ip_set_ext *mext, u32 flags)
{
- struct set_elem *a = list_set_elem(map, i), *b;
-
- ip_set_put_byindex(a->id);
-
- for (; i < map->size - 1; i++) {
- b = list_set_elem(map, i + 1);
- a->id = b->id;
- if (with_timeout(map->timeout))
- ((struct set_telem *)a)->timeout =
- ((struct set_telem *)b)->timeout;
- a = b;
- if (a->id == IPSET_INVALID_ID)
- break;
- }
- /* Last element */
- a->id = IPSET_INVALID_ID;
- return 0;
-}
-
-static void
-cleanup_entries(struct list_set *map)
-{
- struct set_telem *e;
+ struct list_set *map = set->data;
+ struct set_adt_elem *d = value;
+ struct set_elem *e;
u32 i;
for (i = 0; i < map->size; i++) {
- e = list_set_telem(map, i);
- if (e->id != IPSET_INVALID_ID && list_set_expired(map, i))
- list_set_del(map, i);
+ e = list_set_elem(map, i);
+ if (e->id == IPSET_INVALID_ID)
+ return d->before != 0 ? -IPSET_ERR_REF_EXIST
+ : -IPSET_ERR_EXIST;
+ else if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
+ continue;
+ else if (e->id != d->id)
+ continue;
+
+ if (d->before == 0)
+ return list_set_del(set, i);
+ else if (d->before > 0) {
+ if (!id_eq(set, i + 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ return list_set_del(set, i);
+ } else if (i == 0 || !id_eq(set, i - 1, d->refid))
+ return -IPSET_ERR_REF_EXIST;
+ else
+ return list_set_del(set, i);
}
+ return -IPSET_ERR_EXIST;
}
static int
@@ -229,26 +384,27 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
enum ipset_adt adt, u32 *lineno, u32 flags, bool retried)
{
struct list_set *map = set->data;
- bool with_timeout = with_timeout(map->timeout);
- bool flag_exist = flags & IPSET_FLAG_EXIST;
- int before = 0;
- u32 timeout = map->timeout;
- ip_set_id_t id, refid = IPSET_INVALID_ID;
- const struct set_elem *elem;
+ ipset_adtfn adtfn = set->variant->adt[adt];
+ struct set_adt_elem e = { .refid = IPSET_INVALID_ID };
+ struct ip_set_ext ext = IP_SET_INIT_UEXT(map);
struct ip_set *s;
- u32 i;
int ret = 0;
if (unlikely(!tb[IPSET_ATTR_NAME] ||
!ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_PACKETS) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_BYTES)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
- if (id == IPSET_INVALID_ID)
+ ret = ip_set_get_extensions(set, tb, &ext);
+ if (ret)
+ return ret;
+ e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+ if (e.id == IPSET_INVALID_ID)
return -IPSET_ERR_NAME;
/* "Loop detection" */
if (s->type->features & IPSET_TYPE_NAME) {
@@ -258,115 +414,34 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_CADT_FLAGS]) {
u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
- before = f & IPSET_FLAG_BEFORE;
+ e.before = f & IPSET_FLAG_BEFORE;
}
- if (before && !tb[IPSET_ATTR_NAMEREF]) {
+ if (e.before && !tb[IPSET_ATTR_NAMEREF]) {
ret = -IPSET_ERR_BEFORE;
goto finish;
}
if (tb[IPSET_ATTR_NAMEREF]) {
- refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
- &s);
- if (refid == IPSET_INVALID_ID) {
+ e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+ &s);
+ if (e.refid == IPSET_INVALID_ID) {
ret = -IPSET_ERR_NAMEREF;
goto finish;
}
- if (!before)
- before = -1;
- }
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!with_timeout) {
- ret = -IPSET_ERR_TIMEOUT;
- goto finish;
- }
- timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ if (!e.before)
+ e.before = -1;
}
- if (with_timeout && adt != IPSET_TEST)
- cleanup_entries(map);
+ if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set))
+ set_cleanup_entries(set);
- switch (adt) {
- case IPSET_TEST:
- for (i = 0; i < map->size && !ret; i++) {
- elem = list_set_elem(map, i);
- if (elem->id == IPSET_INVALID_ID ||
- (before != 0 && i + 1 >= map->size))
- break;
- else if (with_timeout && list_set_expired(map, i))
- continue;
- else if (before > 0 && elem->id == id)
- ret = id_eq_timeout(map, i + 1, refid);
- else if (before < 0 && elem->id == refid)
- ret = id_eq_timeout(map, i + 1, id);
- else if (before == 0 && elem->id == id)
- ret = 1;
- }
- break;
- case IPSET_ADD:
- for (i = 0; i < map->size; i++) {
- elem = list_set_elem(map, i);
- if (elem->id != id)
- continue;
- if (!(with_timeout && flag_exist)) {
- ret = -IPSET_ERR_EXIST;
- goto finish;
- } else {
- struct set_telem *e = list_set_telem(map, i);
-
- if ((before > 1 &&
- !id_eq(map, i + 1, refid)) ||
- (before < 0 &&
- (i == 0 || !id_eq(map, i - 1, refid)))) {
- ret = -IPSET_ERR_EXIST;
- goto finish;
- }
- e->timeout = ip_set_timeout_set(timeout);
- ip_set_put_byindex(id);
- ret = 0;
- goto finish;
- }
- }
- ret = -IPSET_ERR_LIST_FULL;
- for (i = 0; i < map->size && ret == -IPSET_ERR_LIST_FULL; i++) {
- elem = list_set_elem(map, i);
- if (elem->id == IPSET_INVALID_ID)
- ret = before != 0 ? -IPSET_ERR_REF_EXIST
- : list_set_add(map, i, id, timeout);
- else if (elem->id != refid)
- continue;
- else if (before > 0)
- ret = list_set_add(map, i, id, timeout);
- else if (i + 1 < map->size)
- ret = list_set_add(map, i + 1, id, timeout);
- }
- break;
- case IPSET_DEL:
- ret = -IPSET_ERR_EXIST;
- for (i = 0; i < map->size && ret == -IPSET_ERR_EXIST; i++) {
- elem = list_set_elem(map, i);
- if (elem->id == IPSET_INVALID_ID) {
- ret = before != 0 ? -IPSET_ERR_REF_EXIST
- : -IPSET_ERR_EXIST;
- break;
- } else if (elem->id == id &&
- (before == 0 ||
- (before > 0 && id_eq(map, i + 1, refid))))
- ret = list_set_del(map, i);
- else if (elem->id == refid &&
- before < 0 && id_eq(map, i + 1, id))
- ret = list_set_del(map, i + 1);
- }
- break;
- default:
- break;
- }
+ ret = adtfn(set, &e, &ext, &ext, flags);
finish:
- if (refid != IPSET_INVALID_ID)
- ip_set_put_byindex(refid);
+ if (e.refid != IPSET_INVALID_ID)
+ ip_set_put_byindex(e.refid);
if (adt != IPSET_ADD || ret)
- ip_set_put_byindex(id);
+ ip_set_put_byindex(e.id);
return ip_set_eexist(ret, flags) ? 0 : ret;
}
@@ -375,14 +450,14 @@ static void
list_set_flush(struct ip_set *set)
{
struct list_set *map = set->data;
- struct set_elem *elem;
+ struct set_elem *e;
u32 i;
for (i = 0; i < map->size; i++) {
- elem = list_set_elem(map, i);
- if (elem->id != IPSET_INVALID_ID) {
- ip_set_put_byindex(elem->id);
- elem->id = IPSET_INVALID_ID;
+ e = list_set_elem(map, i);
+ if (e->id != IPSET_INVALID_ID) {
+ ip_set_put_byindex(e->id);
+ e->id = IPSET_INVALID_ID;
}
}
}
@@ -392,7 +467,7 @@ list_set_destroy(struct ip_set *set)
{
struct list_set *map = set->data;
- if (with_timeout(map->timeout))
+ if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
list_set_flush(set);
kfree(map);
@@ -410,8 +485,11 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
- (with_timeout(map->timeout) &&
+ (SET_WITH_TIMEOUT(set) &&
nla_put_net32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout))) ||
+ (SET_WITH_COUNTER(set) &&
+ nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS,
+ htonl(IPSET_FLAG_WITH_COUNTERS))) ||
nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
htonl(sizeof(*map) + map->size * map->dsize)))
@@ -440,7 +518,8 @@ list_set_list(const struct ip_set *set,
e = list_set_elem(map, i);
if (e->id == IPSET_INVALID_ID)
goto finish;
- if (with_timeout(map->timeout) && list_set_expired(map, i))
+ if (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, map)))
continue;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested) {
@@ -453,13 +532,14 @@ list_set_list(const struct ip_set *set,
if (nla_put_string(skb, IPSET_ATTR_NAME,
ip_set_name_byindex(e->id)))
goto nla_put_failure;
- if (with_timeout(map->timeout)) {
- const struct set_telem *te =
- (const struct set_telem *) e;
- __be32 to = htonl(ip_set_timeout_get(te->timeout));
- if (nla_put_net32(skb, IPSET_ATTR_TIMEOUT, to))
- goto nla_put_failure;
- }
+ if (SET_WITH_TIMEOUT(set) &&
+ nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
+ htonl(ip_set_timeout_get(
+ ext_timeout(e, map)))))
+ goto nla_put_failure;
+ if (SET_WITH_COUNTER(set) &&
+ ip_set_put_counter(skb, ext_counter(e, map)))
+ goto nla_put_failure;
ipset_nest_end(skb, nested);
}
finish:
@@ -485,12 +565,18 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
const struct list_set *y = b->data;
return x->size == y->size &&
- x->timeout == y->timeout;
+ x->timeout == y->timeout &&
+ a->extensions == b->extensions;
}
-static const struct ip_set_type_variant list_set = {
+static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt,
.uadt = list_set_uadt,
+ .adt = {
+ [IPSET_ADD] = list_set_uadd,
+ [IPSET_DEL] = list_set_udel,
+ [IPSET_TEST] = list_set_utest,
+ },
.destroy = list_set_destroy,
.flush = list_set_flush,
.head = list_set_head,
@@ -505,7 +591,7 @@ list_set_gc(unsigned long ul_set)
struct list_set *map = set->data;
write_lock_bh(&set->lock);
- cleanup_entries(map);
+ set_cleanup_entries(set);
write_unlock_bh(&set->lock);
map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
@@ -513,20 +599,20 @@ list_set_gc(unsigned long ul_set)
}
static void
-list_set_gc_init(struct ip_set *set)
+list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
{
struct list_set *map = set->data;
init_timer(&map->gc);
map->gc.data = (unsigned long) set;
- map->gc.function = list_set_gc;
+ map->gc.function = gc;
map->gc.expires = jiffies + IPSET_GC_PERIOD(map->timeout) * HZ;
add_timer(&map->gc);
}
/* Create list:set type of sets */
-static bool
+static struct list_set *
init_list_set(struct ip_set *set, u32 size, size_t dsize,
unsigned long timeout)
{
@@ -536,7 +622,7 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
map = kzalloc(sizeof(*map) + size * dsize, GFP_KERNEL);
if (!map)
- return false;
+ return NULL;
map->size = size;
map->dsize = dsize;
@@ -548,16 +634,19 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
e->id = IPSET_INVALID_ID;
}
- return true;
+ return map;
}
static int
list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
{
- u32 size = IP_SET_LIST_DEFAULT_SIZE;
+ struct list_set *map;
+ u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0;
+ unsigned long timeout = 0;
if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) ||
- !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT)))
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) ||
+ !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS)))
return -IPSET_ERR_PROTOCOL;
if (tb[IPSET_ATTR_SIZE])
@@ -565,18 +654,46 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
if (size < IP_SET_LIST_MIN_SIZE)
size = IP_SET_LIST_MIN_SIZE;
- if (tb[IPSET_ATTR_TIMEOUT]) {
- if (!init_list_set(set, size, sizeof(struct set_telem),
- ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT])))
+ if (tb[IPSET_ATTR_CADT_FLAGS])
+ cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
+ if (tb[IPSET_ATTR_TIMEOUT])
+ timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]);
+ set->variant = &set_variant;
+ if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
+ set->extensions |= IPSET_EXT_COUNTER;
+ if (tb[IPSET_ATTR_TIMEOUT]) {
+ map = init_list_set(set, size,
+ sizeof(struct setct_elem), timeout);
+ if (!map)
+ return -ENOMEM;
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct setct_elem, timeout);
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct setct_elem, counter);
+ list_set_gc_init(set, list_set_gc);
+ } else {
+ map = init_list_set(set, size,
+ sizeof(struct setc_elem), 0);
+ if (!map)
+ return -ENOMEM;
+ map->offset[IPSET_OFFSET_COUNTER] =
+ offsetof(struct setc_elem, counter);
+ }
+ } else if (tb[IPSET_ATTR_TIMEOUT]) {
+ map = init_list_set(set, size,
+ sizeof(struct sett_elem), timeout);
+ if (!map)
return -ENOMEM;
-
- list_set_gc_init(set);
+ set->extensions |= IPSET_EXT_TIMEOUT;
+ map->offset[IPSET_OFFSET_TIMEOUT] =
+ offsetof(struct sett_elem, timeout);
+ list_set_gc_init(set, list_set_gc);
} else {
- if (!init_list_set(set, size, sizeof(struct set_elem),
- IPSET_NO_TIMEOUT))
+ map = init_list_set(set, size, sizeof(struct set_elem), 0);
+ if (!map)
return -ENOMEM;
}
- set->variant = &list_set;
return 0;
}
@@ -592,6 +709,7 @@ static struct ip_set_type list_set_type __read_mostly = {
.create_policy = {
[IPSET_ATTR_SIZE] = { .type = NLA_U32 },
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
},
.adt_policy = {
[IPSET_ATTR_NAME] = { .type = NLA_STRING,
@@ -601,6 +719,8 @@ static struct ip_set_type list_set_type __read_mostly = {
[IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 },
[IPSET_ATTR_LINENO] = { .type = NLA_U32 },
[IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 },
+ [IPSET_ATTR_BYTES] = { .type = NLA_U64 },
+ [IPSET_ATTR_PACKETS] = { .type = NLA_U64 },
},
.me = THIS_MODULE,
};
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0b779d7df88..dfd7b65b3d2 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -58,6 +58,18 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
module_put(app->module);
}
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
/*
* Allocate/initialize app incarnation and register it in proto apps.
@@ -106,8 +118,7 @@ ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
return 0;
out:
- kfree(inc->timeout_table);
- kfree(inc);
+ ip_vs_app_inc_destroy(inc);
return ret;
}
@@ -131,8 +142,7 @@ ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
list_del(&inc->a_list);
- kfree(inc->timeout_table);
- kfree(inc);
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
}
@@ -144,9 +154,9 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
{
int result;
- atomic_inc(&inc->usecnt);
- if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
- atomic_dec(&inc->usecnt);
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
return result;
}
@@ -156,8 +166,8 @@ int ip_vs_app_inc_get(struct ip_vs_app *inc)
*/
void ip_vs_app_inc_put(struct ip_vs_app *inc)
{
- ip_vs_app_put(inc->app);
atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
}
@@ -218,6 +228,7 @@ out_unlock:
/*
* ip_vs_app unregistration routine
* We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
*/
void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
{
@@ -341,14 +352,14 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
unsigned int flag, __u32 seq, int diff)
{
/* spinlock is to keep updating cp->flags atomic */
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
vseq->previous_delta = vseq->delta;
vseq->delta += diff;
vseq->init_seq = seq;
cp->flags |= flag;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 704e514e02a..a083bda322b 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -79,51 +79,21 @@ static unsigned int ip_vs_conn_rnd __read_mostly;
struct ip_vs_aligned_lock
{
- rwlock_t l;
+ spinlock_t l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
-static inline void ct_read_lock(unsigned int key)
-{
- read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock(unsigned int key)
-{
- read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_lock(unsigned int key)
-{
- write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_write_unlock(unsigned int key)
-{
- write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_lock_bh(unsigned int key)
-{
- read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
-static inline void ct_read_unlock_bh(unsigned int key)
-{
- read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
-}
-
static inline void ct_write_lock_bh(unsigned int key)
{
- write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static inline void ct_write_unlock_bh(unsigned int key)
{
- write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
@@ -197,13 +167,13 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/* Hash by protocol, client address and port */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
- hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
cp->flags |= IP_VS_CONN_F_HASHED;
atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
ret = 1;
} else {
pr_err("%s(): request for already hashed, called from %pF\n",
@@ -212,7 +182,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
}
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -220,7 +190,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
/*
* UNhashes ip_vs_conn from ip_vs_conn_tab.
- * returns bool success.
+ * returns bool success. Caller should hold conn reference.
*/
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
@@ -230,11 +200,11 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
/* unhash it and decrease its reference counter */
hash = ip_vs_conn_hashkey_conn(cp);
- ct_write_lock(hash);
+ ct_write_lock_bh(hash);
spin_lock(&cp->lock);
if (cp->flags & IP_VS_CONN_F_HASHED) {
- hlist_del(&cp->c_list);
+ hlist_del_rcu(&cp->c_list);
cp->flags &= ~IP_VS_CONN_F_HASHED;
atomic_dec(&cp->refcnt);
ret = 1;
@@ -242,7 +212,37 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
ret = 0;
spin_unlock(&cp->lock);
- ct_write_unlock(hash);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are the last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
return ret;
}
@@ -262,24 +262,25 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
return cp;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
return NULL;
}
@@ -346,14 +347,16 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
hash = ip_vs_conn_hashkey_param(p, false);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (!ip_vs_conn_net_eq(cp, p->net))
- continue;
- if (p->pe_data && p->pe->ct_match) {
- if (p->pe == cp->pe && p->pe->ct_match(p, cp))
- goto out;
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
continue;
}
@@ -363,17 +366,18 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
* p->vaddr is a fwmark */
ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
p->af, p->vaddr, &cp->vaddr) &&
- p->cport == cp->cport && p->vport == cp->vport &&
+ p->vport == cp->vport && p->cport == cp->cport &&
cp->flags & IP_VS_CONN_F_TEMPLATE &&
- p->protocol == cp->protocol)
- goto out;
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
}
cp = NULL;
out:
- if (cp)
- atomic_inc(&cp->refcnt);
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -398,23 +402,24 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
*/
hash = ip_vs_conn_hashkey_param(p, true);
- ct_read_lock(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
- if (cp->af == p->af &&
- p->vport == cp->cport && p->cport == cp->dport &&
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
p->protocol == cp->protocol &&
ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
/* HIT */
- atomic_inc(&cp->refcnt);
ret = cp;
break;
}
}
- ct_read_unlock(hash);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
ip_vs_proto_name(p->protocol),
@@ -457,13 +462,13 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
if (ip_vs_conn_unhash(cp)) {
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
atomic_dec(&ip_vs_conn_no_cport_cnt);
cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
cp->cport = cport;
}
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* hash on new dport */
ip_vs_conn_hash(cp);
@@ -549,7 +554,7 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
return;
/* Increase the refcnt counter of the dest */
- atomic_inc(&dest->refcnt);
+ ip_vs_dest_hold(dest);
conn_flags = atomic_read(&dest->conn_flags);
if (cp->protocol != IPPROTO_UDP)
@@ -606,20 +611,22 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
* Check if there is a destination for the connection, if so
* bind the connection to the destination.
*/
-struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
struct ip_vs_dest *dest;
+ rcu_read_lock();
dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
cp->dport, &cp->vaddr, cp->vport,
cp->protocol, cp->fwmark, cp->flags);
if (dest) {
struct ip_vs_proto_data *pd;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if (cp->dest) {
- spin_unlock(&cp->lock);
- return dest;
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
}
/* Applications work depending on the forwarding method
@@ -628,7 +635,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
ip_vs_unbind_app(cp);
ip_vs_bind_dest(cp, dest);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
/* Update its packet transmitter */
cp->packet_xmit = NULL;
@@ -643,7 +650,7 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
if (pd && atomic_read(&pd->appcnt))
ip_vs_bind_app(cp, pd->pp);
}
- return dest;
+ rcu_read_unlock();
}
@@ -695,12 +702,7 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
}
- /*
- * Simply decrease the refcnt of the dest, because the
- * dest will be either in service's destination list
- * or in the trash.
- */
- atomic_dec(&dest->refcnt);
+ ip_vs_dest_put(dest);
}
static int expire_quiescent_template(struct netns_ipvs *ipvs,
@@ -757,41 +759,36 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
* Simply decrease the refcnt of the template,
* don't restart its timer.
*/
- atomic_dec(&ct->refcnt);
+ __ip_vs_conn_put(ct);
return 0;
}
return 1;
}
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
+
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
static void ip_vs_conn_expire(unsigned long data)
{
struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
struct net *net = ip_vs_conn_net(cp);
struct netns_ipvs *ipvs = net_ipvs(net);
- cp->timeout = 60*HZ;
-
- /*
- * hey, I'm using it
- */
- atomic_inc(&cp->refcnt);
-
/*
* do I control anybody?
*/
if (atomic_read(&cp->n_control))
goto expire_later;
- /*
- * unhash it if it is hashed in the conn table
- */
- if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
- goto expire_later;
-
- /*
- * refcnt==1 implies I'm the only one referrer
- */
- if (likely(atomic_read(&cp->refcnt) == 1)) {
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
/* delete the timer if it is activated by other users */
del_timer(&cp->timer);
@@ -810,38 +807,41 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_conn_drop_conntrack(cp);
}
- ip_vs_pe_put(cp->pe);
- kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
-
- kmem_cache_free(ip_vs_conn_cachep, cp);
return;
}
- /* hash it back to the table */
- ip_vs_conn_hash(cp);
-
expire_later:
- IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
- atomic_read(&cp->refcnt)-1,
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
atomic_read(&cp->n_control));
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
}
-
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
- if (del_timer(&cp->timer))
- mod_timer(&cp->timer, jiffies);
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
}
@@ -858,7 +858,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
p->protocol);
- cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
if (cp == NULL) {
IP_VS_ERR_RL("%s(): no memory\n", __func__);
return NULL;
@@ -869,13 +869,13 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
ip_vs_conn_net_set(cp, p->net);
cp->af = p->af;
cp->protocol = p->protocol;
- ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
cp->cport = p->cport;
- ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
+ ip_vs_addr_set(p->af, &cp->vaddr, p->vaddr);
cp->vport = p->vport;
/* proto should only be IPPROTO_IP if d_addr is a fwmark */
- ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
- &cp->daddr, daddr);
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
cp->fwmark = fwmark;
@@ -884,6 +884,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
}
spin_lock_init(&cp->lock);
@@ -894,18 +898,28 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
*/
atomic_set(&cp->refcnt, 1);
+ cp->control = NULL;
atomic_set(&cp->n_control, 0);
atomic_set(&cp->in_pkts, 0);
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
atomic_inc(&ipvs->conn_count);
if (flags & IP_VS_CONN_F_NO_CPORT)
atomic_inc(&ip_vs_conn_no_cport_cnt);
/* Bind the connection with a destination server */
+ cp->dest = NULL;
ip_vs_bind_dest(cp, dest);
/* Set its state and timeout */
cp->state = 0;
+ cp->old_state = 0;
cp->timeout = 3*HZ;
cp->sync_endtime = jiffies & ~3UL;
@@ -952,24 +966,29 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
struct ip_vs_iter_state *iter = seq->private;
for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
if (pos-- == 0) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
+ rcu_read_lock();
}
return NULL;
}
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
{
struct ip_vs_iter_state *iter = seq->private;
iter->l = NULL;
+ rcu_read_lock();
return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
}
@@ -977,6 +996,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
struct hlist_head *l = iter->l;
int idx;
@@ -985,31 +1005,27 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
return ip_vs_conn_array(seq, 0);
/* more on same hash chain? */
- if (cp->c_list.next)
- return hlist_entry(cp->c_list.next, struct ip_vs_conn, c_list);
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
idx = l - ip_vs_conn_tab;
- ct_read_unlock_bh(idx);
-
while (++idx < ip_vs_conn_tab_size) {
- ct_read_lock_bh(idx);
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
iter->l = &ip_vs_conn_tab[idx];
return cp;
}
- ct_read_unlock_bh(idx);
+ rcu_read_unlock();
+ rcu_read_lock();
}
iter->l = NULL;
return NULL;
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
- struct ip_vs_iter_state *iter = seq->private;
- struct hlist_head *l = iter->l;
-
- if (l)
- ct_read_unlock_bh(l - ip_vs_conn_tab);
+ rcu_read_unlock();
}
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
@@ -1188,7 +1204,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
/*
* Randomly scan 1/32 of the whole table every second
@@ -1199,9 +1215,9 @@ void ip_vs_random_dropentry(struct net *net)
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(hash);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
@@ -1228,12 +1244,15 @@ void ip_vs_random_dropentry(struct net *net)
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(hash);
+ rcu_read_unlock();
}
}
@@ -1244,7 +1263,7 @@ void ip_vs_random_dropentry(struct net *net)
static void ip_vs_conn_flush(struct net *net)
{
int idx;
- struct ip_vs_conn *cp;
+ struct ip_vs_conn *cp, *cp_c;
struct netns_ipvs *ipvs = net_ipvs(net);
flush_again:
@@ -1252,19 +1271,22 @@ flush_again:
/*
* Lock is actually needed in this loop.
*/
- ct_write_lock_bh(idx);
+ rcu_read_lock();
- hlist_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (!ip_vs_conn_net_eq(cp, net))
continue;
IP_VS_DBG(4, "del connection\n");
ip_vs_conn_expire_now(cp);
- if (cp->control) {
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp->control);
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
}
}
- ct_write_unlock_bh(idx);
+ rcu_read_unlock();
}
/* the counter may be not NULL, because maybe some conn entries
@@ -1331,7 +1353,7 @@ int __init ip_vs_conn_init(void)
INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
- rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
}
/* calculate the random value for connection hash */
@@ -1342,6 +1364,8 @@ int __init ip_vs_conn_init(void)
void ip_vs_conn_cleanup(void)
{
+ /* Wait for all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
/* Release the empty cache */
kmem_cache_destroy(ip_vs_conn_cachep);
vfree(ip_vs_conn_tab);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 61f49d24171..05565d2b3a6 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -69,10 +69,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
-int ip_vs_net_id __read_mostly;
-#ifdef IP_VS_GENERIC_NETNS
-EXPORT_SYMBOL(ip_vs_net_id);
-#endif
+static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
@@ -206,7 +203,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
{
ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
vport, p);
- p->pe = svc->pe;
+ p->pe = rcu_dereference(svc->pe);
if (p->pe && p->pe->fill_param)
return p->pe->fill_param(p, skb);
@@ -238,7 +235,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Mask saddr with the netmask to adjust template granularity */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
- ipv6_addr_prefix(&snet.in6, &iph->saddr.in6, svc->netmask);
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
else
#endif
snet.ip = iph->saddr.ip & svc->netmask;
@@ -299,12 +297,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/* Check if a template already exists */
ct = ip_vs_ct_in_get(&param);
if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
/*
* No template found or the dest of the connection
* template is not available.
* return *ignored=0 i.e. ICMP and NF_DROP
*/
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb);
if (!dest) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
kfree(param.pe_data);
@@ -394,6 +395,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
{
struct ip_vs_protocol *pp = pd->pp;
struct ip_vs_conn *cp = NULL;
+ struct ip_vs_scheduler *sched;
struct ip_vs_dest *dest;
__be16 _ports[2], *pptr;
unsigned int flags;
@@ -449,7 +451,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
return NULL;
}
- dest = svc->scheduler->schedule(svc, skb);
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
@@ -507,7 +510,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
if (pptr == NULL) {
- ip_vs_service_put(svc);
return NF_DROP;
}
@@ -533,8 +535,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
IP_VS_CONN_F_ONE_PACKET : 0;
union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
- ip_vs_service_put(svc);
-
/* create a new connection entry */
IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
{
@@ -571,12 +571,8 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
- if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
- ip_vs_service_put(svc);
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
return NF_ACCEPT;
- }
-
- ip_vs_service_put(svc);
/*
* Notify the client that the destination is unreachable, and
@@ -588,9 +584,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6) {
if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
- skb->dev = net->loopback_dev;
+ skb->dev = net_->loopback_dev;
}
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
} else
@@ -643,8 +639,11 @@ static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- int err = ip_defrag(skb, user);
+ int err;
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
if (!err)
ip_send_check(ip_hdr(skb));
@@ -1002,6 +1001,32 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
return th->rst;
}
+static inline bool is_new_conn(const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+ return th->syn;
+ }
+ case IPPROTO_SCTP: {
+ sctp_chunkhdr_t *sch, schunk;
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return false;
+ return sch->type == SCTP_CID_INIT;
+ }
+ default:
+ return false;
+ }
+}
+
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
@@ -1164,9 +1189,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
sizeof(_ports), _ports, &iph);
if (pptr == NULL)
return NF_ACCEPT; /* Not for me */
- if (ip_vs_lookup_real_service(net, af, iph.protocol,
- &iph.saddr,
- pptr[0])) {
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
/*
* Notify the real server: there is no
* existing entry if it is not RST
@@ -1181,9 +1205,6 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
- struct net *net =
- dev_net(skb_dst(skb)->dev);
-
if (!skb->dev)
skb->dev = net->loopback_dev;
icmpv6_send(skb,
@@ -1226,13 +1247,7 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1259,13 +1274,7 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_out(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_out(hooknum, skb, AF_INET6);
}
#endif
@@ -1401,10 +1410,13 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
goto ignore_ipip;
/* Prefer the resulting PMTU */
if (dest) {
- spin_lock(&dest->dst_lock);
- if (dest->dst_cache)
- mtu = dst_mtu(dest->dst_cache);
- spin_unlock(&dest->dst_lock);
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
}
if (mtu > 68 + sizeof(struct iphdr))
mtu -= sizeof(struct iphdr);
@@ -1626,6 +1638,15 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
* Check if the packet belongs to an existing connection entry
*/
cp = pp->conn_in_get(af, skb, &iph, 0);
+
+ if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight)) && !iph.fragoffs &&
+ is_new_conn(skb, &iph)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
if (unlikely(!cp) && !iph.fragoffs) {
/* No (second) fragments need to enter here, as nf_defrag_ipv6
* replayed fragment zero will already have created the cp
@@ -1720,13 +1741,7 @@ ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(hooknum, skb, AF_INET);
}
#ifdef CONFIG_IP_VS_IPV6
@@ -1785,13 +1800,7 @@ ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- unsigned int verdict;
-
- /* Disable BH in LOCAL_OUT until all places are fixed */
- local_bh_disable();
- verdict = ip_vs_in(hooknum, skb, AF_INET6);
- local_bh_enable();
- return verdict;
+ return ip_vs_in(hooknum, skb, AF_INET6);
}
#endif
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 9e2d1cccd1e..5b142fb1648 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -55,9 +55,6 @@
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
-/* lock for service table */
-static DEFINE_RWLOCK(__ip_vs_svc_lock);
-
/* sysctl variables */
#ifdef CONFIG_IP_VS_DEBUG
@@ -71,7 +68,7 @@ int ip_vs_get_debug_level(void)
/* Protos */
-static void __ip_vs_del_service(struct ip_vs_service *svc);
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
#ifdef CONFIG_IP_VS_IPV6
@@ -257,9 +254,9 @@ ip_vs_use_count_dec(void)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
/* the service table hashed by <protocol, addr, port> */
-static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark */
-static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
/*
@@ -271,16 +268,18 @@ ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
{
register unsigned int porth = ntohs(port);
__be32 addr_fold = addr->ip;
+ __u32 ahash;
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- addr_fold ^= ((size_t)net>>8);
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
- return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
- & IP_VS_SVC_TAB_MASK;
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
}
/*
@@ -312,13 +311,13 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
*/
hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
&svc->addr, svc->port);
- list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
} else {
/*
* Hash it by fwmark in svc_fwm_table
*/
hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
- list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
}
svc->flags |= IP_VS_SVC_F_HASHED;
@@ -342,10 +341,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
if (svc->fwmark == 0) {
/* Remove it from the svc_table table */
- list_del(&svc->s_list);
+ hlist_del_rcu(&svc->s_list);
} else {
/* Remove it from the svc_fwm_table table */
- list_del(&svc->f_list);
+ hlist_del_rcu(&svc->f_list);
}
svc->flags &= ~IP_VS_SVC_F_HASHED;
@@ -367,7 +366,7 @@ __ip_vs_service_find(struct net *net, int af, __u16 protocol,
/* Check for "full" addressed entries */
hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
- list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
if ((svc->af == af)
&& ip_vs_addr_equal(af, &svc->addr, vaddr)
&& (svc->port == vport)
@@ -394,7 +393,7 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
/* Check for fwmark addressed entries */
hash = ip_vs_svc_fwm_hashkey(net, fwmark);
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
if (svc->fwmark == fwmark && svc->af == af
&& net_eq(svc->net, net)) {
/* HIT */
@@ -405,15 +404,14 @@ __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
return NULL;
}
+/* Find service, called under RCU lock */
struct ip_vs_service *
-ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
- const union nf_inet_addr *vaddr, __be16 vport)
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
{
struct ip_vs_service *svc;
struct netns_ipvs *ipvs = net_ipvs(net);
- read_lock(&__ip_vs_svc_lock);
-
/*
* Check the table hashed by fwmark first
*/
@@ -449,10 +447,6 @@ ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
}
out:
- if (svc)
- atomic_inc(&svc->usecnt);
- read_unlock(&__ip_vs_svc_lock);
-
IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
fwmark, ip_vs_proto_name(protocol),
IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
@@ -469,6 +463,13 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
dest->svc = svc;
}
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{  /* Free a service: its per-CPU stats (when set), then the struct itself */
+if (svc->stats.cpustats)  /* cpustats is released only when the pointer is non-NULL */
+free_percpu(svc->stats.cpustats);
+kfree(svc);  /* svc must not be touched by the caller after this point */
+}
+
static void
__ip_vs_unbind_svc(struct ip_vs_dest *dest)
{
@@ -476,12 +477,11 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
dest->svc = NULL;
if (atomic_dec_and_test(&svc->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ntohs(svc->port));
+ ip_vs_service_free(svc);
}
}
@@ -506,17 +506,13 @@ static inline unsigned int ip_vs_rs_hashkey(int af,
& IP_VS_RTAB_MASK;
}
-/*
- * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
- * should be called with locked tables.
- */
-static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
{
unsigned int hash;
- if (!list_empty(&dest->d_list)) {
- return 0;
- }
+ if (dest->in_rs_table)
+ return;
/*
* Hash by proto,addr,port,
@@ -524,64 +520,51 @@ static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
*/
hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
- list_add(&dest->d_list, &ipvs->rs_table[hash]);
-
- return 1;
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
}
-/*
- * UNhashes ip_vs_dest from rs_table.
- * should be called with locked tables.
- */
-static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
{
/*
* Remove it from the rs_table table.
*/
- if (!list_empty(&dest->d_list)) {
- list_del_init(&dest->d_list);
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
}
-
- return 1;
}
-/*
- * Lookup real service by <proto,addr,port> in the real service table.
- */
-struct ip_vs_dest *
-ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
- const union nf_inet_addr *daddr,
- __be16 dport)
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
{
struct netns_ipvs *ipvs = net_ipvs(net);
unsigned int hash;
struct ip_vs_dest *dest;
- /*
- * Check for "full" addressed entries
- * Return the first found entry
- */
+ /* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
- read_lock(&ipvs->rs_lock);
- list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
- if ((dest->af == af)
- && ip_vs_addr_equal(af, &dest->addr, daddr)
- && (dest->port == dport)
- && ((dest->protocol == protocol) ||
- dest->vfwmark)) {
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
- read_unlock(&ipvs->rs_lock);
- return dest;
+ rcu_read_unlock();
+ return true;
}
}
- read_unlock(&ipvs->rs_lock);
+ rcu_read_unlock();
- return NULL;
+ return false;
}
-/*
- * Lookup destination by {addr,port} in the given service
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
*/
static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
@@ -592,7 +575,7 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find the destination for the given service
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->af == svc->af)
&& ip_vs_addr_equal(svc->af, &dest->addr, daddr)
&& (dest->port == dport)) {
@@ -606,13 +589,11 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
/*
* Find destination by {daddr,dport,vaddr,protocol}
- * Cretaed to be used in ip_vs_process_message() in
+ * Created to be used in ip_vs_process_message() in
* the backup synchronization daemon. It finds the
* destination to be bound to the received connection
* on the backup.
- *
- * ip_vs_lookup_real_service() looked promissing, but
- * seems not working as expected.
+ * Called under RCU lock, no refcnt is returned.
*/
struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
const union nf_inet_addr *daddr,
@@ -625,7 +606,7 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
struct ip_vs_service *svc;
__be16 port = dport;
- svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
+ svc = ip_vs_service_find(net, af, fwmark, protocol, vaddr, vport);
if (!svc)
return NULL;
if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
@@ -633,12 +614,31 @@ struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
dest = ip_vs_lookup_dest(svc, daddr, port);
if (!dest)
dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
- if (dest)
- atomic_inc(&dest->refcnt);
- ip_vs_service_put(svc);
return dest;
}
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{  /* RCU callback: __ip_vs_dst_cache_reset() passes it to call_rcu() */
+struct ip_vs_dest_dst *dest_dst = container_of(head,
+struct ip_vs_dest_dst,
+rcu_head);  /* recover the ip_vs_dest_dst that embeds this rcu_head */
+
+dst_release(dest_dst->dst_cache);  /* release the cached dst before freeing its holder */
+kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context.
+ * dest->dest_dst is set to NULL immediately; the previous entry is not
+ * freed here but handed to call_rcu(), so ip_vs_dest_dst_rcu_free()
+ * releases it later. Does nothing when no entry is currently set.
+ */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+struct ip_vs_dest_dst *old;
+
+old = rcu_dereference_protected(dest->dest_dst, 1);  /* update side: condition hard-coded to 1 */
+if (old) {
+RCU_INIT_POINTER(dest->dest_dst, NULL);  /* unpublish before queueing the free */
+call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+}
+
/*
* Lookup dest by {svc,addr,port} in the destination trash.
* The destination trash is used to hold the destinations that are removed
@@ -653,19 +653,25 @@ static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
__be16 dport)
{
- struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_dest *dest;
struct netns_ipvs *ipvs = net_ipvs(svc->net);
/*
* Find the destination in trash
*/
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
"dest->refcnt=%d\n",
dest->vfwmark,
IP_VS_DBG_ADDR(svc->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
+ /* We can not reuse dest while in grace period
+ * because conns still can use dest->svc
+ */
+ if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+ continue;
if (dest->af == svc->af &&
ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
dest->port == dport &&
@@ -675,29 +681,27 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
(ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
dest->vport == svc->port))) {
/* HIT */
- return dest;
- }
-
- /*
- * Try to purge the destination from trash if not referenced
- */
- if (atomic_read(&dest->refcnt) == 1) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
- "from trash\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(svc->af, &dest->addr),
- ntohs(dest->port));
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
}
}
- return NULL;
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
}
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{  /* Final teardown of a dest; called when entries are purged from dest_trash */
+__ip_vs_dst_cache_reset(dest);  /* drop the cached route entry, if any */
+__ip_vs_unbind_svc(dest);  /* drops svc->refcnt; frees the service when it reaches zero */
+free_percpu(dest->stats.cpustats);
+kfree(dest);
+}
/*
* Clean up all the destinations in the trash
@@ -706,19 +710,18 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
* When the ip_vs_control_clearup is activated by ipvs module exit,
* the service tables must have been flushed and all the connections
* are expired, and the refcnt of each destination in the trash must
- * be 1, so we simply release them here.
+ * be 0, so we simply release them here.
*/
static void ip_vs_trash_cleanup(struct net *net)
{
struct ip_vs_dest *dest, *nxt;
struct netns_ipvs *ipvs = net_ipvs(net);
- list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
- list_del(&dest->n_list);
- ip_vs_dst_reset(dest);
- __ip_vs_unbind_svc(dest);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
}
}
@@ -768,6 +771,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
struct ip_vs_dest_user_kern *udest, int add)
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_scheduler *sched;
int conn_flags;
/* set the weight and the flags */
@@ -783,9 +787,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
* Put the real service in rs_table if not present.
* For now only for NAT!
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_hash(ipvs, dest);
- write_unlock_bh(&ipvs->rs_lock);
}
atomic_set(&dest->conn_flags, conn_flags);
@@ -809,27 +811,20 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->l_threshold = udest->l_threshold;
spin_lock_bh(&dest->dst_lock);
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
spin_unlock_bh(&dest->dst_lock);
- if (add)
- ip_vs_start_estimator(svc->net, &dest->stats);
-
- write_lock_bh(&__ip_vs_svc_lock);
-
- /* Wait until all other svc users go away */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
+ sched = rcu_dereference_protected(svc->scheduler, 1);
if (add) {
- list_add(&dest->n_list, &svc->destinations);
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
}
-
- /* call the update_service, because server weight may be changed */
- if (svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
}
@@ -881,7 +876,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
atomic_set(&dest->persistconns, 0);
atomic_set(&dest->refcnt, 1);
- INIT_LIST_HEAD(&dest->d_list);
+ INIT_HLIST_NODE(&dest->d_list);
spin_lock_init(&dest->dst_lock);
spin_lock_init(&dest->stats.lock);
__ip_vs_update_dest(svc, dest, udest, 1);
@@ -923,10 +918,10 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Check if the dest already exists in the list
- */
+/* ip_vs_lookup_dest() must be called under the RCU read lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest != NULL) {
IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
@@ -948,11 +943,6 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
ntohs(dest->vport));
- /*
- * Get the destination from the trash
- */
- list_del(&dest->n_list);
-
__ip_vs_update_dest(svc, dest, udest, 1);
ret = 0;
} else {
@@ -992,10 +982,10 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
- /*
- * Lookup the destination list
- */
+/* ip_vs_lookup_dest() must be called under the RCU read lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &daddr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
@@ -1008,11 +998,21 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
return 0;
}
+static void ip_vs_dest_wait_readers(struct rcu_head *head)
+{  /* RCU callback queued by __ip_vs_del_dest() via call_rcu() */
+struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
+rcu_head);
+
+/* End of grace period after unlinking: clear the REMOVING bit that
+ * __ip_vs_del_dest() set, so ip_vs_trash_get_dest() and
+ * ip_vs_dest_trash_expire() no longer skip this dest.
+ */
+clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+}
+
/*
* Delete a destination (must be already unlinked from the service)
*/
-static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
{
struct netns_ipvs *ipvs = net_ipvs(net);
@@ -1021,38 +1021,24 @@ static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
/*
* Remove it from the d-linked list with the real services.
*/
- write_lock_bh(&ipvs->rs_lock);
ip_vs_rs_unhash(dest);
- write_unlock_bh(&ipvs->rs_lock);
- /*
- * Decrease the refcnt of the dest, and free the dest
- * if nobody refers to it (refcnt=0). Otherwise, throw
- * the destination into the trash.
- */
- if (atomic_dec_and_test(&dest->refcnt)) {
- IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
- dest->vfwmark,
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port));
- ip_vs_dst_reset(dest);
- /* simply decrease svc->refcnt here, let the caller check
- and release the service if nobody refers to it.
- Only user context can release destination and service,
- and only one user context can update virtual service at a
- time, so the operation here is OK */
- atomic_dec(&dest->svc->refcnt);
- free_percpu(dest->stats.cpustats);
- kfree(dest);
- } else {
- IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
- "dest->refcnt=%d\n",
- IP_VS_DBG_ADDR(dest->af, &dest->addr),
- ntohs(dest->port),
- atomic_read(&dest->refcnt));
- list_add(&dest->n_list, &ipvs->dest_trash);
- atomic_inc(&dest->refcnt);
+ if (!cleanup) {
+ set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
+ call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
}
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + IP_VS_DEST_TRASH_PERIOD);
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
}
@@ -1068,14 +1054,16 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
/*
* Remove it from the d-linked destination list.
*/
- list_del(&dest->n_list);
+ list_del_rcu(&dest->n_list);
svc->num_dests--;
- /*
- * Call the update_service function of its scheduler
- */
- if (svcupd && svc->scheduler->update_service)
- svc->scheduler->update_service(svc);
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
}
@@ -1090,37 +1078,56 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
EnterFunction(2);
+/* ip_vs_lookup_dest() must be called under the RCU read lock */
+ rcu_read_lock();
dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
+ rcu_read_unlock();
if (dest == NULL) {
IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
return -ENOENT;
}
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
/*
* Unlink dest from the service
*/
__ip_vs_unlink_dest(svc, dest, 1);
- write_unlock_bh(&__ip_vs_svc_lock);
-
/*
* Delete the destination
*/
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, false);
LeaveFunction(2);
return 0;
}
+static void ip_vs_dest_trash_expire(unsigned long data)
+{  /* Purge unreferenced dests from dest_trash; presumably the dest_trash_timer handler (registration not visible here) */
+struct net *net = (struct net *) data;  /* data carries the struct net pointer */
+struct netns_ipvs *ipvs = net_ipvs(net);
+struct ip_vs_dest *dest, *next;
+
+spin_lock(&ipvs->dest_trash_lock);
+list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+/* Skip if dest is in grace period (REMOVING is cleared later by
+ * ip_vs_dest_wait_readers())
+ */
+if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
+continue;
+if (atomic_read(&dest->refcnt) > 0)  /* still referenced: keep it in the trash */
+continue;
+IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+dest->vfwmark,
+IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
+ntohs(dest->port));
+list_del(&dest->t_list);  /* unlink first; _safe iteration allows freeing the entry */
+ip_vs_dest_free(dest);
+}
+if (!list_empty(&ipvs->dest_trash))  /* entries remain: schedule another pass */
+mod_timer(&ipvs->dest_trash_timer,
+jiffies + IP_VS_DEST_TRASH_PERIOD);
+spin_unlock(&ipvs->dest_trash_lock);
+}
/*
* Add a service into the service hash table
@@ -1157,9 +1164,13 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out_err;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
}
#endif
@@ -1176,7 +1187,6 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
}
/* I'm the first user of the service */
- atomic_set(&svc->usecnt, 0);
atomic_set(&svc->refcnt, 0);
svc->af = u->af;
@@ -1190,7 +1200,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
svc->net = net;
INIT_LIST_HEAD(&svc->destinations);
- rwlock_init(&svc->sched_lock);
+ spin_lock_init(&svc->sched_lock);
spin_lock_init(&svc->stats.lock);
/* Bind the scheduler */
@@ -1200,7 +1210,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
sched = NULL;
/* Bind the ct retriever */
- ip_vs_bind_pe(svc, pe);
+ RCU_INIT_POINTER(svc->pe, pe);
pe = NULL;
/* Update the virtual service counters */
@@ -1216,9 +1226,7 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
ipvs->num_services++;
/* Hash the service into the service table */
- write_lock_bh(&__ip_vs_svc_lock);
ip_vs_svc_hash(svc);
- write_unlock_bh(&__ip_vs_svc_lock);
*svc_p = svc;
/* Now there is a service - full throttle */
@@ -1228,15 +1236,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
out_err:
if (svc != NULL) {
- ip_vs_unbind_scheduler(svc);
- if (svc->inc) {
- local_bh_disable();
- ip_vs_app_inc_put(svc->inc);
- local_bh_enable();
- }
- if (svc->stats.cpustats)
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
}
ip_vs_scheduler_put(sched);
ip_vs_pe_put(pe);
@@ -1280,18 +1281,27 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
}
#ifdef CONFIG_IP_VS_IPV6
- if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
- ret = -EINVAL;
- goto out;
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
}
#endif
- write_lock_bh(&__ip_vs_svc_lock);
-
- /*
- * Wait until all other svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
/*
* Set the flags and timeout value
@@ -1300,57 +1310,30 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->timeout = u->timeout * HZ;
svc->netmask = u->netmask;
- old_sched = svc->scheduler;
- if (sched != old_sched) {
- /*
- * Unbind the old scheduler
- */
- if ((ret = ip_vs_unbind_scheduler(svc))) {
- old_sched = sched;
- goto out_unlock;
- }
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
- /*
- * Bind the new scheduler
- */
- if ((ret = ip_vs_bind_scheduler(svc, sched))) {
- /*
- * If ip_vs_bind_scheduler fails, restore the old
- * scheduler.
- * The main reason of failure is out of memory.
- *
- * The question is if the old scheduler can be
- * restored all the time. TODO: if it cannot be
- * restored some time, we must delete the service,
- * otherwise the system may crash.
- */
- ip_vs_bind_scheduler(svc, old_sched);
- old_sched = sched;
- goto out_unlock;
- }
- }
-
- old_pe = svc->pe;
- if (pe != old_pe) {
- ip_vs_unbind_pe(svc);
- ip_vs_bind_pe(svc, pe);
- }
-
-out_unlock:
- write_unlock_bh(&__ip_vs_svc_lock);
out:
ip_vs_scheduler_put(old_sched);
ip_vs_pe_put(old_pe);
return ret;
}
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{  /* RCU callback queued by __ip_vs_del_service() via call_rcu() */
+struct ip_vs_service *svc;
+
+svc = container_of(head, struct ip_vs_service, rcu_head);  /* recover the service embedding this rcu_head */
+ip_vs_service_free(svc);
+}
/*
* Delete a service from the service list
* - The service must be unlinked, unlocked and not referenced!
* - We are called under _bh lock
*/
-static void __ip_vs_del_service(struct ip_vs_service *svc)
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
struct ip_vs_dest *dest, *nxt;
struct ip_vs_scheduler *old_sched;
@@ -1366,27 +1349,20 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
ip_vs_stop_estimator(svc->net, &svc->stats);
/* Unbind scheduler */
- old_sched = svc->scheduler;
- ip_vs_unbind_scheduler(svc);
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
ip_vs_scheduler_put(old_sched);
- /* Unbind persistence engine */
- old_pe = svc->pe;
- ip_vs_unbind_pe(svc);
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
ip_vs_pe_put(old_pe);
- /* Unbind app inc */
- if (svc->inc) {
- ip_vs_app_inc_put(svc->inc);
- svc->inc = NULL;
- }
-
/*
* Unlink the whole destination list
*/
list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
__ip_vs_unlink_dest(svc, dest, 0);
- __ip_vs_del_dest(svc->net, dest);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
}
/*
@@ -1400,13 +1376,12 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Free the service if nobody refers to it
*/
- if (atomic_read(&svc->refcnt) == 0) {
- IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
svc->fwmark,
IP_VS_DBG_ADDR(svc->af, &svc->addr),
- ntohs(svc->port), atomic_read(&svc->usecnt));
- free_percpu(svc->stats.cpustats);
- kfree(svc);
+ ntohs(svc->port));
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
}
/* decrease the module use count */
@@ -1416,23 +1391,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
/*
* Unlink a service from list and try to delete it if its refcnt reached 0
*/
-static void ip_vs_unlink_service(struct ip_vs_service *svc)
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
/*
* Unhash it from the service table
*/
- write_lock_bh(&__ip_vs_svc_lock);
-
ip_vs_svc_unhash(svc);
- /*
- * Wait until all the svc users go away.
- */
- IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
-
- __ip_vs_del_service(svc);
-
- write_unlock_bh(&__ip_vs_svc_lock);
+ __ip_vs_del_service(svc, cleanup);
}
/*
@@ -1442,7 +1410,7 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
{
if (svc == NULL)
return -EEXIST;
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, false);
return 0;
}
@@ -1451,19 +1419,20 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
/*
* Flush all the virtual services
*/
-static int ip_vs_flush(struct net *net)
+static int ip_vs_flush(struct net *net, bool cleanup)
{
int idx;
- struct ip_vs_service *svc, *nxt;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
/*
* Flush the service table hashed by <netns,protocol,addr,port>
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
- s_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1471,10 +1440,10 @@ static int ip_vs_flush(struct net *net)
* Flush the service table hashed by fwmark
*/
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry_safe(svc, nxt,
- &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net))
- ip_vs_unlink_service(svc);
+ ip_vs_unlink_service(svc, cleanup);
}
}
@@ -1490,32 +1459,32 @@ void ip_vs_service_net_cleanup(struct net *net)
EnterFunction(2);
/* Check for "full" addressed entries */
mutex_lock(&__ip_vs_mutex);
- ip_vs_flush(net);
+ ip_vs_flush(net, true);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
}
-/*
- * Release dst hold by dst_cache
- */
+
+/* Put all references for device (dst_cache) */
static inline void
-__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
{
+ struct ip_vs_dest_dst *dest_dst;
+
spin_lock_bh(&dest->dst_lock);
- if (dest->dst_cache && dest->dst_cache->dev == dev) {
+ dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
+ if (dest_dst && dest_dst->dst_cache->dev == dev) {
IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
dev->name,
IP_VS_DBG_ADDR(dest->af, &dest->addr),
ntohs(dest->port),
atomic_read(&dest->refcnt));
- ip_vs_dst_reset(dest);
+ __ip_vs_dst_cache_reset(dest);
}
spin_unlock_bh(&dest->dst_lock);
}
-/*
- * Netdev event receiver
- * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
- * a device that is "unregister" it must be released.
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
*/
static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
void *ptr)
@@ -1527,35 +1496,37 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
struct ip_vs_dest *dest;
unsigned int idx;
- if (event != NETDEV_UNREGISTER || !ipvs)
+ if (event != NETDEV_DOWN || !ipvs)
return NOTIFY_DONE;
IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
EnterFunction(2);
mutex_lock(&__ip_vs_mutex);
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net)) {
list_for_each_entry(dest, &svc->destinations,
n_list) {
- __ip_vs_dev_reset(dest, dev);
+ ip_vs_forget_dev(dest, dev);
}
}
}
}
- list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
- __ip_vs_dev_reset(dest, dev);
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
}
+ spin_unlock_bh(&ipvs->dest_trash_lock);
mutex_unlock(&__ip_vs_mutex);
LeaveFunction(2);
return NOTIFY_DONE;
@@ -1568,12 +1539,10 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
{
struct ip_vs_dest *dest;
- write_lock_bh(&__ip_vs_svc_lock);
list_for_each_entry(dest, &svc->destinations, n_list) {
ip_vs_zero_stats(&dest->stats);
}
ip_vs_zero_stats(&svc->stats);
- write_unlock_bh(&__ip_vs_svc_lock);
return 0;
}
@@ -1583,14 +1552,14 @@ static int ip_vs_zero_all(struct net *net)
struct ip_vs_service *svc;
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
}
for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
if (net_eq(svc->net, net))
ip_vs_zero_service(svc);
}
@@ -1918,7 +1887,7 @@ static struct ctl_table vs_vars[] = {
struct ip_vs_iter {
struct seq_net_private p; /* Do not move this, netns depends upon it*/
- struct list_head *table;
+ struct hlist_head *table;
int bucket;
};
@@ -1951,7 +1920,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* look in hash by protocol */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_table;
iter->bucket = idx;
@@ -1962,7 +1931,8 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
/* keep looking in fwmark */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
if (net_eq(svc->net, net) && pos-- == 0) {
iter->table = ip_vs_svc_fwm_table;
iter->bucket = idx;
@@ -1975,17 +1945,16 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
}
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
-__acquires(__ip_vs_svc_lock)
+ __acquires(RCU)
{
-
- read_lock_bh(&__ip_vs_svc_lock);
+ rcu_read_lock();
return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct list_head *e;
+ struct hlist_node *e;
struct ip_vs_iter *iter;
struct ip_vs_service *svc;
@@ -1998,13 +1967,14 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
if (iter->table == ip_vs_svc_table) {
/* next service in table hashed by protocol */
- if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, s_list);
-
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
- s_list) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
return svc;
}
}
@@ -2015,13 +1985,15 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
/* next service in hashed by fwmark */
- if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
- return list_entry(e, struct ip_vs_service, f_list);
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
scan_fwmark:
while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
- f_list)
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
return svc;
}
@@ -2029,9 +2001,9 @@ static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
-__releases(__ip_vs_svc_lock)
+ __releases(RCU)
{
- read_unlock_bh(&__ip_vs_svc_lock);
+ rcu_read_unlock();
}
@@ -2049,6 +2021,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
const struct ip_vs_service *svc = v;
const struct ip_vs_iter *iter = seq->private;
const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
if (iter->table == ip_vs_svc_table) {
#ifdef CONFIG_IP_VS_IPV6
@@ -2057,18 +2030,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
ip_vs_proto_name(svc->protocol),
&svc->addr.in6,
ntohs(svc->port),
- svc->scheduler->name);
+ sched->name);
else
#endif
seq_printf(seq, "%s %08X:%04X %s %s ",
ip_vs_proto_name(svc->protocol),
ntohl(svc->addr.ip),
ntohs(svc->port),
- svc->scheduler->name,
+ sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
} else {
seq_printf(seq, "FWM %08X %s %s",
- svc->fwmark, svc->scheduler->name,
+ svc->fwmark, sched->name,
(svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
}
@@ -2079,7 +2052,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
else
seq_putc(seq, '\n');
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
#ifdef CONFIG_IP_VS_IPV6
if (dest->af == AF_INET6)
seq_printf(seq,
@@ -2173,7 +2146,7 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
{
struct net *net = seq_file_single_net(seq);
struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
- struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
+ struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
struct ip_vs_stats_user rates;
int i;
@@ -2389,7 +2362,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
if (cmd == IP_VS_SO_SET_FLUSH) {
/* Flush the virtual service */
- ret = ip_vs_flush(net);
+ ret = ip_vs_flush(net, false);
goto out_unlock;
} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
/* Set timeout values for (tcp tcpfin udp) */
@@ -2424,11 +2397,13 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
}
/* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
if (usvc.fwmark == 0)
svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
&usvc.addr, usvc.port);
else
svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
if (cmd != IP_VS_SO_SET_ADD
&& (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2480,11 +2455,14 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
static void
ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
dst->protocol = src->protocol;
dst->addr = src->addr.ip;
dst->port = src->port;
dst->fwmark = src->fwmark;
- strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
dst->flags = src->flags;
dst->timeout = src->timeout / HZ;
dst->netmask = src->netmask;
@@ -2503,7 +2481,7 @@ __ip_vs_get_service_entries(struct net *net,
int ret = 0;
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2522,7 +2500,7 @@ __ip_vs_get_service_entries(struct net *net,
}
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
/* Only expose IPv4 entries to old interface */
if (svc->af != AF_INET || !net_eq(svc->net, net))
continue;
@@ -2551,11 +2529,13 @@ __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
union nf_inet_addr addr = { .ip = get->addr };
int ret = 0;
+ rcu_read_lock();
if (get->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
get->port);
+ rcu_read_unlock();
if (svc) {
int count = 0;
@@ -2738,12 +2718,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
entry = (struct ip_vs_service_entry *)arg;
addr.ip = entry->addr;
+ rcu_read_lock();
if (entry->fwmark)
svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
else
svc = __ip_vs_service_find(net, AF_INET,
entry->protocol, &addr,
entry->port);
+ rcu_read_unlock();
if (svc) {
ip_vs_copy_service(entry, svc);
if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2900,6 +2882,8 @@ nla_put_failure:
static int ip_vs_genl_fill_service(struct sk_buff *skb,
struct ip_vs_service *svc)
{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_pe *pe;
struct nlattr *nl_service;
struct ip_vs_flags flags = { .flags = svc->flags,
.mask = ~0 };
@@ -2916,16 +2900,17 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb,
} else {
if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
- nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+ nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
goto nla_put_failure;
}
- if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
- (svc->pe &&
- nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ pe = rcu_dereference_protected(svc->pe, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
- nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+ nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
goto nla_put_failure;
if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
goto nla_put_failure;
@@ -2971,7 +2956,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
mutex_lock(&__ip_vs_mutex);
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -2982,7 +2967,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
}
for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
- list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
if (++idx <= start || !net_eq(svc->net, net))
continue;
if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
@@ -3038,15 +3023,17 @@ static int ip_vs_genl_parse_service(struct net *net,
} else {
usvc->protocol = nla_get_u16(nla_protocol);
nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
- usvc->port = nla_get_u16(nla_port);
+ usvc->port = nla_get_be16(nla_port);
usvc->fwmark = 0;
}
+ rcu_read_lock();
if (usvc->fwmark)
svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
else
svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
&usvc->addr, usvc->port);
+ rcu_read_unlock();
*ret_svc = svc;
/* If a full entry was requested, check for the additional fields */
@@ -3076,7 +3063,7 @@ static int ip_vs_genl_parse_service(struct net *net,
usvc->sched_name = nla_data(nla_sched);
usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
usvc->timeout = nla_get_u32(nla_timeout);
- usvc->netmask = nla_get_u32(nla_netmask);
+ usvc->netmask = nla_get_be32(nla_netmask);
}
return 0;
@@ -3102,7 +3089,7 @@ static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
return -EMSGSIZE;
if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
- nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
(atomic_read(&dest->conn_flags) &
IP_VS_CONN_F_FWD_MASK)) ||
@@ -3211,7 +3198,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
memset(udest, 0, sizeof(*udest));
nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
- udest->port = nla_get_u16(nla_port);
+ udest->port = nla_get_be16(nla_port);
/* If a full entry was requested, check for the additional fields */
if (full_entry) {
@@ -3236,8 +3223,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
return 0;
}
-static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid)
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid)
{
struct nlattr *nl_daemon;
@@ -3258,8 +3245,8 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
- const char *mcast_ifn, __be32 syncid,
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid,
struct netlink_callback *cb)
{
void *hdr;
@@ -3398,7 +3385,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
mutex_lock(&__ip_vs_mutex);
if (cmd == IPVS_CMD_FLUSH) {
- ret = ip_vs_flush(net);
+ ret = ip_vs_flush(net, false);
goto out;
} else if (cmd == IPVS_CMD_SET_CONFIG) {
ret = ip_vs_genl_set_config(net, info->attrs);
@@ -3790,13 +3777,14 @@ int __net_init ip_vs_control_net_init(struct net *net)
int idx;
struct netns_ipvs *ipvs = net_ipvs(net);
- rwlock_init(&ipvs->rs_lock);
-
/* Initialize rs_table */
for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
- INIT_LIST_HEAD(&ipvs->rs_table[idx]);
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
@@ -3826,6 +3814,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* Some dests may still be in an RCU grace period when cleanup starts;
+ * wait for ip_vs_dest_wait_readers to run before ip_vs_trash_cleanup.
+ */
+ rcu_barrier();
ip_vs_trash_cleanup(net);
ip_vs_stop_estimator(net, &ipvs->tot_stats);
ip_vs_control_net_cleanup_sysctl(net);
@@ -3871,10 +3863,10 @@ int __init ip_vs_control_init(void)
EnterFunction(2);
- /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
- INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
- INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
}
smp_wmb(); /* Do we really need it now ? */
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 7f3b0cc00b7..ccab120df45 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -51,7 +51,7 @@
* IPVS DH bucket
*/
struct ip_vs_dh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -64,6 +64,10 @@ struct ip_vs_dh_bucket {
#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
/*
* Returns hash value for IPVS DH entry
@@ -85,10 +89,9 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_dh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
}
@@ -96,25 +99,30 @@ ip_vs_dh_get(int af, struct ip_vs_dh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_dh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
p = p->next;
}
@@ -127,16 +135,18 @@ ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
{
int i;
struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -145,51 +155,46 @@ static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
/* allocate the DH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
- GFP_KERNEL);
- if (tbl == NULL)
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- svc->sched_data = tbl;
+ svc->sched_data = s;
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
return 0;
}
-static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
+ struct ip_vs_dh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ ip_vs_dh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_dh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_dh_flush(tbl);
+ struct ip_vs_dh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_dh_assign(tbl, svc);
+ ip_vs_dh_reassign(s, svc);
return 0;
}
@@ -212,19 +217,20 @@ static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
- struct ip_vs_dh_bucket *tbl;
+ struct ip_vs_dh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
- dest = ip_vs_dh_get(svc->af, tbl, &iph.daddr);
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph.daddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
|| is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
return NULL;
}
@@ -248,7 +254,8 @@ static struct ip_vs_scheduler ip_vs_dh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
.init_service = ip_vs_dh_init_svc,
.done_service = ip_vs_dh_done_svc,
- .update_service = ip_vs_dh_update_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
.schedule = ip_vs_dh_schedule,
};
@@ -262,6 +269,7 @@ static int __init ip_vs_dh_init(void)
static void __exit ip_vs_dh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 0fac6017b6f..6bee6d0c73a 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -56,7 +56,7 @@
* Make a summary from each cpu
*/
static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
- struct ip_vs_cpu_stats *stats)
+ struct ip_vs_cpu_stats __percpu *stats)
{
int i;
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 4f53a5f0443..77c173282f3 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -267,10 +267,12 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* hopefully it will succeed on the retransmitted
* packet.
*/
+ rcu_read_lock();
ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
iph->ihl * 4,
start-data, end-start,
buf, buf_len);
+ rcu_read_unlock();
if (ret) {
ip_vs_nfct_expect_related(skb, ct, n_cp,
IPPROTO_TCP, 0, 0);
@@ -480,6 +482,7 @@ static int __init ip_vs_ftp_init(void)
int rv;
rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
return rv;
}
@@ -489,6 +492,7 @@ static int __init ip_vs_ftp_init(void)
static void __exit ip_vs_ftp_exit(void)
{
unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index fdd89b9564e..5ea26bd8774 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -90,11 +90,12 @@
* IP address and its destination server
*/
struct ip_vs_lblc_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -102,12 +103,14 @@ struct ip_vs_lblc_entry {
* IPVS lblc hash table
*/
struct ip_vs_lblc_table {
- struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
- struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -129,13 +132,16 @@ static ctl_table vs_vars_table[] = {
static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
{
- list_del(&en->list);
+ struct ip_vs_dest *dest;
+
+ hlist_del_rcu(&en->list);
/*
* We don't kfree dest because it is referred either by its service
* or the trash dest list.
*/
- atomic_dec(&en->dest->refcnt);
- kfree(en);
+ dest = rcu_dereference_protected(en->dest, 1);
+ ip_vs_dest_put(dest);
+ kfree_rcu(en, rcu_head);
}
@@ -165,15 +171,12 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblc_entry associated with supplied parameters. Called under read
- * lock
- */
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
const union nf_inet_addr *addr)
@@ -181,7 +184,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
unsigned int hash = ip_vs_lblc_hashkey(af, addr);
struct ip_vs_lblc_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -191,7 +194,7 @@ ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
/*
* Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
- * address to a server. Called under write lock.
+ * address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
@@ -209,14 +212,20 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
ip_vs_addr_copy(dest->af, &en->addr, daddr);
en->lastuse = jiffies;
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(en->dest, dest);
ip_vs_lblc_hash(tbl, en);
- } else if (en->dest != dest) {
- atomic_dec(&en->dest->refcnt);
- atomic_inc(&dest->refcnt);
- en->dest = dest;
+ } else {
+ struct ip_vs_dest *old_dest;
+
+ old_dest = rcu_dereference_protected(en->dest, 1);
+ if (old_dest != dest) {
+ ip_vs_dest_put(old_dest);
+ ip_vs_dest_hold(dest);
+ /* No ordering constraints for refcnt */
+ RCU_INIT_POINTER(en->dest, dest);
+ }
}
return en;
@@ -226,17 +235,22 @@ ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
{
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
int i;
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
@@ -252,15 +266,16 @@ static int sysctl_lblc_expiration(struct ip_vs_service *svc)
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
unsigned long now = jiffies;
int i, j;
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now,
en->lastuse +
sysctl_lblc_expiration(svc)))
@@ -269,7 +284,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
ip_vs_lblc_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -293,7 +308,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblc_entry *en, *nxt;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -314,8 +330,8 @@ static void ip_vs_lblc_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLC_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
continue;
@@ -323,7 +339,7 @@ static void ip_vs_lblc_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -354,11 +370,12 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -371,7 +388,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblc_table *tbl = svc->sched_data;
@@ -379,14 +396,12 @@ static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblc_flush(tbl);
+ ip_vs_lblc_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -408,7 +423,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
if (atomic_read(&dest->weight) > 0) {
@@ -423,7 +438,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -457,7 +472,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -484,7 +499,6 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
en = ip_vs_lblc_get(svc->af, tbl, &iph.daddr);
if (en) {
/* We only hold a read lock, but this is atomic */
@@ -499,14 +513,11 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* free up entries from the trash at any time.
*/
- if (en->dest->flags & IP_VS_DEST_F_AVAILABLE)
- dest = en->dest;
+ dest = rcu_dereference(en->dest);
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
}
- read_unlock(&svc->sched_lock);
-
- /* If the destination has a weight and is not overloaded, use it */
- if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
- goto out;
/* No cache entry or it is invalid, time to schedule */
dest = __ip_vs_lblc_schedule(svc);
@@ -516,9 +527,10 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblc_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph.daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
@@ -621,6 +633,7 @@ static void __exit ip_vs_lblc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
unregister_pernet_subsys(&ip_vs_lblc_ops);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index c03b6a3ade2..50123c2ab48 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -89,40 +89,44 @@
*/
struct ip_vs_dest_set_elem {
struct list_head list; /* list link */
- struct ip_vs_dest *dest; /* destination server */
+ struct ip_vs_dest __rcu *dest; /* destination server */
+ struct rcu_head rcu_head;
};
struct ip_vs_dest_set {
atomic_t size; /* set size */
unsigned long lastmod; /* last modified time */
struct list_head list; /* destination list */
- rwlock_t lock; /* lock for this list */
};
-static struct ip_vs_dest_set_elem *
-ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
{
struct ip_vs_dest_set_elem *e;
- list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest)
- /* already existed */
- return NULL;
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
+ if (d == dest)
+ /* already existed */
+ return;
+ }
}
e = kmalloc(sizeof(*e), GFP_ATOMIC);
if (e == NULL)
- return NULL;
+ return;
- atomic_inc(&dest->refcnt);
- e->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(e->dest, dest);
- list_add(&e->list, &set->list);
+ list_add_rcu(&e->list, &set->list);
atomic_inc(&set->size);
set->lastmod = jiffies;
- return e;
}
static void
@@ -131,13 +135,16 @@ ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
struct ip_vs_dest_set_elem *e;
list_for_each_entry(e, &set->list, list) {
- if (e->dest == dest) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
+ if (d == dest) {
/* HIT */
atomic_dec(&set->size);
set->lastmod = jiffies;
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ ip_vs_dest_put(dest);
+ list_del_rcu(&e->list);
+ kfree_rcu(e, rcu_head);
break;
}
}
@@ -147,17 +154,18 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
{
struct ip_vs_dest_set_elem *e, *ep;
- write_lock(&set->lock);
list_for_each_entry_safe(e, ep, &set->list, list) {
+ struct ip_vs_dest *d;
+
+ d = rcu_dereference_protected(e->dest, 1);
/*
* We don't kfree dest because it is referred either
* by its service or by the trash dest list.
*/
- atomic_dec(&e->dest->refcnt);
- list_del(&e->list);
- kfree(e);
+ ip_vs_dest_put(d);
+ list_del_rcu(&e->list);
+ kfree_rcu(e, rcu_head);
}
- write_unlock(&set->lock);
}
/* get weighted least-connection node in the destination set */
@@ -171,8 +179,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
return NULL;
/* select the first destination server, whose weight > 0 */
- list_for_each_entry(e, &set->list, list) {
- least = e->dest;
+ list_for_each_entry_rcu(e, &set->list, list) {
+ least = rcu_dereference(e->dest);
if (least->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -186,8 +194,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* find the destination with the weighted least load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
- dest = e->dest;
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
+ dest = rcu_dereference(e->dest);
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -224,7 +232,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* select the first destination server, whose weight > 0 */
list_for_each_entry(e, &set->list, list) {
- most = e->dest;
+ most = rcu_dereference_protected(e->dest, 1);
if (atomic_read(&most->weight) > 0) {
moh = ip_vs_dest_conn_overhead(most);
goto nextstage;
@@ -234,8 +242,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
/* find the destination with the weighted most load */
nextstage:
- list_for_each_entry(e, &set->list, list) {
- dest = e->dest;
+ list_for_each_entry_continue(e, &set->list, list) {
+ dest = rcu_dereference_protected(e->dest, 1);
doh = ip_vs_dest_conn_overhead(dest);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
@@ -262,11 +270,12 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
* IP address and its destination server set
*/
struct ip_vs_lblcr_entry {
- struct list_head list;
+ struct hlist_node list;
int af; /* address family */
union nf_inet_addr addr; /* destination IP address */
struct ip_vs_dest_set set; /* destination server set */
unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
};
@@ -274,12 +283,14 @@ struct ip_vs_lblcr_entry {
* IPVS lblcr hash table
*/
struct ip_vs_lblcr_table {
- struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
atomic_t entries; /* number of entries */
int max_size; /* maximum size of entries */
struct timer_list periodic_timer; /* collect stale entries */
int rover; /* rover for expire check */
int counter; /* counter for no expire */
+ bool dead;
};
@@ -302,9 +313,9 @@ static ctl_table vs_vars_table[] = {
static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
{
- list_del(&en->list);
+ hlist_del_rcu(&en->list);
ip_vs_dest_set_eraseall(&en->set);
- kfree(en);
+ kfree_rcu(en, rcu_head);
}
@@ -334,15 +345,12 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
{
unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
- list_add(&en->list, &tbl->bucket[hash]);
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
atomic_inc(&tbl->entries);
}
-/*
- * Get ip_vs_lblcr_entry associated with supplied parameters. Called under
- * read lock.
- */
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
const union nf_inet_addr *addr)
@@ -350,7 +358,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
struct ip_vs_lblcr_entry *en;
- list_for_each_entry(en, &tbl->bucket[hash], list)
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
if (ip_vs_addr_equal(af, &en->addr, addr))
return en;
@@ -360,7 +368,7 @@ ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
/*
* Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
- * IP address to a server. Called under write lock.
+ * IP address to a server. Called under spin lock.
*/
static inline struct ip_vs_lblcr_entry *
ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
@@ -381,14 +389,14 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/* initialize its dest set */
atomic_set(&(en->set.size), 0);
INIT_LIST_HEAD(&en->set.list);
- rwlock_init(&en->set.lock);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
ip_vs_lblcr_hash(tbl, en);
+ return en;
}
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
+ ip_vs_dest_set_insert(&en->set, dest, true);
return en;
}
@@ -397,17 +405,21 @@ ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
/*
* Flush all the entries of the specified table.
*/
-static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
int i;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
- /* No locking required, only called during cleanup. */
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
ip_vs_lblcr_free(en);
}
}
+ spin_unlock_bh(&svc->sched_lock);
}
static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
@@ -425,13 +437,14 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
struct ip_vs_lblcr_table *tbl = svc->sched_data;
unsigned long now = jiffies;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_after(en->lastuse +
sysctl_lblcr_expiration(svc), now))
continue;
@@ -439,7 +452,7 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
ip_vs_lblcr_free(en);
atomic_dec(&tbl->entries);
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
}
tbl->rover = j;
}
@@ -463,7 +476,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
unsigned long now = jiffies;
int goal;
int i, j;
- struct ip_vs_lblcr_entry *en, *nxt;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
/* do full expiration check */
@@ -484,8 +498,8 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
- write_lock(&svc->sched_lock);
- list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
continue;
@@ -493,7 +507,7 @@ static void ip_vs_lblcr_check_expire(unsigned long data)
atomic_dec(&tbl->entries);
goal--;
}
- write_unlock(&svc->sched_lock);
+ spin_unlock(&svc->sched_lock);
if (goal <= 0)
break;
}
@@ -523,11 +537,12 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
* Initialize the hash buckets
*/
for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
- INIT_LIST_HEAD(&tbl->bucket[i]);
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
}
tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
tbl->rover = 0;
tbl->counter = 1;
+ tbl->dead = 0;
/*
* Hook periodic timer for garbage collection
@@ -540,7 +555,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
@@ -548,14 +563,12 @@ static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
del_timer_sync(&tbl->periodic_timer);
/* got to clean up table entries here */
- ip_vs_lblcr_flush(tbl);
+ ip_vs_lblcr_flush(svc);
/* release the table itself */
- kfree(tbl);
+ kfree_rcu(tbl, rcu_head);
IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
sizeof(*tbl));
-
- return 0;
}
@@ -577,7 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* The server with weight=0 is quiesced and will not receive any
* new connection.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -593,7 +606,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
@@ -627,7 +640,7 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
struct ip_vs_dest *d;
- list_for_each_entry(d, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
if (atomic_read(&d->activeconns)*2
< atomic_read(&d->weight)) {
return 1;
@@ -646,7 +659,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_lblcr_table *tbl = svc->sched_data;
struct ip_vs_iphdr iph;
- struct ip_vs_dest *dest = NULL;
+ struct ip_vs_dest *dest;
struct ip_vs_lblcr_entry *en;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
@@ -654,53 +667,46 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
/* First look in our cache */
- read_lock(&svc->sched_lock);
en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
if (en) {
- /* We only hold a read lock, but this is atomic */
en->lastuse = jiffies;
/* Get the least loaded destination */
- read_lock(&en->set.lock);
dest = ip_vs_dest_set_min(&en->set);
- read_unlock(&en->set.lock);
/* More than one destination + enough time passed by, cleanup */
if (atomic_read(&en->set.size) > 1 &&
- time_after(jiffies, en->set.lastmod +
+ time_after(jiffies, en->set.lastmod +
sysctl_lblcr_expiration(svc))) {
- struct ip_vs_dest *m;
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
- write_lock(&en->set.lock);
- m = ip_vs_dest_set_max(&en->set);
- if (m)
- ip_vs_dest_set_erase(&en->set, m);
- write_unlock(&en->set.lock);
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
}
/* If the destination is not overloaded, use it */
- if (dest && !is_overloaded(dest, svc)) {
- read_unlock(&svc->sched_lock);
+ if (dest && !is_overloaded(dest, svc))
goto out;
- }
/* The cache entry is invalid, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
if (!dest) {
ip_vs_scheduler_err(svc, "no destination available");
- read_unlock(&svc->sched_lock);
return NULL;
}
/* Update our cache entry */
- write_lock(&en->set.lock);
- ip_vs_dest_set_insert(&en->set, dest);
- write_unlock(&en->set.lock);
- }
- read_unlock(&svc->sched_lock);
-
- if (dest)
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
goto out;
+ }
/* No cache entry, time to schedule */
dest = __ip_vs_lblcr_schedule(svc);
@@ -710,9 +716,10 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
}
/* If we fail to create a cache entry, we'll just use the valid dest */
- write_lock(&svc->sched_lock);
- ip_vs_lblcr_new(tbl, &iph.daddr, dest);
- write_unlock(&svc->sched_lock);
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph.daddr, dest);
+ spin_unlock_bh(&svc->sched_lock);
out:
IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
@@ -814,6 +821,7 @@ static void __exit ip_vs_lblcr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index f391819c0cc..5128e338a74 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -42,7 +42,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* served, but no new connection is assigned to the server.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
@@ -84,6 +84,7 @@ static int __init ip_vs_lc_init(void)
static void __exit ip_vs_lc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_lc_init);
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
index 984d9c137d8..646cfd4baa7 100644
--- a/net/netfilter/ipvs/ip_vs_nq.c
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -75,7 +75,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
!atomic_read(&dest->weight))
@@ -133,6 +133,7 @@ static int __init ip_vs_nq_init(void)
static void __exit ip_vs_nq_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_nq_init);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 5cf859ccb31..1a82b29ce8e 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -13,20 +13,8 @@
/* IPVS pe list */
static LIST_HEAD(ip_vs_pe);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_pe_lock);
-
-/* Bind a service with a pe */
-void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe)
-{
- svc->pe = pe;
-}
-
-/* Unbind a service from its pe */
-void ip_vs_unbind_pe(struct ip_vs_service *svc)
-{
- svc->pe = NULL;
-}
+/* semaphore for IPVS PEs. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
/* Get pe in the pe list by name */
struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
@@ -36,9 +24,8 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
pe_name);
- spin_lock_bh(&ip_vs_pe_lock);
-
- list_for_each_entry(pe, &ip_vs_pe, n_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
/* Test and get the modules atomically */
if (pe->module &&
!try_module_get(pe->module)) {
@@ -47,14 +34,14 @@ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
}
if (strcmp(pe_name, pe->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_pe_lock);
+ rcu_read_unlock();
return pe;
}
if (pe->module)
module_put(pe->module);
}
+ rcu_read_unlock();
- spin_unlock_bh(&ip_vs_pe_lock);
return NULL;
}
@@ -83,22 +70,13 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_pe_lock);
-
- if (!list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- ip_vs_use_count_dec();
- pr_err("%s(): [%s] pe already linked\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Make sure that the pe with this name doesn't exist
* in the pe list.
*/
list_for_each_entry(tmp, &ip_vs_pe, n_list) {
if (strcmp(tmp->name, pe->name) == 0) {
- spin_unlock_bh(&ip_vs_pe_lock);
+ mutex_unlock(&ip_vs_pe_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] pe already existed "
"in the system\n", __func__, pe->name);
@@ -106,8 +84,8 @@ int register_ip_vs_pe(struct ip_vs_pe *pe)
}
}
/* Add it into the d-linked pe list */
- list_add(&pe->n_list, &ip_vs_pe);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
pr_info("[%s] pe registered.\n", pe->name);
@@ -118,17 +96,10 @@ EXPORT_SYMBOL_GPL(register_ip_vs_pe);
/* Unregister a pe from the pe list */
int unregister_ip_vs_pe(struct ip_vs_pe *pe)
{
- spin_lock_bh(&ip_vs_pe_lock);
- if (list_empty(&pe->n_list)) {
- spin_unlock_bh(&ip_vs_pe_lock);
- pr_err("%s(): [%s] pe is not in the list. failed\n",
- __func__, pe->name);
- return -EINVAL;
- }
-
+ mutex_lock(&ip_vs_pe_mutex);
/* Remove it from the d-linked pe list */
- list_del(&pe->n_list);
- spin_unlock_bh(&ip_vs_pe_lock);
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 12475ef88da..9ef22bdce9f 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -13,7 +13,8 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
const char *callid, size_t callid_len,
int *idx)
{
- size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1);
+ size_t max_len = 64;
+ size_t len = min3(max_len, callid_len, buf_len - *idx - 1);
memcpy(buf + *idx, callid, len);
buf[*idx+len] = '\0';
*idx += len + 1;
@@ -37,14 +38,10 @@ static int get_callid(const char *dptr, unsigned int dataoff,
if (ret > 0)
break;
if (!ret)
- return 0;
+ return -EINVAL;
dataoff += *matchoff;
}
- /* Empty callid is useless */
- if (!*matchlen)
- return -EINVAL;
-
/* Too large is useless */
if (*matchlen > IP_VS_PEDATA_MAXLEN)
return -EINVAL;
@@ -172,6 +169,7 @@ static int __init ip_vs_sip_init(void)
static void __exit ip_vs_sip_cleanup(void)
{
unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
}
module_init(ip_vs_sip_init);
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index cd1d7298f7b..86464881cd2 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -27,9 +27,10 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (sch == NULL)
return 0;
net = skb_net(skb);
+ rcu_read_lock();
if ((sch->type == SCTP_CID_INIT) &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, sh->dest))) {
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
int ignored;
if (ip_vs_todrop(net_ipvs(net))) {
@@ -37,7 +38,7 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -49,14 +50,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -208,7 +208,7 @@ enum ipvs_sctp_event_t {
IP_VS_SCTP_EVE_LAST
};
-static enum ipvs_sctp_event_t sctp_events[255] = {
+static enum ipvs_sctp_event_t sctp_events[256] = {
IP_VS_SCTP_EVE_DATA_CLI,
IP_VS_SCTP_EVE_INIT_CLI,
IP_VS_SCTP_EVE_INIT_ACK_CLI,
@@ -994,9 +994,9 @@ static void
sctp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb, struct ip_vs_proto_data *pd)
{
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
set_sctp_state(pd, cp, direction, skb);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 sctp_app_hashkey(__be16 port)
@@ -1016,30 +1016,25 @@ static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
hash = sctp_app_hashkey(port);
- spin_lock_bh(&ipvs->sctp_app_lock);
list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->sctp_app_lock);
return ret;
}
static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
- spin_lock_bh(&ipvs->sctp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->sctp_app_lock);
+ list_del_rcu(&inc->p_list);
}
static int sctp_app_conn_bind(struct ip_vs_conn *cp)
@@ -1055,12 +1050,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = sctp_app_hashkey(cp->vport);
- spin_lock(&ipvs->sctp_app_lock);
- list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -1076,7 +1071,7 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->sctp_app_lock);
+ rcu_read_unlock();
out:
return result;
}
@@ -1090,7 +1085,6 @@ static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->sctp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
sizeof(sctp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 9af653a7582..50a15944c6c 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -47,9 +47,10 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
}
net = skb_net(skb);
/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+ rcu_read_lock();
if (th->syn &&
- (svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, th->dest))) {
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
int ignored;
if (ip_vs_todrop(net_ipvs(net))) {
@@ -57,7 +58,7 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -70,14 +71,13 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -557,9 +557,9 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
if (th == NULL)
return;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
set_tcp_state(pd, cp, direction, th);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
static inline __u16 tcp_app_hashkey(__be16 port)
@@ -580,18 +580,16 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
hash = tcp_app_hashkey(port);
- spin_lock_bh(&ipvs->tcp_app_lock);
list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->tcp_app_lock);
return ret;
}
@@ -599,13 +597,10 @@ static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
static void
tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock_bh(&ipvs->tcp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->tcp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -624,12 +619,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = tcp_app_hashkey(cp->vport);
- spin_lock(&ipvs->tcp_app_lock);
- list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -646,7 +641,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->tcp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -660,11 +655,11 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
cp->state = IP_VS_TCP_S_LISTEN;
cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
: tcp_timeouts[IP_VS_TCP_S_LISTEN]);
- spin_unlock(&cp->lock);
+ spin_unlock_bh(&cp->lock);
}
/* ---------------------------------------------
@@ -676,7 +671,6 @@ static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->tcp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
sizeof(tcp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 503a842c90d..b62a3c0ff9b 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -44,8 +44,9 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
return 0;
}
net = skb_net(skb);
- svc = ip_vs_service_get(net, af, skb->mark, iph->protocol,
- &iph->daddr, uh->dest);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
if (svc) {
int ignored;
@@ -54,7 +55,7 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
* It seems that we are very loaded.
* We have to drop this packet :(
*/
- ip_vs_service_put(svc);
+ rcu_read_unlock();
*verdict = NF_DROP;
return 0;
}
@@ -67,14 +68,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
if (!*cpp && ignored <= 0) {
if (!ignored)
*verdict = ip_vs_leave(svc, skb, pd, iph);
- else {
- ip_vs_service_put(svc);
+ else
*verdict = NF_DROP;
- }
+ rcu_read_unlock();
return 0;
}
- ip_vs_service_put(svc);
}
+ rcu_read_unlock();
/* NF_ACCEPT */
return 1;
}
@@ -359,19 +359,16 @@ static int udp_register_app(struct net *net, struct ip_vs_app *inc)
hash = udp_app_hashkey(port);
-
- spin_lock_bh(&ipvs->udp_app_lock);
list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
if (i->port == port) {
ret = -EEXIST;
goto out;
}
}
- list_add(&inc->p_list, &ipvs->udp_apps[hash]);
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
atomic_inc(&pd->appcnt);
out:
- spin_unlock_bh(&ipvs->udp_app_lock);
return ret;
}
@@ -380,12 +377,9 @@ static void
udp_unregister_app(struct net *net, struct ip_vs_app *inc)
{
struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
- struct netns_ipvs *ipvs = net_ipvs(net);
- spin_lock_bh(&ipvs->udp_app_lock);
atomic_dec(&pd->appcnt);
- list_del(&inc->p_list);
- spin_unlock_bh(&ipvs->udp_app_lock);
+ list_del_rcu(&inc->p_list);
}
@@ -403,12 +397,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
/* Lookup application incarnations and bind the right one */
hash = udp_app_hashkey(cp->vport);
- spin_lock(&ipvs->udp_app_lock);
- list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
if (inc->port == cp->vport) {
if (unlikely(!ip_vs_app_inc_get(inc)))
break;
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
"%s:%u to app %s on port %u\n",
@@ -425,7 +419,7 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
goto out;
}
}
- spin_unlock(&ipvs->udp_app_lock);
+ rcu_read_unlock();
out:
return result;
@@ -467,7 +461,6 @@ static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
struct netns_ipvs *ipvs = net_ipvs(net);
ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
- spin_lock_init(&ipvs->udp_app_lock);
pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
sizeof(udp_timeouts));
if (!pd->timeout_table)
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
index c49b388d108..c35986c793d 100644
--- a/net/netfilter/ipvs/ip_vs_rr.c
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -35,9 +35,18 @@ static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
}
-static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
{
- svc->sched_data = &svc->destinations;
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -48,36 +57,41 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct list_head *p, *q;
- struct ip_vs_dest *dest;
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- write_lock(&svc->sched_lock);
- p = (struct list_head *)svc->sched_data;
- p = p->next;
- q = p;
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
do {
- /* skip list head */
- if (q == &svc->destinations) {
- q = q->next;
- continue;
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
}
-
- dest = list_entry(q, struct ip_vs_dest, n_list);
- if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) > 0)
- /* HIT */
- goto out;
- q = q->next;
- } while (q != p);
- write_unlock(&svc->sched_lock);
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
ip_vs_scheduler_err(svc, "no destination available");
return NULL;
out:
- svc->sched_data = q;
- write_unlock(&svc->sched_lock);
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
IP_VS_DBG_BUF(6, "RR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
@@ -94,7 +108,8 @@ static struct ip_vs_scheduler ip_vs_rr_scheduler = {
.module = THIS_MODULE,
.n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
.init_service = ip_vs_rr_init_svc,
- .update_service = ip_vs_rr_update_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
.schedule = ip_vs_rr_schedule,
};
@@ -106,6 +121,7 @@ static int __init ip_vs_rr_init(void)
static void __exit ip_vs_rr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_rr_init);
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
index d6bf20d6cdb..4dbcda6258b 100644
--- a/net/netfilter/ipvs/ip_vs_sched.c
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -35,8 +35,8 @@ EXPORT_SYMBOL(ip_vs_scheduler_err);
*/
static LIST_HEAD(ip_vs_schedulers);
-/* lock for service table */
-static DEFINE_SPINLOCK(ip_vs_sched_lock);
+/* mutex protecting the list of registered schedulers */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
/*
@@ -47,8 +47,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
{
int ret;
- svc->scheduler = scheduler;
-
if (scheduler->init_service) {
ret = scheduler->init_service(svc);
if (ret) {
@@ -56,7 +54,7 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
return ret;
}
}
-
+ rcu_assign_pointer(svc->scheduler, scheduler);
return 0;
}
@@ -64,22 +62,19 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc,
/*
* Unbind a service with its scheduler
*/
-int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
{
- struct ip_vs_scheduler *sched = svc->scheduler;
+ struct ip_vs_scheduler *cur_sched;
- if (!sched)
- return 0;
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
- if (sched->done_service) {
- if (sched->done_service(svc) != 0) {
- pr_err("%s(): done error\n", __func__);
- return -EINVAL;
- }
- }
-
- svc->scheduler = NULL;
- return 0;
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
}
@@ -92,7 +87,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
/*
@@ -106,14 +101,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
}
if (strcmp(sched_name, sched->name)==0) {
/* HIT */
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return sched;
}
if (sched->module)
module_put(sched->module);
}
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
return NULL;
}
@@ -153,21 +148,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
if (svc->fwmark) {
IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
- svc->scheduler->name, svc->fwmark,
- svc->fwmark, msg);
+ sched->name, svc->fwmark, svc->fwmark, msg);
#ifdef CONFIG_IP_VS_IPV6
} else if (svc->af == AF_INET6) {
IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.in6, ntohs(svc->port), msg);
#endif
} else {
IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
- svc->scheduler->name,
- ip_vs_proto_name(svc->protocol),
+ sched->name, ip_vs_proto_name(svc->protocol),
&svc->addr.ip, ntohs(svc->port), msg);
}
}
@@ -192,10 +187,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
/* increase the module use count */
ip_vs_use_count_inc();
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (!list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already linked\n",
__func__, scheduler->name);
@@ -208,7 +203,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
*/
list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
if (strcmp(scheduler->name, sched->name) == 0) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
ip_vs_use_count_dec();
pr_err("%s(): [%s] scheduler already existed "
"in the system\n", __func__, scheduler->name);
@@ -219,7 +214,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Add it into the d-linked scheduler list
*/
list_add(&scheduler->n_list, &ip_vs_schedulers);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_info("[%s] scheduler registered.\n", scheduler->name);
@@ -237,9 +232,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
return -EINVAL;
}
- spin_lock_bh(&ip_vs_sched_lock);
+ mutex_lock(&ip_vs_sched_mutex);
if (list_empty(&scheduler->n_list)) {
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
pr_err("%s(): [%s] scheduler is not in the list. failed\n",
__func__, scheduler->name);
return -EINVAL;
@@ -249,7 +244,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
* Remove it from the d-linked scheduler list
*/
list_del(&scheduler->n_list);
- spin_unlock_bh(&ip_vs_sched_lock);
+ mutex_unlock(&ip_vs_sched_mutex);
/* decrease the module use count */
ip_vs_use_count_dec();
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
index 89ead246ed3..f3205925359 100644
--- a/net/netfilter/ipvs/ip_vs_sed.c
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -79,7 +79,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -94,7 +94,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_sed_dest_overhead(dest);
@@ -134,6 +134,7 @@ static int __init ip_vs_sed_init(void)
static void __exit ip_vs_sed_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_sed_init);
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index e3312699462..a65edfe4b16 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -53,7 +53,7 @@
* IPVS SH bucket
*/
struct ip_vs_sh_bucket {
- struct ip_vs_dest *dest; /* real server (cache) */
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
};
/*
@@ -66,6 +66,10 @@ struct ip_vs_sh_bucket {
#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
/*
* Returns hash value for IPVS SH entry
@@ -87,10 +91,9 @@ static inline unsigned int ip_vs_sh_hashkey(int af, const union nf_inet_addr *ad
* Get ip_vs_dest associated with supplied parameters.
*/
static inline struct ip_vs_dest *
-ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
- const union nf_inet_addr *addr)
+ip_vs_sh_get(int af, struct ip_vs_sh_state *s, const union nf_inet_addr *addr)
{
- return (tbl[ip_vs_sh_hashkey(af, addr)]).dest;
+ return rcu_dereference(s->buckets[ip_vs_sh_hashkey(af, addr)].dest);
}
@@ -98,27 +101,32 @@ ip_vs_sh_get(int af, struct ip_vs_sh_bucket *tbl,
* Assign all the hash buckets of the specified table with the service.
*/
static int
-ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
{
int i;
struct ip_vs_sh_bucket *b;
struct list_head *p;
struct ip_vs_dest *dest;
int d_count;
+ bool empty;
- b = tbl;
+ b = &s->buckets[0];
p = &svc->destinations;
+ empty = list_empty(p);
d_count = 0;
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (list_empty(p)) {
- b->dest = NULL;
- } else {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
if (p == &svc->destinations)
p = p->next;
dest = list_entry(p, struct ip_vs_dest, n_list);
- atomic_inc(&dest->refcnt);
- b->dest = dest;
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
i, IP_VS_DBG_ADDR(svc->af, &dest->addr),
@@ -140,16 +148,18 @@ ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
/*
* Flush all the hash buckets of the specified table.
*/
-static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
{
int i;
struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
- b = tbl;
+ b = &s->buckets[0];
for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
- if (b->dest) {
- atomic_dec(&b->dest->refcnt);
- b->dest = NULL;
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
}
b++;
}
@@ -158,51 +168,46 @@ static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
/* allocate the SH table for this service */
- tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
- GFP_KERNEL);
- if (tbl == NULL)
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
return -ENOMEM;
- svc->sched_data = tbl;
+ svc->sched_data = s;
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
"current service\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
- /* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
return 0;
}
-static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
+ struct ip_vs_sh_state *s = svc->sched_data;
/* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ ip_vs_sh_flush(s);
/* release the table itself */
- kfree(svc->sched_data);
+ kfree_rcu(s, rcu_head);
IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
-
- return 0;
}
-static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
- struct ip_vs_sh_bucket *tbl = svc->sched_data;
-
- /* got to clean up hash buckets here */
- ip_vs_sh_flush(tbl);
+ struct ip_vs_sh_state *s = svc->sched_data;
/* assign the hash buckets with the updated service */
- ip_vs_sh_assign(tbl, svc);
+ ip_vs_sh_reassign(s, svc);
return 0;
}
@@ -225,15 +230,15 @@ static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
- struct ip_vs_sh_bucket *tbl;
+ struct ip_vs_sh_state *s;
struct ip_vs_iphdr iph;
ip_vs_fill_iph_addr_only(svc->af, skb, &iph);
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
- tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
- dest = ip_vs_sh_get(svc->af, tbl, &iph.saddr);
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+ dest = ip_vs_sh_get(svc->af, s, &iph.saddr);
if (!dest
|| !(dest->flags & IP_VS_DEST_F_AVAILABLE)
|| atomic_read(&dest->weight) <= 0
@@ -262,7 +267,9 @@ static struct ip_vs_scheduler ip_vs_sh_scheduler =
.n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
.init_service = ip_vs_sh_init_svc,
.done_service = ip_vs_sh_done_svc,
- .update_service = ip_vs_sh_update_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
.schedule = ip_vs_sh_schedule,
};
@@ -276,6 +283,7 @@ static int __init ip_vs_sh_init(void)
static void __exit ip_vs_sh_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 44fd10c539a..f6046d9af8d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -246,7 +246,7 @@ struct ip_vs_sync_thread_data {
struct ip_vs_sync_mesg_v0 {
__u8 nr_conns;
__u8 syncid;
- __u16 size;
+ __be16 size;
/* ip_vs_sync_conn entries start here */
};
@@ -255,7 +255,7 @@ struct ip_vs_sync_mesg_v0 {
struct ip_vs_sync_mesg {
__u8 reserved; /* must be zero */
__u8 syncid;
- __u16 size;
+ __be16 size;
__u8 nr_conns;
__s8 version; /* SYNC_PROTO_VER */
__u16 spare;
@@ -335,7 +335,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
sb->mesg->syncid = ipvs->master_syncid;
- sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
+ sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
sb->mesg->nr_conns = 0;
sb->mesg->spare = 0;
sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
@@ -418,7 +418,7 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
mesg->nr_conns = 0;
mesg->syncid = ipvs->master_syncid;
- mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
+ mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
sb->firstuse = jiffies;
@@ -531,9 +531,9 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
return;
- spin_lock(&ipvs->sync_buff_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return;
}
@@ -552,7 +552,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
if (!buff) {
buff = ip_vs_sync_buff_create_v0(ipvs);
if (!buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -582,7 +582,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
}
m->nr_conns++;
- m->size += len;
+ m->size = htons(ntohs(m->size) + len);
buff->head += len;
/* check if there is a space for next one */
@@ -590,7 +590,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
sb_queue_tail(ipvs, ms);
ms->sync_buff = NULL;
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
cp = cp->control;
@@ -641,9 +641,9 @@ sloop:
pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
}
- spin_lock(&ipvs->sync_buff_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
return;
}
@@ -683,7 +683,7 @@ sloop:
if (!buff) {
buff = ip_vs_sync_buff_create(ipvs);
if (!buff) {
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
@@ -693,7 +693,7 @@ sloop:
p = buff->head;
buff->head += pad + len;
- m->size += pad + len;
+ m->size = htons(ntohs(m->size) + pad + len);
/* Add ev. padding from prev. sync_conn */
while (pad--)
*(p++) = 0;
@@ -750,7 +750,7 @@ sloop:
}
}
- spin_unlock(&ipvs->sync_buff_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
control:
/* synchronize its controller if it has */
@@ -843,7 +843,7 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
kfree(param->pe_data);
dest = cp->dest;
- spin_lock(&cp->lock);
+ spin_lock_bh(&cp->lock);
if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
!(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
if (flags & IP_VS_CONN_F_INACTIVE) {
@@ -857,24 +857,21 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
cp->flags = flags;
- spin_unlock(&cp->lock);
- if (!dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- }
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
} else {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
* but still handled.
*/
+ rcu_read_lock();
dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
param->vport, protocol, fwmark, flags);
cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
- if (dest)
- atomic_dec(&dest->refcnt);
+ rcu_read_unlock();
if (!cp) {
if (param->pe_data)
kfree(param->pe_data);
@@ -1178,10 +1175,8 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
IP_VS_DBG(2, "BACKUP, message header too short\n");
return;
}
- /* Convert size back to host byte order */
- m2->size = ntohs(m2->size);
- if (buflen != m2->size) {
+ if (buflen != ntohs(m2->size)) {
IP_VS_DBG(2, "BACKUP, bogus message size\n");
return;
}
@@ -1547,10 +1542,7 @@ ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
int msize;
int ret;
- msize = msg->size;
-
- /* Put size in network byte order */
- msg->size = htons(msg->size);
+ msize = ntohs(msg->size);
ret = ip_vs_send_async(sock, (char *)msg, msize);
if (ret >= 0 || ret == -EAGAIN)
@@ -1692,11 +1684,7 @@ static int sync_thread_backup(void *data)
break;
}
- /* disable bottom half, because it accesses the data
- shared by softirq while getting/creating conns */
- local_bh_disable();
ip_vs_process_message(tinfo->net, tinfo->buf, len);
- local_bh_enable();
}
}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bc1bfc48a17..c60a81c4ce9 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -51,7 +51,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* new connections.
*/
- list_for_each_entry(dest, &svc->destinations, n_list) {
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
@@ -66,7 +66,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
* Find the destination with the least load.
*/
nextstage:
- list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
doh = ip_vs_dest_conn_overhead(dest);
@@ -106,6 +106,7 @@ static int __init ip_vs_wlc_init(void)
static void __exit ip_vs_wlc_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wlc_init);
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
index 231be7dd547..0e68555bceb 100644
--- a/net/netfilter/ipvs/ip_vs_wrr.c
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -29,14 +29,45 @@
#include <net/ip_vs.h>
+/* The WRR algorithm depends on some calculations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As a result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced by di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di, but we run in parallel with
+ * weight changes, so some dest weight may be reduced below di,
+ * which is bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated: it is now reduced by (di - 1),
+ * so that the last cw is 1, to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
+
/*
* current destination pointer for weighted round-robin scheduling
*/
struct ip_vs_wrr_mark {
- struct list_head *cl; /* current list head */
+ struct ip_vs_dest *cl; /* current dest or head */
int cw; /* current weight */
int mw; /* maximum weight */
int di; /* decreasing interval */
+ struct rcu_head rcu_head;
};
@@ -88,36 +119,41 @@ static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
if (mark == NULL)
return -ENOMEM;
- mark->cl = &svc->destinations;
- mark->cw = 0;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
svc->sched_data = mark;
return 0;
}
-static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
/*
* Release the mark variable
*/
- kfree(svc->sched_data);
-
- return 0;
+ kfree_rcu(mark, rcu_head);
}
-static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
{
struct ip_vs_wrr_mark *mark = svc->sched_data;
- mark->cl = &svc->destinations;
- mark->mw = ip_vs_wrr_max_weight(svc);
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
mark->di = ip_vs_wrr_gcd_weight(svc);
- if (mark->cw > mark->mw)
- mark->cw = 0;
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
return 0;
}
@@ -128,80 +164,79 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
- struct ip_vs_dest *dest;
+ struct ip_vs_dest *dest, *last, *stop = NULL;
struct ip_vs_wrr_mark *mark = svc->sched_data;
- struct list_head *p;
+ bool last_pass = false, restarted = false;
IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
- /*
- * This loop will always terminate, because mark->cw in (0, max_weight]
- * and at least one server has its weight equal to max_weight.
- */
- write_lock(&svc->sched_lock);
- p = mark->cl;
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
while (1) {
- if (mark->cl == &svc->destinations) {
- /* it is at the head of the destination list */
-
- if (mark->cl == mark->cl->next) {
- /* no dest entry */
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "no destinations present");
- dest = NULL;
- goto out;
- }
-
- mark->cl = svc->destinations.next;
- mark->cw -= mark->di;
- if (mark->cw <= 0) {
- mark->cw = mark->mw;
- /*
- * Still zero, which means no available servers.
- */
- if (mark->cw == 0) {
- mark->cl = &svc->destinations;
- ip_vs_scheduler_err(svc,
- "no destination available");
- dest = NULL;
- goto out;
- }
- }
- } else
- mark->cl = mark->cl->next;
-
- if (mark->cl != &svc->destinations) {
- /* not at the head of the list */
- dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
- atomic_read(&dest->weight) >= mark->cw) {
- /* got it */
- break;
- }
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
}
-
- if (mark->cl == p && mark->cw == mark->di) {
- /* back to the start, and no dest is found.
- It is only possible when all dests are OVERLOADED */
- dest = NULL;
- ip_vs_scheduler_err(svc,
- "no destination available: "
- "all destinations are overloaded");
- goto out;
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
}
}
+found:
IP_VS_DBG_BUF(6, "WRR: server %s:%u "
"activeconns %d refcnt %d weight %d\n",
IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port),
atomic_read(&dest->activeconns),
atomic_read(&dest->refcnt),
atomic_read(&dest->weight));
+ mark->cl = dest;
out:
- write_unlock(&svc->sched_lock);
+ spin_unlock_bh(&svc->sched_lock);
return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
}
@@ -212,7 +247,9 @@ static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
.n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
.init_service = ip_vs_wrr_init_svc,
.done_service = ip_vs_wrr_done_svc,
- .update_service = ip_vs_wrr_update_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
.schedule = ip_vs_wrr_schedule,
};
@@ -224,6 +261,7 @@ static int __init ip_vs_wrr_init(void)
static void __exit ip_vs_wrr_cleanup(void)
{
unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
}
module_init(ip_vs_wrr_init);
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index ee6b7a9f1ec..b75ff6429a0 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -17,6 +17,8 @@
* - not all connections have destination server, for example,
* connections in backup server when fwmark is used
* - bypass connections use daddr from packet
+ * - we can use dst without ref while sending in RCU section, we use
+ * ref when returning NF_ACCEPT for NAT-ed packet via loopback
* LOCAL_OUT rules:
* - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
* - skb->pkt_type is not set yet
@@ -51,39 +53,54 @@ enum {
*/
IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
};
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
/*
* Destination cache to speed up outgoing route lookup
*/
static inline void
-__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst,
- u32 dst_cookie)
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
{
- struct dst_entry *old_dst;
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
- old_dst = dest->dst_cache;
- dest->dst_cache = dst;
- dest->dst_rtos = rtos;
- dest->dst_cookie = dst_cookie;
- dst_release(old_dst);
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
}
-static inline struct dst_entry *
-__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos)
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
{
- struct dst_entry *dst = dest->dst_cache;
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
- if (!dst)
+ if (!dest_dst)
return NULL;
- if ((dst->obsolete || rtos != dest->dst_rtos) &&
- dst->ops->check(dst, dest->dst_cookie) == NULL) {
- dest->dst_cache = NULL;
- dst_release(dst);
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
return NULL;
- }
- dst_hold(dst);
- return dst;
+ return dest_dst;
}
static inline bool
@@ -104,7 +121,7 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
/* Get route to daddr, update *saddr, optionally bind route to saddr */
static struct rtable *do_output_route4(struct net *net, __be32 daddr,
- u32 rtos, int rt_mode, __be32 *saddr)
+ int rt_mode, __be32 *saddr)
{
struct flowi4 fl4;
struct rtable *rt;
@@ -113,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr,
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
- fl4.flowi4_tos = rtos;
fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
FLOWI_FLAG_KNOWN_NH : 0;
@@ -124,7 +140,7 @@ retry:
if (PTR_ERR(rt) == -EINVAL && *saddr &&
rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
*saddr = 0;
- flowi4_update_output(&fl4, 0, rtos, daddr, 0);
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
goto retry;
}
IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
@@ -132,7 +148,7 @@ retry:
} else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
ip_rt_put(rt);
*saddr = fl4.saddr;
- flowi4_update_output(&fl4, 0, rtos, daddr, fl4.saddr);
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
loop++;
goto retry;
}
@@ -141,113 +157,140 @@ retry:
}
/* Get route to destination or remote server */
-static struct rtable *
+static int
__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest,
- __be32 daddr, u32 rtos, int rt_mode, __be32 *ret_saddr)
+ __be32 daddr, int rt_mode, __be32 *ret_saddr)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest_dst *dest_dst;
struct rtable *rt; /* Route to the other host */
struct rtable *ort; /* Original route */
- int local;
+ struct iphdr *iph;
+ __be16 df;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- if (!(rt = (struct rtable *)
- __ip_vs_dst_check(dest, rtos))) {
- rt = do_output_route4(net, dest->addr.ip, rtos,
- rt_mode, &dest->dst_saddr.ip);
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
if (!rt) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
- __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0);
- IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d, "
- "rtos=%X\n",
- &dest->addr.ip, &dest->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt), rtos);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
}
daddr = dest->addr.ip;
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.ip;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.ip;
} else {
__be32 saddr = htonl(INADDR_ANY);
+ noref = 0;
+
/* For such unconfigured boxes avoid many route lookups
* for performance reasons because we do not remember saddr
*/
rt_mode &= ~IP_VS_RT_MODE_CONNECT;
- rt = do_output_route4(net, daddr, rtos, rt_mode, &saddr);
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
if (!rt)
- return NULL;
+ goto err_unreach;
if (ret_saddr)
*ret_saddr = saddr;
}
- local = rt->rt_flags & RTCF_LOCAL;
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) &
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n",
(rt->rt_flags & RTCF_LOCAL) ?
"local":"non-local", &daddr);
- ip_rt_put(rt);
- return NULL;
- }
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) {
- IP_VS_DBG_RL("Redirect from non-local address %pI4 to local "
- "requires NAT method, dest: %pI4\n",
- &ip_hdr(skb)->daddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ goto err_put;
}
- if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 "
- "to non-local address, dest: %pI4\n",
- &ip_hdr(skb)->saddr, &daddr);
- ip_rt_put(rt);
- return NULL;
+ iph = ip_hdr(skb);
+ if (likely(!local)) {
+ if (unlikely(ipv4_is_loopback(iph->saddr))) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI4 to non-local address, dest: %pI4\n",
+ &iph->saddr, &daddr);
+ goto err_put;
+ }
+ } else {
+ ort = skb_rtable(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !(ort->rt_flags & RTCF_LOCAL)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI4 to "
+ "local requires NAT method, dest: %pI4\n",
+ &iph->daddr, &daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
}
- return rt;
-}
-
-/* Reroute packet to local IPv4 stack after DNAT */
-static int
-__ip_vs_reroute_locally(struct sk_buff *skb)
-{
- struct rtable *rt = skb_rtable(skb);
- struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
- struct iphdr *iph = ip_hdr(skb);
-
- if (rt_is_input_route(rt)) {
- unsigned long orefdst = skb->_skb_refdst;
-
- if (ip_route_input(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev))
- return 0;
- refdst_drop(orefdst);
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ df = iph->frag_off & htons(IP_DF);
} else {
- struct flowi4 fl4 = {
- .daddr = iph->daddr,
- .saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
- .flowi4_mark = skb->mark,
- };
-
- rt = ip_route_output_key(net, &fl4);
- if (IS_ERR(rt))
- return 0;
- if (!(rt->rt_flags & RTCF_LOCAL)) {
- ip_rt_put(rt);
- return 0;
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
}
- /* Drop old route. */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ort = skb_rtable(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+ /* MTU check allowed? */
+ df = sysctl_pmtu_disc(ipvs) ? iph->frag_off & htons(IP_DF) : 0;
}
- return 1;
+
+ /* MTU checking */
+ if (unlikely(df && skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n", &iph->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -294,44 +337,57 @@ out_err:
/*
* Get route to destination or remote server
*/
-static struct rt6_info *
+static int
__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
struct in6_addr *daddr, struct in6_addr *ret_saddr,
- int do_xfrm, int rt_mode)
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
{
struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
struct rt6_info *rt; /* Route to the other host */
struct rt6_info *ort; /* Original route */
struct dst_entry *dst;
- int local;
+ int mtu;
+ int local, noref = 1;
if (dest) {
- spin_lock(&dest->dst_lock);
- rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0);
- if (!rt) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
u32 cookie;
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
- &dest->dst_saddr.in6,
+ &dest_dst->dst_saddr.in6,
do_xfrm);
if (!dst) {
- spin_unlock(&dest->dst_lock);
- return NULL;
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
}
rt = (struct rt6_info *) dst;
cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
- __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie);
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
- &dest->addr.in6, &dest->dst_saddr.in6,
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
atomic_read(&rt->dst.__refcnt));
}
if (ret_saddr)
- *ret_saddr = dest->dst_saddr.in6;
- spin_unlock(&dest->dst_lock);
+ *ret_saddr = dest_dst->dst_saddr.in6;
} else {
+ noref = 0;
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
if (!dst)
- return NULL;
+ goto err_unreach;
rt = (struct rt6_info *) dst;
}
@@ -340,86 +396,137 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest,
rt_mode)) {
IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6c\n",
local ? "local":"non-local", daddr);
- dst_release(&rt->dst);
- return NULL;
+ goto err_put;
}
- if (local && !(rt_mode & IP_VS_RT_MODE_RDR) &&
- !((ort = (struct rt6_info *) skb_dst(skb)) &&
- __ip_vs_is_local_route6(ort))) {
- IP_VS_DBG_RL("Redirect from non-local address %pI6c to local "
- "requires NAT method, dest: %pI6c\n",
- &ipv6_hdr(skb)->daddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+ if (likely(!local)) {
+ if (unlikely((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
+ IPV6_ADDR_LOOPBACK)) {
+ IP_VS_DBG_RL("Stopping traffic from loopback address "
+ "%pI6c to non-local address, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->saddr, daddr);
+ goto err_put;
+ }
+ } else {
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!(rt_mode & IP_VS_RT_MODE_RDR) &&
+ !__ip_vs_is_local_route6(ort)) {
+ IP_VS_DBG_RL("Redirect from non-local address %pI6c "
+ "to local requires NAT method, "
+ "dest: %pI6c\n",
+ &ipv6_hdr(skb)->daddr, daddr);
+ goto err_put;
+ }
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
}
- if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
- ipv6_addr_type(&ipv6_hdr(skb)->saddr) &
- IPV6_ADDR_LOOPBACK)) {
- IP_VS_DBG_RL("Stopping traffic from loopback address %pI6c "
- "to non-local address, dest: %pI6c\n",
- &ipv6_hdr(skb)->saddr, daddr);
- dst_release(&rt->dst);
- return NULL;
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ struct sock *sk = skb->sk;
+
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ ort = (struct rt6_info *) skb_dst(skb);
+ if (!skb->dev && sk && sk->sk_state != TCP_TIME_WAIT)
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
}
- return rt;
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n", &ipv6_hdr(skb)->saddr);
+ goto err_put;
+ }
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref_force(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
}
#endif
-/*
- * Release dest->dst_cache before a dest is removed
- */
-void
-ip_vs_dst_reset(struct ip_vs_dest *dest)
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
{
- struct dst_entry *old_dst;
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
- old_dst = dest->dst_cache;
- dest->dst_cache = NULL;
- dst_release(old_dst);
- dest->dst_saddr.ip = 0;
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
}
-#define IP_VS_XMIT_TUNNEL(skb, cp) \
-({ \
- int __ret = NF_ACCEPT; \
- \
- (skb)->ipvs_property = 1; \
- if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
- __ret = ip_vs_confirm_conntrack(skb); \
- if (__ret == NF_ACCEPT) { \
- nf_reset(skb); \
- skb_forward_csum(skb); \
- } \
- __ret; \
-})
-
-#define IP_VS_XMIT_NAT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- else \
- ip_vs_update_conntrack(skb, cp, 1); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
-
-#define IP_VS_XMIT(pf, skb, cp, local) \
-do { \
- (skb)->ipvs_property = 1; \
- if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
- ip_vs_notrack(skb); \
- if (local) \
- return NF_ACCEPT; \
- skb_forward_csum(skb); \
- NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- skb_dst(skb)->dev, dst_output); \
-} while (0)
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
+ dst_output);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
/*
@@ -430,7 +537,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
/* we do not touch skb and do not need pskb ptr */
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
@@ -443,52 +550,29 @@ int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
struct iphdr *iph = ip_hdr(skb);
- int mtu;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos),
- IP_VS_RT_MODE_NON_LOCAL, NULL)))
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- ip_rt_put(rt);
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL,
+ NULL) < 0)
goto tx_error;
- }
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
- ip_send_check(ip_hdr(skb));
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
+ ip_send_check(iph);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -496,60 +580,27 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
-
EnterFunction(10);
- rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr.in6, NULL, 0,
- IP_VS_RT_MODE_NON_LOCAL);
- if (!rt)
- goto tx_error_icmp;
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
goto tx_error;
- }
-
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -564,29 +615,30 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
- struct iphdr *iph = ip_hdr(skb);
- int local;
+ int local, rc, was_input;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -600,57 +652,31 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
"ip_vs_nat_xmit(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
- "ip_vs_nat_xmit(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct iphdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
- goto tx_error_put;
+ goto tx_error;
ip_hdr(skb)->daddr = cp->daddr.ip;
ip_send_check(ip_hdr(skb));
- if (!local) {
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -660,49 +686,48 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
- int local;
+ int local, rc;
EnterFunction(10);
+ rcu_read_lock();
/* check if it is a connection of no-client-port */
- if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !iph->fragoffs)) {
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
__be16 _pt, *p;
- p = skb_header_pointer(skb, iph->len, sizeof(_pt), &_pt);
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
if (p == NULL)
goto tx_error;
ip_vs_conn_fill_cport(cp, *p);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
}
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_RDR))))
- goto tx_error_icmp;
- local = __ip_vs_is_local_route6(rt);
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -716,7 +741,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to local address");
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -727,46 +752,21 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
"ip_vs_nat_xmit_v6(): "
"stopping DNAT to loopback address");
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0,
- "ip_vs_nat_xmit_v6(): frag needed for");
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
/* mangle the packet */
- if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, iph))
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
goto tx_error;
ipv6_hdr(skb)->daddr = cp->daddr.in6;
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
@@ -776,20 +776,17 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
LeaveFunction(10);
- return NF_STOLEN;
+ return rc;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
LeaveFunction(10);
kfree_skb(skb);
+ rcu_read_unlock();
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -826,56 +823,40 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
__be16 df;
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(tos), IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_CONNECT,
- &saddr)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
+ rt = skb_rtable(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
- if (mtu < 68) {
- IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
- goto tx_error_put;
- }
- if (rt_is_output_route(skb_rtable(skb)))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
/* Copy DF, reset fragment offset and MF */
df = sysctl_pmtu_disc(ipvs) ? old_iph->frag_off & htons(IP_DF) : 0;
- if (df && mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- ip_rt_put(rt);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
+
+ if (!new_skb)
+ goto tx_error;
consume_skb(skb);
skb = new_skb;
old_iph = ip_hdr(skb);
@@ -890,10 +871,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -911,25 +888,22 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
@@ -943,60 +917,37 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *old_iph = ipv6_hdr(skb);
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
- int mtu;
- int ret;
+ int ret, local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
- &saddr, 1, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
+ rt = (struct rt6_info *) skb_dst(skb);
tdev = rt->dst.dev;
- mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
- if (mtu < IPV6_MIN_MTU) {
- IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
- IPV6_MIN_MTU);
- goto tx_error_put;
- }
- if (skb_dst(skb))
- skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
- /* MTU checking: Notice that 'mtu' have been adjusted before hand */
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!ipvsh->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
- }
-
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
- if (skb_headroom(skb) < max_headroom
- || skb_cloned(skb) || skb_shared(skb)) {
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
struct sk_buff *new_skb =
skb_realloc_headroom(skb, max_headroom);
- if (!new_skb) {
- dst_release(&rt->dst);
- kfree_skb(skb);
- IP_VS_ERR_RL("%s(): no memory\n", __func__);
- return NF_STOLEN;
- }
+
+ if (!new_skb)
+ goto tx_error;
consume_skb(skb);
skb = new_skb;
old_iph = ipv6_hdr(skb);
@@ -1008,10 +959,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/*
* Push down and install the IPIP header.
*/
@@ -1029,25 +976,22 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
if (ret == NF_ACCEPT)
ip6_local_out(skb);
else if (ret == NF_DROP)
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
@@ -1060,59 +1004,36 @@ int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rtable *rt; /* Route to the other host */
- struct iphdr *iph = ip_hdr(skb);
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(iph->tos),
- IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL |
- IP_VS_RT_MODE_KNOWN_NH, NULL)))
- goto tx_error_icmp;
- if (rt->rt_flags & RTCF_LOCAL) {
- ip_rt_put(rt);
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
- ip_rt_put(rt);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
- ip_rt_put(rt);
- return NF_STOLEN;
- }
ip_send_check(ip_hdr(skb));
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1120,64 +1041,36 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
- struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph)
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
- struct rt6_info *rt; /* Route to the other host */
- int mtu;
+ int local;
EnterFunction(10);
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, (IP_VS_RT_MODE_LOCAL |
- IP_VS_RT_MODE_NON_LOCAL))))
- goto tx_error_icmp;
- if (__ip_vs_is_local_route6(rt)) {
- dst_release(&rt->dst);
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1);
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- dst_release(&rt->dst);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
}
- /*
- * Call ip_send_check because we are not sure it is called
- * after ip_defrag. Is copy-on-write needed?
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(skb == NULL)) {
- dst_release(&rt->dst);
- return NF_STOLEN;
- }
-
- /* drop old route */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0);
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
kfree_skb(skb);
+ rcu_read_unlock();
LeaveFunction(10);
return NF_STOLEN;
}
@@ -1194,10 +1087,9 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_iphdr *iph)
{
struct rtable *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
- int rt_mode;
+ int rt_mode, was_input;
EnterFunction(10);
@@ -1217,16 +1109,17 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/*
* mangle and send the packet here (only for VS/NAT)
*/
+ was_input = rt_is_input_route(skb_rtable(skb));
/* LOCALNODE from FORWARD hook is not supported */
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip,
- RT_TOS(ip_hdr(skb)->tos),
- rt_mode, NULL)))
- goto tx_error_icmp;
- local = rt->rt_flags & RTCF_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, rt_mode, NULL);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
@@ -1241,82 +1134,51 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
/* From world but DNAT to loopback address? */
- if (local && ipv4_is_loopback(cp->daddr.ip) &&
- rt_is_input_route(skb_rtable(skb))) {
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI4\n",
__func__, &cp->daddr.ip);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
- !skb_is_gso(skb)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp(skb, pp, cp, 0);
- if (!local) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- ip_rt_put(rt);
- /*
- * Some IPv4 replies get local address from routes,
- * not from iph, so while we DNAT after routing
- * we need this second input/output route.
- */
- if (!__ip_vs_reroute_locally(skb))
- goto tx_error;
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
goto out;
- tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
- tx_error_put:
- ip_rt_put(rt);
- goto tx_error;
}
#ifdef CONFIG_IP_VS_IPV6
int
ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
- struct ip_vs_iphdr *iph)
+ struct ip_vs_iphdr *ipvsh)
{
struct rt6_info *rt; /* Route to the other host */
- int mtu;
int rc;
int local;
int rt_mode;
@@ -1328,7 +1190,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
translate address/port back */
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
if (cp->packet_xmit)
- rc = cp->packet_xmit(skb, cp, pp, iph);
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
else
rc = NF_ACCEPT;
/* do not touch skb anymore */
@@ -1344,11 +1206,12 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
rt_mode = (hooknum != NF_INET_FORWARD) ?
IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
- if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
- 0, rt_mode)))
- goto tx_error_icmp;
-
- local = __ip_vs_is_local_route6(rt);
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL,
+ ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
/*
* Avoid duplicate tuple in reply direction for NAT traffic
* to local address when connection is sync-ed
@@ -1362,7 +1225,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(10, "%s(): "
"stopping DNAT to local address %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
+ goto tx_error;
}
}
#endif
@@ -1373,60 +1236,31 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG(1, "%s(): "
"stopping DNAT to loopback %pI6\n",
__func__, &cp->daddr.in6);
- goto tx_error_put;
- }
-
- /* MTU checking */
- mtu = dst_mtu(&rt->dst);
- if (__mtu_check_toobig_v6(skb, mtu)) {
- if (!skb->dev) {
- struct net *net = dev_net(skb_dst(skb)->dev);
-
- skb->dev = net->loopback_dev;
- }
- /* only send ICMP too big on first fragment */
- if (!iph->fragoffs)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- IP_VS_DBG_RL("%s(): frag needed\n", __func__);
- goto tx_error_put;
+ goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!skb_make_writable(skb, offset))
- goto tx_error_put;
+ goto tx_error;
if (skb_cow(skb, rt->dst.dev->hard_header_len))
- goto tx_error_put;
+ goto tx_error;
ip_vs_nat_icmp_v6(skb, pp, cp, 0);
- if (!local || !skb->dev) {
- /* drop the old route when skb is not shared */
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- } else {
- /* destined to loopback, do we need to change route? */
- dst_release(&rt->dst);
- }
-
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local);
-
- rc = NF_STOLEN;
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
goto out;
-tx_error_icmp:
- dst_link_failure(skb);
tx_error:
- dev_kfree_skb(skb);
+ kfree_skb(skb);
+ rcu_read_unlock();
rc = NF_STOLEN;
out:
LeaveFunction(10);
return rc;
-tx_error_put:
- dst_release(&rt->dst);
- goto tx_error;
}
#endif
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index dbdaa114926..b8b95f4027c 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -2,6 +2,7 @@
*
* (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
* based on HW's ip_conntrack_irc.c as well as other modules
+ * (C) 2006 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c8e001a9c45..0283baedcdf 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -5,6 +5,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -48,6 +49,7 @@
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_helper.h>
#define NF_CONNTRACK_VERSION "0.5.0"
@@ -264,7 +266,7 @@ static void death_by_event(unsigned long ul_conntrack)
if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
/* bad luck, let's retry again */
ecache->timeout.expires = jiffies +
- (random32() % net->ct.sysctl_events_retry_timeout);
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
add_timer(&ecache->timeout);
return;
}
@@ -283,7 +285,7 @@ void nf_ct_dying_timeout(struct nf_conn *ct)
/* set a new timer to retry event delivery */
setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct);
ecache->timeout.expires = jiffies +
- (random32() % net->ct.sysctl_events_retry_timeout);
+ (prandom_u32() % net->ct.sysctl_events_retry_timeout);
add_timer(&ecache->timeout);
}
EXPORT_SYMBOL_GPL(nf_ct_dying_timeout);
@@ -1259,7 +1261,7 @@ void nf_ct_iterate_cleanup(struct net *net,
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
struct __nf_ct_flush_report {
- u32 pid;
+ u32 portid;
int report;
};
@@ -1274,7 +1276,7 @@ static int kill_report(struct nf_conn *i, void *data)
/* If we fail to deliver the event, death_by_timeout() will retry */
if (nf_conntrack_event_report(IPCT_DESTROY, i,
- fr->pid, fr->report) < 0)
+ fr->portid, fr->report) < 0)
return 1;
/* Avoid the delivery of the destroy event in death_by_timeout(). */
@@ -1297,10 +1299,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size)
}
EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-void nf_conntrack_flush_report(struct net *net, u32 pid, int report)
+void nf_conntrack_flush_report(struct net *net, u32 portid, int report)
{
struct __nf_ct_flush_report fr = {
- .pid = pid,
+ .portid = portid,
.report = report,
};
nf_ct_iterate_cleanup(net, kill_report, &fr);
@@ -1364,30 +1366,48 @@ void nf_conntrack_cleanup_end(void)
*/
void nf_conntrack_cleanup_net(struct net *net)
{
+ LIST_HEAD(single);
+
+ list_add(&net->exit_list, &single);
+ nf_conntrack_cleanup_net_list(&single);
+}
+
+void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
+{
+ int busy;
+ struct net *net;
+
/*
* This makes sure all current packets have passed through
* netfilter framework. Roll on, two-stage module
* delete...
*/
synchronize_net();
- i_see_dead_people:
- nf_ct_iterate_cleanup(net, kill_all, NULL);
- nf_ct_release_dying_list(net);
- if (atomic_read(&net->ct.count) != 0) {
+i_see_dead_people:
+ busy = 0;
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_iterate_cleanup(net, kill_all, NULL);
+ nf_ct_release_dying_list(net);
+ if (atomic_read(&net->ct.count) != 0)
+ busy = 1;
+ }
+ if (busy) {
schedule();
goto i_see_dead_people;
}
- nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
- nf_conntrack_proto_pernet_fini(net);
- nf_conntrack_helper_pernet_fini(net);
- nf_conntrack_ecache_pernet_fini(net);
- nf_conntrack_tstamp_pernet_fini(net);
- nf_conntrack_acct_pernet_fini(net);
- nf_conntrack_expect_pernet_fini(net);
- kmem_cache_destroy(net->ct.nf_conntrack_cachep);
- kfree(net->ct.slabname);
- free_percpu(net->ct.stat);
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
+ nf_conntrack_proto_pernet_fini(net);
+ nf_conntrack_helper_pernet_fini(net);
+ nf_conntrack_ecache_pernet_fini(net);
+ nf_conntrack_tstamp_pernet_fini(net);
+ nf_conntrack_acct_pernet_fini(net);
+ nf_conntrack_expect_pernet_fini(net);
+ kmem_cache_destroy(net->ct.nf_conntrack_cachep);
+ kfree(net->ct.slabname);
+ free_percpu(net->ct.stat);
+ }
}
void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index b5d2eb8bf0d..1df17614656 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -1,8 +1,10 @@
/* Event cache for netfilter. */
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+/*
+ * (C) 2005 Harald Welte <laforge@gnumonks.org>
+ * (C) 2005 Patrick McHardy <kaber@trash.net>
+ * (C) 2005-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005 USAGI/WIDE Project <http://www.linux-ipv6.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 8c10e3db3d9..c63b618cd61 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (c) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -40,7 +41,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
- u32 pid, int report)
+ u32 portid, int report)
{
struct nf_conn_help *master_help = nfct_help(exp->master);
struct net *net = nf_ct_exp_net(exp);
@@ -54,7 +55,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
hlist_del(&exp->lnode);
master_help->expecting[exp->class]--;
- nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
+ nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report);
nf_ct_expect_put(exp);
NF_CT_STAT_INC(net, expect_delete);
@@ -412,7 +413,7 @@ out:
}
int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
- u32 pid, int report)
+ u32 portid, int report)
{
int ret;
@@ -425,7 +426,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
if (ret < 0)
goto out;
spin_unlock_bh(&nf_conntrack_lock);
- nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
+ nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report);
return ret;
out:
spin_unlock_bh(&nf_conntrack_lock);
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index 62fb8faedb8..6b217074237 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 7df7b36d2e2..bdebd03bc8c 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -2,6 +2,7 @@
* H.323 connection tracking helper
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
+ * Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This source code is licensed under General Public License version 2.
*
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 94b4b9853f6..974a2a4adef 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -353,7 +354,7 @@ void nf_ct_helper_log(struct sk_buff *skb, const struct nf_conn *ct,
/* rcu_read_lock()ed by nf_hook_slow */
helper = rcu_dereference(help->helper);
- nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
+ nf_log_packet(nf_ct_net(ct), nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL,
"nf_ct_%s: dropping packet: %pV ", helper->name, &vaf);
va_end(args);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 70985c5d0ff..0fd2976db7e 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -1,6 +1,7 @@
/* IRC extension for IP connection tracking, Version 1.21
* (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
* based on RR's ip_conntrack_ftp.c
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9904b15f600..6d0f8a17c5b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2409,6 +2409,92 @@ out:
return skb->len;
}
+/*
+ * Netlink dump callback: emit the expectations hanging off one master
+ * conntrack, which the dump starter stored in cb->data.
+ *
+ * Dump state kept in the callback:
+ *   cb->args[0] - set to 1 once the whole list has been walked; a later
+ *                 invocation then returns 0 immediately.
+ *   cb->args[1] - pointer to the expectation that could not be written on
+ *                 the previous pass (resume point), stored with an extra
+ *                 use count taken via atomic_inc_not_zero().
+ *
+ * Returns skb->len, or 0 when the dump had already completed.
+ *
+ * NOTE(review): help is dereferenced below without a NULL check; confirm
+ * that nfct_help() cannot return NULL for the conntrack passed in cb->data.
+ * NOTE(review): the list is walked with the plain hlist_for_each_entry()
+ * while only rcu_read_lock() is held; confirm this is safe against
+ * concurrent list updates (an _rcu iterator may be required).
+ */
+static int
+ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nf_conntrack_expect *exp, *last;
+ struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
+ struct nf_conn *ct = cb->data;
+ struct nf_conn_help *help = nfct_help(ct);
+ u_int8_t l3proto = nfmsg->nfgen_family;
+
+ /* the previous pass reached the end of the list: nothing left */
+ if (cb->args[0])
+ return 0;
+
+ rcu_read_lock();
+ /* resume point saved by the previous pass, if any */
+ last = (struct nf_conntrack_expect *)cb->args[1];
+restart:
+ hlist_for_each_entry(exp, &help->expectations, lnode) {
+ /* a non-zero family in the request acts as a filter */
+ if (l3proto && exp->tuple.src.l3num != l3proto)
+ continue;
+ /* skip forward until the resume point is found again */
+ if (cb->args[1]) {
+ if (exp != last)
+ continue;
+ cb->args[1] = 0;
+ }
+ if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ IPCTNL_MSG_EXP_NEW,
+ exp) < 0) {
+ /*
+ * Could not add this entry (presumably the skb is
+ * full): remember it as the next resume point, but
+ * only if a use count can still be taken on it.
+ */
+ if (!atomic_inc_not_zero(&exp->use))
+ continue;
+ cb->args[1] = (unsigned long)exp;
+ goto out;
+ }
+ }
+ /*
+ * The resume point was never matched during the walk: forget it and
+ * walk the list again from the head.
+ */
+ if (cb->args[1]) {
+ cb->args[1] = 0;
+ goto restart;
+ }
+ /* complete list emitted; mark the dump as finished */
+ cb->args[0] = 1;
+out:
+ rcu_read_unlock();
+ /* drop the use count that was held for the old resume point */
+ if (last)
+ nf_ct_expect_put(last);
+
+ return skb->len;
+}
+
+/*
+ * Start a dump of the expectations that belong to a single master
+ * conntrack, selected by the CTA_EXPECT_MASTER tuple (and the optional
+ * CTA_EXPECT_ZONE) of the request.
+ *
+ * Returns a negative errno when the attributes cannot be parsed, -ENOENT
+ * when no conntrack matches the tuple, 0 when the conntrack has no helper
+ * extension (nothing to dump), otherwise the netlink_dump_start() result.
+ */
+static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+				 const struct nlmsghdr *nlh,
+				 const struct nlattr * const cda[])
+{
+	int err;
+	struct net *net = sock_net(ctnl);
+	struct nfgenmsg *nfmsg = nlmsg_data(nlh);
+	u_int8_t u3 = nfmsg->nfgen_family;
+	struct nf_conntrack_tuple tuple;
+	struct nf_conntrack_tuple_hash *h;
+	struct nf_conn *ct;
+	u16 zone = 0;
+	struct netlink_dump_control c = {
+		.dump = ctnetlink_exp_ct_dump_table,
+		.done = ctnetlink_exp_done,
+	};
+
+	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3);
+	if (err < 0)
+		return err;
+
+	if (cda[CTA_EXPECT_ZONE]) {
+		err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
+		if (err < 0)
+			return err;
+	}
+
+	h = nf_conntrack_find_get(net, zone, &tuple);
+	if (!h)
+		return -ENOENT;
+
+	ct = nf_ct_tuplehash_to_ctrack(h);
+
+	/*
+	 * The dump callback walks nfct_help(ct)->expectations without
+	 * checking the helper extension, so a conntrack that has none must
+	 * be rejected here; it has no expectations to report anyway.
+	 */
+	if (!nfct_help(ct)) {
+		nf_ct_put(ct);
+		return 0;
+	}
+
+	c.data = ct;
+
+	/*
+	 * NOTE(review): the reference on ct is dropped right after
+	 * netlink_dump_start() while cb->data keeps pointing at it; confirm
+	 * that no later invocation of the dump callback can run after this
+	 * put.
+	 */
+	err = netlink_dump_start(ctnl, skb, nlh, &c);
+	nf_ct_put(ct);
+
+	return err;
+}
+
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = {
[CTA_EXPECT_MASTER] = { .type = NLA_NESTED },
[CTA_EXPECT_TUPLE] = { .type = NLA_NESTED },
@@ -2439,11 +2525,15 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
int err;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- struct netlink_dump_control c = {
- .dump = ctnetlink_exp_dump_table,
- .done = ctnetlink_exp_done,
- };
- return netlink_dump_start(ctnl, skb, nlh, &c);
+ if (cda[CTA_EXPECT_MASTER])
+ return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ else {
+ struct netlink_dump_control c = {
+ .dump = ctnetlink_exp_dump_table,
+ .done = ctnetlink_exp_done,
+ };
+ return netlink_dump_start(ctnl, skb, nlh, &c);
+ }
}
err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index e6678d2b624..7bd03decd36 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -11,6 +11,8 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
+ *
* Limitations:
* - We blindly assume that control connections are always
* established in PNS->PAC direction. This is a violation
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 58ab4050830..0ab9636ac57 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -3,6 +3,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index ba65b2041eb..a99b6c3427b 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -456,7 +456,8 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, nf_ct_l3num(ct), 0, skb, NULL, NULL,
+ NULL, msg);
return false;
}
@@ -542,13 +543,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid packet ignored ");
return NF_ACCEPT;
case CT_DCCP_INVALID:
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_dccp: invalid state transition ");
return -NF_ACCEPT;
}
@@ -613,7 +614,7 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl,
out_invalid:
if (LOG_INVALID(net, IPPROTO_DCCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg);
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL, msg);
return -NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index 155ce9f8a0d..9d9c0dade60 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -21,6 +21,7 @@
*
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*/
#include <linux/module.h>
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index ec83536def9..1314d33f6bc 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -1,6 +1,9 @@
/*
* Connection tracking protocol helper module for SCTP.
*
+ * Copyright (c) 2004 Kiran Kumar Immidi <immidi_kiran@yahoo.com>
+ * Copyright (c) 2004-2012 Patrick McHardy <kaber@trash.net>
+ *
* SCTP is defined in RFC 2960. References to various sections in this code
* are to this RFC.
*
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 83876e9877f..4d4d8f1d01f 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1,5 +1,7 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2002-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -720,7 +722,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
tn->tcp_be_liberal)
res = true;
if (!res && LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: %s ",
before(seq, sender->td_maxend + 1) ?
after(end, sender->td_end - receiver->td_maxwin - 1) ?
@@ -772,7 +774,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
if (th == NULL) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: short packet ");
return -NF_ACCEPT;
}
@@ -780,7 +782,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
/* Not whole TCP header or malformed packet */
if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -793,7 +795,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: bad TCP checksum ");
return -NF_ACCEPT;
}
@@ -802,7 +804,7 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
tcpflags = (tcp_flag_byte(th) & ~(TCPHDR_ECE|TCPHDR_CWR|TCPHDR_PSH));
if (!tcp_valid_flags[tcpflags]) {
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid TCP flag combination ");
return -NF_ACCEPT;
}
@@ -949,7 +951,7 @@ static int tcp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid packet ignored in "
"state %s ", tcp_conntrack_names[old_state]);
return NF_ACCEPT;
@@ -959,7 +961,7 @@ static int tcp_packet(struct nf_conn *ct,
dir, get_conntrack_index(th), old_state);
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_tcp: invalid state ");
return -NF_ACCEPT;
case TCP_CONNTRACK_CLOSE:
@@ -969,8 +971,8 @@ static int tcp_packet(struct nf_conn *ct,
/* Invalid RST */
spin_unlock_bh(&ct->lock);
if (LOG_INVALID(net, IPPROTO_TCP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
- "nf_ct_tcp: invalid RST ");
+ nf_log_packet(net, pf, 0, skb, NULL, NULL,
+ NULL, "nf_ct_tcp: invalid RST ");
return -NF_ACCEPT;
}
if (index == TCP_RST_SET
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index 59623cc56e8..9d7721cbce4 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -119,7 +120,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: short packet ");
return -NF_ACCEPT;
}
@@ -127,7 +128,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
/* Truncated/malformed packets */
if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: truncated/malformed packet ");
return -NF_ACCEPT;
}
@@ -143,7 +144,7 @@ static int udp_error(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) {
if (LOG_INVALID(net, IPPROTO_UDP))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udp: bad UDP checksum ");
return -NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index ca969f6273f..2750e6c69f8 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -131,7 +131,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
if (hdr == NULL) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: short packet ");
return -NF_ACCEPT;
}
@@ -141,7 +141,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
cscov = udplen;
else if (cscov < sizeof(*hdr) || cscov > udplen) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: invalid checksum coverage ");
return -NF_ACCEPT;
}
@@ -149,7 +149,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
/* UDPLITE mandates checksums */
if (!hdr->check) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: checksum missing ");
return -NF_ACCEPT;
}
@@ -159,7 +159,7 @@ static int udplite_error(struct net *net, struct nf_conn *tmpl,
nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP,
pf)) {
if (LOG_INVALID(net, IPPROTO_UDPLITE))
- nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+ nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
"nf_ct_udplite: bad UDPLite checksum ");
return -NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index fedee394366..bd700b4013c 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,5 +1,6 @@
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -545,16 +546,20 @@ out_init:
return ret;
}
-static void nf_conntrack_pernet_exit(struct net *net)
+static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
{
- nf_conntrack_standalone_fini_sysctl(net);
- nf_conntrack_standalone_fini_proc(net);
- nf_conntrack_cleanup_net(net);
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ nf_conntrack_standalone_fini_sysctl(net);
+ nf_conntrack_standalone_fini_proc(net);
+ }
+ nf_conntrack_cleanup_net_list(net_exit_list);
}
static struct pernet_operations nf_conntrack_net_ops = {
- .init = nf_conntrack_pernet_init,
- .exit = nf_conntrack_pernet_exit,
+ .init = nf_conntrack_pernet_init,
+ .exit_batch = nf_conntrack_pernet_exit,
};
static int __init nf_conntrack_standalone_init(void)
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index e9936c83020..e68ab4fbd71 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -1,5 +1,5 @@
/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
- *
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 9e312695c81..3b18dd1be7d 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,6 @@
#define NF_LOG_PREFIXLEN 128
#define NFLOGGER_NAME_LEN 64
-static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
static DEFINE_MUTEX(nf_log_mutex);
@@ -32,13 +31,46 @@ static struct nf_logger *__find_logger(int pf, const char *str_logger)
return NULL;
}
+/*
+ * nf_log_set - make @logger the logger of @net for family @pf, but only if
+ * no logger is bound to that family yet (an existing binding is kept).
+ *
+ * Nothing is done for NFPROTO_UNSPEC or for a family that lies outside the
+ * per-netns nf_loggers[] table.
+ */
+void nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger)
+{
+	const struct nf_logger *log;
+
+	/*
+	 * pf is used as an index into nf_loggers[] below, so apply the same
+	 * upper bound that nf_log_bind_pf()/nf_log_unbind_pf() enforce.
+	 */
+	if (pf == NFPROTO_UNSPEC || pf >= ARRAY_SIZE(net->nf.nf_loggers))
+		return;
+
+	mutex_lock(&nf_log_mutex);
+	log = rcu_dereference_protected(net->nf.nf_loggers[pf],
+					lockdep_is_held(&nf_log_mutex));
+	if (log == NULL)
+		rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
+
+	mutex_unlock(&nf_log_mutex);
+}
+EXPORT_SYMBOL(nf_log_set);
+
+/*
+ * nf_log_unset - clear every per-family slot of @net that currently points
+ * at @logger, then wait for an RCU grace period before returning.
+ */
+void nf_log_unset(struct net *net, const struct nf_logger *logger)
+{
+	const struct nf_logger *bound;
+	int pf;
+
+	mutex_lock(&nf_log_mutex);
+	for (pf = 0; pf < NFPROTO_NUMPROTO; pf++) {
+		bound = rcu_dereference_protected(net->nf.nf_loggers[pf],
+						  lockdep_is_held(&nf_log_mutex));
+		/* slots bound to some other logger (or to none) stay as is */
+		if (bound != logger)
+			continue;
+		RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
+	}
+	mutex_unlock(&nf_log_mutex);
+
+	/* the grace period is waited for outside the mutex */
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(nf_log_unset);
+
/* return EEXIST if the same logger is registered, 0 on success. */
int nf_log_register(u_int8_t pf, struct nf_logger *logger)
{
- const struct nf_logger *llog;
int i;
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(init_net.nf.nf_loggers))
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(logger->list); i++)
@@ -52,10 +84,6 @@ int nf_log_register(u_int8_t pf, struct nf_logger *logger)
} else {
/* register at end of list to honor first register win */
list_add_tail(&logger->list[pf], &nf_loggers_l[pf]);
- llog = rcu_dereference_protected(nf_loggers[pf],
- lockdep_is_held(&nf_log_mutex));
- if (llog == NULL)
- rcu_assign_pointer(nf_loggers[pf], logger);
}
mutex_unlock(&nf_log_mutex);
@@ -66,49 +94,43 @@ EXPORT_SYMBOL(nf_log_register);
void nf_log_unregister(struct nf_logger *logger)
{
- const struct nf_logger *c_logger;
int i;
mutex_lock(&nf_log_mutex);
- for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) {
- c_logger = rcu_dereference_protected(nf_loggers[i],
- lockdep_is_held(&nf_log_mutex));
- if (c_logger == logger)
- RCU_INIT_POINTER(nf_loggers[i], NULL);
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
list_del(&logger->list[i]);
- }
mutex_unlock(&nf_log_mutex);
-
- synchronize_rcu();
}
EXPORT_SYMBOL(nf_log_unregister);
-int nf_log_bind_pf(u_int8_t pf, const struct nf_logger *logger)
+int nf_log_bind_pf(struct net *net, u_int8_t pf,
+ const struct nf_logger *logger)
{
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
return -EINVAL;
mutex_lock(&nf_log_mutex);
if (__find_logger(pf, logger->name) == NULL) {
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[pf], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[pf], logger);
mutex_unlock(&nf_log_mutex);
return 0;
}
EXPORT_SYMBOL(nf_log_bind_pf);
-void nf_log_unbind_pf(u_int8_t pf)
+void nf_log_unbind_pf(struct net *net, u_int8_t pf)
{
- if (pf >= ARRAY_SIZE(nf_loggers))
+ if (pf >= ARRAY_SIZE(net->nf.nf_loggers))
return;
mutex_lock(&nf_log_mutex);
- RCU_INIT_POINTER(nf_loggers[pf], NULL);
+ RCU_INIT_POINTER(net->nf.nf_loggers[pf], NULL);
mutex_unlock(&nf_log_mutex);
}
EXPORT_SYMBOL(nf_log_unbind_pf);
-void nf_log_packet(u_int8_t pf,
+void nf_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -121,12 +143,12 @@ void nf_log_packet(u_int8_t pf,
const struct nf_logger *logger;
rcu_read_lock();
- logger = rcu_dereference(nf_loggers[pf]);
+ logger = rcu_dereference(net->nf.nf_loggers[pf]);
if (logger) {
va_start(args, fmt);
vsnprintf(prefix, sizeof(prefix), fmt, args);
va_end(args);
- logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix);
+ logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
}
rcu_read_unlock();
}
@@ -135,9 +157,11 @@ EXPORT_SYMBOL(nf_log_packet);
#ifdef CONFIG_PROC_FS
static void *seq_start(struct seq_file *seq, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
+
mutex_lock(&nf_log_mutex);
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -145,9 +169,11 @@ static void *seq_start(struct seq_file *seq, loff_t *pos)
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(s);
+
(*pos)++;
- if (*pos >= ARRAY_SIZE(nf_loggers))
+ if (*pos >= ARRAY_SIZE(net->nf.nf_loggers))
return NULL;
return pos;
@@ -164,8 +190,9 @@ static int seq_show(struct seq_file *s, void *v)
const struct nf_logger *logger;
struct nf_logger *t;
int ret;
+ struct net *net = seq_file_net(s);
- logger = rcu_dereference_protected(nf_loggers[*pos],
+ logger = rcu_dereference_protected(net->nf.nf_loggers[*pos],
lockdep_is_held(&nf_log_mutex));
if (!logger)
@@ -199,7 +226,8 @@ static const struct seq_operations nflog_seq_ops = {
static int nflog_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &nflog_seq_ops);
+ return seq_open_net(inode, file, &nflog_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations nflog_file_ops = {
@@ -207,7 +235,7 @@ static const struct file_operations nflog_file_ops = {
.open = nflog_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
};
@@ -216,7 +244,6 @@ static const struct file_operations nflog_file_ops = {
#ifdef CONFIG_SYSCTL
static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3];
static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1];
-static struct ctl_table_header *nf_log_dir_header;
static int nf_log_proc_dostring(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -226,6 +253,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
size_t size = *lenp;
int r = 0;
int tindex = (unsigned long)table->extra1;
+ struct net *net = current->nsproxy->net_ns;
if (write) {
if (size > sizeof(buf))
@@ -234,7 +262,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return -EFAULT;
if (!strcmp(buf, "NONE")) {
- nf_log_unbind_pf(tindex);
+ nf_log_unbind_pf(net, tindex);
return 0;
}
mutex_lock(&nf_log_mutex);
@@ -243,11 +271,11 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
return -ENOENT;
}
- rcu_assign_pointer(nf_loggers[tindex], logger);
+ rcu_assign_pointer(net->nf.nf_loggers[tindex], logger);
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = rcu_dereference_protected(nf_loggers[tindex],
+ logger = rcu_dereference_protected(net->nf.nf_loggers[tindex],
lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
@@ -260,49 +288,114 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
return r;
}
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
int i;
-
- for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
- snprintf(nf_log_sysctl_fnames[i-NFPROTO_UNSPEC], 3, "%d", i);
- nf_log_sysctl_table[i].procname =
- nf_log_sysctl_fnames[i-NFPROTO_UNSPEC];
- nf_log_sysctl_table[i].data = NULL;
- nf_log_sysctl_table[i].maxlen =
- NFLOGGER_NAME_LEN * sizeof(char);
- nf_log_sysctl_table[i].mode = 0644;
- nf_log_sysctl_table[i].proc_handler = nf_log_proc_dostring;
- nf_log_sysctl_table[i].extra1 = (void *)(unsigned long) i;
+ struct ctl_table *table;
+
+ table = nf_log_sysctl_table;
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(nf_log_sysctl_table,
+ sizeof(nf_log_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+ } else {
+ for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++) {
+ snprintf(nf_log_sysctl_fnames[i],
+ 3, "%d", i);
+ nf_log_sysctl_table[i].procname =
+ nf_log_sysctl_fnames[i];
+ nf_log_sysctl_table[i].data = NULL;
+ nf_log_sysctl_table[i].maxlen =
+ NFLOGGER_NAME_LEN * sizeof(char);
+ nf_log_sysctl_table[i].mode = 0644;
+ nf_log_sysctl_table[i].proc_handler =
+ nf_log_proc_dostring;
+ nf_log_sysctl_table[i].extra1 =
+ (void *)(unsigned long) i;
+ }
}
- nf_log_dir_header = register_net_sysctl(&init_net, "net/netfilter/nf_log",
- nf_log_sysctl_table);
- if (!nf_log_dir_header)
- return -ENOMEM;
+ net->nf.nf_log_dir_header = register_net_sysctl(net,
+ "net/netfilter/nf_log",
+ table);
+ if (!net->nf.nf_log_dir_header)
+ goto err_reg;
return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->nf.nf_log_dir_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->nf.nf_log_dir_header);
+ if (!net_eq(net, &init_net))
+ kfree(table);
}
#else
-static __init int netfilter_log_sysctl_init(void)
+static int netfilter_log_sysctl_init(struct net *net)
{
return 0;
}
+
+static void netfilter_log_sysctl_exit(struct net *net)
+{
+}
#endif /* CONFIG_SYSCTL */
-int __init netfilter_log_init(void)
+static int __net_init nf_log_net_init(struct net *net)
{
- int i, r;
+ int ret = -ENOMEM;
+
#ifdef CONFIG_PROC_FS
if (!proc_create("nf_log", S_IRUGO,
- proc_net_netfilter, &nflog_file_ops))
- return -1;
+ net->nf.proc_netfilter, &nflog_file_ops))
+ return ret;
+#endif
+ ret = netfilter_log_sysctl_init(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+#ifdef CONFIG_PROC_FS
+ /* For init_net: errors will trigger panic, don't unroll on error. */
+ if (!net_eq(net, &init_net))
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
#endif
+ return ret;
+}
+
+static void __net_exit nf_log_net_exit(struct net *net)
+{
+ netfilter_log_sysctl_exit(net);
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nf_log", net->nf.proc_netfilter);
+#endif
+}
- /* Errors will trigger panic, unroll on error is unnecessary. */
- r = netfilter_log_sysctl_init();
- if (r < 0)
- return r;
+static struct pernet_operations nf_log_net_ops = {
+ .init = nf_log_net_init,
+ .exit = nf_log_net_exit,
+};
+
+int __init netfilter_log_init(void)
+{
+ int i, ret;
+
+ ret = register_pernet_subsys(&nf_log_net_ops);
+ if (ret < 0)
+ return ret;
for (i = NFPROTO_UNSPEC; i < NFPROTO_NUMPROTO; i++)
INIT_LIST_HEAD(&(nf_loggers_l[i]));
diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c
index 3b67c9d1127..eb772380a20 100644
--- a/net/netfilter/nf_nat_amanda.c
+++ b/net/netfilter/nf_nat_amanda.c
@@ -1,6 +1,7 @@
/* Amanda extension for TCP NAT alteration.
* (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
* based on a copy of HW's ip_nat_irc.c as well as other modules
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index ad24be070e5..038eee5c8f8 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -87,9 +87,11 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
struct flowi fl;
unsigned int hh_len;
struct dst_entry *dst;
+ int err;
- if (xfrm_decode_session(skb, &fl, family) < 0)
- return -1;
+ err = xfrm_decode_session(skb, &fl, family);
+ if (err < 0)
+ return err;
dst = skb_dst(skb);
if (dst->xfrm)
@@ -98,7 +100,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
dst = xfrm_lookup(dev_net(dst->dev), dst, &fl, skb->sk, 0);
if (IS_ERR(dst))
- return -1;
+ return PTR_ERR(dst);
skb_dst_drop(skb);
skb_dst_set(skb, dst);
@@ -107,7 +109,7 @@ int nf_xfrm_me_harder(struct sk_buff *skb, unsigned int family)
hh_len = skb_dst(skb)->dev->hard_header_len;
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC))
- return -1;
+ return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(nf_xfrm_me_harder);
diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c
index 23c2b38676a..5fea563afe3 100644
--- a/net/netfilter/nf_nat_helper.c
+++ b/net/netfilter/nf_nat_helper.c
@@ -2,6 +2,7 @@
*
* (C) 2000-2002 Harald Welte <laforge@netfilter.org>
* (C) 2003-2006 Netfilter Core Team <coreteam@netfilter.org>
+ * (C) 2007-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index e64faa5ca89..396e55d46f9 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -36,7 +36,7 @@ sctp_manip_pkt(struct sk_buff *skb,
{
struct sk_buff *frag;
sctp_sctphdr_t *hdr;
- __be32 crc32;
+ __u32 crc32;
if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
return false;
@@ -55,8 +55,7 @@ sctp_manip_pkt(struct sk_buff *skb,
skb_walk_frags(skb, frag)
crc32 = sctp_update_cksum((u8 *)frag->data, skb_headlen(frag),
crc32);
- crc32 = sctp_end_cksum(crc32);
- hdr->checksum = crc32;
+ hdr->checksum = sctp_end_cksum(crc32);
return true;
}
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index d812c1235b3..5d24b1fdb59 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -1,3 +1,8 @@
+/*
+ * Rusty Russell (C)2000 -- This code is GPL.
+ * Patrick McHardy (c) 2006-2012
+ */
+
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
@@ -40,7 +45,7 @@ void nf_unregister_queue_handler(void)
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
/* Release those devices we held, or Alexey will kill me. */
if (entry->indev)
@@ -60,12 +65,41 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
/* Drop reference to owner of hook which queued us. */
module_put(entry->elem->owner);
}
+EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+
+/* Bump dev refs so they don't vanish while packet is out */
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+{
+ if (!try_module_get(entry->elem->owner))
+ return false;
+
+ if (entry->indev)
+ dev_hold(entry->indev);
+ if (entry->outdev)
+ dev_hold(entry->outdev);
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (entry->skb->nf_bridge) {
+ struct nf_bridge_info *nf_bridge = entry->skb->nf_bridge;
+ struct net_device *physdev;
+
+ physdev = nf_bridge->physindev;
+ if (physdev)
+ dev_hold(physdev);
+ physdev = nf_bridge->physoutdev;
+ if (physdev)
+ dev_hold(physdev);
+ }
+#endif
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
/*
* Any packet that leaves via this function must come back
* through nf_reinject().
*/
-static int __nf_queue(struct sk_buff *skb,
+int nf_queue(struct sk_buff *skb,
struct nf_hook_ops *elem,
u_int8_t pf, unsigned int hook,
struct net_device *indev,
@@ -75,10 +109,6 @@ static int __nf_queue(struct sk_buff *skb,
{
int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
-#ifdef CONFIG_BRIDGE_NETFILTER
- struct net_device *physindev;
- struct net_device *physoutdev;
-#endif
const struct nf_afinfo *afinfo;
const struct nf_queue_handler *qh;
@@ -109,28 +139,13 @@ static int __nf_queue(struct sk_buff *skb,
.indev = indev,
.outdev = outdev,
.okfn = okfn,
+ .size = sizeof(*entry) + afinfo->route_key_size,
};
- /* If it's going away, ignore hook. */
- if (!try_module_get(entry->elem->owner)) {
+ if (!nf_queue_entry_get_refs(entry)) {
status = -ECANCELED;
goto err_unlock;
}
- /* Bump dev refs so they don't vanish while packet is out */
- if (indev)
- dev_hold(indev);
- if (outdev)
- dev_hold(outdev);
-#ifdef CONFIG_BRIDGE_NETFILTER
- if (skb->nf_bridge) {
- physindev = skb->nf_bridge->physindev;
- if (physindev)
- dev_hold(physindev);
- physoutdev = skb->nf_bridge->physoutdev;
- if (physoutdev)
- dev_hold(physoutdev);
- }
-#endif
skb_dst_force(skb);
afinfo->saveroute(skb, entry);
status = qh->outfn(entry, queuenum);
@@ -151,87 +166,6 @@ err:
return status;
}
-#ifdef CONFIG_BRIDGE_NETFILTER
-/* When called from bridge netfilter, skb->data must point to MAC header
- * before calling skb_gso_segment(). Else, original MAC header is lost
- * and segmented skbs will be sent to wrong destination.
- */
-static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
-{
- if (skb->nf_bridge)
- __skb_push(skb, skb->network_header - skb->mac_header);
-}
-
-static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
-{
- if (skb->nf_bridge)
- __skb_pull(skb, skb->network_header - skb->mac_header);
-}
-#else
-#define nf_bridge_adjust_skb_data(s) do {} while (0)
-#define nf_bridge_adjust_segmented_data(s) do {} while (0)
-#endif
-
-int nf_queue(struct sk_buff *skb,
- struct nf_hook_ops *elem,
- u_int8_t pf, unsigned int hook,
- struct net_device *indev,
- struct net_device *outdev,
- int (*okfn)(struct sk_buff *),
- unsigned int queuenum)
-{
- struct sk_buff *segs;
- int err = -EINVAL;
- unsigned int queued;
-
- if (!skb_is_gso(skb))
- return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
- queuenum);
-
- switch (pf) {
- case NFPROTO_IPV4:
- skb->protocol = htons(ETH_P_IP);
- break;
- case NFPROTO_IPV6:
- skb->protocol = htons(ETH_P_IPV6);
- break;
- }
-
- nf_bridge_adjust_skb_data(skb);
- segs = skb_gso_segment(skb, 0);
- /* Does not use PTR_ERR to limit the number of error codes that can be
- * returned by nf_queue. For instance, callers rely on -ECANCELED to mean
- * 'ignore this hook'.
- */
- if (IS_ERR(segs))
- goto out_err;
- queued = 0;
- err = 0;
- do {
- struct sk_buff *nskb = segs->next;
-
- segs->next = NULL;
- if (err == 0) {
- nf_bridge_adjust_segmented_data(segs);
- err = __nf_queue(segs, elem, pf, hook, indev,
- outdev, okfn, queuenum);
- }
- if (err == 0)
- queued++;
- else
- kfree_skb(segs);
- segs = nskb;
- } while (segs);
-
- if (queued) {
- kfree_skb(skb);
- return 0;
- }
- out_err:
- nf_bridge_adjust_segmented_data(skb);
- return err;
-}
-
void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
struct sk_buff *skb = entry->skb;
@@ -271,9 +205,9 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
local_bh_enable();
break;
case NF_QUEUE:
- err = __nf_queue(skb, elem, entry->pf, entry->hook,
- entry->indev, entry->outdev, entry->okfn,
- verdict >> NF_VERDICT_QBITS);
+ err = nf_queue(skb, elem, entry->pf, entry->hook,
+ entry->indev, entry->outdev, entry->okfn,
+ verdict >> NF_VERDICT_QBITS);
if (err < 0) {
if (err == -ECANCELED)
goto next_hook;
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 0b1b32cda30..572d87dc116 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -24,10 +24,9 @@
#include <linux/skbuff.h>
#include <asm/uaccess.h>
#include <net/sock.h>
-#include <net/netlink.h>
#include <linux/init.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
MODULE_LICENSE("GPL");
@@ -113,22 +112,30 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
-int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
+struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
+ u32 dst_portid, gfp_t gfp_mask)
+{
+ return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
+}
+EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
+
+int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
- return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags);
+ return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_send);
-int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error)
{
- return netlink_set_err(net->nfnl, pid, group, error);
+ return netlink_set_err(net->nfnl, portid, group, error);
}
EXPORT_SYMBOL_GPL(nfnetlink_set_err);
-int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags)
+int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid,
+ int flags)
{
- return netlink_unicast(net->nfnl, skb, pid, flags);
+ return netlink_unicast(net->nfnl, skb, portid, flags);
}
EXPORT_SYMBOL_GPL(nfnetlink_unicast);
@@ -144,7 +151,7 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EPERM;
/* All the messages must at least contain nfgenmsg */
- if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct nfgenmsg)))
+ if (nlmsg_len(nlh) < sizeof(struct nfgenmsg))
return 0;
type = nlh->nlmsg_type;
@@ -172,7 +179,7 @@ replay:
}
{
- int min_len = NLMSG_SPACE(sizeof(struct nfgenmsg));
+ int min_len = nlmsg_total_size(sizeof(struct nfgenmsg));
u_int8_t cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type);
struct nlattr *cda[ss->cb[cb_id].attr_count + 1];
struct nlattr *attr = (void *)nlh + min_len;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f248db57297..962e9792e31 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -3,6 +3,7 @@
* nfetlink.
*
* (C) 2005 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on the old ipv4-only ipt_ULOG.c:
* (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
@@ -19,7 +20,7 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_log.h>
#include <linux/spinlock.h>
@@ -32,6 +33,7 @@
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netfilter/nf_log.h>
+#include <net/netns/generic.h>
#include <net/netfilter/nfnetlink_log.h>
#include <linux/atomic.h>
@@ -56,6 +58,7 @@ struct nfulnl_instance {
unsigned int qlen; /* number of nlmsgs in skb */
struct sk_buff *skb; /* pre-allocatd skb */
struct timer_list timer;
+ struct net *net;
struct user_namespace *peer_user_ns; /* User namespace of the peer process */
int peer_portid; /* PORTID of the peer process */
@@ -71,25 +74,34 @@ struct nfulnl_instance {
struct rcu_head rcu;
};
-static DEFINE_SPINLOCK(instances_lock);
-static atomic_t global_seq;
-
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS];
static unsigned int hash_init;
+static int nfnl_log_net_id __read_mostly;
+
+struct nfnl_log_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+ atomic_t global_seq;
+};
+
+static struct nfnl_log_net *nfnl_log_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_log_net_id);
+}
+
static inline u_int8_t instance_hashfn(u_int16_t group_num)
{
return ((group_num & 0xff) % INSTANCE_BUCKETS);
}
static struct nfulnl_instance *
-__instance_lookup(u_int16_t group_num)
+__instance_lookup(struct nfnl_log_net *log, u_int16_t group_num)
{
struct hlist_head *head;
struct nfulnl_instance *inst;
- head = &instance_table[instance_hashfn(group_num)];
+ head = &log->instance_table[instance_hashfn(group_num)];
hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->group_num == group_num)
return inst;
@@ -104,12 +116,12 @@ instance_get(struct nfulnl_instance *inst)
}
static struct nfulnl_instance *
-instance_lookup_get(u_int16_t group_num)
+instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num)
{
struct nfulnl_instance *inst;
rcu_read_lock_bh();
- inst = __instance_lookup(group_num);
+ inst = __instance_lookup(log, group_num);
if (inst && !atomic_inc_not_zero(&inst->use))
inst = NULL;
rcu_read_unlock_bh();
@@ -119,7 +131,11 @@ instance_lookup_get(u_int16_t group_num)
static void nfulnl_instance_free_rcu(struct rcu_head *head)
{
- kfree(container_of(head, struct nfulnl_instance, rcu));
+ struct nfulnl_instance *inst =
+ container_of(head, struct nfulnl_instance, rcu);
+
+ put_net(inst->net);
+ kfree(inst);
module_put(THIS_MODULE);
}
@@ -133,13 +149,15 @@ instance_put(struct nfulnl_instance *inst)
static void nfulnl_timer(unsigned long data);
static struct nfulnl_instance *
-instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
+instance_create(struct net *net, u_int16_t group_num,
+ int portid, struct user_namespace *user_ns)
{
struct nfulnl_instance *inst;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int err;
- spin_lock_bh(&instances_lock);
- if (__instance_lookup(group_num)) {
+ spin_lock_bh(&log->instances_lock);
+ if (__instance_lookup(log, group_num)) {
err = -EEXIST;
goto out_unlock;
}
@@ -163,6 +181,7 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+ inst->net = get_net(net);
inst->peer_user_ns = user_ns;
inst->peer_portid = portid;
inst->group_num = group_num;
@@ -174,14 +193,15 @@ instance_create(u_int16_t group_num, int portid, struct user_namespace *user_ns)
inst->copy_range = NFULNL_COPY_RANGE_MAX;
hlist_add_head_rcu(&inst->hlist,
- &instance_table[instance_hashfn(group_num)]);
+ &log->instance_table[instance_hashfn(group_num)]);
- spin_unlock_bh(&instances_lock);
+
+ spin_unlock_bh(&log->instances_lock);
return inst;
out_unlock:
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
return ERR_PTR(err);
}
@@ -210,11 +230,12 @@ __instance_destroy(struct nfulnl_instance *inst)
}
static inline void
-instance_destroy(struct nfulnl_instance *inst)
+instance_destroy(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst)
{
- spin_lock_bh(&instances_lock);
+ spin_lock_bh(&log->instances_lock);
__instance_destroy(inst);
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
static int
@@ -298,7 +319,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags)
}
static struct sk_buff *
-nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
+nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size)
{
struct sk_buff *skb;
unsigned int n;
@@ -307,13 +328,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size)
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = alloc_skb(n, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC);
if (!skb) {
if (n > pkt_size) {
/* try to allocate only as much as we need for current
* packet */
- skb = alloc_skb(pkt_size, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(&init_net, pkt_size,
+ peer_portid, GFP_ATOMIC);
if (!skb)
pr_err("nfnetlink_log: can't even alloc %u bytes\n",
pkt_size);
@@ -336,7 +358,7 @@ __nfulnl_send(struct nfulnl_instance *inst)
if (!nlh)
goto out;
}
- status = nfnetlink_unicast(inst->skb, &init_net, inst->peer_portid,
+ status = nfnetlink_unicast(inst->skb, inst->net, inst->peer_portid,
MSG_DONTWAIT);
inst->qlen = 0;
@@ -370,7 +392,8 @@ nfulnl_timer(unsigned long data)
/* This is an inline function, we don't really care about a long
* list of arguments */
static inline int
-__build_packet_message(struct nfulnl_instance *inst,
+__build_packet_message(struct nfnl_log_net *log,
+ struct nfulnl_instance *inst,
const struct sk_buff *skb,
unsigned int data_len,
u_int8_t pf,
@@ -536,7 +559,7 @@ __build_packet_message(struct nfulnl_instance *inst,
/* global sequence number */
if ((inst->flags & NFULNL_CFG_F_SEQ_GLOBAL) &&
nla_put_be32(inst->skb, NFULA_SEQ_GLOBAL,
- htonl(atomic_inc_return(&global_seq))))
+ htonl(atomic_inc_return(&log->global_seq))))
goto nla_put_failure;
if (data_len) {
@@ -579,7 +602,8 @@ static struct nf_loginfo default_loginfo = {
/* log handler for internal netfilter logging api */
void
-nfulnl_log_packet(u_int8_t pf,
+nfulnl_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -592,13 +616,14 @@ nfulnl_log_packet(u_int8_t pf,
const struct nf_loginfo *li;
unsigned int qthreshold;
unsigned int plen;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
li = li_user;
else
li = &default_loginfo;
- inst = instance_lookup_get(li->u.ulog.group);
+ inst = instance_lookup_get(log, li->u.ulog.group);
if (!inst)
return;
@@ -609,7 +634,7 @@ nfulnl_log_packet(u_int8_t pf,
/* FIXME: do we want to make the size calculation conditional based on
* what is actually present? way more branches and checks, but more
* memory efficient... */
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfulnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -673,14 +698,15 @@ nfulnl_log_packet(u_int8_t pf,
}
if (!inst->skb) {
- inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size);
+ inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz,
+ size);
if (!inst->skb)
goto alloc_failure;
}
inst->qlen++;
- __build_packet_message(inst, skb, data_len, pf,
+ __build_packet_message(log, inst, skb, data_len, pf,
hooknum, in, out, prefix, plen);
if (inst->qlen >= qthreshold)
@@ -709,24 +735,24 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_log_net *log = nfnl_log_pernet(n->net);
if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
/* destroy all instances for this portid */
- spin_lock_bh(&instances_lock);
+ spin_lock_bh(&log->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct hlist_node *t2;
struct nfulnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &log->instance_table[i];
hlist_for_each_entry_safe(inst, t2, head, hlist) {
- if ((net_eq(n->net, &init_net)) &&
- (n->portid == inst->peer_portid))
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- spin_unlock_bh(&instances_lock);
+ spin_unlock_bh(&log->instances_lock);
}
return NOTIFY_DONE;
}
@@ -767,6 +793,8 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
u_int16_t group_num = ntohs(nfmsg->res_id);
struct nfulnl_instance *inst;
struct nfulnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
if (nfula[NFULA_CFG_CMD]) {
@@ -776,14 +804,14 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
/* Commands without queue context */
switch (cmd->command) {
case NFULNL_CFG_CMD_PF_BIND:
- return nf_log_bind_pf(pf, &nfulnl_logger);
+ return nf_log_bind_pf(net, pf, &nfulnl_logger);
case NFULNL_CFG_CMD_PF_UNBIND:
- nf_log_unbind_pf(pf);
+ nf_log_unbind_pf(net, pf);
return 0;
}
}
- inst = instance_lookup_get(group_num);
+ inst = instance_lookup_get(log, group_num);
if (inst && inst->peer_portid != NETLINK_CB(skb).portid) {
ret = -EPERM;
goto out_put;
@@ -797,9 +825,9 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
}
- inst = instance_create(group_num,
+ inst = instance_create(net, group_num,
NETLINK_CB(skb).portid,
- sk_user_ns(NETLINK_CB(skb).ssk));
+ sk_user_ns(NETLINK_CB(skb).sk));
if (IS_ERR(inst)) {
ret = PTR_ERR(inst);
goto out;
@@ -811,7 +839,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out;
}
- instance_destroy(inst);
+ instance_destroy(log, inst);
goto out_put;
default:
ret = -ENOTSUPP;
@@ -894,55 +922,68 @@ static const struct nfnetlink_subsystem nfulnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
-static struct hlist_node *get_first(struct iter_state *st)
+static struct hlist_node *get_first(struct net *net, struct iter_state *st)
{
+ struct nfnl_log_net *log;
if (!st)
return NULL;
+ log = nfnl_log_pernet(net);
+
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+ struct hlist_head *head = &log->instance_table[st->bucket];
+
+ if (!hlist_empty(head))
+ return rcu_dereference_bh(hlist_first_rcu(head));
}
return NULL;
}
-static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
+static struct hlist_node *get_next(struct net *net, struct iter_state *st,
+ struct hlist_node *h)
{
h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
+ struct nfnl_log_net *log;
+ struct hlist_head *head;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
+ log = nfnl_log_pernet(net);
+ head = &log->instance_table[st->bucket];
+ h = rcu_dereference_bh(hlist_first_rcu(head));
}
return h;
}
-static struct hlist_node *get_idx(struct iter_state *st, loff_t pos)
+static struct hlist_node *get_idx(struct net *net, struct iter_state *st,
+ loff_t pos)
{
struct hlist_node *head;
- head = get_first(st);
+ head = get_first(net, st);
if (head)
- while (pos && (head = get_next(st, head)))
+ while (pos && (head = get_next(net, st, head)))
pos--;
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
+static void *seq_start(struct seq_file *s, loff_t *pos)
__acquires(rcu_bh)
{
rcu_read_lock_bh();
- return get_idx(seq->private, *pos);
+ return get_idx(seq_file_net(s), s->private, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
(*pos)++;
- return get_next(s->private, v);
+ return get_next(seq_file_net(s), s->private, v);
}
static void seq_stop(struct seq_file *s, void *v)
@@ -971,8 +1012,8 @@ static const struct seq_operations nful_seq_ops = {
static int nful_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &nful_seq_ops,
- sizeof(struct iter_state));
+ return seq_open_net(inode, file, &nful_seq_ops,
+ sizeof(struct iter_state));
}
static const struct file_operations nful_file_ops = {
@@ -980,17 +1021,45 @@ static const struct file_operations nful_file_ops = {
.open = nful_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int __init nfnetlink_log_init(void)
+static int __net_init nfnl_log_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
+ INIT_HLIST_HEAD(&log->instance_table[i]);
+ spin_lock_init(&log->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_log_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nfnl_log_net_ops = {
+ .init = nfnl_log_net_init,
+ .exit = nfnl_log_net_exit,
+ .id = &nfnl_log_net_id,
+ .size = sizeof(struct nfnl_log_net),
+};
+
+static int __init nfnetlink_log_init(void)
+{
+ int status = -ENOMEM;
/* it's not really all that important to have a random value, so
* we can do this from the init function, even if there hasn't
@@ -1000,29 +1069,25 @@ static int __init nfnetlink_log_init(void)
netlink_register_notifier(&nfulnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfulnl_subsys);
if (status < 0) {
- printk(KERN_ERR "log: failed to create netlink socket\n");
+ pr_err("log: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
status = nf_log_register(NFPROTO_UNSPEC, &nfulnl_logger);
if (status < 0) {
- printk(KERN_ERR "log: failed to register logger\n");
+ pr_err("log: failed to register logger\n");
goto cleanup_subsys;
}
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_log", 0440,
- proc_net_netfilter, &nful_file_ops)) {
- status = -ENOMEM;
+ status = register_pernet_subsys(&nfnl_log_net_ops);
+ if (status < 0) {
+ pr_err("log: failed to register pernet ops\n");
goto cleanup_logger;
}
-#endif
return status;
-#ifdef CONFIG_PROC_FS
cleanup_logger:
nf_log_unregister(&nfulnl_logger);
-#endif
cleanup_subsys:
nfnetlink_subsys_unregister(&nfulnl_subsys);
cleanup_netlink_notifier:
@@ -1032,10 +1097,8 @@ cleanup_netlink_notifier:
static void __exit nfnetlink_log_fini(void)
{
+ unregister_pernet_subsys(&nfnl_log_net_ops);
nf_log_unregister(&nfulnl_logger);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_log", proc_net_netfilter);
-#endif
nfnetlink_subsys_unregister(&nfulnl_subsys);
netlink_unregister_notifier(&nfulnl_rtnl_notifier);
}
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 42680b2baa1..4e27fa03581 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -30,6 +30,7 @@
#include <linux/list.h>
#include <net/sock.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netns/generic.h>
#include <net/netfilter/nfnetlink_queue.h>
#include <linux/atomic.h>
@@ -66,23 +67,31 @@ struct nfqnl_instance {
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
-static DEFINE_SPINLOCK(instances_lock);
+static int nfnl_queue_net_id __read_mostly;
#define INSTANCE_BUCKETS 16
-static struct hlist_head instance_table[INSTANCE_BUCKETS] __read_mostly;
+struct nfnl_queue_net {
+ spinlock_t instances_lock;
+ struct hlist_head instance_table[INSTANCE_BUCKETS];
+};
+
+static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
+{
+ return net_generic(net, nfnl_queue_net_id);
+}
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
- return ((queue_num >> 8) | queue_num) % INSTANCE_BUCKETS;
+ return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
}
static struct nfqnl_instance *
-instance_lookup(u_int16_t queue_num)
+instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
struct hlist_head *head;
struct nfqnl_instance *inst;
- head = &instance_table[instance_hashfn(queue_num)];
+ head = &q->instance_table[instance_hashfn(queue_num)];
hlist_for_each_entry_rcu(inst, head, hlist) {
if (inst->queue_num == queue_num)
return inst;
@@ -91,14 +100,15 @@ instance_lookup(u_int16_t queue_num)
}
static struct nfqnl_instance *
-instance_create(u_int16_t queue_num, int portid)
+instance_create(struct nfnl_queue_net *q, u_int16_t queue_num,
+ int portid)
{
struct nfqnl_instance *inst;
unsigned int h;
int err;
- spin_lock(&instances_lock);
- if (instance_lookup(queue_num)) {
+ spin_lock(&q->instances_lock);
+ if (instance_lookup(q, queue_num)) {
err = -EEXIST;
goto out_unlock;
}
@@ -123,16 +133,16 @@ instance_create(u_int16_t queue_num, int portid)
}
h = instance_hashfn(queue_num);
- hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
+ hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]);
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
return inst;
out_free:
kfree(inst);
out_unlock:
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
return ERR_PTR(err);
}
@@ -158,11 +168,11 @@ __instance_destroy(struct nfqnl_instance *inst)
}
static void
-instance_destroy(struct nfqnl_instance *inst)
+instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
{
- spin_lock(&instances_lock);
+ spin_lock(&q->instances_lock);
__instance_destroy(inst);
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
}
static inline void
@@ -217,14 +227,71 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_unlock_bh(&queue->lock);
}
+static void
+nfqnl_zcopy(struct sk_buff *to, const struct sk_buff *from, int len, int hlen)
+{
+ int i, j = 0;
+ int plen = 0; /* length of skb->head fragment */
+ struct page *page;
+ unsigned int offset;
+
+ /* dont bother with small payloads */
+ if (len <= skb_tailroom(to)) {
+ skb_copy_bits(from, 0, skb_put(to, len), len);
+ return;
+ }
+
+ if (hlen) {
+ skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
+ len -= hlen;
+ } else {
+ plen = min_t(int, skb_headlen(from), len);
+ if (plen) {
+ page = virt_to_head_page(from->head);
+ offset = from->data - (unsigned char *)page_address(page);
+ __skb_fill_page_desc(to, 0, page, offset, plen);
+ get_page(page);
+ j = 1;
+ len -= plen;
+ }
+ }
+
+ to->truesize += len + plen;
+ to->len += len + plen;
+ to->data_len += len + plen;
+
+ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ if (!len)
+ break;
+ skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
+ skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
+ len -= skb_shinfo(to)->frags[j].size;
+ skb_frag_ref(to, j);
+ j++;
+ }
+ skb_shinfo(to)->nr_frags = j;
+}
+
+static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet)
+{
+ __u32 flags = 0;
+
+ if (packet->ip_summed == CHECKSUM_PARTIAL)
+ flags = NFQA_SKB_CSUMNOTREADY;
+ if (skb_is_gso(packet))
+ flags |= NFQA_SKB_GSO;
+
+ return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0;
+}
+
static struct sk_buff *
nfqnl_build_packet_message(struct nfqnl_instance *queue,
struct nf_queue_entry *entry,
__be32 **packet_id_ptr)
{
- sk_buff_data_t old_tail;
size_t size;
size_t data_len = 0, cap_len = 0;
+ int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
struct nfqnl_msg_packet_hdr *pmsg;
@@ -236,7 +303,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
struct nf_conn *ct = NULL;
enum ip_conntrack_info uninitialized_var(ctinfo);
- size = NLMSG_SPACE(sizeof(struct nfgenmsg))
+ size = nlmsg_total_size(sizeof(struct nfgenmsg))
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
+ nla_total_size(sizeof(u_int32_t)) /* ifindex */
@@ -246,8 +313,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
#endif
+ nla_total_size(sizeof(u_int32_t)) /* mark */
+ nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
- + nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)
- + nla_total_size(sizeof(u_int32_t))); /* cap_len */
+ + nla_total_size(sizeof(u_int32_t)) /* skbinfo */
+ + nla_total_size(sizeof(u_int32_t)); /* cap_len */
+
+ if (entskb->tstamp.tv64)
+ size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));
outdev = entry->outdev;
@@ -257,7 +327,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
break;
case NFQNL_COPY_PACKET:
- if (entskb->ip_summed == CHECKSUM_PARTIAL &&
+ if (!(queue->flags & NFQA_CFG_F_GSO) &&
+ entskb->ip_summed == CHECKSUM_PARTIAL &&
skb_checksum_help(entskb))
return NULL;
@@ -265,7 +336,16 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
if (data_len == 0 || data_len > entskb->len)
data_len = entskb->len;
- size += nla_total_size(data_len);
+
+ if (!entskb->head_frag ||
+ skb_headlen(entskb) < L1_CACHE_BYTES ||
+ skb_shinfo(entskb)->nr_frags >= MAX_SKB_FRAGS)
+ hlen = skb_headlen(entskb);
+
+ if (skb_has_frag_list(entskb))
+ hlen = entskb->len;
+ hlen = min_t(int, data_len, hlen);
+ size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
break;
}
@@ -273,11 +353,11 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
if (queue->flags & NFQA_CFG_F_CONNTRACK)
ct = nfqnl_ct_get(entskb, &size, &ctinfo);
- skb = alloc_skb(size, GFP_ATOMIC);
+ skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid,
+ GFP_ATOMIC);
if (!skb)
return NULL;
- old_tail = skb->tail;
nlh = nlmsg_put(skb, 0, 0,
NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET,
sizeof(struct nfgenmsg), 0);
@@ -382,31 +462,29 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
goto nla_put_failure;
}
+ if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
+ goto nla_put_failure;
+
+ if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
+ goto nla_put_failure;
+
+ if (nfqnl_put_packet_info(skb, entskb))
+ goto nla_put_failure;
+
if (data_len) {
struct nlattr *nla;
- int sz = nla_attr_size(data_len);
- if (skb_tailroom(skb) < nla_total_size(data_len)) {
- printk(KERN_WARNING "nf_queue: no tailroom!\n");
- kfree_skb(skb);
- return NULL;
- }
+ if (skb_tailroom(skb) < sizeof(*nla) + hlen)
+ goto nla_put_failure;
- nla = (struct nlattr *)skb_put(skb, nla_total_size(data_len));
+ nla = (struct nlattr *)skb_put(skb, sizeof(*nla));
nla->nla_type = NFQA_PAYLOAD;
- nla->nla_len = sz;
+ nla->nla_len = nla_attr_size(data_len);
- if (skb_copy_bits(entskb, 0, nla_data(nla), data_len))
- BUG();
+ nfqnl_zcopy(skb, entskb, data_len, hlen);
}
- if (ct && nfqnl_ct_put(skb, ct, ctinfo) < 0)
- goto nla_put_failure;
-
- if (cap_len > 0 && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
- goto nla_put_failure;
-
- nlh->nlmsg_len = skb->tail - old_tail;
+ nlh->nlmsg_len = skb->len;
return skb;
nla_put_failure:
@@ -416,26 +494,14 @@ nla_put_failure:
}
static int
-nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
+ struct nf_queue_entry *entry)
{
struct sk_buff *nskb;
- struct nfqnl_instance *queue;
int err = -ENOBUFS;
__be32 *packet_id_ptr;
int failopen = 0;
- /* rcu_read_lock()ed by nf_hook_slow() */
- queue = instance_lookup(queuenum);
- if (!queue) {
- err = -ESRCH;
- goto err_out;
- }
-
- if (queue->copy_mode == NFQNL_COPY_NONE) {
- err = -EINVAL;
- goto err_out;
- }
-
nskb = nfqnl_build_packet_message(queue, entry, &packet_id_ptr);
if (nskb == NULL) {
err = -ENOMEM;
@@ -462,7 +528,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
*packet_id_ptr = htonl(entry->id);
/* nfnetlink_unicast will either free the nskb or add it to a socket */
- err = nfnetlink_unicast(nskb, &init_net, queue->peer_portid, MSG_DONTWAIT);
+ err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
if (err < 0) {
queue->queue_user_dropped++;
goto err_out_unlock;
@@ -483,6 +549,141 @@ err_out:
return err;
}
+static struct nf_queue_entry *
+nf_queue_entry_dup(struct nf_queue_entry *e)
+{
+ struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
+ if (entry) {
+ if (nf_queue_entry_get_refs(entry))
+ return entry;
+ kfree(entry);
+ }
+ return NULL;
+}
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+/* When called from bridge netfilter, skb->data must point to MAC header
+ * before calling skb_gso_segment(). Else, original MAC header is lost
+ * and segmented skbs will be sent to wrong destination.
+ */
+static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_push(skb, skb->network_header - skb->mac_header);
+}
+
+static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
+{
+ if (skb->nf_bridge)
+ __skb_pull(skb, skb->network_header - skb->mac_header);
+}
+#else
+#define nf_bridge_adjust_skb_data(s) do {} while (0)
+#define nf_bridge_adjust_segmented_data(s) do {} while (0)
+#endif
+
+static void free_entry(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
+}
+
+static int
+__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
+ struct sk_buff *skb, struct nf_queue_entry *entry)
+{
+ int ret = -ENOMEM;
+ struct nf_queue_entry *entry_seg;
+
+ nf_bridge_adjust_segmented_data(skb);
+
+ if (skb->next == NULL) { /* last packet, no need to copy entry */
+ struct sk_buff *gso_skb = entry->skb;
+ entry->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry);
+ if (ret)
+ entry->skb = gso_skb;
+ return ret;
+ }
+
+ skb->next = NULL;
+
+ entry_seg = nf_queue_entry_dup(entry);
+ if (entry_seg) {
+ entry_seg->skb = skb;
+ ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
+ if (ret)
+ free_entry(entry_seg);
+ }
+ return ret;
+}
+
+static int
+nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
+{
+ unsigned int queued;
+ struct nfqnl_instance *queue;
+ struct sk_buff *skb, *segs;
+ int err = -ENOBUFS;
+ struct net *net = dev_net(entry->indev ?
+ entry->indev : entry->outdev);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ /* rcu_read_lock()ed by nf_hook_slow() */
+ queue = instance_lookup(q, queuenum);
+ if (!queue)
+ return -ESRCH;
+
+ if (queue->copy_mode == NFQNL_COPY_NONE)
+ return -EINVAL;
+
+ if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(entry->skb))
+ return __nfqnl_enqueue_packet(net, queue, entry);
+
+ skb = entry->skb;
+
+ switch (entry->pf) {
+ case NFPROTO_IPV4:
+ skb->protocol = htons(ETH_P_IP);
+ break;
+ case NFPROTO_IPV6:
+ skb->protocol = htons(ETH_P_IPV6);
+ break;
+ }
+
+ nf_bridge_adjust_skb_data(skb);
+ segs = skb_gso_segment(skb, 0);
+ /* Does not use PTR_ERR to limit the number of error codes that can be
+ * returned by nf_queue. For instance, callers rely on -ECANCELED to
+ * mean 'ignore this hook'.
+ */
+ if (IS_ERR(segs))
+ goto out_err;
+ queued = 0;
+ err = 0;
+ do {
+ struct sk_buff *nskb = segs->next;
+ if (err == 0)
+ err = __nfqnl_enqueue_packet_gso(net, queue,
+ segs, entry);
+ if (err == 0)
+ queued++;
+ else
+ kfree_skb(segs);
+ segs = nskb;
+ } while (segs);
+
+ if (queued) {
+ if (err) /* some segments are already queued */
+ free_entry(entry);
+ kfree_skb(skb);
+ return 0;
+ }
+ out_err:
+ nf_bridge_adjust_segmented_data(skb);
+ return err;
+}
+
static int
nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff)
{
@@ -575,15 +776,16 @@ dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
/* drop all packets with either indev or outdev == ifindex from all queue
* instances */
static void
-nfqnl_dev_drop(int ifindex)
+nfqnl_dev_drop(struct net *net, int ifindex)
{
int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
rcu_read_lock();
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &q->instance_table[i];
hlist_for_each_entry_rcu(inst, head, hlist)
nfqnl_flush(inst, dev_cmp, ifindex);
@@ -600,12 +802,9 @@ nfqnl_rcv_dev_event(struct notifier_block *this,
{
struct net_device *dev = ptr;
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
/* Drop any packets associated with the downed device */
if (event == NETDEV_DOWN)
- nfqnl_dev_drop(dev->ifindex);
+ nfqnl_dev_drop(dev_net(dev), dev->ifindex);
return NOTIFY_DONE;
}
@@ -618,24 +817,24 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct netlink_notify *n = ptr;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
int i;
/* destroy all instances for this portid */
- spin_lock(&instances_lock);
+ spin_lock(&q->instances_lock);
for (i = 0; i < INSTANCE_BUCKETS; i++) {
struct hlist_node *t2;
struct nfqnl_instance *inst;
- struct hlist_head *head = &instance_table[i];
+ struct hlist_head *head = &q->instance_table[i];
hlist_for_each_entry_safe(inst, t2, head, hlist) {
- if ((n->net == &init_net) &&
- (n->portid == inst->peer_portid))
+ if (n->portid == inst->peer_portid)
__instance_destroy(inst);
}
}
- spin_unlock(&instances_lock);
+ spin_unlock(&q->instances_lock);
}
return NOTIFY_DONE;
}
@@ -656,11 +855,12 @@ static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
[NFQA_MARK] = { .type = NLA_U32 },
};
-static struct nfqnl_instance *verdict_instance_lookup(u16 queue_num, int nlportid)
+static struct nfqnl_instance *
+verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, int nlportid)
{
struct nfqnl_instance *queue;
- queue = instance_lookup(queue_num);
+ queue = instance_lookup(q, queue_num);
if (!queue)
return ERR_PTR(-ENODEV);
@@ -704,7 +904,11 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
LIST_HEAD(batch_list);
u16 queue_num = ntohs(nfmsg->res_id);
- queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
@@ -752,10 +956,13 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
enum ip_conntrack_info uninitialized_var(ctinfo);
struct nf_conn *ct = NULL;
- queue = instance_lookup(queue_num);
- if (!queue)
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
- queue = verdict_instance_lookup(queue_num, NETLINK_CB(skb).portid);
+ queue = instance_lookup(q, queue_num);
+ if (!queue)
+ queue = verdict_instance_lookup(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue))
return PTR_ERR(queue);
@@ -819,6 +1026,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
u_int16_t queue_num = ntohs(nfmsg->res_id);
struct nfqnl_instance *queue;
struct nfqnl_msg_config_cmd *cmd = NULL;
+ struct net *net = sock_net(ctnl);
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
int ret = 0;
if (nfqa[NFQA_CFG_CMD]) {
@@ -832,7 +1041,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
}
rcu_read_lock();
- queue = instance_lookup(queue_num);
+ queue = instance_lookup(q, queue_num);
if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
ret = -EPERM;
goto err_out_unlock;
@@ -845,7 +1054,8 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -EBUSY;
goto err_out_unlock;
}
- queue = instance_create(queue_num, NETLINK_CB(skb).portid);
+ queue = instance_create(q, queue_num,
+ NETLINK_CB(skb).portid);
if (IS_ERR(queue)) {
ret = PTR_ERR(queue);
goto err_out_unlock;
@@ -856,7 +1066,7 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
ret = -ENODEV;
goto err_out_unlock;
}
- instance_destroy(queue);
+ instance_destroy(q, queue);
break;
case NFQNL_CFG_CMD_PF_BIND:
case NFQNL_CFG_CMD_PF_UNBIND:
@@ -950,19 +1160,24 @@ static const struct nfnetlink_subsystem nfqnl_subsys = {
#ifdef CONFIG_PROC_FS
struct iter_state {
+ struct seq_net_private p;
unsigned int bucket;
};
static struct hlist_node *get_first(struct seq_file *seq)
{
struct iter_state *st = seq->private;
+ struct net *net;
+ struct nfnl_queue_net *q;
if (!st)
return NULL;
+ net = seq_file_net(seq);
+ q = nfnl_queue_pernet(net);
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
- if (!hlist_empty(&instance_table[st->bucket]))
- return instance_table[st->bucket].first;
+ if (!hlist_empty(&q->instance_table[st->bucket]))
+ return q->instance_table[st->bucket].first;
}
return NULL;
}
@@ -970,13 +1185,17 @@ static struct hlist_node *get_first(struct seq_file *seq)
static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
{
struct iter_state *st = seq->private;
+ struct net *net = seq_file_net(seq);
h = h->next;
while (!h) {
+ struct nfnl_queue_net *q;
+
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = instance_table[st->bucket].first;
+ q = nfnl_queue_pernet(net);
+ h = q->instance_table[st->bucket].first;
}
return h;
}
@@ -992,11 +1211,11 @@ static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
return pos ? NULL : head;
}
-static void *seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(instances_lock)
+static void *seq_start(struct seq_file *s, loff_t *pos)
+ __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
- spin_lock(&instances_lock);
- return get_idx(seq, *pos);
+ spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
+ return get_idx(s, *pos);
}
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
@@ -1006,9 +1225,9 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
}
static void seq_stop(struct seq_file *s, void *v)
- __releases(instances_lock)
+ __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
- spin_unlock(&instances_lock);
+ spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
}
static int seq_show(struct seq_file *s, void *v)
@@ -1032,7 +1251,7 @@ static const struct seq_operations nfqnl_seq_ops = {
static int nfqnl_open(struct inode *inode, struct file *file)
{
- return seq_open_private(file, &nfqnl_seq_ops,
+ return seq_open_net(inode, file, &nfqnl_seq_ops,
sizeof(struct iter_state));
}
@@ -1041,41 +1260,65 @@ static const struct file_operations nfqnl_file_ops = {
.open = nfqnl_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release_private,
+ .release = seq_release_net,
};
#endif /* PROC_FS */
-static int __init nfnetlink_queue_init(void)
+static int __net_init nfnl_queue_net_init(struct net *net)
{
- int i, status = -ENOMEM;
+ unsigned int i;
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
for (i = 0; i < INSTANCE_BUCKETS; i++)
- INIT_HLIST_HEAD(&instance_table[i]);
+ INIT_HLIST_HEAD(&q->instance_table[i]);
+
+ spin_lock_init(&q->instances_lock);
+
+#ifdef CONFIG_PROC_FS
+ if (!proc_create("nfnetlink_queue", 0440,
+ net->nf.proc_netfilter, &nfqnl_file_ops))
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
+static void __net_exit nfnl_queue_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+ remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
+#endif
+}
+
+static struct pernet_operations nfnl_queue_net_ops = {
+ .init = nfnl_queue_net_init,
+ .exit = nfnl_queue_net_exit,
+ .id = &nfnl_queue_net_id,
+ .size = sizeof(struct nfnl_queue_net),
+};
+
+static int __init nfnetlink_queue_init(void)
+{
+ int status = -ENOMEM;
netlink_register_notifier(&nfqnl_rtnl_notifier);
status = nfnetlink_subsys_register(&nfqnl_subsys);
if (status < 0) {
- printk(KERN_ERR "nf_queue: failed to create netlink socket\n");
+ pr_err("nf_queue: failed to create netlink socket\n");
goto cleanup_netlink_notifier;
}
-#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_queue", 0440,
- proc_net_netfilter, &nfqnl_file_ops)) {
- status = -ENOMEM;
+ status = register_pernet_subsys(&nfnl_queue_net_ops);
+ if (status < 0) {
+ pr_err("nf_queue: failed to register pernet ops\n");
goto cleanup_subsys;
}
-#endif
-
register_netdevice_notifier(&nfqnl_dev_notifier);
nf_register_queue_handler(&nfqh);
return status;
-#ifdef CONFIG_PROC_FS
cleanup_subsys:
nfnetlink_subsys_unregister(&nfqnl_subsys);
-#endif
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
return status;
@@ -1085,9 +1328,7 @@ static void __exit nfnetlink_queue_fini(void)
{
nf_unregister_queue_handler();
unregister_netdevice_notifier(&nfqnl_dev_notifier);
-#ifdef CONFIG_PROC_FS
- remove_proc_entry("nfnetlink_queue", proc_net_netfilter);
-#endif
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
nfnetlink_subsys_unregister(&nfqnl_subsys);
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 686c7715d77..8b03028cca6 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -2,6 +2,7 @@
* x_tables core - Backend for {ip,ip6,arp}_tables
*
* Copyright (C) 2006-2006 Harald Welte <laforge@netfilter.org>
+ * Copyright (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* Based on existing ip_tables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
@@ -999,7 +1000,7 @@ static int xt_table_open(struct inode *inode, struct file *file)
sizeof(struct xt_names_priv));
if (!ret) {
priv = ((struct seq_file *)file->private_data)->private;
- priv->af = (unsigned long)PDE(inode)->data;
+ priv->af = (unsigned long)PDE_DATA(inode);
}
return ret;
}
@@ -1147,7 +1148,7 @@ static int xt_match_open(struct inode *inode, struct file *file)
seq = file->private_data;
seq->private = trav;
- trav->nfproto = (unsigned long)PDE(inode)->data;
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
return 0;
}
@@ -1211,7 +1212,7 @@ static int xt_target_open(struct inode *inode, struct file *file)
seq = file->private_data;
seq->private = trav;
- trav->nfproto = (unsigned long)PDE(inode)->data;
+ trav->nfproto = (unsigned long)PDE_DATA(inode);
return 0;
}
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index fa40096940a..5ab24843370 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -466,7 +466,8 @@ log_packet_common(struct sbuff *m,
static void
-ipt_log_packet(u_int8_t pf,
+ipt_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -474,7 +475,13 @@ ipt_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- struct sbuff *m = sb_open();
+ struct sbuff *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
if (!loginfo)
loginfo = &default_loginfo;
@@ -730,7 +737,7 @@ static void dump_ipv6_packet(struct sbuff *m,
dump_sk_uid_gid(m, skb->sk);
/* Max length: 16 "MARK=0xFFFFFFFF " */
- if (!recurse && skb->mark)
+ if (recurse && skb->mark)
sb_add(m, "MARK=0x%x ", skb->mark);
}
@@ -790,7 +797,8 @@ fallback:
}
static void
-ip6t_log_packet(u_int8_t pf,
+ip6t_log_packet(struct net *net,
+ u_int8_t pf,
unsigned int hooknum,
const struct sk_buff *skb,
const struct net_device *in,
@@ -798,7 +806,13 @@ ip6t_log_packet(u_int8_t pf,
const struct nf_loginfo *loginfo,
const char *prefix)
{
- struct sbuff *m = sb_open();
+ struct sbuff *m;
+
+ /* FIXME: Disabled from containers until syslog ns is supported */
+ if (!net_eq(net, &init_net))
+ return;
+
+ m = sb_open();
if (!loginfo)
loginfo = &default_loginfo;
@@ -819,17 +833,18 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_log_info *loginfo = par->targinfo;
struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
li.type = NF_LOG_TYPE_LOG;
li.u.log.level = loginfo->level;
li.u.log.logflags = loginfo->logflags;
if (par->family == NFPROTO_IPV4)
- ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in,
+ ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
par->out, &li, loginfo->prefix);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
else if (par->family == NFPROTO_IPV6)
- ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in,
+ ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
par->out, &li, loginfo->prefix);
#endif
else
@@ -893,23 +908,55 @@ static struct nf_logger ip6t_log_logger __read_mostly = {
};
#endif
+static int __net_init log_net_init(struct net *net)
+{
+ nf_log_set(net, NFPROTO_IPV4, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_set(net, NFPROTO_IPV6, &ip6t_log_logger);
+#endif
+ return 0;
+}
+
+static void __net_exit log_net_exit(struct net *net)
+{
+ nf_log_unset(net, &ipt_log_logger);
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
+ nf_log_unset(net, &ip6t_log_logger);
+#endif
+}
+
+static struct pernet_operations log_net_ops = {
+ .init = log_net_init,
+ .exit = log_net_exit,
+};
+
static int __init log_tg_init(void)
{
int ret;
+ ret = register_pernet_subsys(&log_net_ops);
+ if (ret < 0)
+ goto err_pernet;
+
ret = xt_register_targets(log_tg_regs, ARRAY_SIZE(log_tg_regs));
if (ret < 0)
- return ret;
+ goto err_target;
nf_log_register(NFPROTO_IPV4, &ipt_log_logger);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
nf_log_register(NFPROTO_IPV6, &ip6t_log_logger);
#endif
return 0;
+
+err_target:
+ unregister_pernet_subsys(&log_net_ops);
+err_pernet:
+ return ret;
}
static void __exit log_tg_exit(void)
{
+ unregister_pernet_subsys(&log_net_ops);
nf_log_unregister(&ipt_log_logger);
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
nf_log_unregister(&ip6t_log_logger);
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a17dd0f589b..fb7497c928a 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_nflog_info *info = par->targinfo;
struct nf_loginfo li;
+ struct net *net = dev_net(par->in ? par->in : par->out);
li.type = NF_LOG_TYPE_ULOG;
li.u.ulog.copy_len = info->len;
li.u.ulog.group = info->group;
li.u.ulog.qthreshold = info->threshold;
- nfulnl_log_packet(par->family, par->hooknum, skb, par->in,
+ nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
par->out, &li, info->prefix);
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 817f9e9f2b1..1e2fae32f81 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -76,22 +76,31 @@ static u32 hash_v6(const struct sk_buff *skb)
}
#endif
-static unsigned int
-nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+static u32
+nfqueue_hash(const struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_NFQ_info_v1 *info = par->targinfo;
u32 queue = info->queuenum;
- if (info->queues_total > 1) {
- if (par->family == NFPROTO_IPV4)
- queue = (((u64) hash_v4(skb) * info->queues_total) >>
- 32) + queue;
+ if (par->family == NFPROTO_IPV4)
+ queue += ((u64) hash_v4(skb) * info->queues_total) >> 32;
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
- else if (par->family == NFPROTO_IPV6)
- queue = (((u64) hash_v6(skb) * info->queues_total) >>
- 32) + queue;
+ else if (par->family == NFPROTO_IPV6)
+ queue += ((u64) hash_v6(skb) * info->queues_total) >> 32;
#endif
- }
+
+ return queue;
+}
+
+static unsigned int
+nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+ const struct xt_NFQ_info_v1 *info = par->targinfo;
+ u32 queue = info->queuenum;
+
+ if (info->queues_total > 1)
+ queue = nfqueue_hash(skb, par);
+
return NF_QUEUE_NR(queue);
}
@@ -108,7 +117,7 @@ nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
static int nfqueue_tg_check(const struct xt_tgchk_param *par)
{
- const struct xt_NFQ_info_v2 *info = par->targinfo;
+ const struct xt_NFQ_info_v3 *info = par->targinfo;
u32 maxid;
if (unlikely(!rnd_inited)) {
@@ -125,11 +134,32 @@ static int nfqueue_tg_check(const struct xt_tgchk_param *par)
info->queues_total, maxid);
return -ERANGE;
}
- if (par->target->revision == 2 && info->bypass > 1)
+ if (par->target->revision == 2 && info->flags > 1)
return -EINVAL;
+ if (par->target->revision == 3 && info->flags & ~NFQ_FLAG_MASK)
+ return -EINVAL;
+
return 0;
}
+/* NFQUEUE revision 3 target.
+ *
+ * With queues_total <= 1 the verdict is NF_QUEUE_NR(queuenum).  Otherwise the
+ * queue is queuenum plus (current CPU id modulo queues_total) when
+ * NFQ_FLAG_CPU_FANOUT is set in info->flags, or the value returned by
+ * nfqueue_hash() when it is not.
+ */
+static unsigned int
+nfqueue_tg_v3(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_NFQ_info_v3 *info = par->targinfo;
+	u32 queue = info->queuenum;
+	int cpu;
+
+	/* A single queue leaves nothing to balance. */
+	if (info->queues_total <= 1)
+		return NF_QUEUE_NR(queue);
+
+	if (info->flags & NFQ_FLAG_CPU_FANOUT) {
+		cpu = smp_processor_id();
+		queue = info->queuenum + cpu % info->queues_total;
+	} else {
+		queue = nfqueue_hash(skb, par);
+	}
+
+	return NF_QUEUE_NR(queue);
+}
+
static struct xt_target nfqueue_tg_reg[] __read_mostly = {
{
.name = "NFQUEUE",
@@ -156,6 +186,15 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
.targetsize = sizeof(struct xt_NFQ_info_v2),
.me = THIS_MODULE,
},
+ {
+ .name = "NFQUEUE",
+ .revision = 3,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = nfqueue_tg_check,
+ .target = nfqueue_tg_v3,
+ .targetsize = sizeof(struct xt_NFQ_info_v3),
+ .me = THIS_MODULE,
+ },
};
static int __init nfqueue_tg_init(void)
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index 71a266de5fb..a75240f0d42 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -2,6 +2,7 @@
* This is a module which is used for setting the MSS option in TCP packets.
*
* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ * Copyright (C) 2007 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 25fd1c4e1ee..1eb1a44bfd3 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -30,17 +30,28 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
static unsigned int
tcpoptstrip_mangle_packet(struct sk_buff *skb,
- const struct xt_tcpoptstrip_target_info *info,
+ const struct xt_action_param *par,
unsigned int tcphoff, unsigned int minlen)
{
+ const struct xt_tcpoptstrip_target_info *info = par->targinfo;
unsigned int optl, i, j;
struct tcphdr *tcph;
u_int16_t n, o;
u_int8_t *opt;
+ int len;
+
+ /* This is a fragment, no TCP header is available */
+ if (par->fragoff != 0)
+ return XT_CONTINUE;
if (!skb_make_writable(skb, skb->len))
return NF_DROP;
+ len = skb->len - tcphoff;
+ if (len < (int)sizeof(struct tcphdr) ||
+ tcp_hdr(skb)->doff * 4 > len)
+ return NF_DROP;
+
tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
opt = (u_int8_t *)tcph;
@@ -76,7 +87,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
static unsigned int
tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
- return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb),
+ return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
sizeof(struct iphdr) + sizeof(struct tcphdr));
}
@@ -94,7 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
if (tcphoff < 0)
return NF_DROP;
- return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff,
+ return tcpoptstrip_mangle_packet(skb, par, tcphoff,
sizeof(*ipv6h) + sizeof(struct tcphdr));
}
#endif
diff --git a/net/netfilter/xt_addrtype.c b/net/netfilter/xt_addrtype.c
index 49c5ff7f6dd..68ff29f6086 100644
--- a/net/netfilter/xt_addrtype.c
+++ b/net/netfilter/xt_addrtype.c
@@ -22,6 +22,7 @@
#include <net/ip6_fib.h>
#endif
+#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/xt_addrtype.h>
#include <linux/netfilter/x_tables.h>
@@ -33,12 +34,12 @@ MODULE_ALIAS("ip6t_addrtype");
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
- const struct in6_addr *addr)
+ const struct in6_addr *addr, u16 mask)
{
const struct nf_afinfo *afinfo;
struct flowi6 flow;
struct rt6_info *rt;
- u32 ret;
+ u32 ret = 0;
int route_err;
memset(&flow, 0, sizeof(flow));
@@ -49,12 +50,19 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
rcu_read_lock();
afinfo = nf_get_afinfo(NFPROTO_IPV6);
- if (afinfo != NULL)
+ if (afinfo != NULL) {
+ const struct nf_ipv6_ops *v6ops;
+
+ if (dev && (mask & XT_ADDRTYPE_LOCAL)) {
+ v6ops = nf_get_ipv6_ops();
+ if (v6ops && v6ops->chk_addr(net, addr, dev, true))
+ ret = XT_ADDRTYPE_LOCAL;
+ }
route_err = afinfo->route(net, (struct dst_entry **)&rt,
- flowi6_to_flowi(&flow), !!dev);
- else
+ flowi6_to_flowi(&flow), false);
+ } else {
route_err = 1;
-
+ }
rcu_read_unlock();
if (route_err)
@@ -62,15 +70,12 @@ static u32 match_lookup_rt6(struct net *net, const struct net_device *dev,
if (rt->rt6i_flags & RTF_REJECT)
ret = XT_ADDRTYPE_UNREACHABLE;
- else
- ret = 0;
- if (rt->rt6i_flags & RTF_LOCAL)
+ if (dev == NULL && rt->rt6i_flags & RTF_LOCAL)
ret |= XT_ADDRTYPE_LOCAL;
if (rt->rt6i_flags & RTF_ANYCAST)
ret |= XT_ADDRTYPE_ANYCAST;
-
dst_release(&rt->dst);
return ret;
}
@@ -90,7 +95,7 @@ static bool match_type6(struct net *net, const struct net_device *dev,
if ((XT_ADDRTYPE_LOCAL | XT_ADDRTYPE_ANYCAST |
XT_ADDRTYPE_UNREACHABLE) & mask)
- return !!(mask & match_lookup_rt6(net, dev, addr));
+ return !!(mask & match_lookup_rt6(net, dev, addr, mask));
return true;
}
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 61805d7b38a..188404b9b00 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -3,6 +3,7 @@
* information. (Superset of Rusty's minimalistic state match.)
*
* (C) 2001 Marc Boucher (marc@mbsi.ca).
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
* This program is free software; you can redistribute it and/or modify
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index f330e8beaf6..9ff035c7140 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -3,6 +3,7 @@
* separately for each hashbucket (sourceip/sourceport/dstip/dstport)
*
* (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
* Copyright © CC Computer Consultants GmbH, 2007 - 2008
*
* Development of this code was funded by Astaro AG, http://www.astaro.com/
@@ -107,6 +108,7 @@ struct xt_hashlimit_htable {
/* seq_file stuff */
struct proc_dir_entry *pde;
+ const char *name;
struct net *net;
struct hlist_head hash[0]; /* hashtable itself */
@@ -253,6 +255,11 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hinfo->count = 0;
hinfo->family = family;
hinfo->rnd_initialized = false;
+ hinfo->name = kstrdup(minfo->name, GFP_KERNEL);
+ if (!hinfo->name) {
+ vfree(hinfo);
+ return -ENOMEM;
+ }
spin_lock_init(&hinfo->lock);
hinfo->pde = proc_create_data(minfo->name, 0,
@@ -260,6 +267,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
&dl_file_ops, hinfo);
if (hinfo->pde == NULL) {
+ kfree(hinfo->name);
vfree(hinfo);
return -ENOMEM;
}
@@ -330,9 +338,10 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo)
parent = hashlimit_net->ip6t_hashlimit;
if(parent != NULL)
- remove_proc_entry(hinfo->pde->name, parent);
+ remove_proc_entry(hinfo->name, parent);
htable_selective_cleanup(hinfo, select_all);
+ kfree(hinfo->name);
vfree(hinfo);
}
@@ -344,7 +353,7 @@ static struct xt_hashlimit_htable *htable_find_get(struct net *net,
struct xt_hashlimit_htable *hinfo;
hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
- if (!strcmp(name, hinfo->pde->name) &&
+ if (!strcmp(name, hinfo->name) &&
hinfo->family == family) {
hinfo->use++;
return hinfo;
@@ -841,7 +850,7 @@ static int dl_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- sf->private = PDE(inode)->data;
+ sf->private = PDE_DATA(inode);
}
return ret;
}
@@ -887,7 +896,7 @@ static void __net_exit hashlimit_proc_net_exit(struct net *net)
pde = hashlimit_net->ip6t_hashlimit;
hlist_for_each_entry(hinfo, &hashlimit_net->htables, node)
- remove_proc_entry(hinfo->pde->name, pde);
+ remove_proc_entry(hinfo->name, pde);
hashlimit_net->ipt_hashlimit = NULL;
hashlimit_net->ip6t_hashlimit = NULL;
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index a4c1e4528ca..bef85059655 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -1,5 +1,6 @@
/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr>
* (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index a5e673d32bd..647d989a01e 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -201,6 +201,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
unsigned char opts[MAX_IPOPTLEN];
const struct xt_osf_finger *kf;
const struct xt_osf_user_finger *f;
+ struct net *net = dev_net(p->in ? p->in : p->out);
if (!info)
return false;
@@ -325,7 +326,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
fcount++;
if (info->flags & XT_OSF_LOG)
- nf_log_packet(p->family, p->hooknum, skb,
+ nf_log_packet(net, p->family, p->hooknum, skb,
p->in, p->out, NULL,
"%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
f->genre, f->version, f->subtype,
@@ -341,7 +342,8 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
rcu_read_unlock();
if (!fcount && (info->flags & XT_OSF_LOG))
- nf_log_packet(p->family, p->hooknum, skb, p->in, p->out, NULL,
+ nf_log_packet(net, p->family, p->hooknum, skb, p->in,
+ p->out, NULL,
"Remote OS is not known: %pI4:%u -> %pI4:%u\n",
&ip->saddr, ntohs(tcp->source),
&ip->daddr, ntohs(tcp->dest));
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index d9cad315229..1e657cf715c 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -401,8 +401,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
ret = -ENOMEM;
goto out;
}
- pde->uid = uid;
- pde->gid = gid;
+ proc_set_user(pde, uid, gid);
#endif
spin_lock_bh(&recent_lock);
list_add_tail(&t->list, &recent_net->tables);
@@ -525,14 +524,13 @@ static const struct seq_operations recent_seq_ops = {
static int recent_seq_open(struct inode *inode, struct file *file)
{
- struct proc_dir_entry *pde = PDE(inode);
struct recent_iter_state *st;
st = __seq_open_private(file, &recent_seq_ops, sizeof(*st));
if (st == NULL)
return -ENOMEM;
- st->table = pde->data;
+ st->table = PDE_DATA(inode);
return 0;
}
@@ -540,8 +538,7 @@ static ssize_t
recent_mt_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *loff)
{
- const struct proc_dir_entry *pde = PDE(file_inode(file));
- struct recent_table *t = pde->data;
+ struct recent_table *t = PDE_DATA(file_inode(file));
struct recent_entry *e;
char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
const char *c = buf;
diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 865a9e54f3a..31790e789e2 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -1,7 +1,7 @@
/* Copyright (C) 2000-2002 Joakim Axelsson <gozem@linux.nu>
* Patrick Schaaf <bof@bof.de>
* Martin Josefsson <gandalf@wlug.westbo.se>
- * Copyright (C) 2003-2011 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -30,7 +30,7 @@ MODULE_ALIAS("ip6t_SET");
static inline int
match_set(ip_set_id_t index, const struct sk_buff *skb,
const struct xt_action_param *par,
- const struct ip_set_adt_opt *opt, int inv)
+ struct ip_set_adt_opt *opt, int inv)
{
if (ip_set_test(index, skb, par, opt))
inv = !inv;
@@ -38,20 +38,12 @@ match_set(ip_set_id_t index, const struct sk_buff *skb,
}
#define ADT_OPT(n, f, d, fs, cfs, t) \
-const struct ip_set_adt_opt n = { \
- .family = f, \
- .dim = d, \
- .flags = fs, \
- .cmdflags = cfs, \
- .timeout = t, \
-}
-#define ADT_MOPT(n, f, d, fs, cfs, t) \
struct ip_set_adt_opt n = { \
.family = f, \
.dim = d, \
.flags = fs, \
.cmdflags = cfs, \
- .timeout = t, \
+ .ext.timeout = t, \
}
/* Revision 0 interface: backward compatible with netfilter/iptables */
@@ -197,6 +189,9 @@ set_match_v1(const struct sk_buff *skb, struct xt_action_param *par)
ADT_OPT(opt, par->family, info->match_set.dim,
info->match_set.flags, 0, UINT_MAX);
+ if (opt.flags & IPSET_RETURN_NOMATCH)
+ opt.cmdflags |= IPSET_FLAG_RETURN_NOMATCH;
+
return match_set(info->match_set.index, skb, par, &opt,
info->match_set.flags & IPSET_INV_MATCH);
}
@@ -305,15 +300,15 @@ static unsigned int
set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_set_info_target_v2 *info = par->targinfo;
- ADT_MOPT(add_opt, par->family, info->add_set.dim,
- info->add_set.flags, info->flags, info->timeout);
+ ADT_OPT(add_opt, par->family, info->add_set.dim,
+ info->add_set.flags, info->flags, info->timeout);
ADT_OPT(del_opt, par->family, info->del_set.dim,
info->del_set.flags, 0, UINT_MAX);
/* Normalize to fit into jiffies */
- if (add_opt.timeout != IPSET_NO_TIMEOUT &&
- add_opt.timeout > UINT_MAX/MSEC_PER_SEC)
- add_opt.timeout = UINT_MAX/MSEC_PER_SEC;
+ if (add_opt.ext.timeout != IPSET_NO_TIMEOUT &&
+ add_opt.ext.timeout > UINT_MAX/MSEC_PER_SEC)
+ add_opt.ext.timeout = UINT_MAX/MSEC_PER_SEC;
if (info->add_set.index != IPSET_INVALID_ID)
ip_set_add(info->add_set.index, skb, par, &add_opt);
if (info->del_set.index != IPSET_INVALID_ID)
@@ -325,6 +320,52 @@ set_target_v2(struct sk_buff *skb, const struct xt_action_param *par)
#define set_target_v2_checkentry set_target_v1_checkentry
#define set_target_v2_destroy set_target_v1_destroy
+/* Revision 3 match */
+
+/* Compare @counter with info->value using the operator selected by info->op.
+ *
+ * IPSET_COUNTER_NONE always matches; EQ, NE, LT and GT are the plain
+ * comparisons; any other op value never matches.
+ */
+static bool
+match_counter(u64 counter, const struct ip_set_counter_match *info)
+{
+	if (info->op == IPSET_COUNTER_NONE)
+		return true;
+	if (info->op == IPSET_COUNTER_EQ)
+		return counter == info->value;
+	if (info->op == IPSET_COUNTER_NE)
+		return counter != info->value;
+	if (info->op == IPSET_COUNTER_LT)
+		return counter < info->value;
+	if (info->op == IPSET_COUNTER_GT)
+		return counter > info->value;
+
+	return false;
+}
+
+/* Revision 3 "set" match.
+ *
+ * Runs match_set() and returns its result unchanged unless that result is
+ * non-zero and IPSET_FLAG_MATCH_COUNTERS is set in opt.cmdflags; in that
+ * case both the packet and the byte counter tests must pass as well.
+ */
+static bool
+set_match_v3(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_set_info_match_v3 *info = par->matchinfo;
+	ADT_OPT(opt, par->family, info->match_set.dim,
+		info->match_set.flags, info->flags, UINT_MAX);
+	int ret;
+
+	/* Request counter matching as soon as either comparison is in use. */
+	if (!(info->packets.op == IPSET_COUNTER_NONE &&
+	      info->bytes.op == IPSET_COUNTER_NONE))
+		opt.cmdflags |= IPSET_FLAG_MATCH_COUNTERS;
+
+	ret = match_set(info->match_set.index, skb, par, &opt,
+			info->match_set.flags & IPSET_INV_MATCH);
+
+	if (!ret || !(opt.cmdflags & IPSET_FLAG_MATCH_COUNTERS))
+		return ret;
+
+	/* opt.ext is read only after match_set() has run on &opt. */
+	return match_counter(opt.ext.packets, &info->packets) &&
+	       match_counter(opt.ext.bytes, &info->bytes);
+}
+
+#define set_match_v3_checkentry set_match_v1_checkentry
+#define set_match_v3_destroy set_match_v1_destroy
+
static struct xt_match set_matches[] __read_mostly = {
{
.name = "set",
@@ -377,6 +418,27 @@ static struct xt_match set_matches[] __read_mostly = {
.destroy = set_match_v1_destroy,
.me = THIS_MODULE
},
+ /* counters support: update, match */
+ {
+ .name = "set",
+ .family = NFPROTO_IPV4,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
+ {
+ .name = "set",
+ .family = NFPROTO_IPV6,
+ .revision = 3,
+ .match = set_match_v3,
+ .matchsize = sizeof(struct xt_set_info_match_v3),
+ .checkentry = set_match_v3_checkentry,
+ .destroy = set_match_v3_destroy,
+ .me = THIS_MODULE
+ },
};
static struct xt_target set_targets[] __read_mostly = {