aboutsummaryrefslogtreecommitdiff
path: root/Documentation/networking/packet_mmap.txt
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/networking/packet_mmap.txt')
-rw-r--r--Documentation/networking/packet_mmap.txt368
1 files changed, 362 insertions, 6 deletions
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 94444b152fb..23dd80e82b8 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -685,14 +685,342 @@ int main(int argc, char **argp)
}
-------------------------------------------------------------------------------
++ AF_PACKET TPACKET_V3 example
+-------------------------------------------------------------------------------
+
+AF_PACKET's TPACKET_V3 ring buffer can be configured to use non-static frame
+sizes by doing it's own memory management. It is based on blocks where polling
+works on a per block basis instead of per ring as in TPACKET_V2 and predecessor.
+
+It is said that TPACKET_V3 brings the following benefits:
+ *) ~15 - 20% reduction in CPU-usage
+ *) ~20% increase in packet capture rate
+ *) ~2x increase in packet density
+ *) Port aggregation analysis
+ *) Non static frame size to capture entire packet payload
+
+So it seems to be a good candidate to be used with packet fanout.
+
+Minimal example code by Daniel Borkmann based on Chetan Loke's lolpcap (compile
+it with gcc -Wall -O2 blob.c, and try things like "./a.out eth0", etc.):
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <net/if.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <poll.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+
+#define BLOCK_SIZE (1 << 22)
+#define FRAME_SIZE 2048
+
+#define NUM_BLOCKS 64
+#define NUM_FRAMES ((BLOCK_SIZE * NUM_BLOCKS) / FRAME_SIZE)
+
+#define BLOCK_RETIRE_TOV_IN_MS 64
+#define BLOCK_PRIV_AREA_SZ 13
+
+#define ALIGN_8(x) (((x) + 8 - 1) & ~(8 - 1))
+
+#define BLOCK_STATUS(x) ((x)->h1.block_status)
+#define BLOCK_NUM_PKTS(x) ((x)->h1.num_pkts)
+#define BLOCK_O2FP(x) ((x)->h1.offset_to_first_pkt)
+#define BLOCK_LEN(x) ((x)->h1.blk_len)
+#define BLOCK_SNUM(x) ((x)->h1.seq_num)
+#define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
+#define BLOCK_PRIV(x) ((void *) ((uint8_t *) (x) + BLOCK_O2PRIV(x)))
+#define BLOCK_HDR_LEN (ALIGN_8(sizeof(struct block_desc)))
+#define BLOCK_PLUS_PRIV(sz_pri) (BLOCK_HDR_LEN + ALIGN_8((sz_pri)))
+
+#ifndef likely
+# define likely(x) __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+struct block_desc {
+ uint32_t version;
+ uint32_t offset_to_priv;
+ struct tpacket_hdr_v1 h1;
+};
+
+struct ring {
+ struct iovec *rd;
+ uint8_t *map;
+ struct tpacket_req3 req;
+};
+
+static unsigned long packets_total = 0, bytes_total = 0;
+static sig_atomic_t sigint = 0;
+
+void sighandler(int num)
+{
+ sigint = 1;
+}
+
+static int setup_socket(struct ring *ring, char *netdev)
+{
+ int err, i, fd, v = TPACKET_V3;
+ struct sockaddr_ll ll;
+
+ fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (fd < 0) {
+ perror("socket");
+ exit(1);
+ }
+
+ err = setsockopt(fd, SOL_PACKET, PACKET_VERSION, &v, sizeof(v));
+ if (err < 0) {
+ perror("setsockopt");
+ exit(1);
+ }
+
+ memset(&ring->req, 0, sizeof(ring->req));
+ ring->req.tp_block_size = BLOCK_SIZE;
+ ring->req.tp_frame_size = FRAME_SIZE;
+ ring->req.tp_block_nr = NUM_BLOCKS;
+ ring->req.tp_frame_nr = NUM_FRAMES;
+ ring->req.tp_retire_blk_tov = BLOCK_RETIRE_TOV_IN_MS;
+ ring->req.tp_sizeof_priv = BLOCK_PRIV_AREA_SZ;
+ ring->req.tp_feature_req_word |= TP_FT_REQ_FILL_RXHASH;
+
+ err = setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &ring->req,
+ sizeof(ring->req));
+ if (err < 0) {
+ perror("setsockopt");
+ exit(1);
+ }
+
+ ring->map = mmap(NULL, ring->req.tp_block_size * ring->req.tp_block_nr,
+ PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED,
+ fd, 0);
+ if (ring->map == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ ring->rd = malloc(ring->req.tp_block_nr * sizeof(*ring->rd));
+ assert(ring->rd);
+ for (i = 0; i < ring->req.tp_block_nr; ++i) {
+ ring->rd[i].iov_base = ring->map + (i * ring->req.tp_block_size);
+ ring->rd[i].iov_len = ring->req.tp_block_size;
+ }
+
+ memset(&ll, 0, sizeof(ll));
+ ll.sll_family = PF_PACKET;
+ ll.sll_protocol = htons(ETH_P_ALL);
+ ll.sll_ifindex = if_nametoindex(netdev);
+ ll.sll_hatype = 0;
+ ll.sll_pkttype = 0;
+ ll.sll_halen = 0;
+
+ err = bind(fd, (struct sockaddr *) &ll, sizeof(ll));
+ if (err < 0) {
+ perror("bind");
+ exit(1);
+ }
+
+ return fd;
+}
+
+#ifdef __checked
+static uint64_t prev_block_seq_num = 0;
+
+void assert_block_seq_num(struct block_desc *pbd)
+{
+ if (unlikely(prev_block_seq_num + 1 != BLOCK_SNUM(pbd))) {
+ printf("prev_block_seq_num:%"PRIu64", expected seq:%"PRIu64" != "
+ "actual seq:%"PRIu64"\n", prev_block_seq_num,
+ prev_block_seq_num + 1, (uint64_t) BLOCK_SNUM(pbd));
+ exit(1);
+ }
+
+ prev_block_seq_num = BLOCK_SNUM(pbd);
+}
+
+static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
+{
+ if (BLOCK_NUM_PKTS(pbd)) {
+ if (unlikely(bytes != BLOCK_LEN(pbd))) {
+ printf("block:%u with %upackets, expected len:%u != actual len:%u\n",
+ block_num, BLOCK_NUM_PKTS(pbd), bytes, BLOCK_LEN(pbd));
+ exit(1);
+ }
+ } else {
+ if (unlikely(BLOCK_LEN(pbd) != BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ))) {
+ printf("block:%u, expected len:%lu != actual len:%u\n",
+ block_num, BLOCK_HDR_LEN, BLOCK_LEN(pbd));
+ exit(1);
+ }
+ }
+}
+
+static void assert_block_header(struct block_desc *pbd, const int block_num)
+{
+ uint32_t block_status = BLOCK_STATUS(pbd);
+
+ if (unlikely((block_status & TP_STATUS_USER) == 0)) {
+ printf("block:%u, not in TP_STATUS_USER\n", block_num);
+ exit(1);
+ }
+
+ assert_block_seq_num(pbd);
+}
+#else
+static inline void assert_block_header(struct block_desc *pbd, const int block_num)
+{
+}
+static void assert_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
+{
+}
+#endif
+
+static void display(struct tpacket3_hdr *ppd)
+{
+ struct ethhdr *eth = (struct ethhdr *) ((uint8_t *) ppd + ppd->tp_mac);
+ struct iphdr *ip = (struct iphdr *) ((uint8_t *) eth + ETH_HLEN);
+
+ if (eth->h_proto == htons(ETH_P_IP)) {
+ struct sockaddr_in ss, sd;
+ char sbuff[NI_MAXHOST], dbuff[NI_MAXHOST];
+
+ memset(&ss, 0, sizeof(ss));
+ ss.sin_family = PF_INET;
+ ss.sin_addr.s_addr = ip->saddr;
+ getnameinfo((struct sockaddr *) &ss, sizeof(ss),
+ sbuff, sizeof(sbuff), NULL, 0, NI_NUMERICHOST);
+
+ memset(&sd, 0, sizeof(sd));
+ sd.sin_family = PF_INET;
+ sd.sin_addr.s_addr = ip->daddr;
+ getnameinfo((struct sockaddr *) &sd, sizeof(sd),
+ dbuff, sizeof(dbuff), NULL, 0, NI_NUMERICHOST);
+
+ printf("%s -> %s, ", sbuff, dbuff);
+ }
+
+ printf("rxhash: 0x%x\n", ppd->hv1.tp_rxhash);
+}
+
+static void walk_block(struct block_desc *pbd, const int block_num)
+{
+ int num_pkts = BLOCK_NUM_PKTS(pbd), i;
+ unsigned long bytes = 0;
+ unsigned long bytes_with_padding = BLOCK_PLUS_PRIV(BLOCK_PRIV_AREA_SZ);
+ struct tpacket3_hdr *ppd;
+
+ assert_block_header(pbd, block_num);
+
+ ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd + BLOCK_O2FP(pbd));
+ for (i = 0; i < num_pkts; ++i) {
+ bytes += ppd->tp_snaplen;
+ if (ppd->tp_next_offset)
+ bytes_with_padding += ppd->tp_next_offset;
+ else
+ bytes_with_padding += ALIGN_8(ppd->tp_snaplen + ppd->tp_mac);
+
+ display(ppd);
+
+ ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + ppd->tp_next_offset);
+ __sync_synchronize();
+ }
+
+ assert_block_len(pbd, bytes_with_padding, block_num);
+
+ packets_total += num_pkts;
+ bytes_total += bytes;
+}
+
+void flush_block(struct block_desc *pbd)
+{
+ BLOCK_STATUS(pbd) = TP_STATUS_KERNEL;
+ __sync_synchronize();
+}
+
+static void teardown_socket(struct ring *ring, int fd)
+{
+ munmap(ring->map, ring->req.tp_block_size * ring->req.tp_block_nr);
+ free(ring->rd);
+ close(fd);
+}
+
+int main(int argc, char **argp)
+{
+ int fd, err;
+ socklen_t len;
+ struct ring ring;
+ struct pollfd pfd;
+ unsigned int block_num = 0;
+ struct block_desc *pbd;
+ struct tpacket_stats_v3 stats;
+
+ if (argc != 2) {
+ fprintf(stderr, "Usage: %s INTERFACE\n", argp[0]);
+ return EXIT_FAILURE;
+ }
+
+ signal(SIGINT, sighandler);
+
+ memset(&ring, 0, sizeof(ring));
+ fd = setup_socket(&ring, argp[argc - 1]);
+ assert(fd > 0);
+
+ memset(&pfd, 0, sizeof(pfd));
+ pfd.fd = fd;
+ pfd.events = POLLIN | POLLERR;
+ pfd.revents = 0;
+
+ while (likely(!sigint)) {
+ pbd = (struct block_desc *) ring.rd[block_num].iov_base;
+retry_block:
+ if ((BLOCK_STATUS(pbd) & TP_STATUS_USER) == 0) {
+ poll(&pfd, 1, -1);
+ goto retry_block;
+ }
+
+ walk_block(pbd, block_num);
+ flush_block(pbd);
+ block_num = (block_num + 1) % NUM_BLOCKS;
+ }
+
+ len = sizeof(stats);
+ err = getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
+ if (err < 0) {
+ perror("getsockopt");
+ exit(1);
+ }
+
+ fflush(stdout);
+ printf("\nReceived %u packets, %lu bytes, %u dropped, freeze_q_cnt: %u\n",
+ stats.tp_packets, bytes_total, stats.tp_drops,
+ stats.tp_freeze_q_cnt);
+
+ teardown_socket(&ring, fd);
+ return 0;
+}
+
+-------------------------------------------------------------------------------
+ PACKET_TIMESTAMP
-------------------------------------------------------------------------------
The PACKET_TIMESTAMP setting determines the source of the timestamp in
-the packet meta information. If your NIC is capable of timestamping
-packets in hardware, you can request those hardware timestamps to used.
-Note: you may need to enable the generation of hardware timestamps with
-SIOCSHWTSTAMP.
+the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your
+NIC is capable of timestamping packets in hardware, you can request those
+hardware timestamps to be used. Note: you may need to enable the generation
+of hardware timestamps with SIOCSHWTSTAMP (see related information from
+Documentation/networking/timestamping.txt).
PACKET_TIMESTAMP accepts the same integer bit field as
SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE
@@ -704,8 +1032,36 @@ SOF_TIMESTAMPING_RAW_HARDWARE if both bits are set.
req |= SOF_TIMESTAMPING_SYS_HARDWARE;
setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
-If PACKET_TIMESTAMP is not set, a software timestamp generated inside
-the networking stack is used (the behavior before this setting was added).
+For the mmap(2)ed ring buffers, such timestamps are stored in the
+tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine
+what kind of timestamp has been reported, the tp_status field is binary |'ed
+with the following possible bits ...
+
+ TP_STATUS_TS_SYS_HARDWARE
+ TP_STATUS_TS_RAW_HARDWARE
+ TP_STATUS_TS_SOFTWARE
+
+... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the
+RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set),
+then this means that a software fallback was invoked *within* PF_PACKET's
+processing code (less precise).
+
+Getting timestamps for the TX_RING works as follows: i) fill the ring frames,
+ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant
+frames to be updated resp. the frame handed over to the application, iv) walk
+through the frames to pick up the individual hw/sw timestamps.
+
+Only (!) if transmit timestamping is enabled, then these bits are combined
+with binary | with TP_STATUS_AVAILABLE, so you must check for that in your
+application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))
+in a first step to see if the frame belongs to the application, and then
+one can extract the type of timestamp in a second step from tp_status)!
+
+If you don't care about them, thus having it disabled, checking for
+TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the
+TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec
+members do not contain a valid value. For TX_RINGs, by default no timestamp
+is generated!
See include/linux/net_tstamp.h and Documentation/networking/timestamping
for more information on hardware timestamps.