From f00d4d54f28c0374cc57e6ca07dd648d7684c69c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Micha=C5=82=20Purzy=C5=84ski?= Date: Tue, 21 Apr 2015 11:12:44 +0200 Subject: [PATCH] netsniff-ng: add packet fanout support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This work adds packet fanout support to netsniff-ng. Multiple netsniff-ng instances can join the same fanout group with a particular id in order to improve scaling. Based on different fanout disciplines, e.g. distribute to fanout member by packet hash, round-robin, by arrival cpu, by random, by socket rollover (if one member's socket queue is full, switch to the next one, etc.), by hardware queue mapping, traffic can be distributed to one of the fanout members. Moreover, we also allow the user to specify additional aux arguments, e.g. whether to defrag incoming traffic for the fanout group or not, and whether to roll over a socket in case other disciplines than socket rollover have been used. All that is configurable via command line options. 
Signed-off-by: Michał Purzyński [ dbkm made some bigger changes to get this upstream ready ] Signed-off-by: Daniel Borkmann --- netsniff-ng.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++----- ring_rx.c | 19 ++++++++++++++++++- ring_rx.h | 36 +++++++++++++++++++++++++++++++++++- 3 files changed, 102 insertions(+), 7 deletions(-) diff --git a/netsniff-ng.c b/netsniff-ng.c index dfb99bb3..2eafe31b 100644 --- a/netsniff-ng.c +++ b/netsniff-ng.c @@ -58,14 +58,18 @@ struct ctx { unsigned long kpull, dump_interval, tx_bytes, tx_packets; size_t reserve_size; bool randomize, promiscuous, enforce, jumbo, dump_bpf, hwtimestamp, verbose; - enum pcap_ops_groups pcap; enum dump_mode dump_mode; - uid_t uid; gid_t gid; uint32_t link_type, magic; + enum pcap_ops_groups pcap; + enum dump_mode dump_mode; + uid_t uid; + gid_t gid; + uint32_t link_type, magic; + uint32_t fanout_group, fanout_type; }; static volatile sig_atomic_t sigint = 0; static volatile bool next_dump = false; -static const char *short_options = "d:i:o:rf:MNJt:S:k:n:b:HQmcsqXlvhF:RGAP:Vu:g:T:DBU"; +static const char *short_options = "d:i:o:rf:MNJt:S:k:n:b:HQmcsqXlvhF:RGAP:Vu:g:T:DBUC:K:L:"; static const struct option long_options[] = { {"dev", required_argument, NULL, 'd'}, {"in", required_argument, NULL, 'i'}, @@ -81,6 +85,9 @@ static const struct option long_options[] = { {"user", required_argument, NULL, 'u'}, {"group", required_argument, NULL, 'g'}, {"magic", required_argument, NULL, 'T'}, + {"fanout-group", required_argument, NULL, 'C'}, + {"fanout-type", required_argument, NULL, 'K'}, + {"fanout-opts", required_argument, NULL, 'L'}, {"rand", no_argument, NULL, 'r'}, {"rfraw", no_argument, NULL, 'R'}, {"mmap", no_argument, NULL, 'm'}, @@ -377,7 +384,8 @@ static void receive_to_xmit(struct ctx *ctx) bpf_dump_all(&bpf_ops); bpf_attach_to_sock(rx_sock, &bpf_ops); - ring_rx_setup(&rx_ring, rx_sock, size_in, ifindex_in, &rx_poll, false, ctx->jumbo, ctx->verbose); + ring_rx_setup(&rx_ring, rx_sock, size_in, 
ifindex_in, &rx_poll, false, ctx->jumbo, + ctx->verbose, ctx->fanout_group, ctx->fanout_type); ring_tx_setup(&tx_ring, tx_sock, size_out, ifindex_out, ctx->jumbo, ctx->verbose); dissector_init_all(ctx->print_mode); @@ -925,7 +933,8 @@ static void recv_only_or_dump(struct ctx *ctx) printf("HW timestamping enabled\n"); } - ring_rx_setup(&rx_ring, sock, size, ifindex, &rx_poll, is_defined(HAVE_TPACKET3), true, ctx->verbose); + ring_rx_setup(&rx_ring, sock, size, ifindex, &rx_poll, is_defined(HAVE_TPACKET3), true, + ctx->verbose, ctx->fanout_group, ctx->fanout_type); dissector_init_all(ctx->print_mode); @@ -1073,12 +1082,15 @@ next: static void init_ctx(struct ctx *ctx) { memset(ctx, 0, sizeof(*ctx)); + ctx->uid = getuid(); ctx->uid = getgid(); ctx->cpu = -1; ctx->packet_type = -1; + ctx->fanout_type = PACKET_FANOUT_ROLLOVER; + ctx->magic = ORIGINAL_TCPDUMP_MAGIC; ctx->print_mode = PRINT_NORM; ctx->pcap = PCAP_OPS_SG; @@ -1108,6 +1120,9 @@ static void __noreturn help(void) "Options:\n" " -i|-d|--dev|--in Input source as netdev, pcap or pcap stdin\n" " -o|--out Output sink as netdev, pcap, directory, trafgen, or stdout\n" + " -C|--fanout-group Join packet fanout group\n" + " -K|--fanout-type Apply fanout discipline: hash|lb|cpu|rnd|roll|qm\n" + " -L|--fanout-opts Additional fanout options: defrag|roll\n" " -f|--filter Use BPF filter file from bpfc or tcpdump-like expression\n" " -t|--type Filter for: host|broadcast|multicast|others|outgoing\n" " -F|--interval Dump interval if -o is a dir: KiB/MiB/GiB/s/sec/min/hrs\n" @@ -1223,6 +1238,35 @@ int main(int argc, char **argv) ctx.gid = strtoul(optarg, NULL, 0); ctx.enforce = true; break; + case 'C': + ctx.fanout_group = strtoul(optarg, NULL, 0); + if (ctx.fanout_group == 0) + panic("Non-zero fanout group id required!\n"); + break; + case 'K': + if (!strncmp(optarg, "hash", strlen("hash"))) + ctx.fanout_type = PACKET_FANOUT_HASH; + else if (!strncmp(optarg, "lb", strlen("lb"))) + ctx.fanout_type = PACKET_FANOUT_LB; + else if 
(!strncmp(optarg, "cpu", strlen("cpu"))) + ctx.fanout_type = PACKET_FANOUT_CPU; + else if (!strncmp(optarg, "rnd", strlen("rnd"))) + ctx.fanout_type = PACKET_FANOUT_RND; + else if (!strncmp(optarg, "roll", strlen("roll"))) + ctx.fanout_type = PACKET_FANOUT_ROLLOVER; + else if (!strncmp(optarg, "qm", strlen("qm"))) + ctx.fanout_type = PACKET_FANOUT_QM; + else + panic("Unkown fanout type!\n"); + break; + case 'L': + if (!strncmp(optarg, "defrag", strlen("defrag"))) + ctx.fanout_type |= PACKET_FANOUT_FLAG_DEFRAG; + else if (!strncmp(optarg, "roll", strlen("roll"))) + ctx.fanout_type |= PACKET_FANOUT_FLAG_ROLLOVER; + else + panic("Unkown fanout option!\n"); + break; case 't': if (!strncmp(optarg, "host", strlen("host"))) ctx.packet_type = PACKET_HOST; diff --git a/ring_rx.c b/ring_rx.c index 8ad64d16..32d3f6d7 100644 --- a/ring_rx.c +++ b/ring_rx.c @@ -209,9 +209,25 @@ static void alloc_rx_ring_frames(int sock, struct ring *ring) rx_ring_get_size(ring, v3)); } +void join_fanout_group(int sock, uint32_t fanout_group, uint32_t fanout_type) +{ + uint32_t fanout_opt = 0; + int ret; + + if (fanout_group == 0) + return; + + fanout_opt = (fanout_group & 0xffff) | (fanout_type << 16); + + ret = setsockopt(sock, SOL_PACKET, PACKET_FANOUT, &fanout_opt, + sizeof(fanout_opt)); + if (ret < 0) + panic("Cannot set fanout ring mode!\n"); +} + void ring_rx_setup(struct ring *ring, int sock, size_t size, int ifindex, struct pollfd *poll, bool v3, bool jumbo_support, - bool verbose) + bool verbose, uint32_t fanout_group, uint32_t fanout_type) { fmemset(ring, 0, sizeof(*ring)); setup_rx_ring_layout(sock, ring, size, jumbo_support, v3); @@ -219,6 +235,7 @@ void ring_rx_setup(struct ring *ring, int sock, size_t size, int ifindex, mmap_ring_generic(sock, ring); alloc_rx_ring_frames(sock, ring); bind_ring_generic(sock, ring, ifindex, false); + join_fanout_group(sock, fanout_group, fanout_type); prepare_polling(sock, poll); } diff --git a/ring_rx.h b/ring_rx.h index edd0febe..1f33018e 100644 
--- a/ring_rx.h +++ b/ring_rx.h @@ -13,7 +13,7 @@ extern void ring_rx_setup(struct ring *ring, int sock, size_t size, int ifindex, struct pollfd *poll, bool v3, bool jumbo_support, - bool verbose); + bool verbose, uint32_t fanout_group, uint32_t fanout_type); extern void destroy_rx_ring(int sock, struct ring *ring); extern void sock_rx_net_stats(int sock, unsigned long seen); @@ -39,4 +39,38 @@ static inline void kernel_may_pull_from_rx_block(struct block_desc *pbd) } #endif /* HAVE_TPACKET3 */ +/* Fanout types. */ + +#ifndef PACKET_FANOUT_HASH +# define PACKET_FANOUT_HASH 0 +#endif + +#ifndef PACKET_FANOUT_LB +# define PACKET_FANOUT_LB 1 +#endif + +#ifndef PACKET_FANOUT_CPU +# define PACKET_FANOUT_CPU 2 +#endif + +#ifndef PACKET_FANOUT_ROLLOVER +# define PACKET_FANOUT_ROLLOVER 3 +#endif + +#ifndef PACKET_FANOUT_RND +# define PACKET_FANOUT_RND 4 +#endif + +#ifndef PACKET_FANOUT_QM +# define PACKET_FANOUT_QM 5 +#endif + +#ifndef PACKET_FANOUT_FLAG_ROLLOVER +# define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 +#endif + +#ifndef PACKET_FANOUT_FLAG_DEFRAG +# define PACKET_FANOUT_FLAG_DEFRAG 0x8000 +#endif + #endif /* RX_RING_H */ -- 2.11.4.GIT