hw/net/virtio-net.c

   1 /*
   2  * Virtio Network Device
   3  *
   4  * Copyright IBM, Corp. 2007
   5  *
   6  * Authors:
   7  *  Anthony Liguori   <aliguori@us.ibm.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  */
  13
  14 #include "qemu/osdep.h"
  15 #include "qemu/atomic.h"
  16 #include "qemu/iov.h"
  17 #include "qemu/log.h"
  18 #include "qemu/main-loop.h"
  19 #include "qemu/module.h"
  20 #include "hw/virtio/virtio.h"
  21 #include "net/net.h"
  22 #include "net/checksum.h"
  23 #include "net/tap.h"
  24 #include "qemu/error-report.h"
  25 #include "qemu/timer.h"
  26 #include "qemu/option.h"
  27 #include "qemu/option_int.h"
  28 #include "qemu/config-file.h"
  29 #include "qapi/qmp/qdict.h"
  30 #include "hw/virtio/virtio-net.h"
  31 #include "net/vhost_net.h"
  32 #include "net/announce.h"
  33 #include "hw/virtio/virtio-bus.h"
  34 #include "qapi/error.h"
  35 #include "qapi/qapi-events-net.h"
  36 #include "hw/qdev-properties.h"
  37 #include "qapi/qapi-types-migration.h"
  38 #include "qapi/qapi-events-migration.h"
  39 #include "hw/virtio/virtio-access.h"
  40 #include "migration/misc.h"
  41 #include "standard-headers/linux/ethtool.h"
  42 #include "sysemu/sysemu.h"
  43 #include "trace.h"
  44 #include "monitor/qdev.h"
  45 #include "hw/pci/pci.h"
  46 #include "net_rx_pkt.h"
  47 #include "hw/virtio/vhost.h"
  48 #include "sysemu/qtest.h"
  49
  50 #define VIRTIO_NET_VM_VERSION    11
  51
  52 #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
  53
  54 /* previously fixed value */
  55 #define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
  56 #define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256
  57
  58 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */
  59 #define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
  60 #define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
  61
  62 #define VIRTIO_NET_IP4_ADDR_SIZE   8        /* ipv4 saddr + daddr */
  63
  64 #define VIRTIO_NET_TCP_FLAG         0x3F
  65 #define VIRTIO_NET_TCP_HDR_LENGTH   0xF000
  66
  67 /* IPv4 max payload, 16 bits in the header */
  68 #define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
  69 #define VIRTIO_NET_MAX_TCP_PAYLOAD 65535
  70
  71 /* header length value in ip header without option */
  72 #define VIRTIO_NET_IP4_HEADER_LENGTH 5
  73
  74 #define VIRTIO_NET_IP6_ADDR_SIZE   32      /* ipv6 saddr + daddr */
  75 #define VIRTIO_NET_MAX_IP6_PAYLOAD VIRTIO_NET_MAX_TCP_PAYLOAD
  76
  77 /* Purge coalesced packets timer interval, This value affects the performance
  78    a lot, and should be tuned carefully, '300000'(300us) is the recommended
  79    value to pass the WHQL test, '50000' can gain 2x netperf throughput with
  80    tso/gso/gro 'off'. */
  81 #define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000
  82
  83 #define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
  84                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
  85                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
  86                                          VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
  87                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
  88                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \
  89                                          VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \
  90                                          VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \
  91                                          VIRTIO_NET_RSS_HASH_TYPE_UDP_EX)
  92
  93 static const VirtIOFeature feature_sizes[] = {
  94     {.flags = 1ULL << VIRTIO_NET_F_MAC,
  95      .end = endof(struct virtio_net_config, mac)},
  96     {.flags = 1ULL << VIRTIO_NET_F_STATUS,
  97      .end = endof(struct virtio_net_config, status)},
  98     {.flags = 1ULL << VIRTIO_NET_F_MQ,
  99      .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
 100     {.flags = 1ULL << VIRTIO_NET_F_MTU,
 101      .end = endof(struct virtio_net_config, mtu)},
 102     {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
 103      .end = endof(struct virtio_net_config, duplex)},
 104     {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT),
 105      .end = endof(struct virtio_net_config, supported_hash_types)},
 106     {}
 107 };
 108
 109 static const VirtIOConfigSizeParams cfg_size_params = {
 110     .min_size = endof(struct virtio_net_config, mac),
 111     .max_size = sizeof(struct virtio_net_config),
 112     .feature_sizes = feature_sizes
 113 };
 114
 115 static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc)
 116 {
 117     VirtIONet *n = qemu_get_nic_opaque(nc);
 118
 119     return &n->vqs[nc->queue_index];
 120 }
 121
 122 static int vq2q(int queue_index)
 123 {
 124     return queue_index / 2;
 125 }
 126
 127 static void flush_or_purge_queued_packets(NetClientState *nc)
 128 {
 129     if (!nc->peer) {
 130         return;
 131     }
 132
 133     qemu_flush_or_purge_queued_packets(nc->peer, true);
 134     assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
 135 }
 136
 137 /* TODO
 138  * - we could suppress RX interrupt if we were so inclined.
 139  */
 140
 141 static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
 142 {
 143     VirtIONet *n = VIRTIO_NET(vdev);
 144     struct virtio_net_config netcfg;
 145     NetClientState *nc = qemu_get_queue(n->nic);
 146     static const MACAddr zero = { .a = { 0, 0, 0, 0, 0, 0 } };
 147
 148     int ret = 0;
 149     memset(&netcfg, 0 , sizeof(struct virtio_net_config));
 150     virtio_stw_p(vdev, &netcfg.status, n->status);
 151     virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queue_pairs);
 152     virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
 153     memcpy(netcfg.mac, n->mac, ETH_ALEN);
 154     virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
 155     netcfg.duplex = n->net_conf.duplex;
 156     netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE;
 157     virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length,
 158                  virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ?
 159                  VIRTIO_NET_RSS_MAX_TABLE_LEN : 1);
 160     virtio_stl_p(vdev, &netcfg.supported_hash_types,
 161                  VIRTIO_NET_RSS_SUPPORTED_HASHES);
 162     memcpy(config, &netcfg, n->config_size);
 163
 164     /*
 165      * Is this VDPA? No peer means not VDPA: there's no way to
 166      * disconnect/reconnect a VDPA peer.
 167      */
 168     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
 169         ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
 170                                    n->config_size);
 171         if (ret != -1) {
 172             /*
 173              * Some NIC/kernel combinations present 0 as the mac address.  As
 174              * that is not a legal address, try to proceed with the
 175              * address from the QEMU command line in the hope that the
 176              * address has been configured correctly elsewhere - just not
 177              * reported by the device.
 178              */
 179             if (memcmp(&netcfg.mac, &zero, sizeof(zero)) == 0) {
 180                 info_report("Zero hardware mac address detected. Ignoring.");
 181                 memcpy(netcfg.mac, n->mac, ETH_ALEN);
 182             }
 183             memcpy(config, &netcfg, n->config_size);
 184         }
 185     }
 186 }
 187
 188 static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
 189 {
 190     VirtIONet *n = VIRTIO_NET(vdev);
 191     struct virtio_net_config netcfg = {};
 192     NetClientState *nc = qemu_get_queue(n->nic);
 193
 194     memcpy(&netcfg, config, n->config_size);
 195
 196     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
 197         !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
 198         memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 199         memcpy(n->mac, netcfg.mac, ETH_ALEN);
 200         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
 201     }
 202
 203     /*
 204      * Is this VDPA? No peer means not VDPA: there's no way to
 205      * disconnect/reconnect a VDPA peer.
 206      */
 207     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
 208         vhost_net_set_config(get_vhost_net(nc->peer),
 209                              (uint8_t *)&netcfg, 0, n->config_size,
 210                              VHOST_SET_CONFIG_TYPE_MASTER);
 211       }
 212 }
 213
 214 static bool virtio_net_started(VirtIONet *n, uint8_t status)
 215 {
 216     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 217     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
 218         (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running;
 219 }
 220
 221 static void virtio_net_announce_notify(VirtIONet *net)
 222 {
 223     VirtIODevice *vdev = VIRTIO_DEVICE(net);
 224     trace_virtio_net_announce_notify();
 225
 226     net->status |= VIRTIO_NET_S_ANNOUNCE;
 227     virtio_notify_config(vdev);
 228 }
 229
 230 static void virtio_net_announce_timer(void *opaque)
 231 {
 232     VirtIONet *n = opaque;
 233     trace_virtio_net_announce_timer(n->announce_timer.round);
 234
 235     n->announce_timer.round--;
 236     virtio_net_announce_notify(n);
 237 }
 238
 239 static void virtio_net_announce(NetClientState *nc)
 240 {
 241     VirtIONet *n = qemu_get_nic_opaque(nc);
 242     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 243
 244     /*
 245      * Make sure the virtio migration announcement timer isn't running
 246      * If it is, let it trigger announcement so that we do not cause
 247      * confusion.
 248      */
 249     if (n->announce_timer.round) {
 250         return;
 251     }
 252
 253     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
 254         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
 255             virtio_net_announce_notify(n);
 256     }
 257 }
 258
 259 static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
 260 {
 261     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 262     NetClientState *nc = qemu_get_queue(n->nic);
 263     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
 264     int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
 265               n->max_ncs - n->max_queue_pairs : 0;
 266
 267     if (!get_vhost_net(nc->peer)) {
 268         return;
 269     }
 270
 271     if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
 272         !!n->vhost_started) {
 273         return;
 274     }
 275     if (!n->vhost_started) {
 276         int r, i;
 277
 278         if (n->needs_vnet_hdr_swap) {
 279             error_report("backend does not support %s vnet headers; "
 280                          "falling back on userspace virtio",
 281                          virtio_is_big_endian(vdev) ? "BE" : "LE");
 282             return;
 283         }
 284
 285         /* Any packets outstanding? Purge them to avoid touching rings
 286          * when vhost is running.
 287          */
 288         for (i = 0;  i < queue_pairs; i++) {
 289             NetClientState *qnc = qemu_get_subqueue(n->nic, i);
 290
 291             /* Purge both directions: TX and RX. */
 292             qemu_net_queue_purge(qnc->peer->incoming_queue, qnc);
 293             qemu_net_queue_purge(qnc->incoming_queue, qnc->peer);
 294         }
 295
 296         if (virtio_has_feature(vdev->guest_features, VIRTIO_NET_F_MTU)) {
 297             r = vhost_net_set_mtu(get_vhost_net(nc->peer), n->net_conf.mtu);
 298             if (r < 0) {
 299                 error_report("%uBytes MTU not supported by the backend",
 300                              n->net_conf.mtu);
 301
 302                 return;
 303             }
 304         }
 305
 306         n->vhost_started = 1;
 307         r = vhost_net_start(vdev, n->nic->ncs, queue_pairs, cvq);
 308         if (r < 0) {
 309             error_report("unable to start vhost net: %d: "
 310                          "falling back on userspace virtio", -r);
 311             n->vhost_started = 0;
 312         }
 313     } else {
 314         vhost_net_stop(vdev, n->nic->ncs, queue_pairs, cvq);
 315         n->vhost_started = 0;
 316     }
 317 }
 318
 319 static int virtio_net_set_vnet_endian_one(VirtIODevice *vdev,
 320                                           NetClientState *peer,
 321                                           bool enable)
 322 {
 323     if (virtio_is_big_endian(vdev)) {
 324         return qemu_set_vnet_be(peer, enable);
 325     } else {
 326         return qemu_set_vnet_le(peer, enable);
 327     }
 328 }
 329
 330 static bool virtio_net_set_vnet_endian(VirtIODevice *vdev, NetClientState *ncs,
 331                                        int queue_pairs, bool enable)
 332 {
 333     int i;
 334
 335     for (i = 0; i < queue_pairs; i++) {
 336         if (virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, enable) < 0 &&
 337             enable) {
 338             while (--i >= 0) {
 339                 virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, false);
 340             }
 341
 342             return true;
 343         }
 344     }
 345
 346     return false;
 347 }
 348
 349 static void virtio_net_vnet_endian_status(VirtIONet *n, uint8_t status)
 350 {
 351     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 352     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
 353
 354     if (virtio_net_started(n, status)) {
 355         /* Before using the device, we tell the network backend about the
 356          * endianness to use when parsing vnet headers. If the backend
 357          * can't do it, we fallback onto fixing the headers in the core
 358          * virtio-net code.
 359          */
 360         n->needs_vnet_hdr_swap = virtio_net_set_vnet_endian(vdev, n->nic->ncs,
 361                                                             queue_pairs, true);
 362     } else if (virtio_net_started(n, vdev->status)) {
 363         /* After using the device, we need to reset the network backend to
 364          * the default (guest native endianness), otherwise the guest may
 365          * lose network connectivity if it is rebooted into a different
 366          * endianness.
 367          */
 368         virtio_net_set_vnet_endian(vdev, n->nic->ncs, queue_pairs, false);
 369     }
 370 }
 371
 372 static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq)
 373 {
 374     unsigned int dropped = virtqueue_drop_all(vq);
 375     if (dropped) {
 376         virtio_notify(vdev, vq);
 377     }
 378 }
 379
 380 static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
 381 {
 382     VirtIONet *n = VIRTIO_NET(vdev);
 383     VirtIONetQueue *q;
 384     int i;
 385     uint8_t queue_status;
 386
 387     virtio_net_vnet_endian_status(n, status);
 388     virtio_net_vhost_status(n, status);
 389
 390     for (i = 0; i < n->max_queue_pairs; i++) {
 391         NetClientState *ncs = qemu_get_subqueue(n->nic, i);
 392         bool queue_started;
 393         q = &n->vqs[i];
 394
 395         if ((!n->multiqueue && i != 0) || i >= n->curr_queue_pairs) {
 396             queue_status = 0;
 397         } else {
 398             queue_status = status;
 399         }
 400         queue_started =
 401             virtio_net_started(n, queue_status) && !n->vhost_started;
 402
 403         if (queue_started) {
 404             qemu_flush_queued_packets(ncs);
 405         }
 406
 407         if (!q->tx_waiting) {
 408             continue;
 409         }
 410
 411         if (queue_started) {
 412             if (q->tx_timer) {
 413                 timer_mod(q->tx_timer,
 414                                qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
 415             } else {
 416                 qemu_bh_schedule(q->tx_bh);
 417             }
 418         } else {
 419             if (q->tx_timer) {
 420                 timer_del(q->tx_timer);
 421             } else {
 422                 qemu_bh_cancel(q->tx_bh);
 423             }
 424             if ((n->status & VIRTIO_NET_S_LINK_UP) == 0 &&
 425                 (queue_status & VIRTIO_CONFIG_S_DRIVER_OK) &&
 426                 vdev->vm_running) {
 427                 /* if tx is waiting we are likely have some packets in tx queue
 428                  * and disabled notification */
 429                 q->tx_waiting = 0;
 430                 virtio_queue_set_notification(q->tx_vq, 1);
 431                 virtio_net_drop_tx_queue_data(vdev, q->tx_vq);
 432             }
 433         }
 434     }
 435 }
 436
 437 static void virtio_net_set_link_status(NetClientState *nc)
 438 {
 439     VirtIONet *n = qemu_get_nic_opaque(nc);
 440     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 441     uint16_t old_status = n->status;
 442
 443     if (nc->link_down)
 444         n->status &= ~VIRTIO_NET_S_LINK_UP;
 445     else
 446         n->status |= VIRTIO_NET_S_LINK_UP;
 447
 448     if (n->status != old_status)
 449         virtio_notify_config(vdev);
 450
 451     virtio_net_set_status(vdev, vdev->status);
 452 }
 453
 454 static void rxfilter_notify(NetClientState *nc)
 455 {
 456     VirtIONet *n = qemu_get_nic_opaque(nc);
 457
 458     if (nc->rxfilter_notify_enabled) {
 459         char *path = object_get_canonical_path(OBJECT(n->qdev));
 460         qapi_event_send_nic_rx_filter_changed(!!n->netclient_name,
 461                                               n->netclient_name, path);
 462         g_free(path);
 463
 464         /* disable event notification to avoid events flooding */
 465         nc->rxfilter_notify_enabled = 0;
 466     }
 467 }
 468
 469 static intList *get_vlan_table(VirtIONet *n)
 470 {
 471     intList *list;
 472     int i, j;
 473
 474     list = NULL;
 475     for (i = 0; i < MAX_VLAN >> 5; i++) {
 476         for (j = 0; n->vlans[i] && j <= 0x1f; j++) {
 477             if (n->vlans[i] & (1U << j)) {
 478                 QAPI_LIST_PREPEND(list, (i << 5) + j);
 479             }
 480         }
 481     }
 482
 483     return list;
 484 }
 485
 486 static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
 487 {
 488     VirtIONet *n = qemu_get_nic_opaque(nc);
 489     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 490     RxFilterInfo *info;
 491     strList *str_list;
 492     int i;
 493
 494     info = g_malloc0(sizeof(*info));
 495     info->name = g_strdup(nc->name);
 496     info->promiscuous = n->promisc;
 497
 498     if (n->nouni) {
 499         info->unicast = RX_STATE_NONE;
 500     } else if (n->alluni) {
 501         info->unicast = RX_STATE_ALL;
 502     } else {
 503         info->unicast = RX_STATE_NORMAL;
 504     }
 505
 506     if (n->nomulti) {
 507         info->multicast = RX_STATE_NONE;
 508     } else if (n->allmulti) {
 509         info->multicast = RX_STATE_ALL;
 510     } else {
 511         info->multicast = RX_STATE_NORMAL;
 512     }
 513
 514     info->broadcast_allowed = n->nobcast;
 515     info->multicast_overflow = n->mac_table.multi_overflow;
 516     info->unicast_overflow = n->mac_table.uni_overflow;
 517
 518     info->main_mac = qemu_mac_strdup_printf(n->mac);
 519
 520     str_list = NULL;
 521     for (i = 0; i < n->mac_table.first_multi; i++) {
 522         QAPI_LIST_PREPEND(str_list,
 523                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
 524     }
 525     info->unicast_table = str_list;
 526
 527     str_list = NULL;
 528     for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
 529         QAPI_LIST_PREPEND(str_list,
 530                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
 531     }
 532     info->multicast_table = str_list;
 533     info->vlan_table = get_vlan_table(n);
 534
 535     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
 536         info->vlan = RX_STATE_ALL;
 537     } else if (!info->vlan_table) {
 538         info->vlan = RX_STATE_NONE;
 539     } else {
 540         info->vlan = RX_STATE_NORMAL;
 541     }
 542
 543     /* enable event notification after query */
 544     nc->rxfilter_notify_enabled = 1;
 545
 546     return info;
 547 }
 548
 549 static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
 550 {
 551     VirtIONet *n = VIRTIO_NET(vdev);
 552     NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
 553
 554     if (!nc->peer) {
 555         return;
 556     }
 557
 558     if (get_vhost_net(nc->peer) &&
 559         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
 560         vhost_net_virtqueue_reset(vdev, nc, queue_index);
 561     }
 562
 563     flush_or_purge_queued_packets(nc);
 564 }
 565
 566 static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
 567 {
 568     VirtIONet *n = VIRTIO_NET(vdev);
 569     NetClientState *nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
 570     int r;
 571
 572     if (!nc->peer || !vdev->vhost_started) {
 573         return;
 574     }
 575
 576     if (get_vhost_net(nc->peer) &&
 577         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
 578         r = vhost_net_virtqueue_restart(vdev, nc, queue_index);
 579         if (r < 0) {
 580             error_report("unable to restart vhost net virtqueue: %d, "
 581                             "when resetting the queue", queue_index);
 582         }
 583     }
 584 }
 585
 586 static void virtio_net_reset(VirtIODevice *vdev)
 587 {
 588     VirtIONet *n = VIRTIO_NET(vdev);
 589     int i;
 590
 591     /* Reset back to compatibility mode */
 592     n->promisc = 1;
 593     n->allmulti = 0;
 594     n->alluni = 0;
 595     n->nomulti = 0;
 596     n->nouni = 0;
 597     n->nobcast = 0;
 598     /* multiqueue is disabled by default */
 599     n->curr_queue_pairs = 1;
 600     timer_del(n->announce_timer.tm);
 601     n->announce_timer.round = 0;
 602     n->status &= ~VIRTIO_NET_S_ANNOUNCE;
 603
 604     /* Flush any MAC and VLAN filter table state */
 605     n->mac_table.in_use = 0;
 606     n->mac_table.first_multi = 0;
 607     n->mac_table.multi_overflow = 0;
 608     n->mac_table.uni_overflow = 0;
 609     memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
 610     memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
 611     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
 612     memset(n->vlans, 0, MAX_VLAN >> 3);
 613
 614     /* Flush any async TX */
 615     for (i = 0;  i < n->max_queue_pairs; i++) {
 616         flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
 617     }
 618 }
 619
 620 static void peer_test_vnet_hdr(VirtIONet *n)
 621 {
 622     NetClientState *nc = qemu_get_queue(n->nic);
 623     if (!nc->peer) {
 624         return;
 625     }
 626
 627     n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
 628 }
 629
 630 static int peer_has_vnet_hdr(VirtIONet *n)
 631 {
 632     return n->has_vnet_hdr;
 633 }
 634
 635 static int peer_has_ufo(VirtIONet *n)
 636 {
 637     if (!peer_has_vnet_hdr(n))
 638         return 0;
 639
 640     n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);
 641
 642     return n->has_ufo;
 643 }
 644
 645 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
 646                                        int version_1, int hash_report)
 647 {
 648     int i;
 649     NetClientState *nc;
 650
 651     n->mergeable_rx_bufs = mergeable_rx_bufs;
 652
 653     if (version_1) {
 654         n->guest_hdr_len = hash_report ?
 655             sizeof(struct virtio_net_hdr_v1_hash) :
 656             sizeof(struct virtio_net_hdr_mrg_rxbuf);
 657         n->rss_data.populate_hash = !!hash_report;
 658     } else {
 659         n->guest_hdr_len = n->mergeable_rx_bufs ?
 660             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
 661             sizeof(struct virtio_net_hdr);
 662     }
 663
 664     for (i = 0; i < n->max_queue_pairs; i++) {
 665         nc = qemu_get_subqueue(n->nic, i);
 666
 667         if (peer_has_vnet_hdr(n) &&
 668             qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
 669             qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
 670             n->host_hdr_len = n->guest_hdr_len;
 671         }
 672     }
 673 }
 674
 675 static int virtio_net_max_tx_queue_size(VirtIONet *n)
 676 {
 677     NetClientState *peer = n->nic_conf.peers.ncs[0];
 678
 679     /*
 680      * Backends other than vhost-user or vhost-vdpa don't support max queue
 681      * size.
 682      */
 683     if (!peer) {
 684         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
 685     }
 686
 687     switch(peer->info->type) {
 688     case NET_CLIENT_DRIVER_VHOST_USER:
 689     case NET_CLIENT_DRIVER_VHOST_VDPA:
 690         return VIRTQUEUE_MAX_SIZE;
 691     default:
 692         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
 693     };
 694 }
 695
 696 static int peer_attach(VirtIONet *n, int index)
 697 {
 698     NetClientState *nc = qemu_get_subqueue(n->nic, index);
 699
 700     if (!nc->peer) {
 701         return 0;
 702     }
 703
 704     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
 705         vhost_set_vring_enable(nc->peer, 1);
 706     }
 707
 708     if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
 709         return 0;
 710     }
 711
 712     if (n->max_queue_pairs == 1) {
 713         return 0;
 714     }
 715
 716     return tap_enable(nc->peer);
 717 }
 718
 719 static int peer_detach(VirtIONet *n, int index)
 720 {
 721     NetClientState *nc = qemu_get_subqueue(n->nic, index);
 722
 723     if (!nc->peer) {
 724         return 0;
 725     }
 726
 727     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
 728         vhost_set_vring_enable(nc->peer, 0);
 729     }
 730
 731     if (nc->peer->info->type !=  NET_CLIENT_DRIVER_TAP) {
 732         return 0;
 733     }
 734
 735     return tap_disable(nc->peer);
 736 }
 737
 738 static void virtio_net_set_queue_pairs(VirtIONet *n)
 739 {
 740     int i;
 741     int r;
 742
 743     if (n->nic->peer_deleted) {
 744         return;
 745     }
 746
 747     for (i = 0; i < n->max_queue_pairs; i++) {
 748         if (i < n->curr_queue_pairs) {
 749             r = peer_attach(n, i);
 750             assert(!r);
 751         } else {
 752             r = peer_detach(n, i);
 753             assert(!r);
 754         }
 755     }
 756 }
 757
 758 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
 759
 760 static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
 761                                         Error **errp)
 762 {
 763     VirtIONet *n = VIRTIO_NET(vdev);
 764     NetClientState *nc = qemu_get_queue(n->nic);
 765
 766     /* Firstly sync all virtio-net possible supported features */
 767     features |= n->host_features;
 768
 769     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
 770
 771     if (!peer_has_vnet_hdr(n)) {
 772         virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
 773         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
 774         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
 775         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);
 776
 777         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
 778         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
 779         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
 780         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
 781
 782         virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
 783     }
 784
 785     if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
 786         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
 787         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
 788     }
 789
 790     if (!get_vhost_net(nc->peer)) {
 791         return features;
 792     }
 793
 794     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
 795         virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
 796     }
 797     features = vhost_net_get_features(get_vhost_net(nc->peer), features);
 798     vdev->backend_features = features;
 799
 800     if (n->mtu_bypass_backend &&
 801             (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) {
 802         features |= (1ULL << VIRTIO_NET_F_MTU);
 803     }
 804
 805     return features;
 806 }
 807
 808 static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
 809 {
 810     uint64_t features = 0;
 811
 812     /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
 813      * but also these: */
 814     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
 815     virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
 816     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
 817     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
 818     virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);
 819
 820     return features;
 821 }
 822
 823 static void virtio_net_apply_guest_offloads(VirtIONet *n)
 824 {
 825     qemu_set_offload(qemu_get_queue(n->nic)->peer,
 826             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
 827             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
 828             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
 829             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
 830             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)));
 831 }
 832
 833 static uint64_t virtio_net_guest_offloads_by_features(uint32_t features)
 834 {
 835     static const uint64_t guest_offloads_mask =
 836         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
 837         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
 838         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
 839         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
 840         (1ULL << VIRTIO_NET_F_GUEST_UFO);
 841
 842     return guest_offloads_mask & features;
 843 }
 844
 845 static inline uint64_t virtio_net_supported_guest_offloads(VirtIONet *n)
 846 {
 847     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 848     return virtio_net_guest_offloads_by_features(vdev->guest_features);
 849 }
 850
 851 typedef struct {
 852     VirtIONet *n;
 853     DeviceState *dev;
 854 } FailoverDevice;
 855
 856 /**
 857  * Set the failover primary device
 858  *
 859  * @opaque: FailoverId to setup
 860  * @opts: opts for device we are handling
 861  * @errp: returns an error if this function fails
 862  */
 863 static int failover_set_primary(DeviceState *dev, void *opaque)
 864 {
 865     FailoverDevice *fdev = opaque;
 866     PCIDevice *pci_dev = (PCIDevice *)
 867         object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE);
 868
 869     if (!pci_dev) {
 870         return 0;
 871     }
 872
 873     if (!g_strcmp0(pci_dev->failover_pair_id, fdev->n->netclient_name)) {
 874         fdev->dev = dev;
 875         return 1;
 876     }
 877
 878     return 0;
 879 }
 880
 881 /**
 882  * Find the primary device for this failover virtio-net
 883  *
 884  * @n: VirtIONet device
 885  * @errp: returns an error if this function fails
 886  */
 887 static DeviceState *failover_find_primary_device(VirtIONet *n)
 888 {
 889     FailoverDevice fdev = {
 890         .n = n,
 891     };
 892
 893     qbus_walk_children(sysbus_get_default(), failover_set_primary, NULL,
 894                        NULL, NULL, &fdev);
 895     return fdev.dev;
 896 }
 897
 898 static void failover_add_primary(VirtIONet *n, Error **errp)
 899 {
 900     Error *err = NULL;
 901     DeviceState *dev = failover_find_primary_device(n);
 902
 903     if (dev) {
 904         return;
 905     }
 906
 907     if (!n->primary_opts) {
 908         error_setg(errp, "Primary device not found");
 909         error_append_hint(errp, "Virtio-net failover will not work. Make "
 910                           "sure primary device has parameter"
 911                           " failover_pair_id=%s\n", n->netclient_name);
 912         return;
 913     }
 914
 915     dev = qdev_device_add_from_qdict(n->primary_opts,
 916                                      n->primary_opts_from_json,
 917                                      &err);
 918     if (err) {
 919         qobject_unref(n->primary_opts);
 920         n->primary_opts = NULL;
 921     } else {
 922         object_unref(OBJECT(dev));
 923     }
 924     error_propagate(errp, err);
 925 }
 926
 927 static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
 928 {
 929     VirtIONet *n = VIRTIO_NET(vdev);
 930     Error *err = NULL;
 931     int i;
 932
 933     if (n->mtu_bypass_backend &&
 934             !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) {
 935         features &= ~(1ULL << VIRTIO_NET_F_MTU);
 936     }
 937
 938     virtio_net_set_multiqueue(n,
 939                               virtio_has_feature(features, VIRTIO_NET_F_RSS) ||
 940                               virtio_has_feature(features, VIRTIO_NET_F_MQ));
 941
 942     virtio_net_set_mrg_rx_bufs(n,
 943                                virtio_has_feature(features,
 944                                                   VIRTIO_NET_F_MRG_RXBUF),
 945                                virtio_has_feature(features,
 946                                                   VIRTIO_F_VERSION_1),
 947                                virtio_has_feature(features,
 948                                                   VIRTIO_NET_F_HASH_REPORT));
 949
 950     n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
 951         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4);
 952     n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
 953         virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6);
 954     n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS);
 955
 956     if (n->has_vnet_hdr) {
 957         n->curr_guest_offloads =
 958             virtio_net_guest_offloads_by_features(features);
 959         virtio_net_apply_guest_offloads(n);
 960     }
 961
 962     for (i = 0;  i < n->max_queue_pairs; i++) {
 963         NetClientState *nc = qemu_get_subqueue(n->nic, i);
 964
 965         if (!get_vhost_net(nc->peer)) {
 966             continue;
 967         }
 968         vhost_net_ack_features(get_vhost_net(nc->peer), features);
 969     }
 970
 971     if (virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
 972         memset(n->vlans, 0, MAX_VLAN >> 3);
 973     } else {
 974         memset(n->vlans, 0xff, MAX_VLAN >> 3);
 975     }
 976
 977     if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
 978         qapi_event_send_failover_negotiated(n->netclient_name);
 979         qatomic_set(&n->failover_primary_hidden, false);
 980         failover_add_primary(n, &err);
 981         if (err) {
 982             if (!qtest_enabled()) {
 983                 warn_report_err(err);
 984             } else {
 985                 error_free(err);
 986             }
 987         }
 988     }
 989 }
 990
 991 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
 992                                      struct iovec *iov, unsigned int iov_cnt)
 993 {
 994     uint8_t on;
 995     size_t s;
 996     NetClientState *nc = qemu_get_queue(n->nic);
 997
 998     s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
 999     if (s != sizeof(on)) {
1000         return VIRTIO_NET_ERR;
1001     }
1002
1003     if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
1004         n->promisc = on;
1005     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
1006         n->allmulti = on;
1007     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
1008         n->alluni = on;
1009     } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
1010         n->nomulti = on;
1011     } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
1012         n->nouni = on;
1013     } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
1014         n->nobcast = on;
1015     } else {
1016         return VIRTIO_NET_ERR;
1017     }
1018
1019     rxfilter_notify(nc);
1020
1021     return VIRTIO_NET_OK;
1022 }
1023
1024 static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
1025                                      struct iovec *iov, unsigned int iov_cnt)
1026 {
1027     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1028     uint64_t offloads;
1029     size_t s;
1030
1031     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
1032         return VIRTIO_NET_ERR;
1033     }
1034
1035     s = iov_to_buf(iov, iov_cnt, 0, &offloads, sizeof(offloads));
1036     if (s != sizeof(offloads)) {
1037         return VIRTIO_NET_ERR;
1038     }
1039
1040     if (cmd == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET) {
1041         uint64_t supported_offloads;
1042
1043         offloads = virtio_ldq_p(vdev, &offloads);
1044
1045         if (!n->has_vnet_hdr) {
1046             return VIRTIO_NET_ERR;
1047         }
1048
1049         n->rsc4_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1050             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO4);
1051         n->rsc6_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1052             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO6);
1053         virtio_clear_feature(&offloads, VIRTIO_NET_F_RSC_EXT);
1054
1055         supported_offloads = virtio_net_supported_guest_offloads(n);
1056         if (offloads & ~supported_offloads) {
1057             return VIRTIO_NET_ERR;
1058         }
1059
1060         n->curr_guest_offloads = offloads;
1061         virtio_net_apply_guest_offloads(n);
1062
1063         return VIRTIO_NET_OK;
1064     } else {
1065         return VIRTIO_NET_ERR;
1066     }
1067 }
1068
1069 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
1070                                  struct iovec *iov, unsigned int iov_cnt)
1071 {
1072     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1073     struct virtio_net_ctrl_mac mac_data;
1074     size_t s;
1075     NetClientState *nc = qemu_get_queue(n->nic);
1076
1077     if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
1078         if (iov_size(iov, iov_cnt) != sizeof(n->mac)) {
1079             return VIRTIO_NET_ERR;
1080         }
1081         s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac));
1082         assert(s == sizeof(n->mac));
1083         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
1084         rxfilter_notify(nc);
1085
1086         return VIRTIO_NET_OK;
1087     }
1088
1089     if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
1090         return VIRTIO_NET_ERR;
1091     }
1092
1093     int in_use = 0;
1094     int first_multi = 0;
1095     uint8_t uni_overflow = 0;
1096     uint8_t multi_overflow = 0;
1097     uint8_t *macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
1098
1099     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1100                    sizeof(mac_data.entries));
1101     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1102     if (s != sizeof(mac_data.entries)) {
1103         goto error;
1104     }
1105     iov_discard_front(&iov, &iov_cnt, s);
1106
1107     if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
1108         goto error;
1109     }
1110
1111     if (mac_data.entries <= MAC_TABLE_ENTRIES) {
1112         s = iov_to_buf(iov, iov_cnt, 0, macs,
1113                        mac_data.entries * ETH_ALEN);
1114         if (s != mac_data.entries * ETH_ALEN) {
1115             goto error;
1116         }
1117         in_use += mac_data.entries;
1118     } else {
1119         uni_overflow = 1;
1120     }
1121
1122     iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);
1123
1124     first_multi = in_use;
1125
1126     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1127                    sizeof(mac_data.entries));
1128     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1129     if (s != sizeof(mac_data.entries)) {
1130         goto error;
1131     }
1132
1133     iov_discard_front(&iov, &iov_cnt, s);
1134
1135     if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
1136         goto error;
1137     }
1138
1139     if (mac_data.entries <= MAC_TABLE_ENTRIES - in_use) {
1140         s = iov_to_buf(iov, iov_cnt, 0, &macs[in_use * ETH_ALEN],
1141                        mac_data.entries * ETH_ALEN);
1142         if (s != mac_data.entries * ETH_ALEN) {
1143             goto error;
1144         }
1145         in_use += mac_data.entries;
1146     } else {
1147         multi_overflow = 1;
1148     }
1149
1150     n->mac_table.in_use = in_use;
1151     n->mac_table.first_multi = first_multi;
1152     n->mac_table.uni_overflow = uni_overflow;
1153     n->mac_table.multi_overflow = multi_overflow;
1154     memcpy(n->mac_table.macs, macs, MAC_TABLE_ENTRIES * ETH_ALEN);
1155     g_free(macs);
1156     rxfilter_notify(nc);
1157
1158     return VIRTIO_NET_OK;
1159
1160 error:
1161     g_free(macs);
1162     return VIRTIO_NET_ERR;
1163 }
1164
1165 static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd,
1166                                         struct iovec *iov, unsigned int iov_cnt)
1167 {
1168     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1169     uint16_t vid;
1170     size_t s;
1171     NetClientState *nc = qemu_get_queue(n->nic);
1172
1173     s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid));
1174     vid = virtio_lduw_p(vdev, &vid);
1175     if (s != sizeof(vid)) {
1176         return VIRTIO_NET_ERR;
1177     }
1178
1179     if (vid >= MAX_VLAN)
1180         return VIRTIO_NET_ERR;
1181
1182     if (cmd == VIRTIO_NET_CTRL_VLAN_ADD)
1183         n->vlans[vid >> 5] |= (1U << (vid & 0x1f));
1184     else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL)
1185         n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f));
1186     else
1187         return VIRTIO_NET_ERR;
1188
1189     rxfilter_notify(nc);
1190
1191     return VIRTIO_NET_OK;
1192 }
1193
1194 static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd,
1195                                       struct iovec *iov, unsigned int iov_cnt)
1196 {
1197     trace_virtio_net_handle_announce(n->announce_timer.round);
1198     if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK &&
1199         n->status & VIRTIO_NET_S_ANNOUNCE) {
1200         n->status &= ~VIRTIO_NET_S_ANNOUNCE;
1201         if (n->announce_timer.round) {
1202             qemu_announce_timer_step(&n->announce_timer);
1203         }
1204         return VIRTIO_NET_OK;
1205     } else {
1206         return VIRTIO_NET_ERR;
1207     }
1208 }
1209
1210 static void virtio_net_detach_epbf_rss(VirtIONet *n);
1211
1212 static void virtio_net_disable_rss(VirtIONet *n)
1213 {
1214     if (n->rss_data.enabled) {
1215         trace_virtio_net_rss_disable();
1216     }
1217     n->rss_data.enabled = false;
1218
1219     virtio_net_detach_epbf_rss(n);
1220 }
1221
1222 static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
1223 {
1224     NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
1225     if (nc == NULL || nc->info->set_steering_ebpf == NULL) {
1226         return false;
1227     }
1228
1229     return nc->info->set_steering_ebpf(nc, prog_fd);
1230 }
1231
1232 static void rss_data_to_rss_config(struct VirtioNetRssData *data,
1233                                    struct EBPFRSSConfig *config)
1234 {
1235     config->redirect = data->redirect;
1236     config->populate_hash = data->populate_hash;
1237     config->hash_types = data->hash_types;
1238     config->indirections_len = data->indirections_len;
1239     config->default_queue = data->default_queue;
1240 }
1241
1242 static bool virtio_net_attach_epbf_rss(VirtIONet *n)
1243 {
1244     struct EBPFRSSConfig config = {};
1245
1246     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
1247         return false;
1248     }
1249
1250     rss_data_to_rss_config(&n->rss_data, &config);
1251
1252     if (!ebpf_rss_set_all(&n->ebpf_rss, &config,
1253                           n->rss_data.indirections_table, n->rss_data.key)) {
1254         return false;
1255     }
1256
1257     if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
1258         return false;
1259     }
1260
1261     return true;
1262 }
1263
1264 static void virtio_net_detach_epbf_rss(VirtIONet *n)
1265 {
1266     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1267 }
1268
1269 static bool virtio_net_load_ebpf(VirtIONet *n)
1270 {
1271     if (!virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
1272         /* backend does't support steering ebpf */
1273         return false;
1274     }
1275
1276     return ebpf_rss_load(&n->ebpf_rss);
1277 }
1278
1279 static void virtio_net_unload_ebpf(VirtIONet *n)
1280 {
1281     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1282     ebpf_rss_unload(&n->ebpf_rss);
1283 }
1284
1285 static uint16_t virtio_net_handle_rss(VirtIONet *n,
1286                                       struct iovec *iov,
1287                                       unsigned int iov_cnt,
1288                                       bool do_rss)
1289 {
1290     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1291     struct virtio_net_rss_config cfg;
1292     size_t s, offset = 0, size_get;
1293     uint16_t queue_pairs, i;
1294     struct {
1295         uint16_t us;
1296         uint8_t b;
1297     } QEMU_PACKED temp;
1298     const char *err_msg = "";
1299     uint32_t err_value = 0;
1300
1301     if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) {
1302         err_msg = "RSS is not negotiated";
1303         goto error;
1304     }
1305     if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) {
1306         err_msg = "Hash report is not negotiated";
1307         goto error;
1308     }
1309     size_get = offsetof(struct virtio_net_rss_config, indirection_table);
1310     s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get);
1311     if (s != size_get) {
1312         err_msg = "Short command buffer";
1313         err_value = (uint32_t)s;
1314         goto error;
1315     }
1316     n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types);
1317     n->rss_data.indirections_len =
1318         virtio_lduw_p(vdev, &cfg.indirection_table_mask);
1319     n->rss_data.indirections_len++;
1320     if (!do_rss) {
1321         n->rss_data.indirections_len = 1;
1322     }
1323     if (!is_power_of_2(n->rss_data.indirections_len)) {
1324         err_msg = "Invalid size of indirection table";
1325         err_value = n->rss_data.indirections_len;
1326         goto error;
1327     }
1328     if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) {
1329         err_msg = "Too large indirection table";
1330         err_value = n->rss_data.indirections_len;
1331         goto error;
1332     }
1333     n->rss_data.default_queue = do_rss ?
1334         virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0;
1335     if (n->rss_data.default_queue >= n->max_queue_pairs) {
1336         err_msg = "Invalid default queue";
1337         err_value = n->rss_data.default_queue;
1338         goto error;
1339     }
1340     offset += size_get;
1341     size_get = sizeof(uint16_t) * n->rss_data.indirections_len;
1342     g_free(n->rss_data.indirections_table);
1343     n->rss_data.indirections_table = g_malloc(size_get);
1344     if (!n->rss_data.indirections_table) {
1345         err_msg = "Can't allocate indirections table";
1346         err_value = n->rss_data.indirections_len;
1347         goto error;
1348     }
1349     s = iov_to_buf(iov, iov_cnt, offset,
1350                    n->rss_data.indirections_table, size_get);
1351     if (s != size_get) {
1352         err_msg = "Short indirection table buffer";
1353         err_value = (uint32_t)s;
1354         goto error;
1355     }
1356     for (i = 0; i < n->rss_data.indirections_len; ++i) {
1357         uint16_t val = n->rss_data.indirections_table[i];
1358         n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val);
1359     }
1360     offset += size_get;
1361     size_get = sizeof(temp);
1362     s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get);
1363     if (s != size_get) {
1364         err_msg = "Can't get queue_pairs";
1365         err_value = (uint32_t)s;
1366         goto error;
1367     }
1368     queue_pairs = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queue_pairs;
1369     if (queue_pairs == 0 || queue_pairs > n->max_queue_pairs) {
1370         err_msg = "Invalid number of queue_pairs";
1371         err_value = queue_pairs;
1372         goto error;
1373     }
1374     if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) {
1375         err_msg = "Invalid key size";
1376         err_value = temp.b;
1377         goto error;
1378     }
1379     if (!temp.b && n->rss_data.hash_types) {
1380         err_msg = "No key provided";
1381         err_value = 0;
1382         goto error;
1383     }
1384     if (!temp.b && !n->rss_data.hash_types) {
1385         virtio_net_disable_rss(n);
1386         return queue_pairs;
1387     }
1388     offset += size_get;
1389     size_get = temp.b;
1390     s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get);
1391     if (s != size_get) {
1392         err_msg = "Can get key buffer";
1393         err_value = (uint32_t)s;
1394         goto error;
1395     }
1396     n->rss_data.enabled = true;
1397
1398     if (!n->rss_data.populate_hash) {
1399         if (!virtio_net_attach_epbf_rss(n)) {
1400             /* EBPF must be loaded for vhost */
1401             if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
1402                 warn_report("Can't load eBPF RSS for vhost");
1403                 goto error;
1404             }
1405             /* fallback to software RSS */
1406             warn_report("Can't load eBPF RSS - fallback to software RSS");
1407             n->rss_data.enabled_software_rss = true;
1408         }
1409     } else {
1410         /* use software RSS for hash populating */
1411         /* and detach eBPF if was loaded before */
1412         virtio_net_detach_epbf_rss(n);
1413         n->rss_data.enabled_software_rss = true;
1414     }
1415
1416     trace_virtio_net_rss_enable(n->rss_data.hash_types,
1417                                 n->rss_data.indirections_len,
1418                                 temp.b);
1419     return queue_pairs;
1420 error:
1421     trace_virtio_net_rss_error(err_msg, err_value);
1422     virtio_net_disable_rss(n);
1423     return 0;
1424 }
1425
1426 static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
1427                                 struct iovec *iov, unsigned int iov_cnt)
1428 {
1429     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1430     uint16_t queue_pairs;
1431     NetClientState *nc = qemu_get_queue(n->nic);
1432
1433     virtio_net_disable_rss(n);
1434     if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) {
1435         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, false);
1436         return queue_pairs ? VIRTIO_NET_OK : VIRTIO_NET_ERR;
1437     }
1438     if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) {
1439         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, true);
1440     } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) {
1441         struct virtio_net_ctrl_mq mq;
1442         size_t s;
1443         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
1444             return VIRTIO_NET_ERR;
1445         }
1446         s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq));
1447         if (s != sizeof(mq)) {
1448             return VIRTIO_NET_ERR;
1449         }
1450         queue_pairs = virtio_lduw_p(vdev, &mq.virtqueue_pairs);
1451
1452     } else {
1453         return VIRTIO_NET_ERR;
1454     }
1455
1456     if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1457         queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
1458         queue_pairs > n->max_queue_pairs ||
1459         !n->multiqueue) {
1460         return VIRTIO_NET_ERR;
1461     }
1462
1463     n->curr_queue_pairs = queue_pairs;
1464     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
1465         /*
1466          * Avoid updating the backend for a vdpa device: We're only interested
1467          * in updating the device model queues.
1468          */
1469         return VIRTIO_NET_OK;
1470     }
1471     /* stop the backend before changing the number of queue_pairs to avoid handling a
1472      * disabled queue */
1473     virtio_net_set_status(vdev, vdev->status);
1474     virtio_net_set_queue_pairs(n);
1475
1476     return VIRTIO_NET_OK;
1477 }
1478
1479 size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
1480                                   const struct iovec *in_sg, unsigned in_num,
1481                                   const struct iovec *out_sg,
1482                                   unsigned out_num)
1483 {
1484     VirtIONet *n = VIRTIO_NET(vdev);
1485     struct virtio_net_ctrl_hdr ctrl;
1486     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1487     size_t s;
1488     struct iovec *iov, *iov2;
1489
1490     if (iov_size(in_sg, in_num) < sizeof(status) ||
1491         iov_size(out_sg, out_num) < sizeof(ctrl)) {
1492         virtio_error(vdev, "virtio-net ctrl missing headers");
1493         return 0;
1494     }
1495
1496     iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
1497     s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
1498     iov_discard_front(&iov, &out_num, sizeof(ctrl));
1499     if (s != sizeof(ctrl)) {
1500         status = VIRTIO_NET_ERR;
1501     } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
1502         status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
1503     } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
1504         status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
1505     } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
1506         status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
1507     } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
1508         status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
1509     } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
1510         status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
1511     } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
1512         status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
1513     }
1514
1515     s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
1516     assert(s == sizeof(status));
1517
1518     g_free(iov2);
1519     return sizeof(status);
1520 }
1521
1522 static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
1523 {
1524     VirtQueueElement *elem;
1525
1526     for (;;) {
1527         size_t written;
1528         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1529         if (!elem) {
1530             break;
1531         }
1532
1533         written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
1534                                              elem->out_sg, elem->out_num);
1535         if (written > 0) {
1536             virtqueue_push(vq, elem, written);
1537             virtio_notify(vdev, vq);
1538             g_free(elem);
1539         } else {
1540             virtqueue_detach_element(vq, elem, 0);
1541             g_free(elem);
1542             break;
1543         }
1544     }
1545 }
1546
1547 /* RX */
1548
1549 static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
1550 {
1551     VirtIONet *n = VIRTIO_NET(vdev);
1552     int queue_index = vq2q(virtio_get_queue_index(vq));
1553
1554     qemu_flush_queued_packets(qemu_get_subqueue(n->nic, queue_index));
1555 }
1556
1557 static bool virtio_net_can_receive(NetClientState *nc)
1558 {
1559     VirtIONet *n = qemu_get_nic_opaque(nc);
1560     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1561     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1562
1563     if (!vdev->vm_running) {
1564         return false;
1565     }
1566
1567     if (nc->queue_index >= n->curr_queue_pairs) {
1568         return false;
1569     }
1570
1571     if (!virtio_queue_ready(q->rx_vq) ||
1572         !(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
1573         return false;
1574     }
1575
1576     return true;
1577 }
1578
1579 static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize)
1580 {
1581     VirtIONet *n = q->n;
1582     if (virtio_queue_empty(q->rx_vq) ||
1583         (n->mergeable_rx_bufs &&
1584          !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
1585         virtio_queue_set_notification(q->rx_vq, 1);
1586
1587         /* To avoid a race condition where the guest has made some buffers
1588          * available after the above check but before notification was
1589          * enabled, check for available buffers again.
1590          */
1591         if (virtio_queue_empty(q->rx_vq) ||
1592             (n->mergeable_rx_bufs &&
1593              !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
1594             return 0;
1595         }
1596     }
1597
1598     virtio_queue_set_notification(q->rx_vq, 0);
1599     return 1;
1600 }
1601
1602 static void virtio_net_hdr_swap(VirtIODevice *vdev, struct virtio_net_hdr *hdr)
1603 {
1604     virtio_tswap16s(vdev, &hdr->hdr_len);
1605     virtio_tswap16s(vdev, &hdr->gso_size);
1606     virtio_tswap16s(vdev, &hdr->csum_start);
1607     virtio_tswap16s(vdev, &hdr->csum_offset);
1608 }
1609
1610 /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
1611  * it never finds out that the packets don't have valid checksums.  This
1612  * causes dhclient to get upset.  Fedora's carried a patch for ages to
1613  * fix this with Xen but it hasn't appeared in an upstream release of
1614  * dhclient yet.
1615  *
1616  * To avoid breaking existing guests, we catch udp packets and add
1617  * checksums.  This is terrible but it's better than hacking the guest
1618  * kernels.
1619  *
1620  * N.B. if we introduce a zero-copy API, this operation is no longer free so
1621  * we should provide a mechanism to disable it to avoid polluting the host
1622  * cache.
1623  */
1624 static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
1625                                         uint8_t *buf, size_t size)
1626 {
1627     if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
1628         (size > 27 && size < 1500) && /* normal sized MTU */
1629         (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
1630         (buf[23] == 17) && /* ip.protocol == UDP */
1631         (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
1632         net_checksum_calculate(buf, size, CSUM_UDP);
1633         hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
1634     }
1635 }
1636
1637 static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
1638                            const void *buf, size_t size)
1639 {
1640     if (n->has_vnet_hdr) {
1641         /* FIXME this cast is evil */
1642         void *wbuf = (void *)buf;
1643         work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
1644                                     size - n->host_hdr_len);
1645
1646         if (n->needs_vnet_hdr_swap) {
1647             virtio_net_hdr_swap(VIRTIO_DEVICE(n), wbuf);
1648         }
1649         iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
1650     } else {
1651         struct virtio_net_hdr hdr = {
1652             .flags = 0,
1653             .gso_type = VIRTIO_NET_HDR_GSO_NONE
1654         };
1655         iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
1656     }
1657 }
1658
1659 static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
1660 {
1661     static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
1662     static const uint8_t vlan[] = {0x81, 0x00};
1663     uint8_t *ptr = (uint8_t *)buf;
1664     int i;
1665
1666     if (n->promisc)
1667         return 1;
1668
1669     ptr += n->host_hdr_len;
1670
1671     if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
1672         int vid = lduw_be_p(ptr + 14) & 0xfff;
1673         if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f))))
1674             return 0;
1675     }
1676
1677     if (ptr[0] & 1) { // multicast
1678         if (!memcmp(ptr, bcast, sizeof(bcast))) {
1679             return !n->nobcast;
1680         } else if (n->nomulti) {
1681             return 0;
1682         } else if (n->allmulti || n->mac_table.multi_overflow) {
1683             return 1;
1684         }
1685
1686         for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
1687             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1688                 return 1;
1689             }
1690         }
1691     } else { // unicast
1692         if (n->nouni) {
1693             return 0;
1694         } else if (n->alluni || n->mac_table.uni_overflow) {
1695             return 1;
1696         } else if (!memcmp(ptr, n->mac, ETH_ALEN)) {
1697             return 1;
1698         }
1699
1700         for (i = 0; i < n->mac_table.first_multi; i++) {
1701             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1702                 return 1;
1703             }
1704         }
1705     }
1706
1707     return 0;
1708 }
1709
1710 static uint8_t virtio_net_get_hash_type(bool isip4,
1711                                         bool isip6,
1712                                         bool isudp,
1713                                         bool istcp,
1714                                         uint32_t types)
1715 {
1716     if (isip4) {
1717         if (istcp && (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) {
1718             return NetPktRssIpV4Tcp;
1719         }
1720         if (isudp && (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4)) {
1721             return NetPktRssIpV4Udp;
1722         }
1723         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
1724             return NetPktRssIpV4;
1725         }
1726     } else if (isip6) {
1727         uint32_t mask = VIRTIO_NET_RSS_HASH_TYPE_TCP_EX |
1728                         VIRTIO_NET_RSS_HASH_TYPE_TCPv6;
1729
1730         if (istcp && (types & mask)) {
1731             return (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) ?
1732                 NetPktRssIpV6TcpEx : NetPktRssIpV6Tcp;
1733         }
1734         mask = VIRTIO_NET_RSS_HASH_TYPE_UDP_EX | VIRTIO_NET_RSS_HASH_TYPE_UDPv6;
1735         if (isudp && (types & mask)) {
1736             return (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) ?
1737                 NetPktRssIpV6UdpEx : NetPktRssIpV6Udp;
1738         }
1739         mask = VIRTIO_NET_RSS_HASH_TYPE_IP_EX | VIRTIO_NET_RSS_HASH_TYPE_IPv6;
1740         if (types & mask) {
1741             return (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) ?
1742                 NetPktRssIpV6Ex : NetPktRssIpV6;
1743         }
1744     }
1745     return 0xff;
1746 }
1747
1748 static void virtio_set_packet_hash(const uint8_t *buf, uint8_t report,
1749                                    uint32_t hash)
1750 {
1751     struct virtio_net_hdr_v1_hash *hdr = (void *)buf;
1752     hdr->hash_value = hash;
1753     hdr->hash_report = report;
1754 }
1755
1756 static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
1757                                   size_t size)
1758 {
1759     VirtIONet *n = qemu_get_nic_opaque(nc);
1760     unsigned int index = nc->queue_index, new_index = index;
1761     struct NetRxPkt *pkt = n->rx_pkt;
1762     uint8_t net_hash_type;
1763     uint32_t hash;
1764     bool isip4, isip6, isudp, istcp;
1765     static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
1766         VIRTIO_NET_HASH_REPORT_IPv4,
1767         VIRTIO_NET_HASH_REPORT_TCPv4,
1768         VIRTIO_NET_HASH_REPORT_TCPv6,
1769         VIRTIO_NET_HASH_REPORT_IPv6,
1770         VIRTIO_NET_HASH_REPORT_IPv6_EX,
1771         VIRTIO_NET_HASH_REPORT_TCPv6_EX,
1772         VIRTIO_NET_HASH_REPORT_UDPv4,
1773         VIRTIO_NET_HASH_REPORT_UDPv6,
1774         VIRTIO_NET_HASH_REPORT_UDPv6_EX
1775     };
1776
1777     net_rx_pkt_set_protocols(pkt, buf + n->host_hdr_len,
1778                              size - n->host_hdr_len);
1779     net_rx_pkt_get_protocols(pkt, &isip4, &isip6, &isudp, &istcp);
1780     if (isip4 && (net_rx_pkt_get_ip4_info(pkt)->fragment)) {
1781         istcp = isudp = false;
1782     }
1783     if (isip6 && (net_rx_pkt_get_ip6_info(pkt)->fragment)) {
1784         istcp = isudp = false;
1785     }
1786     net_hash_type = virtio_net_get_hash_type(isip4, isip6, isudp, istcp,
1787                                              n->rss_data.hash_types);
1788     if (net_hash_type > NetPktRssIpV6UdpEx) {
1789         if (n->rss_data.populate_hash) {
1790             virtio_set_packet_hash(buf, VIRTIO_NET_HASH_REPORT_NONE, 0);
1791         }
1792         return n->rss_data.redirect ? n->rss_data.default_queue : -1;
1793     }
1794
1795     hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);
1796
1797     if (n->rss_data.populate_hash) {
1798         virtio_set_packet_hash(buf, reports[net_hash_type], hash);
1799     }
1800
1801     if (n->rss_data.redirect) {
1802         new_index = hash & (n->rss_data.indirections_len - 1);
1803         new_index = n->rss_data.indirections_table[new_index];
1804     }
1805
1806     return (index == new_index) ? -1 : new_index;
1807 }
1808
/*
 * Copy one incoming packet into guest RX buffers of the subqueue behind @nc.
 *
 * @nc:     net client (subqueue) the packet arrived on
 * @buf:    packet bytes; payload is copied starting at n->host_hdr_len
 * @size:   number of bytes in @buf
 * @no_rss: when true, skip the software RSS step; set on the recursive call
 *          made after a packet has been redirected to another subqueue
 *
 * virtio_net_do_receive() calls this under RCU_READ_LOCK_GUARD().
 *
 * Returns @size when the packet was delivered or dropped on purpose
 * (filtered, buffer chain too long, or too large for a single non-mergeable
 * buffer), 0 when virtio_net_has_buffers() reports too few buffers, and -1
 * when the device cannot receive or the RX queue yields no usable element.
 */
static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
                                      size_t size, bool no_rss)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIONetQueue *q = virtio_net_get_subqueue(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
    size_t lens[VIRTQUEUE_MAX_SIZE];
    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
    struct virtio_net_hdr_mrg_rxbuf mhdr;
    unsigned mhdr_cnt = 0;
    size_t offset, i, guest_offset, j;
    ssize_t err;

    if (!virtio_net_can_receive(nc)) {
        return -1;
    }

    /*
     * Software RSS chose another queue: hand the packet to that subqueue
     * with no_rss set so it is not classified a second time.
     */
    if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
        int index = virtio_net_process_rss(nc, buf, size);
        if (index >= 0) {
            NetClientState *nc2 = qemu_get_subqueue(n->nic, index);
            return virtio_net_receive_rcu(nc2, buf, size, true);
        }
    }

    /* hdr_len refers to the header we supply to the guest */
    if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
        return 0;
    }

    /* A packet rejected by the receive filter is reported as consumed */
    if (!receive_filter(n, buf, size))
        return size;

    /* offset: bytes of buf consumed so far; i: elements popped so far */
    offset = i = 0;

    while (offset < size) {
        VirtQueueElement *elem;
        int len, total;
        const struct iovec *sg;

        total = 0;

        /* elems[] and lens[] hold at most VIRTQUEUE_MAX_SIZE entries */
        if (i == VIRTQUEUE_MAX_SIZE) {
            virtio_error(vdev, "virtio-net unexpected long buffer chain");
            err = size;
            goto err;
        }

        elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
        if (!elem) {
            /* Only flagged as a device error once part of the packet is placed */
            if (i) {
                virtio_error(vdev, "virtio-net unexpected empty queue: "
                             "i %zd mergeable %d offset %zd, size %zd, "
                             "guest hdr len %zd, host hdr len %zd "
                             "guest features 0x%" PRIx64,
                             i, n->mergeable_rx_bufs, offset, size,
                             n->guest_hdr_len, n->host_hdr_len,
                             vdev->guest_features);
            }
            err = -1;
            goto err;
        }

        if (elem->in_num < 1) {
            virtio_error(vdev,
                         "virtio-net receive queue contains no in buffers");
            virtqueue_detach_element(q->rx_vq, elem, 0);
            g_free(elem);
            err = -1;
            goto err;
        }

        sg = elem->in_sg;
        if (i == 0) {
            /* First buffer of the packet: it carries the guest header */
            assert(offset == 0);
            if (n->mergeable_rx_bufs) {
                /*
                 * Remember where num_buffers lives in guest memory; the
                 * final count is only known after the loop.
                 */
                mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
                                    sg, elem->in_num,
                                    offsetof(typeof(mhdr), num_buffers),
                                    sizeof(mhdr.num_buffers));
            }

            receive_header(n, sg, elem->in_num, buf, size);
            if (n->rss_data.populate_hash) {
                /*
                 * Copy the part of the host header that follows the
                 * mergeable header from buf to the same offset in the
                 * guest buffer.
                 */
                offset = sizeof(mhdr);
                iov_from_buf(sg, elem->in_num, offset,
                             buf + offset, n->host_hdr_len - sizeof(mhdr));
            }
            offset = n->host_hdr_len;
            total += n->guest_hdr_len;
            guest_offset = n->guest_hdr_len;
        } else {
            guest_offset = 0;
        }

        /* copy in packet.  ugh */
        len = iov_from_buf(sg, elem->in_num, guest_offset,
                           buf + offset, size - offset);
        total += len;
        offset += len;
        /* If buffers can't be merged, at this point we
         * must have consumed the complete packet.
         * Otherwise, drop it. */
        if (!n->mergeable_rx_bufs && offset < size) {
            virtqueue_unpop(q->rx_vq, elem, total);
            g_free(elem);
            err = size;
            goto err;
        }

        /* Keep the element until the whole packet has been placed */
        elems[i] = elem;
        lens[i] = total;
        i++;
    }

    if (mhdr_cnt) {
        /* Store the number of buffers used at the location saved above */
        virtio_stw_p(vdev, &mhdr.num_buffers, i);
        iov_from_buf(mhdr_sg, mhdr_cnt,
                     0,
                     &mhdr.num_buffers, sizeof mhdr.num_buffers);
    }

    for (j = 0; j < i; j++) {
        /* signal other side */
        virtqueue_fill(q->rx_vq, elems[j], lens[j], j);
        g_free(elems[j]);
    }

    virtqueue_flush(q->rx_vq, i);
    virtio_notify(vdev, q->rx_vq);

    return size;

err:
    /* Detach and free every element collected before the failure */
    for (j = 0; j < i; j++) {
        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
        g_free(elems[j]);
    }

    return err;
}
1951
1952 static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf,
1953                                   size_t size)
1954 {
1955     RCU_READ_LOCK_GUARD();
1956
1957     return virtio_net_receive_rcu(nc, buf, size, false);
1958 }
1959
1960 static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain,
1961                                          const uint8_t *buf,
1962                                          VirtioNetRscUnit *unit)
1963 {
1964     uint16_t ip_hdrlen;
1965     struct ip_header *ip;
1966
1967     ip = (struct ip_header *)(buf + chain->n->guest_hdr_len
1968                               + sizeof(struct eth_header));
1969     unit->ip = (void *)ip;
1970     ip_hdrlen = (ip->ip_ver_len & 0xF) << 2;
1971     unit->ip_plen = &ip->ip_len;
1972     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
1973     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
1974     unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
1975 }
1976
1977 static void virtio_net_rsc_extract_unit6(VirtioNetRscChain *chain,
1978                                          const uint8_t *buf,
1979                                          VirtioNetRscUnit *unit)
1980 {
1981     struct ip6_header *ip6;
1982
1983     ip6 = (struct ip6_header *)(buf + chain->n->guest_hdr_len
1984                                  + sizeof(struct eth_header));
1985     unit->ip = ip6;
1986     unit->ip_plen = &(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
1987     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip)
1988                                         + sizeof(struct ip6_header));
1989     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
1990
1991     /* There is a difference between payload lenght in ipv4 and v6,
1992        ip header is excluded in ipv6 */
1993     unit->payload = htons(*unit->ip_plen) - unit->tcp_hdrlen;
1994 }
1995
1996 static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain,
1997                                        VirtioNetRscSeg *seg)
1998 {
1999     int ret;
2000     struct virtio_net_hdr_v1 *h;
2001
2002     h = (struct virtio_net_hdr_v1 *)seg->buf;
2003     h->flags = 0;
2004     h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
2005
2006     if (seg->is_coalesced) {
2007         h->rsc.segments = seg->packets;
2008         h->rsc.dup_acks = seg->dup_ack;
2009         h->flags = VIRTIO_NET_HDR_F_RSC_INFO;
2010         if (chain->proto == ETH_P_IP) {
2011             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2012         } else {
2013             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2014         }
2015     }
2016
2017     ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
2018     QTAILQ_REMOVE(&chain->buffers, seg, next);
2019     g_free(seg->buf);
2020     g_free(seg);
2021
2022     return ret;
2023 }
2024
2025 static void virtio_net_rsc_purge(void *opq)
2026 {
2027     VirtioNetRscSeg *seg, *rn;
2028     VirtioNetRscChain *chain = (VirtioNetRscChain *)opq;
2029
2030     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
2031         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2032             chain->stat.purge_failed++;
2033             continue;
2034         }
2035     }
2036
2037     chain->stat.timer++;
2038     if (!QTAILQ_EMPTY(&chain->buffers)) {
2039         timer_mod(chain->drain_timer,
2040               qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2041     }
2042 }
2043
2044 static void virtio_net_rsc_cleanup(VirtIONet *n)
2045 {
2046     VirtioNetRscChain *chain, *rn_chain;
2047     VirtioNetRscSeg *seg, *rn_seg;
2048
2049     QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
2050         QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
2051             QTAILQ_REMOVE(&chain->buffers, seg, next);
2052             g_free(seg->buf);
2053             g_free(seg);
2054         }
2055
2056         timer_free(chain->drain_timer);
2057         QTAILQ_REMOVE(&n->rsc_chains, chain, next);
2058         g_free(chain);
2059     }
2060 }
2061
2062 static void virtio_net_rsc_cache_buf(VirtioNetRscChain *chain,
2063                                      NetClientState *nc,
2064                                      const uint8_t *buf, size_t size)
2065 {
2066     uint16_t hdr_len;
2067     VirtioNetRscSeg *seg;
2068
2069     hdr_len = chain->n->guest_hdr_len;
2070     seg = g_new(VirtioNetRscSeg, 1);
2071     seg->buf = g_malloc(hdr_len + sizeof(struct eth_header)
2072         + sizeof(struct ip6_header) + VIRTIO_NET_MAX_TCP_PAYLOAD);
2073     memcpy(seg->buf, buf, size);
2074     seg->size = size;
2075     seg->packets = 1;
2076     seg->dup_ack = 0;
2077     seg->is_coalesced = 0;
2078     seg->nc = nc;
2079
2080     QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
2081     chain->stat.cache++;
2082
2083     switch (chain->proto) {
2084     case ETH_P_IP:
2085         virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
2086         break;
2087     case ETH_P_IPV6:
2088         virtio_net_rsc_extract_unit6(chain, seg->buf, &seg->unit);
2089         break;
2090     default:
2091         g_assert_not_reached();
2092     }
2093 }
2094
2095 static int32_t virtio_net_rsc_handle_ack(VirtioNetRscChain *chain,
2096                                          VirtioNetRscSeg *seg,
2097                                          const uint8_t *buf,
2098                                          struct tcp_header *n_tcp,
2099                                          struct tcp_header *o_tcp)
2100 {
2101     uint32_t nack, oack;
2102     uint16_t nwin, owin;
2103
2104     nack = htonl(n_tcp->th_ack);
2105     nwin = htons(n_tcp->th_win);
2106     oack = htonl(o_tcp->th_ack);
2107     owin = htons(o_tcp->th_win);
2108
2109     if ((nack - oack) >= VIRTIO_NET_MAX_TCP_PAYLOAD) {
2110         chain->stat.ack_out_of_win++;
2111         return RSC_FINAL;
2112     } else if (nack == oack) {
2113         /* duplicated ack or window probe */
2114         if (nwin == owin) {
2115             /* duplicated ack, add dup ack count due to whql test up to 1 */
2116             chain->stat.dup_ack++;
2117             return RSC_FINAL;
2118         } else {
2119             /* Coalesce window update */
2120             o_tcp->th_win = n_tcp->th_win;
2121             chain->stat.win_update++;
2122             return RSC_COALESCE;
2123         }
2124     } else {
2125         /* pure ack, go to 'C', finalize*/
2126         chain->stat.pure_ack++;
2127         return RSC_FINAL;
2128     }
2129 }
2130
2131 static int32_t virtio_net_rsc_coalesce_data(VirtioNetRscChain *chain,
2132                                             VirtioNetRscSeg *seg,
2133                                             const uint8_t *buf,
2134                                             VirtioNetRscUnit *n_unit)
2135 {
2136     void *data;
2137     uint16_t o_ip_len;
2138     uint32_t nseq, oseq;
2139     VirtioNetRscUnit *o_unit;
2140
2141     o_unit = &seg->unit;
2142     o_ip_len = htons(*o_unit->ip_plen);
2143     nseq = htonl(n_unit->tcp->th_seq);
2144     oseq = htonl(o_unit->tcp->th_seq);
2145
2146     /* out of order or retransmitted. */
2147     if ((nseq - oseq) > VIRTIO_NET_MAX_TCP_PAYLOAD) {
2148         chain->stat.data_out_of_win++;
2149         return RSC_FINAL;
2150     }
2151
2152     data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
2153     if (nseq == oseq) {
2154         if ((o_unit->payload == 0) && n_unit->payload) {
2155             /* From no payload to payload, normal case, not a dup ack or etc */
2156             chain->stat.data_after_pure_ack++;
2157             goto coalesce;
2158         } else {
2159             return virtio_net_rsc_handle_ack(chain, seg, buf,
2160                                              n_unit->tcp, o_unit->tcp);
2161         }
2162     } else if ((nseq - oseq) != o_unit->payload) {
2163         /* Not a consistent packet, out of order */
2164         chain->stat.data_out_of_order++;
2165         return RSC_FINAL;
2166     } else {
2167 coalesce:
2168         if ((o_ip_len + n_unit->payload) > chain->max_payload) {
2169             chain->stat.over_size++;
2170             return RSC_FINAL;
2171         }
2172
2173         /* Here comes the right data, the payload length in v4/v6 is different,
2174            so use the field value to update and record the new data len */
2175         o_unit->payload += n_unit->payload; /* update new data len */
2176
2177         /* update field in ip header */
2178         *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
2179
2180         /* Bring 'PUSH' big, the whql test guide says 'PUSH' can be coalesced
2181            for windows guest, while this may change the behavior for linux
2182            guest (only if it uses RSC feature). */
2183         o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
2184
2185         o_unit->tcp->th_ack = n_unit->tcp->th_ack;
2186         o_unit->tcp->th_win = n_unit->tcp->th_win;
2187
2188         memmove(seg->buf + seg->size, data, n_unit->payload);
2189         seg->size += n_unit->payload;
2190         seg->packets++;
2191         chain->stat.coalesced++;
2192         return RSC_COALESCE;
2193     }
2194 }
2195
2196 static int32_t virtio_net_rsc_coalesce4(VirtioNetRscChain *chain,
2197                                         VirtioNetRscSeg *seg,
2198                                         const uint8_t *buf, size_t size,
2199                                         VirtioNetRscUnit *unit)
2200 {
2201     struct ip_header *ip1, *ip2;
2202
2203     ip1 = (struct ip_header *)(unit->ip);
2204     ip2 = (struct ip_header *)(seg->unit.ip);
2205     if ((ip1->ip_src ^ ip2->ip_src) || (ip1->ip_dst ^ ip2->ip_dst)
2206         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2207         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2208         chain->stat.no_match++;
2209         return RSC_NO_MATCH;
2210     }
2211
2212     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2213 }
2214
2215 static int32_t virtio_net_rsc_coalesce6(VirtioNetRscChain *chain,
2216                                         VirtioNetRscSeg *seg,
2217                                         const uint8_t *buf, size_t size,
2218                                         VirtioNetRscUnit *unit)
2219 {
2220     struct ip6_header *ip1, *ip2;
2221
2222     ip1 = (struct ip6_header *)(unit->ip);
2223     ip2 = (struct ip6_header *)(seg->unit.ip);
2224     if (memcmp(&ip1->ip6_src, &ip2->ip6_src, sizeof(struct in6_address))
2225         || memcmp(&ip1->ip6_dst, &ip2->ip6_dst, sizeof(struct in6_address))
2226         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2227         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2228             chain->stat.no_match++;
2229             return RSC_NO_MATCH;
2230     }
2231
2232     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2233 }
2234
2235 /* Packets with 'SYN' should bypass, other flag should be sent after drain
2236  * to prevent out of order */
2237 static int virtio_net_rsc_tcp_ctrl_check(VirtioNetRscChain *chain,
2238                                          struct tcp_header *tcp)
2239 {
2240     uint16_t tcp_hdr;
2241     uint16_t tcp_flag;
2242
2243     tcp_flag = htons(tcp->th_offset_flags);
2244     tcp_hdr = (tcp_flag & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2245     tcp_flag &= VIRTIO_NET_TCP_FLAG;
2246     if (tcp_flag & TH_SYN) {
2247         chain->stat.tcp_syn++;
2248         return RSC_BYPASS;
2249     }
2250
2251     if (tcp_flag & (TH_FIN | TH_URG | TH_RST | TH_ECE | TH_CWR)) {
2252         chain->stat.tcp_ctrl_drain++;
2253         return RSC_FINAL;
2254     }
2255
2256     if (tcp_hdr > sizeof(struct tcp_header)) {
2257         chain->stat.tcp_all_opt++;
2258         return RSC_FINAL;
2259     }
2260
2261     return RSC_CANDIDATE;
2262 }
2263
2264 static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain,
2265                                          NetClientState *nc,
2266                                          const uint8_t *buf, size_t size,
2267                                          VirtioNetRscUnit *unit)
2268 {
2269     int ret;
2270     VirtioNetRscSeg *seg, *nseg;
2271
2272     if (QTAILQ_EMPTY(&chain->buffers)) {
2273         chain->stat.empty_cache++;
2274         virtio_net_rsc_cache_buf(chain, nc, buf, size);
2275         timer_mod(chain->drain_timer,
2276               qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2277         return size;
2278     }
2279
2280     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2281         if (chain->proto == ETH_P_IP) {
2282             ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
2283         } else {
2284             ret = virtio_net_rsc_coalesce6(chain, seg, buf, size, unit);
2285         }
2286
2287         if (ret == RSC_FINAL) {
2288             if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2289                 /* Send failed */
2290                 chain->stat.final_failed++;
2291                 return 0;
2292             }
2293
2294             /* Send current packet */
2295             return virtio_net_do_receive(nc, buf, size);
2296         } else if (ret == RSC_NO_MATCH) {
2297             continue;
2298         } else {
2299             /* Coalesced, mark coalesced flag to tell calc cksum for ipv4 */
2300             seg->is_coalesced = 1;
2301             return size;
2302         }
2303     }
2304
2305     chain->stat.no_match_cache++;
2306     virtio_net_rsc_cache_buf(chain, nc, buf, size);
2307     return size;
2308 }
2309
2310 /* Drain a connection data, this is to avoid out of order segments */
2311 static size_t virtio_net_rsc_drain_flow(VirtioNetRscChain *chain,
2312                                         NetClientState *nc,
2313                                         const uint8_t *buf, size_t size,
2314                                         uint16_t ip_start, uint16_t ip_size,
2315                                         uint16_t tcp_port)
2316 {
2317     VirtioNetRscSeg *seg, *nseg;
2318     uint32_t ppair1, ppair2;
2319
2320     ppair1 = *(uint32_t *)(buf + tcp_port);
2321     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2322         ppair2 = *(uint32_t *)(seg->buf + tcp_port);
2323         if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
2324             || (ppair1 != ppair2)) {
2325             continue;
2326         }
2327         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2328             chain->stat.drain_failed++;
2329         }
2330
2331         break;
2332     }
2333
2334     return virtio_net_do_receive(nc, buf, size);
2335 }
2336
2337 static int32_t virtio_net_rsc_sanity_check4(VirtioNetRscChain *chain,
2338                                             struct ip_header *ip,
2339                                             const uint8_t *buf, size_t size)
2340 {
2341     uint16_t ip_len;
2342
2343     /* Not an ipv4 packet */
2344     if (((ip->ip_ver_len & 0xF0) >> 4) != IP_HEADER_VERSION_4) {
2345         chain->stat.ip_option++;
2346         return RSC_BYPASS;
2347     }
2348
2349     /* Don't handle packets with ip option */
2350     if ((ip->ip_ver_len & 0xF) != VIRTIO_NET_IP4_HEADER_LENGTH) {
2351         chain->stat.ip_option++;
2352         return RSC_BYPASS;
2353     }
2354
2355     if (ip->ip_p != IPPROTO_TCP) {
2356         chain->stat.bypass_not_tcp++;
2357         return RSC_BYPASS;
2358     }
2359
2360     /* Don't handle packets with ip fragment */
2361     if (!(htons(ip->ip_off) & IP_DF)) {
2362         chain->stat.ip_frag++;
2363         return RSC_BYPASS;
2364     }
2365
2366     /* Don't handle packets with ecn flag */
2367     if (IPTOS_ECN(ip->ip_tos)) {
2368         chain->stat.ip_ecn++;
2369         return RSC_BYPASS;
2370     }
2371
2372     ip_len = htons(ip->ip_len);
2373     if (ip_len < (sizeof(struct ip_header) + sizeof(struct tcp_header))
2374         || ip_len > (size - chain->n->guest_hdr_len -
2375                      sizeof(struct eth_header))) {
2376         chain->stat.ip_hacked++;
2377         return RSC_BYPASS;
2378     }
2379
2380     return RSC_CANDIDATE;
2381 }
2382
2383 static size_t virtio_net_rsc_receive4(VirtioNetRscChain *chain,
2384                                       NetClientState *nc,
2385                                       const uint8_t *buf, size_t size)
2386 {
2387     int32_t ret;
2388     uint16_t hdr_len;
2389     VirtioNetRscUnit unit;
2390
2391     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2392
2393     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header)
2394         + sizeof(struct tcp_header))) {
2395         chain->stat.bypass_not_tcp++;
2396         return virtio_net_do_receive(nc, buf, size);
2397     }
2398
2399     virtio_net_rsc_extract_unit4(chain, buf, &unit);
2400     if (virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)
2401         != RSC_CANDIDATE) {
2402         return virtio_net_do_receive(nc, buf, size);
2403     }
2404
2405     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2406     if (ret == RSC_BYPASS) {
2407         return virtio_net_do_receive(nc, buf, size);
2408     } else if (ret == RSC_FINAL) {
2409         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2410                 ((hdr_len + sizeof(struct eth_header)) + 12),
2411                 VIRTIO_NET_IP4_ADDR_SIZE,
2412                 hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header));
2413     }
2414
2415     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2416 }
2417
2418 static int32_t virtio_net_rsc_sanity_check6(VirtioNetRscChain *chain,
2419                                             struct ip6_header *ip6,
2420                                             const uint8_t *buf, size_t size)
2421 {
2422     uint16_t ip_len;
2423
2424     if (((ip6->ip6_ctlun.ip6_un1.ip6_un1_flow & 0xF0) >> 4)
2425         != IP_HEADER_VERSION_6) {
2426         return RSC_BYPASS;
2427     }
2428
2429     /* Both option and protocol is checked in this */
2430     if (ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt != IPPROTO_TCP) {
2431         chain->stat.bypass_not_tcp++;
2432         return RSC_BYPASS;
2433     }
2434
2435     ip_len = htons(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2436     if (ip_len < sizeof(struct tcp_header) ||
2437         ip_len > (size - chain->n->guest_hdr_len - sizeof(struct eth_header)
2438                   - sizeof(struct ip6_header))) {
2439         chain->stat.ip_hacked++;
2440         return RSC_BYPASS;
2441     }
2442
2443     /* Don't handle packets with ecn flag */
2444     if (IP6_ECN(ip6->ip6_ctlun.ip6_un3.ip6_un3_ecn)) {
2445         chain->stat.ip_ecn++;
2446         return RSC_BYPASS;
2447     }
2448
2449     return RSC_CANDIDATE;
2450 }
2451
2452 static size_t virtio_net_rsc_receive6(void *opq, NetClientState *nc,
2453                                       const uint8_t *buf, size_t size)
2454 {
2455     int32_t ret;
2456     uint16_t hdr_len;
2457     VirtioNetRscChain *chain;
2458     VirtioNetRscUnit unit;
2459
2460     chain = (VirtioNetRscChain *)opq;
2461     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2462
2463     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip6_header)
2464         + sizeof(tcp_header))) {
2465         return virtio_net_do_receive(nc, buf, size);
2466     }
2467
2468     virtio_net_rsc_extract_unit6(chain, buf, &unit);
2469     if (RSC_CANDIDATE != virtio_net_rsc_sanity_check6(chain,
2470                                                  unit.ip, buf, size)) {
2471         return virtio_net_do_receive(nc, buf, size);
2472     }
2473
2474     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2475     if (ret == RSC_BYPASS) {
2476         return virtio_net_do_receive(nc, buf, size);
2477     } else if (ret == RSC_FINAL) {
2478         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2479                 ((hdr_len + sizeof(struct eth_header)) + 8),
2480                 VIRTIO_NET_IP6_ADDR_SIZE,
2481                 hdr_len + sizeof(struct eth_header)
2482                 + sizeof(struct ip6_header));
2483     }
2484
2485     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2486 }
2487
2488 static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n,
2489                                                       NetClientState *nc,
2490                                                       uint16_t proto)
2491 {
2492     VirtioNetRscChain *chain;
2493
2494     if ((proto != (uint16_t)ETH_P_IP) && (proto != (uint16_t)ETH_P_IPV6)) {
2495         return NULL;
2496     }
2497
2498     QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
2499         if (chain->proto == proto) {
2500             return chain;
2501         }
2502     }
2503
2504     chain = g_malloc(sizeof(*chain));
2505     chain->n = n;
2506     chain->proto = proto;
2507     if (proto == (uint16_t)ETH_P_IP) {
2508         chain->max_payload = VIRTIO_NET_MAX_IP4_PAYLOAD;
2509         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2510     } else {
2511         chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD;
2512         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2513     }
2514     chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST,
2515                                       virtio_net_rsc_purge, chain);
2516     memset(&chain->stat, 0, sizeof(chain->stat));
2517
2518     QTAILQ_INIT(&chain->buffers);
2519     QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
2520
2521     return chain;
2522 }
2523
2524 static ssize_t virtio_net_rsc_receive(NetClientState *nc,
2525                                       const uint8_t *buf,
2526                                       size_t size)
2527 {
2528     uint16_t proto;
2529     VirtioNetRscChain *chain;
2530     struct eth_header *eth;
2531     VirtIONet *n;
2532
2533     n = qemu_get_nic_opaque(nc);
2534     if (size < (n->host_hdr_len + sizeof(struct eth_header))) {
2535         return virtio_net_do_receive(nc, buf, size);
2536     }
2537
2538     eth = (struct eth_header *)(buf + n->guest_hdr_len);
2539     proto = htons(eth->h_proto);
2540
2541     chain = virtio_net_rsc_lookup_chain(n, nc, proto);
2542     if (chain) {
2543         chain->stat.received++;
2544         if (proto == (uint16_t)ETH_P_IP && n->rsc4_enabled) {
2545             return virtio_net_rsc_receive4(chain, nc, buf, size);
2546         } else if (proto == (uint16_t)ETH_P_IPV6 && n->rsc6_enabled) {
2547             return virtio_net_rsc_receive6(chain, nc, buf, size);
2548         }
2549     }
2550     return virtio_net_do_receive(nc, buf, size);
2551 }
2552
2553 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
2554                                   size_t size)
2555 {
2556     VirtIONet *n = qemu_get_nic_opaque(nc);
2557     if ((n->rsc4_enabled || n->rsc6_enabled)) {
2558         return virtio_net_rsc_receive(nc, buf, size);
2559     } else {
2560         return virtio_net_do_receive(nc, buf, size);
2561     }
2562 }
2563
/* Forward declaration: virtio_net_tx_complete() calls this before it is defined */
static int32_t virtio_net_flush_tx(VirtIONetQueue *q);
2565
2566 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
2567 {
2568     VirtIONet *n = qemu_get_nic_opaque(nc);
2569     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
2570     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2571     int ret;
2572
2573     virtqueue_push(q->tx_vq, q->async_tx.elem, 0);
2574     virtio_notify(vdev, q->tx_vq);
2575
2576     g_free(q->async_tx.elem);
2577     q->async_tx.elem = NULL;
2578
2579     virtio_queue_set_notification(q->tx_vq, 1);
2580     ret = virtio_net_flush_tx(q);
2581     if (ret >= n->tx_burst) {
2582         /*
2583          * the flush has been stopped by tx_burst
2584          * we will not receive notification for the
2585          * remainining part, so re-schedule
2586          */
2587         virtio_queue_set_notification(q->tx_vq, 0);
2588         if (q->tx_bh) {
2589             qemu_bh_schedule(q->tx_bh);
2590         } else {
2591             timer_mod(q->tx_timer,
2592                       qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2593         }
2594         q->tx_waiting = 1;
2595     }
2596 }
2597
2598 /* TX */
2599 static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
2600 {
2601     VirtIONet *n = q->n;
2602     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2603     VirtQueueElement *elem;
2604     int32_t num_packets = 0;
2605     int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
2606     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2607         return num_packets;
2608     }
2609
2610     if (q->async_tx.elem) {
2611         virtio_queue_set_notification(q->tx_vq, 0);
2612         return num_packets;
2613     }
2614
2615     for (;;) {
2616         ssize_t ret;
2617         unsigned int out_num;
2618         struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
2619         struct virtio_net_hdr_mrg_rxbuf mhdr;
2620
2621         elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
2622         if (!elem) {
2623             break;
2624         }
2625
2626         out_num = elem->out_num;
2627         out_sg = elem->out_sg;
2628         if (out_num < 1) {
2629             virtio_error(vdev, "virtio-net header not in first element");
2630             virtqueue_detach_element(q->tx_vq, elem, 0);
2631             g_free(elem);
2632             return -EINVAL;
2633         }
2634
2635         if (n->has_vnet_hdr) {
2636             if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) <
2637                 n->guest_hdr_len) {
2638                 virtio_error(vdev, "virtio-net header incorrect");
2639                 virtqueue_detach_element(q->tx_vq, elem, 0);
2640                 g_free(elem);
2641                 return -EINVAL;
2642             }
2643             if (n->needs_vnet_hdr_swap) {
2644                 virtio_net_hdr_swap(vdev, (void *) &mhdr);
2645                 sg2[0].iov_base = &mhdr;
2646                 sg2[0].iov_len = n->guest_hdr_len;
2647                 out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
2648                                    out_sg, out_num,
2649                                    n->guest_hdr_len, -1);
2650                 if (out_num == VIRTQUEUE_MAX_SIZE) {
2651                     goto drop;
2652                 }
2653                 out_num += 1;
2654                 out_sg = sg2;
2655             }
2656         }
2657         /*
2658          * If host wants to see the guest header as is, we can
2659          * pass it on unchanged. Otherwise, copy just the parts
2660          * that host is interested in.
2661          */
2662         assert(n->host_hdr_len <= n->guest_hdr_len);
2663         if (n->host_hdr_len != n->guest_hdr_len) {
2664             unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
2665                                        out_sg, out_num,
2666                                        0, n->host_hdr_len);
2667             sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
2668                              out_sg, out_num,
2669                              n->guest_hdr_len, -1);
2670             out_num = sg_num;
2671             out_sg = sg;
2672         }
2673
2674         ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
2675                                       out_sg, out_num, virtio_net_tx_complete);
2676         if (ret == 0) {
2677             virtio_queue_set_notification(q->tx_vq, 0);
2678             q->async_tx.elem = elem;
2679             return -EBUSY;
2680         }
2681
2682 drop:
2683         virtqueue_push(q->tx_vq, elem, 0);
2684         virtio_notify(vdev, q->tx_vq);
2685         g_free(elem);
2686
2687         if (++num_packets >= n->tx_burst) {
2688             break;
2689         }
2690     }
2691     return num_packets;
2692 }
2693
2694 static void virtio_net_tx_timer(void *opaque);
2695
2696 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
2697 {
2698     VirtIONet *n = VIRTIO_NET(vdev);
2699     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2700
2701     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2702         virtio_net_drop_tx_queue_data(vdev, vq);
2703         return;
2704     }
2705
2706     /* This happens when device was stopped but VCPU wasn't. */
2707     if (!vdev->vm_running) {
2708         q->tx_waiting = 1;
2709         return;
2710     }
2711
2712     if (q->tx_waiting) {
2713         /* We already have queued packets, immediately flush */
2714         timer_del(q->tx_timer);
2715         virtio_net_tx_timer(q);
2716     } else {
2717         /* re-arm timer to flush it (and more) on next tick */
2718         timer_mod(q->tx_timer,
2719                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2720         q->tx_waiting = 1;
2721         virtio_queue_set_notification(vq, 0);
2722     }
2723 }
2724
2725 static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
2726 {
2727     VirtIONet *n = VIRTIO_NET(vdev);
2728     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2729
2730     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2731         virtio_net_drop_tx_queue_data(vdev, vq);
2732         return;
2733     }
2734
2735     if (unlikely(q->tx_waiting)) {
2736         return;
2737     }
2738     q->tx_waiting = 1;
2739     /* This happens when device was stopped but VCPU wasn't. */
2740     if (!vdev->vm_running) {
2741         return;
2742     }
2743     virtio_queue_set_notification(vq, 0);
2744     qemu_bh_schedule(q->tx_bh);
2745 }
2746
2747 static void virtio_net_tx_timer(void *opaque)
2748 {
2749     VirtIONetQueue *q = opaque;
2750     VirtIONet *n = q->n;
2751     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2752     int ret;
2753
2754     /* This happens when device was stopped but BH wasn't. */
2755     if (!vdev->vm_running) {
2756         /* Make sure tx waiting is set, so we'll run when restarted. */
2757         assert(q->tx_waiting);
2758         return;
2759     }
2760
2761     q->tx_waiting = 0;
2762
2763     /* Just in case the driver is not ready on more */
2764     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2765         return;
2766     }
2767
2768     ret = virtio_net_flush_tx(q);
2769     if (ret == -EBUSY || ret == -EINVAL) {
2770         return;
2771     }
2772     /*
2773      * If we flush a full burst of packets, assume there are
2774      * more coming and immediately rearm
2775      */
2776     if (ret >= n->tx_burst) {
2777         q->tx_waiting = 1;
2778         timer_mod(q->tx_timer,
2779                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2780         return;
2781     }
2782     /*
2783      * If less than a full burst, re-enable notification and flush
2784      * anything that may have come in while we weren't looking.  If
2785      * we find something, assume the guest is still active and rearm
2786      */
2787     virtio_queue_set_notification(q->tx_vq, 1);
2788     ret = virtio_net_flush_tx(q);
2789     if (ret > 0) {
2790         virtio_queue_set_notification(q->tx_vq, 0);
2791         q->tx_waiting = 1;
2792         timer_mod(q->tx_timer,
2793                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2794     }
2795 }
2796
2797 static void virtio_net_tx_bh(void *opaque)
2798 {
2799     VirtIONetQueue *q = opaque;
2800     VirtIONet *n = q->n;
2801     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2802     int32_t ret;
2803
2804     /* This happens when device was stopped but BH wasn't. */
2805     if (!vdev->vm_running) {
2806         /* Make sure tx waiting is set, so we'll run when restarted. */
2807         assert(q->tx_waiting);
2808         return;
2809     }
2810
2811     q->tx_waiting = 0;
2812
2813     /* Just in case the driver is not ready on more */
2814     if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
2815         return;
2816     }
2817
2818     ret = virtio_net_flush_tx(q);
2819     if (ret == -EBUSY || ret == -EINVAL) {
2820         return; /* Notification re-enable handled by tx_complete or device
2821                  * broken */
2822     }
2823
2824     /* If we flush a full burst of packets, assume there are
2825      * more coming and immediately reschedule */
2826     if (ret >= n->tx_burst) {
2827         qemu_bh_schedule(q->tx_bh);
2828         q->tx_waiting = 1;
2829         return;
2830     }
2831
2832     /* If less than a full burst, re-enable notification and flush
2833      * anything that may have come in while we weren't looking.  If
2834      * we find something, assume the guest is still active and reschedule */
2835     virtio_queue_set_notification(q->tx_vq, 1);
2836     ret = virtio_net_flush_tx(q);
2837     if (ret == -EINVAL) {
2838         return;
2839     } else if (ret > 0) {
2840         virtio_queue_set_notification(q->tx_vq, 0);
2841         qemu_bh_schedule(q->tx_bh);
2842         q->tx_waiting = 1;
2843     }
2844 }
2845
2846 static void virtio_net_add_queue(VirtIONet *n, int index)
2847 {
2848     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2849
2850     n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
2851                                            virtio_net_handle_rx);
2852
2853     if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
2854         n->vqs[index].tx_vq =
2855             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2856                              virtio_net_handle_tx_timer);
2857         n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2858                                               virtio_net_tx_timer,
2859                                               &n->vqs[index]);
2860     } else {
2861         n->vqs[index].tx_vq =
2862             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2863                              virtio_net_handle_tx_bh);
2864         n->vqs[index].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[index]);
2865     }
2866
2867     n->vqs[index].tx_waiting = 0;
2868     n->vqs[index].n = n;
2869 }
2870
2871 static void virtio_net_del_queue(VirtIONet *n, int index)
2872 {
2873     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2874     VirtIONetQueue *q = &n->vqs[index];
2875     NetClientState *nc = qemu_get_subqueue(n->nic, index);
2876
2877     qemu_purge_queued_packets(nc);
2878
2879     virtio_del_queue(vdev, index * 2);
2880     if (q->tx_timer) {
2881         timer_free(q->tx_timer);
2882         q->tx_timer = NULL;
2883     } else {
2884         qemu_bh_delete(q->tx_bh);
2885         q->tx_bh = NULL;
2886     }
2887     q->tx_waiting = 0;
2888     virtio_del_queue(vdev, index * 2 + 1);
2889 }
2890
2891 static void virtio_net_change_num_queue_pairs(VirtIONet *n, int new_max_queue_pairs)
2892 {
2893     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2894     int old_num_queues = virtio_get_num_queues(vdev);
2895     int new_num_queues = new_max_queue_pairs * 2 + 1;
2896     int i;
2897
2898     assert(old_num_queues >= 3);
2899     assert(old_num_queues % 2 == 1);
2900
2901     if (old_num_queues == new_num_queues) {
2902         return;
2903     }
2904
2905     /*
2906      * We always need to remove and add ctrl vq if
2907      * old_num_queues != new_num_queues. Remove ctrl_vq first,
2908      * and then we only enter one of the following two loops.
2909      */
2910     virtio_del_queue(vdev, old_num_queues - 1);
2911
2912     for (i = new_num_queues - 1; i < old_num_queues - 1; i += 2) {
2913         /* new_num_queues < old_num_queues */
2914         virtio_net_del_queue(n, i / 2);
2915     }
2916
2917     for (i = old_num_queues - 1; i < new_num_queues - 1; i += 2) {
2918         /* new_num_queues > old_num_queues */
2919         virtio_net_add_queue(n, i / 2);
2920     }
2921
2922     /* add ctrl_vq last */
2923     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
2924 }
2925
2926 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
2927 {
2928     int max = multiqueue ? n->max_queue_pairs : 1;
2929
2930     n->multiqueue = multiqueue;
2931     virtio_net_change_num_queue_pairs(n, max);
2932
2933     virtio_net_set_queue_pairs(n);
2934 }
2935
2936 static int virtio_net_post_load_device(void *opaque, int version_id)
2937 {
2938     VirtIONet *n = opaque;
2939     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2940     int i, link_down;
2941
2942     trace_virtio_net_post_load_device();
2943     virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
2944                                virtio_vdev_has_feature(vdev,
2945                                                        VIRTIO_F_VERSION_1),
2946                                virtio_vdev_has_feature(vdev,
2947                                                        VIRTIO_NET_F_HASH_REPORT));
2948
2949     /* MAC_TABLE_ENTRIES may be different from the saved image */
2950     if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
2951         n->mac_table.in_use = 0;
2952     }
2953
2954     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
2955         n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
2956     }
2957
2958     /*
2959      * curr_guest_offloads will be later overwritten by the
2960      * virtio_set_features_nocheck call done from the virtio_load.
2961      * Here we make sure it is preserved and restored accordingly
2962      * in the virtio_net_post_load_virtio callback.
2963      */
2964     n->saved_guest_offloads = n->curr_guest_offloads;
2965
2966     virtio_net_set_queue_pairs(n);
2967
2968     /* Find the first multicast entry in the saved MAC filter */
2969     for (i = 0; i < n->mac_table.in_use; i++) {
2970         if (n->mac_table.macs[i * ETH_ALEN] & 1) {
2971             break;
2972         }
2973     }
2974     n->mac_table.first_multi = i;
2975
2976     /* nc.link_down can't be migrated, so infer link_down according
2977      * to link status bit in n->status */
2978     link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0;
2979     for (i = 0; i < n->max_queue_pairs; i++) {
2980         qemu_get_subqueue(n->nic, i)->link_down = link_down;
2981     }
2982
2983     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
2984         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
2985         qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
2986                                   QEMU_CLOCK_VIRTUAL,
2987                                   virtio_net_announce_timer, n);
2988         if (n->announce_timer.round) {
2989             timer_mod(n->announce_timer.tm,
2990                       qemu_clock_get_ms(n->announce_timer.type));
2991         } else {
2992             qemu_announce_timer_del(&n->announce_timer, false);
2993         }
2994     }
2995
2996     if (n->rss_data.enabled) {
2997         n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
2998         if (!n->rss_data.populate_hash) {
2999             if (!virtio_net_attach_epbf_rss(n)) {
3000                 if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
3001                     warn_report("Can't post-load eBPF RSS for vhost");
3002                 } else {
3003                     warn_report("Can't post-load eBPF RSS - "
3004                                 "fallback to software RSS");
3005                     n->rss_data.enabled_software_rss = true;
3006                 }
3007             }
3008         }
3009
3010         trace_virtio_net_rss_enable(n->rss_data.hash_types,
3011                                     n->rss_data.indirections_len,
3012                                     sizeof(n->rss_data.key));
3013     } else {
3014         trace_virtio_net_rss_disable();
3015     }
3016     return 0;
3017 }
3018
3019 static int virtio_net_post_load_virtio(VirtIODevice *vdev)
3020 {
3021     VirtIONet *n = VIRTIO_NET(vdev);
3022     /*
3023      * The actual needed state is now in saved_guest_offloads,
3024      * see virtio_net_post_load_device for detail.
3025      * Restore it back and apply the desired offloads.
3026      */
3027     n->curr_guest_offloads = n->saved_guest_offloads;
3028     if (peer_has_vnet_hdr(n)) {
3029         virtio_net_apply_guest_offloads(n);
3030     }
3031
3032     return 0;
3033 }
3034
3035 /* tx_waiting field of a VirtIONetQueue */
3036 static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
3037     .name = "virtio-net-queue-tx_waiting",
3038     .fields = (VMStateField[]) {
3039         VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
3040         VMSTATE_END_OF_LIST()
3041    },
3042 };
3043
3044 static bool max_queue_pairs_gt_1(void *opaque, int version_id)
3045 {
3046     return VIRTIO_NET(opaque)->max_queue_pairs > 1;
3047 }
3048
3049 static bool has_ctrl_guest_offloads(void *opaque, int version_id)
3050 {
3051     return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
3052                                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3053 }
3054
3055 static bool mac_table_fits(void *opaque, int version_id)
3056 {
3057     return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
3058 }
3059
3060 static bool mac_table_doesnt_fit(void *opaque, int version_id)
3061 {
3062     return !mac_table_fits(opaque, version_id);
3063 }
3064
3065 /* This temporary type is shared by all the WITH_TMP methods
3066  * although only some fields are used by each.
3067  */
3068 struct VirtIONetMigTmp {
3069     VirtIONet      *parent;
3070     VirtIONetQueue *vqs_1;
3071     uint16_t        curr_queue_pairs_1;
3072     uint8_t         has_ufo;
3073     uint32_t        has_vnet_hdr;
3074 };
3075
3076 /* The 2nd and subsequent tx_waiting flags are loaded later than
3077  * the 1st entry in the queue_pairs and only if there's more than one
3078  * entry.  We use the tmp mechanism to calculate a temporary
3079  * pointer and count and also validate the count.
3080  */
3081
3082 static int virtio_net_tx_waiting_pre_save(void *opaque)
3083 {
3084     struct VirtIONetMigTmp *tmp = opaque;
3085
3086     tmp->vqs_1 = tmp->parent->vqs + 1;
3087     tmp->curr_queue_pairs_1 = tmp->parent->curr_queue_pairs - 1;
3088     if (tmp->parent->curr_queue_pairs == 0) {
3089         tmp->curr_queue_pairs_1 = 0;
3090     }
3091
3092     return 0;
3093 }
3094
3095 static int virtio_net_tx_waiting_pre_load(void *opaque)
3096 {
3097     struct VirtIONetMigTmp *tmp = opaque;
3098
3099     /* Reuse the pointer setup from save */
3100     virtio_net_tx_waiting_pre_save(opaque);
3101
3102     if (tmp->parent->curr_queue_pairs > tmp->parent->max_queue_pairs) {
3103         error_report("virtio-net: curr_queue_pairs %x > max_queue_pairs %x",
3104             tmp->parent->curr_queue_pairs, tmp->parent->max_queue_pairs);
3105
3106         return -EINVAL;
3107     }
3108
3109     return 0; /* all good */
3110 }
3111
3112 static const VMStateDescription vmstate_virtio_net_tx_waiting = {
3113     .name      = "virtio-net-tx_waiting",
3114     .pre_load  = virtio_net_tx_waiting_pre_load,
3115     .pre_save  = virtio_net_tx_waiting_pre_save,
3116     .fields    = (VMStateField[]) {
3117         VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
3118                                      curr_queue_pairs_1,
3119                                      vmstate_virtio_net_queue_tx_waiting,
3120                                      struct VirtIONetQueue),
3121         VMSTATE_END_OF_LIST()
3122     },
3123 };
3124
3125 /* the 'has_ufo' flag is just tested; if the incoming stream has the
3126  * flag set we need to check that we have it
3127  */
3128 static int virtio_net_ufo_post_load(void *opaque, int version_id)
3129 {
3130     struct VirtIONetMigTmp *tmp = opaque;
3131
3132     if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
3133         error_report("virtio-net: saved image requires TUN_F_UFO support");
3134         return -EINVAL;
3135     }
3136
3137     return 0;
3138 }
3139
3140 static int virtio_net_ufo_pre_save(void *opaque)
3141 {
3142     struct VirtIONetMigTmp *tmp = opaque;
3143
3144     tmp->has_ufo = tmp->parent->has_ufo;
3145
3146     return 0;
3147 }
3148
3149 static const VMStateDescription vmstate_virtio_net_has_ufo = {
3150     .name      = "virtio-net-ufo",
3151     .post_load = virtio_net_ufo_post_load,
3152     .pre_save  = virtio_net_ufo_pre_save,
3153     .fields    = (VMStateField[]) {
3154         VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
3155         VMSTATE_END_OF_LIST()
3156     },
3157 };
3158
3159 /* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the
3160  * flag set we need to check that we have it
3161  */
3162 static int virtio_net_vnet_post_load(void *opaque, int version_id)
3163 {
3164     struct VirtIONetMigTmp *tmp = opaque;
3165
3166     if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
3167         error_report("virtio-net: saved image requires vnet_hdr=on");
3168         return -EINVAL;
3169     }
3170
3171     return 0;
3172 }
3173
3174 static int virtio_net_vnet_pre_save(void *opaque)
3175 {
3176     struct VirtIONetMigTmp *tmp = opaque;
3177
3178     tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
3179
3180     return 0;
3181 }
3182
3183 static const VMStateDescription vmstate_virtio_net_has_vnet = {
3184     .name      = "virtio-net-vnet",
3185     .post_load = virtio_net_vnet_post_load,
3186     .pre_save  = virtio_net_vnet_pre_save,
3187     .fields    = (VMStateField[]) {
3188         VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
3189         VMSTATE_END_OF_LIST()
3190     },
3191 };
3192
3193 static bool virtio_net_rss_needed(void *opaque)
3194 {
3195     return VIRTIO_NET(opaque)->rss_data.enabled;
3196 }
3197
3198 static const VMStateDescription vmstate_virtio_net_rss = {
3199     .name      = "virtio-net-device/rss",
3200     .version_id = 1,
3201     .minimum_version_id = 1,
3202     .needed = virtio_net_rss_needed,
3203     .fields = (VMStateField[]) {
3204         VMSTATE_BOOL(rss_data.enabled, VirtIONet),
3205         VMSTATE_BOOL(rss_data.redirect, VirtIONet),
3206         VMSTATE_BOOL(rss_data.populate_hash, VirtIONet),
3207         VMSTATE_UINT32(rss_data.hash_types, VirtIONet),
3208         VMSTATE_UINT16(rss_data.indirections_len, VirtIONet),
3209         VMSTATE_UINT16(rss_data.default_queue, VirtIONet),
3210         VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet,
3211                             VIRTIO_NET_RSS_MAX_KEY_SIZE),
3212         VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, VirtIONet,
3213                                     rss_data.indirections_len, 0,
3214                                     vmstate_info_uint16, uint16_t),
3215         VMSTATE_END_OF_LIST()
3216     },
3217 };
3218
3219 static const VMStateDescription vmstate_virtio_net_device = {
3220     .name = "virtio-net-device",
3221     .version_id = VIRTIO_NET_VM_VERSION,
3222     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3223     .post_load = virtio_net_post_load_device,
3224     .fields = (VMStateField[]) {
3225         VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
3226         VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
3227                                vmstate_virtio_net_queue_tx_waiting,
3228                                VirtIONetQueue),
3229         VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
3230         VMSTATE_UINT16(status, VirtIONet),
3231         VMSTATE_UINT8(promisc, VirtIONet),
3232         VMSTATE_UINT8(allmulti, VirtIONet),
3233         VMSTATE_UINT32(mac_table.in_use, VirtIONet),
3234
3235         /* Guarded pair: If it fits we load it, else we throw it away
3236          * - can happen if source has a larger MAC table.; post-load
3237          *  sets flags in this case.
3238          */
3239         VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
3240                                 0, mac_table_fits, mac_table.in_use,
3241                                  ETH_ALEN),
3242         VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
3243                                      mac_table.in_use, ETH_ALEN),
3244
3245         /* Note: This is an array of uint32's that's always been saved as a
3246          * buffer; hold onto your endiannesses; it's actually used as a bitmap
3247          * but based on the uint.
3248          */
3249         VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
3250         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3251                          vmstate_virtio_net_has_vnet),
3252         VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
3253         VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
3254         VMSTATE_UINT8(alluni, VirtIONet),
3255         VMSTATE_UINT8(nomulti, VirtIONet),
3256         VMSTATE_UINT8(nouni, VirtIONet),
3257         VMSTATE_UINT8(nobcast, VirtIONet),
3258         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3259                          vmstate_virtio_net_has_ufo),
3260         VMSTATE_SINGLE_TEST(max_queue_pairs, VirtIONet, max_queue_pairs_gt_1, 0,
3261                             vmstate_info_uint16_equal, uint16_t),
3262         VMSTATE_UINT16_TEST(curr_queue_pairs, VirtIONet, max_queue_pairs_gt_1),
3263         VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3264                          vmstate_virtio_net_tx_waiting),
3265         VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
3266                             has_ctrl_guest_offloads),
3267         VMSTATE_END_OF_LIST()
3268    },
3269     .subsections = (const VMStateDescription * []) {
3270         &vmstate_virtio_net_rss,
3271         NULL
3272     }
3273 };
3274
3275 static NetClientInfo net_virtio_info = {
3276     .type = NET_CLIENT_DRIVER_NIC,
3277     .size = sizeof(NICState),
3278     .can_receive = virtio_net_can_receive,
3279     .receive = virtio_net_receive,
3280     .link_status_changed = virtio_net_set_link_status,
3281     .query_rx_filter = virtio_net_query_rxfilter,
3282     .announce = virtio_net_announce,
3283 };
3284
3285 static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
3286 {
3287     VirtIONet *n = VIRTIO_NET(vdev);
3288     NetClientState *nc;
3289     assert(n->vhost_started);
3290     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3291         /* Must guard against invalid features and bogus queue index
3292          * from being set by malicious guest, or penetrated through
3293          * buggy migration stream.
3294          */
3295         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3296             qemu_log_mask(LOG_GUEST_ERROR,
3297                           "%s: bogus vq index ignored\n", __func__);
3298             return false;
3299         }
3300         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3301     } else {
3302         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3303     }
3304     return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx);
3305 }
3306
3307 static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
3308                                            bool mask)
3309 {
3310     VirtIONet *n = VIRTIO_NET(vdev);
3311     NetClientState *nc;
3312     assert(n->vhost_started);
3313     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3314         /* Must guard against invalid features and bogus queue index
3315          * from being set by malicious guest, or penetrated through
3316          * buggy migration stream.
3317          */
3318         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3319             qemu_log_mask(LOG_GUEST_ERROR,
3320                           "%s: bogus vq index ignored\n", __func__);
3321             return;
3322         }
3323         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3324     } else {
3325         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3326     }
3327     vhost_net_virtqueue_mask(get_vhost_net(nc->peer),
3328                              vdev, idx, mask);
3329 }
3330
3331 static void virtio_net_set_config_size(VirtIONet *n, uint64_t host_features)
3332 {
3333     virtio_add_feature(&host_features, VIRTIO_NET_F_MAC);
3334
3335     n->config_size = virtio_get_config_size(&cfg_size_params, host_features);
3336 }
3337
3338 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
3339                                    const char *type)
3340 {
3341     /*
3342      * The name can be NULL, the netclient name will be type.x.
3343      */
3344     assert(type != NULL);
3345
3346     g_free(n->netclient_name);
3347     g_free(n->netclient_type);
3348     n->netclient_name = g_strdup(name);
3349     n->netclient_type = g_strdup(type);
3350 }
3351
3352 static bool failover_unplug_primary(VirtIONet *n, DeviceState *dev)
3353 {
3354     HotplugHandler *hotplug_ctrl;
3355     PCIDevice *pci_dev;
3356     Error *err = NULL;
3357
3358     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3359     if (hotplug_ctrl) {
3360         pci_dev = PCI_DEVICE(dev);
3361         pci_dev->partially_hotplugged = true;
3362         hotplug_handler_unplug_request(hotplug_ctrl, dev, &err);
3363         if (err) {
3364             error_report_err(err);
3365             return false;
3366         }
3367     } else {
3368         return false;
3369     }
3370     return true;
3371 }
3372
3373 static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
3374                                     Error **errp)
3375 {
3376     Error *err = NULL;
3377     HotplugHandler *hotplug_ctrl;
3378     PCIDevice *pdev = PCI_DEVICE(dev);
3379     BusState *primary_bus;
3380
3381     if (!pdev->partially_hotplugged) {
3382         return true;
3383     }
3384     primary_bus = dev->parent_bus;
3385     if (!primary_bus) {
3386         error_setg(errp, "virtio_net: couldn't find primary bus");
3387         return false;
3388     }
3389     qdev_set_parent_bus(dev, primary_bus, &error_abort);
3390     qatomic_set(&n->failover_primary_hidden, false);
3391     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3392     if (hotplug_ctrl) {
3393         hotplug_handler_pre_plug(hotplug_ctrl, dev, &err);
3394         if (err) {
3395             goto out;
3396         }
3397         hotplug_handler_plug(hotplug_ctrl, dev, &err);
3398     }
3399     pdev->partially_hotplugged = false;
3400
3401 out:
3402     error_propagate(errp, err);
3403     return !err;
3404 }
3405
3406 static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s)
3407 {
3408     bool should_be_hidden;
3409     Error *err = NULL;
3410     DeviceState *dev = failover_find_primary_device(n);
3411
3412     if (!dev) {
3413         return;
3414     }
3415
3416     should_be_hidden = qatomic_read(&n->failover_primary_hidden);
3417
3418     if (migration_in_setup(s) && !should_be_hidden) {
3419         if (failover_unplug_primary(n, dev)) {
3420             vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
3421             qapi_event_send_unplug_primary(dev->id);
3422             qatomic_set(&n->failover_primary_hidden, true);
3423         } else {
3424             warn_report("couldn't unplug primary device");
3425         }
3426     } else if (migration_has_failed(s)) {
3427         /* We already unplugged the device let's plug it back */
3428         if (!failover_replug_primary(n, dev, &err)) {
3429             if (err) {
3430                 error_report_err(err);
3431             }
3432         }
3433     }
3434 }
3435
3436 static void virtio_net_migration_state_notifier(Notifier *notifier, void *data)
3437 {
3438     MigrationState *s = data;
3439     VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
3440     virtio_net_handle_migration_primary(n, s);
3441 }
3442
3443 static bool failover_hide_primary_device(DeviceListener *listener,
3444                                          const QDict *device_opts,
3445                                          bool from_json,
3446                                          Error **errp)
3447 {
3448     VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
3449     const char *standby_id;
3450
3451     if (!device_opts) {
3452         return false;
3453     }
3454
3455     if (!qdict_haskey(device_opts, "failover_pair_id")) {
3456         return false;
3457     }
3458
3459     if (!qdict_haskey(device_opts, "id")) {
3460         error_setg(errp, "Device with failover_pair_id needs to have id");
3461         return false;
3462     }
3463
3464     standby_id = qdict_get_str(device_opts, "failover_pair_id");
3465     if (g_strcmp0(standby_id, n->netclient_name) != 0) {
3466         return false;
3467     }
3468
3469     /*
3470      * The hide helper can be called several times for a given device.
3471      * Check there is only one primary for a virtio-net device but
3472      * don't duplicate the qdict several times if it's called for the same
3473      * device.
3474      */
3475     if (n->primary_opts) {
3476         const char *old, *new;
3477         /* devices with failover_pair_id always have an id */
3478         old = qdict_get_str(n->primary_opts, "id");
3479         new = qdict_get_str(device_opts, "id");
3480         if (strcmp(old, new) != 0) {
3481             error_setg(errp, "Cannot attach more than one primary device to "
3482                        "'%s': '%s' and '%s'", n->netclient_name, old, new);
3483             return false;
3484         }
3485     } else {
3486         n->primary_opts = qdict_clone_shallow(device_opts);
3487         n->primary_opts_from_json = from_json;
3488     }
3489
3490     /* failover_primary_hidden is set during feature negotiation */
3491     return qatomic_read(&n->failover_primary_hidden);
3492 }
3493
3494 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
3495 {
3496     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3497     VirtIONet *n = VIRTIO_NET(dev);
3498     NetClientState *nc;
3499     int i;
3500
3501     if (n->net_conf.mtu) {
3502         n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
3503     }
3504
3505     if (n->net_conf.duplex_str) {
3506         if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
3507             n->net_conf.duplex = DUPLEX_HALF;
3508         } else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
3509             n->net_conf.duplex = DUPLEX_FULL;
3510         } else {
3511             error_setg(errp, "'duplex' must be 'half' or 'full'");
3512             return;
3513         }
3514         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3515     } else {
3516         n->net_conf.duplex = DUPLEX_UNKNOWN;
3517     }
3518
3519     if (n->net_conf.speed < SPEED_UNKNOWN) {
3520         error_setg(errp, "'speed' must be between 0 and INT_MAX");
3521         return;
3522     }
3523     if (n->net_conf.speed >= 0) {
3524         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3525     }
3526
3527     if (n->failover) {
3528         n->primary_listener.hide_device = failover_hide_primary_device;
3529         qatomic_set(&n->failover_primary_hidden, true);
3530         device_listener_register(&n->primary_listener);
3531         n->migration_state.notify = virtio_net_migration_state_notifier;
3532         add_migration_state_change_notifier(&n->migration_state);
3533         n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
3534     }
3535
3536     virtio_net_set_config_size(n, n->host_features);
3537     virtio_init(vdev, VIRTIO_ID_NET, n->config_size);
3538
3539     /*
3540      * We set a lower limit on RX queue size to what it always was.
3541      * Guests that want a smaller ring can always resize it without
3542      * help from us (using virtio 1 and up).
3543      */
3544     if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
3545         n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
3546         !is_power_of_2(n->net_conf.rx_queue_size)) {
3547         error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
3548                    "must be a power of 2 between %d and %d.",
3549                    n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
3550                    VIRTQUEUE_MAX_SIZE);
3551         virtio_cleanup(vdev);
3552         return;
3553     }
3554
3555     if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
3556         n->net_conf.tx_queue_size > VIRTQUEUE_MAX_SIZE ||
3557         !is_power_of_2(n->net_conf.tx_queue_size)) {
3558         error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
3559                    "must be a power of 2 between %d and %d",
3560                    n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
3561                    VIRTQUEUE_MAX_SIZE);
3562         virtio_cleanup(vdev);
3563         return;
3564     }
3565
3566     n->max_ncs = MAX(n->nic_conf.peers.queues, 1);
3567
3568     /*
3569      * Figure out the datapath queue pairs since the backend could
3570      * provide control queue via peers as well.
3571      */
3572     if (n->nic_conf.peers.queues) {
3573         for (i = 0; i < n->max_ncs; i++) {
3574             if (n->nic_conf.peers.ncs[i]->is_datapath) {
3575                 ++n->max_queue_pairs;
3576             }
3577         }
3578     }
3579     n->max_queue_pairs = MAX(n->max_queue_pairs, 1);
3580
3581     if (n->max_queue_pairs * 2 + 1 > VIRTIO_QUEUE_MAX) {
3582         error_setg(errp, "Invalid number of queue pairs (= %" PRIu32 "), "
3583                    "must be a positive integer less than %d.",
3584                    n->max_queue_pairs, (VIRTIO_QUEUE_MAX - 1) / 2);
3585         virtio_cleanup(vdev);
3586         return;
3587     }
3588     n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
3589     n->curr_queue_pairs = 1;
3590     n->tx_timeout = n->net_conf.txtimer;
3591
3592     if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
3593                        && strcmp(n->net_conf.tx, "bh")) {
3594         warn_report("virtio-net: "
3595                     "Unknown option tx=%s, valid options: \"timer\" \"bh\"",
3596                     n->net_conf.tx);
3597         error_printf("Defaulting to \"bh\"");
3598     }
3599
3600     n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
3601                                     n->net_conf.tx_queue_size);
3602
3603     for (i = 0; i < n->max_queue_pairs; i++) {
3604         virtio_net_add_queue(n, i);
3605     }
3606
3607     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3608     qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
3609     memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
3610     n->status = VIRTIO_NET_S_LINK_UP;
3611     qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3612                               QEMU_CLOCK_VIRTUAL,
3613                               virtio_net_announce_timer, n);
3614     n->announce_timer.round = 0;
3615
3616     if (n->netclient_type) {
3617         /*
3618          * Happen when virtio_net_set_netclient_name has been called.
3619          */
3620         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3621                               n->netclient_type, n->netclient_name, n);
3622     } else {
3623         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3624                               object_get_typename(OBJECT(dev)), dev->id, n);
3625     }
3626
3627     for (i = 0; i < n->max_queue_pairs; i++) {
3628         n->nic->ncs[i].do_not_pad = true;
3629     }
3630
3631     peer_test_vnet_hdr(n);
3632     if (peer_has_vnet_hdr(n)) {
3633         for (i = 0; i < n->max_queue_pairs; i++) {
3634             qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
3635         }
3636         n->host_hdr_len = sizeof(struct virtio_net_hdr);
3637     } else {
3638         n->host_hdr_len = 0;
3639     }
3640
3641     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->nic_conf.macaddr.a);
3642
3643     n->vqs[0].tx_waiting = 0;
3644     n->tx_burst = n->net_conf.txburst;
3645     virtio_net_set_mrg_rx_bufs(n, 0, 0, 0);
3646     n->promisc = 1; /* for compatibility */
3647
3648     n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
3649
3650     n->vlans = g_malloc0(MAX_VLAN >> 3);
3651
3652     nc = qemu_get_queue(n->nic);
3653     nc->rxfilter_notify_enabled = 1;
3654
3655    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
3656         struct virtio_net_config netcfg = {};
3657         memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
3658         vhost_net_set_config(get_vhost_net(nc->peer),
3659             (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_MASTER);
3660     }
3661     QTAILQ_INIT(&n->rsc_chains);
3662     n->qdev = dev;
3663
3664     net_rx_pkt_init(&n->rx_pkt, false);
3665
3666     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3667         virtio_net_load_ebpf(n);
3668     }
3669 }
3670
3671 static void virtio_net_device_unrealize(DeviceState *dev)
3672 {
3673     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3674     VirtIONet *n = VIRTIO_NET(dev);
3675     int i, max_queue_pairs;
3676
3677     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3678         virtio_net_unload_ebpf(n);
3679     }
3680
3681     /* This will stop vhost backend if appropriate. */
3682     virtio_net_set_status(vdev, 0);
3683
3684     g_free(n->netclient_name);
3685     n->netclient_name = NULL;
3686     g_free(n->netclient_type);
3687     n->netclient_type = NULL;
3688
3689     g_free(n->mac_table.macs);
3690     g_free(n->vlans);
3691
3692     if (n->failover) {
3693         qobject_unref(n->primary_opts);
3694         device_listener_unregister(&n->primary_listener);
3695         remove_migration_state_change_notifier(&n->migration_state);
3696     } else {
3697         assert(n->primary_opts == NULL);
3698     }
3699
3700     max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
3701     for (i = 0; i < max_queue_pairs; i++) {
3702         virtio_net_del_queue(n, i);
3703     }
3704     /* delete also control vq */
3705     virtio_del_queue(vdev, max_queue_pairs * 2);
3706     qemu_announce_timer_del(&n->announce_timer, false);
3707     g_free(n->vqs);
3708     qemu_del_nic(n->nic);
3709     virtio_net_rsc_cleanup(n);
3710     g_free(n->rss_data.indirections_table);
3711     net_rx_pkt_uninit(n->rx_pkt);
3712     virtio_cleanup(vdev);
3713 }
3714
3715 static void virtio_net_instance_init(Object *obj)
3716 {
3717     VirtIONet *n = VIRTIO_NET(obj);
3718
3719     /*
3720      * The default config_size is sizeof(struct virtio_net_config).
3721      * Can be overriden with virtio_net_set_config_size.
3722      */
3723     n->config_size = sizeof(struct virtio_net_config);
3724     device_add_bootindex_property(obj, &n->nic_conf.bootindex,
3725                                   "bootindex", "/ethernet-phy@0",
3726                                   DEVICE(n));
3727
3728     ebpf_rss_init(&n->ebpf_rss);
3729 }
3730
3731 static int virtio_net_pre_save(void *opaque)
3732 {
3733     VirtIONet *n = opaque;
3734
3735     /* At this point, backend must be stopped, otherwise
3736      * it might keep writing to memory. */
3737     assert(!n->vhost_started);
3738
3739     return 0;
3740 }
3741
3742 static bool primary_unplug_pending(void *opaque)
3743 {
3744     DeviceState *dev = opaque;
3745     DeviceState *primary;
3746     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3747     VirtIONet *n = VIRTIO_NET(vdev);
3748
3749     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3750         return false;
3751     }
3752     primary = failover_find_primary_device(n);
3753     return primary ? primary->pending_deleted_event : false;
3754 }
3755
3756 static bool dev_unplug_pending(void *opaque)
3757 {
3758     DeviceState *dev = opaque;
3759     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
3760
3761     return vdc->primary_unplug_pending(dev);
3762 }
3763
3764 static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
3765 {
3766     VirtIONet *n = VIRTIO_NET(vdev);
3767     NetClientState *nc = qemu_get_queue(n->nic);
3768     struct vhost_net *net = get_vhost_net(nc->peer);
3769     return &net->dev;
3770 }
3771
3772 static const VMStateDescription vmstate_virtio_net = {
3773     .name = "virtio-net",
3774     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3775     .version_id = VIRTIO_NET_VM_VERSION,
3776     .fields = (VMStateField[]) {
3777         VMSTATE_VIRTIO_DEVICE,
3778         VMSTATE_END_OF_LIST()
3779     },
3780     .pre_save = virtio_net_pre_save,
3781     .dev_unplug_pending = dev_unplug_pending,
3782 };
3783
/*
 * User-configurable qdev properties of the virtio-net device.
 *
 * The DEFINE_PROP_BIT64 entries each map a property name to one
 * VIRTIO_NET_F_* bit of VirtIONet.host_features, with the last argument
 * being the default.  The remaining entries expose fields of nic_conf,
 * net_conf and a few VirtIONet members.  The list is terminated by
 * DEFINE_PROP_END_OF_LIST().
 */
static Property virtio_net_properties[] = {
    /* Feature bits that default to enabled. */
    DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
                    VIRTIO_NET_F_CSUM, true),
    DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_CSUM, true),
    DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
    DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_TSO4, true),
    DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_TSO6, true),
    DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_ECN, true),
    DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_UFO, true),
    DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
                    VIRTIO_NET_F_GUEST_ANNOUNCE, true),
    DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_TSO4, true),
    DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_TSO6, true),
    DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_ECN, true),
    DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
                    VIRTIO_NET_F_HOST_UFO, true),
    DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
                    VIRTIO_NET_F_MRG_RXBUF, true),
    DEFINE_PROP_BIT64("status", VirtIONet, host_features,
                    VIRTIO_NET_F_STATUS, true),
    DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_VQ, true),
    DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_RX, true),
    DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_VLAN, true),
    DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_RX_EXTRA, true),
    DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_MAC_ADDR, true),
    DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
    /* Feature bits that default to disabled. */
    DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
    DEFINE_PROP_BIT64("rss", VirtIONet, host_features,
                    VIRTIO_NET_F_RSS, false),
    DEFINE_PROP_BIT64("hash", VirtIONet, host_features,
                    VIRTIO_NET_F_HASH_REPORT, false),
    DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features,
                    VIRTIO_NET_F_RSC_EXT, false),
    DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout,
                       VIRTIO_NET_RSC_DEFAULT_INTERVAL),
    DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
    /* TX scheduling knobs; realize copies these into the device state. */
    DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
                       TX_TIMER_INTERVAL),
    DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST),
    DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx),
    /* Ring sizes; realize rejects values outside its accepted range. */
    DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size,
                       VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE),
    DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size,
                       VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE),
    /* A non-zero host_mtu makes realize offer VIRTIO_NET_F_MTU. */
    DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
    DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
                     true),
    /* Link speed/duplex; validated in realize. */
    DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
    DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
    /* Enables the failover (standby) setup done in realize. */
    DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
    DEFINE_PROP_END_OF_LIST(),
};
3850
3851 static void virtio_net_class_init(ObjectClass *klass, void *data)
3852 {
3853     DeviceClass *dc = DEVICE_CLASS(klass);
3854     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
3855
3856     device_class_set_props(dc, virtio_net_properties);
3857     dc->vmsd = &vmstate_virtio_net;
3858     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
3859     vdc->realize = virtio_net_device_realize;
3860     vdc->unrealize = virtio_net_device_unrealize;
3861     vdc->get_config = virtio_net_get_config;
3862     vdc->set_config = virtio_net_set_config;
3863     vdc->get_features = virtio_net_get_features;
3864     vdc->set_features = virtio_net_set_features;
3865     vdc->bad_features = virtio_net_bad_features;
3866     vdc->reset = virtio_net_reset;
3867     vdc->queue_reset = virtio_net_queue_reset;
3868     vdc->queue_enable = virtio_net_queue_enable;
3869     vdc->set_status = virtio_net_set_status;
3870     vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
3871     vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
3872     vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
3873     vdc->post_load = virtio_net_post_load_virtio;
3874     vdc->vmsd = &vmstate_virtio_net_device;
3875     vdc->primary_unplug_pending = primary_unplug_pending;
3876     vdc->get_vhost = virtio_net_get_vhost;
3877 }
3878
3879 static const TypeInfo virtio_net_info = {
3880     .name = TYPE_VIRTIO_NET,
3881     .parent = TYPE_VIRTIO_DEVICE,
3882     .instance_size = sizeof(VirtIONet),
3883     .instance_init = virtio_net_instance_init,
3884     .class_init = virtio_net_class_init,
3885 };
3886
/* Register the virtio-net type described by virtio_net_info. */
static void virtio_register_types(void)
{
    type_register_static(&virtio_net_info);
}

/* Hand the registration function to the type_init() hook. */
type_init(virtio_register_types)