hw/net/virtio-net.c

   1 /*
   2  * Virtio Network Device
   3  *
   4  * Copyright IBM, Corp. 2007
   5  *
   6  * Authors:
   7  *  Anthony Liguori   <aliguori@us.ibm.com>
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2.  See
  10  * the COPYING file in the top-level directory.
  11  *
  12  */
  13
  14 #include "qemu/osdep.h"
  15 #include "qemu/atomic.h"
  16 #include "qemu/iov.h"
  17 #include "qemu/log.h"
  18 #include "qemu/main-loop.h"
  19 #include "qemu/module.h"
  20 #include "hw/virtio/virtio.h"
  21 #include "net/net.h"
  22 #include "net/checksum.h"
  23 #include "net/tap.h"
  24 #include "qemu/error-report.h"
  25 #include "qemu/timer.h"
  26 #include "qemu/option.h"
  27 #include "qemu/option_int.h"
  28 #include "qemu/config-file.h"
  29 #include "qapi/qmp/qdict.h"
  30 #include "hw/virtio/virtio-net.h"
  31 #include "net/vhost_net.h"
  32 #include "net/announce.h"
  33 #include "hw/virtio/virtio-bus.h"
  34 #include "qapi/error.h"
  35 #include "qapi/qapi-events-net.h"
  36 #include "hw/qdev-properties.h"
  37 #include "qapi/qapi-types-migration.h"
  38 #include "qapi/qapi-events-migration.h"
  39 #include "hw/virtio/virtio-access.h"
  40 #include "migration/misc.h"
  41 #include "standard-headers/linux/ethtool.h"
  42 #include "sysemu/sysemu.h"
  43 #include "trace.h"
  44 #include "monitor/qdev.h"
  45 #include "hw/pci/pci_device.h"
  46 #include "net_rx_pkt.h"
  47 #include "hw/virtio/vhost.h"
  48 #include "sysemu/qtest.h"
  49
  50 #define VIRTIO_NET_VM_VERSION    11
  51
  52 #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
  53
  54 /* previously fixed value */
  55 #define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
  56 #define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256
  57
  58 /* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */
  59 #define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
  60 #define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE
  61
  62 #define VIRTIO_NET_IP4_ADDR_SIZE   8        /* ipv4 saddr + daddr */
  63
  64 #define VIRTIO_NET_TCP_FLAG         0x3F
  65 #define VIRTIO_NET_TCP_HDR_LENGTH   0xF000
  66
  67 /* IPv4 max payload, 16 bits in the header */
  68 #define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
  69 #define VIRTIO_NET_MAX_TCP_PAYLOAD 65535
  70
  71 /* header length value in ip header without option */
  72 #define VIRTIO_NET_IP4_HEADER_LENGTH 5
  73
  74 #define VIRTIO_NET_IP6_ADDR_SIZE   32      /* ipv6 saddr + daddr */
  75 #define VIRTIO_NET_MAX_IP6_PAYLOAD VIRTIO_NET_MAX_TCP_PAYLOAD
  76
  77 /* Purge coalesced packets timer interval, This value affects the performance
  78    a lot, and should be tuned carefully, '300000'(300us) is the recommended
  79    value to pass the WHQL test, '50000' can gain 2x netperf throughput with
  80    tso/gso/gro 'off'. */
  81 #define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000
  82
  83 #define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
  84                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
  85                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
  86                                          VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
  87                                          VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
  88                                          VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \
  89                                          VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \
  90                                          VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \
  91                                          VIRTIO_NET_RSS_HASH_TYPE_UDP_EX)
  92
  93 static const VirtIOFeature feature_sizes[] = {
  94     {.flags = 1ULL << VIRTIO_NET_F_MAC,
  95      .end = endof(struct virtio_net_config, mac)},
  96     {.flags = 1ULL << VIRTIO_NET_F_STATUS,
  97      .end = endof(struct virtio_net_config, status)},
  98     {.flags = 1ULL << VIRTIO_NET_F_MQ,
  99      .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
 100     {.flags = 1ULL << VIRTIO_NET_F_MTU,
 101      .end = endof(struct virtio_net_config, mtu)},
 102     {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
 103      .end = endof(struct virtio_net_config, duplex)},
 104     {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT),
 105      .end = endof(struct virtio_net_config, supported_hash_types)},
 106     {}
 107 };
 108
 109 static const VirtIOConfigSizeParams cfg_size_params = {
 110     .min_size = endof(struct virtio_net_config, mac),
 111     .max_size = sizeof(struct virtio_net_config),
 112     .feature_sizes = feature_sizes
 113 };
 114
 115 static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc)
 116 {
 117     VirtIONet *n = qemu_get_nic_opaque(nc);
 118
 119     return &n->vqs[nc->queue_index];
 120 }
 121
 122 static int vq2q(int queue_index)
 123 {
 124     return queue_index / 2;
 125 }
 126
 127 static void flush_or_purge_queued_packets(NetClientState *nc)
 128 {
 129     if (!nc->peer) {
 130         return;
 131     }
 132
 133     qemu_flush_or_purge_queued_packets(nc->peer, true);
 134     assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
 135 }
 136
 137 /* TODO
 138  * - we could suppress RX interrupt if we were so inclined.
 139  */
 140
 141 static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
 142 {
 143     VirtIONet *n = VIRTIO_NET(vdev);
 144     struct virtio_net_config netcfg;
 145     NetClientState *nc = qemu_get_queue(n->nic);
 146     static const MACAddr zero = { .a = { 0, 0, 0, 0, 0, 0 } };
 147
 148     int ret = 0;
 149     memset(&netcfg, 0 , sizeof(struct virtio_net_config));
 150     virtio_stw_p(vdev, &netcfg.status, n->status);
 151     virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queue_pairs);
 152     virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
 153     memcpy(netcfg.mac, n->mac, ETH_ALEN);
 154     virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
 155     netcfg.duplex = n->net_conf.duplex;
 156     netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE;
 157     virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length,
 158                  virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ?
 159                  VIRTIO_NET_RSS_MAX_TABLE_LEN : 1);
 160     virtio_stl_p(vdev, &netcfg.supported_hash_types,
 161                  VIRTIO_NET_RSS_SUPPORTED_HASHES);
 162     memcpy(config, &netcfg, n->config_size);
 163
 164     /*
 165      * Is this VDPA? No peer means not VDPA: there's no way to
 166      * disconnect/reconnect a VDPA peer.
 167      */
 168     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
 169         ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
 170                                    n->config_size);
 171         if (ret == -1) {
 172             return;
 173         }
 174
 175         /*
 176          * Some NIC/kernel combinations present 0 as the mac address.  As that
 177          * is not a legal address, try to proceed with the address from the
 178          * QEMU command line in the hope that the address has been configured
 179          * correctly elsewhere - just not reported by the device.
 180          */
 181         if (memcmp(&netcfg.mac, &zero, sizeof(zero)) == 0) {
 182             info_report("Zero hardware mac address detected. Ignoring.");
 183             memcpy(netcfg.mac, n->mac, ETH_ALEN);
 184         }
 185
 186         netcfg.status |= virtio_tswap16(vdev,
 187                                         n->status & VIRTIO_NET_S_ANNOUNCE);
 188         memcpy(config, &netcfg, n->config_size);
 189     }
 190 }
 191
 192 static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
 193 {
 194     VirtIONet *n = VIRTIO_NET(vdev);
 195     struct virtio_net_config netcfg = {};
 196     NetClientState *nc = qemu_get_queue(n->nic);
 197
 198     memcpy(&netcfg, config, n->config_size);
 199
 200     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
 201         !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
 202         memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
 203         memcpy(n->mac, netcfg.mac, ETH_ALEN);
 204         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
 205     }
 206
 207     /*
 208      * Is this VDPA? No peer means not VDPA: there's no way to
 209      * disconnect/reconnect a VDPA peer.
 210      */
 211     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
 212         vhost_net_set_config(get_vhost_net(nc->peer),
 213                              (uint8_t *)&netcfg, 0, n->config_size,
 214                              VHOST_SET_CONFIG_TYPE_FRONTEND);
 215       }
 216 }
 217
 218 static bool virtio_net_started(VirtIONet *n, uint8_t status)
 219 {
 220     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 221     return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
 222         (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running;
 223 }
 224
 225 static void virtio_net_announce_notify(VirtIONet *net)
 226 {
 227     VirtIODevice *vdev = VIRTIO_DEVICE(net);
 228     trace_virtio_net_announce_notify();
 229
 230     net->status |= VIRTIO_NET_S_ANNOUNCE;
 231     virtio_notify_config(vdev);
 232 }
 233
 234 static void virtio_net_announce_timer(void *opaque)
 235 {
 236     VirtIONet *n = opaque;
 237     trace_virtio_net_announce_timer(n->announce_timer.round);
 238
 239     n->announce_timer.round--;
 240     virtio_net_announce_notify(n);
 241 }
 242
 243 static void virtio_net_announce(NetClientState *nc)
 244 {
 245     VirtIONet *n = qemu_get_nic_opaque(nc);
 246     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 247
 248     /*
 249      * Make sure the virtio migration announcement timer isn't running
 250      * If it is, let it trigger announcement so that we do not cause
 251      * confusion.
 252      */
 253     if (n->announce_timer.round) {
 254         return;
 255     }
 256
 257     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
 258         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
 259             virtio_net_announce_notify(n);
 260     }
 261 }
 262
 263 static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
 264 {
 265     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 266     NetClientState *nc = qemu_get_queue(n->nic);
 267     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
 268     int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
 269               n->max_ncs - n->max_queue_pairs : 0;
 270
 271     if (!get_vhost_net(nc->peer)) {
 272         return;
 273     }
 274
 275     if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
 276         !!n->vhost_started) {
 277         return;
 278     }
 279     if (!n->vhost_started) {
 280         int r, i;
 281
 282         if (n->needs_vnet_hdr_swap) {
 283             error_report("backend does not support %s vnet headers; "
 284                          "falling back on userspace virtio",
 285                          virtio_is_big_endian(vdev) ? "BE" : "LE");
 286             return;
 287         }
 288
 289         /* Any packets outstanding? Purge them to avoid touching rings
 290          * when vhost is running.
 291          */
 292         for (i = 0;  i < queue_pairs; i++) {
 293             NetClientState *qnc = qemu_get_subqueue(n->nic, i);
 294
 295             /* Purge both directions: TX and RX. */
 296             qemu_net_queue_purge(qnc->peer->incoming_queue, qnc);
 297             qemu_net_queue_purge(qnc->incoming_queue, qnc->peer);
 298         }
 299
 300         if (virtio_has_feature(vdev->guest_features, VIRTIO_NET_F_MTU)) {
 301             r = vhost_net_set_mtu(get_vhost_net(nc->peer), n->net_conf.mtu);
 302             if (r < 0) {
 303                 error_report("%uBytes MTU not supported by the backend",
 304                              n->net_conf.mtu);
 305
 306                 return;
 307             }
 308         }
 309
 310         n->vhost_started = 1;
 311         r = vhost_net_start(vdev, n->nic->ncs, queue_pairs, cvq);
 312         if (r < 0) {
 313             error_report("unable to start vhost net: %d: "
 314                          "falling back on userspace virtio", -r);
 315             n->vhost_started = 0;
 316         }
 317     } else {
 318         vhost_net_stop(vdev, n->nic->ncs, queue_pairs, cvq);
 319         n->vhost_started = 0;
 320     }
 321 }
 322
 323 static int virtio_net_set_vnet_endian_one(VirtIODevice *vdev,
 324                                           NetClientState *peer,
 325                                           bool enable)
 326 {
 327     if (virtio_is_big_endian(vdev)) {
 328         return qemu_set_vnet_be(peer, enable);
 329     } else {
 330         return qemu_set_vnet_le(peer, enable);
 331     }
 332 }
 333
 334 static bool virtio_net_set_vnet_endian(VirtIODevice *vdev, NetClientState *ncs,
 335                                        int queue_pairs, bool enable)
 336 {
 337     int i;
 338
 339     for (i = 0; i < queue_pairs; i++) {
 340         if (virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, enable) < 0 &&
 341             enable) {
 342             while (--i >= 0) {
 343                 virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, false);
 344             }
 345
 346             return true;
 347         }
 348     }
 349
 350     return false;
 351 }
 352
 353 static void virtio_net_vnet_endian_status(VirtIONet *n, uint8_t status)
 354 {
 355     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 356     int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
 357
 358     if (virtio_net_started(n, status)) {
 359         /* Before using the device, we tell the network backend about the
 360          * endianness to use when parsing vnet headers. If the backend
 361          * can't do it, we fallback onto fixing the headers in the core
 362          * virtio-net code.
 363          */
 364         n->needs_vnet_hdr_swap = virtio_net_set_vnet_endian(vdev, n->nic->ncs,
 365                                                             queue_pairs, true);
 366     } else if (virtio_net_started(n, vdev->status)) {
 367         /* After using the device, we need to reset the network backend to
 368          * the default (guest native endianness), otherwise the guest may
 369          * lose network connectivity if it is rebooted into a different
 370          * endianness.
 371          */
 372         virtio_net_set_vnet_endian(vdev, n->nic->ncs, queue_pairs, false);
 373     }
 374 }
 375
 376 static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq)
 377 {
 378     unsigned int dropped = virtqueue_drop_all(vq);
 379     if (dropped) {
 380         virtio_notify(vdev, vq);
 381     }
 382 }
 383
 384 static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
 385 {
 386     VirtIONet *n = VIRTIO_NET(vdev);
 387     VirtIONetQueue *q;
 388     int i;
 389     uint8_t queue_status;
 390
 391     virtio_net_vnet_endian_status(n, status);
 392     virtio_net_vhost_status(n, status);
 393
 394     for (i = 0; i < n->max_queue_pairs; i++) {
 395         NetClientState *ncs = qemu_get_subqueue(n->nic, i);
 396         bool queue_started;
 397         q = &n->vqs[i];
 398
 399         if ((!n->multiqueue && i != 0) || i >= n->curr_queue_pairs) {
 400             queue_status = 0;
 401         } else {
 402             queue_status = status;
 403         }
 404         queue_started =
 405             virtio_net_started(n, queue_status) && !n->vhost_started;
 406
 407         if (queue_started) {
 408             qemu_flush_queued_packets(ncs);
 409         }
 410
 411         if (!q->tx_waiting) {
 412             continue;
 413         }
 414
 415         if (queue_started) {
 416             if (q->tx_timer) {
 417                 timer_mod(q->tx_timer,
 418                                qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
 419             } else {
 420                 qemu_bh_schedule(q->tx_bh);
 421             }
 422         } else {
 423             if (q->tx_timer) {
 424                 timer_del(q->tx_timer);
 425             } else {
 426                 qemu_bh_cancel(q->tx_bh);
 427             }
 428             if ((n->status & VIRTIO_NET_S_LINK_UP) == 0 &&
 429                 (queue_status & VIRTIO_CONFIG_S_DRIVER_OK) &&
 430                 vdev->vm_running) {
 431                 /* if tx is waiting we are likely have some packets in tx queue
 432                  * and disabled notification */
 433                 q->tx_waiting = 0;
 434                 virtio_queue_set_notification(q->tx_vq, 1);
 435                 virtio_net_drop_tx_queue_data(vdev, q->tx_vq);
 436             }
 437         }
 438     }
 439 }
 440
 441 static void virtio_net_set_link_status(NetClientState *nc)
 442 {
 443     VirtIONet *n = qemu_get_nic_opaque(nc);
 444     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 445     uint16_t old_status = n->status;
 446
 447     if (nc->link_down)
 448         n->status &= ~VIRTIO_NET_S_LINK_UP;
 449     else
 450         n->status |= VIRTIO_NET_S_LINK_UP;
 451
 452     if (n->status != old_status)
 453         virtio_notify_config(vdev);
 454
 455     virtio_net_set_status(vdev, vdev->status);
 456 }
 457
 458 static void rxfilter_notify(NetClientState *nc)
 459 {
 460     VirtIONet *n = qemu_get_nic_opaque(nc);
 461
 462     if (nc->rxfilter_notify_enabled) {
 463         char *path = object_get_canonical_path(OBJECT(n->qdev));
 464         qapi_event_send_nic_rx_filter_changed(n->netclient_name, path);
 465         g_free(path);
 466
 467         /* disable event notification to avoid events flooding */
 468         nc->rxfilter_notify_enabled = 0;
 469     }
 470 }
 471
 472 static intList *get_vlan_table(VirtIONet *n)
 473 {
 474     intList *list;
 475     int i, j;
 476
 477     list = NULL;
 478     for (i = 0; i < MAX_VLAN >> 5; i++) {
 479         for (j = 0; n->vlans[i] && j <= 0x1f; j++) {
 480             if (n->vlans[i] & (1U << j)) {
 481                 QAPI_LIST_PREPEND(list, (i << 5) + j);
 482             }
 483         }
 484     }
 485
 486     return list;
 487 }
 488
 489 static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
 490 {
 491     VirtIONet *n = qemu_get_nic_opaque(nc);
 492     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 493     RxFilterInfo *info;
 494     strList *str_list;
 495     int i;
 496
 497     info = g_malloc0(sizeof(*info));
 498     info->name = g_strdup(nc->name);
 499     info->promiscuous = n->promisc;
 500
 501     if (n->nouni) {
 502         info->unicast = RX_STATE_NONE;
 503     } else if (n->alluni) {
 504         info->unicast = RX_STATE_ALL;
 505     } else {
 506         info->unicast = RX_STATE_NORMAL;
 507     }
 508
 509     if (n->nomulti) {
 510         info->multicast = RX_STATE_NONE;
 511     } else if (n->allmulti) {
 512         info->multicast = RX_STATE_ALL;
 513     } else {
 514         info->multicast = RX_STATE_NORMAL;
 515     }
 516
 517     info->broadcast_allowed = n->nobcast;
 518     info->multicast_overflow = n->mac_table.multi_overflow;
 519     info->unicast_overflow = n->mac_table.uni_overflow;
 520
 521     info->main_mac = qemu_mac_strdup_printf(n->mac);
 522
 523     str_list = NULL;
 524     for (i = 0; i < n->mac_table.first_multi; i++) {
 525         QAPI_LIST_PREPEND(str_list,
 526                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
 527     }
 528     info->unicast_table = str_list;
 529
 530     str_list = NULL;
 531     for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
 532         QAPI_LIST_PREPEND(str_list,
 533                       qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
 534     }
 535     info->multicast_table = str_list;
 536     info->vlan_table = get_vlan_table(n);
 537
 538     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
 539         info->vlan = RX_STATE_ALL;
 540     } else if (!info->vlan_table) {
 541         info->vlan = RX_STATE_NONE;
 542     } else {
 543         info->vlan = RX_STATE_NORMAL;
 544     }
 545
 546     /* enable event notification after query */
 547     nc->rxfilter_notify_enabled = 1;
 548
 549     return info;
 550 }
 551
 552 static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
 553 {
 554     VirtIONet *n = VIRTIO_NET(vdev);
 555     NetClientState *nc;
 556
 557     /* validate queue_index and skip for cvq */
 558     if (queue_index >= n->max_queue_pairs * 2) {
 559         return;
 560     }
 561
 562     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
 563
 564     if (!nc->peer) {
 565         return;
 566     }
 567
 568     if (get_vhost_net(nc->peer) &&
 569         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
 570         vhost_net_virtqueue_reset(vdev, nc, queue_index);
 571     }
 572
 573     flush_or_purge_queued_packets(nc);
 574 }
 575
 576 static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
 577 {
 578     VirtIONet *n = VIRTIO_NET(vdev);
 579     NetClientState *nc;
 580     int r;
 581
 582     /* validate queue_index and skip for cvq */
 583     if (queue_index >= n->max_queue_pairs * 2) {
 584         return;
 585     }
 586
 587     nc = qemu_get_subqueue(n->nic, vq2q(queue_index));
 588
 589     if (!nc->peer || !vdev->vhost_started) {
 590         return;
 591     }
 592
 593     if (get_vhost_net(nc->peer) &&
 594         nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
 595         r = vhost_net_virtqueue_restart(vdev, nc, queue_index);
 596         if (r < 0) {
 597             error_report("unable to restart vhost net virtqueue: %d, "
 598                             "when resetting the queue", queue_index);
 599         }
 600     }
 601 }
 602
 603 static void virtio_net_reset(VirtIODevice *vdev)
 604 {
 605     VirtIONet *n = VIRTIO_NET(vdev);
 606     int i;
 607
 608     /* Reset back to compatibility mode */
 609     n->promisc = 1;
 610     n->allmulti = 0;
 611     n->alluni = 0;
 612     n->nomulti = 0;
 613     n->nouni = 0;
 614     n->nobcast = 0;
 615     /* multiqueue is disabled by default */
 616     n->curr_queue_pairs = 1;
 617     timer_del(n->announce_timer.tm);
 618     n->announce_timer.round = 0;
 619     n->status &= ~VIRTIO_NET_S_ANNOUNCE;
 620
 621     /* Flush any MAC and VLAN filter table state */
 622     n->mac_table.in_use = 0;
 623     n->mac_table.first_multi = 0;
 624     n->mac_table.multi_overflow = 0;
 625     n->mac_table.uni_overflow = 0;
 626     memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
 627     memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
 628     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
 629     memset(n->vlans, 0, MAX_VLAN >> 3);
 630
 631     /* Flush any async TX */
 632     for (i = 0;  i < n->max_queue_pairs; i++) {
 633         flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
 634     }
 635 }
 636
 637 static void peer_test_vnet_hdr(VirtIONet *n)
 638 {
 639     NetClientState *nc = qemu_get_queue(n->nic);
 640     if (!nc->peer) {
 641         return;
 642     }
 643
 644     n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
 645 }
 646
 647 static int peer_has_vnet_hdr(VirtIONet *n)
 648 {
 649     return n->has_vnet_hdr;
 650 }
 651
 652 static int peer_has_ufo(VirtIONet *n)
 653 {
 654     if (!peer_has_vnet_hdr(n))
 655         return 0;
 656
 657     n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);
 658
 659     return n->has_ufo;
 660 }
 661
 662 static int peer_has_uso(VirtIONet *n)
 663 {
 664     if (!peer_has_vnet_hdr(n)) {
 665         return 0;
 666     }
 667
 668     return qemu_has_uso(qemu_get_queue(n->nic)->peer);
 669 }
 670
 671 static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
 672                                        int version_1, int hash_report)
 673 {
 674     int i;
 675     NetClientState *nc;
 676
 677     n->mergeable_rx_bufs = mergeable_rx_bufs;
 678
 679     if (version_1) {
 680         n->guest_hdr_len = hash_report ?
 681             sizeof(struct virtio_net_hdr_v1_hash) :
 682             sizeof(struct virtio_net_hdr_mrg_rxbuf);
 683         n->rss_data.populate_hash = !!hash_report;
 684     } else {
 685         n->guest_hdr_len = n->mergeable_rx_bufs ?
 686             sizeof(struct virtio_net_hdr_mrg_rxbuf) :
 687             sizeof(struct virtio_net_hdr);
 688     }
 689
 690     for (i = 0; i < n->max_queue_pairs; i++) {
 691         nc = qemu_get_subqueue(n->nic, i);
 692
 693         if (peer_has_vnet_hdr(n) &&
 694             qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
 695             qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
 696             n->host_hdr_len = n->guest_hdr_len;
 697         }
 698     }
 699 }
 700
 701 static int virtio_net_max_tx_queue_size(VirtIONet *n)
 702 {
 703     NetClientState *peer = n->nic_conf.peers.ncs[0];
 704
 705     /*
 706      * Backends other than vhost-user or vhost-vdpa don't support max queue
 707      * size.
 708      */
 709     if (!peer) {
 710         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
 711     }
 712
 713     switch(peer->info->type) {
 714     case NET_CLIENT_DRIVER_VHOST_USER:
 715     case NET_CLIENT_DRIVER_VHOST_VDPA:
 716         return VIRTQUEUE_MAX_SIZE;
 717     default:
 718         return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
 719     };
 720 }
 721
 722 static int peer_attach(VirtIONet *n, int index)
 723 {
 724     NetClientState *nc = qemu_get_subqueue(n->nic, index);
 725
 726     if (!nc->peer) {
 727         return 0;
 728     }
 729
 730     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
 731         vhost_set_vring_enable(nc->peer, 1);
 732     }
 733
 734     if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
 735         return 0;
 736     }
 737
 738     if (n->max_queue_pairs == 1) {
 739         return 0;
 740     }
 741
 742     return tap_enable(nc->peer);
 743 }
 744
 745 static int peer_detach(VirtIONet *n, int index)
 746 {
 747     NetClientState *nc = qemu_get_subqueue(n->nic, index);
 748
 749     if (!nc->peer) {
 750         return 0;
 751     }
 752
 753     if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
 754         vhost_set_vring_enable(nc->peer, 0);
 755     }
 756
 757     if (nc->peer->info->type !=  NET_CLIENT_DRIVER_TAP) {
 758         return 0;
 759     }
 760
 761     return tap_disable(nc->peer);
 762 }
 763
 764 static void virtio_net_set_queue_pairs(VirtIONet *n)
 765 {
 766     int i;
 767     int r;
 768
 769     if (n->nic->peer_deleted) {
 770         return;
 771     }
 772
 773     for (i = 0; i < n->max_queue_pairs; i++) {
 774         if (i < n->curr_queue_pairs) {
 775             r = peer_attach(n, i);
 776             assert(!r);
 777         } else {
 778             r = peer_detach(n, i);
 779             assert(!r);
 780         }
 781     }
 782 }
 783
 784 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
 785
 786 static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
 787                                         Error **errp)
 788 {
 789     VirtIONet *n = VIRTIO_NET(vdev);
 790     NetClientState *nc = qemu_get_queue(n->nic);
 791
 792     /* Firstly sync all virtio-net possible supported features */
 793     features |= n->host_features;
 794
 795     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
 796
 797     if (!peer_has_vnet_hdr(n)) {
 798         virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
 799         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
 800         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
 801         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);
 802
 803         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
 804         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
 805         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
 806         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);
 807
 808         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
 809         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
 810         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
 811
 812         virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
 813     }
 814
 815     if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
 816         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
 817         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
 818     }
 819
 820     if (!peer_has_uso(n)) {
 821         virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
 822         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
 823         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
 824     }
 825
 826     if (!get_vhost_net(nc->peer)) {
 827         return features;
 828     }
 829
 830     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
 831         virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
 832     }
 833     features = vhost_net_get_features(get_vhost_net(nc->peer), features);
 834     vdev->backend_features = features;
 835
 836     if (n->mtu_bypass_backend &&
 837             (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) {
 838         features |= (1ULL << VIRTIO_NET_F_MTU);
 839     }
 840
 841     /*
 842      * Since GUEST_ANNOUNCE is emulated the feature bit could be set without
 843      * enabled. This happens in the vDPA case.
 844      *
 845      * Make sure the feature set is not incoherent, as the driver could refuse
 846      * to start.
 847      *
 848      * TODO: QEMU is able to emulate a CVQ just for guest_announce purposes,
 849      * helping guest to notify the new location with vDPA devices that does not
 850      * support it.
 851      */
 852     if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) {
 853         virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE);
 854     }
 855
 856     return features;
 857 }
 858
 859 static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
 860 {
 861     uint64_t features = 0;
 862
 863     /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
 864      * but also these: */
 865     virtio_add_feature(&features, VIRTIO_NET_F_MAC);
 866     virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
 867     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
 868     virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
 869     virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);
 870
 871     return features;
 872 }
 873
 874 static void virtio_net_apply_guest_offloads(VirtIONet *n)
 875 {
 876     qemu_set_offload(qemu_get_queue(n->nic)->peer,
 877             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
 878             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
 879             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
 880             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
 881             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
 882             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
 883             !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
 884 }
 885
 886 static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
 887 {
 888     static const uint64_t guest_offloads_mask =
 889         (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
 890         (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
 891         (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
 892         (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
 893         (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
 894         (1ULL << VIRTIO_NET_F_GUEST_USO4) |
 895         (1ULL << VIRTIO_NET_F_GUEST_USO6);
 896
 897     return guest_offloads_mask & features;
 898 }
 899
 900 uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n)
 901 {
 902     VirtIODevice *vdev = VIRTIO_DEVICE(n);
 903     return virtio_net_guest_offloads_by_features(vdev->guest_features);
 904 }
 905
/*
 * Search context passed as the opaque argument to failover_set_primary()
 * while the bus tree is walked for a failover primary device.
 */
typedef struct {
    /* virtio-net device whose netclient_name is compared against */
    VirtIONet *n;
    /* filled in by failover_set_primary() with the matching device */
    DeviceState *dev;
} FailoverDevice;
 910
 911 /**
 912  * Set the failover primary device
 913  *
 914  * @opaque: FailoverId to setup
 915  * @opts: opts for device we are handling
 916  * @errp: returns an error if this function fails
 917  */
 918 static int failover_set_primary(DeviceState *dev, void *opaque)
 919 {
 920     FailoverDevice *fdev = opaque;
 921     PCIDevice *pci_dev = (PCIDevice *)
 922         object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE);
 923
 924     if (!pci_dev) {
 925         return 0;
 926     }
 927
 928     if (!g_strcmp0(pci_dev->failover_pair_id, fdev->n->netclient_name)) {
 929         fdev->dev = dev;
 930         return 1;
 931     }
 932
 933     return 0;
 934 }
 935
 936 /**
 937  * Find the primary device for this failover virtio-net
 938  *
 939  * @n: VirtIONet device
 940  * @errp: returns an error if this function fails
 941  */
 942 static DeviceState *failover_find_primary_device(VirtIONet *n)
 943 {
 944     FailoverDevice fdev = {
 945         .n = n,
 946     };
 947
 948     qbus_walk_children(sysbus_get_default(), failover_set_primary, NULL,
 949                        NULL, NULL, &fdev);
 950     return fdev.dev;
 951 }
 952
 953 static void failover_add_primary(VirtIONet *n, Error **errp)
 954 {
 955     Error *err = NULL;
 956     DeviceState *dev = failover_find_primary_device(n);
 957
 958     if (dev) {
 959         return;
 960     }
 961
 962     if (!n->primary_opts) {
 963         error_setg(errp, "Primary device not found");
 964         error_append_hint(errp, "Virtio-net failover will not work. Make "
 965                           "sure primary device has parameter"
 966                           " failover_pair_id=%s\n", n->netclient_name);
 967         return;
 968     }
 969
 970     dev = qdev_device_add_from_qdict(n->primary_opts,
 971                                      n->primary_opts_from_json,
 972                                      &err);
 973     if (err) {
 974         qobject_unref(n->primary_opts);
 975         n->primary_opts = NULL;
 976     } else {
 977         object_unref(OBJECT(dev));
 978     }
 979     error_propagate(errp, err);
 980 }
 981
/*
 * Apply a negotiated feature set to the device model and its backends.
 *
 * @vdev: the virtio-net device
 * @features: feature bits being set
 *
 * Updates multiqueue mode, receive header layout, RSC/RSS flags, guest
 * offloads, vhost acked features and the VLAN filter table, and kicks off
 * failover handling when VIRTIO_NET_F_STANDBY is present.
 */
static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    Error *err = NULL;
    int i;

    /*
     * When the MTU feature bypasses the backend and the backend itself does
     * not offer it, hide the bit from everything configured below.
     */
    if (n->mtu_bypass_backend &&
            !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) {
        features &= ~(1ULL << VIRTIO_NET_F_MTU);
    }

    /* either RSS or MQ switches the device into multiqueue mode */
    virtio_net_set_multiqueue(n,
                              virtio_has_feature(features, VIRTIO_NET_F_RSS) ||
                              virtio_has_feature(features, VIRTIO_NET_F_MQ));

    virtio_net_set_mrg_rx_bufs(n,
                               virtio_has_feature(features,
                                                  VIRTIO_NET_F_MRG_RXBUF),
                               virtio_has_feature(features,
                                                  VIRTIO_F_VERSION_1),
                               virtio_has_feature(features,
                                                  VIRTIO_NET_F_HASH_REPORT));

    /* RSC for a protocol needs both RSC_EXT and the matching GUEST_TSO bit */
    n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
        virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4);
    n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
        virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6);
    n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS);

    /* guest offloads are only tracked and applied when has_vnet_hdr is set */
    if (n->has_vnet_hdr) {
        n->curr_guest_offloads =
            virtio_net_guest_offloads_by_features(features);
        virtio_net_apply_guest_offloads(n);
    }

    /* forward the acked features to every queue pair backed by vhost */
    for (i = 0;  i < n->max_queue_pairs; i++) {
        NetClientState *nc = qemu_get_subqueue(n->nic, i);

        if (!get_vhost_net(nc->peer)) {
            continue;
        }
        vhost_net_ack_features(get_vhost_net(nc->peer), features);

        /*
         * keep acked_features in NetVhostUserState up-to-date so it
         * can't miss any features configured by guest virtio driver.
         */
        vhost_net_save_acked_features(nc->peer);
    }

    /*
     * With CTRL_VLAN the table starts empty (no bit set); without it every
     * bit is set, so receive_filter()'s VLAN check lets all VLAN ids pass.
     */
    if (virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
        memset(n->vlans, 0, MAX_VLAN >> 3);
    } else {
        memset(n->vlans, 0xff, MAX_VLAN >> 3);
    }

    /*
     * Failover: announce the negotiation, un-hide the primary and try to
     * add it.  A failure is only reported as a warning (and silently
     * dropped when running under qtest).
     */
    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
        qapi_event_send_failover_negotiated(n->netclient_name);
        qatomic_set(&n->failover_primary_hidden, false);
        failover_add_primary(n, &err);
        if (err) {
            if (!qtest_enabled()) {
                warn_report_err(err);
            } else {
                error_free(err);
            }
        }
    }
}
1051
1052 static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
1053                                      struct iovec *iov, unsigned int iov_cnt)
1054 {
1055     uint8_t on;
1056     size_t s;
1057     NetClientState *nc = qemu_get_queue(n->nic);
1058
1059     s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
1060     if (s != sizeof(on)) {
1061         return VIRTIO_NET_ERR;
1062     }
1063
1064     if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
1065         n->promisc = on;
1066     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
1067         n->allmulti = on;
1068     } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
1069         n->alluni = on;
1070     } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
1071         n->nomulti = on;
1072     } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
1073         n->nouni = on;
1074     } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
1075         n->nobcast = on;
1076     } else {
1077         return VIRTIO_NET_ERR;
1078     }
1079
1080     rxfilter_notify(nc);
1081
1082     return VIRTIO_NET_OK;
1083 }
1084
1085 static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
1086                                      struct iovec *iov, unsigned int iov_cnt)
1087 {
1088     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1089     uint64_t offloads;
1090     size_t s;
1091
1092     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
1093         return VIRTIO_NET_ERR;
1094     }
1095
1096     s = iov_to_buf(iov, iov_cnt, 0, &offloads, sizeof(offloads));
1097     if (s != sizeof(offloads)) {
1098         return VIRTIO_NET_ERR;
1099     }
1100
1101     if (cmd == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET) {
1102         uint64_t supported_offloads;
1103
1104         offloads = virtio_ldq_p(vdev, &offloads);
1105
1106         if (!n->has_vnet_hdr) {
1107             return VIRTIO_NET_ERR;
1108         }
1109
1110         n->rsc4_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1111             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO4);
1112         n->rsc6_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
1113             virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO6);
1114         virtio_clear_feature(&offloads, VIRTIO_NET_F_RSC_EXT);
1115
1116         supported_offloads = virtio_net_supported_guest_offloads(n);
1117         if (offloads & ~supported_offloads) {
1118             return VIRTIO_NET_ERR;
1119         }
1120
1121         n->curr_guest_offloads = offloads;
1122         virtio_net_apply_guest_offloads(n);
1123
1124         return VIRTIO_NET_OK;
1125     } else {
1126         return VIRTIO_NET_ERR;
1127     }
1128 }
1129
1130 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
1131                                  struct iovec *iov, unsigned int iov_cnt)
1132 {
1133     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1134     struct virtio_net_ctrl_mac mac_data;
1135     size_t s;
1136     NetClientState *nc = qemu_get_queue(n->nic);
1137
1138     if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
1139         if (iov_size(iov, iov_cnt) != sizeof(n->mac)) {
1140             return VIRTIO_NET_ERR;
1141         }
1142         s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac));
1143         assert(s == sizeof(n->mac));
1144         qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
1145         rxfilter_notify(nc);
1146
1147         return VIRTIO_NET_OK;
1148     }
1149
1150     if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
1151         return VIRTIO_NET_ERR;
1152     }
1153
1154     int in_use = 0;
1155     int first_multi = 0;
1156     uint8_t uni_overflow = 0;
1157     uint8_t multi_overflow = 0;
1158     uint8_t *macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
1159
1160     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1161                    sizeof(mac_data.entries));
1162     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1163     if (s != sizeof(mac_data.entries)) {
1164         goto error;
1165     }
1166     iov_discard_front(&iov, &iov_cnt, s);
1167
1168     if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
1169         goto error;
1170     }
1171
1172     if (mac_data.entries <= MAC_TABLE_ENTRIES) {
1173         s = iov_to_buf(iov, iov_cnt, 0, macs,
1174                        mac_data.entries * ETH_ALEN);
1175         if (s != mac_data.entries * ETH_ALEN) {
1176             goto error;
1177         }
1178         in_use += mac_data.entries;
1179     } else {
1180         uni_overflow = 1;
1181     }
1182
1183     iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);
1184
1185     first_multi = in_use;
1186
1187     s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
1188                    sizeof(mac_data.entries));
1189     mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
1190     if (s != sizeof(mac_data.entries)) {
1191         goto error;
1192     }
1193
1194     iov_discard_front(&iov, &iov_cnt, s);
1195
1196     if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
1197         goto error;
1198     }
1199
1200     if (mac_data.entries <= MAC_TABLE_ENTRIES - in_use) {
1201         s = iov_to_buf(iov, iov_cnt, 0, &macs[in_use * ETH_ALEN],
1202                        mac_data.entries * ETH_ALEN);
1203         if (s != mac_data.entries * ETH_ALEN) {
1204             goto error;
1205         }
1206         in_use += mac_data.entries;
1207     } else {
1208         multi_overflow = 1;
1209     }
1210
1211     n->mac_table.in_use = in_use;
1212     n->mac_table.first_multi = first_multi;
1213     n->mac_table.uni_overflow = uni_overflow;
1214     n->mac_table.multi_overflow = multi_overflow;
1215     memcpy(n->mac_table.macs, macs, MAC_TABLE_ENTRIES * ETH_ALEN);
1216     g_free(macs);
1217     rxfilter_notify(nc);
1218
1219     return VIRTIO_NET_OK;
1220
1221 error:
1222     g_free(macs);
1223     return VIRTIO_NET_ERR;
1224 }
1225
1226 static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd,
1227                                         struct iovec *iov, unsigned int iov_cnt)
1228 {
1229     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1230     uint16_t vid;
1231     size_t s;
1232     NetClientState *nc = qemu_get_queue(n->nic);
1233
1234     s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid));
1235     vid = virtio_lduw_p(vdev, &vid);
1236     if (s != sizeof(vid)) {
1237         return VIRTIO_NET_ERR;
1238     }
1239
1240     if (vid >= MAX_VLAN)
1241         return VIRTIO_NET_ERR;
1242
1243     if (cmd == VIRTIO_NET_CTRL_VLAN_ADD)
1244         n->vlans[vid >> 5] |= (1U << (vid & 0x1f));
1245     else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL)
1246         n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f));
1247     else
1248         return VIRTIO_NET_ERR;
1249
1250     rxfilter_notify(nc);
1251
1252     return VIRTIO_NET_OK;
1253 }
1254
1255 static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd,
1256                                       struct iovec *iov, unsigned int iov_cnt)
1257 {
1258     trace_virtio_net_handle_announce(n->announce_timer.round);
1259     if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK &&
1260         n->status & VIRTIO_NET_S_ANNOUNCE) {
1261         n->status &= ~VIRTIO_NET_S_ANNOUNCE;
1262         if (n->announce_timer.round) {
1263             qemu_announce_timer_step(&n->announce_timer);
1264         }
1265         return VIRTIO_NET_OK;
1266     } else {
1267         return VIRTIO_NET_ERR;
1268     }
1269 }
1270
1271 static void virtio_net_detach_epbf_rss(VirtIONet *n);
1272
1273 static void virtio_net_disable_rss(VirtIONet *n)
1274 {
1275     if (n->rss_data.enabled) {
1276         trace_virtio_net_rss_disable();
1277     }
1278     n->rss_data.enabled = false;
1279
1280     virtio_net_detach_epbf_rss(n);
1281 }
1282
1283 static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
1284 {
1285     NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
1286     if (nc == NULL || nc->info->set_steering_ebpf == NULL) {
1287         return false;
1288     }
1289
1290     return nc->info->set_steering_ebpf(nc, prog_fd);
1291 }
1292
1293 static void rss_data_to_rss_config(struct VirtioNetRssData *data,
1294                                    struct EBPFRSSConfig *config)
1295 {
1296     config->redirect = data->redirect;
1297     config->populate_hash = data->populate_hash;
1298     config->hash_types = data->hash_types;
1299     config->indirections_len = data->indirections_len;
1300     config->default_queue = data->default_queue;
1301 }
1302
1303 static bool virtio_net_attach_epbf_rss(VirtIONet *n)
1304 {
1305     struct EBPFRSSConfig config = {};
1306
1307     if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
1308         return false;
1309     }
1310
1311     rss_data_to_rss_config(&n->rss_data, &config);
1312
1313     if (!ebpf_rss_set_all(&n->ebpf_rss, &config,
1314                           n->rss_data.indirections_table, n->rss_data.key)) {
1315         return false;
1316     }
1317
1318     if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
1319         return false;
1320     }
1321
1322     return true;
1323 }
1324
1325 static void virtio_net_detach_epbf_rss(VirtIONet *n)
1326 {
1327     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1328 }
1329
1330 static bool virtio_net_load_ebpf(VirtIONet *n)
1331 {
1332     if (!virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
1333         /* backend doesn't support steering ebpf */
1334         return false;
1335     }
1336
1337     return ebpf_rss_load(&n->ebpf_rss);
1338 }
1339
1340 static void virtio_net_unload_ebpf(VirtIONet *n)
1341 {
1342     virtio_net_attach_ebpf_to_backend(n->nic, -1);
1343     ebpf_rss_unload(&n->ebpf_rss);
1344 }
1345
/*
 * Parse an RSS / hash-report configuration command and apply it.
 *
 * @n: the virtio-net device
 * @iov, @iov_cnt: command payload (struct virtio_net_rss_config layout:
 *                 fixed head, indirection table, then max queue count,
 *                 key length and key)
 * @do_rss: true for an RSS configuration (requires VIRTIO_NET_F_RSS),
 *          false for a hash-report-only configuration (requires
 *          VIRTIO_NET_F_HASH_REPORT)
 *
 * Returns the number of queue pairs to use on success.  On any error the
 * failure is traced, RSS is disabled and 0 is returned.
 *
 * NOTE(review): n->rss_data fields are overwritten while parsing, before all
 * validation has finished, so a rejected command can leave them partially
 * updated (with RSS disabled).
 */
static uint16_t virtio_net_handle_rss(VirtIONet *n,
                                      struct iovec *iov,
                                      unsigned int iov_cnt,
                                      bool do_rss)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    struct virtio_net_rss_config cfg;
    size_t s, offset = 0, size_get;
    uint16_t queue_pairs, i;
    /* trailer that follows the indirection table in the payload */
    struct {
        uint16_t us;
        uint8_t b;
    } QEMU_PACKED temp;
    const char *err_msg = "";
    uint32_t err_value = 0;

    if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) {
        err_msg = "RSS is not negotiated";
        goto error;
    }
    if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) {
        err_msg = "Hash report is not negotiated";
        goto error;
    }
    /* fixed head: everything in front of the indirection table */
    size_get = offsetof(struct virtio_net_rss_config, indirection_table);
    s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get);
    if (s != size_get) {
        err_msg = "Short command buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types);
    /* the guest supplies a mask; the table length is mask + 1 */
    n->rss_data.indirections_len =
        virtio_lduw_p(vdev, &cfg.indirection_table_mask);
    n->rss_data.indirections_len++;
    if (!do_rss) {
        /* hash-report only: a single-entry table is used */
        n->rss_data.indirections_len = 1;
    }
    if (!is_power_of_2(n->rss_data.indirections_len)) {
        err_msg = "Invalid size of indirection table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) {
        err_msg = "Too large indirection table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    n->rss_data.default_queue = do_rss ?
        virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0;
    if (n->rss_data.default_queue >= n->max_queue_pairs) {
        err_msg = "Invalid default queue";
        err_value = n->rss_data.default_queue;
        goto error;
    }
    /* replace the indirection table with one of the new length */
    offset += size_get;
    size_get = sizeof(uint16_t) * n->rss_data.indirections_len;
    g_free(n->rss_data.indirections_table);
    n->rss_data.indirections_table = g_malloc(size_get);
    /*
     * NOTE(review): GLib's g_malloc() is documented to abort on allocation
     * failure, so this branch looks unreachable - confirm.
     */
    if (!n->rss_data.indirections_table) {
        err_msg = "Can't allocate indirections table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    s = iov_to_buf(iov, iov_cnt, offset,
                   n->rss_data.indirections_table, size_get);
    if (s != size_get) {
        err_msg = "Short indirection table buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    /* convert every table entry from guest to host byte order */
    for (i = 0; i < n->rss_data.indirections_len; ++i) {
        uint16_t val = n->rss_data.indirections_table[i];
        n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val);
    }
    /* trailer: 16-bit queue count followed by the 8-bit key length */
    offset += size_get;
    size_get = sizeof(temp);
    s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get);
    if (s != size_get) {
        err_msg = "Can't get queue_pairs";
        err_value = (uint32_t)s;
        goto error;
    }
    /* without RSS the current number of queue pairs is kept */
    queue_pairs = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queue_pairs;
    if (queue_pairs == 0 || queue_pairs > n->max_queue_pairs) {
        err_msg = "Invalid number of queue_pairs";
        err_value = queue_pairs;
        goto error;
    }
    if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) {
        err_msg = "Invalid key size";
        err_value = temp.b;
        goto error;
    }
    if (!temp.b && n->rss_data.hash_types) {
        err_msg = "No key provided";
        err_value = 0;
        goto error;
    }
    /* no key and no hash types: the request just turns RSS off */
    if (!temp.b && !n->rss_data.hash_types) {
        virtio_net_disable_rss(n);
        return queue_pairs;
    }
    offset += size_get;
    size_get = temp.b;
    s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get);
    if (s != size_get) {
        err_msg = "Can get key buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    n->rss_data.enabled = true;

    if (!n->rss_data.populate_hash) {
        if (!virtio_net_attach_epbf_rss(n)) {
            /* EBPF must be loaded for vhost */
            if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
                warn_report("Can't load eBPF RSS for vhost");
                goto error;
            }
            /* fallback to software RSS */
            warn_report("Can't load eBPF RSS - fallback to software RSS");
            n->rss_data.enabled_software_rss = true;
        }
    } else {
        /* use software RSS for hash populating */
        /* and detach eBPF if was loaded before */
        virtio_net_detach_epbf_rss(n);
        n->rss_data.enabled_software_rss = true;
    }

    trace_virtio_net_rss_enable(n->rss_data.hash_types,
                                n->rss_data.indirections_len,
                                temp.b);
    return queue_pairs;
error:
    /* common failure exit: trace the reason and leave RSS disabled */
    trace_virtio_net_rss_error(err_msg, err_value);
    virtio_net_disable_rss(n);
    return 0;
}
1486
1487 static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
1488                                 struct iovec *iov, unsigned int iov_cnt)
1489 {
1490     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1491     uint16_t queue_pairs;
1492     NetClientState *nc = qemu_get_queue(n->nic);
1493
1494     virtio_net_disable_rss(n);
1495     if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) {
1496         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, false);
1497         return queue_pairs ? VIRTIO_NET_OK : VIRTIO_NET_ERR;
1498     }
1499     if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) {
1500         queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, true);
1501     } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) {
1502         struct virtio_net_ctrl_mq mq;
1503         size_t s;
1504         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
1505             return VIRTIO_NET_ERR;
1506         }
1507         s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq));
1508         if (s != sizeof(mq)) {
1509             return VIRTIO_NET_ERR;
1510         }
1511         queue_pairs = virtio_lduw_p(vdev, &mq.virtqueue_pairs);
1512
1513     } else {
1514         return VIRTIO_NET_ERR;
1515     }
1516
1517     if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1518         queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
1519         queue_pairs > n->max_queue_pairs ||
1520         !n->multiqueue) {
1521         return VIRTIO_NET_ERR;
1522     }
1523
1524     n->curr_queue_pairs = queue_pairs;
1525     if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
1526         /*
1527          * Avoid updating the backend for a vdpa device: We're only interested
1528          * in updating the device model queues.
1529          */
1530         return VIRTIO_NET_OK;
1531     }
1532     /* stop the backend before changing the number of queue_pairs to avoid handling a
1533      * disabled queue */
1534     virtio_net_set_status(vdev, vdev->status);
1535     virtio_net_set_queue_pairs(n);
1536
1537     return VIRTIO_NET_OK;
1538 }
1539
1540 size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
1541                                   const struct iovec *in_sg, unsigned in_num,
1542                                   const struct iovec *out_sg,
1543                                   unsigned out_num)
1544 {
1545     VirtIONet *n = VIRTIO_NET(vdev);
1546     struct virtio_net_ctrl_hdr ctrl;
1547     virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1548     size_t s;
1549     struct iovec *iov, *iov2;
1550
1551     if (iov_size(in_sg, in_num) < sizeof(status) ||
1552         iov_size(out_sg, out_num) < sizeof(ctrl)) {
1553         virtio_error(vdev, "virtio-net ctrl missing headers");
1554         return 0;
1555     }
1556
1557     iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
1558     s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
1559     iov_discard_front(&iov, &out_num, sizeof(ctrl));
1560     if (s != sizeof(ctrl)) {
1561         status = VIRTIO_NET_ERR;
1562     } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
1563         status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
1564     } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
1565         status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
1566     } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
1567         status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
1568     } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
1569         status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
1570     } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
1571         status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
1572     } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
1573         status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
1574     }
1575
1576     s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
1577     assert(s == sizeof(status));
1578
1579     g_free(iov2);
1580     return sizeof(status);
1581 }
1582
1583 static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
1584 {
1585     VirtQueueElement *elem;
1586
1587     for (;;) {
1588         size_t written;
1589         elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
1590         if (!elem) {
1591             break;
1592         }
1593
1594         written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
1595                                              elem->out_sg, elem->out_num);
1596         if (written > 0) {
1597             virtqueue_push(vq, elem, written);
1598             virtio_notify(vdev, vq);
1599             g_free(elem);
1600         } else {
1601             virtqueue_detach_element(vq, elem, 0);
1602             g_free(elem);
1603             break;
1604         }
1605     }
1606 }
1607
1608 /* RX */
1609
1610 static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
1611 {
1612     VirtIONet *n = VIRTIO_NET(vdev);
1613     int queue_index = vq2q(virtio_get_queue_index(vq));
1614
1615     qemu_flush_queued_packets(qemu_get_subqueue(n->nic, queue_index));
1616 }
1617
1618 static bool virtio_net_can_receive(NetClientState *nc)
1619 {
1620     VirtIONet *n = qemu_get_nic_opaque(nc);
1621     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1622     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1623
1624     if (!vdev->vm_running) {
1625         return false;
1626     }
1627
1628     if (nc->queue_index >= n->curr_queue_pairs) {
1629         return false;
1630     }
1631
1632     if (!virtio_queue_ready(q->rx_vq) ||
1633         !(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
1634         return false;
1635     }
1636
1637     return true;
1638 }
1639
1640 static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize)
1641 {
1642     VirtIONet *n = q->n;
1643     if (virtio_queue_empty(q->rx_vq) ||
1644         (n->mergeable_rx_bufs &&
1645          !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
1646         virtio_queue_set_notification(q->rx_vq, 1);
1647
1648         /* To avoid a race condition where the guest has made some buffers
1649          * available after the above check but before notification was
1650          * enabled, check for available buffers again.
1651          */
1652         if (virtio_queue_empty(q->rx_vq) ||
1653             (n->mergeable_rx_bufs &&
1654              !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
1655             return 0;
1656         }
1657     }
1658
1659     virtio_queue_set_notification(q->rx_vq, 0);
1660     return 1;
1661 }
1662
1663 static void virtio_net_hdr_swap(VirtIODevice *vdev, struct virtio_net_hdr *hdr)
1664 {
1665     virtio_tswap16s(vdev, &hdr->hdr_len);
1666     virtio_tswap16s(vdev, &hdr->gso_size);
1667     virtio_tswap16s(vdev, &hdr->csum_start);
1668     virtio_tswap16s(vdev, &hdr->csum_offset);
1669 }
1670
1671 /* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
1672  * it never finds out that the packets don't have valid checksums.  This
1673  * causes dhclient to get upset.  Fedora's carried a patch for ages to
1674  * fix this with Xen but it hasn't appeared in an upstream release of
1675  * dhclient yet.
1676  *
1677  * To avoid breaking existing guests, we catch udp packets and add
1678  * checksums.  This is terrible but it's better than hacking the guest
1679  * kernels.
1680  *
1681  * N.B. if we introduce a zero-copy API, this operation is no longer free so
1682  * we should provide a mechanism to disable it to avoid polluting the host
1683  * cache.
1684  */
1685 static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
1686                                         uint8_t *buf, size_t size)
1687 {
1688     if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
1689         (size > 27 && size < 1500) && /* normal sized MTU */
1690         (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
1691         (buf[23] == 17) && /* ip.protocol == UDP */
1692         (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
1693         net_checksum_calculate(buf, size, CSUM_UDP);
1694         hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
1695     }
1696 }
1697
1698 static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
1699                            const void *buf, size_t size)
1700 {
1701     if (n->has_vnet_hdr) {
1702         /* FIXME this cast is evil */
1703         void *wbuf = (void *)buf;
1704         work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
1705                                     size - n->host_hdr_len);
1706
1707         if (n->needs_vnet_hdr_swap) {
1708             virtio_net_hdr_swap(VIRTIO_DEVICE(n), wbuf);
1709         }
1710         iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
1711     } else {
1712         struct virtio_net_hdr hdr = {
1713             .flags = 0,
1714             .gso_type = VIRTIO_NET_HDR_GSO_NONE
1715         };
1716         iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
1717     }
1718 }
1719
1720 static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
1721 {
1722     static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
1723     static const uint8_t vlan[] = {0x81, 0x00};
1724     uint8_t *ptr = (uint8_t *)buf;
1725     int i;
1726
1727     if (n->promisc)
1728         return 1;
1729
1730     ptr += n->host_hdr_len;
1731
1732     if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
1733         int vid = lduw_be_p(ptr + 14) & 0xfff;
1734         if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f))))
1735             return 0;
1736     }
1737
1738     if (ptr[0] & 1) { // multicast
1739         if (!memcmp(ptr, bcast, sizeof(bcast))) {
1740             return !n->nobcast;
1741         } else if (n->nomulti) {
1742             return 0;
1743         } else if (n->allmulti || n->mac_table.multi_overflow) {
1744             return 1;
1745         }
1746
1747         for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
1748             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1749                 return 1;
1750             }
1751         }
1752     } else { // unicast
1753         if (n->nouni) {
1754             return 0;
1755         } else if (n->alluni || n->mac_table.uni_overflow) {
1756             return 1;
1757         } else if (!memcmp(ptr, n->mac, ETH_ALEN)) {
1758             return 1;
1759         }
1760
1761         for (i = 0; i < n->mac_table.first_multi; i++) {
1762             if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
1763                 return 1;
1764             }
1765         }
1766     }
1767
1768     return 0;
1769 }
1770
1771 static uint8_t virtio_net_get_hash_type(bool hasip4,
1772                                         bool hasip6,
1773                                         EthL4HdrProto l4hdr_proto,
1774                                         uint32_t types)
1775 {
1776     if (hasip4) {
1777         switch (l4hdr_proto) {
1778         case ETH_L4_HDR_PROTO_TCP:
1779             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
1780                 return NetPktRssIpV4Tcp;
1781             }
1782             break;
1783
1784         case ETH_L4_HDR_PROTO_UDP:
1785             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
1786                 return NetPktRssIpV4Udp;
1787             }
1788             break;
1789
1790         default:
1791             break;
1792         }
1793
1794         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
1795             return NetPktRssIpV4;
1796         }
1797     } else if (hasip6) {
1798         switch (l4hdr_proto) {
1799         case ETH_L4_HDR_PROTO_TCP:
1800             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
1801                 return NetPktRssIpV6TcpEx;
1802             }
1803             if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
1804                 return NetPktRssIpV6Tcp;
1805             }
1806             break;
1807
1808         case ETH_L4_HDR_PROTO_UDP:
1809             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
1810                 return NetPktRssIpV6UdpEx;
1811             }
1812             if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
1813                 return NetPktRssIpV6Udp;
1814             }
1815             break;
1816
1817         default:
1818             break;
1819         }
1820
1821         if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
1822             return NetPktRssIpV6Ex;
1823         }
1824         if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
1825             return NetPktRssIpV6;
1826         }
1827     }
1828     return 0xff;
1829 }
1830
1831 static void virtio_set_packet_hash(const uint8_t *buf, uint8_t report,
1832                                    uint32_t hash)
1833 {
1834     struct virtio_net_hdr_v1_hash *hdr = (void *)buf;
1835     hdr->hash_value = hash;
1836     hdr->hash_report = report;
1837 }
1838
1839 static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
1840                                   size_t size)
1841 {
1842     VirtIONet *n = qemu_get_nic_opaque(nc);
1843     unsigned int index = nc->queue_index, new_index = index;
1844     struct NetRxPkt *pkt = n->rx_pkt;
1845     uint8_t net_hash_type;
1846     uint32_t hash;
1847     bool hasip4, hasip6;
1848     EthL4HdrProto l4hdr_proto;
1849     static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
1850         VIRTIO_NET_HASH_REPORT_IPv4,
1851         VIRTIO_NET_HASH_REPORT_TCPv4,
1852         VIRTIO_NET_HASH_REPORT_TCPv6,
1853         VIRTIO_NET_HASH_REPORT_IPv6,
1854         VIRTIO_NET_HASH_REPORT_IPv6_EX,
1855         VIRTIO_NET_HASH_REPORT_TCPv6_EX,
1856         VIRTIO_NET_HASH_REPORT_UDPv4,
1857         VIRTIO_NET_HASH_REPORT_UDPv6,
1858         VIRTIO_NET_HASH_REPORT_UDPv6_EX
1859     };
1860     struct iovec iov = {
1861         .iov_base = (void *)buf,
1862         .iov_len = size
1863     };
1864
1865     net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len);
1866     net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
1867     net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto,
1868                                              n->rss_data.hash_types);
1869     if (net_hash_type > NetPktRssIpV6UdpEx) {
1870         if (n->rss_data.populate_hash) {
1871             virtio_set_packet_hash(buf, VIRTIO_NET_HASH_REPORT_NONE, 0);
1872         }
1873         return n->rss_data.redirect ? n->rss_data.default_queue : -1;
1874     }
1875
1876     hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);
1877
1878     if (n->rss_data.populate_hash) {
1879         virtio_set_packet_hash(buf, reports[net_hash_type], hash);
1880     }
1881
1882     if (n->rss_data.redirect) {
1883         new_index = hash & (n->rss_data.indirections_len - 1);
1884         new_index = n->rss_data.indirections_table[new_index];
1885     }
1886
1887     return (index == new_index) ? -1 : new_index;
1888 }
1889
1890 static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
1891                                       size_t size, bool no_rss)
1892 {
1893     VirtIONet *n = qemu_get_nic_opaque(nc);
1894     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
1895     VirtIODevice *vdev = VIRTIO_DEVICE(n);
1896     VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
1897     size_t lens[VIRTQUEUE_MAX_SIZE];
1898     struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
1899     struct virtio_net_hdr_mrg_rxbuf mhdr;
1900     unsigned mhdr_cnt = 0;
1901     size_t offset, i, guest_offset, j;
1902     ssize_t err;
1903
1904     if (!virtio_net_can_receive(nc)) {
1905         return -1;
1906     }
1907
1908     if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
1909         int index = virtio_net_process_rss(nc, buf, size);
1910         if (index >= 0) {
1911             NetClientState *nc2 = qemu_get_subqueue(n->nic, index);
1912             return virtio_net_receive_rcu(nc2, buf, size, true);
1913         }
1914     }
1915
1916     /* hdr_len refers to the header we supply to the guest */
1917     if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
1918         return 0;
1919     }
1920
1921     if (!receive_filter(n, buf, size))
1922         return size;
1923
1924     offset = i = 0;
1925
1926     while (offset < size) {
1927         VirtQueueElement *elem;
1928         int len, total;
1929         const struct iovec *sg;
1930
1931         total = 0;
1932
1933         if (i == VIRTQUEUE_MAX_SIZE) {
1934             virtio_error(vdev, "virtio-net unexpected long buffer chain");
1935             err = size;
1936             goto err;
1937         }
1938
1939         elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
1940         if (!elem) {
1941             if (i) {
1942                 virtio_error(vdev, "virtio-net unexpected empty queue: "
1943                              "i %zd mergeable %d offset %zd, size %zd, "
1944                              "guest hdr len %zd, host hdr len %zd "
1945                              "guest features 0x%" PRIx64,
1946                              i, n->mergeable_rx_bufs, offset, size,
1947                              n->guest_hdr_len, n->host_hdr_len,
1948                              vdev->guest_features);
1949             }
1950             err = -1;
1951             goto err;
1952         }
1953
1954         if (elem->in_num < 1) {
1955             virtio_error(vdev,
1956                          "virtio-net receive queue contains no in buffers");
1957             virtqueue_detach_element(q->rx_vq, elem, 0);
1958             g_free(elem);
1959             err = -1;
1960             goto err;
1961         }
1962
1963         sg = elem->in_sg;
1964         if (i == 0) {
1965             assert(offset == 0);
1966             if (n->mergeable_rx_bufs) {
1967                 mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
1968                                     sg, elem->in_num,
1969                                     offsetof(typeof(mhdr), num_buffers),
1970                                     sizeof(mhdr.num_buffers));
1971             }
1972
1973             receive_header(n, sg, elem->in_num, buf, size);
1974             if (n->rss_data.populate_hash) {
1975                 offset = sizeof(mhdr);
1976                 iov_from_buf(sg, elem->in_num, offset,
1977                              buf + offset, n->host_hdr_len - sizeof(mhdr));
1978             }
1979             offset = n->host_hdr_len;
1980             total += n->guest_hdr_len;
1981             guest_offset = n->guest_hdr_len;
1982         } else {
1983             guest_offset = 0;
1984         }
1985
1986         /* copy in packet.  ugh */
1987         len = iov_from_buf(sg, elem->in_num, guest_offset,
1988                            buf + offset, size - offset);
1989         total += len;
1990         offset += len;
1991         /* If buffers can't be merged, at this point we
1992          * must have consumed the complete packet.
1993          * Otherwise, drop it. */
1994         if (!n->mergeable_rx_bufs && offset < size) {
1995             virtqueue_unpop(q->rx_vq, elem, total);
1996             g_free(elem);
1997             err = size;
1998             goto err;
1999         }
2000
2001         elems[i] = elem;
2002         lens[i] = total;
2003         i++;
2004     }
2005
2006     if (mhdr_cnt) {
2007         virtio_stw_p(vdev, &mhdr.num_buffers, i);
2008         iov_from_buf(mhdr_sg, mhdr_cnt,
2009                      0,
2010                      &mhdr.num_buffers, sizeof mhdr.num_buffers);
2011     }
2012
2013     for (j = 0; j < i; j++) {
2014         /* signal other side */
2015         virtqueue_fill(q->rx_vq, elems[j], lens[j], j);
2016         g_free(elems[j]);
2017     }
2018
2019     virtqueue_flush(q->rx_vq, i);
2020     virtio_notify(vdev, q->rx_vq);
2021
2022     return size;
2023
2024 err:
2025     for (j = 0; j < i; j++) {
2026         virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
2027         g_free(elems[j]);
2028     }
2029
2030     return err;
2031 }
2032
2033 static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf,
2034                                   size_t size)
2035 {
2036     RCU_READ_LOCK_GUARD();
2037
2038     return virtio_net_receive_rcu(nc, buf, size, false);
2039 }
2040
2041 static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain,
2042                                          const uint8_t *buf,
2043                                          VirtioNetRscUnit *unit)
2044 {
2045     uint16_t ip_hdrlen;
2046     struct ip_header *ip;
2047
2048     ip = (struct ip_header *)(buf + chain->n->guest_hdr_len
2049                               + sizeof(struct eth_header));
2050     unit->ip = (void *)ip;
2051     ip_hdrlen = (ip->ip_ver_len & 0xF) << 2;
2052     unit->ip_plen = &ip->ip_len;
2053     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
2054     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2055     unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
2056 }
2057
2058 static void virtio_net_rsc_extract_unit6(VirtioNetRscChain *chain,
2059                                          const uint8_t *buf,
2060                                          VirtioNetRscUnit *unit)
2061 {
2062     struct ip6_header *ip6;
2063
2064     ip6 = (struct ip6_header *)(buf + chain->n->guest_hdr_len
2065                                  + sizeof(struct eth_header));
2066     unit->ip = ip6;
2067     unit->ip_plen = &(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2068     unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip)
2069                                         + sizeof(struct ip6_header));
2070     unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2071
2072     /* There is a difference between payload length in ipv4 and v6,
2073        ip header is excluded in ipv6 */
2074     unit->payload = htons(*unit->ip_plen) - unit->tcp_hdrlen;
2075 }
2076
2077 static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain,
2078                                        VirtioNetRscSeg *seg)
2079 {
2080     int ret;
2081     struct virtio_net_hdr_v1 *h;
2082
2083     h = (struct virtio_net_hdr_v1 *)seg->buf;
2084     h->flags = 0;
2085     h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
2086
2087     if (seg->is_coalesced) {
2088         h->rsc.segments = seg->packets;
2089         h->rsc.dup_acks = seg->dup_ack;
2090         h->flags = VIRTIO_NET_HDR_F_RSC_INFO;
2091         if (chain->proto == ETH_P_IP) {
2092             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2093         } else {
2094             h->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2095         }
2096     }
2097
2098     ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
2099     QTAILQ_REMOVE(&chain->buffers, seg, next);
2100     g_free(seg->buf);
2101     g_free(seg);
2102
2103     return ret;
2104 }
2105
2106 static void virtio_net_rsc_purge(void *opq)
2107 {
2108     VirtioNetRscSeg *seg, *rn;
2109     VirtioNetRscChain *chain = (VirtioNetRscChain *)opq;
2110
2111     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
2112         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2113             chain->stat.purge_failed++;
2114             continue;
2115         }
2116     }
2117
2118     chain->stat.timer++;
2119     if (!QTAILQ_EMPTY(&chain->buffers)) {
2120         timer_mod(chain->drain_timer,
2121               qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2122     }
2123 }
2124
2125 static void virtio_net_rsc_cleanup(VirtIONet *n)
2126 {
2127     VirtioNetRscChain *chain, *rn_chain;
2128     VirtioNetRscSeg *seg, *rn_seg;
2129
2130     QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
2131         QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
2132             QTAILQ_REMOVE(&chain->buffers, seg, next);
2133             g_free(seg->buf);
2134             g_free(seg);
2135         }
2136
2137         timer_free(chain->drain_timer);
2138         QTAILQ_REMOVE(&n->rsc_chains, chain, next);
2139         g_free(chain);
2140     }
2141 }
2142
2143 static void virtio_net_rsc_cache_buf(VirtioNetRscChain *chain,
2144                                      NetClientState *nc,
2145                                      const uint8_t *buf, size_t size)
2146 {
2147     uint16_t hdr_len;
2148     VirtioNetRscSeg *seg;
2149
2150     hdr_len = chain->n->guest_hdr_len;
2151     seg = g_new(VirtioNetRscSeg, 1);
2152     seg->buf = g_malloc(hdr_len + sizeof(struct eth_header)
2153         + sizeof(struct ip6_header) + VIRTIO_NET_MAX_TCP_PAYLOAD);
2154     memcpy(seg->buf, buf, size);
2155     seg->size = size;
2156     seg->packets = 1;
2157     seg->dup_ack = 0;
2158     seg->is_coalesced = 0;
2159     seg->nc = nc;
2160
2161     QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
2162     chain->stat.cache++;
2163
2164     switch (chain->proto) {
2165     case ETH_P_IP:
2166         virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
2167         break;
2168     case ETH_P_IPV6:
2169         virtio_net_rsc_extract_unit6(chain, seg->buf, &seg->unit);
2170         break;
2171     default:
2172         g_assert_not_reached();
2173     }
2174 }
2175
2176 static int32_t virtio_net_rsc_handle_ack(VirtioNetRscChain *chain,
2177                                          VirtioNetRscSeg *seg,
2178                                          const uint8_t *buf,
2179                                          struct tcp_header *n_tcp,
2180                                          struct tcp_header *o_tcp)
2181 {
2182     uint32_t nack, oack;
2183     uint16_t nwin, owin;
2184
2185     nack = htonl(n_tcp->th_ack);
2186     nwin = htons(n_tcp->th_win);
2187     oack = htonl(o_tcp->th_ack);
2188     owin = htons(o_tcp->th_win);
2189
2190     if ((nack - oack) >= VIRTIO_NET_MAX_TCP_PAYLOAD) {
2191         chain->stat.ack_out_of_win++;
2192         return RSC_FINAL;
2193     } else if (nack == oack) {
2194         /* duplicated ack or window probe */
2195         if (nwin == owin) {
2196             /* duplicated ack, add dup ack count due to whql test up to 1 */
2197             chain->stat.dup_ack++;
2198             return RSC_FINAL;
2199         } else {
2200             /* Coalesce window update */
2201             o_tcp->th_win = n_tcp->th_win;
2202             chain->stat.win_update++;
2203             return RSC_COALESCE;
2204         }
2205     } else {
2206         /* pure ack, go to 'C', finalize*/
2207         chain->stat.pure_ack++;
2208         return RSC_FINAL;
2209     }
2210 }
2211
2212 static int32_t virtio_net_rsc_coalesce_data(VirtioNetRscChain *chain,
2213                                             VirtioNetRscSeg *seg,
2214                                             const uint8_t *buf,
2215                                             VirtioNetRscUnit *n_unit)
2216 {
2217     void *data;
2218     uint16_t o_ip_len;
2219     uint32_t nseq, oseq;
2220     VirtioNetRscUnit *o_unit;
2221
2222     o_unit = &seg->unit;
2223     o_ip_len = htons(*o_unit->ip_plen);
2224     nseq = htonl(n_unit->tcp->th_seq);
2225     oseq = htonl(o_unit->tcp->th_seq);
2226
2227     /* out of order or retransmitted. */
2228     if ((nseq - oseq) > VIRTIO_NET_MAX_TCP_PAYLOAD) {
2229         chain->stat.data_out_of_win++;
2230         return RSC_FINAL;
2231     }
2232
2233     data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
2234     if (nseq == oseq) {
2235         if ((o_unit->payload == 0) && n_unit->payload) {
2236             /* From no payload to payload, normal case, not a dup ack or etc */
2237             chain->stat.data_after_pure_ack++;
2238             goto coalesce;
2239         } else {
2240             return virtio_net_rsc_handle_ack(chain, seg, buf,
2241                                              n_unit->tcp, o_unit->tcp);
2242         }
2243     } else if ((nseq - oseq) != o_unit->payload) {
2244         /* Not a consistent packet, out of order */
2245         chain->stat.data_out_of_order++;
2246         return RSC_FINAL;
2247     } else {
2248 coalesce:
2249         if ((o_ip_len + n_unit->payload) > chain->max_payload) {
2250             chain->stat.over_size++;
2251             return RSC_FINAL;
2252         }
2253
2254         /* Here comes the right data, the payload length in v4/v6 is different,
2255            so use the field value to update and record the new data len */
2256         o_unit->payload += n_unit->payload; /* update new data len */
2257
2258         /* update field in ip header */
2259         *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
2260
2261         /* Bring 'PUSH' big, the whql test guide says 'PUSH' can be coalesced
2262            for windows guest, while this may change the behavior for linux
2263            guest (only if it uses RSC feature). */
2264         o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
2265
2266         o_unit->tcp->th_ack = n_unit->tcp->th_ack;
2267         o_unit->tcp->th_win = n_unit->tcp->th_win;
2268
2269         memmove(seg->buf + seg->size, data, n_unit->payload);
2270         seg->size += n_unit->payload;
2271         seg->packets++;
2272         chain->stat.coalesced++;
2273         return RSC_COALESCE;
2274     }
2275 }
2276
2277 static int32_t virtio_net_rsc_coalesce4(VirtioNetRscChain *chain,
2278                                         VirtioNetRscSeg *seg,
2279                                         const uint8_t *buf, size_t size,
2280                                         VirtioNetRscUnit *unit)
2281 {
2282     struct ip_header *ip1, *ip2;
2283
2284     ip1 = (struct ip_header *)(unit->ip);
2285     ip2 = (struct ip_header *)(seg->unit.ip);
2286     if ((ip1->ip_src ^ ip2->ip_src) || (ip1->ip_dst ^ ip2->ip_dst)
2287         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2288         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2289         chain->stat.no_match++;
2290         return RSC_NO_MATCH;
2291     }
2292
2293     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2294 }
2295
2296 static int32_t virtio_net_rsc_coalesce6(VirtioNetRscChain *chain,
2297                                         VirtioNetRscSeg *seg,
2298                                         const uint8_t *buf, size_t size,
2299                                         VirtioNetRscUnit *unit)
2300 {
2301     struct ip6_header *ip1, *ip2;
2302
2303     ip1 = (struct ip6_header *)(unit->ip);
2304     ip2 = (struct ip6_header *)(seg->unit.ip);
2305     if (memcmp(&ip1->ip6_src, &ip2->ip6_src, sizeof(struct in6_address))
2306         || memcmp(&ip1->ip6_dst, &ip2->ip6_dst, sizeof(struct in6_address))
2307         || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2308         || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2309             chain->stat.no_match++;
2310             return RSC_NO_MATCH;
2311     }
2312
2313     return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2314 }
2315
2316 /* Packets with 'SYN' should bypass, other flag should be sent after drain
2317  * to prevent out of order */
2318 static int virtio_net_rsc_tcp_ctrl_check(VirtioNetRscChain *chain,
2319                                          struct tcp_header *tcp)
2320 {
2321     uint16_t tcp_hdr;
2322     uint16_t tcp_flag;
2323
2324     tcp_flag = htons(tcp->th_offset_flags);
2325     tcp_hdr = (tcp_flag & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2326     tcp_flag &= VIRTIO_NET_TCP_FLAG;
2327     if (tcp_flag & TH_SYN) {
2328         chain->stat.tcp_syn++;
2329         return RSC_BYPASS;
2330     }
2331
2332     if (tcp_flag & (TH_FIN | TH_URG | TH_RST | TH_ECE | TH_CWR)) {
2333         chain->stat.tcp_ctrl_drain++;
2334         return RSC_FINAL;
2335     }
2336
2337     if (tcp_hdr > sizeof(struct tcp_header)) {
2338         chain->stat.tcp_all_opt++;
2339         return RSC_FINAL;
2340     }
2341
2342     return RSC_CANDIDATE;
2343 }
2344
2345 static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain,
2346                                          NetClientState *nc,
2347                                          const uint8_t *buf, size_t size,
2348                                          VirtioNetRscUnit *unit)
2349 {
2350     int ret;
2351     VirtioNetRscSeg *seg, *nseg;
2352
2353     if (QTAILQ_EMPTY(&chain->buffers)) {
2354         chain->stat.empty_cache++;
2355         virtio_net_rsc_cache_buf(chain, nc, buf, size);
2356         timer_mod(chain->drain_timer,
2357               qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2358         return size;
2359     }
2360
2361     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2362         if (chain->proto == ETH_P_IP) {
2363             ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
2364         } else {
2365             ret = virtio_net_rsc_coalesce6(chain, seg, buf, size, unit);
2366         }
2367
2368         if (ret == RSC_FINAL) {
2369             if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2370                 /* Send failed */
2371                 chain->stat.final_failed++;
2372                 return 0;
2373             }
2374
2375             /* Send current packet */
2376             return virtio_net_do_receive(nc, buf, size);
2377         } else if (ret == RSC_NO_MATCH) {
2378             continue;
2379         } else {
2380             /* Coalesced, mark coalesced flag to tell calc cksum for ipv4 */
2381             seg->is_coalesced = 1;
2382             return size;
2383         }
2384     }
2385
2386     chain->stat.no_match_cache++;
2387     virtio_net_rsc_cache_buf(chain, nc, buf, size);
2388     return size;
2389 }
2390
2391 /* Drain a connection data, this is to avoid out of order segments */
2392 static size_t virtio_net_rsc_drain_flow(VirtioNetRscChain *chain,
2393                                         NetClientState *nc,
2394                                         const uint8_t *buf, size_t size,
2395                                         uint16_t ip_start, uint16_t ip_size,
2396                                         uint16_t tcp_port)
2397 {
2398     VirtioNetRscSeg *seg, *nseg;
2399     uint32_t ppair1, ppair2;
2400
2401     ppair1 = *(uint32_t *)(buf + tcp_port);
2402     QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2403         ppair2 = *(uint32_t *)(seg->buf + tcp_port);
2404         if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
2405             || (ppair1 != ppair2)) {
2406             continue;
2407         }
2408         if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2409             chain->stat.drain_failed++;
2410         }
2411
2412         break;
2413     }
2414
2415     return virtio_net_do_receive(nc, buf, size);
2416 }
2417
2418 static int32_t virtio_net_rsc_sanity_check4(VirtioNetRscChain *chain,
2419                                             struct ip_header *ip,
2420                                             const uint8_t *buf, size_t size)
2421 {
2422     uint16_t ip_len;
2423
2424     /* Not an ipv4 packet */
2425     if (((ip->ip_ver_len & 0xF0) >> 4) != IP_HEADER_VERSION_4) {
2426         chain->stat.ip_option++;
2427         return RSC_BYPASS;
2428     }
2429
2430     /* Don't handle packets with ip option */
2431     if ((ip->ip_ver_len & 0xF) != VIRTIO_NET_IP4_HEADER_LENGTH) {
2432         chain->stat.ip_option++;
2433         return RSC_BYPASS;
2434     }
2435
2436     if (ip->ip_p != IPPROTO_TCP) {
2437         chain->stat.bypass_not_tcp++;
2438         return RSC_BYPASS;
2439     }
2440
2441     /* Don't handle packets with ip fragment */
2442     if (!(htons(ip->ip_off) & IP_DF)) {
2443         chain->stat.ip_frag++;
2444         return RSC_BYPASS;
2445     }
2446
2447     /* Don't handle packets with ecn flag */
2448     if (IPTOS_ECN(ip->ip_tos)) {
2449         chain->stat.ip_ecn++;
2450         return RSC_BYPASS;
2451     }
2452
2453     ip_len = htons(ip->ip_len);
2454     if (ip_len < (sizeof(struct ip_header) + sizeof(struct tcp_header))
2455         || ip_len > (size - chain->n->guest_hdr_len -
2456                      sizeof(struct eth_header))) {
2457         chain->stat.ip_hacked++;
2458         return RSC_BYPASS;
2459     }
2460
2461     return RSC_CANDIDATE;
2462 }
2463
2464 static size_t virtio_net_rsc_receive4(VirtioNetRscChain *chain,
2465                                       NetClientState *nc,
2466                                       const uint8_t *buf, size_t size)
2467 {
2468     int32_t ret;
2469     uint16_t hdr_len;
2470     VirtioNetRscUnit unit;
2471
2472     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2473
2474     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header)
2475         + sizeof(struct tcp_header))) {
2476         chain->stat.bypass_not_tcp++;
2477         return virtio_net_do_receive(nc, buf, size);
2478     }
2479
2480     virtio_net_rsc_extract_unit4(chain, buf, &unit);
2481     if (virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)
2482         != RSC_CANDIDATE) {
2483         return virtio_net_do_receive(nc, buf, size);
2484     }
2485
2486     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2487     if (ret == RSC_BYPASS) {
2488         return virtio_net_do_receive(nc, buf, size);
2489     } else if (ret == RSC_FINAL) {
2490         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2491                 ((hdr_len + sizeof(struct eth_header)) + 12),
2492                 VIRTIO_NET_IP4_ADDR_SIZE,
2493                 hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header));
2494     }
2495
2496     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2497 }
2498
2499 static int32_t virtio_net_rsc_sanity_check6(VirtioNetRscChain *chain,
2500                                             struct ip6_header *ip6,
2501                                             const uint8_t *buf, size_t size)
2502 {
2503     uint16_t ip_len;
2504
2505     if (((ip6->ip6_ctlun.ip6_un1.ip6_un1_flow & 0xF0) >> 4)
2506         != IP_HEADER_VERSION_6) {
2507         return RSC_BYPASS;
2508     }
2509
2510     /* Both option and protocol is checked in this */
2511     if (ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt != IPPROTO_TCP) {
2512         chain->stat.bypass_not_tcp++;
2513         return RSC_BYPASS;
2514     }
2515
2516     ip_len = htons(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2517     if (ip_len < sizeof(struct tcp_header) ||
2518         ip_len > (size - chain->n->guest_hdr_len - sizeof(struct eth_header)
2519                   - sizeof(struct ip6_header))) {
2520         chain->stat.ip_hacked++;
2521         return RSC_BYPASS;
2522     }
2523
2524     /* Don't handle packets with ecn flag */
2525     if (IP6_ECN(ip6->ip6_ctlun.ip6_un3.ip6_un3_ecn)) {
2526         chain->stat.ip_ecn++;
2527         return RSC_BYPASS;
2528     }
2529
2530     return RSC_CANDIDATE;
2531 }
2532
2533 static size_t virtio_net_rsc_receive6(void *opq, NetClientState *nc,
2534                                       const uint8_t *buf, size_t size)
2535 {
2536     int32_t ret;
2537     uint16_t hdr_len;
2538     VirtioNetRscChain *chain;
2539     VirtioNetRscUnit unit;
2540
2541     chain = opq;
2542     hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2543
2544     if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip6_header)
2545         + sizeof(tcp_header))) {
2546         return virtio_net_do_receive(nc, buf, size);
2547     }
2548
2549     virtio_net_rsc_extract_unit6(chain, buf, &unit);
2550     if (RSC_CANDIDATE != virtio_net_rsc_sanity_check6(chain,
2551                                                  unit.ip, buf, size)) {
2552         return virtio_net_do_receive(nc, buf, size);
2553     }
2554
2555     ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2556     if (ret == RSC_BYPASS) {
2557         return virtio_net_do_receive(nc, buf, size);
2558     } else if (ret == RSC_FINAL) {
2559         return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2560                 ((hdr_len + sizeof(struct eth_header)) + 8),
2561                 VIRTIO_NET_IP6_ADDR_SIZE,
2562                 hdr_len + sizeof(struct eth_header)
2563                 + sizeof(struct ip6_header));
2564     }
2565
2566     return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2567 }
2568
2569 static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n,
2570                                                       NetClientState *nc,
2571                                                       uint16_t proto)
2572 {
2573     VirtioNetRscChain *chain;
2574
2575     if ((proto != (uint16_t)ETH_P_IP) && (proto != (uint16_t)ETH_P_IPV6)) {
2576         return NULL;
2577     }
2578
2579     QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
2580         if (chain->proto == proto) {
2581             return chain;
2582         }
2583     }
2584
2585     chain = g_malloc(sizeof(*chain));
2586     chain->n = n;
2587     chain->proto = proto;
2588     if (proto == (uint16_t)ETH_P_IP) {
2589         chain->max_payload = VIRTIO_NET_MAX_IP4_PAYLOAD;
2590         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2591     } else {
2592         chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD;
2593         chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2594     }
2595     chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST,
2596                                       virtio_net_rsc_purge, chain);
2597     memset(&chain->stat, 0, sizeof(chain->stat));
2598
2599     QTAILQ_INIT(&chain->buffers);
2600     QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
2601
2602     return chain;
2603 }
2604
2605 static ssize_t virtio_net_rsc_receive(NetClientState *nc,
2606                                       const uint8_t *buf,
2607                                       size_t size)
2608 {
2609     uint16_t proto;
2610     VirtioNetRscChain *chain;
2611     struct eth_header *eth;
2612     VirtIONet *n;
2613
2614     n = qemu_get_nic_opaque(nc);
2615     if (size < (n->host_hdr_len + sizeof(struct eth_header))) {
2616         return virtio_net_do_receive(nc, buf, size);
2617     }
2618
2619     eth = (struct eth_header *)(buf + n->guest_hdr_len);
2620     proto = htons(eth->h_proto);
2621
2622     chain = virtio_net_rsc_lookup_chain(n, nc, proto);
2623     if (chain) {
2624         chain->stat.received++;
2625         if (proto == (uint16_t)ETH_P_IP && n->rsc4_enabled) {
2626             return virtio_net_rsc_receive4(chain, nc, buf, size);
2627         } else if (proto == (uint16_t)ETH_P_IPV6 && n->rsc6_enabled) {
2628             return virtio_net_rsc_receive6(chain, nc, buf, size);
2629         }
2630     }
2631     return virtio_net_do_receive(nc, buf, size);
2632 }
2633
2634 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
2635                                   size_t size)
2636 {
2637     VirtIONet *n = qemu_get_nic_opaque(nc);
2638     if ((n->rsc4_enabled || n->rsc6_enabled)) {
2639         return virtio_net_rsc_receive(nc, buf, size);
2640     } else {
2641         return virtio_net_do_receive(nc, buf, size);
2642     }
2643 }
2644
2645 static int32_t virtio_net_flush_tx(VirtIONetQueue *q);
2646
2647 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
2648 {
2649     VirtIONet *n = qemu_get_nic_opaque(nc);
2650     VirtIONetQueue *q = virtio_net_get_subqueue(nc);
2651     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2652     int ret;
2653
2654     virtqueue_push(q->tx_vq, q->async_tx.elem, 0);
2655     virtio_notify(vdev, q->tx_vq);
2656
2657     g_free(q->async_tx.elem);
2658     q->async_tx.elem = NULL;
2659
2660     virtio_queue_set_notification(q->tx_vq, 1);
2661     ret = virtio_net_flush_tx(q);
2662     if (ret >= n->tx_burst) {
2663         /*
2664          * the flush has been stopped by tx_burst
2665          * we will not receive notification for the
2666          * remainining part, so re-schedule
2667          */
2668         virtio_queue_set_notification(q->tx_vq, 0);
2669         if (q->tx_bh) {
2670             qemu_bh_schedule(q->tx_bh);
2671         } else {
2672             timer_mod(q->tx_timer,
2673                       qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2674         }
2675         q->tx_waiting = 1;
2676     }
2677 }
2678
2679 /* TX */
2680 static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
2681 {
2682     VirtIONet *n = q->n;
2683     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2684     VirtQueueElement *elem;
2685     int32_t num_packets = 0;
2686     int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
2687     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2688         return num_packets;
2689     }
2690
2691     if (q->async_tx.elem) {
2692         virtio_queue_set_notification(q->tx_vq, 0);
2693         return num_packets;
2694     }
2695
2696     for (;;) {
2697         ssize_t ret;
2698         unsigned int out_num;
2699         struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
2700         struct virtio_net_hdr_mrg_rxbuf mhdr;
2701
2702         elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
2703         if (!elem) {
2704             break;
2705         }
2706
2707         out_num = elem->out_num;
2708         out_sg = elem->out_sg;
2709         if (out_num < 1) {
2710             virtio_error(vdev, "virtio-net header not in first element");
2711             virtqueue_detach_element(q->tx_vq, elem, 0);
2712             g_free(elem);
2713             return -EINVAL;
2714         }
2715
2716         if (n->has_vnet_hdr) {
2717             if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) <
2718                 n->guest_hdr_len) {
2719                 virtio_error(vdev, "virtio-net header incorrect");
2720                 virtqueue_detach_element(q->tx_vq, elem, 0);
2721                 g_free(elem);
2722                 return -EINVAL;
2723             }
2724             if (n->needs_vnet_hdr_swap) {
2725                 virtio_net_hdr_swap(vdev, (void *) &mhdr);
2726                 sg2[0].iov_base = &mhdr;
2727                 sg2[0].iov_len = n->guest_hdr_len;
2728                 out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
2729                                    out_sg, out_num,
2730                                    n->guest_hdr_len, -1);
2731                 if (out_num == VIRTQUEUE_MAX_SIZE) {
2732                     goto drop;
2733                 }
2734                 out_num += 1;
2735                 out_sg = sg2;
2736             }
2737         }
2738         /*
2739          * If host wants to see the guest header as is, we can
2740          * pass it on unchanged. Otherwise, copy just the parts
2741          * that host is interested in.
2742          */
2743         assert(n->host_hdr_len <= n->guest_hdr_len);
2744         if (n->host_hdr_len != n->guest_hdr_len) {
2745             unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
2746                                        out_sg, out_num,
2747                                        0, n->host_hdr_len);
2748             sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
2749                              out_sg, out_num,
2750                              n->guest_hdr_len, -1);
2751             out_num = sg_num;
2752             out_sg = sg;
2753         }
2754
2755         ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
2756                                       out_sg, out_num, virtio_net_tx_complete);
2757         if (ret == 0) {
2758             virtio_queue_set_notification(q->tx_vq, 0);
2759             q->async_tx.elem = elem;
2760             return -EBUSY;
2761         }
2762
2763 drop:
2764         virtqueue_push(q->tx_vq, elem, 0);
2765         virtio_notify(vdev, q->tx_vq);
2766         g_free(elem);
2767
2768         if (++num_packets >= n->tx_burst) {
2769             break;
2770         }
2771     }
2772     return num_packets;
2773 }
2774
2775 static void virtio_net_tx_timer(void *opaque);
2776
2777 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
2778 {
2779     VirtIONet *n = VIRTIO_NET(vdev);
2780     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2781
2782     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2783         virtio_net_drop_tx_queue_data(vdev, vq);
2784         return;
2785     }
2786
2787     /* This happens when device was stopped but VCPU wasn't. */
2788     if (!vdev->vm_running) {
2789         q->tx_waiting = 1;
2790         return;
2791     }
2792
2793     if (q->tx_waiting) {
2794         /* We already have queued packets, immediately flush */
2795         timer_del(q->tx_timer);
2796         virtio_net_tx_timer(q);
2797     } else {
2798         /* re-arm timer to flush it (and more) on next tick */
2799         timer_mod(q->tx_timer,
2800                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2801         q->tx_waiting = 1;
2802         virtio_queue_set_notification(vq, 0);
2803     }
2804 }
2805
2806 static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
2807 {
2808     VirtIONet *n = VIRTIO_NET(vdev);
2809     VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2810
2811     if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2812         virtio_net_drop_tx_queue_data(vdev, vq);
2813         return;
2814     }
2815
2816     if (unlikely(q->tx_waiting)) {
2817         return;
2818     }
2819     q->tx_waiting = 1;
2820     /* This happens when device was stopped but VCPU wasn't. */
2821     if (!vdev->vm_running) {
2822         return;
2823     }
2824     virtio_queue_set_notification(vq, 0);
2825     qemu_bh_schedule(q->tx_bh);
2826 }
2827
2828 static void virtio_net_tx_timer(void *opaque)
2829 {
2830     VirtIONetQueue *q = opaque;
2831     VirtIONet *n = q->n;
2832     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2833     int ret;
2834
2835     /* This happens when device was stopped but BH wasn't. */
2836     if (!vdev->vm_running) {
2837         /* Make sure tx waiting is set, so we'll run when restarted. */
2838         assert(q->tx_waiting);
2839         return;
2840     }
2841
2842     q->tx_waiting = 0;
2843
2844     /* Just in case the driver is not ready on more */
2845     if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2846         return;
2847     }
2848
2849     ret = virtio_net_flush_tx(q);
2850     if (ret == -EBUSY || ret == -EINVAL) {
2851         return;
2852     }
2853     /*
2854      * If we flush a full burst of packets, assume there are
2855      * more coming and immediately rearm
2856      */
2857     if (ret >= n->tx_burst) {
2858         q->tx_waiting = 1;
2859         timer_mod(q->tx_timer,
2860                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2861         return;
2862     }
2863     /*
2864      * If less than a full burst, re-enable notification and flush
2865      * anything that may have come in while we weren't looking.  If
2866      * we find something, assume the guest is still active and rearm
2867      */
2868     virtio_queue_set_notification(q->tx_vq, 1);
2869     ret = virtio_net_flush_tx(q);
2870     if (ret > 0) {
2871         virtio_queue_set_notification(q->tx_vq, 0);
2872         q->tx_waiting = 1;
2873         timer_mod(q->tx_timer,
2874                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2875     }
2876 }
2877
2878 static void virtio_net_tx_bh(void *opaque)
2879 {
2880     VirtIONetQueue *q = opaque;
2881     VirtIONet *n = q->n;
2882     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2883     int32_t ret;
2884
2885     /* This happens when device was stopped but BH wasn't. */
2886     if (!vdev->vm_running) {
2887         /* Make sure tx waiting is set, so we'll run when restarted. */
2888         assert(q->tx_waiting);
2889         return;
2890     }
2891
2892     q->tx_waiting = 0;
2893
2894     /* Just in case the driver is not ready on more */
2895     if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
2896         return;
2897     }
2898
2899     ret = virtio_net_flush_tx(q);
2900     if (ret == -EBUSY || ret == -EINVAL) {
2901         return; /* Notification re-enable handled by tx_complete or device
2902                  * broken */
2903     }
2904
2905     /* If we flush a full burst of packets, assume there are
2906      * more coming and immediately reschedule */
2907     if (ret >= n->tx_burst) {
2908         qemu_bh_schedule(q->tx_bh);
2909         q->tx_waiting = 1;
2910         return;
2911     }
2912
2913     /* If less than a full burst, re-enable notification and flush
2914      * anything that may have come in while we weren't looking.  If
2915      * we find something, assume the guest is still active and reschedule */
2916     virtio_queue_set_notification(q->tx_vq, 1);
2917     ret = virtio_net_flush_tx(q);
2918     if (ret == -EINVAL) {
2919         return;
2920     } else if (ret > 0) {
2921         virtio_queue_set_notification(q->tx_vq, 0);
2922         qemu_bh_schedule(q->tx_bh);
2923         q->tx_waiting = 1;
2924     }
2925 }
2926
2927 static void virtio_net_add_queue(VirtIONet *n, int index)
2928 {
2929     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2930
2931     n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
2932                                            virtio_net_handle_rx);
2933
2934     if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
2935         n->vqs[index].tx_vq =
2936             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2937                              virtio_net_handle_tx_timer);
2938         n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2939                                               virtio_net_tx_timer,
2940                                               &n->vqs[index]);
2941     } else {
2942         n->vqs[index].tx_vq =
2943             virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2944                              virtio_net_handle_tx_bh);
2945         n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
2946                                                   &DEVICE(vdev)->mem_reentrancy_guard);
2947     }
2948
2949     n->vqs[index].tx_waiting = 0;
2950     n->vqs[index].n = n;
2951 }
2952
2953 static void virtio_net_del_queue(VirtIONet *n, int index)
2954 {
2955     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2956     VirtIONetQueue *q = &n->vqs[index];
2957     NetClientState *nc = qemu_get_subqueue(n->nic, index);
2958
2959     qemu_purge_queued_packets(nc);
2960
2961     virtio_del_queue(vdev, index * 2);
2962     if (q->tx_timer) {
2963         timer_free(q->tx_timer);
2964         q->tx_timer = NULL;
2965     } else {
2966         qemu_bh_delete(q->tx_bh);
2967         q->tx_bh = NULL;
2968     }
2969     q->tx_waiting = 0;
2970     virtio_del_queue(vdev, index * 2 + 1);
2971 }
2972
2973 static void virtio_net_change_num_queue_pairs(VirtIONet *n, int new_max_queue_pairs)
2974 {
2975     VirtIODevice *vdev = VIRTIO_DEVICE(n);
2976     int old_num_queues = virtio_get_num_queues(vdev);
2977     int new_num_queues = new_max_queue_pairs * 2 + 1;
2978     int i;
2979
2980     assert(old_num_queues >= 3);
2981     assert(old_num_queues % 2 == 1);
2982
2983     if (old_num_queues == new_num_queues) {
2984         return;
2985     }
2986
2987     /*
2988      * We always need to remove and add ctrl vq if
2989      * old_num_queues != new_num_queues. Remove ctrl_vq first,
2990      * and then we only enter one of the following two loops.
2991      */
2992     virtio_del_queue(vdev, old_num_queues - 1);
2993
2994     for (i = new_num_queues - 1; i < old_num_queues - 1; i += 2) {
2995         /* new_num_queues < old_num_queues */
2996         virtio_net_del_queue(n, i / 2);
2997     }
2998
2999     for (i = old_num_queues - 1; i < new_num_queues - 1; i += 2) {
3000         /* new_num_queues > old_num_queues */
3001         virtio_net_add_queue(n, i / 2);
3002     }
3003
3004     /* add ctrl_vq last */
3005     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3006 }
3007
3008 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
3009 {
3010     int max = multiqueue ? n->max_queue_pairs : 1;
3011
3012     n->multiqueue = multiqueue;
3013     virtio_net_change_num_queue_pairs(n, max);
3014
3015     virtio_net_set_queue_pairs(n);
3016 }
3017
3018 static int virtio_net_post_load_device(void *opaque, int version_id)
3019 {
3020     VirtIONet *n = opaque;
3021     VirtIODevice *vdev = VIRTIO_DEVICE(n);
3022     int i, link_down;
3023
3024     trace_virtio_net_post_load_device();
3025     virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
3026                                virtio_vdev_has_feature(vdev,
3027                                                        VIRTIO_F_VERSION_1),
3028                                virtio_vdev_has_feature(vdev,
3029                                                        VIRTIO_NET_F_HASH_REPORT));
3030
3031     /* MAC_TABLE_ENTRIES may be different from the saved image */
3032     if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
3033         n->mac_table.in_use = 0;
3034     }
3035
3036     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
3037         n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
3038     }
3039
3040     /*
3041      * curr_guest_offloads will be later overwritten by the
3042      * virtio_set_features_nocheck call done from the virtio_load.
3043      * Here we make sure it is preserved and restored accordingly
3044      * in the virtio_net_post_load_virtio callback.
3045      */
3046     n->saved_guest_offloads = n->curr_guest_offloads;
3047
3048     virtio_net_set_queue_pairs(n);
3049
3050     /* Find the first multicast entry in the saved MAC filter */
3051     for (i = 0; i < n->mac_table.in_use; i++) {
3052         if (n->mac_table.macs[i * ETH_ALEN] & 1) {
3053             break;
3054         }
3055     }
3056     n->mac_table.first_multi = i;
3057
3058     /* nc.link_down can't be migrated, so infer link_down according
3059      * to link status bit in n->status */
3060     link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0;
3061     for (i = 0; i < n->max_queue_pairs; i++) {
3062         qemu_get_subqueue(n->nic, i)->link_down = link_down;
3063     }
3064
3065     if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
3066         virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3067         qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3068                                   QEMU_CLOCK_VIRTUAL,
3069                                   virtio_net_announce_timer, n);
3070         if (n->announce_timer.round) {
3071             timer_mod(n->announce_timer.tm,
3072                       qemu_clock_get_ms(n->announce_timer.type));
3073         } else {
3074             qemu_announce_timer_del(&n->announce_timer, false);
3075         }
3076     }
3077
3078     if (n->rss_data.enabled) {
3079         n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
3080         if (!n->rss_data.populate_hash) {
3081             if (!virtio_net_attach_epbf_rss(n)) {
3082                 if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
3083                     warn_report("Can't post-load eBPF RSS for vhost");
3084                 } else {
3085                     warn_report("Can't post-load eBPF RSS - "
3086                                 "fallback to software RSS");
3087                     n->rss_data.enabled_software_rss = true;
3088                 }
3089             }
3090         }
3091
3092         trace_virtio_net_rss_enable(n->rss_data.hash_types,
3093                                     n->rss_data.indirections_len,
3094                                     sizeof(n->rss_data.key));
3095     } else {
3096         trace_virtio_net_rss_disable();
3097     }
3098     return 0;
3099 }
3100
3101 static int virtio_net_post_load_virtio(VirtIODevice *vdev)
3102 {
3103     VirtIONet *n = VIRTIO_NET(vdev);
3104     /*
3105      * The actual needed state is now in saved_guest_offloads,
3106      * see virtio_net_post_load_device for detail.
3107      * Restore it back and apply the desired offloads.
3108      */
3109     n->curr_guest_offloads = n->saved_guest_offloads;
3110     if (peer_has_vnet_hdr(n)) {
3111         virtio_net_apply_guest_offloads(n);
3112     }
3113
3114     return 0;
3115 }
3116
/*
 * Migration description of a single VirtIONetQueue: only its tx_waiting
 * field is transferred, as a 32-bit value.  Used for vqs[0] by
 * vmstate_virtio_net_device and for the remaining queues by
 * vmstate_virtio_net_tx_waiting.
 */
static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
    .name = "virtio-net-queue-tx_waiting",
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
        VMSTATE_END_OF_LIST()
   },
};
3125
3126 static bool max_queue_pairs_gt_1(void *opaque, int version_id)
3127 {
3128     return VIRTIO_NET(opaque)->max_queue_pairs > 1;
3129 }
3130
3131 static bool has_ctrl_guest_offloads(void *opaque, int version_id)
3132 {
3133     return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
3134                                    VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3135 }
3136
3137 static bool mac_table_fits(void *opaque, int version_id)
3138 {
3139     return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
3140 }
3141
/* VMState field test: the exact opposite of mac_table_fits(). */
static bool mac_table_doesnt_fit(void *opaque, int version_id)
{
    bool fits = mac_table_fits(opaque, version_id);

    return !fits;
}
3146
/* This temporary type is shared by all the WITH_TMP methods
 * although only some fields are used by each.
 */
struct VirtIONetMigTmp {
    /* Device being migrated; every callback reads it through tmp->parent
     * (presumably filled in by VMSTATE_WITH_TMP - confirm). */
    VirtIONet      *parent;
    /* parent->vqs + 1, i.e. the second queue onwards; set by
     * virtio_net_tx_waiting_pre_save(). */
    VirtIONetQueue *vqs_1;
    /* parent->curr_queue_pairs - 1, or 0 when curr_queue_pairs is 0;
     * element count for vqs_1. */
    uint16_t        curr_queue_pairs_1;
    /* Copy of parent->has_ufo on save; on load, checked against the peer
     * by virtio_net_ufo_post_load(). */
    uint8_t         has_ufo;
    /* Copy of parent->has_vnet_hdr on save; on load, checked against the
     * peer by virtio_net_vnet_post_load(). */
    uint32_t        has_vnet_hdr;
};
3157
3158 /* The 2nd and subsequent tx_waiting flags are loaded later than
3159  * the 1st entry in the queue_pairs and only if there's more than one
3160  * entry.  We use the tmp mechanism to calculate a temporary
3161  * pointer and count and also validate the count.
3162  */
3163
3164 static int virtio_net_tx_waiting_pre_save(void *opaque)
3165 {
3166     struct VirtIONetMigTmp *tmp = opaque;
3167
3168     tmp->vqs_1 = tmp->parent->vqs + 1;
3169     tmp->curr_queue_pairs_1 = tmp->parent->curr_queue_pairs - 1;
3170     if (tmp->parent->curr_queue_pairs == 0) {
3171         tmp->curr_queue_pairs_1 = 0;
3172     }
3173
3174     return 0;
3175 }
3176
3177 static int virtio_net_tx_waiting_pre_load(void *opaque)
3178 {
3179     struct VirtIONetMigTmp *tmp = opaque;
3180
3181     /* Reuse the pointer setup from save */
3182     virtio_net_tx_waiting_pre_save(opaque);
3183
3184     if (tmp->parent->curr_queue_pairs > tmp->parent->max_queue_pairs) {
3185         error_report("virtio-net: curr_queue_pairs %x > max_queue_pairs %x",
3186             tmp->parent->curr_queue_pairs, tmp->parent->max_queue_pairs);
3187
3188         return -EINVAL;
3189     }
3190
3191     return 0; /* all good */
3192 }
3193
/*
 * Migration description, operating on a struct VirtIONetMigTmp, for the
 * tx_waiting flags of every queue after the first.  The pre_save and
 * pre_load hooks fill in vqs_1 and curr_queue_pairs_1, which give the
 * base pointer and the element count of the array described below.
 */
static const VMStateDescription vmstate_virtio_net_tx_waiting = {
    .name      = "virtio-net-tx_waiting",
    .pre_load  = virtio_net_tx_waiting_pre_load,
    .pre_save  = virtio_net_tx_waiting_pre_save,
    .fields    = (VMStateField[]) {
        /* curr_queue_pairs_1 entries starting at vqs_1, one tx_waiting each */
        VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
                                     curr_queue_pairs_1,
                                     vmstate_virtio_net_queue_tx_waiting,
                                     struct VirtIONetQueue),
        VMSTATE_END_OF_LIST()
    },
};
3206
3207 /* the 'has_ufo' flag is just tested; if the incoming stream has the
3208  * flag set we need to check that we have it
3209  */
3210 static int virtio_net_ufo_post_load(void *opaque, int version_id)
3211 {
3212     struct VirtIONetMigTmp *tmp = opaque;
3213
3214     if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
3215         error_report("virtio-net: saved image requires TUN_F_UFO support");
3216         return -EINVAL;
3217     }
3218
3219     return 0;
3220 }
3221
3222 static int virtio_net_ufo_pre_save(void *opaque)
3223 {
3224     struct VirtIONetMigTmp *tmp = opaque;
3225
3226     tmp->has_ufo = tmp->parent->has_ufo;
3227
3228     return 0;
3229 }
3230
/*
 * Migration description, operating on a struct VirtIONetMigTmp, that
 * carries the has_ufo flag as one byte.  pre_save copies the flag from
 * the device; post_load verifies the peer against the loaded value.
 */
static const VMStateDescription vmstate_virtio_net_has_ufo = {
    .name      = "virtio-net-ufo",
    .post_load = virtio_net_ufo_post_load,
    .pre_save  = virtio_net_ufo_pre_save,
    .fields    = (VMStateField[]) {
        VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
        VMSTATE_END_OF_LIST()
    },
};
3240
3241 /* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the
3242  * flag set we need to check that we have it
3243  */
3244 static int virtio_net_vnet_post_load(void *opaque, int version_id)
3245 {
3246     struct VirtIONetMigTmp *tmp = opaque;
3247
3248     if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
3249         error_report("virtio-net: saved image requires vnet_hdr=on");
3250         return -EINVAL;
3251     }
3252
3253     return 0;
3254 }
3255
3256 static int virtio_net_vnet_pre_save(void *opaque)
3257 {
3258     struct VirtIONetMigTmp *tmp = opaque;
3259
3260     tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
3261
3262     return 0;
3263 }
3264
/*
 * Migration description, operating on a struct VirtIONetMigTmp, that
 * carries the has_vnet_hdr flag as a 32-bit value.  pre_save copies the
 * flag from the device; post_load verifies the peer against the loaded
 * value.
 */
static const VMStateDescription vmstate_virtio_net_has_vnet = {
    .name      = "virtio-net-vnet",
    .post_load = virtio_net_vnet_post_load,
    .pre_save  = virtio_net_vnet_pre_save,
    .fields    = (VMStateField[]) {
        VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
        VMSTATE_END_OF_LIST()
    },
};
3274
3275 static bool virtio_net_rss_needed(void *opaque)
3276 {
3277     return VIRTIO_NET(opaque)->rss_data.enabled;
3278 }
3279
/*
 * Subsection of vmstate_virtio_net_device holding the RSS configuration
 * in rss_data.  It is gated by virtio_net_rss_needed(), which is true
 * only while rss_data.enabled is set.
 */
static const VMStateDescription vmstate_virtio_net_rss = {
    .name      = "virtio-net-device/rss",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = virtio_net_rss_needed,
    .fields = (VMStateField[]) {
        VMSTATE_BOOL(rss_data.enabled, VirtIONet),
        VMSTATE_BOOL(rss_data.redirect, VirtIONet),
        VMSTATE_BOOL(rss_data.populate_hash, VirtIONet),
        VMSTATE_UINT32(rss_data.hash_types, VirtIONet),
        /* indirections_len is also the element count of the table below */
        VMSTATE_UINT16(rss_data.indirections_len, VirtIONet),
        VMSTATE_UINT16(rss_data.default_queue, VirtIONet),
        VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet,
                            VIRTIO_NET_RSS_MAX_KEY_SIZE),
        /* uint16_t entries; NOTE(review): the _ALLOC variant presumably
         * allocates the table on load - confirm against vmstate.h */
        VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, VirtIONet,
                                    rss_data.indirections_len, 0,
                                    vmstate_info_uint16, uint16_t),
        VMSTATE_END_OF_LIST()
    },
};
3300
/*
 * Migration description of the virtio-net device state.  The order of
 * the entries in .fields is the order they appear in the stream, so
 * entries must not be reordered.  virtio_net_post_load_device() runs
 * after the fields have been loaded; the RSS configuration travels in a
 * separate subsection.
 */
static const VMStateDescription vmstate_virtio_net_device = {
    .name = "virtio-net-device",
    .version_id = VIRTIO_NET_VM_VERSION,
    .minimum_version_id = VIRTIO_NET_VM_VERSION,
    .post_load = virtio_net_post_load_device,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
        /* tx_waiting of the first queue only; the rest follow further
         * down via vmstate_virtio_net_tx_waiting */
        VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
                               vmstate_virtio_net_queue_tx_waiting,
                               VirtIONetQueue),
        VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
        VMSTATE_UINT16(status, VirtIONet),
        VMSTATE_UINT8(promisc, VirtIONet),
        VMSTATE_UINT8(allmulti, VirtIONet),
        VMSTATE_UINT32(mac_table.in_use, VirtIONet),

        /* Guarded pair: If it fits we load it, else we throw it away
         * - can happen if source has a larger MAC table.; post-load
         *  sets flags in this case.
         */
        VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
                                0, mac_table_fits, mac_table.in_use,
                                 ETH_ALEN),
        VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
                                     mac_table.in_use, ETH_ALEN),

        /* Note: This is an array of uint32's that's always been saved as a
         * buffer; hold onto your endiannesses; it's actually used as a bitmap
         * but based on the uint.
         */
        VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
        /* has_vnet_hdr: saved from the device, only checked on load */
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_has_vnet),
        VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
        VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
        VMSTATE_UINT8(alluni, VirtIONet),
        VMSTATE_UINT8(nomulti, VirtIONet),
        VMSTATE_UINT8(nouni, VirtIONet),
        VMSTATE_UINT8(nobcast, VirtIONet),
        /* has_ufo: saved from the device, only checked on load */
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_has_ufo),
        /* The two queue-pair counts are only present when the device has
         * more than one queue pair (max_queue_pairs_gt_1). */
        VMSTATE_SINGLE_TEST(max_queue_pairs, VirtIONet, max_queue_pairs_gt_1, 0,
                            vmstate_info_uint16_equal, uint16_t),
        VMSTATE_UINT16_TEST(curr_queue_pairs, VirtIONet, max_queue_pairs_gt_1),
        /* tx_waiting of the second and later queues */
        VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
                         vmstate_virtio_net_tx_waiting),
        /* Only present with VIRTIO_NET_F_CTRL_GUEST_OFFLOADS */
        VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
                            has_ctrl_guest_offloads),
        VMSTATE_END_OF_LIST()
   },
    .subsections = (const VMStateDescription * []) {
        &vmstate_virtio_net_rss,
        NULL
    }
};
3356
/*
 * Net client callbacks of the virtio-net NIC: the receive path, the
 * link-status, rx-filter query and announce hooks, all implemented in
 * this file.
 */
static NetClientInfo net_virtio_info = {
    .type = NET_CLIENT_DRIVER_NIC,
    .size = sizeof(NICState),
    .can_receive = virtio_net_can_receive,
    .receive = virtio_net_receive,
    .link_status_changed = virtio_net_set_link_status,
    .query_rx_filter = virtio_net_query_rxfilter,
    .announce = virtio_net_announce,
};
3366
3367 static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
3368 {
3369     VirtIONet *n = VIRTIO_NET(vdev);
3370     NetClientState *nc;
3371     assert(n->vhost_started);
3372     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3373         /* Must guard against invalid features and bogus queue index
3374          * from being set by malicious guest, or penetrated through
3375          * buggy migration stream.
3376          */
3377         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3378             qemu_log_mask(LOG_GUEST_ERROR,
3379                           "%s: bogus vq index ignored\n", __func__);
3380             return false;
3381         }
3382         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3383     } else {
3384         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3385     }
3386     /*
3387      * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
3388      * as the macro of configure interrupt's IDX, If this driver does not
3389      * support, the function will return false
3390      */
3391
3392     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3393         return vhost_net_config_pending(get_vhost_net(nc->peer));
3394     }
3395     return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx);
3396 }
3397
3398 static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
3399                                            bool mask)
3400 {
3401     VirtIONet *n = VIRTIO_NET(vdev);
3402     NetClientState *nc;
3403     assert(n->vhost_started);
3404     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3405         /* Must guard against invalid features and bogus queue index
3406          * from being set by malicious guest, or penetrated through
3407          * buggy migration stream.
3408          */
3409         if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3410             qemu_log_mask(LOG_GUEST_ERROR,
3411                           "%s: bogus vq index ignored\n", __func__);
3412             return;
3413         }
3414         nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3415     } else {
3416         nc = qemu_get_subqueue(n->nic, vq2q(idx));
3417     }
3418     /*
3419      *Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
3420      * as the macro of configure interrupt's IDX, If this driver does not
3421      * support, the function will return
3422      */
3423
3424     if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3425         vhost_net_config_mask(get_vhost_net(nc->peer), vdev, mask);
3426         return;
3427     }
3428     vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask);
3429 }
3430
3431 static void virtio_net_set_config_size(VirtIONet *n, uint64_t host_features)
3432 {
3433     virtio_add_feature(&host_features, VIRTIO_NET_F_MAC);
3434
3435     n->config_size = virtio_get_config_size(&cfg_size_params, host_features);
3436 }
3437
3438 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
3439                                    const char *type)
3440 {
3441     /*
3442      * The name can be NULL, the netclient name will be type.x.
3443      */
3444     assert(type != NULL);
3445
3446     g_free(n->netclient_name);
3447     g_free(n->netclient_type);
3448     n->netclient_name = g_strdup(name);
3449     n->netclient_type = g_strdup(type);
3450 }
3451
3452 static bool failover_unplug_primary(VirtIONet *n, DeviceState *dev)
3453 {
3454     HotplugHandler *hotplug_ctrl;
3455     PCIDevice *pci_dev;
3456     Error *err = NULL;
3457
3458     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3459     if (hotplug_ctrl) {
3460         pci_dev = PCI_DEVICE(dev);
3461         pci_dev->partially_hotplugged = true;
3462         hotplug_handler_unplug_request(hotplug_ctrl, dev, &err);
3463         if (err) {
3464             error_report_err(err);
3465             return false;
3466         }
3467     } else {
3468         return false;
3469     }
3470     return true;
3471 }
3472
3473 static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
3474                                     Error **errp)
3475 {
3476     Error *err = NULL;
3477     HotplugHandler *hotplug_ctrl;
3478     PCIDevice *pdev = PCI_DEVICE(dev);
3479     BusState *primary_bus;
3480
3481     if (!pdev->partially_hotplugged) {
3482         return true;
3483     }
3484     primary_bus = dev->parent_bus;
3485     if (!primary_bus) {
3486         error_setg(errp, "virtio_net: couldn't find primary bus");
3487         return false;
3488     }
3489     qdev_set_parent_bus(dev, primary_bus, &error_abort);
3490     qatomic_set(&n->failover_primary_hidden, false);
3491     hotplug_ctrl = qdev_get_hotplug_handler(dev);
3492     if (hotplug_ctrl) {
3493         hotplug_handler_pre_plug(hotplug_ctrl, dev, &err);
3494         if (err) {
3495             goto out;
3496         }
3497         hotplug_handler_plug(hotplug_ctrl, dev, &err);
3498     }
3499     pdev->partially_hotplugged = false;
3500
3501 out:
3502     error_propagate(errp, err);
3503     return !err;
3504 }
3505
3506 static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s)
3507 {
3508     bool should_be_hidden;
3509     Error *err = NULL;
3510     DeviceState *dev = failover_find_primary_device(n);
3511
3512     if (!dev) {
3513         return;
3514     }
3515
3516     should_be_hidden = qatomic_read(&n->failover_primary_hidden);
3517
3518     if (migration_in_setup(s) && !should_be_hidden) {
3519         if (failover_unplug_primary(n, dev)) {
3520             vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
3521             qapi_event_send_unplug_primary(dev->id);
3522             qatomic_set(&n->failover_primary_hidden, true);
3523         } else {
3524             warn_report("couldn't unplug primary device");
3525         }
3526     } else if (migration_has_failed(s)) {
3527         /* We already unplugged the device let's plug it back */
3528         if (!failover_replug_primary(n, dev, &err)) {
3529             if (err) {
3530                 error_report_err(err);
3531             }
3532         }
3533     }
3534 }
3535
3536 static void virtio_net_migration_state_notifier(Notifier *notifier, void *data)
3537 {
3538     MigrationState *s = data;
3539     VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
3540     virtio_net_handle_migration_primary(n, s);
3541 }
3542
3543 static bool failover_hide_primary_device(DeviceListener *listener,
3544                                          const QDict *device_opts,
3545                                          bool from_json,
3546                                          Error **errp)
3547 {
3548     VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
3549     const char *standby_id;
3550
3551     if (!device_opts) {
3552         return false;
3553     }
3554
3555     if (!qdict_haskey(device_opts, "failover_pair_id")) {
3556         return false;
3557     }
3558
3559     if (!qdict_haskey(device_opts, "id")) {
3560         error_setg(errp, "Device with failover_pair_id needs to have id");
3561         return false;
3562     }
3563
3564     standby_id = qdict_get_str(device_opts, "failover_pair_id");
3565     if (g_strcmp0(standby_id, n->netclient_name) != 0) {
3566         return false;
3567     }
3568
3569     /*
3570      * The hide helper can be called several times for a given device.
3571      * Check there is only one primary for a virtio-net device but
3572      * don't duplicate the qdict several times if it's called for the same
3573      * device.
3574      */
3575     if (n->primary_opts) {
3576         const char *old, *new;
3577         /* devices with failover_pair_id always have an id */
3578         old = qdict_get_str(n->primary_opts, "id");
3579         new = qdict_get_str(device_opts, "id");
3580         if (strcmp(old, new) != 0) {
3581             error_setg(errp, "Cannot attach more than one primary device to "
3582                        "'%s': '%s' and '%s'", n->netclient_name, old, new);
3583             return false;
3584         }
3585     } else {
3586         n->primary_opts = qdict_clone_shallow(device_opts);
3587         n->primary_opts_from_json = from_json;
3588     }
3589
3590     /* failover_primary_hidden is set during feature negotiation */
3591     return qatomic_read(&n->failover_primary_hidden);
3592 }
3593
3594 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
3595 {
3596     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3597     VirtIONet *n = VIRTIO_NET(dev);
3598     NetClientState *nc;
3599     int i;
3600
3601     if (n->net_conf.mtu) {
3602         n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
3603     }
3604
3605     if (n->net_conf.duplex_str) {
3606         if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
3607             n->net_conf.duplex = DUPLEX_HALF;
3608         } else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
3609             n->net_conf.duplex = DUPLEX_FULL;
3610         } else {
3611             error_setg(errp, "'duplex' must be 'half' or 'full'");
3612             return;
3613         }
3614         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3615     } else {
3616         n->net_conf.duplex = DUPLEX_UNKNOWN;
3617     }
3618
3619     if (n->net_conf.speed < SPEED_UNKNOWN) {
3620         error_setg(errp, "'speed' must be between 0 and INT_MAX");
3621         return;
3622     }
3623     if (n->net_conf.speed >= 0) {
3624         n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3625     }
3626
3627     if (n->failover) {
3628         n->primary_listener.hide_device = failover_hide_primary_device;
3629         qatomic_set(&n->failover_primary_hidden, true);
3630         device_listener_register(&n->primary_listener);
3631         n->migration_state.notify = virtio_net_migration_state_notifier;
3632         add_migration_state_change_notifier(&n->migration_state);
3633         n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
3634     }
3635
3636     virtio_net_set_config_size(n, n->host_features);
3637     virtio_init(vdev, VIRTIO_ID_NET, n->config_size);
3638
3639     /*
3640      * We set a lower limit on RX queue size to what it always was.
3641      * Guests that want a smaller ring can always resize it without
3642      * help from us (using virtio 1 and up).
3643      */
3644     if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
3645         n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
3646         !is_power_of_2(n->net_conf.rx_queue_size)) {
3647         error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
3648                    "must be a power of 2 between %d and %d.",
3649                    n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
3650                    VIRTQUEUE_MAX_SIZE);
3651         virtio_cleanup(vdev);
3652         return;
3653     }
3654
3655     if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
3656         n->net_conf.tx_queue_size > virtio_net_max_tx_queue_size(n) ||
3657         !is_power_of_2(n->net_conf.tx_queue_size)) {
3658         error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
3659                    "must be a power of 2 between %d and %d",
3660                    n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
3661                    virtio_net_max_tx_queue_size(n));
3662         virtio_cleanup(vdev);
3663         return;
3664     }
3665
3666     n->max_ncs = MAX(n->nic_conf.peers.queues, 1);
3667
3668     /*
3669      * Figure out the datapath queue pairs since the backend could
3670      * provide control queue via peers as well.
3671      */
3672     if (n->nic_conf.peers.queues) {
3673         for (i = 0; i < n->max_ncs; i++) {
3674             if (n->nic_conf.peers.ncs[i]->is_datapath) {
3675                 ++n->max_queue_pairs;
3676             }
3677         }
3678     }
3679     n->max_queue_pairs = MAX(n->max_queue_pairs, 1);
3680
3681     if (n->max_queue_pairs * 2 + 1 > VIRTIO_QUEUE_MAX) {
3682         error_setg(errp, "Invalid number of queue pairs (= %" PRIu32 "), "
3683                    "must be a positive integer less than %d.",
3684                    n->max_queue_pairs, (VIRTIO_QUEUE_MAX - 1) / 2);
3685         virtio_cleanup(vdev);
3686         return;
3687     }
3688     n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
3689     n->curr_queue_pairs = 1;
3690     n->tx_timeout = n->net_conf.txtimer;
3691
3692     if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
3693                        && strcmp(n->net_conf.tx, "bh")) {
3694         warn_report("virtio-net: "
3695                     "Unknown option tx=%s, valid options: \"timer\" \"bh\"",
3696                     n->net_conf.tx);
3697         error_printf("Defaulting to \"bh\"");
3698     }
3699
3700     n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
3701                                     n->net_conf.tx_queue_size);
3702
3703     for (i = 0; i < n->max_queue_pairs; i++) {
3704         virtio_net_add_queue(n, i);
3705     }
3706
3707     n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3708     qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
3709     memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
3710     n->status = VIRTIO_NET_S_LINK_UP;
3711     qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3712                               QEMU_CLOCK_VIRTUAL,
3713                               virtio_net_announce_timer, n);
3714     n->announce_timer.round = 0;
3715
3716     if (n->netclient_type) {
3717         /*
3718          * Happen when virtio_net_set_netclient_name has been called.
3719          */
3720         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3721                               n->netclient_type, n->netclient_name, n);
3722     } else {
3723         n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3724                               object_get_typename(OBJECT(dev)), dev->id, n);
3725     }
3726
3727     for (i = 0; i < n->max_queue_pairs; i++) {
3728         n->nic->ncs[i].do_not_pad = true;
3729     }
3730
3731     peer_test_vnet_hdr(n);
3732     if (peer_has_vnet_hdr(n)) {
3733         for (i = 0; i < n->max_queue_pairs; i++) {
3734             qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
3735         }
3736         n->host_hdr_len = sizeof(struct virtio_net_hdr);
3737     } else {
3738         n->host_hdr_len = 0;
3739     }
3740
3741     qemu_format_nic_info_str(qemu_get_queue(n->nic), n->nic_conf.macaddr.a);
3742
3743     n->vqs[0].tx_waiting = 0;
3744     n->tx_burst = n->net_conf.txburst;
3745     virtio_net_set_mrg_rx_bufs(n, 0, 0, 0);
3746     n->promisc = 1; /* for compatibility */
3747
3748     n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
3749
3750     n->vlans = g_malloc0(MAX_VLAN >> 3);
3751
3752     nc = qemu_get_queue(n->nic);
3753     nc->rxfilter_notify_enabled = 1;
3754
3755    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
3756         struct virtio_net_config netcfg = {};
3757         memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
3758         vhost_net_set_config(get_vhost_net(nc->peer),
3759             (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_FRONTEND);
3760     }
3761     QTAILQ_INIT(&n->rsc_chains);
3762     n->qdev = dev;
3763
3764     net_rx_pkt_init(&n->rx_pkt);
3765
3766     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3767         virtio_net_load_ebpf(n);
3768     }
3769 }
3770
3771 static void virtio_net_device_unrealize(DeviceState *dev)
3772 {
3773     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3774     VirtIONet *n = VIRTIO_NET(dev);
3775     int i, max_queue_pairs;
3776
3777     if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3778         virtio_net_unload_ebpf(n);
3779     }
3780
3781     /* This will stop vhost backend if appropriate. */
3782     virtio_net_set_status(vdev, 0);
3783
3784     g_free(n->netclient_name);
3785     n->netclient_name = NULL;
3786     g_free(n->netclient_type);
3787     n->netclient_type = NULL;
3788
3789     g_free(n->mac_table.macs);
3790     g_free(n->vlans);
3791
3792     if (n->failover) {
3793         qobject_unref(n->primary_opts);
3794         device_listener_unregister(&n->primary_listener);
3795         remove_migration_state_change_notifier(&n->migration_state);
3796     } else {
3797         assert(n->primary_opts == NULL);
3798     }
3799
3800     max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
3801     for (i = 0; i < max_queue_pairs; i++) {
3802         virtio_net_del_queue(n, i);
3803     }
3804     /* delete also control vq */
3805     virtio_del_queue(vdev, max_queue_pairs * 2);
3806     qemu_announce_timer_del(&n->announce_timer, false);
3807     g_free(n->vqs);
3808     qemu_del_nic(n->nic);
3809     virtio_net_rsc_cleanup(n);
3810     g_free(n->rss_data.indirections_table);
3811     net_rx_pkt_uninit(n->rx_pkt);
3812     virtio_cleanup(vdev);
3813 }
3814
3815 static void virtio_net_instance_init(Object *obj)
3816 {
3817     VirtIONet *n = VIRTIO_NET(obj);
3818
3819     /*
3820      * The default config_size is sizeof(struct virtio_net_config).
3821      * Can be overridden with virtio_net_set_config_size.
3822      */
3823     n->config_size = sizeof(struct virtio_net_config);
3824     device_add_bootindex_property(obj, &n->nic_conf.bootindex,
3825                                   "bootindex", "/ethernet-phy@0",
3826                                   DEVICE(n));
3827
3828     ebpf_rss_init(&n->ebpf_rss);
3829 }
3830
3831 static int virtio_net_pre_save(void *opaque)
3832 {
3833     VirtIONet *n = opaque;
3834
3835     /* At this point, backend must be stopped, otherwise
3836      * it might keep writing to memory. */
3837     assert(!n->vhost_started);
3838
3839     return 0;
3840 }
3841
3842 static bool primary_unplug_pending(void *opaque)
3843 {
3844     DeviceState *dev = opaque;
3845     DeviceState *primary;
3846     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3847     VirtIONet *n = VIRTIO_NET(vdev);
3848
3849     if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3850         return false;
3851     }
3852     primary = failover_find_primary_device(n);
3853     return primary ? primary->pending_deleted_event : false;
3854 }
3855
3856 static bool dev_unplug_pending(void *opaque)
3857 {
3858     DeviceState *dev = opaque;
3859     VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
3860
3861     return vdc->primary_unplug_pending(dev);
3862 }
3863
3864 static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
3865 {
3866     VirtIONet *n = VIRTIO_NET(vdev);
3867     NetClientState *nc = qemu_get_queue(n->nic);
3868     struct vhost_net *net = get_vhost_net(nc->peer);
3869     return &net->dev;
3870 }
3871
3872 static const VMStateDescription vmstate_virtio_net = {
3873     .name = "virtio-net",
3874     .minimum_version_id = VIRTIO_NET_VM_VERSION,
3875     .version_id = VIRTIO_NET_VM_VERSION,
3876     .fields = (VMStateField[]) {
3877         VMSTATE_VIRTIO_DEVICE,
3878         VMSTATE_END_OF_LIST()
3879     },
3880     .pre_save = virtio_net_pre_save,
3881     .dev_unplug_pending = dev_unplug_pending,
3882 };
3883
3884 static Property virtio_net_properties[] = {
3885     DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
3886                     VIRTIO_NET_F_CSUM, true),
3887     DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
3888                     VIRTIO_NET_F_GUEST_CSUM, true),
3889     DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
3890     DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
3891                     VIRTIO_NET_F_GUEST_TSO4, true),
3892     DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
3893                     VIRTIO_NET_F_GUEST_TSO6, true),
3894     DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
3895                     VIRTIO_NET_F_GUEST_ECN, true),
3896     DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
3897                     VIRTIO_NET_F_GUEST_UFO, true),
3898     DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
3899                     VIRTIO_NET_F_GUEST_ANNOUNCE, true),
3900     DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
3901                     VIRTIO_NET_F_HOST_TSO4, true),
3902     DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
3903                     VIRTIO_NET_F_HOST_TSO6, true),
3904     DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
3905                     VIRTIO_NET_F_HOST_ECN, true),
3906     DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
3907                     VIRTIO_NET_F_HOST_UFO, true),
3908     DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
3909                     VIRTIO_NET_F_MRG_RXBUF, true),
3910     DEFINE_PROP_BIT64("status", VirtIONet, host_features,
3911                     VIRTIO_NET_F_STATUS, true),
3912     DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
3913                     VIRTIO_NET_F_CTRL_VQ, true),
3914     DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
3915                     VIRTIO_NET_F_CTRL_RX, true),
3916     DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
3917                     VIRTIO_NET_F_CTRL_VLAN, true),
3918     DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
3919                     VIRTIO_NET_F_CTRL_RX_EXTRA, true),
3920     DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
3921                     VIRTIO_NET_F_CTRL_MAC_ADDR, true),
3922     DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
3923                     VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
3924     DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
3925     DEFINE_PROP_BIT64("rss", VirtIONet, host_features,
3926                     VIRTIO_NET_F_RSS, false),
3927     DEFINE_PROP_BIT64("hash", VirtIONet, host_features,
3928                     VIRTIO_NET_F_HASH_REPORT, false),
3929     DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features,
3930                     VIRTIO_NET_F_RSC_EXT, false),
3931     DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout,
3932                        VIRTIO_NET_RSC_DEFAULT_INTERVAL),
3933     DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
3934     DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
3935                        TX_TIMER_INTERVAL),
3936     DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST),
3937     DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx),
3938     DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size,
3939                        VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE),
3940     DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size,
3941                        VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE),
3942     DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
3943     DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
3944                      true),
3945     DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
3946     DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
3947     DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
3948     DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
3949                       VIRTIO_NET_F_GUEST_USO4, true),
3950     DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
3951                       VIRTIO_NET_F_GUEST_USO6, true),
3952     DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
3953                       VIRTIO_NET_F_HOST_USO, true),
3954     DEFINE_PROP_END_OF_LIST(),
3955 };
3956
3957 static void virtio_net_class_init(ObjectClass *klass, void *data)
3958 {
3959     DeviceClass *dc = DEVICE_CLASS(klass);
3960     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
3961
3962     device_class_set_props(dc, virtio_net_properties);
3963     dc->vmsd = &vmstate_virtio_net;
3964     set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
3965     vdc->realize = virtio_net_device_realize;
3966     vdc->unrealize = virtio_net_device_unrealize;
3967     vdc->get_config = virtio_net_get_config;
3968     vdc->set_config = virtio_net_set_config;
3969     vdc->get_features = virtio_net_get_features;
3970     vdc->set_features = virtio_net_set_features;
3971     vdc->bad_features = virtio_net_bad_features;
3972     vdc->reset = virtio_net_reset;
3973     vdc->queue_reset = virtio_net_queue_reset;
3974     vdc->queue_enable = virtio_net_queue_enable;
3975     vdc->set_status = virtio_net_set_status;
3976     vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
3977     vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
3978     vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
3979     vdc->post_load = virtio_net_post_load_virtio;
3980     vdc->vmsd = &vmstate_virtio_net_device;
3981     vdc->primary_unplug_pending = primary_unplug_pending;
3982     vdc->get_vhost = virtio_net_get_vhost;
3983     vdc->toggle_device_iotlb = vhost_toggle_device_iotlb;
3984 }
3985
3986 static const TypeInfo virtio_net_info = {
3987     .name = TYPE_VIRTIO_NET,
3988     .parent = TYPE_VIRTIO_DEVICE,
3989     .instance_size = sizeof(VirtIONet),
3990     .instance_init = virtio_net_instance_init,
3991     .class_init = virtio_net_class_init,
3992 };
3993
3994 static void virtio_register_types(void)
3995 {
3996     type_register_static(&virtio_net_info);
3997 }
3998
3999 type_init(virtio_register_types)