virtio-net: do not reset vlan filtering at set_features
[qemu/ar7.git] / hw / net / virtio-net.c
blob 1c31374334cc1dc67213005b186c2f51ef3eb227
/*
 * Virtio Network Device
 *
 * Copyright IBM, Corp. 2007
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu/atomic.h"
#include "qemu/iov.h"
#include "qemu/log.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
#include "hw/virtio/virtio.h"
#include "net/net.h"
#include "net/checksum.h"
#include "net/tap.h"
#include "qemu/error-report.h"
#include "qemu/timer.h"
#include "qemu/option.h"
#include "qemu/option_int.h"
#include "qemu/config-file.h"
#include "qapi/qmp/qdict.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/announce.h"
#include "hw/virtio/virtio-bus.h"
#include "qapi/error.h"
#include "qapi/qapi-events-net.h"
#include "hw/qdev-properties.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qapi-events-migration.h"
#include "hw/virtio/virtio-access.h"
#include "migration/misc.h"
#include "standard-headers/linux/ethtool.h"
#include "sysemu/sysemu.h"
#include "trace.h"
#include "monitor/qdev.h"
#include "hw/pci/pci_device.h"
#include "net_rx_pkt.h"
#include "hw/virtio/vhost.h"
#include "sysemu/qtest.h"
#define VIRTIO_NET_VM_VERSION    11

#define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */

/* previously fixed value */
#define VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE 256
#define VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE 256

/* for now, only allow larger queue_pairs; with virtio-1, guest can downsize */
#define VIRTIO_NET_RX_QUEUE_MIN_SIZE VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE
#define VIRTIO_NET_TX_QUEUE_MIN_SIZE VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE

#define VIRTIO_NET_IP4_ADDR_SIZE   8        /* ipv4 saddr + daddr */

#define VIRTIO_NET_TCP_FLAG         0x3F
#define VIRTIO_NET_TCP_HDR_LENGTH   0xF000

/* IPv4 max payload, 16 bits in the header */
#define VIRTIO_NET_MAX_IP4_PAYLOAD (65535 - sizeof(struct ip_header))
#define VIRTIO_NET_MAX_TCP_PAYLOAD 65535

/* header length value in ip header without option */
#define VIRTIO_NET_IP4_HEADER_LENGTH 5

#define VIRTIO_NET_IP6_ADDR_SIZE   32      /* ipv6 saddr + daddr */
#define VIRTIO_NET_MAX_IP6_PAYLOAD VIRTIO_NET_MAX_TCP_PAYLOAD

/* Purge coalesced packets timer interval.  This value affects performance
   a lot and should be tuned carefully: '300000' (300us) is the recommended
   value to pass the WHQL test, while '50000' can gain 2x netperf throughput
   with tso/gso/gro 'off'. */
#define VIRTIO_NET_RSC_DEFAULT_INTERVAL 300000

#define VIRTIO_NET_RSS_SUPPORTED_HASHES (VIRTIO_NET_RSS_HASH_TYPE_IPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDPv4 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_IPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDPv6 | \
                                         VIRTIO_NET_RSS_HASH_TYPE_IP_EX | \
                                         VIRTIO_NET_RSS_HASH_TYPE_TCP_EX | \
                                         VIRTIO_NET_RSS_HASH_TYPE_UDP_EX)
static const VirtIOFeature feature_sizes[] = {
    {.flags = 1ULL << VIRTIO_NET_F_MAC,
     .end = endof(struct virtio_net_config, mac)},
    {.flags = 1ULL << VIRTIO_NET_F_STATUS,
     .end = endof(struct virtio_net_config, status)},
    {.flags = 1ULL << VIRTIO_NET_F_MQ,
     .end = endof(struct virtio_net_config, max_virtqueue_pairs)},
    {.flags = 1ULL << VIRTIO_NET_F_MTU,
     .end = endof(struct virtio_net_config, mtu)},
    {.flags = 1ULL << VIRTIO_NET_F_SPEED_DUPLEX,
     .end = endof(struct virtio_net_config, duplex)},
    {.flags = (1ULL << VIRTIO_NET_F_RSS) | (1ULL << VIRTIO_NET_F_HASH_REPORT),
     .end = endof(struct virtio_net_config, supported_hash_types)},
    {}
};

static const VirtIOConfigSizeParams cfg_size_params = {
    .min_size = endof(struct virtio_net_config, mac),
    .max_size = sizeof(struct virtio_net_config),
    .feature_sizes = feature_sizes
};
static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);

    return &n->vqs[nc->queue_index];
}
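
/* Each queue pair occupies two virtqueue indices (RX at 2*i, TX at 2*i + 1,
 * with the control queue last), so halving a virtqueue index yields the
 * owning queue pair. */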
static int vq2q(int queue_index)
{
    return queue_index / 2;
}

static void flush_or_purge_queued_packets(NetClientState *nc)
{
    if (!nc->peer) {
        return;
    }

    qemu_flush_or_purge_queued_packets(nc->peer, true);
    assert(!virtio_net_get_subqueue(nc)->async_tx.elem);
}

/* TODO
 * - we could suppress RX interrupt if we were so inclined.
 */
static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    struct virtio_net_config netcfg;
    NetClientState *nc = qemu_get_queue(n->nic);
    static const MACAddr zero = { .a = { 0, 0, 0, 0, 0, 0 } };

    int ret = 0;
    memset(&netcfg, 0 , sizeof(struct virtio_net_config));
    virtio_stw_p(vdev, &netcfg.status, n->status);
    virtio_stw_p(vdev, &netcfg.max_virtqueue_pairs, n->max_queue_pairs);
    virtio_stw_p(vdev, &netcfg.mtu, n->net_conf.mtu);
    memcpy(netcfg.mac, n->mac, ETH_ALEN);
    virtio_stl_p(vdev, &netcfg.speed, n->net_conf.speed);
    netcfg.duplex = n->net_conf.duplex;
    netcfg.rss_max_key_size = VIRTIO_NET_RSS_MAX_KEY_SIZE;
    virtio_stw_p(vdev, &netcfg.rss_max_indirection_table_length,
                 virtio_host_has_feature(vdev, VIRTIO_NET_F_RSS) ?
                 VIRTIO_NET_RSS_MAX_TABLE_LEN : 1);
    virtio_stl_p(vdev, &netcfg.supported_hash_types,
                 VIRTIO_NET_RSS_SUPPORTED_HASHES);
    memcpy(config, &netcfg, n->config_size);

    /*
     * Is this VDPA? No peer means not VDPA: there's no way to
     * disconnect/reconnect a VDPA peer.
     */
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
        ret = vhost_net_get_config(get_vhost_net(nc->peer), (uint8_t *)&netcfg,
                                   n->config_size);
        if (ret == -1) {
            return;
        }

        /*
         * Some NIC/kernel combinations present 0 as the mac address.  As that
         * is not a legal address, try to proceed with the address from the
         * QEMU command line in the hope that the address has been configured
         * correctly elsewhere - just not reported by the device.
         */
        if (memcmp(&netcfg.mac, &zero, sizeof(zero)) == 0) {
            info_report("Zero hardware mac address detected. Ignoring.");
            memcpy(netcfg.mac, n->mac, ETH_ALEN);
        }

        netcfg.status |= virtio_tswap16(vdev,
                                        n->status & VIRTIO_NET_S_ANNOUNCE);
        memcpy(config, &netcfg, n->config_size);
    }
}
static void virtio_net_set_config(VirtIODevice *vdev, const uint8_t *config)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    struct virtio_net_config netcfg = {};
    NetClientState *nc = qemu_get_queue(n->nic);

    memcpy(&netcfg, config, n->config_size);

    if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_MAC_ADDR) &&
        !virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1) &&
        memcmp(netcfg.mac, n->mac, ETH_ALEN)) {
        memcpy(n->mac, netcfg.mac, ETH_ALEN);
        qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
    }

    /*
     * Is this VDPA? No peer means not VDPA: there's no way to
     * disconnect/reconnect a VDPA peer.
     */
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
        vhost_net_set_config(get_vhost_net(nc->peer),
                             (uint8_t *)&netcfg, 0, n->config_size,
                             VHOST_SET_CONFIG_TYPE_FRONTEND);
    }
}
static bool virtio_net_started(VirtIONet *n, uint8_t status)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    return (status & VIRTIO_CONFIG_S_DRIVER_OK) &&
        (n->status & VIRTIO_NET_S_LINK_UP) && vdev->vm_running;
}

static void virtio_net_announce_notify(VirtIONet *net)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(net);
    trace_virtio_net_announce_notify();

    net->status |= VIRTIO_NET_S_ANNOUNCE;
    virtio_notify_config(vdev);
}

static void virtio_net_announce_timer(void *opaque)
{
    VirtIONet *n = opaque;
    trace_virtio_net_announce_timer(n->announce_timer.round);

    n->announce_timer.round--;
    virtio_net_announce_notify(n);
}

static void virtio_net_announce(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);

    /*
     * Make sure the virtio migration announcement timer isn't running.
     * If it is, let it trigger announcement so that we do not cause
     * confusion.
     */
    if (n->announce_timer.round) {
        return;
    }

    if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
        virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
        virtio_net_announce_notify(n);
    }
}
static void virtio_net_vhost_status(VirtIONet *n, uint8_t status)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    NetClientState *nc = qemu_get_queue(n->nic);
    int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
    int cvq = virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ) ?
              n->max_ncs - n->max_queue_pairs : 0;

    if (!get_vhost_net(nc->peer)) {
        return;
    }

    if ((virtio_net_started(n, status) && !nc->peer->link_down) ==
        !!n->vhost_started) {
        return;
    }
    if (!n->vhost_started) {
        int r, i;

        if (n->needs_vnet_hdr_swap) {
            error_report("backend does not support %s vnet headers; "
                         "falling back on userspace virtio",
                         virtio_is_big_endian(vdev) ? "BE" : "LE");
            return;
        }

        /* Any packets outstanding? Purge them to avoid touching rings
         * when vhost is running.
         */
        for (i = 0; i < queue_pairs; i++) {
            NetClientState *qnc = qemu_get_subqueue(n->nic, i);

            /* Purge both directions: TX and RX. */
            qemu_net_queue_purge(qnc->peer->incoming_queue, qnc);
            qemu_net_queue_purge(qnc->incoming_queue, qnc->peer);
        }

        if (virtio_has_feature(vdev->guest_features, VIRTIO_NET_F_MTU)) {
            r = vhost_net_set_mtu(get_vhost_net(nc->peer), n->net_conf.mtu);
            if (r < 0) {
                error_report("%uBytes MTU not supported by the backend",
                             n->net_conf.mtu);

                return;
            }
        }

        n->vhost_started = 1;
        r = vhost_net_start(vdev, n->nic->ncs, queue_pairs, cvq);
        if (r < 0) {
            error_report("unable to start vhost net: %d: "
                         "falling back on userspace virtio", -r);
            n->vhost_started = 0;
        }
    } else {
        vhost_net_stop(vdev, n->nic->ncs, queue_pairs, cvq);
        n->vhost_started = 0;
    }
}
static int virtio_net_set_vnet_endian_one(VirtIODevice *vdev,
                                          NetClientState *peer,
                                          bool enable)
{
    if (virtio_is_big_endian(vdev)) {
        return qemu_set_vnet_be(peer, enable);
    } else {
        return qemu_set_vnet_le(peer, enable);
    }
}

static bool virtio_net_set_vnet_endian(VirtIODevice *vdev, NetClientState *ncs,
                                       int queue_pairs, bool enable)
{
    int i;

    for (i = 0; i < queue_pairs; i++) {
        if (virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, enable) < 0 &&
            enable) {
            while (--i >= 0) {
                virtio_net_set_vnet_endian_one(vdev, ncs[i].peer, false);
            }

            return true;
        }
    }

    return false;
}

static void virtio_net_vnet_endian_status(VirtIONet *n, uint8_t status)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    int queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;

    if (virtio_net_started(n, status)) {
        /* Before using the device, we tell the network backend about the
         * endianness to use when parsing vnet headers. If the backend
         * can't do it, we fallback onto fixing the headers in the core
         * virtio-net code.
         */
        n->needs_vnet_hdr_swap = virtio_net_set_vnet_endian(vdev, n->nic->ncs,
                                                            queue_pairs, true);
    } else if (virtio_net_started(n, vdev->status)) {
        /* After using the device, we need to reset the network backend to
         * the default (guest native endianness), otherwise the guest may
         * lose network connectivity if it is rebooted into a different
         * endianness.
         */
        virtio_net_set_vnet_endian(vdev, n->nic->ncs, queue_pairs, false);
    }
}

static void virtio_net_drop_tx_queue_data(VirtIODevice *vdev, VirtQueue *vq)
{
    unsigned int dropped = virtqueue_drop_all(vq);
    if (dropped) {
        virtio_notify(vdev, vq);
    }
}
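
/* In set_status below, a queue beyond curr_queue_pairs (or beyond queue 0
 * when multiqueue is off) gets queue_status 0, i.e. it is treated as if
 * the driver were not DRIVER_OK, so its TX timer or bottom half is
 * stopped rather than scheduled. */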
static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    VirtIONetQueue *q;
    int i;
    uint8_t queue_status;

    virtio_net_vnet_endian_status(n, status);
    virtio_net_vhost_status(n, status);

    for (i = 0; i < n->max_queue_pairs; i++) {
        NetClientState *ncs = qemu_get_subqueue(n->nic, i);
        bool queue_started;
        q = &n->vqs[i];

        if ((!n->multiqueue && i != 0) || i >= n->curr_queue_pairs) {
            queue_status = 0;
        } else {
            queue_status = status;
        }
        queue_started =
            virtio_net_started(n, queue_status) && !n->vhost_started;

        if (queue_started) {
            qemu_flush_queued_packets(ncs);
        }

        if (!q->tx_waiting) {
            continue;
        }

        if (queue_started) {
            if (q->tx_timer) {
                timer_mod(q->tx_timer,
                          qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
            } else {
                qemu_bh_schedule(q->tx_bh);
            }
        } else {
            if (q->tx_timer) {
                timer_del(q->tx_timer);
            } else {
                qemu_bh_cancel(q->tx_bh);
            }
            if ((n->status & VIRTIO_NET_S_LINK_UP) == 0 &&
                (queue_status & VIRTIO_CONFIG_S_DRIVER_OK) &&
                vdev->vm_running) {
                /* If tx is waiting, we likely have some packets in the
                 * tx queue and notification disabled. */
                q->tx_waiting = 0;
                virtio_queue_set_notification(q->tx_vq, 1);
                virtio_net_drop_tx_queue_data(vdev, q->tx_vq);
            }
        }
    }
}
static void virtio_net_set_link_status(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    uint16_t old_status = n->status;

    if (nc->link_down)
        n->status &= ~VIRTIO_NET_S_LINK_UP;
    else
        n->status |= VIRTIO_NET_S_LINK_UP;

    if (n->status != old_status)
        virtio_notify_config(vdev);

    virtio_net_set_status(vdev, vdev->status);
}
static void rxfilter_notify(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);

    if (nc->rxfilter_notify_enabled) {
        char *path = object_get_canonical_path(OBJECT(n->qdev));
        qapi_event_send_nic_rx_filter_changed(n->netclient_name, path);
        g_free(path);

        /* disable event notification to avoid events flooding */
        nc->rxfilter_notify_enabled = 0;
    }
}
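
/* n->vlans is a MAX_VLAN-bit filter bitmap, one uint32_t word per 32 VLAN
 * IDs; the walk below emits the ID of every bit that is set. */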
static intList *get_vlan_table(VirtIONet *n)
{
    intList *list;
    int i, j;

    list = NULL;
    for (i = 0; i < MAX_VLAN >> 5; i++) {
        for (j = 0; n->vlans[i] && j <= 0x1f; j++) {
            if (n->vlans[i] & (1U << j)) {
                QAPI_LIST_PREPEND(list, (i << 5) + j);
            }
        }
    }

    return list;
}
static RxFilterInfo *virtio_net_query_rxfilter(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    RxFilterInfo *info;
    strList *str_list;
    int i;

    info = g_malloc0(sizeof(*info));
    info->name = g_strdup(nc->name);
    info->promiscuous = n->promisc;

    if (n->nouni) {
        info->unicast = RX_STATE_NONE;
    } else if (n->alluni) {
        info->unicast = RX_STATE_ALL;
    } else {
        info->unicast = RX_STATE_NORMAL;
    }

    if (n->nomulti) {
        info->multicast = RX_STATE_NONE;
    } else if (n->allmulti) {
        info->multicast = RX_STATE_ALL;
    } else {
        info->multicast = RX_STATE_NORMAL;
    }

    info->broadcast_allowed = !n->nobcast;
    info->multicast_overflow = n->mac_table.multi_overflow;
    info->unicast_overflow = n->mac_table.uni_overflow;

    info->main_mac = qemu_mac_strdup_printf(n->mac);

    str_list = NULL;
    for (i = 0; i < n->mac_table.first_multi; i++) {
        QAPI_LIST_PREPEND(str_list,
                      qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
    }
    info->unicast_table = str_list;

    str_list = NULL;
    for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
        QAPI_LIST_PREPEND(str_list,
                      qemu_mac_strdup_printf(n->mac_table.macs + i * ETH_ALEN));
    }
    info->multicast_table = str_list;
    info->vlan_table = get_vlan_table(n);

    if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VLAN)) {
        info->vlan = RX_STATE_ALL;
    } else if (!info->vlan_table) {
        info->vlan = RX_STATE_NONE;
    } else {
        info->vlan = RX_STATE_NORMAL;
    }

    /* enable event notification after query */
    nc->rxfilter_notify_enabled = 1;

    return info;
}
static void virtio_net_queue_reset(VirtIODevice *vdev, uint32_t queue_index)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    NetClientState *nc;

    /* validate queue_index and skip for cvq */
    if (queue_index >= n->max_queue_pairs * 2) {
        return;
    }

    nc = qemu_get_subqueue(n->nic, vq2q(queue_index));

    if (!nc->peer) {
        return;
    }

    if (get_vhost_net(nc->peer) &&
        nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
        vhost_net_virtqueue_reset(vdev, nc, queue_index);
    }

    flush_or_purge_queued_packets(nc);
}

static void virtio_net_queue_enable(VirtIODevice *vdev, uint32_t queue_index)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    NetClientState *nc;
    int r;

    /* validate queue_index and skip for cvq */
    if (queue_index >= n->max_queue_pairs * 2) {
        return;
    }

    nc = qemu_get_subqueue(n->nic, vq2q(queue_index));

    if (!nc->peer || !vdev->vhost_started) {
        return;
    }

    if (get_vhost_net(nc->peer) &&
        nc->peer->info->type == NET_CLIENT_DRIVER_TAP) {
        r = vhost_net_virtqueue_restart(vdev, nc, queue_index);
        if (r < 0) {
            error_report("unable to restart vhost net virtqueue: %d, "
                         "when resetting the queue", queue_index);
        }
    }
}
static void virtio_net_reset(VirtIODevice *vdev)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    int i;

    /* Reset back to compatibility mode */
    n->promisc = 1;
    n->allmulti = 0;
    n->alluni = 0;
    n->nomulti = 0;
    n->nouni = 0;
    n->nobcast = 0;
    /* multiqueue is disabled by default */
    n->curr_queue_pairs = 1;
    timer_del(n->announce_timer.tm);
    n->announce_timer.round = 0;
    n->status &= ~VIRTIO_NET_S_ANNOUNCE;

    /* Flush any MAC and VLAN filter table state */
    n->mac_table.in_use = 0;
    n->mac_table.first_multi = 0;
    n->mac_table.multi_overflow = 0;
    n->mac_table.uni_overflow = 0;
    memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
    memcpy(&n->mac[0], &n->nic->conf->macaddr, sizeof(n->mac));
    qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
    memset(n->vlans, 0, MAX_VLAN >> 3);

    /* Flush any async TX */
    for (i = 0; i < n->max_queue_pairs; i++) {
        flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
    }
}
static void peer_test_vnet_hdr(VirtIONet *n)
{
    NetClientState *nc = qemu_get_queue(n->nic);
    if (!nc->peer) {
        return;
    }

    n->has_vnet_hdr = qemu_has_vnet_hdr(nc->peer);
}

static int peer_has_vnet_hdr(VirtIONet *n)
{
    return n->has_vnet_hdr;
}

static int peer_has_ufo(VirtIONet *n)
{
    if (!peer_has_vnet_hdr(n))
        return 0;

    n->has_ufo = qemu_has_ufo(qemu_get_queue(n->nic)->peer);

    return n->has_ufo;
}

static int peer_has_uso(VirtIONet *n)
{
    if (!peer_has_vnet_hdr(n)) {
        return 0;
    }

    return qemu_has_uso(qemu_get_queue(n->nic)->peer);
}
static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs,
                                       int version_1, int hash_report)
{
    int i;
    NetClientState *nc;

    n->mergeable_rx_bufs = mergeable_rx_bufs;

    if (version_1) {
        n->guest_hdr_len = hash_report ?
            sizeof(struct virtio_net_hdr_v1_hash) :
            sizeof(struct virtio_net_hdr_mrg_rxbuf);
        n->rss_data.populate_hash = !!hash_report;
    } else {
        n->guest_hdr_len = n->mergeable_rx_bufs ?
            sizeof(struct virtio_net_hdr_mrg_rxbuf) :
            sizeof(struct virtio_net_hdr);
    }

    for (i = 0; i < n->max_queue_pairs; i++) {
        nc = qemu_get_subqueue(n->nic, i);

        if (peer_has_vnet_hdr(n) &&
            qemu_has_vnet_hdr_len(nc->peer, n->guest_hdr_len)) {
            qemu_set_vnet_hdr_len(nc->peer, n->guest_hdr_len);
            n->host_hdr_len = n->guest_hdr_len;
        }
    }
}
static int virtio_net_max_tx_queue_size(VirtIONet *n)
{
    NetClientState *peer = n->nic_conf.peers.ncs[0];

    /*
     * Backends other than vhost-user or vhost-vdpa don't support max queue
     * size.
     */
    if (!peer) {
        return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
    }

    switch (peer->info->type) {
    case NET_CLIENT_DRIVER_VHOST_USER:
    case NET_CLIENT_DRIVER_VHOST_VDPA:
        return VIRTQUEUE_MAX_SIZE;
    default:
        return VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE;
    }
}
static int peer_attach(VirtIONet *n, int index)
{
    NetClientState *nc = qemu_get_subqueue(n->nic, index);

    if (!nc->peer) {
        return 0;
    }

    if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
        vhost_set_vring_enable(nc->peer, 1);
    }

    if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
        return 0;
    }

    if (n->max_queue_pairs == 1) {
        return 0;
    }

    return tap_enable(nc->peer);
}

static int peer_detach(VirtIONet *n, int index)
{
    NetClientState *nc = qemu_get_subqueue(n->nic, index);

    if (!nc->peer) {
        return 0;
    }

    if (nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_USER) {
        vhost_set_vring_enable(nc->peer, 0);
    }

    if (nc->peer->info->type != NET_CLIENT_DRIVER_TAP) {
        return 0;
    }

    return tap_disable(nc->peer);
}

static void virtio_net_set_queue_pairs(VirtIONet *n)
{
    int i;
    int r;

    if (n->nic->peer_deleted) {
        return;
    }

    for (i = 0; i < n->max_queue_pairs; i++) {
        if (i < n->curr_queue_pairs) {
            r = peer_attach(n, i);
            assert(!r);
        } else {
            r = peer_detach(n, i);
            assert(!r);
        }
    }
}
static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue);
static uint64_t virtio_net_get_features(VirtIODevice *vdev, uint64_t features,
                                        Error **errp)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    NetClientState *nc = qemu_get_queue(n->nic);

    /* Firstly sync all virtio-net possible supported features */
    features |= n->host_features;

    virtio_add_feature(&features, VIRTIO_NET_F_MAC);

    if (!peer_has_vnet_hdr(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_CSUM);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_TSO6);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_ECN);

        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_CSUM);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_TSO6);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ECN);

        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);

        virtio_clear_feature(&features, VIRTIO_NET_F_HASH_REPORT);
    }

    if (!peer_has_vnet_hdr(n) || !peer_has_ufo(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_UFO);
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_UFO);
    }

    if (!peer_has_uso(n)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_HOST_USO);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO4);
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_USO6);
    }

    if (!get_vhost_net(nc->peer)) {
        return features;
    }

    if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_RSS);
    }
    features = vhost_net_get_features(get_vhost_net(nc->peer), features);
    vdev->backend_features = features;

    if (n->mtu_bypass_backend &&
            (n->host_features & 1ULL << VIRTIO_NET_F_MTU)) {
        features |= (1ULL << VIRTIO_NET_F_MTU);
    }

    /*
     * Since GUEST_ANNOUNCE is emulated, the feature bit could be set without
     * being enabled in the backend. This happens in the vDPA case.
     *
     * Make sure the feature set is not incoherent, as the driver could refuse
     * to start.
     *
     * TODO: QEMU is able to emulate a CVQ just for guest_announce purposes,
     * helping the guest to notify the new location with vDPA devices that do
     * not support it.
     */
    if (!virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_CTRL_VQ)) {
        virtio_clear_feature(&features, VIRTIO_NET_F_GUEST_ANNOUNCE);
    }

    return features;
}
static uint64_t virtio_net_bad_features(VirtIODevice *vdev)
{
    uint64_t features = 0;

    /* Linux kernel 2.6.25.  It understood MAC (as everyone must),
     * but also these: */
    virtio_add_feature(&features, VIRTIO_NET_F_MAC);
    virtio_add_feature(&features, VIRTIO_NET_F_CSUM);
    virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO4);
    virtio_add_feature(&features, VIRTIO_NET_F_HOST_TSO6);
    virtio_add_feature(&features, VIRTIO_NET_F_HOST_ECN);

    return features;
}
static void virtio_net_apply_guest_offloads(VirtIONet *n)
{
    qemu_set_offload(qemu_get_queue(n->nic)->peer,
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_CSUM)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO4)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_TSO6)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_ECN)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_UFO)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO4)),
                     !!(n->curr_guest_offloads & (1ULL << VIRTIO_NET_F_GUEST_USO6)));
}

static uint64_t virtio_net_guest_offloads_by_features(uint64_t features)
{
    static const uint64_t guest_offloads_mask =
        (1ULL << VIRTIO_NET_F_GUEST_CSUM) |
        (1ULL << VIRTIO_NET_F_GUEST_TSO4) |
        (1ULL << VIRTIO_NET_F_GUEST_TSO6) |
        (1ULL << VIRTIO_NET_F_GUEST_ECN)  |
        (1ULL << VIRTIO_NET_F_GUEST_UFO)  |
        (1ULL << VIRTIO_NET_F_GUEST_USO4) |
        (1ULL << VIRTIO_NET_F_GUEST_USO6);

    return guest_offloads_mask & features;
}

uint64_t virtio_net_supported_guest_offloads(const VirtIONet *n)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    return virtio_net_guest_offloads_by_features(vdev->guest_features);
}
typedef struct {
    VirtIONet *n;
    DeviceState *dev;
} FailoverDevice;

/**
 * Set the failover primary device
 *
 * @dev: the device being walked, a candidate primary
 * @opaque: FailoverDevice to fill in on a match
 */
static int failover_set_primary(DeviceState *dev, void *opaque)
{
    FailoverDevice *fdev = opaque;
    PCIDevice *pci_dev = (PCIDevice *)
        object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE);

    if (!pci_dev) {
        return 0;
    }

    if (!g_strcmp0(pci_dev->failover_pair_id, fdev->n->netclient_name)) {
        fdev->dev = dev;
        return 1;
    }

    return 0;
}

/**
 * Find the primary device for this failover virtio-net
 *
 * @n: VirtIONet device
 */
static DeviceState *failover_find_primary_device(VirtIONet *n)
{
    FailoverDevice fdev = {
        .n = n,
    };

    qbus_walk_children(sysbus_get_default(), failover_set_primary, NULL,
                       NULL, NULL, &fdev);
    return fdev.dev;
}
static void failover_add_primary(VirtIONet *n, Error **errp)
{
    Error *err = NULL;
    DeviceState *dev = failover_find_primary_device(n);

    if (dev) {
        return;
    }

    if (!n->primary_opts) {
        error_setg(errp, "Primary device not found");
        error_append_hint(errp, "Virtio-net failover will not work. Make "
                          "sure primary device has parameter"
                          " failover_pair_id=%s\n", n->netclient_name);
        return;
    }

    dev = qdev_device_add_from_qdict(n->primary_opts,
                                     n->primary_opts_from_json,
                                     &err);
    if (err) {
        qobject_unref(n->primary_opts);
        n->primary_opts = NULL;
    } else {
        object_unref(OBJECT(dev));
    }
    error_propagate(errp, err);
}
static void virtio_net_set_features(VirtIODevice *vdev, uint64_t features)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    Error *err = NULL;
    int i;

    if (n->mtu_bypass_backend &&
            !virtio_has_feature(vdev->backend_features, VIRTIO_NET_F_MTU)) {
        features &= ~(1ULL << VIRTIO_NET_F_MTU);
    }

    virtio_net_set_multiqueue(n,
                              virtio_has_feature(features, VIRTIO_NET_F_RSS) ||
                              virtio_has_feature(features, VIRTIO_NET_F_MQ));

    virtio_net_set_mrg_rx_bufs(n,
                               virtio_has_feature(features,
                                                  VIRTIO_NET_F_MRG_RXBUF),
                               virtio_has_feature(features,
                                                  VIRTIO_F_VERSION_1),
                               virtio_has_feature(features,
                                                  VIRTIO_NET_F_HASH_REPORT));

    n->rsc4_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
        virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO4);
    n->rsc6_enabled = virtio_has_feature(features, VIRTIO_NET_F_RSC_EXT) &&
        virtio_has_feature(features, VIRTIO_NET_F_GUEST_TSO6);
    n->rss_data.redirect = virtio_has_feature(features, VIRTIO_NET_F_RSS);

    if (n->has_vnet_hdr) {
        n->curr_guest_offloads =
            virtio_net_guest_offloads_by_features(features);
        virtio_net_apply_guest_offloads(n);
    }

    for (i = 0; i < n->max_queue_pairs; i++) {
        NetClientState *nc = qemu_get_subqueue(n->nic, i);

        if (!get_vhost_net(nc->peer)) {
            continue;
        }
        vhost_net_ack_features(get_vhost_net(nc->peer), features);

        /*
         * keep acked_features in NetVhostUserState up-to-date so it
         * can't miss any features configured by guest virtio driver.
         */
        vhost_net_save_acked_features(nc->peer);
    }
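
    /*
     * Without CTRL_VLAN the guest has no way to program the VLAN filter,
     * so open the filter up and accept every VLAN.  When CTRL_VLAN *is*
     * negotiated the table is deliberately left untouched here: per this
     * commit's subject, set_features must not reset vlan filtering (it can
     * run after virtio_load and would otherwise wipe a migrated table).
     */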
    if (!virtio_has_feature(features, VIRTIO_NET_F_CTRL_VLAN)) {
        memset(n->vlans, 0xff, MAX_VLAN >> 3);
    }

    if (virtio_has_feature(features, VIRTIO_NET_F_STANDBY)) {
        qapi_event_send_failover_negotiated(n->netclient_name);
        qatomic_set(&n->failover_primary_hidden, false);
        failover_add_primary(n, &err);
        if (err) {
            if (!qtest_enabled()) {
                warn_report_err(err);
            } else {
                error_free(err);
            }
        }
    }
}
static int virtio_net_handle_rx_mode(VirtIONet *n, uint8_t cmd,
                                     struct iovec *iov, unsigned int iov_cnt)
{
    uint8_t on;
    size_t s;
    NetClientState *nc = qemu_get_queue(n->nic);

    s = iov_to_buf(iov, iov_cnt, 0, &on, sizeof(on));
    if (s != sizeof(on)) {
        return VIRTIO_NET_ERR;
    }

    if (cmd == VIRTIO_NET_CTRL_RX_PROMISC) {
        n->promisc = on;
    } else if (cmd == VIRTIO_NET_CTRL_RX_ALLMULTI) {
        n->allmulti = on;
    } else if (cmd == VIRTIO_NET_CTRL_RX_ALLUNI) {
        n->alluni = on;
    } else if (cmd == VIRTIO_NET_CTRL_RX_NOMULTI) {
        n->nomulti = on;
    } else if (cmd == VIRTIO_NET_CTRL_RX_NOUNI) {
        n->nouni = on;
    } else if (cmd == VIRTIO_NET_CTRL_RX_NOBCAST) {
        n->nobcast = on;
    } else {
        return VIRTIO_NET_ERR;
    }

    rxfilter_notify(nc);

    return VIRTIO_NET_OK;
}
static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
                                      struct iovec *iov, unsigned int iov_cnt)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    uint64_t offloads;
    size_t s;

    if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
        return VIRTIO_NET_ERR;
    }

    s = iov_to_buf(iov, iov_cnt, 0, &offloads, sizeof(offloads));
    if (s != sizeof(offloads)) {
        return VIRTIO_NET_ERR;
    }

    if (cmd == VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET) {
        uint64_t supported_offloads;

        offloads = virtio_ldq_p(vdev, &offloads);

        if (!n->has_vnet_hdr) {
            return VIRTIO_NET_ERR;
        }

        n->rsc4_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
            virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO4);
        n->rsc6_enabled = virtio_has_feature(offloads, VIRTIO_NET_F_RSC_EXT) &&
            virtio_has_feature(offloads, VIRTIO_NET_F_GUEST_TSO6);
        virtio_clear_feature(&offloads, VIRTIO_NET_F_RSC_EXT);

        supported_offloads = virtio_net_supported_guest_offloads(n);
        if (offloads & ~supported_offloads) {
            return VIRTIO_NET_ERR;
        }

        n->curr_guest_offloads = offloads;
        virtio_net_apply_guest_offloads(n);

        return VIRTIO_NET_OK;
    } else {
        return VIRTIO_NET_ERR;
    }
}
static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
                                 struct iovec *iov, unsigned int iov_cnt)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    struct virtio_net_ctrl_mac mac_data;
    size_t s;
    NetClientState *nc = qemu_get_queue(n->nic);

    if (cmd == VIRTIO_NET_CTRL_MAC_ADDR_SET) {
        if (iov_size(iov, iov_cnt) != sizeof(n->mac)) {
            return VIRTIO_NET_ERR;
        }
        s = iov_to_buf(iov, iov_cnt, 0, &n->mac, sizeof(n->mac));
        assert(s == sizeof(n->mac));
        qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
        rxfilter_notify(nc);

        return VIRTIO_NET_OK;
    }

    if (cmd != VIRTIO_NET_CTRL_MAC_TABLE_SET) {
        return VIRTIO_NET_ERR;
    }

    int in_use = 0;
    int first_multi = 0;
    uint8_t uni_overflow = 0;
    uint8_t multi_overflow = 0;
    uint8_t *macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);

    s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
                   sizeof(mac_data.entries));
    mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
    if (s != sizeof(mac_data.entries)) {
        goto error;
    }
    iov_discard_front(&iov, &iov_cnt, s);

    if (mac_data.entries * ETH_ALEN > iov_size(iov, iov_cnt)) {
        goto error;
    }

    if (mac_data.entries <= MAC_TABLE_ENTRIES) {
        s = iov_to_buf(iov, iov_cnt, 0, macs,
                       mac_data.entries * ETH_ALEN);
        if (s != mac_data.entries * ETH_ALEN) {
            goto error;
        }
        in_use += mac_data.entries;
    } else {
        uni_overflow = 1;
    }

    iov_discard_front(&iov, &iov_cnt, mac_data.entries * ETH_ALEN);

    first_multi = in_use;

    s = iov_to_buf(iov, iov_cnt, 0, &mac_data.entries,
                   sizeof(mac_data.entries));
    mac_data.entries = virtio_ldl_p(vdev, &mac_data.entries);
    if (s != sizeof(mac_data.entries)) {
        goto error;
    }

    iov_discard_front(&iov, &iov_cnt, s);

    if (mac_data.entries * ETH_ALEN != iov_size(iov, iov_cnt)) {
        goto error;
    }

    if (mac_data.entries <= MAC_TABLE_ENTRIES - in_use) {
        s = iov_to_buf(iov, iov_cnt, 0, &macs[in_use * ETH_ALEN],
                       mac_data.entries * ETH_ALEN);
        if (s != mac_data.entries * ETH_ALEN) {
            goto error;
        }
        in_use += mac_data.entries;
    } else {
        multi_overflow = 1;
    }

    n->mac_table.in_use = in_use;
    n->mac_table.first_multi = first_multi;
    n->mac_table.uni_overflow = uni_overflow;
    n->mac_table.multi_overflow = multi_overflow;
    memcpy(n->mac_table.macs, macs, MAC_TABLE_ENTRIES * ETH_ALEN);
    g_free(macs);
    rxfilter_notify(nc);

    return VIRTIO_NET_OK;

error:
    g_free(macs);
    return VIRTIO_NET_ERR;
}
static int virtio_net_handle_vlan_table(VirtIONet *n, uint8_t cmd,
                                        struct iovec *iov, unsigned int iov_cnt)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    uint16_t vid;
    size_t s;
    NetClientState *nc = qemu_get_queue(n->nic);

    s = iov_to_buf(iov, iov_cnt, 0, &vid, sizeof(vid));
    vid = virtio_lduw_p(vdev, &vid);
    if (s != sizeof(vid)) {
        return VIRTIO_NET_ERR;
    }

    if (vid >= MAX_VLAN)
        return VIRTIO_NET_ERR;

    if (cmd == VIRTIO_NET_CTRL_VLAN_ADD)
        n->vlans[vid >> 5] |= (1U << (vid & 0x1f));
    else if (cmd == VIRTIO_NET_CTRL_VLAN_DEL)
        n->vlans[vid >> 5] &= ~(1U << (vid & 0x1f));
    else
        return VIRTIO_NET_ERR;

    rxfilter_notify(nc);

    return VIRTIO_NET_OK;
}
static int virtio_net_handle_announce(VirtIONet *n, uint8_t cmd,
                                      struct iovec *iov, unsigned int iov_cnt)
{
    trace_virtio_net_handle_announce(n->announce_timer.round);
    if (cmd == VIRTIO_NET_CTRL_ANNOUNCE_ACK &&
        n->status & VIRTIO_NET_S_ANNOUNCE) {
        n->status &= ~VIRTIO_NET_S_ANNOUNCE;
        if (n->announce_timer.round) {
            qemu_announce_timer_step(&n->announce_timer);
        }
        return VIRTIO_NET_OK;
    } else {
        return VIRTIO_NET_ERR;
    }
}
static void virtio_net_detach_epbf_rss(VirtIONet *n);

static void virtio_net_disable_rss(VirtIONet *n)
{
    if (n->rss_data.enabled) {
        trace_virtio_net_rss_disable();
    }
    n->rss_data.enabled = false;

    virtio_net_detach_epbf_rss(n);
}

static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
{
    NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
    if (nc == NULL || nc->info->set_steering_ebpf == NULL) {
        return false;
    }

    return nc->info->set_steering_ebpf(nc, prog_fd);
}

static void rss_data_to_rss_config(struct VirtioNetRssData *data,
                                   struct EBPFRSSConfig *config)
{
    config->redirect = data->redirect;
    config->populate_hash = data->populate_hash;
    config->hash_types = data->hash_types;
    config->indirections_len = data->indirections_len;
    config->default_queue = data->default_queue;
}

static bool virtio_net_attach_epbf_rss(VirtIONet *n)
{
    struct EBPFRSSConfig config = {};

    if (!ebpf_rss_is_loaded(&n->ebpf_rss)) {
        return false;
    }

    rss_data_to_rss_config(&n->rss_data, &config);

    if (!ebpf_rss_set_all(&n->ebpf_rss, &config,
                          n->rss_data.indirections_table, n->rss_data.key)) {
        return false;
    }

    if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
        return false;
    }

    return true;
}

static void virtio_net_detach_epbf_rss(VirtIONet *n)
{
    virtio_net_attach_ebpf_to_backend(n->nic, -1);
}

static bool virtio_net_load_ebpf(VirtIONet *n)
{
    if (!virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
        /* backend doesn't support steering ebpf */
        return false;
    }

    return ebpf_rss_load(&n->ebpf_rss);
}

static void virtio_net_unload_ebpf(VirtIONet *n)
{
    virtio_net_attach_ebpf_to_backend(n->nic, -1);
    ebpf_rss_unload(&n->ebpf_rss);
}
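
/* The VIRTIO_NET_CTRL_MQ_{RSS,HASH}_CONFIG payload is parsed below in four
 * pieces: the fixed virtio_net_rss_config header up to the indirection
 * table, the variable-length indirection table itself, a packed
 * {max_tx_vq, hash_key_length} pair, and finally the hash key bytes. */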
static uint16_t virtio_net_handle_rss(VirtIONet *n,
                                      struct iovec *iov,
                                      unsigned int iov_cnt,
                                      bool do_rss)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    struct virtio_net_rss_config cfg;
    size_t s, offset = 0, size_get;
    uint16_t queue_pairs, i;
    struct {
        uint16_t us;
        uint8_t b;
    } QEMU_PACKED temp;
    const char *err_msg = "";
    uint32_t err_value = 0;

    if (do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_RSS)) {
        err_msg = "RSS is not negotiated";
        goto error;
    }
    if (!do_rss && !virtio_vdev_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT)) {
        err_msg = "Hash report is not negotiated";
        goto error;
    }
    size_get = offsetof(struct virtio_net_rss_config, indirection_table);
    s = iov_to_buf(iov, iov_cnt, offset, &cfg, size_get);
    if (s != size_get) {
        err_msg = "Short command buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    n->rss_data.hash_types = virtio_ldl_p(vdev, &cfg.hash_types);
    n->rss_data.indirections_len =
        virtio_lduw_p(vdev, &cfg.indirection_table_mask);
    n->rss_data.indirections_len++;
    if (!do_rss) {
        n->rss_data.indirections_len = 1;
    }
    if (!is_power_of_2(n->rss_data.indirections_len)) {
        err_msg = "Invalid size of indirection table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    if (n->rss_data.indirections_len > VIRTIO_NET_RSS_MAX_TABLE_LEN) {
        err_msg = "Too large indirection table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    n->rss_data.default_queue = do_rss ?
        virtio_lduw_p(vdev, &cfg.unclassified_queue) : 0;
    if (n->rss_data.default_queue >= n->max_queue_pairs) {
        err_msg = "Invalid default queue";
        err_value = n->rss_data.default_queue;
        goto error;
    }
    offset += size_get;
    size_get = sizeof(uint16_t) * n->rss_data.indirections_len;
    g_free(n->rss_data.indirections_table);
    n->rss_data.indirections_table = g_malloc(size_get);
    if (!n->rss_data.indirections_table) {
        err_msg = "Can't allocate indirections table";
        err_value = n->rss_data.indirections_len;
        goto error;
    }
    s = iov_to_buf(iov, iov_cnt, offset,
                   n->rss_data.indirections_table, size_get);
    if (s != size_get) {
        err_msg = "Short indirection table buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    for (i = 0; i < n->rss_data.indirections_len; ++i) {
        uint16_t val = n->rss_data.indirections_table[i];
        n->rss_data.indirections_table[i] = virtio_lduw_p(vdev, &val);
    }
    offset += size_get;
    size_get = sizeof(temp);
    s = iov_to_buf(iov, iov_cnt, offset, &temp, size_get);
    if (s != size_get) {
        err_msg = "Can't get queue_pairs";
        err_value = (uint32_t)s;
        goto error;
    }
    queue_pairs = do_rss ? virtio_lduw_p(vdev, &temp.us) : n->curr_queue_pairs;
    if (queue_pairs == 0 || queue_pairs > n->max_queue_pairs) {
        err_msg = "Invalid number of queue_pairs";
        err_value = queue_pairs;
        goto error;
    }
    if (temp.b > VIRTIO_NET_RSS_MAX_KEY_SIZE) {
        err_msg = "Invalid key size";
        err_value = temp.b;
        goto error;
    }
    if (!temp.b && n->rss_data.hash_types) {
        err_msg = "No key provided";
        err_value = 0;
        goto error;
    }
    if (!temp.b && !n->rss_data.hash_types) {
        virtio_net_disable_rss(n);
        return queue_pairs;
    }
    offset += size_get;
    size_get = temp.b;
    s = iov_to_buf(iov, iov_cnt, offset, n->rss_data.key, size_get);
    if (s != size_get) {
        err_msg = "Can't get key buffer";
        err_value = (uint32_t)s;
        goto error;
    }
    n->rss_data.enabled = true;

    if (!n->rss_data.populate_hash) {
        if (!virtio_net_attach_epbf_rss(n)) {
            /* EBPF must be loaded for vhost */
            if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
                warn_report("Can't load eBPF RSS for vhost");
                goto error;
            }
            /* fallback to software RSS */
            warn_report("Can't load eBPF RSS - fallback to software RSS");
            n->rss_data.enabled_software_rss = true;
        }
    } else {
        /* use software RSS for hash populating */
        /* and detach eBPF if it was loaded before */
        virtio_net_detach_epbf_rss(n);
        n->rss_data.enabled_software_rss = true;
    }

    trace_virtio_net_rss_enable(n->rss_data.hash_types,
                                n->rss_data.indirections_len,
                                temp.b);
    return queue_pairs;
error:
    trace_virtio_net_rss_error(err_msg, err_value);
    virtio_net_disable_rss(n);
    return 0;
}
static int virtio_net_handle_mq(VirtIONet *n, uint8_t cmd,
                                struct iovec *iov, unsigned int iov_cnt)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    uint16_t queue_pairs;
    NetClientState *nc = qemu_get_queue(n->nic);

    virtio_net_disable_rss(n);
    if (cmd == VIRTIO_NET_CTRL_MQ_HASH_CONFIG) {
        queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, false);
        return queue_pairs ? VIRTIO_NET_OK : VIRTIO_NET_ERR;
    }
    if (cmd == VIRTIO_NET_CTRL_MQ_RSS_CONFIG) {
        queue_pairs = virtio_net_handle_rss(n, iov, iov_cnt, true);
    } else if (cmd == VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET) {
        struct virtio_net_ctrl_mq mq;
        size_t s;
        if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ)) {
            return VIRTIO_NET_ERR;
        }
        s = iov_to_buf(iov, iov_cnt, 0, &mq, sizeof(mq));
        if (s != sizeof(mq)) {
            return VIRTIO_NET_ERR;
        }
        queue_pairs = virtio_lduw_p(vdev, &mq.virtqueue_pairs);
    } else {
        return VIRTIO_NET_ERR;
    }

    if (queue_pairs < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
        queue_pairs > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX ||
        queue_pairs > n->max_queue_pairs ||
        !n->multiqueue) {
        return VIRTIO_NET_ERR;
    }

    n->curr_queue_pairs = queue_pairs;
    if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
        /*
         * Avoid updating the backend for a vdpa device: We're only interested
         * in updating the device model queues.
         */
        return VIRTIO_NET_OK;
    }
    /* stop the backend before changing the number of queue_pairs to avoid
     * handling a disabled queue */
    virtio_net_set_status(vdev, vdev->status);
    virtio_net_set_queue_pairs(n);

    return VIRTIO_NET_OK;
}
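
/* Note for the ctrl handler below: the out_sg vector is duplicated with
 * g_memdup2() because iov_discard_front() modifies the iovec array in
 * place while the caller still owns the original elements. */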
size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
                                  const struct iovec *in_sg, unsigned in_num,
                                  const struct iovec *out_sg,
                                  unsigned out_num)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    struct virtio_net_ctrl_hdr ctrl;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    size_t s;
    struct iovec *iov, *iov2;

    if (iov_size(in_sg, in_num) < sizeof(status) ||
        iov_size(out_sg, out_num) < sizeof(ctrl)) {
        virtio_error(vdev, "virtio-net ctrl missing headers");
        return 0;
    }

    iov2 = iov = g_memdup2(out_sg, sizeof(struct iovec) * out_num);
    s = iov_to_buf(iov, out_num, 0, &ctrl, sizeof(ctrl));
    iov_discard_front(&iov, &out_num, sizeof(ctrl));
    if (s != sizeof(ctrl)) {
        status = VIRTIO_NET_ERR;
    } else if (ctrl.class == VIRTIO_NET_CTRL_RX) {
        status = virtio_net_handle_rx_mode(n, ctrl.cmd, iov, out_num);
    } else if (ctrl.class == VIRTIO_NET_CTRL_MAC) {
        status = virtio_net_handle_mac(n, ctrl.cmd, iov, out_num);
    } else if (ctrl.class == VIRTIO_NET_CTRL_VLAN) {
        status = virtio_net_handle_vlan_table(n, ctrl.cmd, iov, out_num);
    } else if (ctrl.class == VIRTIO_NET_CTRL_ANNOUNCE) {
        status = virtio_net_handle_announce(n, ctrl.cmd, iov, out_num);
    } else if (ctrl.class == VIRTIO_NET_CTRL_MQ) {
        status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
    } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
        status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
    }

    s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
    assert(s == sizeof(status));

    g_free(iov2);
    return sizeof(status);
}
static void virtio_net_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement *elem;

    for (;;) {
        size_t written;
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            break;
        }

        written = virtio_net_handle_ctrl_iov(vdev, elem->in_sg, elem->in_num,
                                             elem->out_sg, elem->out_num);
        if (written > 0) {
            virtqueue_push(vq, elem, written);
            virtio_notify(vdev, vq);
            g_free(elem);
        } else {
            virtqueue_detach_element(vq, elem, 0);
            g_free(elem);
            break;
        }
    }
}

/* RX */
static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIONet *n = VIRTIO_NET(vdev);
    int queue_index = vq2q(virtio_get_queue_index(vq));

    qemu_flush_queued_packets(qemu_get_subqueue(n->nic, queue_index));
}

static bool virtio_net_can_receive(NetClientState *nc)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    VirtIONetQueue *q = virtio_net_get_subqueue(nc);

    if (!vdev->vm_running) {
        return false;
    }

    if (nc->queue_index >= n->curr_queue_pairs) {
        return false;
    }

    if (!virtio_queue_ready(q->rx_vq) ||
        !(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
        return false;
    }

    return true;
}

static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize)
{
    VirtIONet *n = q->n;
    if (virtio_queue_empty(q->rx_vq) ||
        (n->mergeable_rx_bufs &&
         !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
        virtio_queue_set_notification(q->rx_vq, 1);

        /* To avoid a race condition where the guest has made some buffers
         * available after the above check but before notification was
         * enabled, check for available buffers again.
         */
        if (virtio_queue_empty(q->rx_vq) ||
            (n->mergeable_rx_bufs &&
             !virtqueue_avail_bytes(q->rx_vq, bufsize, 0))) {
            return 0;
        }
    }

    virtio_queue_set_notification(q->rx_vq, 0);
    return 1;
}
static void virtio_net_hdr_swap(VirtIODevice *vdev, struct virtio_net_hdr *hdr)
{
    virtio_tswap16s(vdev, &hdr->hdr_len);
    virtio_tswap16s(vdev, &hdr->gso_size);
    virtio_tswap16s(vdev, &hdr->csum_start);
    virtio_tswap16s(vdev, &hdr->csum_offset);
}

/* dhclient uses AF_PACKET but doesn't pass auxdata to the kernel so
 * it never finds out that the packets don't have valid checksums.  This
 * causes dhclient to get upset.  Fedora's carried a patch for ages to
 * fix this with Xen but it hasn't appeared in an upstream release of
 * dhclient yet.
 *
 * To avoid breaking existing guests, we catch udp packets and add
 * checksums.  This is terrible but it's better than hacking the guest
 * kernels.
 *
 * N.B. if we introduce a zero-copy API, this operation is no longer free so
 * we should provide a mechanism to disable it to avoid polluting the host
 * cache.
 */
static void work_around_broken_dhclient(struct virtio_net_hdr *hdr,
                                        uint8_t *buf, size_t size)
{
    if ((hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && /* missing csum */
        (size > 27 && size < 1500) && /* normal sized MTU */
        (buf[12] == 0x08 && buf[13] == 0x00) && /* ethertype == IPv4 */
        (buf[23] == 17) && /* ip.protocol == UDP */
        (buf[34] == 0 && buf[35] == 67)) { /* udp.srcport == bootps */
        net_checksum_calculate(buf, size, CSUM_UDP);
        hdr->flags &= ~VIRTIO_NET_HDR_F_NEEDS_CSUM;
    }
}
static void receive_header(VirtIONet *n, const struct iovec *iov, int iov_cnt,
                           const void *buf, size_t size)
{
    if (n->has_vnet_hdr) {
        /* FIXME this cast is evil */
        void *wbuf = (void *)buf;
        work_around_broken_dhclient(wbuf, wbuf + n->host_hdr_len,
                                    size - n->host_hdr_len);

        if (n->needs_vnet_hdr_swap) {
            virtio_net_hdr_swap(VIRTIO_DEVICE(n), wbuf);
        }
        iov_from_buf(iov, iov_cnt, 0, buf, sizeof(struct virtio_net_hdr));
    } else {
        struct virtio_net_hdr hdr = {
            .flags = 0,
            .gso_type = VIRTIO_NET_HDR_GSO_NONE
        };
        iov_from_buf(iov, iov_cnt, 0, &hdr, sizeof hdr);
    }
}
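
/* receive_filter() returns 1 to accept a frame and 0 to drop it:
 * promiscuous mode short-circuits everything, then the VLAN filter is
 * checked, and finally the multicast/unicast mode flags and the MAC
 * table decide. */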
static int receive_filter(VirtIONet *n, const uint8_t *buf, int size)
{
    static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    static const uint8_t vlan[] = {0x81, 0x00};
    uint8_t *ptr = (uint8_t *)buf;
    int i;

    if (n->promisc)
        return 1;

    ptr += n->host_hdr_len;

    if (!memcmp(&ptr[12], vlan, sizeof(vlan))) {
        int vid = lduw_be_p(ptr + 14) & 0xfff;
        if (!(n->vlans[vid >> 5] & (1U << (vid & 0x1f))))
            return 0;
    }

    if (ptr[0] & 1) { // multicast
        if (!memcmp(ptr, bcast, sizeof(bcast))) {
            return !n->nobcast;
        } else if (n->nomulti) {
            return 0;
        } else if (n->allmulti || n->mac_table.multi_overflow) {
            return 1;
        }

        for (i = n->mac_table.first_multi; i < n->mac_table.in_use; i++) {
            if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
                return 1;
            }
        }
    } else { // unicast
        if (n->nouni) {
            return 0;
        } else if (n->alluni || n->mac_table.uni_overflow) {
            return 1;
        } else if (!memcmp(ptr, n->mac, ETH_ALEN)) {
            return 1;
        }

        for (i = 0; i < n->mac_table.first_multi; i++) {
            if (!memcmp(ptr, &n->mac_table.macs[i * ETH_ALEN], ETH_ALEN)) {
                return 1;
            }
        }
    }

    return 0;
}
static uint8_t virtio_net_get_hash_type(bool hasip4,
                                        bool hasip6,
                                        EthL4HdrProto l4hdr_proto,
                                        uint32_t types)
{
    if (hasip4) {
        switch (l4hdr_proto) {
        case ETH_L4_HDR_PROTO_TCP:
            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) {
                return NetPktRssIpV4Tcp;
            }
            break;

        case ETH_L4_HDR_PROTO_UDP:
            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) {
                return NetPktRssIpV4Udp;
            }
            break;

        default:
            break;
        }

        if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
            return NetPktRssIpV4;
        }
    } else if (hasip6) {
        switch (l4hdr_proto) {
        case ETH_L4_HDR_PROTO_TCP:
            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) {
                return NetPktRssIpV6TcpEx;
            }
            if (types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) {
                return NetPktRssIpV6Tcp;
            }
            break;

        case ETH_L4_HDR_PROTO_UDP:
            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) {
                return NetPktRssIpV6UdpEx;
            }
            if (types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) {
                return NetPktRssIpV6Udp;
            }
            break;

        default:
            break;
        }

        if (types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) {
            return NetPktRssIpV6Ex;
        }
        if (types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
            return NetPktRssIpV6;
        }
    }
    return 0xff;
}
static void virtio_set_packet_hash(const uint8_t *buf, uint8_t report,
                                   uint32_t hash)
{
    struct virtio_net_hdr_v1_hash *hdr = (void *)buf;
    hdr->hash_value = hash;
    hdr->hash_report = report;
}
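
/* Software RSS: classify the packet, optionally write the hash into the
 * virtio_net_hdr_v1_hash header, and map the hash through the indirection
 * table.  Returns the target queue index, or -1 if the packet should stay
 * on the queue it arrived on. */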
static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    unsigned int index = nc->queue_index, new_index = index;
    struct NetRxPkt *pkt = n->rx_pkt;
    uint8_t net_hash_type;
    uint32_t hash;
    bool hasip4, hasip6;
    EthL4HdrProto l4hdr_proto;
    static const uint8_t reports[NetPktRssIpV6UdpEx + 1] = {
        VIRTIO_NET_HASH_REPORT_IPv4,
        VIRTIO_NET_HASH_REPORT_TCPv4,
        VIRTIO_NET_HASH_REPORT_TCPv6,
        VIRTIO_NET_HASH_REPORT_IPv6,
        VIRTIO_NET_HASH_REPORT_IPv6_EX,
        VIRTIO_NET_HASH_REPORT_TCPv6_EX,
        VIRTIO_NET_HASH_REPORT_UDPv4,
        VIRTIO_NET_HASH_REPORT_UDPv6,
        VIRTIO_NET_HASH_REPORT_UDPv6_EX
    };
    struct iovec iov = {
        .iov_base = (void *)buf,
        .iov_len = size
    };

    net_rx_pkt_set_protocols(pkt, &iov, 1, n->host_hdr_len);
    net_rx_pkt_get_protocols(pkt, &hasip4, &hasip6, &l4hdr_proto);
    net_hash_type = virtio_net_get_hash_type(hasip4, hasip6, l4hdr_proto,
                                             n->rss_data.hash_types);
    if (net_hash_type > NetPktRssIpV6UdpEx) {
        if (n->rss_data.populate_hash) {
            virtio_set_packet_hash(buf, VIRTIO_NET_HASH_REPORT_NONE, 0);
        }
        return n->rss_data.redirect ? n->rss_data.default_queue : -1;
    }

    hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);

    if (n->rss_data.populate_hash) {
        virtio_set_packet_hash(buf, reports[net_hash_type], hash);
    }

    if (n->rss_data.redirect) {
        new_index = hash & (n->rss_data.indirections_len - 1);
        new_index = n->rss_data.indirections_table[new_index];
    }

    return (index == new_index) ? -1 : new_index;
}
static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
                                      size_t size, bool no_rss)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIONetQueue *q = virtio_net_get_subqueue(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
    size_t lens[VIRTQUEUE_MAX_SIZE];
    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
    struct virtio_net_hdr_mrg_rxbuf mhdr;
    unsigned mhdr_cnt = 0;
    size_t offset, i, guest_offset, j;
    ssize_t err;

    if (!virtio_net_can_receive(nc)) {
        return -1;
    }

    if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
        int index = virtio_net_process_rss(nc, buf, size);
        if (index >= 0) {
            NetClientState *nc2 = qemu_get_subqueue(n->nic, index);
            return virtio_net_receive_rcu(nc2, buf, size, true);
        }
    }

    /* hdr_len refers to the header we supply to the guest */
    if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
        return 0;
    }

    if (!receive_filter(n, buf, size))
        return size;

    offset = i = 0;

    while (offset < size) {
        VirtQueueElement *elem;
        int len, total;
        const struct iovec *sg;

        total = 0;

        if (i == VIRTQUEUE_MAX_SIZE) {
            virtio_error(vdev, "virtio-net unexpected long buffer chain");
            err = size;
            goto err;
        }

        elem = virtqueue_pop(q->rx_vq, sizeof(VirtQueueElement));
        if (!elem) {
            if (i) {
                virtio_error(vdev, "virtio-net unexpected empty queue: "
                             "i %zd mergeable %d offset %zd, size %zd, "
                             "guest hdr len %zd, host hdr len %zd "
                             "guest features 0x%" PRIx64,
                             i, n->mergeable_rx_bufs, offset, size,
                             n->guest_hdr_len, n->host_hdr_len,
                             vdev->guest_features);
            }
            err = -1;
            goto err;
        }

        if (elem->in_num < 1) {
            virtio_error(vdev,
                         "virtio-net receive queue contains no in buffers");
            virtqueue_detach_element(q->rx_vq, elem, 0);
            g_free(elem);
            err = -1;
            goto err;
        }

        sg = elem->in_sg;
        if (i == 0) {
            assert(offset == 0);
            if (n->mergeable_rx_bufs) {
                mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
                                    sg, elem->in_num,
                                    offsetof(typeof(mhdr), num_buffers),
                                    sizeof(mhdr.num_buffers));
            }

            receive_header(n, sg, elem->in_num, buf, size);
            if (n->rss_data.populate_hash) {
                offset = sizeof(mhdr);
                iov_from_buf(sg, elem->in_num, offset,
                             buf + offset, n->host_hdr_len - sizeof(mhdr));
            }
            offset = n->host_hdr_len;
            total += n->guest_hdr_len;
            guest_offset = n->guest_hdr_len;
        } else {
            guest_offset = 0;
        }

        /* copy in packet.  ugh */
        len = iov_from_buf(sg, elem->in_num, guest_offset,
                           buf + offset, size - offset);
        total += len;
        offset += len;
        /* If buffers can't be merged, at this point we
         * must have consumed the complete packet.
         * Otherwise, drop it. */
        if (!n->mergeable_rx_bufs && offset < size) {
            virtqueue_unpop(q->rx_vq, elem, total);
            g_free(elem);
            err = size;
            goto err;
        }

        elems[i] = elem;
        lens[i] = total;
        i++;
    }

    if (mhdr_cnt) {
        virtio_stw_p(vdev, &mhdr.num_buffers, i);
        iov_from_buf(mhdr_sg, mhdr_cnt,
                     0,
                     &mhdr.num_buffers, sizeof mhdr.num_buffers);
    }

    for (j = 0; j < i; j++) {
        /* signal other side */
        virtqueue_fill(q->rx_vq, elems[j], lens[j], j);
        g_free(elems[j]);
    }

    virtqueue_flush(q->rx_vq, i);
    virtio_notify(vdev, q->rx_vq);

    return size;

err:
    for (j = 0; j < i; j++) {
        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
        g_free(elems[j]);
    }

    return err;
}
static ssize_t virtio_net_do_receive(NetClientState *nc, const uint8_t *buf,
                                     size_t size)
{
    RCU_READ_LOCK_GUARD();

    return virtio_net_receive_rcu(nc, buf, size, false);
}
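
/* For RSC, the TCP header length is recovered from th_offset_flags: the
 * data offset lives in the top 4 bits (a count of 32-bit words), so
 * masking with 0xF000 and shifting right by 10 (i.e. >>12 then *4)
 * yields the header length in bytes. */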
2039 static void virtio_net_rsc_extract_unit4(VirtioNetRscChain *chain,
2040 const uint8_t *buf,
2041 VirtioNetRscUnit *unit)
2043 uint16_t ip_hdrlen;
2044 struct ip_header *ip;
2046 ip = (struct ip_header *)(buf + chain->n->guest_hdr_len
2047 + sizeof(struct eth_header));
2048 unit->ip = (void *)ip;
2049 ip_hdrlen = (ip->ip_ver_len & 0xF) << 2;
2050 unit->ip_plen = &ip->ip_len;
2051 unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip) + ip_hdrlen);
2052 unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2053 unit->payload = htons(*unit->ip_plen) - ip_hdrlen - unit->tcp_hdrlen;
2056 static void virtio_net_rsc_extract_unit6(VirtioNetRscChain *chain,
2057 const uint8_t *buf,
2058 VirtioNetRscUnit *unit)
2060 struct ip6_header *ip6;
2062 ip6 = (struct ip6_header *)(buf + chain->n->guest_hdr_len
2063 + sizeof(struct eth_header));
2064 unit->ip = ip6;
2065 unit->ip_plen = &(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2066 unit->tcp = (struct tcp_header *)(((uint8_t *)unit->ip)
2067 + sizeof(struct ip6_header));
2068 unit->tcp_hdrlen = (htons(unit->tcp->th_offset_flags) & 0xF000) >> 10;
2070 /* There is a difference between the payload length in IPv4 and IPv6:
2071 the IP header is excluded in IPv6 */
2072 unit->payload = htons(*unit->ip_plen) - unit->tcp_hdrlen;
2075 static size_t virtio_net_rsc_drain_seg(VirtioNetRscChain *chain,
2076 VirtioNetRscSeg *seg)
2078 int ret;
2079 struct virtio_net_hdr_v1 *h;
2081 h = (struct virtio_net_hdr_v1 *)seg->buf;
2082 h->flags = 0;
2083 h->gso_type = VIRTIO_NET_HDR_GSO_NONE;
2085 if (seg->is_coalesced) {
2086 h->rsc.segments = seg->packets;
2087 h->rsc.dup_acks = seg->dup_ack;
2088 h->flags = VIRTIO_NET_HDR_F_RSC_INFO;
2089 if (chain->proto == ETH_P_IP) {
2090 h->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2091 } else {
2092 h->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2096 ret = virtio_net_do_receive(seg->nc, seg->buf, seg->size);
2097 QTAILQ_REMOVE(&chain->buffers, seg, next);
2098 g_free(seg->buf);
2099 g_free(seg);
2101 return ret;
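/*
 * Timer callback: flush every cached segment on the chain; if some
 * buffers survive a failed drain, re-arm the timer so they are retried
 * after another rsc_timeout interval.
 */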
2104 static void virtio_net_rsc_purge(void *opq)
2106 VirtioNetRscSeg *seg, *rn;
2107 VirtioNetRscChain *chain = (VirtioNetRscChain *)opq;
2109 QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn) {
2110 if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2111 chain->stat.purge_failed++;
2112 continue;
2116 chain->stat.timer++;
2117 if (!QTAILQ_EMPTY(&chain->buffers)) {
2118 timer_mod(chain->drain_timer,
2119 qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2123 static void virtio_net_rsc_cleanup(VirtIONet *n)
2125 VirtioNetRscChain *chain, *rn_chain;
2126 VirtioNetRscSeg *seg, *rn_seg;
2128 QTAILQ_FOREACH_SAFE(chain, &n->rsc_chains, next, rn_chain) {
2129 QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, rn_seg) {
2130 QTAILQ_REMOVE(&chain->buffers, seg, next);
2131 g_free(seg->buf);
2132 g_free(seg);
2135 timer_free(chain->drain_timer);
2136 QTAILQ_REMOVE(&n->rsc_chains, chain, next);
2137 g_free(chain);
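/*
 * Cache an incoming packet on the chain.  The segment buffer is sized
 * for the largest possible coalesced result (headers plus
 * VIRTIO_NET_MAX_TCP_PAYLOAD) so that later merges can append in place.
 */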
2141 static void virtio_net_rsc_cache_buf(VirtioNetRscChain *chain,
2142 NetClientState *nc,
2143 const uint8_t *buf, size_t size)
2145 uint16_t hdr_len;
2146 VirtioNetRscSeg *seg;
2148 hdr_len = chain->n->guest_hdr_len;
2149 seg = g_new(VirtioNetRscSeg, 1);
2150 seg->buf = g_malloc(hdr_len + sizeof(struct eth_header)
2151 + sizeof(struct ip6_header) + VIRTIO_NET_MAX_TCP_PAYLOAD);
2152 memcpy(seg->buf, buf, size);
2153 seg->size = size;
2154 seg->packets = 1;
2155 seg->dup_ack = 0;
2156 seg->is_coalesced = 0;
2157 seg->nc = nc;
2159 QTAILQ_INSERT_TAIL(&chain->buffers, seg, next);
2160 chain->stat.cache++;
2162 switch (chain->proto) {
2163 case ETH_P_IP:
2164 virtio_net_rsc_extract_unit4(chain, seg->buf, &seg->unit);
2165 break;
2166 case ETH_P_IPV6:
2167 virtio_net_rsc_extract_unit6(chain, seg->buf, &seg->unit);
2168 break;
2169 default:
2170 g_assert_not_reached();
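/*
 * Pure-ACK handling: a duplicate ack or an out-of-window ack finalizes
 * the cached segment, while a window update is folded into it and
 * coalescing continues.
 */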
2174 static int32_t virtio_net_rsc_handle_ack(VirtioNetRscChain *chain,
2175 VirtioNetRscSeg *seg,
2176 const uint8_t *buf,
2177 struct tcp_header *n_tcp,
2178 struct tcp_header *o_tcp)
2180 uint32_t nack, oack;
2181 uint16_t nwin, owin;
2183 nack = htonl(n_tcp->th_ack);
2184 nwin = htons(n_tcp->th_win);
2185 oack = htonl(o_tcp->th_ack);
2186 owin = htons(o_tcp->th_win);
2188 if ((nack - oack) >= VIRTIO_NET_MAX_TCP_PAYLOAD) {
2189 chain->stat.ack_out_of_win++;
2190 return RSC_FINAL;
2191 } else if (nack == oack) {
2192 /* duplicated ack or window probe */
2193 if (nwin == owin) {
2194 /* duplicated ack; bump the dup-ack count (the WHQL test allows up to 1) */
2195 chain->stat.dup_ack++;
2196 return RSC_FINAL;
2197 } else {
2198 /* Coalesce window update */
2199 o_tcp->th_win = n_tcp->th_win;
2200 chain->stat.win_update++;
2201 return RSC_COALESCE;
2203 } else {
2204 /* pure ack, go to 'C', finalize */
2205 chain->stat.pure_ack++;
2206 return RSC_FINAL;
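/*
 * Decide whether the new segment extends the cached one.  Equal
 * sequence numbers fall through to ACK handling; a sequence advance
 * that exactly matches the cached payload is contiguous data and gets
 * appended; anything else is treated as out of order and finalized.
 */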
2210 static int32_t virtio_net_rsc_coalesce_data(VirtioNetRscChain *chain,
2211 VirtioNetRscSeg *seg,
2212 const uint8_t *buf,
2213 VirtioNetRscUnit *n_unit)
2215 void *data;
2216 uint16_t o_ip_len;
2217 uint32_t nseq, oseq;
2218 VirtioNetRscUnit *o_unit;
2220 o_unit = &seg->unit;
2221 o_ip_len = htons(*o_unit->ip_plen);
2222 nseq = htonl(n_unit->tcp->th_seq);
2223 oseq = htonl(o_unit->tcp->th_seq);
2225 /* out of order or retransmitted. */
2226 if ((nseq - oseq) > VIRTIO_NET_MAX_TCP_PAYLOAD) {
2227 chain->stat.data_out_of_win++;
2228 return RSC_FINAL;
2231 data = ((uint8_t *)n_unit->tcp) + n_unit->tcp_hdrlen;
2232 if (nseq == oseq) {
2233 if ((o_unit->payload == 0) && n_unit->payload) {
2234 /* From no payload to payload: the normal case, not a dup ack etc. */
2235 chain->stat.data_after_pure_ack++;
2236 goto coalesce;
2237 } else {
2238 return virtio_net_rsc_handle_ack(chain, seg, buf,
2239 n_unit->tcp, o_unit->tcp);
2241 } else if ((nseq - oseq) != o_unit->payload) {
2242 /* Not a consistent packet, out of order */
2243 chain->stat.data_out_of_order++;
2244 return RSC_FINAL;
2245 } else {
2246 coalesce:
2247 if ((o_ip_len + n_unit->payload) > chain->max_payload) {
2248 chain->stat.over_size++;
2249 return RSC_FINAL;
2252 /* The data is in order; the payload length field differs between v4/v6,
2253 so use the field value to update and record the new data length */
2254 o_unit->payload += n_unit->payload; /* update new data len */
2256 /* update field in ip header */
2257 *o_unit->ip_plen = htons(o_ip_len + n_unit->payload);
2259 /* Carry the 'PUSH' flag over; the WHQL test guide says 'PUSH' can be
2260 coalesced for Windows guests, while this may change the behavior for
2261 Linux guests (only if they use the RSC feature). */
2262 o_unit->tcp->th_offset_flags = n_unit->tcp->th_offset_flags;
2264 o_unit->tcp->th_ack = n_unit->tcp->th_ack;
2265 o_unit->tcp->th_win = n_unit->tcp->th_win;
2267 memmove(seg->buf + seg->size, data, n_unit->payload);
2268 seg->size += n_unit->payload;
2269 seg->packets++;
2270 chain->stat.coalesced++;
2271 return RSC_COALESCE;
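/*
 * Flow matching: a packet belongs to a cached segment only if the
 * source/destination addresses and the TCP ports all match (the XOR
 * and memcmp tests below are plain equality checks).
 */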
2275 static int32_t virtio_net_rsc_coalesce4(VirtioNetRscChain *chain,
2276 VirtioNetRscSeg *seg,
2277 const uint8_t *buf, size_t size,
2278 VirtioNetRscUnit *unit)
2280 struct ip_header *ip1, *ip2;
2282 ip1 = (struct ip_header *)(unit->ip);
2283 ip2 = (struct ip_header *)(seg->unit.ip);
2284 if ((ip1->ip_src ^ ip2->ip_src) || (ip1->ip_dst ^ ip2->ip_dst)
2285 || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2286 || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2287 chain->stat.no_match++;
2288 return RSC_NO_MATCH;
2291 return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2294 static int32_t virtio_net_rsc_coalesce6(VirtioNetRscChain *chain,
2295 VirtioNetRscSeg *seg,
2296 const uint8_t *buf, size_t size,
2297 VirtioNetRscUnit *unit)
2299 struct ip6_header *ip1, *ip2;
2301 ip1 = (struct ip6_header *)(unit->ip);
2302 ip2 = (struct ip6_header *)(seg->unit.ip);
2303 if (memcmp(&ip1->ip6_src, &ip2->ip6_src, sizeof(struct in6_address))
2304 || memcmp(&ip1->ip6_dst, &ip2->ip6_dst, sizeof(struct in6_address))
2305 || (unit->tcp->th_sport ^ seg->unit.tcp->th_sport)
2306 || (unit->tcp->th_dport ^ seg->unit.tcp->th_dport)) {
2307 chain->stat.no_match++;
2308 return RSC_NO_MATCH;
2311 return virtio_net_rsc_coalesce_data(chain, seg, buf, unit);
2314 /* Packets with 'SYN' should bypass; packets with other control flags
2315 * should be sent after drain to prevent out-of-order delivery */
2316 static int virtio_net_rsc_tcp_ctrl_check(VirtioNetRscChain *chain,
2317 struct tcp_header *tcp)
2319 uint16_t tcp_hdr;
2320 uint16_t tcp_flag;
2322 tcp_flag = htons(tcp->th_offset_flags);
2323 tcp_hdr = (tcp_flag & VIRTIO_NET_TCP_HDR_LENGTH) >> 10;
2324 tcp_flag &= VIRTIO_NET_TCP_FLAG;
2325 if (tcp_flag & TH_SYN) {
2326 chain->stat.tcp_syn++;
2327 return RSC_BYPASS;
2330 if (tcp_flag & (TH_FIN | TH_URG | TH_RST | TH_ECE | TH_CWR)) {
2331 chain->stat.tcp_ctrl_drain++;
2332 return RSC_FINAL;
2335 if (tcp_hdr > sizeof(struct tcp_header)) {
2336 chain->stat.tcp_all_opt++;
2337 return RSC_FINAL;
2340 return RSC_CANDIDATE;
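/*
 * Main coalescing entry point: an empty chain just caches the packet
 * and arms the drain timer; otherwise the packet is matched against
 * each cached segment and either merged, delivered, or cached as the
 * start of a new flow.
 */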
2343 static size_t virtio_net_rsc_do_coalesce(VirtioNetRscChain *chain,
2344 NetClientState *nc,
2345 const uint8_t *buf, size_t size,
2346 VirtioNetRscUnit *unit)
2348 int ret;
2349 VirtioNetRscSeg *seg, *nseg;
2351 if (QTAILQ_EMPTY(&chain->buffers)) {
2352 chain->stat.empty_cache++;
2353 virtio_net_rsc_cache_buf(chain, nc, buf, size);
2354 timer_mod(chain->drain_timer,
2355 qemu_clock_get_ns(QEMU_CLOCK_HOST) + chain->n->rsc_timeout);
2356 return size;
2359 QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2360 if (chain->proto == ETH_P_IP) {
2361 ret = virtio_net_rsc_coalesce4(chain, seg, buf, size, unit);
2362 } else {
2363 ret = virtio_net_rsc_coalesce6(chain, seg, buf, size, unit);
2366 if (ret == RSC_FINAL) {
2367 if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2368 /* Send failed */
2369 chain->stat.final_failed++;
2370 return 0;
2373 /* Send current packet */
2374 return virtio_net_do_receive(nc, buf, size);
2375 } else if (ret == RSC_NO_MATCH) {
2376 continue;
2377 } else {
2378 /* Coalesced; set the flag so the IPv4 checksum gets recalculated */
2379 seg->is_coalesced = 1;
2380 return size;
2384 chain->stat.no_match_cache++;
2385 virtio_net_rsc_cache_buf(chain, nc, buf, size);
2386 return size;
2389 /* Drain a connection's cached data to avoid out-of-order segments */
2390 static size_t virtio_net_rsc_drain_flow(VirtioNetRscChain *chain,
2391 NetClientState *nc,
2392 const uint8_t *buf, size_t size,
2393 uint16_t ip_start, uint16_t ip_size,
2394 uint16_t tcp_port)
2396 VirtioNetRscSeg *seg, *nseg;
2397 uint32_t ppair1, ppair2;
2399 ppair1 = *(uint32_t *)(buf + tcp_port);
2400 QTAILQ_FOREACH_SAFE(seg, &chain->buffers, next, nseg) {
2401 ppair2 = *(uint32_t *)(seg->buf + tcp_port);
2402 if (memcmp(buf + ip_start, seg->buf + ip_start, ip_size)
2403 || (ppair1 != ppair2)) {
2404 continue;
2406 if (virtio_net_rsc_drain_seg(chain, seg) == 0) {
2407 chain->stat.drain_failed++;
2410 break;
2413 return virtio_net_do_receive(nc, buf, size);
2416 static int32_t virtio_net_rsc_sanity_check4(VirtioNetRscChain *chain,
2417 struct ip_header *ip,
2418 const uint8_t *buf, size_t size)
2420 uint16_t ip_len;
2422 /* Not an ipv4 packet */
2423 if (((ip->ip_ver_len & 0xF0) >> 4) != IP_HEADER_VERSION_4) {
2424 chain->stat.ip_option++;
2425 return RSC_BYPASS;
2428 /* Don't handle packets with ip option */
2429 if ((ip->ip_ver_len & 0xF) != VIRTIO_NET_IP4_HEADER_LENGTH) {
2430 chain->stat.ip_option++;
2431 return RSC_BYPASS;
2434 if (ip->ip_p != IPPROTO_TCP) {
2435 chain->stat.bypass_not_tcp++;
2436 return RSC_BYPASS;
2439 /* Don't handle packets with ip fragment */
2440 if (!(htons(ip->ip_off) & IP_DF)) {
2441 chain->stat.ip_frag++;
2442 return RSC_BYPASS;
2445 /* Don't handle packets with ecn flag */
2446 if (IPTOS_ECN(ip->ip_tos)) {
2447 chain->stat.ip_ecn++;
2448 return RSC_BYPASS;
2451 ip_len = htons(ip->ip_len);
2452 if (ip_len < (sizeof(struct ip_header) + sizeof(struct tcp_header))
2453 || ip_len > (size - chain->n->guest_hdr_len -
2454 sizeof(struct eth_header))) {
2455 chain->stat.ip_hacked++;
2456 return RSC_BYPASS;
2459 return RSC_CANDIDATE;
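/*
 * The magic offset passed to virtio_net_rsc_drain_flow() below is the
 * position of the IPv4 source address: 12 bytes into the IP header
 * (past version/len, tos, total length, id, fragment offset, ttl,
 * protocol and checksum), so saddr+daddr form an 8-byte flow key.
 * The IPv6 path uses offset 8 for the same reason.
 */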
2462 static size_t virtio_net_rsc_receive4(VirtioNetRscChain *chain,
2463 NetClientState *nc,
2464 const uint8_t *buf, size_t size)
2466 int32_t ret;
2467 uint16_t hdr_len;
2468 VirtioNetRscUnit unit;
2470 hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2472 if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header)
2473 + sizeof(struct tcp_header))) {
2474 chain->stat.bypass_not_tcp++;
2475 return virtio_net_do_receive(nc, buf, size);
2478 virtio_net_rsc_extract_unit4(chain, buf, &unit);
2479 if (virtio_net_rsc_sanity_check4(chain, unit.ip, buf, size)
2480 != RSC_CANDIDATE) {
2481 return virtio_net_do_receive(nc, buf, size);
2484 ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2485 if (ret == RSC_BYPASS) {
2486 return virtio_net_do_receive(nc, buf, size);
2487 } else if (ret == RSC_FINAL) {
2488 return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2489 ((hdr_len + sizeof(struct eth_header)) + 12),
2490 VIRTIO_NET_IP4_ADDR_SIZE,
2491 hdr_len + sizeof(struct eth_header) + sizeof(struct ip_header));
2494 return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2497 static int32_t virtio_net_rsc_sanity_check6(VirtioNetRscChain *chain,
2498 struct ip6_header *ip6,
2499 const uint8_t *buf, size_t size)
2501 uint16_t ip_len;
2503 if (((ip6->ip6_ctlun.ip6_un1.ip6_un1_flow & 0xF0) >> 4)
2504 != IP_HEADER_VERSION_6) {
2505 return RSC_BYPASS;
2508 /* Both options and the protocol are checked by this test */
2509 if (ip6->ip6_ctlun.ip6_un1.ip6_un1_nxt != IPPROTO_TCP) {
2510 chain->stat.bypass_not_tcp++;
2511 return RSC_BYPASS;
2514 ip_len = htons(ip6->ip6_ctlun.ip6_un1.ip6_un1_plen);
2515 if (ip_len < sizeof(struct tcp_header) ||
2516 ip_len > (size - chain->n->guest_hdr_len - sizeof(struct eth_header)
2517 - sizeof(struct ip6_header))) {
2518 chain->stat.ip_hacked++;
2519 return RSC_BYPASS;
2522 /* Don't handle packets with ecn flag */
2523 if (IP6_ECN(ip6->ip6_ctlun.ip6_un3.ip6_un3_ecn)) {
2524 chain->stat.ip_ecn++;
2525 return RSC_BYPASS;
2528 return RSC_CANDIDATE;
2531 static size_t virtio_net_rsc_receive6(void *opq, NetClientState *nc,
2532 const uint8_t *buf, size_t size)
2534 int32_t ret;
2535 uint16_t hdr_len;
2536 VirtioNetRscChain *chain;
2537 VirtioNetRscUnit unit;
2539 chain = opq;
2540 hdr_len = ((VirtIONet *)(chain->n))->guest_hdr_len;
2542 if (size < (hdr_len + sizeof(struct eth_header) + sizeof(struct ip6_header)
2543 + sizeof(tcp_header))) {
2544 return virtio_net_do_receive(nc, buf, size);
2547 virtio_net_rsc_extract_unit6(chain, buf, &unit);
2548 if (RSC_CANDIDATE != virtio_net_rsc_sanity_check6(chain,
2549 unit.ip, buf, size)) {
2550 return virtio_net_do_receive(nc, buf, size);
2553 ret = virtio_net_rsc_tcp_ctrl_check(chain, unit.tcp);
2554 if (ret == RSC_BYPASS) {
2555 return virtio_net_do_receive(nc, buf, size);
2556 } else if (ret == RSC_FINAL) {
2557 return virtio_net_rsc_drain_flow(chain, nc, buf, size,
2558 ((hdr_len + sizeof(struct eth_header)) + 8),
2559 VIRTIO_NET_IP6_ADDR_SIZE,
2560 hdr_len + sizeof(struct eth_header)
2561 + sizeof(struct ip6_header));
2564 return virtio_net_rsc_do_coalesce(chain, nc, buf, size, &unit);
2567 static VirtioNetRscChain *virtio_net_rsc_lookup_chain(VirtIONet *n,
2568 NetClientState *nc,
2569 uint16_t proto)
2571 VirtioNetRscChain *chain;
2573 if ((proto != (uint16_t)ETH_P_IP) && (proto != (uint16_t)ETH_P_IPV6)) {
2574 return NULL;
2577 QTAILQ_FOREACH(chain, &n->rsc_chains, next) {
2578 if (chain->proto == proto) {
2579 return chain;
2583 chain = g_malloc(sizeof(*chain));
2584 chain->n = n;
2585 chain->proto = proto;
2586 if (proto == (uint16_t)ETH_P_IP) {
2587 chain->max_payload = VIRTIO_NET_MAX_IP4_PAYLOAD;
2588 chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
2589 } else {
2590 chain->max_payload = VIRTIO_NET_MAX_IP6_PAYLOAD;
2591 chain->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
2593 chain->drain_timer = timer_new_ns(QEMU_CLOCK_HOST,
2594 virtio_net_rsc_purge, chain);
2595 memset(&chain->stat, 0, sizeof(chain->stat));
2597 QTAILQ_INIT(&chain->buffers);
2598 QTAILQ_INSERT_TAIL(&n->rsc_chains, chain, next);
2600 return chain;
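/*
 * Dispatch by EtherType: packets are routed to the IPv4 or IPv6
 * coalescing path only when the corresponding guest RSC feature is
 * enabled, and fall back to the ordinary receive path otherwise.
 */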
2603 static ssize_t virtio_net_rsc_receive(NetClientState *nc,
2604 const uint8_t *buf,
2605 size_t size)
2607 uint16_t proto;
2608 VirtioNetRscChain *chain;
2609 struct eth_header *eth;
2610 VirtIONet *n;
2612 n = qemu_get_nic_opaque(nc);
2613 if (size < (n->host_hdr_len + sizeof(struct eth_header))) {
2614 return virtio_net_do_receive(nc, buf, size);
2617 eth = (struct eth_header *)(buf + n->guest_hdr_len);
2618 proto = htons(eth->h_proto);
2620 chain = virtio_net_rsc_lookup_chain(n, nc, proto);
2621 if (chain) {
2622 chain->stat.received++;
2623 if (proto == (uint16_t)ETH_P_IP && n->rsc4_enabled) {
2624 return virtio_net_rsc_receive4(chain, nc, buf, size);
2625 } else if (proto == (uint16_t)ETH_P_IPV6 && n->rsc6_enabled) {
2626 return virtio_net_rsc_receive6(chain, nc, buf, size);
2629 return virtio_net_do_receive(nc, buf, size);
2632 static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf,
2633 size_t size)
2635 VirtIONet *n = qemu_get_nic_opaque(nc);
2636 if ((n->rsc4_enabled || n->rsc6_enabled)) {
2637 return virtio_net_rsc_receive(nc, buf, size);
2638 } else {
2639 return virtio_net_do_receive(nc, buf, size);
2643 static int32_t virtio_net_flush_tx(VirtIONetQueue *q);
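/*
 * Completion callback for an asynchronous transmit: push the pending
 * element back to the guest, then resume flushing; if the flush again
 * stops at tx_burst, notifications stay disabled and the remainder is
 * rescheduled via the bottom half or timer.
 */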
2645 static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
2647 VirtIONet *n = qemu_get_nic_opaque(nc);
2648 VirtIONetQueue *q = virtio_net_get_subqueue(nc);
2649 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2650 int ret;
2652 virtqueue_push(q->tx_vq, q->async_tx.elem, 0);
2653 virtio_notify(vdev, q->tx_vq);
2655 g_free(q->async_tx.elem);
2656 q->async_tx.elem = NULL;
2658 virtio_queue_set_notification(q->tx_vq, 1);
2659 ret = virtio_net_flush_tx(q);
2660 if (ret >= n->tx_burst) {
2662 * the flush has been stopped by tx_burst
2663 * we will not receive notification for the
2664 * remaining part, so re-schedule
2666 virtio_queue_set_notification(q->tx_vq, 0);
2667 if (q->tx_bh) {
2668 qemu_bh_schedule(q->tx_bh);
2669 } else {
2670 timer_mod(q->tx_timer,
2671 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2673 q->tx_waiting = 1;
2677 /* TX */
2678 static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
2680 VirtIONet *n = q->n;
2681 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2682 VirtQueueElement *elem;
2683 int32_t num_packets = 0;
2684 int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
2685 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2686 return num_packets;
2689 if (q->async_tx.elem) {
2690 virtio_queue_set_notification(q->tx_vq, 0);
2691 return num_packets;
2694 for (;;) {
2695 ssize_t ret;
2696 unsigned int out_num;
2697 struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], *out_sg;
2698 struct virtio_net_hdr_mrg_rxbuf mhdr;
2700 elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
2701 if (!elem) {
2702 break;
2705 out_num = elem->out_num;
2706 out_sg = elem->out_sg;
2707 if (out_num < 1) {
2708 virtio_error(vdev, "virtio-net header not in first element");
2709 virtqueue_detach_element(q->tx_vq, elem, 0);
2710 g_free(elem);
2711 return -EINVAL;
2714 if (n->has_vnet_hdr) {
2715 if (iov_to_buf(out_sg, out_num, 0, &mhdr, n->guest_hdr_len) <
2716 n->guest_hdr_len) {
2717 virtio_error(vdev, "virtio-net header incorrect");
2718 virtqueue_detach_element(q->tx_vq, elem, 0);
2719 g_free(elem);
2720 return -EINVAL;
2722 if (n->needs_vnet_hdr_swap) {
2723 virtio_net_hdr_swap(vdev, (void *) &mhdr);
2724 sg2[0].iov_base = &mhdr;
2725 sg2[0].iov_len = n->guest_hdr_len;
2726 out_num = iov_copy(&sg2[1], ARRAY_SIZE(sg2) - 1,
2727 out_sg, out_num,
2728 n->guest_hdr_len, -1);
2729 if (out_num == VIRTQUEUE_MAX_SIZE) {
2730 goto drop;
2732 out_num += 1;
2733 out_sg = sg2;
2737 * If host wants to see the guest header as is, we can
2738 * pass it on unchanged. Otherwise, copy just the parts
2739 * that host is interested in.
2741 assert(n->host_hdr_len <= n->guest_hdr_len);
2742 if (n->host_hdr_len != n->guest_hdr_len) {
2743 unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
2744 out_sg, out_num,
2745 0, n->host_hdr_len);
2746 sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
2747 out_sg, out_num,
2748 n->guest_hdr_len, -1);
2749 out_num = sg_num;
2750 out_sg = sg;
2753 ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),
2754 out_sg, out_num, virtio_net_tx_complete);
2755 if (ret == 0) {
2756 virtio_queue_set_notification(q->tx_vq, 0);
2757 q->async_tx.elem = elem;
2758 return -EBUSY;
2761 drop:
2762 virtqueue_push(q->tx_vq, elem, 0);
2763 virtio_notify(vdev, q->tx_vq);
2764 g_free(elem);
2766 if (++num_packets >= n->tx_burst) {
2767 break;
2770 return num_packets;
2773 static void virtio_net_tx_timer(void *opaque);
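/*
 * Two TX mitigation schemes exist: a timer that batches packets for
 * tx_timeout ns, and a bottom half that flushes as soon as possible.
 * Which one a queue uses is chosen at queue setup via the "tx"
 * property.
 */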
2775 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
2777 VirtIONet *n = VIRTIO_NET(vdev);
2778 VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2780 if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2781 virtio_net_drop_tx_queue_data(vdev, vq);
2782 return;
2785 /* This happens when device was stopped but VCPU wasn't. */
2786 if (!vdev->vm_running) {
2787 q->tx_waiting = 1;
2788 return;
2791 if (q->tx_waiting) {
2792 /* We already have queued packets, immediately flush */
2793 timer_del(q->tx_timer);
2794 virtio_net_tx_timer(q);
2795 } else {
2796 /* re-arm timer to flush it (and more) on next tick */
2797 timer_mod(q->tx_timer,
2798 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2799 q->tx_waiting = 1;
2800 virtio_queue_set_notification(vq, 0);
2804 static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
2806 VirtIONet *n = VIRTIO_NET(vdev);
2807 VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
2809 if (unlikely((n->status & VIRTIO_NET_S_LINK_UP) == 0)) {
2810 virtio_net_drop_tx_queue_data(vdev, vq);
2811 return;
2814 if (unlikely(q->tx_waiting)) {
2815 return;
2817 q->tx_waiting = 1;
2818 /* This happens when device was stopped but VCPU wasn't. */
2819 if (!vdev->vm_running) {
2820 return;
2822 virtio_queue_set_notification(vq, 0);
2823 qemu_bh_schedule(q->tx_bh);
2826 static void virtio_net_tx_timer(void *opaque)
2828 VirtIONetQueue *q = opaque;
2829 VirtIONet *n = q->n;
2830 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2831 int ret;
2833 /* This happens when device was stopped but BH wasn't. */
2834 if (!vdev->vm_running) {
2835 /* Make sure tx waiting is set, so we'll run when restarted. */
2836 assert(q->tx_waiting);
2837 return;
2840 q->tx_waiting = 0;
2842 /* Just in case the driver is not ready any more */
2843 if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
2844 return;
2847 ret = virtio_net_flush_tx(q);
2848 if (ret == -EBUSY || ret == -EINVAL) {
2849 return;
2852 * If we flush a full burst of packets, assume there are
2853 * more coming and immediately rearm
2855 if (ret >= n->tx_burst) {
2856 q->tx_waiting = 1;
2857 timer_mod(q->tx_timer,
2858 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2859 return;
2862 * If less than a full burst, re-enable notification and flush
2863 * anything that may have come in while we weren't looking. If
2864 * we find something, assume the guest is still active and rearm
2866 virtio_queue_set_notification(q->tx_vq, 1);
2867 ret = virtio_net_flush_tx(q);
2868 if (ret > 0) {
2869 virtio_queue_set_notification(q->tx_vq, 0);
2870 q->tx_waiting = 1;
2871 timer_mod(q->tx_timer,
2872 qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
2876 static void virtio_net_tx_bh(void *opaque)
2878 VirtIONetQueue *q = opaque;
2879 VirtIONet *n = q->n;
2880 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2881 int32_t ret;
2883 /* This happens when device was stopped but BH wasn't. */
2884 if (!vdev->vm_running) {
2885 /* Make sure tx waiting is set, so we'll run when restarted. */
2886 assert(q->tx_waiting);
2887 return;
2890 q->tx_waiting = 0;
2892 /* Just in case the driver is not ready any more */
2893 if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
2894 return;
2897 ret = virtio_net_flush_tx(q);
2898 if (ret == -EBUSY || ret == -EINVAL) {
2899 return; /* Notification re-enable handled by tx_complete or device
2900 * broken */
2903 /* If we flush a full burst of packets, assume there are
2904 * more coming and immediately reschedule */
2905 if (ret >= n->tx_burst) {
2906 qemu_bh_schedule(q->tx_bh);
2907 q->tx_waiting = 1;
2908 return;
2911 /* If less than a full burst, re-enable notification and flush
2912 * anything that may have come in while we weren't looking. If
2913 * we find something, assume the guest is still active and reschedule */
2914 virtio_queue_set_notification(q->tx_vq, 1);
2915 ret = virtio_net_flush_tx(q);
2916 if (ret == -EINVAL) {
2917 return;
2918 } else if (ret > 0) {
2919 virtio_queue_set_notification(q->tx_vq, 0);
2920 qemu_bh_schedule(q->tx_bh);
2921 q->tx_waiting = 1;
2925 static void virtio_net_add_queue(VirtIONet *n, int index)
2927 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2929 n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
2930 virtio_net_handle_rx);
2932 if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
2933 n->vqs[index].tx_vq =
2934 virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2935 virtio_net_handle_tx_timer);
2936 n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
2937 virtio_net_tx_timer,
2938 &n->vqs[index]);
2939 } else {
2940 n->vqs[index].tx_vq =
2941 virtio_add_queue(vdev, n->net_conf.tx_queue_size,
2942 virtio_net_handle_tx_bh);
2943 n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
2944 &DEVICE(vdev)->mem_reentrancy_guard);
2947 n->vqs[index].tx_waiting = 0;
2948 n->vqs[index].n = n;
2951 static void virtio_net_del_queue(VirtIONet *n, int index)
2953 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2954 VirtIONetQueue *q = &n->vqs[index];
2955 NetClientState *nc = qemu_get_subqueue(n->nic, index);
2957 qemu_purge_queued_packets(nc);
2959 virtio_del_queue(vdev, index * 2);
2960 if (q->tx_timer) {
2961 timer_free(q->tx_timer);
2962 q->tx_timer = NULL;
2963 } else {
2964 qemu_bh_delete(q->tx_bh);
2965 q->tx_bh = NULL;
2967 q->tx_waiting = 0;
2968 virtio_del_queue(vdev, index * 2 + 1);
2971 static void virtio_net_change_num_queue_pairs(VirtIONet *n, int new_max_queue_pairs)
2973 VirtIODevice *vdev = VIRTIO_DEVICE(n);
2974 int old_num_queues = virtio_get_num_queues(vdev);
2975 int new_num_queues = new_max_queue_pairs * 2 + 1;
2976 int i;
2978 assert(old_num_queues >= 3);
2979 assert(old_num_queues % 2 == 1);
2981 if (old_num_queues == new_num_queues) {
2982 return;
2986 * We always need to remove and add ctrl vq if
2987 * old_num_queues != new_num_queues. Remove ctrl_vq first,
2988 * and then we only enter one of the following two loops.
2990 virtio_del_queue(vdev, old_num_queues - 1);
2992 for (i = new_num_queues - 1; i < old_num_queues - 1; i += 2) {
2993 /* new_num_queues < old_num_queues */
2994 virtio_net_del_queue(n, i / 2);
2997 for (i = old_num_queues - 1; i < new_num_queues - 1; i += 2) {
2998 /* new_num_queues > old_num_queues */
2999 virtio_net_add_queue(n, i / 2);
3002 /* add ctrl_vq last */
3003 n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3006 static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue)
3008 int max = multiqueue ? n->max_queue_pairs : 1;
3010 n->multiqueue = multiqueue;
3011 virtio_net_change_num_queue_pairs(n, max);
3013 virtio_net_set_queue_pairs(n);
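/*
 * Post-load fixups: re-derive state that is intentionally not
 * migrated, such as the merged-rx-buffer layout, the first multicast
 * MAC entry, per-queue link state and the announce timer.
 */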
3016 static int virtio_net_post_load_device(void *opaque, int version_id)
3018 VirtIONet *n = opaque;
3019 VirtIODevice *vdev = VIRTIO_DEVICE(n);
3020 int i, link_down;
3022 trace_virtio_net_post_load_device();
3023 virtio_net_set_mrg_rx_bufs(n, n->mergeable_rx_bufs,
3024 virtio_vdev_has_feature(vdev,
3025 VIRTIO_F_VERSION_1),
3026 virtio_vdev_has_feature(vdev,
3027 VIRTIO_NET_F_HASH_REPORT));
3029 /* MAC_TABLE_ENTRIES may be different from the saved image */
3030 if (n->mac_table.in_use > MAC_TABLE_ENTRIES) {
3031 n->mac_table.in_use = 0;
3034 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) {
3035 n->curr_guest_offloads = virtio_net_supported_guest_offloads(n);
3039 * curr_guest_offloads will be later overwritten by the
3040 * virtio_set_features_nocheck call done from the virtio_load.
3041 * Here we make sure it is preserved and restored accordingly
3042 * in the virtio_net_post_load_virtio callback.
3044 n->saved_guest_offloads = n->curr_guest_offloads;
3046 virtio_net_set_queue_pairs(n);
3048 /* Find the first multicast entry in the saved MAC filter */
3049 for (i = 0; i < n->mac_table.in_use; i++) {
3050 if (n->mac_table.macs[i * ETH_ALEN] & 1) {
3051 break;
3054 n->mac_table.first_multi = i;
3056 /* nc.link_down can't be migrated, so infer link_down from the
3057 * link status bit in n->status */
3058 link_down = (n->status & VIRTIO_NET_S_LINK_UP) == 0;
3059 for (i = 0; i < n->max_queue_pairs; i++) {
3060 qemu_get_subqueue(n->nic, i)->link_down = link_down;
3063 if (virtio_vdev_has_feature(vdev, VIRTIO_NET_F_GUEST_ANNOUNCE) &&
3064 virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3065 qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3066 QEMU_CLOCK_VIRTUAL,
3067 virtio_net_announce_timer, n);
3068 if (n->announce_timer.round) {
3069 timer_mod(n->announce_timer.tm,
3070 qemu_clock_get_ms(n->announce_timer.type));
3071 } else {
3072 qemu_announce_timer_del(&n->announce_timer, false);
3076 if (n->rss_data.enabled) {
3077 n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
3078 if (!n->rss_data.populate_hash) {
3079 if (!virtio_net_attach_epbf_rss(n)) {
3080 if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
3081 warn_report("Can't post-load eBPF RSS for vhost");
3082 } else {
3083 warn_report("Can't post-load eBPF RSS - "
3084 "fallback to software RSS");
3085 n->rss_data.enabled_software_rss = true;
3090 trace_virtio_net_rss_enable(n->rss_data.hash_types,
3091 n->rss_data.indirections_len,
3092 sizeof(n->rss_data.key));
3093 } else {
3094 trace_virtio_net_rss_disable();
3096 return 0;
3099 static int virtio_net_post_load_virtio(VirtIODevice *vdev)
3101 VirtIONet *n = VIRTIO_NET(vdev);
3103 * The actual needed state is now in saved_guest_offloads,
3104 * see virtio_net_post_load_device for detail.
3105 * Restore it back and apply the desired offloads.
3107 n->curr_guest_offloads = n->saved_guest_offloads;
3108 if (peer_has_vnet_hdr(n)) {
3109 virtio_net_apply_guest_offloads(n);
3112 return 0;
3115 /* tx_waiting field of a VirtIONetQueue */
3116 static const VMStateDescription vmstate_virtio_net_queue_tx_waiting = {
3117 .name = "virtio-net-queue-tx_waiting",
3118 .fields = (VMStateField[]) {
3119 VMSTATE_UINT32(tx_waiting, VirtIONetQueue),
3120 VMSTATE_END_OF_LIST()
3124 static bool max_queue_pairs_gt_1(void *opaque, int version_id)
3126 return VIRTIO_NET(opaque)->max_queue_pairs > 1;
3129 static bool has_ctrl_guest_offloads(void *opaque, int version_id)
3131 return virtio_vdev_has_feature(VIRTIO_DEVICE(opaque),
3132 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
3135 static bool mac_table_fits(void *opaque, int version_id)
3137 return VIRTIO_NET(opaque)->mac_table.in_use <= MAC_TABLE_ENTRIES;
3140 static bool mac_table_doesnt_fit(void *opaque, int version_id)
3142 return !mac_table_fits(opaque, version_id);
3145 /* This temporary type is shared by all the WITH_TMP methods
3146 * although only some fields are used by each.
3148 struct VirtIONetMigTmp {
3149 VirtIONet *parent;
3150 VirtIONetQueue *vqs_1;
3151 uint16_t curr_queue_pairs_1;
3152 uint8_t has_ufo;
3153 uint32_t has_vnet_hdr;
3156 /* The 2nd and subsequent tx_waiting flags are loaded later than
3157 * the 1st entry in the queue_pairs and only if there's more than one
3158 * entry. We use the tmp mechanism to calculate a temporary
3159 * pointer and count and also validate the count.
3162 static int virtio_net_tx_waiting_pre_save(void *opaque)
3164 struct VirtIONetMigTmp *tmp = opaque;
3166 tmp->vqs_1 = tmp->parent->vqs + 1;
3167 tmp->curr_queue_pairs_1 = tmp->parent->curr_queue_pairs - 1;
3168 if (tmp->parent->curr_queue_pairs == 0) {
3169 tmp->curr_queue_pairs_1 = 0;
3172 return 0;
3175 static int virtio_net_tx_waiting_pre_load(void *opaque)
3177 struct VirtIONetMigTmp *tmp = opaque;
3179 /* Reuse the pointer setup from save */
3180 virtio_net_tx_waiting_pre_save(opaque);
3182 if (tmp->parent->curr_queue_pairs > tmp->parent->max_queue_pairs) {
3183 error_report("virtio-net: curr_queue_pairs %x > max_queue_pairs %x",
3184 tmp->parent->curr_queue_pairs, tmp->parent->max_queue_pairs);
3186 return -EINVAL;
3189 return 0; /* all good */
3192 static const VMStateDescription vmstate_virtio_net_tx_waiting = {
3193 .name = "virtio-net-tx_waiting",
3194 .pre_load = virtio_net_tx_waiting_pre_load,
3195 .pre_save = virtio_net_tx_waiting_pre_save,
3196 .fields = (VMStateField[]) {
3197 VMSTATE_STRUCT_VARRAY_POINTER_UINT16(vqs_1, struct VirtIONetMigTmp,
3198 curr_queue_pairs_1,
3199 vmstate_virtio_net_queue_tx_waiting,
3200 struct VirtIONetQueue),
3201 VMSTATE_END_OF_LIST()
3205 /* the 'has_ufo' flag is just tested; if the incoming stream has the
3206 * flag set we need to check that we have it
3208 static int virtio_net_ufo_post_load(void *opaque, int version_id)
3210 struct VirtIONetMigTmp *tmp = opaque;
3212 if (tmp->has_ufo && !peer_has_ufo(tmp->parent)) {
3213 error_report("virtio-net: saved image requires TUN_F_UFO support");
3214 return -EINVAL;
3217 return 0;
3220 static int virtio_net_ufo_pre_save(void *opaque)
3222 struct VirtIONetMigTmp *tmp = opaque;
3224 tmp->has_ufo = tmp->parent->has_ufo;
3226 return 0;
3229 static const VMStateDescription vmstate_virtio_net_has_ufo = {
3230 .name = "virtio-net-ufo",
3231 .post_load = virtio_net_ufo_post_load,
3232 .pre_save = virtio_net_ufo_pre_save,
3233 .fields = (VMStateField[]) {
3234 VMSTATE_UINT8(has_ufo, struct VirtIONetMigTmp),
3235 VMSTATE_END_OF_LIST()
3239 /* the 'has_vnet_hdr' flag is just tested; if the incoming stream has the
3240 * flag set we need to check that we have it
3242 static int virtio_net_vnet_post_load(void *opaque, int version_id)
3244 struct VirtIONetMigTmp *tmp = opaque;
3246 if (tmp->has_vnet_hdr && !peer_has_vnet_hdr(tmp->parent)) {
3247 error_report("virtio-net: saved image requires vnet_hdr=on");
3248 return -EINVAL;
3251 return 0;
3254 static int virtio_net_vnet_pre_save(void *opaque)
3256 struct VirtIONetMigTmp *tmp = opaque;
3258 tmp->has_vnet_hdr = tmp->parent->has_vnet_hdr;
3260 return 0;
3263 static const VMStateDescription vmstate_virtio_net_has_vnet = {
3264 .name = "virtio-net-vnet",
3265 .post_load = virtio_net_vnet_post_load,
3266 .pre_save = virtio_net_vnet_pre_save,
3267 .fields = (VMStateField[]) {
3268 VMSTATE_UINT32(has_vnet_hdr, struct VirtIONetMigTmp),
3269 VMSTATE_END_OF_LIST()
3273 static bool virtio_net_rss_needed(void *opaque)
3275 return VIRTIO_NET(opaque)->rss_data.enabled;
3278 static const VMStateDescription vmstate_virtio_net_rss = {
3279 .name = "virtio-net-device/rss",
3280 .version_id = 1,
3281 .minimum_version_id = 1,
3282 .needed = virtio_net_rss_needed,
3283 .fields = (VMStateField[]) {
3284 VMSTATE_BOOL(rss_data.enabled, VirtIONet),
3285 VMSTATE_BOOL(rss_data.redirect, VirtIONet),
3286 VMSTATE_BOOL(rss_data.populate_hash, VirtIONet),
3287 VMSTATE_UINT32(rss_data.hash_types, VirtIONet),
3288 VMSTATE_UINT16(rss_data.indirections_len, VirtIONet),
3289 VMSTATE_UINT16(rss_data.default_queue, VirtIONet),
3290 VMSTATE_UINT8_ARRAY(rss_data.key, VirtIONet,
3291 VIRTIO_NET_RSS_MAX_KEY_SIZE),
3292 VMSTATE_VARRAY_UINT16_ALLOC(rss_data.indirections_table, VirtIONet,
3293 rss_data.indirections_len, 0,
3294 vmstate_info_uint16, uint16_t),
3295 VMSTATE_END_OF_LIST()
3299 static const VMStateDescription vmstate_virtio_net_device = {
3300 .name = "virtio-net-device",
3301 .version_id = VIRTIO_NET_VM_VERSION,
3302 .minimum_version_id = VIRTIO_NET_VM_VERSION,
3303 .post_load = virtio_net_post_load_device,
3304 .fields = (VMStateField[]) {
3305 VMSTATE_UINT8_ARRAY(mac, VirtIONet, ETH_ALEN),
3306 VMSTATE_STRUCT_POINTER(vqs, VirtIONet,
3307 vmstate_virtio_net_queue_tx_waiting,
3308 VirtIONetQueue),
3309 VMSTATE_UINT32(mergeable_rx_bufs, VirtIONet),
3310 VMSTATE_UINT16(status, VirtIONet),
3311 VMSTATE_UINT8(promisc, VirtIONet),
3312 VMSTATE_UINT8(allmulti, VirtIONet),
3313 VMSTATE_UINT32(mac_table.in_use, VirtIONet),
3315 /* Guarded pair: If it fits we load it, else we throw it away
3316 * - can happen if source has a larger MAC table; post-load
3317 * sets flags in this case.
3319 VMSTATE_VBUFFER_MULTIPLY(mac_table.macs, VirtIONet,
3320 0, mac_table_fits, mac_table.in_use,
3321 ETH_ALEN),
3322 VMSTATE_UNUSED_VARRAY_UINT32(VirtIONet, mac_table_doesnt_fit, 0,
3323 mac_table.in_use, ETH_ALEN),
3325 /* Note: This is an array of uint32's that's always been saved as a
3326 * buffer; hold onto your endiannesses; it's actually used as a bitmap
3327 * but based on the uint.
3329 VMSTATE_BUFFER_POINTER_UNSAFE(vlans, VirtIONet, 0, MAX_VLAN >> 3),
3330 VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3331 vmstate_virtio_net_has_vnet),
3332 VMSTATE_UINT8(mac_table.multi_overflow, VirtIONet),
3333 VMSTATE_UINT8(mac_table.uni_overflow, VirtIONet),
3334 VMSTATE_UINT8(alluni, VirtIONet),
3335 VMSTATE_UINT8(nomulti, VirtIONet),
3336 VMSTATE_UINT8(nouni, VirtIONet),
3337 VMSTATE_UINT8(nobcast, VirtIONet),
3338 VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3339 vmstate_virtio_net_has_ufo),
3340 VMSTATE_SINGLE_TEST(max_queue_pairs, VirtIONet, max_queue_pairs_gt_1, 0,
3341 vmstate_info_uint16_equal, uint16_t),
3342 VMSTATE_UINT16_TEST(curr_queue_pairs, VirtIONet, max_queue_pairs_gt_1),
3343 VMSTATE_WITH_TMP(VirtIONet, struct VirtIONetMigTmp,
3344 vmstate_virtio_net_tx_waiting),
3345 VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
3346 has_ctrl_guest_offloads),
3347 VMSTATE_END_OF_LIST()
3349 .subsections = (const VMStateDescription * []) {
3350 &vmstate_virtio_net_rss,
3351 NULL
3355 static NetClientInfo net_virtio_info = {
3356 .type = NET_CLIENT_DRIVER_NIC,
3357 .size = sizeof(NICState),
3358 .can_receive = virtio_net_can_receive,
3359 .receive = virtio_net_receive,
3360 .link_status_changed = virtio_net_set_link_status,
3361 .query_rx_filter = virtio_net_query_rxfilter,
3362 .announce = virtio_net_announce,
3365 static bool virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
3367 VirtIONet *n = VIRTIO_NET(vdev);
3368 NetClientState *nc;
3369 assert(n->vhost_started);
3370 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3371 /* Must guard against invalid features and a bogus queue index
3372 * being set by a malicious guest, or slipping in through a buggy
3373 * migration stream.
3375 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3376 qemu_log_mask(LOG_GUEST_ERROR,
3377 "%s: bogus vq index ignored\n", __func__);
3378 return false;
3380 nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3381 } else {
3382 nc = qemu_get_subqueue(n->nic, vq2q(idx));
3385 * Check for the configure interrupt: VIRTIO_CONFIG_IRQ_IDX (-1) is
3386 * used as the configure interrupt's index. If the driver does not
3387 * support it, the function returns false
3390 if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3391 return vhost_net_config_pending(get_vhost_net(nc->peer));
3393 return vhost_net_virtqueue_pending(get_vhost_net(nc->peer), idx);
3396 static void virtio_net_guest_notifier_mask(VirtIODevice *vdev, int idx,
3397 bool mask)
3399 VirtIONet *n = VIRTIO_NET(vdev);
3400 NetClientState *nc;
3401 assert(n->vhost_started);
3402 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_MQ) && idx == 2) {
3403 /* Must guard against invalid features and a bogus queue index
3404 * being set by a malicious guest, or slipping in through a buggy
3405 * migration stream.
3407 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_CTRL_VQ)) {
3408 qemu_log_mask(LOG_GUEST_ERROR,
3409 "%s: bogus vq index ignored\n", __func__);
3410 return;
3412 nc = qemu_get_subqueue(n->nic, n->max_queue_pairs);
3413 } else {
3414 nc = qemu_get_subqueue(n->nic, vq2q(idx));
3417 * Check for the configure interrupt: VIRTIO_CONFIG_IRQ_IDX (-1) is
3418 * used as the configure interrupt's index. If the driver does not
3419 * support it, the function returns
3422 if (idx == VIRTIO_CONFIG_IRQ_IDX) {
3423 vhost_net_config_mask(get_vhost_net(nc->peer), vdev, mask);
3424 return;
3426 vhost_net_virtqueue_mask(get_vhost_net(nc->peer), vdev, idx, mask);
3429 static void virtio_net_set_config_size(VirtIONet *n, uint64_t host_features)
3431 virtio_add_feature(&host_features, VIRTIO_NET_F_MAC);
3433 n->config_size = virtio_get_config_size(&cfg_size_params, host_features);
3436 void virtio_net_set_netclient_name(VirtIONet *n, const char *name,
3437 const char *type)
3440 * The name can be NULL; in that case the netclient name will be type.x.
3442 assert(type != NULL);
3444 g_free(n->netclient_name);
3445 g_free(n->netclient_type);
3446 n->netclient_name = g_strdup(name);
3447 n->netclient_type = g_strdup(type);
3450 static bool failover_unplug_primary(VirtIONet *n, DeviceState *dev)
3452 HotplugHandler *hotplug_ctrl;
3453 PCIDevice *pci_dev;
3454 Error *err = NULL;
3456 hotplug_ctrl = qdev_get_hotplug_handler(dev);
3457 if (hotplug_ctrl) {
3458 pci_dev = PCI_DEVICE(dev);
3459 pci_dev->partially_hotplugged = true;
3460 hotplug_handler_unplug_request(hotplug_ctrl, dev, &err);
3461 if (err) {
3462 error_report_err(err);
3463 return false;
3465 } else {
3466 return false;
3468 return true;
3471 static bool failover_replug_primary(VirtIONet *n, DeviceState *dev,
3472 Error **errp)
3474 Error *err = NULL;
3475 HotplugHandler *hotplug_ctrl;
3476 PCIDevice *pdev = PCI_DEVICE(dev);
3477 BusState *primary_bus;
3479 if (!pdev->partially_hotplugged) {
3480 return true;
3482 primary_bus = dev->parent_bus;
3483 if (!primary_bus) {
3484 error_setg(errp, "virtio_net: couldn't find primary bus");
3485 return false;
3487 qdev_set_parent_bus(dev, primary_bus, &error_abort);
3488 qatomic_set(&n->failover_primary_hidden, false);
3489 hotplug_ctrl = qdev_get_hotplug_handler(dev);
3490 if (hotplug_ctrl) {
3491 hotplug_handler_pre_plug(hotplug_ctrl, dev, &err);
3492 if (err) {
3493 goto out;
3495 hotplug_handler_plug(hotplug_ctrl, dev, &err);
3497 pdev->partially_hotplugged = false;
3499 out:
3500 error_propagate(errp, err);
3501 return !err;
3504 static void virtio_net_handle_migration_primary(VirtIONet *n, MigrationState *s)
3506 bool should_be_hidden;
3507 Error *err = NULL;
3508 DeviceState *dev = failover_find_primary_device(n);
3510 if (!dev) {
3511 return;
3514 should_be_hidden = qatomic_read(&n->failover_primary_hidden);
3516 if (migration_in_setup(s) && !should_be_hidden) {
3517 if (failover_unplug_primary(n, dev)) {
3518 vmstate_unregister(VMSTATE_IF(dev), qdev_get_vmsd(dev), dev);
3519 qapi_event_send_unplug_primary(dev->id);
3520 qatomic_set(&n->failover_primary_hidden, true);
3521 } else {
3522 warn_report("couldn't unplug primary device");
3524 } else if (migration_has_failed(s)) {
3525 /* We already unplugged the device; let's plug it back */
3526 if (!failover_replug_primary(n, dev, &err)) {
3527 if (err) {
3528 error_report_err(err);
3534 static void virtio_net_migration_state_notifier(Notifier *notifier, void *data)
3536 MigrationState *s = data;
3537 VirtIONet *n = container_of(notifier, VirtIONet, migration_state);
3538 virtio_net_handle_migration_primary(n, s);
3541 static bool failover_hide_primary_device(DeviceListener *listener,
3542 const QDict *device_opts,
3543 bool from_json,
3544 Error **errp)
3546 VirtIONet *n = container_of(listener, VirtIONet, primary_listener);
3547 const char *standby_id;
3549 if (!device_opts) {
3550 return false;
3553 if (!qdict_haskey(device_opts, "failover_pair_id")) {
3554 return false;
3557 if (!qdict_haskey(device_opts, "id")) {
3558 error_setg(errp, "Device with failover_pair_id needs to have id");
3559 return false;
3562 standby_id = qdict_get_str(device_opts, "failover_pair_id");
3563 if (g_strcmp0(standby_id, n->netclient_name) != 0) {
3564 return false;
3568 * The hide helper can be called several times for a given device.
3569 * Check that there is only one primary for a virtio-net device, but
3570 * don't duplicate the qdict several times if it's called for the same
3571 * device.
3573 if (n->primary_opts) {
3574 const char *old, *new;
3575 /* devices with failover_pair_id always have an id */
3576 old = qdict_get_str(n->primary_opts, "id");
3577 new = qdict_get_str(device_opts, "id");
3578 if (strcmp(old, new) != 0) {
3579 error_setg(errp, "Cannot attach more than one primary device to "
3580 "'%s': '%s' and '%s'", n->netclient_name, old, new);
3581 return false;
3583 } else {
3584 n->primary_opts = qdict_clone_shallow(device_opts);
3585 n->primary_opts_from_json = from_json;
3588 /* failover_primary_hidden is set during feature negotiation */
3589 return qatomic_read(&n->failover_primary_hidden);
3592 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
3594 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3595 VirtIONet *n = VIRTIO_NET(dev);
3596 NetClientState *nc;
3597 int i;
3599 if (n->net_conf.mtu) {
3600 n->host_features |= (1ULL << VIRTIO_NET_F_MTU);
3603 if (n->net_conf.duplex_str) {
3604 if (strncmp(n->net_conf.duplex_str, "half", 5) == 0) {
3605 n->net_conf.duplex = DUPLEX_HALF;
3606 } else if (strncmp(n->net_conf.duplex_str, "full", 5) == 0) {
3607 n->net_conf.duplex = DUPLEX_FULL;
3608 } else {
3609 error_setg(errp, "'duplex' must be 'half' or 'full'");
3610 return;
3612 n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3613 } else {
3614 n->net_conf.duplex = DUPLEX_UNKNOWN;
3617 if (n->net_conf.speed < SPEED_UNKNOWN) {
3618 error_setg(errp, "'speed' must be between 0 and INT_MAX");
3619 return;
3621 if (n->net_conf.speed >= 0) {
3622 n->host_features |= (1ULL << VIRTIO_NET_F_SPEED_DUPLEX);
3625 if (n->failover) {
3626 n->primary_listener.hide_device = failover_hide_primary_device;
3627 qatomic_set(&n->failover_primary_hidden, true);
3628 device_listener_register(&n->primary_listener);
3629 n->migration_state.notify = virtio_net_migration_state_notifier;
3630 add_migration_state_change_notifier(&n->migration_state);
3631 n->host_features |= (1ULL << VIRTIO_NET_F_STANDBY);
3634 virtio_net_set_config_size(n, n->host_features);
3635 virtio_init(vdev, VIRTIO_ID_NET, n->config_size);
3638 * We set a lower limit on RX queue size to what it always was.
3639 * Guests that want a smaller ring can always resize it without
3640 * help from us (using virtio 1 and up).
3642 if (n->net_conf.rx_queue_size < VIRTIO_NET_RX_QUEUE_MIN_SIZE ||
3643 n->net_conf.rx_queue_size > VIRTQUEUE_MAX_SIZE ||
3644 !is_power_of_2(n->net_conf.rx_queue_size)) {
3645 error_setg(errp, "Invalid rx_queue_size (= %" PRIu16 "), "
3646 "must be a power of 2 between %d and %d.",
3647 n->net_conf.rx_queue_size, VIRTIO_NET_RX_QUEUE_MIN_SIZE,
3648 VIRTQUEUE_MAX_SIZE);
3649 virtio_cleanup(vdev);
3650 return;
3653 if (n->net_conf.tx_queue_size < VIRTIO_NET_TX_QUEUE_MIN_SIZE ||
3654 n->net_conf.tx_queue_size > virtio_net_max_tx_queue_size(n) ||
3655 !is_power_of_2(n->net_conf.tx_queue_size)) {
3656 error_setg(errp, "Invalid tx_queue_size (= %" PRIu16 "), "
3657 "must be a power of 2 between %d and %d",
3658 n->net_conf.tx_queue_size, VIRTIO_NET_TX_QUEUE_MIN_SIZE,
3659 virtio_net_max_tx_queue_size(n));
3660 virtio_cleanup(vdev);
3661 return;
3664 n->max_ncs = MAX(n->nic_conf.peers.queues, 1);
3667 * Figure out the number of datapath queue pairs, since the backend
3668 * could provide a control queue via peers as well.
3670 if (n->nic_conf.peers.queues) {
3671 for (i = 0; i < n->max_ncs; i++) {
3672 if (n->nic_conf.peers.ncs[i]->is_datapath) {
3673 ++n->max_queue_pairs;
3677 n->max_queue_pairs = MAX(n->max_queue_pairs, 1);
3679 if (n->max_queue_pairs * 2 + 1 > VIRTIO_QUEUE_MAX) {
3680 error_setg(errp, "Invalid number of queue pairs (= %" PRIu32 "), "
3681 "must be a positive integer less than %d.",
3682 n->max_queue_pairs, (VIRTIO_QUEUE_MAX - 1) / 2);
3683 virtio_cleanup(vdev);
3684 return;
3686 n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
3687 n->curr_queue_pairs = 1;
3688 n->tx_timeout = n->net_conf.txtimer;
3690 if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
3691 && strcmp(n->net_conf.tx, "bh")) {
3692 warn_report("virtio-net: "
3693 "Unknown option tx=%s, valid options: \"timer\" \"bh\"",
3694 n->net_conf.tx);
3695 error_printf("Defaulting to \"bh\"");
3698 n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
3699 n->net_conf.tx_queue_size);
3701 for (i = 0; i < n->max_queue_pairs; i++) {
3702 virtio_net_add_queue(n, i);
3705 n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
3706 qemu_macaddr_default_if_unset(&n->nic_conf.macaddr);
3707 memcpy(&n->mac[0], &n->nic_conf.macaddr, sizeof(n->mac));
3708 n->status = VIRTIO_NET_S_LINK_UP;
3709 qemu_announce_timer_reset(&n->announce_timer, migrate_announce_params(),
3710 QEMU_CLOCK_VIRTUAL,
3711 virtio_net_announce_timer, n);
3712 n->announce_timer.round = 0;
3714 if (n->netclient_type) {
3716 * This happens when virtio_net_set_netclient_name has been called.
3718 n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3719 n->netclient_type, n->netclient_name, n);
3720 } else {
3721 n->nic = qemu_new_nic(&net_virtio_info, &n->nic_conf,
3722 object_get_typename(OBJECT(dev)), dev->id, n);
3725 for (i = 0; i < n->max_queue_pairs; i++) {
3726 n->nic->ncs[i].do_not_pad = true;
3729 peer_test_vnet_hdr(n);
3730 if (peer_has_vnet_hdr(n)) {
3731 for (i = 0; i < n->max_queue_pairs; i++) {
3732 qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
3734 n->host_hdr_len = sizeof(struct virtio_net_hdr);
3735 } else {
3736 n->host_hdr_len = 0;
3739 qemu_format_nic_info_str(qemu_get_queue(n->nic), n->nic_conf.macaddr.a);
3741 n->vqs[0].tx_waiting = 0;
3742 n->tx_burst = n->net_conf.txburst;
3743 virtio_net_set_mrg_rx_bufs(n, 0, 0, 0);
3744 n->promisc = 1; /* for compatibility */
3746 n->mac_table.macs = g_malloc0(MAC_TABLE_ENTRIES * ETH_ALEN);
3748 n->vlans = g_malloc0(MAX_VLAN >> 3);
3750 nc = qemu_get_queue(n->nic);
3751 nc->rxfilter_notify_enabled = 1;
3753 if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_VHOST_VDPA) {
3754 struct virtio_net_config netcfg = {};
3755 memcpy(&netcfg.mac, &n->nic_conf.macaddr, ETH_ALEN);
3756 vhost_net_set_config(get_vhost_net(nc->peer),
3757 (uint8_t *)&netcfg, 0, ETH_ALEN, VHOST_SET_CONFIG_TYPE_FRONTEND);
3759 QTAILQ_INIT(&n->rsc_chains);
3760 n->qdev = dev;
3762 net_rx_pkt_init(&n->rx_pkt);
3764 if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3765 virtio_net_load_ebpf(n);
3769 static void virtio_net_device_unrealize(DeviceState *dev)
3771 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3772 VirtIONet *n = VIRTIO_NET(dev);
3773 int i, max_queue_pairs;
3775 if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
3776 virtio_net_unload_ebpf(n);
3779 /* This will stop vhost backend if appropriate. */
3780 virtio_net_set_status(vdev, 0);
3782 g_free(n->netclient_name);
3783 n->netclient_name = NULL;
3784 g_free(n->netclient_type);
3785 n->netclient_type = NULL;
3787 g_free(n->mac_table.macs);
3788 g_free(n->vlans);
3790 if (n->failover) {
3791 qobject_unref(n->primary_opts);
3792 device_listener_unregister(&n->primary_listener);
3793 remove_migration_state_change_notifier(&n->migration_state);
3794 } else {
3795 assert(n->primary_opts == NULL);
3798 max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
3799 for (i = 0; i < max_queue_pairs; i++) {
3800 virtio_net_del_queue(n, i);
3802 /* also delete the control vq */
3803 virtio_del_queue(vdev, max_queue_pairs * 2);
3804 qemu_announce_timer_del(&n->announce_timer, false);
3805 g_free(n->vqs);
3806 qemu_del_nic(n->nic);
3807 virtio_net_rsc_cleanup(n);
3808 g_free(n->rss_data.indirections_table);
3809 net_rx_pkt_uninit(n->rx_pkt);
3810 virtio_cleanup(vdev);
3813 static void virtio_net_instance_init(Object *obj)
3815 VirtIONet *n = VIRTIO_NET(obj);
3818 * The default config_size is sizeof(struct virtio_net_config).
3819 * Can be overridden with virtio_net_set_config_size.
3821 n->config_size = sizeof(struct virtio_net_config);
3822 device_add_bootindex_property(obj, &n->nic_conf.bootindex,
3823 "bootindex", "/ethernet-phy@0",
3824 DEVICE(n));
3826 ebpf_rss_init(&n->ebpf_rss);
3829 static int virtio_net_pre_save(void *opaque)
3831 VirtIONet *n = opaque;
3833 /* At this point, backend must be stopped, otherwise
3834 * it might keep writing to memory. */
3835 assert(!n->vhost_started);
3837 return 0;
3840 static bool primary_unplug_pending(void *opaque)
3842 DeviceState *dev = opaque;
3843 DeviceState *primary;
3844 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
3845 VirtIONet *n = VIRTIO_NET(vdev);
3847 if (!virtio_vdev_has_feature(vdev, VIRTIO_NET_F_STANDBY)) {
3848 return false;
3850 primary = failover_find_primary_device(n);
3851 return primary ? primary->pending_deleted_event : false;
3854 static bool dev_unplug_pending(void *opaque)
3856 DeviceState *dev = opaque;
3857 VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(dev);
3859 return vdc->primary_unplug_pending(dev);
3862 static struct vhost_dev *virtio_net_get_vhost(VirtIODevice *vdev)
3864 VirtIONet *n = VIRTIO_NET(vdev);
3865 NetClientState *nc = qemu_get_queue(n->nic);
3866 struct vhost_net *net = get_vhost_net(nc->peer);
3867 return &net->dev;
3870 static const VMStateDescription vmstate_virtio_net = {
3871 .name = "virtio-net",
3872 .minimum_version_id = VIRTIO_NET_VM_VERSION,
3873 .version_id = VIRTIO_NET_VM_VERSION,
3874 .fields = (VMStateField[]) {
3875 VMSTATE_VIRTIO_DEVICE,
3876 VMSTATE_END_OF_LIST()
3878 .pre_save = virtio_net_pre_save,
3879 .dev_unplug_pending = dev_unplug_pending,
3882 static Property virtio_net_properties[] = {
3883 DEFINE_PROP_BIT64("csum", VirtIONet, host_features,
3884 VIRTIO_NET_F_CSUM, true),
3885 DEFINE_PROP_BIT64("guest_csum", VirtIONet, host_features,
3886 VIRTIO_NET_F_GUEST_CSUM, true),
3887 DEFINE_PROP_BIT64("gso", VirtIONet, host_features, VIRTIO_NET_F_GSO, true),
3888 DEFINE_PROP_BIT64("guest_tso4", VirtIONet, host_features,
3889 VIRTIO_NET_F_GUEST_TSO4, true),
3890 DEFINE_PROP_BIT64("guest_tso6", VirtIONet, host_features,
3891 VIRTIO_NET_F_GUEST_TSO6, true),
3892 DEFINE_PROP_BIT64("guest_ecn", VirtIONet, host_features,
3893 VIRTIO_NET_F_GUEST_ECN, true),
3894 DEFINE_PROP_BIT64("guest_ufo", VirtIONet, host_features,
3895 VIRTIO_NET_F_GUEST_UFO, true),
3896 DEFINE_PROP_BIT64("guest_announce", VirtIONet, host_features,
3897 VIRTIO_NET_F_GUEST_ANNOUNCE, true),
3898 DEFINE_PROP_BIT64("host_tso4", VirtIONet, host_features,
3899 VIRTIO_NET_F_HOST_TSO4, true),
3900 DEFINE_PROP_BIT64("host_tso6", VirtIONet, host_features,
3901 VIRTIO_NET_F_HOST_TSO6, true),
3902 DEFINE_PROP_BIT64("host_ecn", VirtIONet, host_features,
3903 VIRTIO_NET_F_HOST_ECN, true),
3904 DEFINE_PROP_BIT64("host_ufo", VirtIONet, host_features,
3905 VIRTIO_NET_F_HOST_UFO, true),
3906 DEFINE_PROP_BIT64("mrg_rxbuf", VirtIONet, host_features,
3907 VIRTIO_NET_F_MRG_RXBUF, true),
3908 DEFINE_PROP_BIT64("status", VirtIONet, host_features,
3909 VIRTIO_NET_F_STATUS, true),
3910 DEFINE_PROP_BIT64("ctrl_vq", VirtIONet, host_features,
3911 VIRTIO_NET_F_CTRL_VQ, true),
3912 DEFINE_PROP_BIT64("ctrl_rx", VirtIONet, host_features,
3913 VIRTIO_NET_F_CTRL_RX, true),
3914 DEFINE_PROP_BIT64("ctrl_vlan", VirtIONet, host_features,
3915 VIRTIO_NET_F_CTRL_VLAN, true),
3916 DEFINE_PROP_BIT64("ctrl_rx_extra", VirtIONet, host_features,
3917 VIRTIO_NET_F_CTRL_RX_EXTRA, true),
3918 DEFINE_PROP_BIT64("ctrl_mac_addr", VirtIONet, host_features,
3919 VIRTIO_NET_F_CTRL_MAC_ADDR, true),
3920 DEFINE_PROP_BIT64("ctrl_guest_offloads", VirtIONet, host_features,
3921 VIRTIO_NET_F_CTRL_GUEST_OFFLOADS, true),
3922 DEFINE_PROP_BIT64("mq", VirtIONet, host_features, VIRTIO_NET_F_MQ, false),
3923 DEFINE_PROP_BIT64("rss", VirtIONet, host_features,
3924 VIRTIO_NET_F_RSS, false),
3925 DEFINE_PROP_BIT64("hash", VirtIONet, host_features,
3926 VIRTIO_NET_F_HASH_REPORT, false),
3927 DEFINE_PROP_BIT64("guest_rsc_ext", VirtIONet, host_features,
3928 VIRTIO_NET_F_RSC_EXT, false),
3929 DEFINE_PROP_UINT32("rsc_interval", VirtIONet, rsc_timeout,
3930 VIRTIO_NET_RSC_DEFAULT_INTERVAL),
3931 DEFINE_NIC_PROPERTIES(VirtIONet, nic_conf),
3932 DEFINE_PROP_UINT32("x-txtimer", VirtIONet, net_conf.txtimer,
3933 TX_TIMER_INTERVAL),
3934 DEFINE_PROP_INT32("x-txburst", VirtIONet, net_conf.txburst, TX_BURST),
3935 DEFINE_PROP_STRING("tx", VirtIONet, net_conf.tx),
3936 DEFINE_PROP_UINT16("rx_queue_size", VirtIONet, net_conf.rx_queue_size,
3937 VIRTIO_NET_RX_QUEUE_DEFAULT_SIZE),
3938 DEFINE_PROP_UINT16("tx_queue_size", VirtIONet, net_conf.tx_queue_size,
3939 VIRTIO_NET_TX_QUEUE_DEFAULT_SIZE),
3940 DEFINE_PROP_UINT16("host_mtu", VirtIONet, net_conf.mtu, 0),
3941 DEFINE_PROP_BOOL("x-mtu-bypass-backend", VirtIONet, mtu_bypass_backend,
3942 true),
3943 DEFINE_PROP_INT32("speed", VirtIONet, net_conf.speed, SPEED_UNKNOWN),
3944 DEFINE_PROP_STRING("duplex", VirtIONet, net_conf.duplex_str),
3945 DEFINE_PROP_BOOL("failover", VirtIONet, failover, false),
3946 DEFINE_PROP_BIT64("guest_uso4", VirtIONet, host_features,
3947 VIRTIO_NET_F_GUEST_USO4, true),
3948 DEFINE_PROP_BIT64("guest_uso6", VirtIONet, host_features,
3949 VIRTIO_NET_F_GUEST_USO6, true),
3950 DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
3951 VIRTIO_NET_F_HOST_USO, true),
3952 DEFINE_PROP_END_OF_LIST(),
3955 static void virtio_net_class_init(ObjectClass *klass, void *data)
3957 DeviceClass *dc = DEVICE_CLASS(klass);
3958 VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
3960 device_class_set_props(dc, virtio_net_properties);
3961 dc->vmsd = &vmstate_virtio_net;
3962 set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
3963 vdc->realize = virtio_net_device_realize;
3964 vdc->unrealize = virtio_net_device_unrealize;
3965 vdc->get_config = virtio_net_get_config;
3966 vdc->set_config = virtio_net_set_config;
3967 vdc->get_features = virtio_net_get_features;
3968 vdc->set_features = virtio_net_set_features;
3969 vdc->bad_features = virtio_net_bad_features;
3970 vdc->reset = virtio_net_reset;
3971 vdc->queue_reset = virtio_net_queue_reset;
3972 vdc->queue_enable = virtio_net_queue_enable;
3973 vdc->set_status = virtio_net_set_status;
3974 vdc->guest_notifier_mask = virtio_net_guest_notifier_mask;
3975 vdc->guest_notifier_pending = virtio_net_guest_notifier_pending;
3976 vdc->legacy_features |= (0x1 << VIRTIO_NET_F_GSO);
3977 vdc->post_load = virtio_net_post_load_virtio;
3978 vdc->vmsd = &vmstate_virtio_net_device;
3979 vdc->primary_unplug_pending = primary_unplug_pending;
3980 vdc->get_vhost = virtio_net_get_vhost;
3981 vdc->toggle_device_iotlb = vhost_toggle_device_iotlb;
3984 static const TypeInfo virtio_net_info = {
3985 .name = TYPE_VIRTIO_NET,
3986 .parent = TYPE_VIRTIO_DEVICE,
3987 .instance_size = sizeof(VirtIONet),
3988 .instance_init = virtio_net_instance_init,
3989 .class_init = virtio_net_class_init,
3992 static void virtio_register_types(void)
3994 type_register_static(&virtio_net_info);
3997 type_init(virtio_register_types)