net/xdp/xdp_umem.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /* XDP user-space packet buffer
   3  * Copyright(c) 2018 Intel Corporation.
   4  */
   5
   6 #include <linux/init.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/sched/signal.h>
   9 #include <linux/sched/task.h>
  10 #include <linux/uaccess.h>
  11 #include <linux/slab.h>
  12 #include <linux/bpf.h>
  13 #include <linux/mm.h>
  14 #include <linux/netdevice.h>
  15 #include <linux/rtnetlink.h>
  16 #include <linux/idr.h>
  17 #include <linux/vmalloc.h>
  18
  19 #include "xdp_umem.h"
  20 #include "xsk_queue.h"
  21
/* Smallest chunk size, in bytes, accepted by xdp_umem_reg(). */
#define XDP_UMEM_MIN_CHUNK_SIZE 2048

/* Allocator for umem->id; ids are taken in xdp_umem_create() and returned
 * in xdp_umem_release() or on the xdp_umem_create() failure path.
 */
static DEFINE_IDA(umem_ida);
  25
  26 void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
  27 {
  28         unsigned long flags;
  29
  30         if (!xs->tx)
  31                 return;
  32
  33         spin_lock_irqsave(&umem->xsk_list_lock, flags);
  34         list_add_rcu(&xs->list, &umem->xsk_list);
  35         spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
  36 }
  37
  38 void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
  39 {
  40         unsigned long flags;
  41
  42         if (!xs->tx)
  43                 return;
  44
  45         spin_lock_irqsave(&umem->xsk_list_lock, flags);
  46         list_del_rcu(&xs->list);
  47         spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
  48 }
  49
  50 /* The umem is stored both in the _rx struct and the _tx struct as we do
  51  * not know if the device has more tx queues than rx, or the opposite.
  52  * This might also change during run time.
  53  */
  54 static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
  55                                u16 queue_id)
  56 {
  57         if (queue_id >= max_t(unsigned int,
  58                               dev->real_num_rx_queues,
  59                               dev->real_num_tx_queues))
  60                 return -EINVAL;
  61
  62         if (queue_id < dev->real_num_rx_queues)
  63                 dev->_rx[queue_id].umem = umem;
  64         if (queue_id < dev->real_num_tx_queues)
  65                 dev->_tx[queue_id].umem = umem;
  66
  67         return 0;
  68 }
  69
  70 struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
  71                                        u16 queue_id)
  72 {
  73         if (queue_id < dev->real_num_rx_queues)
  74                 return dev->_rx[queue_id].umem;
  75         if (queue_id < dev->real_num_tx_queues)
  76                 return dev->_tx[queue_id].umem;
  77
  78         return NULL;
  79 }
  80 EXPORT_SYMBOL(xdp_get_umem_from_qid);
  81
  82 static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
  83 {
  84         if (queue_id < dev->real_num_rx_queues)
  85                 dev->_rx[queue_id].umem = NULL;
  86         if (queue_id < dev->real_num_tx_queues)
  87                 dev->_tx[queue_id].umem = NULL;
  88 }
  89
  90 int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
  91                         u16 queue_id, u16 flags)
  92 {
  93         bool force_zc, force_copy;
  94         struct netdev_bpf bpf;
  95         int err = 0;
  96
  97         ASSERT_RTNL();
  98
  99         force_zc = flags & XDP_ZEROCOPY;
 100         force_copy = flags & XDP_COPY;
 101
 102         if (force_zc && force_copy)
 103                 return -EINVAL;
 104
 105         if (xdp_get_umem_from_qid(dev, queue_id))
 106                 return -EBUSY;
 107
 108         err = xdp_reg_umem_at_qid(dev, umem, queue_id);
 109         if (err)
 110                 return err;
 111
 112         umem->dev = dev;
 113         umem->queue_id = queue_id;
 114
 115         if (flags & XDP_USE_NEED_WAKEUP) {
 116                 umem->flags |= XDP_UMEM_USES_NEED_WAKEUP;
 117                 /* Tx needs to be explicitly woken up the first time.
 118                  * Also for supporting drivers that do not implement this
 119                  * feature. They will always have to call sendto().
 120                  */
 121                 xsk_set_tx_need_wakeup(umem);
 122         }
 123
 124         dev_hold(dev);
 125
 126         if (force_copy)
 127                 /* For copy-mode, we are done. */
 128                 return 0;
 129
 130         if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_wakeup) {
 131                 err = -EOPNOTSUPP;
 132                 goto err_unreg_umem;
 133         }
 134
 135         bpf.command = XDP_SETUP_XSK_UMEM;
 136         bpf.xsk.umem = umem;
 137         bpf.xsk.queue_id = queue_id;
 138
 139         err = dev->netdev_ops->ndo_bpf(dev, &bpf);
 140         if (err)
 141                 goto err_unreg_umem;
 142
 143         umem->zc = true;
 144         return 0;
 145
 146 err_unreg_umem:
 147         if (!force_zc)
 148                 err = 0; /* fallback to copy mode */
 149         if (err)
 150                 xdp_clear_umem_at_qid(dev, queue_id);
 151         return err;
 152 }
 153
 154 void xdp_umem_clear_dev(struct xdp_umem *umem)
 155 {
 156         struct netdev_bpf bpf;
 157         int err;
 158
 159         ASSERT_RTNL();
 160
 161         if (!umem->dev)
 162                 return;
 163
 164         if (umem->zc) {
 165                 bpf.command = XDP_SETUP_XSK_UMEM;
 166                 bpf.xsk.umem = NULL;
 167                 bpf.xsk.queue_id = umem->queue_id;
 168
 169                 err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
 170
 171                 if (err)
 172                         WARN(1, "failed to disable umem!\n");
 173         }
 174
 175         xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
 176
 177         dev_put(umem->dev);
 178         umem->dev = NULL;
 179         umem->zc = false;
 180 }
 181
 182 static void xdp_umem_unmap_pages(struct xdp_umem *umem)
 183 {
 184         unsigned int i;
 185
 186         for (i = 0; i < umem->npgs; i++)
 187                 if (PageHighMem(umem->pgs[i]))
 188                         vunmap(umem->pages[i].addr);
 189 }
 190
 191 static int xdp_umem_map_pages(struct xdp_umem *umem)
 192 {
 193         unsigned int i;
 194         void *addr;
 195
 196         for (i = 0; i < umem->npgs; i++) {
 197                 if (PageHighMem(umem->pgs[i]))
 198                         addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL);
 199                 else
 200                         addr = page_address(umem->pgs[i]);
 201
 202                 if (!addr) {
 203                         xdp_umem_unmap_pages(umem);
 204                         return -ENOMEM;
 205                 }
 206
 207                 umem->pages[i].addr = addr;
 208         }
 209
 210         return 0;
 211 }
 212
 213 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
 214 {
 215         put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
 216
 217         kfree(umem->pgs);
 218         umem->pgs = NULL;
 219 }
 220
 221 static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
 222 {
 223         if (umem->user) {
 224                 atomic_long_sub(umem->npgs, &umem->user->locked_vm);
 225                 free_uid(umem->user);
 226         }
 227 }
 228
 229 static void xdp_umem_release(struct xdp_umem *umem)
 230 {
 231         rtnl_lock();
 232         xdp_umem_clear_dev(umem);
 233         rtnl_unlock();
 234
 235         ida_simple_remove(&umem_ida, umem->id);
 236
 237         if (umem->fq) {
 238                 xskq_destroy(umem->fq);
 239                 umem->fq = NULL;
 240         }
 241
 242         if (umem->cq) {
 243                 xskq_destroy(umem->cq);
 244                 umem->cq = NULL;
 245         }
 246
 247         xsk_reuseq_destroy(umem);
 248
 249         xdp_umem_unmap_pages(umem);
 250         xdp_umem_unpin_pages(umem);
 251
 252         kfree(umem->pages);
 253         umem->pages = NULL;
 254
 255         xdp_umem_unaccount_pages(umem);
 256         kfree(umem);
 257 }
 258
 259 static void xdp_umem_release_deferred(struct work_struct *work)
 260 {
 261         struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
 262
 263         xdp_umem_release(umem);
 264 }
 265
 266 void xdp_get_umem(struct xdp_umem *umem)
 267 {
 268         refcount_inc(&umem->users);
 269 }
 270
 271 void xdp_put_umem(struct xdp_umem *umem)
 272 {
 273         if (!umem)
 274                 return;
 275
 276         if (refcount_dec_and_test(&umem->users)) {
 277                 INIT_WORK(&umem->work, xdp_umem_release_deferred);
 278                 schedule_work(&umem->work);
 279         }
 280 }
 281
 282 static int xdp_umem_pin_pages(struct xdp_umem *umem)
 283 {
 284         unsigned int gup_flags = FOLL_WRITE;
 285         long npgs;
 286         int err;
 287
 288         umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
 289                             GFP_KERNEL | __GFP_NOWARN);
 290         if (!umem->pgs)
 291                 return -ENOMEM;
 292
 293         down_read(&current->mm->mmap_sem);
 294         npgs = get_user_pages(umem->address, umem->npgs,
 295                               gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
 296         up_read(&current->mm->mmap_sem);
 297
 298         if (npgs != umem->npgs) {
 299                 if (npgs >= 0) {
 300                         umem->npgs = npgs;
 301                         err = -ENOMEM;
 302                         goto out_pin;
 303                 }
 304                 err = npgs;
 305                 goto out_pgs;
 306         }
 307         return 0;
 308
 309 out_pin:
 310         xdp_umem_unpin_pages(umem);
 311 out_pgs:
 312         kfree(umem->pgs);
 313         umem->pgs = NULL;
 314         return err;
 315 }
 316
 317 static int xdp_umem_account_pages(struct xdp_umem *umem)
 318 {
 319         unsigned long lock_limit, new_npgs, old_npgs;
 320
 321         if (capable(CAP_IPC_LOCK))
 322                 return 0;
 323
 324         lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 325         umem->user = get_uid(current_user());
 326
 327         do {
 328                 old_npgs = atomic_long_read(&umem->user->locked_vm);
 329                 new_npgs = old_npgs + umem->npgs;
 330                 if (new_npgs > lock_limit) {
 331                         free_uid(umem->user);
 332                         umem->user = NULL;
 333                         return -ENOBUFS;
 334                 }
 335         } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
 336                                      new_npgs) != old_npgs);
 337         return 0;
 338 }
 339
 340 static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 341 {
 342         bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
 343         u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
 344         unsigned int chunks, chunks_per_page;
 345         u64 addr = mr->addr, size = mr->len;
 346         int size_chk, err;
 347
 348         if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 349                 /* Strictly speaking we could support this, if:
 350                  * - huge pages, or*
 351                  * - using an IOMMU, or
 352                  * - making sure the memory area is consecutive
 353                  * but for now, we simply say "computer says no".
 354                  */
 355                 return -EINVAL;
 356         }
 357
 358         if (mr->flags & ~(XDP_UMEM_UNALIGNED_CHUNK_FLAG |
 359                         XDP_UMEM_USES_NEED_WAKEUP))
 360                 return -EINVAL;
 361
 362         if (!unaligned_chunks && !is_power_of_2(chunk_size))
 363                 return -EINVAL;
 364
 365         if (!PAGE_ALIGNED(addr)) {
 366                 /* Memory area has to be page size aligned. For
 367                  * simplicity, this might change.
 368                  */
 369                 return -EINVAL;
 370         }
 371
 372         if ((addr + size) < addr)
 373                 return -EINVAL;
 374
 375         chunks = (unsigned int)div_u64(size, chunk_size);
 376         if (chunks == 0)
 377                 return -EINVAL;
 378
 379         if (!unaligned_chunks) {
 380                 chunks_per_page = PAGE_SIZE / chunk_size;
 381                 if (chunks < chunks_per_page || chunks % chunks_per_page)
 382                         return -EINVAL;
 383         }
 384
 385         size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
 386         if (size_chk < 0)
 387                 return -EINVAL;
 388
 389         umem->address = (unsigned long)addr;
 390         umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK
 391                                             : ~((u64)chunk_size - 1);
 392         umem->size = size;
 393         umem->headroom = headroom;
 394         umem->chunk_size_nohr = chunk_size - headroom;
 395         umem->npgs = size / PAGE_SIZE;
 396         umem->pgs = NULL;
 397         umem->user = NULL;
 398         umem->flags = mr->flags;
 399         INIT_LIST_HEAD(&umem->xsk_list);
 400         spin_lock_init(&umem->xsk_list_lock);
 401
 402         refcount_set(&umem->users, 1);
 403
 404         err = xdp_umem_account_pages(umem);
 405         if (err)
 406                 return err;
 407
 408         err = xdp_umem_pin_pages(umem);
 409         if (err)
 410                 goto out_account;
 411
 412         umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
 413         if (!umem->pages) {
 414                 err = -ENOMEM;
 415                 goto out_pin;
 416         }
 417
 418         err = xdp_umem_map_pages(umem);
 419         if (!err)
 420                 return 0;
 421
 422         kfree(umem->pages);
 423
 424 out_pin:
 425         xdp_umem_unpin_pages(umem);
 426 out_account:
 427         xdp_umem_unaccount_pages(umem);
 428         return err;
 429 }
 430
 431 struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
 432 {
 433         struct xdp_umem *umem;
 434         int err;
 435
 436         umem = kzalloc(sizeof(*umem), GFP_KERNEL);
 437         if (!umem)
 438                 return ERR_PTR(-ENOMEM);
 439
 440         err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
 441         if (err < 0) {
 442                 kfree(umem);
 443                 return ERR_PTR(err);
 444         }
 445         umem->id = err;
 446
 447         err = xdp_umem_reg(umem, mr);
 448         if (err) {
 449                 ida_simple_remove(&umem_ida, umem->id);
 450                 kfree(umem);
 451                 return ERR_PTR(err);
 452         }
 453
 454         return umem;
 455 }
 456
 457 bool xdp_umem_validate_queues(struct xdp_umem *umem)
 458 {
 459         return umem->fq && umem->cq;
 460 }