net/ipv4/fib_trie.c

   1 /*
   2  *   This program is free software; you can redistribute it and/or
   3  *   modify it under the terms of the GNU General Public License
   4  *   as published by the Free Software Foundation; either version
   5  *   2 of the License, or (at your option) any later version.
   6  *
   7  *   Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
   8  *     & Swedish University of Agricultural Sciences.
   9  *
  10  *   Jens Laas <jens.laas@data.slu.se> Swedish University of
  11  *     Agricultural Sciences.
  12  *
  13  *   Hans Liss <hans.liss@its.uu.se>  Uppsala Universitet
  14  *
  15  * This work is based on the LPC-trie which is originally descibed in:
  16  *
  17  * An experimental study of compression methods for dynamic tries
  18  * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
  19  * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
  20  *
  21  *
  22  * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
  23  * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
  24  *
  25  *
  26  * Code from fib_hash has been reused which includes the following header:
  27  *
  28  *
  29  * INET         An implementation of the TCP/IP protocol suite for the LINUX
  30  *              operating system.  INET is implemented using the  BSD Socket
  31  *              interface as the means of communication with the user level.
  32  *
  33  *              IPv4 FIB: lookup engine and maintenance routines.
  34  *
  35  *
  36  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  37  *
  38  *              This program is free software; you can redistribute it and/or
  39  *              modify it under the terms of the GNU General Public License
  40  *              as published by the Free Software Foundation; either version
  41  *              2 of the License, or (at your option) any later version.
  42  *
  43  * Substantial contributions to this work comes from:
  44  *
  45  *              David S. Miller, <davem@davemloft.net>
  46  *              Stephen Hemminger <shemminger@osdl.org>
  47  *              Paul E. McKenney <paulmck@us.ibm.com>
  48  *              Patrick McHardy <kaber@trash.net>
  49  */
  50
  51 #define VERSION "0.409"
  52
  53 #include <asm/uaccess.h>
  54 #include <asm/system.h>
  55 #include <linux/bitops.h>
  56 #include <linux/types.h>
  57 #include <linux/kernel.h>
  58 #include <linux/mm.h>
  59 #include <linux/string.h>
  60 #include <linux/socket.h>
  61 #include <linux/sockios.h>
  62 #include <linux/errno.h>
  63 #include <linux/in.h>
  64 #include <linux/inet.h>
  65 #include <linux/inetdevice.h>
  66 #include <linux/netdevice.h>
  67 #include <linux/if_arp.h>
  68 #include <linux/proc_fs.h>
  69 #include <linux/rcupdate.h>
  70 #include <linux/skbuff.h>
  71 #include <linux/netlink.h>
  72 #include <linux/init.h>
  73 #include <linux/list.h>
  74 #include <linux/slab.h>
  75 #include <net/net_namespace.h>
  76 #include <net/ip.h>
  77 #include <net/protocol.h>
  78 #include <net/route.h>
  79 #include <net/tcp.h>
  80 #include <net/sock.h>
  81 #include <net/ip_fib.h>
  82 #include "fib_lookup.h"
  83
  84 #define MAX_STAT_DEPTH 32
  85
  86 #define KEYLENGTH (8*sizeof(t_key))
  87
  88 typedef unsigned int t_key;
  89
  90 #define T_TNODE 0
  91 #define T_LEAF  1
  92 #define NODE_TYPE_MASK  0x1UL
  93 #define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
  94
  95 #define IS_TNODE(n) (!(n->parent & T_LEAF))
  96 #define IS_LEAF(n) (n->parent & T_LEAF)
  97
  98 struct node {
  99         unsigned long parent;
 100         t_key key;
 101 };
 102
 103 struct leaf {
 104         unsigned long parent;
 105         t_key key;
 106         struct hlist_head list;
 107         struct rcu_head rcu;
 108 };
 109
 110 struct leaf_info {
 111         struct hlist_node hlist;
 112         struct rcu_head rcu;
 113         int plen;
 114         struct list_head falh;
 115 };
 116
 117 struct tnode {
 118         unsigned long parent;
 119         t_key key;
 120         unsigned char pos;              /* 2log(KEYLENGTH) bits needed */
 121         unsigned char bits;             /* 2log(KEYLENGTH) bits needed */
 122         unsigned int full_children;     /* KEYLENGTH bits needed */
 123         unsigned int empty_children;    /* KEYLENGTH bits needed */
 124         union {
 125                 struct rcu_head rcu;
 126                 struct work_struct work;
 127                 struct tnode *tnode_free;
 128         };
 129         struct node *child[0];
 130 };
 131
 132 #ifdef CONFIG_IP_FIB_TRIE_STATS
 133 struct trie_use_stats {
 134         unsigned int gets;
 135         unsigned int backtrack;
 136         unsigned int semantic_match_passed;
 137         unsigned int semantic_match_miss;
 138         unsigned int null_node_hit;
 139         unsigned int resize_node_skipped;
 140 };
 141 #endif
 142
 143 struct trie_stat {
 144         unsigned int totdepth;
 145         unsigned int maxdepth;
 146         unsigned int tnodes;
 147         unsigned int leaves;
 148         unsigned int nullpointers;
 149         unsigned int prefixes;
 150         unsigned int nodesizes[MAX_STAT_DEPTH];
 151 };
 152
 153 struct trie {
 154         struct node *trie;
 155 #ifdef CONFIG_IP_FIB_TRIE_STATS
 156         struct trie_use_stats stats;
 157 #endif
 158 };
 159
 160 static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
 161 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 162                                   int wasfull);
 163 static struct node *resize(struct trie *t, struct tnode *tn);
 164 static struct tnode *inflate(struct trie *t, struct tnode *tn);
 165 static struct tnode *halve(struct trie *t, struct tnode *tn);
 166 /* tnodes to free after resize(); protected by RTNL */
 167 static struct tnode *tnode_free_head;
 168 static size_t tnode_free_size;
 169
 170 /*
 171  * synchronize_rcu after call_rcu for that many pages; it should be especially
 172  * useful before resizing the root node with PREEMPT_NONE configs; the value was
 173  * obtained experimentally, aiming to avoid visible slowdown.
 174  */
 175 static const int sync_pages = 128;
 176
 177 static struct kmem_cache *fn_alias_kmem __read_mostly;
 178 static struct kmem_cache *trie_leaf_kmem __read_mostly;
 179
 180 static inline struct tnode *node_parent(struct node *node)
 181 {
 182         return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
 183 }
 184
 185 static inline struct tnode *node_parent_rcu(struct node *node)
 186 {
 187         struct tnode *ret = node_parent(node);
 188
 189         return rcu_dereference_check(ret,
 190                                      rcu_read_lock_held() ||
 191                                      lockdep_rtnl_is_held());
 192 }
 193
 194 /* Same as rcu_assign_pointer
 195  * but that macro() assumes that value is a pointer.
 196  */
 197 static inline void node_set_parent(struct node *node, struct tnode *ptr)
 198 {
 199         smp_wmb();
 200         node->parent = (unsigned long)ptr | NODE_TYPE(node);
 201 }
 202
 203 static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
 204 {
 205         BUG_ON(i >= 1U << tn->bits);
 206
 207         return tn->child[i];
 208 }
 209
 210 static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 211 {
 212         struct node *ret = tnode_get_child(tn, i);
 213
 214         return rcu_dereference_check(ret,
 215                                      rcu_read_lock_held() ||
 216                                      lockdep_rtnl_is_held());
 217 }
 218
 219 static inline int tnode_child_length(const struct tnode *tn)
 220 {
 221         return 1 << tn->bits;
 222 }
 223
 224 static inline t_key mask_pfx(t_key k, unsigned short l)
 225 {
 226         return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
 227 }
 228
 229 static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
 230 {
 231         if (offset < KEYLENGTH)
 232                 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
 233         else
 234                 return 0;
 235 }
 236
 237 static inline int tkey_equals(t_key a, t_key b)
 238 {
 239         return a == b;
 240 }
 241
 242 static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
 243 {
 244         if (bits == 0 || offset >= KEYLENGTH)
 245                 return 1;
 246         bits = bits > KEYLENGTH ? KEYLENGTH : bits;
 247         return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
 248 }
 249
 250 static inline int tkey_mismatch(t_key a, int offset, t_key b)
 251 {
 252         t_key diff = a ^ b;
 253         int i = offset;
 254
 255         if (!diff)
 256                 return 0;
 257         while ((diff << i) >> (KEYLENGTH-1) == 0)
 258                 i++;
 259         return i;
 260 }
 261
 262 /*
 263   To understand this stuff, an understanding of keys and all their bits is
 264   necessary. Every node in the trie has a key associated with it, but not
 265   all of the bits in that key are significant.
 266
 267   Consider a node 'n' and its parent 'tp'.
 268
 269   If n is a leaf, every bit in its key is significant. Its presence is
 270   necessitated by path compression, since during a tree traversal (when
 271   searching for a leaf - unless we are doing an insertion) we will completely
 272   ignore all skipped bits we encounter. Thus we need to verify, at the end of
 273   a potentially successful search, that we have indeed been walking the
 274   correct key path.
 275
 276   Note that we can never "miss" the correct key in the tree if present by
 277   following the wrong path. Path compression ensures that segments of the key
 278   that are the same for all keys with a given prefix are skipped, but the
 279   skipped part *is* identical for each node in the subtrie below the skipped
 280   bit! trie_insert() in this implementation takes care of that - note the
 281   call to tkey_sub_equals() in trie_insert().
 282
 283   if n is an internal node - a 'tnode' here, the various parts of its key
 284   have many different meanings.
 285
 286   Example:
 287   _________________________________________________________________
 288   | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
 289   -----------------------------------------------------------------
 290     0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 291
 292   _________________________________________________________________
 293   | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
 294   -----------------------------------------------------------------
 295    16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
 296
 297   tp->pos = 7
 298   tp->bits = 3
 299   n->pos = 15
 300   n->bits = 4
 301
 302   First, let's just ignore the bits that come before the parent tp, that is
 303   the bits from 0 to (tp->pos-1). They are *known* but at this point we do
 304   not use them for anything.
 305
 306   The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
 307   index into the parent's child array. That is, they will be used to find
 308   'n' among tp's children.
 309
 310   The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
 311   for the node n.
 312
 313   All the bits we have seen so far are significant to the node n. The rest
 314   of the bits are really not needed or indeed known in n->key.
 315
 316   The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
 317   n's child array, and will of course be different for each child.
 318
 319
 320   The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
 321   at this point.
 322
 323 */
 324
 325 static inline void check_tnode(const struct tnode *tn)
 326 {
 327         WARN_ON(tn && tn->pos+tn->bits > 32);
 328 }
 329
 330 static const int halve_threshold = 25;
 331 static const int inflate_threshold = 50;
 332 static const int halve_threshold_root = 15;
 333 static const int inflate_threshold_root = 30;
 334
 335 static void __alias_free_mem(struct rcu_head *head)
 336 {
 337         struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
 338         kmem_cache_free(fn_alias_kmem, fa);
 339 }
 340
 341 static inline void alias_free_mem_rcu(struct fib_alias *fa)
 342 {
 343         call_rcu(&fa->rcu, __alias_free_mem);
 344 }
 345
 346 static void __leaf_free_rcu(struct rcu_head *head)
 347 {
 348         struct leaf *l = container_of(head, struct leaf, rcu);
 349         kmem_cache_free(trie_leaf_kmem, l);
 350 }
 351
 352 static inline void free_leaf(struct leaf *l)
 353 {
 354         call_rcu_bh(&l->rcu, __leaf_free_rcu);
 355 }
 356
 357 static void __leaf_info_free_rcu(struct rcu_head *head)
 358 {
 359         kfree(container_of(head, struct leaf_info, rcu));
 360 }
 361
 362 static inline void free_leaf_info(struct leaf_info *leaf)
 363 {
 364         call_rcu(&leaf->rcu, __leaf_info_free_rcu);
 365 }
 366
 367 static struct tnode *tnode_alloc(size_t size)
 368 {
 369         if (size <= PAGE_SIZE)
 370                 return kzalloc(size, GFP_KERNEL);
 371         else
 372                 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 373 }
 374
 375 static void __tnode_vfree(struct work_struct *arg)
 376 {
 377         struct tnode *tn = container_of(arg, struct tnode, work);
 378         vfree(tn);
 379 }
 380
 381 static void __tnode_free_rcu(struct rcu_head *head)
 382 {
 383         struct tnode *tn = container_of(head, struct tnode, rcu);
 384         size_t size = sizeof(struct tnode) +
 385                       (sizeof(struct node *) << tn->bits);
 386
 387         if (size <= PAGE_SIZE)
 388                 kfree(tn);
 389         else {
 390                 INIT_WORK(&tn->work, __tnode_vfree);
 391                 schedule_work(&tn->work);
 392         }
 393 }
 394
 395 static inline void tnode_free(struct tnode *tn)
 396 {
 397         if (IS_LEAF(tn))
 398                 free_leaf((struct leaf *) tn);
 399         else
 400                 call_rcu(&tn->rcu, __tnode_free_rcu);
 401 }
 402
 403 static void tnode_free_safe(struct tnode *tn)
 404 {
 405         BUG_ON(IS_LEAF(tn));
 406         tn->tnode_free = tnode_free_head;
 407         tnode_free_head = tn;
 408         tnode_free_size += sizeof(struct tnode) +
 409                            (sizeof(struct node *) << tn->bits);
 410 }
 411
 412 static void tnode_free_flush(void)
 413 {
 414         struct tnode *tn;
 415
 416         while ((tn = tnode_free_head)) {
 417                 tnode_free_head = tn->tnode_free;
 418                 tn->tnode_free = NULL;
 419                 tnode_free(tn);
 420         }
 421
 422         if (tnode_free_size >= PAGE_SIZE * sync_pages) {
 423                 tnode_free_size = 0;
 424                 synchronize_rcu();
 425         }
 426 }
 427
 428 static struct leaf *leaf_new(void)
 429 {
 430         struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
 431         if (l) {
 432                 l->parent = T_LEAF;
 433                 INIT_HLIST_HEAD(&l->list);
 434         }
 435         return l;
 436 }
 437
 438 static struct leaf_info *leaf_info_new(int plen)
 439 {
 440         struct leaf_info *li = kmalloc(sizeof(struct leaf_info),  GFP_KERNEL);
 441         if (li) {
 442                 li->plen = plen;
 443                 INIT_LIST_HEAD(&li->falh);
 444         }
 445         return li;
 446 }
 447
 448 static struct tnode *tnode_new(t_key key, int pos, int bits)
 449 {
 450         size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
 451         struct tnode *tn = tnode_alloc(sz);
 452
 453         if (tn) {
 454                 tn->parent = T_TNODE;
 455                 tn->pos = pos;
 456                 tn->bits = bits;
 457                 tn->key = key;
 458                 tn->full_children = 0;
 459                 tn->empty_children = 1<<bits;
 460         }
 461
 462         pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
 463                  (unsigned long) (sizeof(struct node) << bits));
 464         return tn;
 465 }
 466
 467 /*
 468  * Check whether a tnode 'n' is "full", i.e. it is an internal node
 469  * and no bits are skipped. See discussion in dyntree paper p. 6
 470  */
 471
 472 static inline int tnode_full(const struct tnode *tn, const struct node *n)
 473 {
 474         if (n == NULL || IS_LEAF(n))
 475                 return 0;
 476
 477         return ((struct tnode *) n)->pos == tn->pos + tn->bits;
 478 }
 479
 480 static inline void put_child(struct trie *t, struct tnode *tn, int i,
 481                              struct node *n)
 482 {
 483         tnode_put_child_reorg(tn, i, n, -1);
 484 }
 485
 486  /*
 487   * Add a child at position i overwriting the old value.
 488   * Update the value of full_children and empty_children.
 489   */
 490
 491 static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
 492                                   int wasfull)
 493 {
 494         struct node *chi = tn->child[i];
 495         int isfull;
 496
 497         BUG_ON(i >= 1<<tn->bits);
 498
 499         /* update emptyChildren */
 500         if (n == NULL && chi != NULL)
 501                 tn->empty_children++;
 502         else if (n != NULL && chi == NULL)
 503                 tn->empty_children--;
 504
 505         /* update fullChildren */
 506         if (wasfull == -1)
 507                 wasfull = tnode_full(tn, chi);
 508
 509         isfull = tnode_full(tn, n);
 510         if (wasfull && !isfull)
 511                 tn->full_children--;
 512         else if (!wasfull && isfull)
 513                 tn->full_children++;
 514
 515         if (n)
 516                 node_set_parent(n, tn);
 517
 518         rcu_assign_pointer(tn->child[i], n);
 519 }
 520
 521 #define MAX_WORK 10
 522 static struct node *resize(struct trie *t, struct tnode *tn)
 523 {
 524         int i;
 525         struct tnode *old_tn;
 526         int inflate_threshold_use;
 527         int halve_threshold_use;
 528         int max_work;
 529
 530         if (!tn)
 531                 return NULL;
 532
 533         pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
 534                  tn, inflate_threshold, halve_threshold);
 535
 536         /* No children */
 537         if (tn->empty_children == tnode_child_length(tn)) {
 538                 tnode_free_safe(tn);
 539                 return NULL;
 540         }
 541         /* One child */
 542         if (tn->empty_children == tnode_child_length(tn) - 1)
 543                 goto one_child;
 544         /*
 545          * Double as long as the resulting node has a number of
 546          * nonempty nodes that are above the threshold.
 547          */
 548
 549         /*
 550          * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
 551          * the Helsinki University of Technology and Matti Tikkanen of Nokia
 552          * Telecommunications, page 6:
 553          * "A node is doubled if the ratio of non-empty children to all
 554          * children in the *doubled* node is at least 'high'."
 555          *
 556          * 'high' in this instance is the variable 'inflate_threshold'. It
 557          * is expressed as a percentage, so we multiply it with
 558          * tnode_child_length() and instead of multiplying by 2 (since the
 559          * child array will be doubled by inflate()) and multiplying
 560          * the left-hand side by 100 (to handle the percentage thing) we
 561          * multiply the left-hand side by 50.
 562          *
 563          * The left-hand side may look a bit weird: tnode_child_length(tn)
 564          * - tn->empty_children is of course the number of non-null children
 565          * in the current node. tn->full_children is the number of "full"
 566          * children, that is non-null tnodes with a skip value of 0.
 567          * All of those will be doubled in the resulting inflated tnode, so
 568          * we just count them one extra time here.
 569          *
 570          * A clearer way to write this would be:
 571          *
 572          * to_be_doubled = tn->full_children;
 573          * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
 574          *     tn->full_children;
 575          *
 576          * new_child_length = tnode_child_length(tn) * 2;
 577          *
 578          * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
 579          *      new_child_length;
 580          * if (new_fill_factor >= inflate_threshold)
 581          *
 582          * ...and so on, tho it would mess up the while () loop.
 583          *
 584          * anyway,
 585          * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
 586          *      inflate_threshold
 587          *
 588          * avoid a division:
 589          * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
 590          *      inflate_threshold * new_child_length
 591          *
 592          * expand not_to_be_doubled and to_be_doubled, and shorten:
 593          * 100 * (tnode_child_length(tn) - tn->empty_children +
 594          *    tn->full_children) >= inflate_threshold * new_child_length
 595          *
 596          * expand new_child_length:
 597          * 100 * (tnode_child_length(tn) - tn->empty_children +
 598          *    tn->full_children) >=
 599          *      inflate_threshold * tnode_child_length(tn) * 2
 600          *
 601          * shorten again:
 602          * 50 * (tn->full_children + tnode_child_length(tn) -
 603          *    tn->empty_children) >= inflate_threshold *
 604          *    tnode_child_length(tn)
 605          *
 606          */
 607
 608         check_tnode(tn);
 609
 610         /* Keep root node larger  */
 611
 612         if (!node_parent((struct node*) tn)) {
 613                 inflate_threshold_use = inflate_threshold_root;
 614                 halve_threshold_use = halve_threshold_root;
 615         }
 616         else {
 617                 inflate_threshold_use = inflate_threshold;
 618                 halve_threshold_use = halve_threshold;
 619         }
 620
 621         max_work = MAX_WORK;
 622         while ((tn->full_children > 0 &&  max_work-- &&
 623                 50 * (tn->full_children + tnode_child_length(tn)
 624                       - tn->empty_children)
 625                 >= inflate_threshold_use * tnode_child_length(tn))) {
 626
 627                 old_tn = tn;
 628                 tn = inflate(t, tn);
 629
 630                 if (IS_ERR(tn)) {
 631                         tn = old_tn;
 632 #ifdef CONFIG_IP_FIB_TRIE_STATS
 633                         t->stats.resize_node_skipped++;
 634 #endif
 635                         break;
 636                 }
 637         }
 638
 639         check_tnode(tn);
 640
 641         /* Return if at least one inflate is run */
 642         if( max_work != MAX_WORK)
 643                 return (struct node *) tn;
 644
 645         /*
 646          * Halve as long as the number of empty children in this
 647          * node is above threshold.
 648          */
 649
 650         max_work = MAX_WORK;
 651         while (tn->bits > 1 &&  max_work-- &&
 652                100 * (tnode_child_length(tn) - tn->empty_children) <
 653                halve_threshold_use * tnode_child_length(tn)) {
 654
 655                 old_tn = tn;
 656                 tn = halve(t, tn);
 657                 if (IS_ERR(tn)) {
 658                         tn = old_tn;
 659 #ifdef CONFIG_IP_FIB_TRIE_STATS
 660                         t->stats.resize_node_skipped++;
 661 #endif
 662                         break;
 663                 }
 664         }
 665
 666
 667         /* Only one child remains */
 668         if (tn->empty_children == tnode_child_length(tn) - 1) {
 669 one_child:
 670                 for (i = 0; i < tnode_child_length(tn); i++) {
 671                         struct node *n;
 672
 673                         n = tn->child[i];
 674                         if (!n)
 675                                 continue;
 676
 677                         /* compress one level */
 678
 679                         node_set_parent(n, NULL);
 680                         tnode_free_safe(tn);
 681                         return n;
 682                 }
 683         }
 684         return (struct node *) tn;
 685 }
 686
 687 static struct tnode *inflate(struct trie *t, struct tnode *tn)
 688 {
 689         struct tnode *oldtnode = tn;
 690         int olen = tnode_child_length(tn);
 691         int i;
 692
 693         pr_debug("In inflate\n");
 694
 695         tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
 696
 697         if (!tn)
 698                 return ERR_PTR(-ENOMEM);
 699
 700         /*
 701          * Preallocate and store tnodes before the actual work so we
 702          * don't get into an inconsistent state if memory allocation
 703          * fails. In case of failure we return the oldnode and  inflate
 704          * of tnode is ignored.
 705          */
 706
 707         for (i = 0; i < olen; i++) {
 708                 struct tnode *inode;
 709
 710                 inode = (struct tnode *) tnode_get_child(oldtnode, i);
 711                 if (inode &&
 712                     IS_TNODE(inode) &&
 713                     inode->pos == oldtnode->pos + oldtnode->bits &&
 714                     inode->bits > 1) {
 715                         struct tnode *left, *right;
 716                         t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
 717
 718                         left = tnode_new(inode->key&(~m), inode->pos + 1,
 719                                          inode->bits - 1);
 720                         if (!left)
 721                                 goto nomem;
 722
 723                         right = tnode_new(inode->key|m, inode->pos + 1,
 724                                           inode->bits - 1);
 725
 726                         if (!right) {
 727                                 tnode_free(left);
 728                                 goto nomem;
 729                         }
 730
 731                         put_child(t, tn, 2*i, (struct node *) left);
 732                         put_child(t, tn, 2*i+1, (struct node *) right);
 733                 }
 734         }
 735
 736         for (i = 0; i < olen; i++) {
 737                 struct tnode *inode;
 738                 struct node *node = tnode_get_child(oldtnode, i);
 739                 struct tnode *left, *right;
 740                 int size, j;
 741
 742                 /* An empty child */
 743                 if (node == NULL)
 744                         continue;
 745
 746                 /* A leaf or an internal node with skipped bits */
 747
 748                 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
 749                    tn->pos + tn->bits - 1) {
 750                         if (tkey_extract_bits(node->key,
 751                                               oldtnode->pos + oldtnode->bits,
 752                                               1) == 0)
 753                                 put_child(t, tn, 2*i, node);
 754                         else
 755                                 put_child(t, tn, 2*i+1, node);
 756                         continue;
 757                 }
 758
 759                 /* An internal node with two children */
 760                 inode = (struct tnode *) node;
 761
 762                 if (inode->bits == 1) {
 763                         put_child(t, tn, 2*i, inode->child[0]);
 764                         put_child(t, tn, 2*i+1, inode->child[1]);
 765
 766                         tnode_free_safe(inode);
 767                         continue;
 768                 }
 769
 770                 /* An internal node with more than two children */
 771
 772                 /* We will replace this node 'inode' with two new
 773                  * ones, 'left' and 'right', each with half of the
 774                  * original children. The two new nodes will have
 775                  * a position one bit further down the key and this
 776                  * means that the "significant" part of their keys
 777                  * (see the discussion near the top of this file)
 778                  * will differ by one bit, which will be "0" in
 779                  * left's key and "1" in right's key. Since we are
 780                  * moving the key position by one step, the bit that
 781                  * we are moving away from - the bit at position
 782                  * (inode->pos) - is the one that will differ between
 783                  * left and right. So... we synthesize that bit in the
 784                  * two  new keys.
 785                  * The mask 'm' below will be a single "one" bit at
 786                  * the position (inode->pos)
 787                  */
 788
 789                 /* Use the old key, but set the new significant
 790                  *   bit to zero.
 791                  */
 792
 793                 left = (struct tnode *) tnode_get_child(tn, 2*i);
 794                 put_child(t, tn, 2*i, NULL);
 795
 796                 BUG_ON(!left);
 797
 798                 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
 799                 put_child(t, tn, 2*i+1, NULL);
 800
 801                 BUG_ON(!right);
 802
 803                 size = tnode_child_length(left);
 804                 for (j = 0; j < size; j++) {
 805                         put_child(t, left, j, inode->child[j]);
 806                         put_child(t, right, j, inode->child[j + size]);
 807                 }
 808                 put_child(t, tn, 2*i, resize(t, left));
 809                 put_child(t, tn, 2*i+1, resize(t, right));
 810
 811                 tnode_free_safe(inode);
 812         }
 813         tnode_free_safe(oldtnode);
 814         return tn;
 815 nomem:
 816         {
 817                 int size = tnode_child_length(tn);
 818                 int j;
 819
 820                 for (j = 0; j < size; j++)
 821                         if (tn->child[j])
 822                                 tnode_free((struct tnode *)tn->child[j]);
 823
 824                 tnode_free(tn);
 825
 826                 return ERR_PTR(-ENOMEM);
 827         }
 828 }
 829
 830 static struct tnode *halve(struct trie *t, struct tnode *tn)
 831 {
 832         struct tnode *oldtnode = tn;
 833         struct node *left, *right;
 834         int i;
 835         int olen = tnode_child_length(tn);
 836
 837         pr_debug("In halve\n");
 838
 839         tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
 840
 841         if (!tn)
 842                 return ERR_PTR(-ENOMEM);
 843
 844         /*
 845          * Preallocate and store tnodes before the actual work so we
 846          * don't get into an inconsistent state if memory allocation
 847          * fails. In case of failure we return the oldnode and halve
 848          * of tnode is ignored.
 849          */
 850
 851         for (i = 0; i < olen; i += 2) {
 852                 left = tnode_get_child(oldtnode, i);
 853                 right = tnode_get_child(oldtnode, i+1);
 854
 855                 /* Two nonempty children */
 856                 if (left && right) {
 857                         struct tnode *newn;
 858
 859                         newn = tnode_new(left->key, tn->pos + tn->bits, 1);
 860
 861                         if (!newn)
 862                                 goto nomem;
 863
 864                         put_child(t, tn, i/2, (struct node *)newn);
 865                 }
 866
 867         }
 868
 869         for (i = 0; i < olen; i += 2) {
 870                 struct tnode *newBinNode;
 871
 872                 left = tnode_get_child(oldtnode, i);
 873                 right = tnode_get_child(oldtnode, i+1);
 874
 875                 /* At least one of the children is empty */
 876                 if (left == NULL) {
 877                         if (right == NULL)    /* Both are empty */
 878                                 continue;
 879                         put_child(t, tn, i/2, right);
 880                         continue;
 881                 }
 882
 883                 if (right == NULL) {
 884                         put_child(t, tn, i/2, left);
 885                         continue;
 886                 }
 887
 888                 /* Two nonempty children */
 889                 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
 890                 put_child(t, tn, i/2, NULL);
 891                 put_child(t, newBinNode, 0, left);
 892                 put_child(t, newBinNode, 1, right);
 893                 put_child(t, tn, i/2, resize(t, newBinNode));
 894         }
 895         tnode_free_safe(oldtnode);
 896         return tn;
 897 nomem:
 898         {
 899                 int size = tnode_child_length(tn);
 900                 int j;
 901
 902                 for (j = 0; j < size; j++)
 903                         if (tn->child[j])
 904                                 tnode_free((struct tnode *)tn->child[j]);
 905
 906                 tnode_free(tn);
 907
 908                 return ERR_PTR(-ENOMEM);
 909         }
 910 }
 911
 912 /* readside must use rcu_read_lock currently dump routines
 913  via get_fa_head and dump */
 914
 915 static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
 916 {
 917         struct hlist_head *head = &l->list;
 918         struct hlist_node *node;
 919         struct leaf_info *li;
 920
 921         hlist_for_each_entry_rcu(li, node, head, hlist)
 922                 if (li->plen == plen)
 923                         return li;
 924
 925         return NULL;
 926 }
 927
 928 static inline struct list_head *get_fa_head(struct leaf *l, int plen)
 929 {
 930         struct leaf_info *li = find_leaf_info(l, plen);
 931
 932         if (!li)
 933                 return NULL;
 934
 935         return &li->falh;
 936 }
 937
 938 static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
 939 {
 940         struct leaf_info *li = NULL, *last = NULL;
 941         struct hlist_node *node;
 942
 943         if (hlist_empty(head)) {
 944                 hlist_add_head_rcu(&new->hlist, head);
 945         } else {
 946                 hlist_for_each_entry(li, node, head, hlist) {
 947                         if (new->plen > li->plen)
 948                                 break;
 949
 950                         last = li;
 951                 }
 952                 if (last)
 953                         hlist_add_after_rcu(&last->hlist, &new->hlist);
 954                 else
 955                         hlist_add_before_rcu(&new->hlist, &li->hlist);
 956         }
 957 }
 958
 959 /* rcu_read_lock needs to be hold by caller from readside */
 960
 961 static struct leaf *
 962 fib_find_node(struct trie *t, u32 key)
 963 {
 964         int pos;
 965         struct tnode *tn;
 966         struct node *n;
 967
 968         pos = 0;
 969         n = rcu_dereference_check(t->trie,
 970                                   rcu_read_lock_held() ||
 971                                   lockdep_rtnl_is_held());
 972
 973         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 974                 tn = (struct tnode *) n;
 975
 976                 check_tnode(tn);
 977
 978                 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
 979                         pos = tn->pos + tn->bits;
 980                         n = tnode_get_child_rcu(tn,
 981                                                 tkey_extract_bits(key,
 982                                                                   tn->pos,
 983                                                                   tn->bits));
 984                 } else
 985                         break;
 986         }
 987         /* Case we have found a leaf. Compare prefixes */
 988
 989         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
 990                 return (struct leaf *)n;
 991
 992         return NULL;
 993 }
 994
 995 static void trie_rebalance(struct trie *t, struct tnode *tn)
 996 {
 997         int wasfull;
 998         t_key cindex, key;
 999         struct tnode *tp;
1000
1001         key = tn->key;
1002
1003         while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
1004                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1005                 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
1006                 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1007
1008                 tnode_put_child_reorg((struct tnode *)tp, cindex,
1009                                       (struct node *)tn, wasfull);
1010
1011                 tp = node_parent((struct node *) tn);
1012                 if (!tp)
1013                         rcu_assign_pointer(t->trie, (struct node *)tn);
1014
1015                 tnode_free_flush();
1016                 if (!tp)
1017                         break;
1018                 tn = tp;
1019         }
1020
1021         /* Handle last (top) tnode */
1022         if (IS_TNODE(tn))
1023                 tn = (struct tnode *)resize(t, (struct tnode *)tn);
1024
1025         rcu_assign_pointer(t->trie, (struct node *)tn);
1026         tnode_free_flush();
1027 }
1028
1029 /* only used from updater-side */
1030
1031 static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1032 {
1033         int pos, newpos;
1034         struct tnode *tp = NULL, *tn = NULL;
1035         struct node *n;
1036         struct leaf *l;
1037         int missbit;
1038         struct list_head *fa_head = NULL;
1039         struct leaf_info *li;
1040         t_key cindex;
1041
1042         pos = 0;
1043         n = t->trie;
1044
1045         /* If we point to NULL, stop. Either the tree is empty and we should
1046          * just put a new leaf in if, or we have reached an empty child slot,
1047          * and we should just put our new leaf in that.
1048          * If we point to a T_TNODE, check if it matches our key. Note that
1049          * a T_TNODE might be skipping any number of bits - its 'pos' need
1050          * not be the parent's 'pos'+'bits'!
1051          *
1052          * If it does match the current key, get pos/bits from it, extract
1053          * the index from our key, push the T_TNODE and walk the tree.
1054          *
1055          * If it doesn't, we have to replace it with a new T_TNODE.
1056          *
1057          * If we point to a T_LEAF, it might or might not have the same key
1058          * as we do. If it does, just change the value, update the T_LEAF's
1059          * value, and return it.
1060          * If it doesn't, we need to replace it with a T_TNODE.
1061          */
1062
1063         while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
1064                 tn = (struct tnode *) n;
1065
1066                 check_tnode(tn);
1067
1068                 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
1069                         tp = tn;
1070                         pos = tn->pos + tn->bits;
1071                         n = tnode_get_child(tn,
1072                                             tkey_extract_bits(key,
1073                                                               tn->pos,
1074                                                               tn->bits));
1075
1076                         BUG_ON(n && node_parent(n) != tn);
1077                 } else
1078                         break;
1079         }
1080
1081         /*
1082          * n  ----> NULL, LEAF or TNODE
1083          *
1084          * tp is n's (parent) ----> NULL or TNODE
1085          */
1086
1087         BUG_ON(tp && IS_LEAF(tp));
1088
1089         /* Case 1: n is a leaf. Compare prefixes */
1090
1091         if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
1092                 l = (struct leaf *) n;
1093                 li = leaf_info_new(plen);
1094
1095                 if (!li)
1096                         return NULL;
1097
1098                 fa_head = &li->falh;
1099                 insert_leaf_info(&l->list, li);
1100                 goto done;
1101         }
1102         l = leaf_new();
1103
1104         if (!l)
1105                 return NULL;
1106
1107         l->key = key;
1108         li = leaf_info_new(plen);
1109
1110         if (!li) {
1111                 free_leaf(l);
1112                 return NULL;
1113         }
1114
1115         fa_head = &li->falh;
1116         insert_leaf_info(&l->list, li);
1117
1118         if (t->trie && n == NULL) {
1119                 /* Case 2: n is NULL, and will just insert a new leaf */
1120
1121                 node_set_parent((struct node *)l, tp);
1122
1123                 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124                 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1125         } else {
1126                 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1127                 /*
1128                  *  Add a new tnode here
1129                  *  first tnode need some special handling
1130                  */
1131
1132                 if (tp)
1133                         pos = tp->pos+tp->bits;
1134                 else
1135                         pos = 0;
1136
1137                 if (n) {
1138                         newpos = tkey_mismatch(key, pos, n->key);
1139                         tn = tnode_new(n->key, newpos, 1);
1140                 } else {
1141                         newpos = 0;
1142                         tn = tnode_new(key, newpos, 1); /* First tnode */
1143                 }
1144
1145                 if (!tn) {
1146                         free_leaf_info(li);
1147                         free_leaf(l);
1148                         return NULL;
1149                 }
1150
1151                 node_set_parent((struct node *)tn, tp);
1152
1153                 missbit = tkey_extract_bits(key, newpos, 1);
1154                 put_child(t, tn, missbit, (struct node *)l);
1155                 put_child(t, tn, 1-missbit, n);
1156
1157                 if (tp) {
1158                         cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1159                         put_child(t, (struct tnode *)tp, cindex,
1160                                   (struct node *)tn);
1161                 } else {
1162                         rcu_assign_pointer(t->trie, (struct node *)tn);
1163                         tp = tn;
1164                 }
1165         }
1166
1167         if (tp && tp->pos + tp->bits > 32)
1168                 pr_warning("fib_trie"
1169                            " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1170                            tp, tp->pos, tp->bits, key, plen);
1171
1172         /* Rebalance the trie */
1173
1174         trie_rebalance(t, tp);
1175 done:
1176         return fa_head;
1177 }
1178
1179 /*
1180  * Caller must hold RTNL.
1181  */
1182 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
1183 {
1184         struct trie *t = (struct trie *) tb->tb_data;
1185         struct fib_alias *fa, *new_fa;
1186         struct list_head *fa_head = NULL;
1187         struct fib_info *fi;
1188         int plen = cfg->fc_dst_len;
1189         u8 tos = cfg->fc_tos;
1190         u32 key, mask;
1191         int err;
1192         struct leaf *l;
1193
1194         if (plen > 32)
1195                 return -EINVAL;
1196
1197         key = ntohl(cfg->fc_dst);
1198
1199         pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
1200
1201         mask = ntohl(inet_make_mask(plen));
1202
1203         if (key & ~mask)
1204                 return -EINVAL;
1205
1206         key = key & mask;
1207
1208         fi = fib_create_info(cfg);
1209         if (IS_ERR(fi)) {
1210                 err = PTR_ERR(fi);
1211                 goto err;
1212         }
1213
1214         l = fib_find_node(t, key);
1215         fa = NULL;
1216
1217         if (l) {
1218                 fa_head = get_fa_head(l, plen);
1219                 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1220         }
1221
1222         /* Now fa, if non-NULL, points to the first fib alias
1223          * with the same keys [prefix,tos,priority], if such key already
1224          * exists or to the node before which we will insert new one.
1225          *
1226          * If fa is NULL, we will need to allocate a new one and
1227          * insert to the head of f.
1228          *
1229          * If f is NULL, no fib node matched the destination key
1230          * and we need to allocate a new one of those as well.
1231          */
1232
1233         if (fa && fa->fa_tos == tos &&
1234             fa->fa_info->fib_priority == fi->fib_priority) {
1235                 struct fib_alias *fa_first, *fa_match;
1236
1237                 err = -EEXIST;
1238                 if (cfg->fc_nlflags & NLM_F_EXCL)
1239                         goto out;
1240
1241                 /* We have 2 goals:
1242                  * 1. Find exact match for type, scope, fib_info to avoid
1243                  * duplicate routes
1244                  * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1245                  */
1246                 fa_match = NULL;
1247                 fa_first = fa;
1248                 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1249                 list_for_each_entry_continue(fa, fa_head, fa_list) {
1250                         if (fa->fa_tos != tos)
1251                                 break;
1252                         if (fa->fa_info->fib_priority != fi->fib_priority)
1253                                 break;
1254                         if (fa->fa_type == cfg->fc_type &&
1255                             fa->fa_scope == cfg->fc_scope &&
1256                             fa->fa_info == fi) {
1257                                 fa_match = fa;
1258                                 break;
1259                         }
1260                 }
1261
1262                 if (cfg->fc_nlflags & NLM_F_REPLACE) {
1263                         struct fib_info *fi_drop;
1264                         u8 state;
1265
1266                         fa = fa_first;
1267                         if (fa_match) {
1268                                 if (fa == fa_match)
1269                                         err = 0;
1270                                 goto out;
1271                         }
1272                         err = -ENOBUFS;
1273                         new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1274                         if (new_fa == NULL)
1275                                 goto out;
1276
1277                         fi_drop = fa->fa_info;
1278                         new_fa->fa_tos = fa->fa_tos;
1279                         new_fa->fa_info = fi;
1280                         new_fa->fa_type = cfg->fc_type;
1281                         new_fa->fa_scope = cfg->fc_scope;
1282                         state = fa->fa_state;
1283                         new_fa->fa_state = state & ~FA_S_ACCESSED;
1284
1285                         list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1286                         alias_free_mem_rcu(fa);
1287
1288                         fib_release_info(fi_drop);
1289                         if (state & FA_S_ACCESSED)
1290                                 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1291                         rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1292                                 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
1293
1294                         goto succeeded;
1295                 }
1296                 /* Error if we find a perfect match which
1297                  * uses the same scope, type, and nexthop
1298                  * information.
1299                  */
1300                 if (fa_match)
1301                         goto out;
1302
1303                 if (!(cfg->fc_nlflags & NLM_F_APPEND))
1304                         fa = fa_first;
1305         }
1306         err = -ENOENT;
1307         if (!(cfg->fc_nlflags & NLM_F_CREATE))
1308                 goto out;
1309
1310         err = -ENOBUFS;
1311         new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
1312         if (new_fa == NULL)
1313                 goto out;
1314
1315         new_fa->fa_info = fi;
1316         new_fa->fa_tos = tos;
1317         new_fa->fa_type = cfg->fc_type;
1318         new_fa->fa_scope = cfg->fc_scope;
1319         new_fa->fa_state = 0;
1320         /*
1321          * Insert new entry to the list.
1322          */
1323
1324         if (!fa_head) {
1325                 fa_head = fib_insert_node(t, key, plen);
1326                 if (unlikely(!fa_head)) {
1327                         err = -ENOMEM;
1328                         goto out_free_new_fa;
1329                 }
1330         }
1331
1332         list_add_tail_rcu(&new_fa->fa_list,
1333                           (fa ? &fa->fa_list : fa_head));
1334
1335         rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1336         rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
1337                   &cfg->fc_nlinfo, 0);
1338 succeeded:
1339         return 0;
1340
1341 out_free_new_fa:
1342         kmem_cache_free(fn_alias_kmem, new_fa);
1343 out:
1344         fib_release_info(fi);
1345 err:
1346         return err;
1347 }
1348
1349 /* should be called with rcu_read_lock */
1350 static int check_leaf(struct trie *t, struct leaf *l,
1351                       t_key key,  const struct flowi *flp,
1352                       struct fib_result *res)
1353 {
1354         struct leaf_info *li;
1355         struct hlist_head *hhead = &l->list;
1356         struct hlist_node *node;
1357
1358         hlist_for_each_entry_rcu(li, node, hhead, hlist) {
1359                 int err;
1360                 int plen = li->plen;
1361                 __be32 mask = inet_make_mask(plen);
1362
1363                 if (l->key != (key & ntohl(mask)))
1364                         continue;
1365
1366                 err = fib_semantic_match(&li->falh, flp, res, plen);
1367
1368 #ifdef CONFIG_IP_FIB_TRIE_STATS
1369                 if (err <= 0)
1370                         t->stats.semantic_match_passed++;
1371                 else
1372                         t->stats.semantic_match_miss++;
1373 #endif
1374                 if (err <= 0)
1375                         return err;
1376         }
1377
1378         return 1;
1379 }
1380
1381 int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1382                      struct fib_result *res)
1383 {
1384         struct trie *t = (struct trie *) tb->tb_data;
1385         int ret;
1386         struct node *n;
1387         struct tnode *pn;
1388         int pos, bits;
1389         t_key key = ntohl(flp->fl4_dst);
1390         int chopped_off;
1391         t_key cindex = 0;
1392         int current_prefix_length = KEYLENGTH;
1393         struct tnode *cn;
1394         t_key node_prefix, key_prefix, pref_mismatch;
1395         int mp;
1396
1397         rcu_read_lock();
1398
1399         n = rcu_dereference(t->trie);
1400         if (!n)
1401                 goto failed;
1402
1403 #ifdef CONFIG_IP_FIB_TRIE_STATS
1404         t->stats.gets++;
1405 #endif
1406
1407         /* Just a leaf? */
1408         if (IS_LEAF(n)) {
1409                 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1410                 goto found;
1411         }
1412
1413         pn = (struct tnode *) n;
1414         chopped_off = 0;
1415
1416         while (pn) {
1417                 pos = pn->pos;
1418                 bits = pn->bits;
1419
1420                 if (!chopped_off)
1421                         cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1422                                                    pos, bits);
1423
1424                 n = tnode_get_child_rcu(pn, cindex);
1425
1426                 if (n == NULL) {
1427 #ifdef CONFIG_IP_FIB_TRIE_STATS
1428                         t->stats.null_node_hit++;
1429 #endif
1430                         goto backtrace;
1431                 }
1432
1433                 if (IS_LEAF(n)) {
1434                         ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1435                         if (ret > 0)
1436                                 goto backtrace;
1437                         goto found;
1438                 }
1439
1440                 cn = (struct tnode *)n;
1441
1442                 /*
1443                  * It's a tnode, and we can do some extra checks here if we
1444                  * like, to avoid descending into a dead-end branch.
1445                  * This tnode is in the parent's child array at index
1446                  * key[p_pos..p_pos+p_bits] but potentially with some bits
1447                  * chopped off, so in reality the index may be just a
1448                  * subprefix, padded with zero at the end.
1449                  * We can also take a look at any skipped bits in this
1450                  * tnode - everything up to p_pos is supposed to be ok,
1451                  * and the non-chopped bits of the index (se previous
1452                  * paragraph) are also guaranteed ok, but the rest is
1453                  * considered unknown.
1454                  *
1455                  * The skipped bits are key[pos+bits..cn->pos].
1456                  */
1457
1458                 /* If current_prefix_length < pos+bits, we are already doing
1459                  * actual prefix  matching, which means everything from
1460                  * pos+(bits-chopped_off) onward must be zero along some
1461                  * branch of this subtree - otherwise there is *no* valid
1462                  * prefix present. Here we can only check the skipped
1463                  * bits. Remember, since we have already indexed into the
1464                  * parent's child array, we know that the bits we chopped of
1465                  * *are* zero.
1466                  */
1467
1468                 /* NOTA BENE: Checking only skipped bits
1469                    for the new node here */
1470
1471                 if (current_prefix_length < pos+bits) {
1472                         if (tkey_extract_bits(cn->key, current_prefix_length,
1473                                                 cn->pos - current_prefix_length)
1474                             || !(cn->child[0]))
1475                                 goto backtrace;
1476                 }
1477
1478                 /*
1479                  * If chopped_off=0, the index is fully validated and we
1480                  * only need to look at the skipped bits for this, the new,
1481                  * tnode. What we actually want to do is to find out if
1482                  * these skipped bits match our key perfectly, or if we will
1483                  * have to count on finding a matching prefix further down,
1484                  * because if we do, we would like to have some way of
1485                  * verifying the existence of such a prefix at this point.
1486                  */
1487
1488                 /* The only thing we can do at this point is to verify that
1489                  * any such matching prefix can indeed be a prefix to our
1490                  * key, and if the bits in the node we are inspecting that
1491                  * do not match our key are not ZERO, this cannot be true.
1492                  * Thus, find out where there is a mismatch (before cn->pos)
1493                  * and verify that all the mismatching bits are zero in the
1494                  * new tnode's key.
1495                  */
1496
1497                 /*
1498                  * Note: We aren't very concerned about the piece of
1499                  * the key that precede pn->pos+pn->bits, since these
1500                  * have already been checked. The bits after cn->pos
1501                  * aren't checked since these are by definition
1502                  * "unknown" at this point. Thus, what we want to see
1503                  * is if we are about to enter the "prefix matching"
1504                  * state, and in that case verify that the skipped
1505                  * bits that will prevail throughout this subtree are
1506                  * zero, as they have to be if we are to find a
1507                  * matching prefix.
1508                  */
1509
1510                 node_prefix = mask_pfx(cn->key, cn->pos);
1511                 key_prefix = mask_pfx(key, cn->pos);
1512                 pref_mismatch = key_prefix^node_prefix;
1513                 mp = 0;
1514
1515                 /*
1516                  * In short: If skipped bits in this node do not match
1517                  * the search key, enter the "prefix matching"
1518                  * state.directly.
1519                  */
1520                 if (pref_mismatch) {
1521                         while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1522                                 mp++;
1523                                 pref_mismatch = pref_mismatch << 1;
1524                         }
1525                         key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526
1527                         if (key_prefix != 0)
1528                                 goto backtrace;
1529
1530                         if (current_prefix_length >= cn->pos)
1531                                 current_prefix_length = mp;
1532                 }
1533
1534                 pn = (struct tnode *)n; /* Descend */
1535                 chopped_off = 0;
1536                 continue;
1537
1538 backtrace:
1539                 chopped_off++;
1540
1541                 /* As zero don't change the child key (cindex) */
1542                 while ((chopped_off <= pn->bits)
1543                        && !(cindex & (1<<(chopped_off-1))))
1544                         chopped_off++;
1545
1546                 /* Decrease current_... with bits chopped off */
1547                 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1548                         current_prefix_length = pn->pos + pn->bits
1549                                 - chopped_off;
1550
1551                 /*
1552                  * Either we do the actual chop off according or if we have
1553                  * chopped off all bits in this tnode walk up to our parent.
1554                  */
1555
1556                 if (chopped_off <= pn->bits) {
1557                         cindex &= ~(1 << (chopped_off-1));
1558                 } else {
1559                         struct tnode *parent = node_parent_rcu((struct node *) pn);
1560                         if (!parent)
1561                                 goto failed;
1562
1563                         /* Get Child's index */
1564                         cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1565                         pn = parent;
1566                         chopped_off = 0;
1567
1568 #ifdef CONFIG_IP_FIB_TRIE_STATS
1569                         t->stats.backtrack++;
1570 #endif
1571                         goto backtrace;
1572                 }
1573         }
1574 failed:
1575         ret = 1;
1576 found:
1577         rcu_read_unlock();
1578         return ret;
1579 }
1580
1581 /*
1582  * Remove the leaf and return parent.
1583  */
1584 static void trie_leaf_remove(struct trie *t, struct leaf *l)
1585 {
1586         struct tnode *tp = node_parent((struct node *) l);
1587
1588         pr_debug("entering trie_leaf_remove(%p)\n", l);
1589
1590         if (tp) {
1591                 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1592                 put_child(t, (struct tnode *)tp, cindex, NULL);
1593                 trie_rebalance(t, tp);
1594         } else
1595                 rcu_assign_pointer(t->trie, NULL);
1596
1597         free_leaf(l);
1598 }
1599
1600 /*
1601  * Caller must hold RTNL.
1602  */
1603 int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
1604 {
1605         struct trie *t = (struct trie *) tb->tb_data;
1606         u32 key, mask;
1607         int plen = cfg->fc_dst_len;
1608         u8 tos = cfg->fc_tos;
1609         struct fib_alias *fa, *fa_to_delete;
1610         struct list_head *fa_head;
1611         struct leaf *l;
1612         struct leaf_info *li;
1613
1614         if (plen > 32)
1615                 return -EINVAL;
1616
1617         key = ntohl(cfg->fc_dst);
1618         mask = ntohl(inet_make_mask(plen));
1619
1620         if (key & ~mask)
1621                 return -EINVAL;
1622
1623         key = key & mask;
1624         l = fib_find_node(t, key);
1625
1626         if (!l)
1627                 return -ESRCH;
1628
1629         fa_head = get_fa_head(l, plen);
1630         fa = fib_find_alias(fa_head, tos, 0);
1631
1632         if (!fa)
1633                 return -ESRCH;
1634
1635         pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
1636
1637         fa_to_delete = NULL;
1638         fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1639         list_for_each_entry_continue(fa, fa_head, fa_list) {
1640                 struct fib_info *fi = fa->fa_info;
1641
1642                 if (fa->fa_tos != tos)
1643                         break;
1644
1645                 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1646                     (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1647                      fa->fa_scope == cfg->fc_scope) &&
1648                     (!cfg->fc_protocol ||
1649                      fi->fib_protocol == cfg->fc_protocol) &&
1650                     fib_nh_match(cfg, fi) == 0) {
1651                         fa_to_delete = fa;
1652                         break;
1653                 }
1654         }
1655
1656         if (!fa_to_delete)
1657                 return -ESRCH;
1658
1659         fa = fa_to_delete;
1660         rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
1661                   &cfg->fc_nlinfo, 0);
1662
1663         l = fib_find_node(t, key);
1664         li = find_leaf_info(l, plen);
1665
1666         list_del_rcu(&fa->fa_list);
1667
1668         if (list_empty(fa_head)) {
1669                 hlist_del_rcu(&li->hlist);
1670                 free_leaf_info(li);
1671         }
1672
1673         if (hlist_empty(&l->list))
1674                 trie_leaf_remove(t, l);
1675
1676         if (fa->fa_state & FA_S_ACCESSED)
1677                 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
1678
1679         fib_release_info(fa->fa_info);
1680         alias_free_mem_rcu(fa);
1681         return 0;
1682 }
1683
1684 static int trie_flush_list(struct list_head *head)
1685 {
1686         struct fib_alias *fa, *fa_node;
1687         int found = 0;
1688
1689         list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1690                 struct fib_info *fi = fa->fa_info;
1691
1692                 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1693                         list_del_rcu(&fa->fa_list);
1694                         fib_release_info(fa->fa_info);
1695                         alias_free_mem_rcu(fa);
1696                         found++;
1697                 }
1698         }
1699         return found;
1700 }
1701
1702 static int trie_flush_leaf(struct leaf *l)
1703 {
1704         int found = 0;
1705         struct hlist_head *lih = &l->list;
1706         struct hlist_node *node, *tmp;
1707         struct leaf_info *li = NULL;
1708
1709         hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
1710                 found += trie_flush_list(&li->falh);
1711
1712                 if (list_empty(&li->falh)) {
1713                         hlist_del_rcu(&li->hlist);
1714                         free_leaf_info(li);
1715                 }
1716         }
1717         return found;
1718 }
1719
1720 /*
1721  * Scan for the next right leaf starting at node p->child[idx]
1722  * Since we have back pointer, no recursion necessary.
1723  */
1724 static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
1725 {
1726         do {
1727                 t_key idx;
1728
1729                 if (c)
1730                         idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
1731                 else
1732                         idx = 0;
1733
1734                 while (idx < 1u << p->bits) {
1735                         c = tnode_get_child_rcu(p, idx++);
1736                         if (!c)
1737                                 continue;
1738
1739                         if (IS_LEAF(c)) {
1740                                 prefetch(p->child[idx]);
1741                                 return (struct leaf *) c;
1742                         }
1743
1744                         /* Rescan start scanning in new node */
1745                         p = (struct tnode *) c;
1746                         idx = 0;
1747                 }
1748
1749                 /* Node empty, walk back up to parent */
1750                 c = (struct node *) p;
1751         } while ( (p = node_parent_rcu(c)) != NULL);
1752
1753         return NULL; /* Root of trie */
1754 }
1755
1756 static struct leaf *trie_firstleaf(struct trie *t)
1757 {
1758         struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
1759                                                         rcu_read_lock_held() ||
1760                                                         lockdep_rtnl_is_held());
1761
1762         if (!n)
1763                 return NULL;
1764
1765         if (IS_LEAF(n))          /* trie is just a leaf */
1766                 return (struct leaf *) n;
1767
1768         return leaf_walk_rcu(n, NULL);
1769 }
1770
1771 static struct leaf *trie_nextleaf(struct leaf *l)
1772 {
1773         struct node *c = (struct node *) l;
1774         struct tnode *p = node_parent_rcu(c);
1775
1776         if (!p)
1777                 return NULL;    /* trie with just one leaf */
1778
1779         return leaf_walk_rcu(p, c);
1780 }
1781
1782 static struct leaf *trie_leafindex(struct trie *t, int index)
1783 {
1784         struct leaf *l = trie_firstleaf(t);
1785
1786         while (l && index-- > 0)
1787                 l = trie_nextleaf(l);
1788
1789         return l;
1790 }
1791
1792
1793 /*
1794  * Caller must hold RTNL.
1795  */
1796 int fib_table_flush(struct fib_table *tb)
1797 {
1798         struct trie *t = (struct trie *) tb->tb_data;
1799         struct leaf *l, *ll = NULL;
1800         int found = 0;
1801
1802         for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
1803                 found += trie_flush_leaf(l);
1804
1805                 if (ll && hlist_empty(&ll->list))
1806                         trie_leaf_remove(t, ll);
1807                 ll = l;
1808         }
1809
1810         if (ll && hlist_empty(&ll->list))
1811                 trie_leaf_remove(t, ll);
1812
1813         pr_debug("trie_flush found=%d\n", found);
1814         return found;
1815 }
1816
1817 void fib_table_select_default(struct fib_table *tb,
1818                               const struct flowi *flp,
1819                               struct fib_result *res)
1820 {
1821         struct trie *t = (struct trie *) tb->tb_data;
1822         int order, last_idx;
1823         struct fib_info *fi = NULL;
1824         struct fib_info *last_resort;
1825         struct fib_alias *fa = NULL;
1826         struct list_head *fa_head;
1827         struct leaf *l;
1828
1829         last_idx = -1;
1830         last_resort = NULL;
1831         order = -1;
1832
1833         rcu_read_lock();
1834
1835         l = fib_find_node(t, 0);
1836         if (!l)
1837                 goto out;
1838
1839         fa_head = get_fa_head(l, 0);
1840         if (!fa_head)
1841                 goto out;
1842
1843         if (list_empty(fa_head))
1844                 goto out;
1845
1846         list_for_each_entry_rcu(fa, fa_head, fa_list) {
1847                 struct fib_info *next_fi = fa->fa_info;
1848
1849                 if (fa->fa_scope != res->scope ||
1850                     fa->fa_type != RTN_UNICAST)
1851                         continue;
1852
1853                 if (next_fi->fib_priority > res->fi->fib_priority)
1854                         break;
1855                 if (!next_fi->fib_nh[0].nh_gw ||
1856                     next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1857                         continue;
1858                 fa->fa_state |= FA_S_ACCESSED;
1859
1860                 if (fi == NULL) {
1861                         if (next_fi != res->fi)
1862                                 break;
1863                 } else if (!fib_detect_death(fi, order, &last_resort,
1864                                              &last_idx, tb->tb_default)) {
1865                         fib_result_assign(res, fi);
1866                         tb->tb_default = order;
1867                         goto out;
1868                 }
1869                 fi = next_fi;
1870                 order++;
1871         }
1872         if (order <= 0 || fi == NULL) {
1873                 tb->tb_default = -1;
1874                 goto out;
1875         }
1876
1877         if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1878                                 tb->tb_default)) {
1879                 fib_result_assign(res, fi);
1880                 tb->tb_default = order;
1881                 goto out;
1882         }
1883         if (last_idx >= 0)
1884                 fib_result_assign(res, last_resort);
1885         tb->tb_default = last_idx;
1886 out:
1887         rcu_read_unlock();
1888 }
1889
1890 static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1891                            struct fib_table *tb,
1892                            struct sk_buff *skb, struct netlink_callback *cb)
1893 {
1894         int i, s_i;
1895         struct fib_alias *fa;
1896         __be32 xkey = htonl(key);
1897
1898         s_i = cb->args[5];
1899         i = 0;
1900
1901         /* rcu_read_lock is hold by caller */
1902
1903         list_for_each_entry_rcu(fa, fah, fa_list) {
1904                 if (i < s_i) {
1905                         i++;
1906                         continue;
1907                 }
1908
1909                 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1910                                   cb->nlh->nlmsg_seq,
1911                                   RTM_NEWROUTE,
1912                                   tb->tb_id,
1913                                   fa->fa_type,
1914                                   fa->fa_scope,
1915                                   xkey,
1916                                   plen,
1917                                   fa->fa_tos,
1918                                   fa->fa_info, NLM_F_MULTI) < 0) {
1919                         cb->args[5] = i;
1920                         return -1;
1921                 }
1922                 i++;
1923         }
1924         cb->args[5] = i;
1925         return skb->len;
1926 }
1927
1928 static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1929                         struct sk_buff *skb, struct netlink_callback *cb)
1930 {
1931         struct leaf_info *li;
1932         struct hlist_node *node;
1933         int i, s_i;
1934
1935         s_i = cb->args[4];
1936         i = 0;
1937
1938         /* rcu_read_lock is hold by caller */
1939         hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1940                 if (i < s_i) {
1941                         i++;
1942                         continue;
1943                 }
1944
1945                 if (i > s_i)
1946                         cb->args[5] = 0;
1947
1948                 if (list_empty(&li->falh))
1949                         continue;
1950
1951                 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
1952                         cb->args[4] = i;
1953                         return -1;
1954                 }
1955                 i++;
1956         }
1957
1958         cb->args[4] = i;
1959         return skb->len;
1960 }
1961
1962 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1963                    struct netlink_callback *cb)
1964 {
1965         struct leaf *l;
1966         struct trie *t = (struct trie *) tb->tb_data;
1967         t_key key = cb->args[2];
1968         int count = cb->args[3];
1969
1970         rcu_read_lock();
1971         /* Dump starting at last key.
1972          * Note: 0.0.0.0/0 (ie default) is first key.
1973          */
1974         if (count == 0)
1975                 l = trie_firstleaf(t);
1976         else {
1977                 /* Normally, continue from last key, but if that is missing
1978                  * fallback to using slow rescan
1979                  */
1980                 l = fib_find_node(t, key);
1981                 if (!l)
1982                         l = trie_leafindex(t, count);
1983         }
1984
1985         while (l) {
1986                 cb->args[2] = l->key;
1987                 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
1988                         cb->args[3] = count;
1989                         rcu_read_unlock();
1990                         return -1;
1991                 }
1992
1993                 ++count;
1994                 l = trie_nextleaf(l);
1995                 memset(&cb->args[4], 0,
1996                        sizeof(cb->args) - 4*sizeof(cb->args[0]));
1997         }
1998         cb->args[3] = count;
1999         rcu_read_unlock();
2000
2001         return skb->len;
2002 }
2003
2004 void __init fib_hash_init(void)
2005 {
2006         fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2007                                           sizeof(struct fib_alias),
2008                                           0, SLAB_PANIC, NULL);
2009
2010         trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2011                                            max(sizeof(struct leaf),
2012                                                sizeof(struct leaf_info)),
2013                                            0, SLAB_PANIC, NULL);
2014 }
2015
2016
2017 /* Fix more generic FIB names for init later */
2018 struct fib_table *fib_hash_table(u32 id)
2019 {
2020         struct fib_table *tb;
2021         struct trie *t;
2022
2023         tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2024                      GFP_KERNEL);
2025         if (tb == NULL)
2026                 return NULL;
2027
2028         tb->tb_id = id;
2029         tb->tb_default = -1;
2030
2031         t = (struct trie *) tb->tb_data;
2032         memset(t, 0, sizeof(*t));
2033
2034         if (id == RT_TABLE_LOCAL)
2035                 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2036
2037         return tb;
2038 }
2039
2040 #ifdef CONFIG_PROC_FS
2041 /* Depth first Trie walk iterator */
2042 struct fib_trie_iter {
2043         struct seq_net_private p;
2044         struct fib_table *tb;
2045         struct tnode *tnode;
2046         unsigned index;
2047         unsigned depth;
2048 };
2049
2050 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
2051 {
2052         struct tnode *tn = iter->tnode;
2053         unsigned cindex = iter->index;
2054         struct tnode *p;
2055
2056         /* A single entry routing table */
2057         if (!tn)
2058                 return NULL;
2059
2060         pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2061                  iter->tnode, iter->index, iter->depth);
2062 rescan:
2063         while (cindex < (1<<tn->bits)) {
2064                 struct node *n = tnode_get_child_rcu(tn, cindex);
2065
2066                 if (n) {
2067                         if (IS_LEAF(n)) {
2068                                 iter->tnode = tn;
2069                                 iter->index = cindex + 1;
2070                         } else {
2071                                 /* push down one level */
2072                                 iter->tnode = (struct tnode *) n;
2073                                 iter->index = 0;
2074                                 ++iter->depth;
2075                         }
2076                         return n;
2077                 }
2078
2079                 ++cindex;
2080         }
2081
2082         /* Current node exhausted, pop back up */
2083         p = node_parent_rcu((struct node *)tn);
2084         if (p) {
2085                 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2086                 tn = p;
2087                 --iter->depth;
2088                 goto rescan;
2089         }
2090
2091         /* got root? */
2092         return NULL;
2093 }
2094
2095 static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2096                                        struct trie *t)
2097 {
2098         struct node *n;
2099
2100         if (!t)
2101                 return NULL;
2102
2103         n = rcu_dereference(t->trie);
2104         if (!n)
2105                 return NULL;
2106
2107         if (IS_TNODE(n)) {
2108                 iter->tnode = (struct tnode *) n;
2109                 iter->index = 0;
2110                 iter->depth = 1;
2111         } else {
2112                 iter->tnode = NULL;
2113                 iter->index = 0;
2114                 iter->depth = 0;
2115         }
2116
2117         return n;
2118 }
2119
2120 static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2121 {
2122         struct node *n;
2123         struct fib_trie_iter iter;
2124
2125         memset(s, 0, sizeof(*s));
2126
2127         rcu_read_lock();
2128         for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
2129                 if (IS_LEAF(n)) {
2130                         struct leaf *l = (struct leaf *)n;
2131                         struct leaf_info *li;
2132                         struct hlist_node *tmp;
2133
2134                         s->leaves++;
2135                         s->totdepth += iter.depth;
2136                         if (iter.depth > s->maxdepth)
2137                                 s->maxdepth = iter.depth;
2138
2139                         hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2140                                 ++s->prefixes;
2141                 } else {
2142                         const struct tnode *tn = (const struct tnode *) n;
2143                         int i;
2144
2145                         s->tnodes++;
2146                         if (tn->bits < MAX_STAT_DEPTH)
2147                                 s->nodesizes[tn->bits]++;
2148
2149                         for (i = 0; i < (1<<tn->bits); i++)
2150                                 if (!tn->child[i])
2151                                         s->nullpointers++;
2152                 }
2153         }
2154         rcu_read_unlock();
2155 }
2156
2157 /*
2158  *      This outputs /proc/net/fib_triestats
2159  */
2160 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
2161 {
2162         unsigned i, max, pointers, bytes, avdepth;
2163
2164         if (stat->leaves)
2165                 avdepth = stat->totdepth*100 / stat->leaves;
2166         else
2167                 avdepth = 0;
2168
2169         seq_printf(seq, "\tAver depth:     %u.%02d\n",
2170                    avdepth / 100, avdepth % 100);
2171         seq_printf(seq, "\tMax depth:      %u\n", stat->maxdepth);
2172
2173         seq_printf(seq, "\tLeaves:         %u\n", stat->leaves);
2174         bytes = sizeof(struct leaf) * stat->leaves;
2175
2176         seq_printf(seq, "\tPrefixes:       %u\n", stat->prefixes);
2177         bytes += sizeof(struct leaf_info) * stat->prefixes;
2178
2179         seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
2180         bytes += sizeof(struct tnode) * stat->tnodes;
2181
2182         max = MAX_STAT_DEPTH;
2183         while (max > 0 && stat->nodesizes[max-1] == 0)
2184                 max--;
2185
2186         pointers = 0;
2187         for (i = 1; i <= max; i++)
2188                 if (stat->nodesizes[i] != 0) {
2189                         seq_printf(seq, "  %u: %u",  i, stat->nodesizes[i]);
2190                         pointers += (1<<i) * stat->nodesizes[i];
2191                 }
2192         seq_putc(seq, '\n');
2193         seq_printf(seq, "\tPointers: %u\n", pointers);
2194
2195         bytes += sizeof(struct node *) * pointers;
2196         seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2197         seq_printf(seq, "Total size: %u  kB\n", (bytes + 1023) / 1024);
2198 }
2199
2200 #ifdef CONFIG_IP_FIB_TRIE_STATS
2201 static void trie_show_usage(struct seq_file *seq,
2202                             const struct trie_use_stats *stats)
2203 {
2204         seq_printf(seq, "\nCounters:\n---------\n");
2205         seq_printf(seq, "gets = %u\n", stats->gets);
2206         seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2207         seq_printf(seq, "semantic match passed = %u\n",
2208                    stats->semantic_match_passed);
2209         seq_printf(seq, "semantic match miss = %u\n",
2210                    stats->semantic_match_miss);
2211         seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2212         seq_printf(seq, "skipped node resize = %u\n\n",
2213                    stats->resize_node_skipped);
2214 }
2215 #endif /*  CONFIG_IP_FIB_TRIE_STATS */
2216
2217 static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
2218 {
2219         if (tb->tb_id == RT_TABLE_LOCAL)
2220                 seq_puts(seq, "Local:\n");
2221         else if (tb->tb_id == RT_TABLE_MAIN)
2222                 seq_puts(seq, "Main:\n");
2223         else
2224                 seq_printf(seq, "Id %d:\n", tb->tb_id);
2225 }
2226
2227
2228 static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2229 {
2230         struct net *net = (struct net *)seq->private;
2231         unsigned int h;
2232
2233         seq_printf(seq,
2234                    "Basic info: size of leaf:"
2235                    " %Zd bytes, size of tnode: %Zd bytes.\n",
2236                    sizeof(struct leaf), sizeof(struct tnode));
2237
2238         for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2239                 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2240                 struct hlist_node *node;
2241                 struct fib_table *tb;
2242
2243                 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2244                         struct trie *t = (struct trie *) tb->tb_data;
2245                         struct trie_stat stat;
2246
2247                         if (!t)
2248                                 continue;
2249
2250                         fib_table_print(seq, tb);
2251
2252                         trie_collect_stats(t, &stat);
2253                         trie_show_stats(seq, &stat);
2254 #ifdef CONFIG_IP_FIB_TRIE_STATS
2255                         trie_show_usage(seq, &t->stats);
2256 #endif
2257                 }
2258         }
2259
2260         return 0;
2261 }
2262
2263 static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2264 {
2265         return single_open_net(inode, file, fib_triestat_seq_show);
2266 }
2267
2268 static const struct file_operations fib_triestat_fops = {
2269         .owner  = THIS_MODULE,
2270         .open   = fib_triestat_seq_open,
2271         .read   = seq_read,
2272         .llseek = seq_lseek,
2273         .release = single_release_net,
2274 };
2275
2276 static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
2277 {
2278         struct fib_trie_iter *iter = seq->private;
2279         struct net *net = seq_file_net(seq);
2280         loff_t idx = 0;
2281         unsigned int h;
2282
2283         for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2284                 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2285                 struct hlist_node *node;
2286                 struct fib_table *tb;
2287
2288                 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2289                         struct node *n;
2290
2291                         for (n = fib_trie_get_first(iter,
2292                                                     (struct trie *) tb->tb_data);
2293                              n; n = fib_trie_get_next(iter))
2294                                 if (pos == idx++) {
2295                                         iter->tb = tb;
2296                                         return n;
2297                                 }
2298                 }
2299         }
2300
2301         return NULL;
2302 }
2303
2304 static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2305         __acquires(RCU)
2306 {
2307         rcu_read_lock();
2308         return fib_trie_get_idx(seq, *pos);
2309 }
2310
2311 static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2312 {
2313         struct fib_trie_iter *iter = seq->private;
2314         struct net *net = seq_file_net(seq);
2315         struct fib_table *tb = iter->tb;
2316         struct hlist_node *tb_node;
2317         unsigned int h;
2318         struct node *n;
2319
2320         ++*pos;
2321         /* next node in same table */
2322         n = fib_trie_get_next(iter);
2323         if (n)
2324                 return n;
2325
2326         /* walk rest of this hash chain */
2327         h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2328         while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2329                 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2330                 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2331                 if (n)
2332                         goto found;
2333         }
2334
2335         /* new hash chain */
2336         while (++h < FIB_TABLE_HASHSZ) {
2337                 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2338                 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2339                         n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2340                         if (n)
2341                                 goto found;
2342                 }
2343         }
2344         return NULL;
2345
2346 found:
2347         iter->tb = tb;
2348         return n;
2349 }
2350
2351 static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2352         __releases(RCU)
2353 {
2354         rcu_read_unlock();
2355 }
2356
2357 static void seq_indent(struct seq_file *seq, int n)
2358 {
2359         while (n-- > 0) seq_puts(seq, "   ");
2360 }
2361
2362 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
2363 {
2364         switch (s) {
2365         case RT_SCOPE_UNIVERSE: return "universe";
2366         case RT_SCOPE_SITE:     return "site";
2367         case RT_SCOPE_LINK:     return "link";
2368         case RT_SCOPE_HOST:     return "host";
2369         case RT_SCOPE_NOWHERE:  return "nowhere";
2370         default:
2371                 snprintf(buf, len, "scope=%d", s);
2372                 return buf;
2373         }
2374 }
2375
2376 static const char *const rtn_type_names[__RTN_MAX] = {
2377         [RTN_UNSPEC] = "UNSPEC",
2378         [RTN_UNICAST] = "UNICAST",
2379         [RTN_LOCAL] = "LOCAL",
2380         [RTN_BROADCAST] = "BROADCAST",
2381         [RTN_ANYCAST] = "ANYCAST",
2382         [RTN_MULTICAST] = "MULTICAST",
2383         [RTN_BLACKHOLE] = "BLACKHOLE",
2384         [RTN_UNREACHABLE] = "UNREACHABLE",
2385         [RTN_PROHIBIT] = "PROHIBIT",
2386         [RTN_THROW] = "THROW",
2387         [RTN_NAT] = "NAT",
2388         [RTN_XRESOLVE] = "XRESOLVE",
2389 };
2390
2391 static inline const char *rtn_type(char *buf, size_t len, unsigned t)
2392 {
2393         if (t < __RTN_MAX && rtn_type_names[t])
2394                 return rtn_type_names[t];
2395         snprintf(buf, len, "type %u", t);
2396         return buf;
2397 }
2398
2399 /* Pretty print the trie */
2400 static int fib_trie_seq_show(struct seq_file *seq, void *v)
2401 {
2402         const struct fib_trie_iter *iter = seq->private;
2403         struct node *n = v;
2404
2405         if (!node_parent_rcu(n))
2406                 fib_table_print(seq, iter->tb);
2407
2408         if (IS_TNODE(n)) {
2409                 struct tnode *tn = (struct tnode *) n;
2410                 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
2411
2412                 seq_indent(seq, iter->depth-1);
2413                 seq_printf(seq, "  +-- %pI4/%d %d %d %d\n",
2414                            &prf, tn->pos, tn->bits, tn->full_children,
2415                            tn->empty_children);
2416
2417         } else {
2418                 struct leaf *l = (struct leaf *) n;
2419                 struct leaf_info *li;
2420                 struct hlist_node *node;
2421                 __be32 val = htonl(l->key);
2422
2423                 seq_indent(seq, iter->depth);
2424                 seq_printf(seq, "  |-- %pI4\n", &val);
2425
2426                 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2427                         struct fib_alias *fa;
2428
2429                         list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2430                                 char buf1[32], buf2[32];
2431
2432                                 seq_indent(seq, iter->depth+1);
2433                                 seq_printf(seq, "  /%d %s %s", li->plen,
2434                                            rtn_scope(buf1, sizeof(buf1),
2435                                                      fa->fa_scope),
2436                                            rtn_type(buf2, sizeof(buf2),
2437                                                     fa->fa_type));
2438                                 if (fa->fa_tos)
2439                                         seq_printf(seq, " tos=%d", fa->fa_tos);
2440                                 seq_putc(seq, '\n');
2441                         }
2442                 }
2443         }
2444
2445         return 0;
2446 }
2447
2448 static const struct seq_operations fib_trie_seq_ops = {
2449         .start  = fib_trie_seq_start,
2450         .next   = fib_trie_seq_next,
2451         .stop   = fib_trie_seq_stop,
2452         .show   = fib_trie_seq_show,
2453 };
2454
2455 static int fib_trie_seq_open(struct inode *inode, struct file *file)
2456 {
2457         return seq_open_net(inode, file, &fib_trie_seq_ops,
2458                             sizeof(struct fib_trie_iter));
2459 }
2460
2461 static const struct file_operations fib_trie_fops = {
2462         .owner  = THIS_MODULE,
2463         .open   = fib_trie_seq_open,
2464         .read   = seq_read,
2465         .llseek = seq_lseek,
2466         .release = seq_release_net,
2467 };
2468
2469 struct fib_route_iter {
2470         struct seq_net_private p;
2471         struct trie *main_trie;
2472         loff_t  pos;
2473         t_key   key;
2474 };
2475
2476 static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2477 {
2478         struct leaf *l = NULL;
2479         struct trie *t = iter->main_trie;
2480
2481         /* use cache location of last found key */
2482         if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2483                 pos -= iter->pos;
2484         else {
2485                 iter->pos = 0;
2486                 l = trie_firstleaf(t);
2487         }
2488
2489         while (l && pos-- > 0) {
2490                 iter->pos++;
2491                 l = trie_nextleaf(l);
2492         }
2493
2494         if (l)
2495                 iter->key = pos;        /* remember it */
2496         else
2497                 iter->pos = 0;          /* forget it */
2498
2499         return l;
2500 }
2501
2502 static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2503         __acquires(RCU)
2504 {
2505         struct fib_route_iter *iter = seq->private;
2506         struct fib_table *tb;
2507
2508         rcu_read_lock();
2509         tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
2510         if (!tb)
2511                 return NULL;
2512
2513         iter->main_trie = (struct trie *) tb->tb_data;
2514         if (*pos == 0)
2515                 return SEQ_START_TOKEN;
2516         else
2517                 return fib_route_get_idx(iter, *pos - 1);
2518 }
2519
2520 static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2521 {
2522         struct fib_route_iter *iter = seq->private;
2523         struct leaf *l = v;
2524
2525         ++*pos;
2526         if (v == SEQ_START_TOKEN) {
2527                 iter->pos = 0;
2528                 l = trie_firstleaf(iter->main_trie);
2529         } else {
2530                 iter->pos++;
2531                 l = trie_nextleaf(l);
2532         }
2533
2534         if (l)
2535                 iter->key = l->key;
2536         else
2537                 iter->pos = 0;
2538         return l;
2539 }
2540
2541 static void fib_route_seq_stop(struct seq_file *seq, void *v)
2542         __releases(RCU)
2543 {
2544         rcu_read_unlock();
2545 }
2546
2547 static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
2548 {
2549         static unsigned type2flags[RTN_MAX + 1] = {
2550                 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551         };
2552         unsigned flags = type2flags[type];
2553
2554         if (fi && fi->fib_nh->nh_gw)
2555                 flags |= RTF_GATEWAY;
2556         if (mask == htonl(0xFFFFFFFF))
2557                 flags |= RTF_HOST;
2558         flags |= RTF_UP;
2559         return flags;
2560 }
2561
2562 /*
2563  *      This outputs /proc/net/route.
2564  *      The format of the file is not supposed to be changed
2565  *      and needs to be same as fib_hash output to avoid breaking
2566  *      legacy utilities
2567  */
2568 static int fib_route_seq_show(struct seq_file *seq, void *v)
2569 {
2570         struct leaf *l = v;
2571         struct leaf_info *li;
2572         struct hlist_node *node;
2573
2574         if (v == SEQ_START_TOKEN) {
2575                 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2576                            "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2577                            "\tWindow\tIRTT");
2578                 return 0;
2579         }
2580
2581         hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2582                 struct fib_alias *fa;
2583                 __be32 mask, prefix;
2584
2585                 mask = inet_make_mask(li->plen);
2586                 prefix = htonl(l->key);
2587
2588                 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2589                         const struct fib_info *fi = fa->fa_info;
2590                         unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
2591                         int len;
2592
2593                         if (fa->fa_type == RTN_BROADCAST
2594                             || fa->fa_type == RTN_MULTICAST)
2595                                 continue;
2596
2597                         if (fi)
2598                                 seq_printf(seq,
2599                                          "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2600                                          "%d\t%08X\t%d\t%u\t%u%n",
2601                                          fi->fib_dev ? fi->fib_dev->name : "*",
2602                                          prefix,
2603                                          fi->fib_nh->nh_gw, flags, 0, 0,
2604                                          fi->fib_priority,
2605                                          mask,
2606                                          (fi->fib_advmss ?
2607                                           fi->fib_advmss + 40 : 0),
2608                                          fi->fib_window,
2609                                          fi->fib_rtt >> 3, &len);
2610                         else
2611                                 seq_printf(seq,
2612                                          "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2613                                          "%d\t%08X\t%d\t%u\t%u%n",
2614                                          prefix, 0, flags, 0, 0, 0,
2615                                          mask, 0, 0, 0, &len);
2616
2617                         seq_printf(seq, "%*s\n", 127 - len, "");
2618                 }
2619         }
2620
2621         return 0;
2622 }
2623
2624 static const struct seq_operations fib_route_seq_ops = {
2625         .start  = fib_route_seq_start,
2626         .next   = fib_route_seq_next,
2627         .stop   = fib_route_seq_stop,
2628         .show   = fib_route_seq_show,
2629 };
2630
2631 static int fib_route_seq_open(struct inode *inode, struct file *file)
2632 {
2633         return seq_open_net(inode, file, &fib_route_seq_ops,
2634                             sizeof(struct fib_route_iter));
2635 }
2636
2637 static const struct file_operations fib_route_fops = {
2638         .owner  = THIS_MODULE,
2639         .open   = fib_route_seq_open,
2640         .read   = seq_read,
2641         .llseek = seq_lseek,
2642         .release = seq_release_net,
2643 };
2644
2645 int __net_init fib_proc_init(struct net *net)
2646 {
2647         if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
2648                 goto out1;
2649
2650         if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2651                                   &fib_triestat_fops))
2652                 goto out2;
2653
2654         if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
2655                 goto out3;
2656
2657         return 0;
2658
2659 out3:
2660         proc_net_remove(net, "fib_triestat");
2661 out2:
2662         proc_net_remove(net, "fib_trie");
2663 out1:
2664         return -ENOMEM;
2665 }
2666
2667 void __net_exit fib_proc_exit(struct net *net)
2668 {
2669         proc_net_remove(net, "fib_trie");
2670         proc_net_remove(net, "fib_triestat");
2671         proc_net_remove(net, "route");
2672 }
2673
2674 #endif /* CONFIG_PROC_FS */