/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openv.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
struct cgroup_subsys mem_cgroup_subsys;
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the scheme developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 * TODO: Consider making these lists per zone
	 */
	struct list_head active_list;
	struct list_head inactive_list;
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	spinlock_t lru_lock;
};
/*
 * We use the lower bit of the page->page_cgroup pointer as a bit spin
 * lock. We need to ensure that page->page_cgroup is at least two
 * byte aligned (based on comments from Nick Piggin).
 */
#define PAGE_CGROUP_LOCK_BIT	0x0
#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
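
/*
 * Illustrative sketch (documentation only): a single unsigned long in
 * struct page carries both the page_cgroup pointer and the lock bit.
 * Decoding it looks like:
 *
 *	unsigned long word = page->page_cgroup;
 *	int locked = word & PAGE_CGROUP_LOCK;
 *	struct page_cgroup *pc =
 *		(struct page_cgroup *)(word & ~PAGE_CGROUP_LOCK);
 *
 * This works only because page_cgroup allocations are at least two byte
 * aligned, leaving bit 0 of the pointer free for the lock.
 */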
/*
 * A page_cgroup is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup.
 */
struct page_cgroup {
	struct list_head lru;		/* per cgroup LRU list */
	struct page *page;
	struct mem_cgroup *mem_cgroup;
	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
					/* mapped and cached states     */
};
static inline
struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

static inline
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}
void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
{
	struct mem_cgroup *mem;

	mem = mem_cgroup_from_task(p);
	css_get(&mem->css);
	mm->mem_cgroup = mem;
}

void mm_free_cgroup(struct mm_struct *mm)
{
	css_put(&mm->mem_cgroup->css);
}
static inline int page_cgroup_locked(struct page *page)
{
	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
					&page->page_cgroup);
}
void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	int locked;

	/*
	 * While resetting the page_cgroup we might not hold the
	 * page_cgroup lock. free_hot_cold_page() is an example
	 * of such a scenario.
	 */
	if (pc)
		VM_BUG_ON(!page_cgroup_locked(page));
	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
	page->page_cgroup = ((unsigned long)pc | locked);
}
struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	return (struct page_cgroup *)
		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}
void __always_inline lock_page_cgroup(struct page *page)
{
	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
	VM_BUG_ON(!page_cgroup_locked(page));
}
void __always_inline unlock_page_cgroup(struct page *page)
{
	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
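
/*
 * Illustrative sketch (documentation only) of the expected protocol for
 * readers and writers of page->page_cgroup that can race with assignment:
 * take the bit spin lock first, then dereference, e.g.:
 *
 *	struct page_cgroup *pc;
 *
 *	lock_page_cgroup(page);
 *	pc = page_get_page_cgroup(page);
 *	if (pc)
 *		...
 *	unlock_page_cgroup(page);
 */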
void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	if (active)
		list_move(&pc->lru, &pc->mem_cgroup->active_list);
	else
		list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}
/*
 * This routine assumes that the appropriate zone's lru lock is already held.
 */
void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
	struct mem_cgroup *mem;

	if (!pc)
		return;

	mem = pc->mem_cgroup;

	spin_lock(&mem->lru_lock);
	__mem_cgroup_move_lists(pc, active);
	spin_unlock(&mem->lru_lock);
}
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,
					unsigned long *scanned, int order,
					int mode, struct zone *z,
					struct mem_cgroup *mem_cont,
					int active)
{
	unsigned long nr_taken = 0;
	struct page *page;
	unsigned long scan;
	LIST_HEAD(pc_list);
	struct list_head *src;
	struct page_cgroup *pc;

	if (active)
		src = &mem_cont->active_list;
	else
		src = &mem_cont->inactive_list;

	spin_lock(&mem_cont->lru_lock);
	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
		pc = list_entry(src->prev, struct page_cgroup, lru);
		page = pc->page;

		/* Move pages found on the wrong list and rescan the slot */
		if (PageActive(page) && !active) {
			__mem_cgroup_move_lists(pc, true);
			scan--;
			continue;
		}
		if (!PageActive(page) && active) {
			__mem_cgroup_move_lists(pc, false);
			scan--;
			continue;
		}

		/*
		 * Reclaim, per zone
		 * TODO: make the active/inactive lists per zone
		 */
		if (page_zone(page) != z)
			continue;

		/*
		 * Check if the meta page went away from under us
		 */
		if (!list_empty(&pc->lru))
			list_move(&pc->lru, &pc_list);
		else
			continue;

		if (__isolate_lru_page(page, mode) == 0) {
			list_move(&page->lru, dst);
			nr_taken++;
		}
	}

	list_splice(&pc_list, src);
	spin_unlock(&mem_cont->lru_lock);

	*scanned = scan;
	return nr_taken;
}
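
/*
 * Illustrative sketch of a caller (hypothetical, for documentation only):
 * a per-cgroup reclaim pass would isolate pages onto a private list and
 * then push them through the usual shrink logic, e.g.:
 *
 *	LIST_HEAD(page_list);
 *	unsigned long scanned, taken;
 *
 *	taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX, &page_list,
 *					 &scanned, 0, mode, zone, mem, 0);
 *
 * where mode is the isolate mode used by the zone reclaim code, which is
 * the real consumer of this function.
 */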
/*
 * Charge the memory controller for page usage.
 * Return
 * 0 if the charge was successful
 * < 0 if the cgroup is over its limit
 */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *mem;
	struct page_cgroup *pc, *race_pc;
	unsigned long flags;
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/*
	 * Should page_cgroups go in their own slab?
	 * One could optimize the performance of the charging routine
	 * by saving a bit in the page_flags and using it as a lock
	 * to see if the cgroup page already has a page_cgroup associated
	 * with it.
	 */
retry:
	lock_page_cgroup(page);
	pc = page_get_page_cgroup(page);
	/*
	 * The page_cgroup exists and the page has already been accounted.
	 */
	if (pc) {
		if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
			/* is this page currently being uncharged? */
			unlock_page_cgroup(page);
			cpu_relax();
			goto retry;
		} else
			goto done;
	}

	unlock_page_cgroup(page);

	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
	if (pc == NULL)
		goto err;

	rcu_read_lock();
	/*
	 * We always charge the cgroup the mm_struct belongs to;
	 * the mm_struct's mem_cgroup changes on task migration if the
	 * thread group leader migrates. It's possible that mm is not
	 * set, if so charge the init_mm (happens for pagecache usage).
	 */
	if (!mm)
		mm = &init_mm;

	mem = rcu_dereference(mm->mem_cgroup);
	/*
	 * For every charge from the cgroup, increment reference count.
	 */
	css_get(&mem->css);
	rcu_read_unlock();

	/*
	 * If we created the page_cgroup, we should free it on exceeding
	 * the cgroup limit.
	 */
	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (try_to_free_mem_cgroup_pages(mem))
			continue;

		/*
		 * try_to_free_mem_cgroup_pages() might not give us a full
		 * picture of reclaim. Some pages are reclaimed and might be
		 * moved to swap cache or just unmapped from the cgroup.
		 * Check the limit again to see if the reclaim reduced the
		 * current usage of the cgroup before giving up.
		 */
		if (res_counter_check_under_limit(&mem->res))
			continue;
		/*
		 * Since we control both RSS and cache, we end up with a
		 * very interesting scenario where we end up reclaiming
		 * memory (essentially RSS), since the memory is pushed
		 * to swap cache, we eventually end up adding those
		 * pages back to our list. Hence we give ourselves a
		 * few chances before we fail.
		 */
		else if (nr_retries--) {
			congestion_wait(WRITE, HZ/10);
			continue;
		}

		css_put(&mem->css);
		mem_cgroup_out_of_memory(mem, GFP_KERNEL);
		goto free_pc;
	}

	lock_page_cgroup(page);
	/*
	 * Check if somebody else beat us to allocating the page_cgroup
	 */
	race_pc = page_get_page_cgroup(page);
	if (race_pc) {
		kfree(pc);
		pc = race_pc;
		atomic_inc(&pc->ref_cnt);
		res_counter_uncharge(&mem->res, PAGE_SIZE);
		css_put(&mem->css);
		goto done;
	}

	atomic_set(&pc->ref_cnt, 1);
	pc->mem_cgroup = mem;
	pc->page = page;
	page_assign_page_cgroup(page, pc);

	spin_lock_irqsave(&mem->lru_lock, flags);
	list_add(&pc->lru, &mem->active_list);
	spin_unlock_irqrestore(&mem->lru_lock, flags);

done:
	unlock_page_cgroup(page);
	return 0;
free_pc:
	kfree(pc);
err:
	return -ENOMEM;
}
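
/*
 * Illustrative sketch of a caller (hypothetical, for documentation only):
 * a fault or page cache path charges before adding the page and treats a
 * non-zero return as out-of-memory for this cgroup, e.g.:
 *
 *	if (mem_cgroup_charge(page, mm))
 *		goto oom;
 *	...
 *	mem_cgroup_uncharge(page_get_page_cgroup(page));  on teardown
 */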
/*
 * Uncharging is always a welcome operation, we never complain, simply
 * uncharge.
 */
void mem_cgroup_uncharge(struct page_cgroup *pc)
{
	struct mem_cgroup *mem;
	struct page *page;
	unsigned long flags;

	if (!pc)
		return;

	if (atomic_dec_and_test(&pc->ref_cnt)) {
		page = pc->page;
		lock_page_cgroup(page);
		mem = pc->mem_cgroup;
		css_put(&mem->css);
		page_assign_page_cgroup(page, NULL);
		unlock_page_cgroup(page);
		res_counter_uncharge(&mem->res, PAGE_SIZE);

		spin_lock_irqsave(&mem->lru_lock, flags);
		list_del_init(&pc->lru);
		spin_unlock_irqrestore(&mem->lru_lock, flags);
		kfree(pc);
	}
}
int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
	*tmp = memparse(buf, &buf);
	if (*buf != '\0')
		return -EINVAL;

	/*
	 * Round up the value to the closest page size
	 */
	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
	return 0;
}
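
/*
 * Worked example (illustrative): with 4KB pages, writing "5K" gives
 * memparse() == 5120; (5120 + 4095) >> PAGE_SHIFT == 2, and
 * 2 << PAGE_SHIFT == 8192, so the limit is rounded up to two pages.
 */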
static ssize_t mem_cgroup_read(struct cgroup *cont,
			struct cftype *cft, struct file *file,
			char __user *userbuf, size_t nbytes, loff_t *ppos)
{
	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				NULL);
}

static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
				struct file *file, const char __user *userbuf,
				size_t nbytes, loff_t *ppos)
{
	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
				cft->private, userbuf, nbytes, ppos,
				mem_cgroup_write_strategy);
}
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = RES_USAGE,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = RES_LIMIT,
		.write = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = RES_FAILCNT,
		.read = mem_cgroup_read,
	},
};
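
/*
 * Example usage from userspace (illustrative; assumes the memory
 * controller is mounted at /cgroups):
 *
 *	echo 4M > /cgroups/0/memory.limit_in_bytes
 *	cat /cgroups/0/memory.usage_in_bytes
 *	cat /cgroups/0/memory.failcnt
 *
 * failcnt reports how many times charging failed against the limit.
 */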
static struct mem_cgroup init_mem_cgroup;

static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct mem_cgroup *mem;

	if (unlikely((cont->parent) == NULL)) {
		mem = &init_mem_cgroup;
		init_mm.mem_cgroup = mem;
	} else
		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);

	if (mem == NULL)
		return NULL;

	res_counter_init(&mem->res);
	INIT_LIST_HEAD(&mem->active_list);
	INIT_LIST_HEAD(&mem->inactive_list);
	spin_lock_init(&mem->lru_lock);
	return &mem->css;
}
static void mem_cgroup_destroy(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	kfree(mem_cgroup_from_cont(cont));
}
static int mem_cgroup_populate(struct cgroup_subsys *ss,
				struct cgroup *cont)
{
	return cgroup_add_files(cont, ss, mem_cgroup_files,
				ARRAY_SIZE(mem_cgroup_files));
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;
	struct mem_cgroup *mem, *old_mem;

	mm = get_task_mm(p);
	if (mm == NULL)
		return;

	mem = mem_cgroup_from_cont(cont);
	old_mem = mem_cgroup_from_cont(old_cont);

	if (mem == old_mem)
		goto out;

	/*
	 * Only thread group leaders are allowed to migrate, the mm_struct is
	 * in effect owned by the leader.
	 */
	if (p->tgid != p->pid)
		goto out;

	css_get(&mem->css);
	rcu_assign_pointer(mm->mem_cgroup, mem);
	css_put(&old_mem->css);

out:
	mmput(mm);
}
struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.attach = mem_cgroup_move_task,
	.early_init = 1,
};