mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  */
  19
  20 #include <linux/res_counter.h>
  21 #include <linux/memcontrol.h>
  22 #include <linux/cgroup.h>
  23 #include <linux/mm.h>
  24 #include <linux/pagemap.h>
  25 #include <linux/smp.h>
  26 #include <linux/page-flags.h>
  27 #include <linux/backing-dev.h>
  28 #include <linux/bit_spinlock.h>
  29 #include <linux/rcupdate.h>
  30 #include <linux/mutex.h>
  31 #include <linux/slab.h>
  32 #include <linux/swap.h>
  33 #include <linux/spinlock.h>
  34 #include <linux/fs.h>
  35 #include <linux/seq_file.h>
  36 #include <linux/vmalloc.h>
  37 #include <linux/mm_inline.h>
  38 #include <linux/page_cgroup.h>
  39
  40 #include <asm/uaccess.h>
  41
  42 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  43 #define MEM_CGROUP_RECLAIM_RETRIES      5
  44
  45 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  46 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
  47 int do_swap_account __read_mostly;
  48 static int really_do_swap_account __initdata = 1; /* for remember boot option*/
  49 #else
  50 #define do_swap_account         (0)
  51 #endif
  52
  53
  54 /*
  55  * Statistics for memory cgroup.
  56  */
  57 enum mem_cgroup_stat_index {
  58         /*
  59          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  60          */
  61         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
  62         MEM_CGROUP_STAT_RSS,       /* # of pages charged as rss */
  63         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
  64         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
  65
  66         MEM_CGROUP_STAT_NSTATS,
  67 };
  68
  69 struct mem_cgroup_stat_cpu {
  70         s64 count[MEM_CGROUP_STAT_NSTATS];
  71 } ____cacheline_aligned_in_smp;
  72
  73 struct mem_cgroup_stat {
  74         struct mem_cgroup_stat_cpu cpustat[0];
  75 };
  76
  77 /*
  78  * For accounting under irq disable, no need for increment preempt count.
  79  */
  80 static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
  81                 enum mem_cgroup_stat_index idx, int val)
  82 {
  83         stat->count[idx] += val;
  84 }
  85
  86 static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
  87                 enum mem_cgroup_stat_index idx)
  88 {
  89         int cpu;
  90         s64 ret = 0;
  91         for_each_possible_cpu(cpu)
  92                 ret += stat->cpustat[cpu].count[idx];
  93         return ret;
  94 }
  95
  96 /*
  97  * per-zone information in memory controller.
  98  */
  99 struct mem_cgroup_per_zone {
 100         /*
 101          * spin_lock to protect the per cgroup LRU
 102          */
 103         spinlock_t              lru_lock;
 104         struct list_head        lists[NR_LRU_LISTS];
 105         unsigned long           count[NR_LRU_LISTS];
 106 };
 107 /* Macro for accessing counter */
 108 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
 109
 110 struct mem_cgroup_per_node {
 111         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 112 };
 113
 114 struct mem_cgroup_lru_info {
 115         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 116 };
 117
 118 /*
 119  * The memory controller data structure. The memory controller controls both
 120  * page cache and RSS per cgroup. We would eventually like to provide
 121  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 122  * to help the administrator determine what knobs to tune.
 123  *
 124  * TODO: Add a water mark for the memory controller. Reclaim will begin when
 125  * we hit the water mark. May be even add a low water mark, such that
 126  * no reclaim occurs from a cgroup at it's low water mark, this is
 127  * a feature that will be implemented much later in the future.
 128  */
 129 struct mem_cgroup {
 130         struct cgroup_subsys_state css;
 131         /*
 132          * the counter to account for memory usage
 133          */
 134         struct res_counter res;
 135         /*
 136          * the counter to account for mem+swap usage.
 137          */
 138         struct res_counter memsw;
 139         /*
 140          * Per cgroup active and inactive list, similar to the
 141          * per zone LRU lists.
 142          */
 143         struct mem_cgroup_lru_info info;
 144
 145         int     prev_priority;  /* for recording reclaim priority */
 146         int             obsolete;
 147         atomic_t        refcnt;
 148         /*
 149          * statistics. This must be placed at the end of memcg.
 150          */
 151         struct mem_cgroup_stat stat;
 152 };
 153
 154 enum charge_type {
 155         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 156         MEM_CGROUP_CHARGE_TYPE_MAPPED,
 157         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
 158         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
 159         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
 160         NR_CHARGE_TYPE,
 161 };
 162
 163 /* only for here (for easy reading.) */
 164 #define PCGF_CACHE      (1UL << PCG_CACHE)
 165 #define PCGF_USED       (1UL << PCG_USED)
 166 #define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
 167 #define PCGF_LOCK       (1UL << PCG_LOCK)
 168 #define PCGF_FILE       (1UL << PCG_FILE)
 169 static const unsigned long
 170 pcg_default_flags[NR_CHARGE_TYPE] = {
 171         PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
 172         PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
 173         PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
 174         0, /* FORCE */
 175 };
 176
 177
 178 /* for encoding cft->private value on file */
 179 #define _MEM                    (0)
 180 #define _MEMSWAP                (1)
 181 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
 182 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
 183 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 184
 185 static void mem_cgroup_get(struct mem_cgroup *mem);
 186 static void mem_cgroup_put(struct mem_cgroup *mem);
 187
 188 /*
 189  * Always modified under lru lock. Then, not necessary to preempt_disable()
 190  */
 191 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 192                                          struct page_cgroup *pc,
 193                                          bool charge)
 194 {
 195         int val = (charge)? 1 : -1;
 196         struct mem_cgroup_stat *stat = &mem->stat;
 197         struct mem_cgroup_stat_cpu *cpustat;
 198
 199         VM_BUG_ON(!irqs_disabled());
 200
 201         cpustat = &stat->cpustat[smp_processor_id()];
 202         if (PageCgroupCache(pc))
 203                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
 204         else
 205                 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
 206
 207         if (charge)
 208                 __mem_cgroup_stat_add_safe(cpustat,
 209                                 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
 210         else
 211                 __mem_cgroup_stat_add_safe(cpustat,
 212                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
 213 }
 214
 215 static struct mem_cgroup_per_zone *
 216 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 217 {
 218         return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 219 }
 220
 221 static struct mem_cgroup_per_zone *
 222 page_cgroup_zoneinfo(struct page_cgroup *pc)
 223 {
 224         struct mem_cgroup *mem = pc->mem_cgroup;
 225         int nid = page_cgroup_nid(pc);
 226         int zid = page_cgroup_zid(pc);
 227
 228         return mem_cgroup_zoneinfo(mem, nid, zid);
 229 }
 230
 231 static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
 232                                         enum lru_list idx)
 233 {
 234         int nid, zid;
 235         struct mem_cgroup_per_zone *mz;
 236         u64 total = 0;
 237
 238         for_each_online_node(nid)
 239                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 240                         mz = mem_cgroup_zoneinfo(mem, nid, zid);
 241                         total += MEM_CGROUP_ZSTAT(mz, idx);
 242                 }
 243         return total;
 244 }
 245
 246 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 247 {
 248         return container_of(cgroup_subsys_state(cont,
 249                                 mem_cgroup_subsys_id), struct mem_cgroup,
 250                                 css);
 251 }
 252
 253 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 254 {
 255         /*
 256          * mm_update_next_owner() may clear mm->owner to NULL
 257          * if it races with swapoff, page migration, etc.
 258          * So this can be called with p == NULL.
 259          */
 260         if (unlikely(!p))
 261                 return NULL;
 262
 263         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 264                                 struct mem_cgroup, css);
 265 }
 266
 267 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
 268                         struct page_cgroup *pc)
 269 {
 270         int lru = LRU_BASE;
 271
 272         if (PageCgroupUnevictable(pc))
 273                 lru = LRU_UNEVICTABLE;
 274         else {
 275                 if (PageCgroupActive(pc))
 276                         lru += LRU_ACTIVE;
 277                 if (PageCgroupFile(pc))
 278                         lru += LRU_FILE;
 279         }
 280
 281         MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 282
 283         mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
 284         list_del(&pc->lru);
 285 }
 286
 287 static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
 288                                 struct page_cgroup *pc, bool hot)
 289 {
 290         int lru = LRU_BASE;
 291
 292         if (PageCgroupUnevictable(pc))
 293                 lru = LRU_UNEVICTABLE;
 294         else {
 295                 if (PageCgroupActive(pc))
 296                         lru += LRU_ACTIVE;
 297                 if (PageCgroupFile(pc))
 298                         lru += LRU_FILE;
 299         }
 300
 301         MEM_CGROUP_ZSTAT(mz, lru) += 1;
 302         if (hot)
 303                 list_add(&pc->lru, &mz->lists[lru]);
 304         else
 305                 list_add_tail(&pc->lru, &mz->lists[lru]);
 306
 307         mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
 308 }
 309
 310 static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
 311 {
 312         struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
 313         int active    = PageCgroupActive(pc);
 314         int file      = PageCgroupFile(pc);
 315         int unevictable = PageCgroupUnevictable(pc);
 316         enum lru_list from = unevictable ? LRU_UNEVICTABLE :
 317                                 (LRU_FILE * !!file + !!active);
 318
 319         if (lru == from)
 320                 return;
 321
 322         MEM_CGROUP_ZSTAT(mz, from) -= 1;
 323         /*
 324          * However this is done under mz->lru_lock, another flags, which
 325          * are not related to LRU, will be modified from out-of-lock.
 326          * We have to use atomic set/clear flags.
 327          */
 328         if (is_unevictable_lru(lru)) {
 329                 ClearPageCgroupActive(pc);
 330                 SetPageCgroupUnevictable(pc);
 331         } else {
 332                 if (is_active_lru(lru))
 333                         SetPageCgroupActive(pc);
 334                 else
 335                         ClearPageCgroupActive(pc);
 336                 ClearPageCgroupUnevictable(pc);
 337         }
 338
 339         MEM_CGROUP_ZSTAT(mz, lru) += 1;
 340         list_move(&pc->lru, &mz->lists[lru]);
 341 }
 342
 343 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 344 {
 345         int ret;
 346
 347         task_lock(task);
 348         ret = task->mm && mm_match_cgroup(task->mm, mem);
 349         task_unlock(task);
 350         return ret;
 351 }
 352
 353 /*
 354  * This routine assumes that the appropriate zone's lru lock is already held
 355  */
 356 void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
 357 {
 358         struct page_cgroup *pc;
 359         struct mem_cgroup_per_zone *mz;
 360         unsigned long flags;
 361
 362         if (mem_cgroup_subsys.disabled)
 363                 return;
 364
 365         /*
 366          * We cannot lock_page_cgroup while holding zone's lru_lock,
 367          * because other holders of lock_page_cgroup can be interrupted
 368          * with an attempt to rotate_reclaimable_page.  But we cannot
 369          * safely get to page_cgroup without it, so just try_lock it:
 370          * mem_cgroup_isolate_pages allows for page left on wrong list.
 371          */
 372         pc = lookup_page_cgroup(page);
 373         if (!trylock_page_cgroup(pc))
 374                 return;
 375         if (pc && PageCgroupUsed(pc)) {
 376                 mz = page_cgroup_zoneinfo(pc);
 377                 spin_lock_irqsave(&mz->lru_lock, flags);
 378                 __mem_cgroup_move_lists(pc, lru);
 379                 spin_unlock_irqrestore(&mz->lru_lock, flags);
 380         }
 381         unlock_page_cgroup(pc);
 382 }
 383
 384 /*
 385  * Calculate mapped_ratio under memory controller. This will be used in
 386  * vmscan.c for deteremining we have to reclaim mapped pages.
 387  */
 388 int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
 389 {
 390         long total, rss;
 391
 392         /*
 393          * usage is recorded in bytes. But, here, we assume the number of
 394          * physical pages can be represented by "long" on any arch.
 395          */
 396         total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
 397         rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 398         return (int)((rss * 100L) / total);
 399 }
 400
 401 /*
 402  * prev_priority control...this will be used in memory reclaim path.
 403  */
 404 int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
 405 {
 406         return mem->prev_priority;
 407 }
 408
 409 void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
 410 {
 411         if (priority < mem->prev_priority)
 412                 mem->prev_priority = priority;
 413 }
 414
 415 void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
 416 {
 417         mem->prev_priority = priority;
 418 }
 419
 420 /*
 421  * Calculate # of pages to be scanned in this priority/zone.
 422  * See also vmscan.c
 423  *
 424  * priority starts from "DEF_PRIORITY" and decremented in each loop.
 425  * (see include/linux/mmzone.h)
 426  */
 427
 428 long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
 429                                         int priority, enum lru_list lru)
 430 {
 431         long nr_pages;
 432         int nid = zone->zone_pgdat->node_id;
 433         int zid = zone_idx(zone);
 434         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
 435
 436         nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
 437
 438         return (nr_pages >> priority);
 439 }
 440
 441 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 442                                         struct list_head *dst,
 443                                         unsigned long *scanned, int order,
 444                                         int mode, struct zone *z,
 445                                         struct mem_cgroup *mem_cont,
 446                                         int active, int file)
 447 {
 448         unsigned long nr_taken = 0;
 449         struct page *page;
 450         unsigned long scan;
 451         LIST_HEAD(pc_list);
 452         struct list_head *src;
 453         struct page_cgroup *pc, *tmp;
 454         int nid = z->zone_pgdat->node_id;
 455         int zid = zone_idx(z);
 456         struct mem_cgroup_per_zone *mz;
 457         int lru = LRU_FILE * !!file + !!active;
 458
 459         BUG_ON(!mem_cont);
 460         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 461         src = &mz->lists[lru];
 462
 463         spin_lock(&mz->lru_lock);
 464         scan = 0;
 465         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
 466                 if (scan >= nr_to_scan)
 467                         break;
 468                 if (unlikely(!PageCgroupUsed(pc)))
 469                         continue;
 470                 page = pc->page;
 471
 472                 if (unlikely(!PageLRU(page)))
 473                         continue;
 474
 475                 /*
 476                  * TODO: play better with lumpy reclaim, grabbing anything.
 477                  */
 478                 if (PageUnevictable(page) ||
 479                     (PageActive(page) && !active) ||
 480                     (!PageActive(page) && active)) {
 481                         __mem_cgroup_move_lists(pc, page_lru(page));
 482                         continue;
 483                 }
 484
 485                 scan++;
 486                 list_move(&pc->lru, &pc_list);
 487
 488                 if (__isolate_lru_page(page, mode, file) == 0) {
 489                         list_move(&page->lru, dst);
 490                         nr_taken++;
 491                 }
 492         }
 493
 494         list_splice(&pc_list, src);
 495         spin_unlock(&mz->lru_lock);
 496
 497         *scanned = scan;
 498         return nr_taken;
 499 }
 500
 501 /*
 502  * Unlike exported interface, "oom" parameter is added. if oom==true,
 503  * oom-killer can be invoked.
 504  */
 505 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 506                         gfp_t gfp_mask, struct mem_cgroup **memcg,
 507                         bool oom)
 508 {
 509         struct mem_cgroup *mem;
 510         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 511         /*
 512          * We always charge the cgroup the mm_struct belongs to.
 513          * The mm_struct's mem_cgroup changes on task migration if the
 514          * thread group leader migrates. It's possible that mm is not
 515          * set, if so charge the init_mm (happens for pagecache usage).
 516          */
 517         if (likely(!*memcg)) {
 518                 rcu_read_lock();
 519                 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 520                 if (unlikely(!mem)) {
 521                         rcu_read_unlock();
 522                         return 0;
 523                 }
 524                 /*
 525                  * For every charge from the cgroup, increment reference count
 526                  */
 527                 css_get(&mem->css);
 528                 *memcg = mem;
 529                 rcu_read_unlock();
 530         } else {
 531                 mem = *memcg;
 532                 css_get(&mem->css);
 533         }
 534
 535         while (1) {
 536                 int ret;
 537                 bool noswap = false;
 538
 539                 ret = res_counter_charge(&mem->res, PAGE_SIZE);
 540                 if (likely(!ret)) {
 541                         if (!do_swap_account)
 542                                 break;
 543                         ret = res_counter_charge(&mem->memsw, PAGE_SIZE);
 544                         if (likely(!ret))
 545                                 break;
 546                         /* mem+swap counter fails */
 547                         res_counter_uncharge(&mem->res, PAGE_SIZE);
 548                         noswap = true;
 549                 }
 550                 if (!(gfp_mask & __GFP_WAIT))
 551                         goto nomem;
 552
 553                 if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
 554                         continue;
 555
 556                 /*
 557                  * try_to_free_mem_cgroup_pages() might not give us a full
 558                  * picture of reclaim. Some pages are reclaimed and might be
 559                  * moved to swap cache or just unmapped from the cgroup.
 560                  * Check the limit again to see if the reclaim reduced the
 561                  * current usage of the cgroup before giving up
 562                  *
 563                  */
 564                 if (!do_swap_account &&
 565                         res_counter_check_under_limit(&mem->res))
 566                         continue;
 567                 if (do_swap_account &&
 568                         res_counter_check_under_limit(&mem->memsw))
 569                         continue;
 570
 571                 if (!nr_retries--) {
 572                         if (oom)
 573                                 mem_cgroup_out_of_memory(mem, gfp_mask);
 574                         goto nomem;
 575                 }
 576         }
 577         return 0;
 578 nomem:
 579         css_put(&mem->css);
 580         return -ENOMEM;
 581 }
 582
 583 /**
 584  * mem_cgroup_try_charge - get charge of PAGE_SIZE.
 585  * @mm: an mm_struct which is charged against. (when *memcg is NULL)
 586  * @gfp_mask: gfp_mask for reclaim.
 587  * @memcg: a pointer to memory cgroup which is charged against.
 588  *
 589  * charge against memory cgroup pointed by *memcg. if *memcg == NULL, estimated
 590  * memory cgroup from @mm is got and stored in *memcg.
 591  *
 592  * Returns 0 if success. -ENOMEM at failure.
 593  * This call can invoke OOM-Killer.
 594  */
 595
 596 int mem_cgroup_try_charge(struct mm_struct *mm,
 597                           gfp_t mask, struct mem_cgroup **memcg)
 598 {
 599         return __mem_cgroup_try_charge(mm, mask, memcg, true);
 600 }
 601
 602 /*
 603  * commit a charge got by mem_cgroup_try_charge() and makes page_cgroup to be
 604  * USED state. If already USED, uncharge and return.
 605  */
 606
 607 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 608                                      struct page_cgroup *pc,
 609                                      enum charge_type ctype)
 610 {
 611         struct mem_cgroup_per_zone *mz;
 612         unsigned long flags;
 613
 614         /* try_charge() can return NULL to *memcg, taking care of it. */
 615         if (!mem)
 616                 return;
 617
 618         lock_page_cgroup(pc);
 619         if (unlikely(PageCgroupUsed(pc))) {
 620                 unlock_page_cgroup(pc);
 621                 res_counter_uncharge(&mem->res, PAGE_SIZE);
 622                 if (do_swap_account)
 623                         res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 624                 css_put(&mem->css);
 625                 return;
 626         }
 627         pc->mem_cgroup = mem;
 628         /*
 629          * If a page is accounted as a page cache, insert to inactive list.
 630          * If anon, insert to active list.
 631          */
 632         pc->flags = pcg_default_flags[ctype];
 633
 634         mz = page_cgroup_zoneinfo(pc);
 635
 636         spin_lock_irqsave(&mz->lru_lock, flags);
 637         __mem_cgroup_add_list(mz, pc, true);
 638         spin_unlock_irqrestore(&mz->lru_lock, flags);
 639         unlock_page_cgroup(pc);
 640 }
 641
 642 /**
 643  * mem_cgroup_move_account - move account of the page
 644  * @pc: page_cgroup of the page.
 645  * @from: mem_cgroup which the page is moved from.
 646  * @to: mem_cgroup which the page is moved to. @from != @to.
 647  *
 648  * The caller must confirm following.
 649  * 1. disable irq.
 650  * 2. lru_lock of old mem_cgroup(@from) should be held.
 651  *
 652  * returns 0 at success,
 653  * returns -EBUSY when lock is busy or "pc" is unstable.
 654  *
 655  * This function does "uncharge" from old cgroup but doesn't do "charge" to
 656  * new cgroup. It should be done by a caller.
 657  */
 658
 659 static int mem_cgroup_move_account(struct page_cgroup *pc,
 660         struct mem_cgroup *from, struct mem_cgroup *to)
 661 {
 662         struct mem_cgroup_per_zone *from_mz, *to_mz;
 663         int nid, zid;
 664         int ret = -EBUSY;
 665
 666         VM_BUG_ON(!irqs_disabled());
 667         VM_BUG_ON(from == to);
 668
 669         nid = page_cgroup_nid(pc);
 670         zid = page_cgroup_zid(pc);
 671         from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
 672         to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
 673
 674
 675         if (!trylock_page_cgroup(pc))
 676                 return ret;
 677
 678         if (!PageCgroupUsed(pc))
 679                 goto out;
 680
 681         if (pc->mem_cgroup != from)
 682                 goto out;
 683
 684         if (spin_trylock(&to_mz->lru_lock)) {
 685                 __mem_cgroup_remove_list(from_mz, pc);
 686                 css_put(&from->css);
 687                 res_counter_uncharge(&from->res, PAGE_SIZE);
 688                 if (do_swap_account)
 689                         res_counter_uncharge(&from->memsw, PAGE_SIZE);
 690                 pc->mem_cgroup = to;
 691                 css_get(&to->css);
 692                 __mem_cgroup_add_list(to_mz, pc, false);
 693                 ret = 0;
 694                 spin_unlock(&to_mz->lru_lock);
 695         }
 696 out:
 697         unlock_page_cgroup(pc);
 698         return ret;
 699 }
 700
 701 /*
 702  * move charges to its parent.
 703  */
 704
 705 static int mem_cgroup_move_parent(struct page_cgroup *pc,
 706                                   struct mem_cgroup *child,
 707                                   gfp_t gfp_mask)
 708 {
 709         struct cgroup *cg = child->css.cgroup;
 710         struct cgroup *pcg = cg->parent;
 711         struct mem_cgroup *parent;
 712         struct mem_cgroup_per_zone *mz;
 713         unsigned long flags;
 714         int ret;
 715
 716         /* Is ROOT ? */
 717         if (!pcg)
 718                 return -EINVAL;
 719
 720         parent = mem_cgroup_from_cont(pcg);
 721
 722         ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
 723         if (ret)
 724                 return ret;
 725
 726         mz = mem_cgroup_zoneinfo(child,
 727                         page_cgroup_nid(pc), page_cgroup_zid(pc));
 728
 729         spin_lock_irqsave(&mz->lru_lock, flags);
 730         ret = mem_cgroup_move_account(pc, child, parent);
 731         spin_unlock_irqrestore(&mz->lru_lock, flags);
 732
 733         /* drop extra refcnt */
 734         css_put(&parent->css);
 735         /* uncharge if move fails */
 736         if (ret) {
 737                 res_counter_uncharge(&parent->res, PAGE_SIZE);
 738                 if (do_swap_account)
 739                         res_counter_uncharge(&parent->memsw, PAGE_SIZE);
 740         }
 741
 742         return ret;
 743 }
 744
 745 /*
 746  * Charge the memory controller for page usage.
 747  * Return
 748  * 0 if the charge was successful
 749  * < 0 if the cgroup is over its limit
 750  */
 751 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 752                                 gfp_t gfp_mask, enum charge_type ctype,
 753                                 struct mem_cgroup *memcg)
 754 {
 755         struct mem_cgroup *mem;
 756         struct page_cgroup *pc;
 757         int ret;
 758
 759         pc = lookup_page_cgroup(page);
 760         /* can happen at boot */
 761         if (unlikely(!pc))
 762                 return 0;
 763         prefetchw(pc);
 764
 765         mem = memcg;
 766         ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 767         if (ret)
 768                 return ret;
 769
 770         __mem_cgroup_commit_charge(mem, pc, ctype);
 771         return 0;
 772 }
 773
 774 int mem_cgroup_newpage_charge(struct page *page,
 775                               struct mm_struct *mm, gfp_t gfp_mask)
 776 {
 777         if (mem_cgroup_subsys.disabled)
 778                 return 0;
 779         if (PageCompound(page))
 780                 return 0;
 781         /*
 782          * If already mapped, we don't have to account.
 783          * If page cache, page->mapping has address_space.
 784          * But page->mapping may have out-of-use anon_vma pointer,
 785          * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
 786          * is NULL.
 787          */
 788         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
 789                 return 0;
 790         if (unlikely(!mm))
 791                 mm = &init_mm;
 792         return mem_cgroup_charge_common(page, mm, gfp_mask,
 793                                 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
 794 }
 795
 796 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 797                                 gfp_t gfp_mask)
 798 {
 799         if (mem_cgroup_subsys.disabled)
 800                 return 0;
 801         if (PageCompound(page))
 802                 return 0;
 803         /*
 804          * Corner case handling. This is called from add_to_page_cache()
 805          * in usual. But some FS (shmem) precharges this page before calling it
 806          * and call add_to_page_cache() with GFP_NOWAIT.
 807          *
 808          * For GFP_NOWAIT case, the page may be pre-charged before calling
 809          * add_to_page_cache(). (See shmem.c) check it here and avoid to call
 810          * charge twice. (It works but has to pay a bit larger cost.)
 811          */
 812         if (!(gfp_mask & __GFP_WAIT)) {
 813                 struct page_cgroup *pc;
 814
 815
 816                 pc = lookup_page_cgroup(page);
 817                 if (!pc)
 818                         return 0;
 819                 lock_page_cgroup(pc);
 820                 if (PageCgroupUsed(pc)) {
 821                         unlock_page_cgroup(pc);
 822                         return 0;
 823                 }
 824                 unlock_page_cgroup(pc);
 825         }
 826
 827         if (unlikely(!mm))
 828                 mm = &init_mm;
 829
 830         if (page_is_file_cache(page))
 831                 return mem_cgroup_charge_common(page, mm, gfp_mask,
 832                                 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
 833         else
 834                 return mem_cgroup_charge_common(page, mm, gfp_mask,
 835                                 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 836 }
 837
 838 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 839                                  struct page *page,
 840                                  gfp_t mask, struct mem_cgroup **ptr)
 841 {
 842         struct mem_cgroup *mem;
 843         swp_entry_t     ent;
 844
 845         if (mem_cgroup_subsys.disabled)
 846                 return 0;
 847
 848         if (!do_swap_account)
 849                 goto charge_cur_mm;
 850
 851         /*
 852          * A racing thread's fault, or swapoff, may have already updated
 853          * the pte, and even removed page from swap cache: return success
 854          * to go on to do_swap_page()'s pte_same() test, which should fail.
 855          */
 856         if (!PageSwapCache(page))
 857                 return 0;
 858
 859         ent.val = page_private(page);
 860
 861         mem = lookup_swap_cgroup(ent);
 862         if (!mem || mem->obsolete)
 863                 goto charge_cur_mm;
 864         *ptr = mem;
 865         return __mem_cgroup_try_charge(NULL, mask, ptr, true);
 866 charge_cur_mm:
 867         if (unlikely(!mm))
 868                 mm = &init_mm;
 869         return __mem_cgroup_try_charge(mm, mask, ptr, true);
 870 }
 871
 872 #ifdef CONFIG_SWAP
 873
 874 int mem_cgroup_cache_charge_swapin(struct page *page,
 875                         struct mm_struct *mm, gfp_t mask, bool locked)
 876 {
 877         int ret = 0;
 878
 879         if (mem_cgroup_subsys.disabled)
 880                 return 0;
 881         if (unlikely(!mm))
 882                 mm = &init_mm;
 883         if (!locked)
 884                 lock_page(page);
 885         /*
 886          * If not locked, the page can be dropped from SwapCache until
 887          * we reach here.
 888          */
 889         if (PageSwapCache(page)) {
 890                 struct mem_cgroup *mem = NULL;
 891                 swp_entry_t ent;
 892
 893                 ent.val = page_private(page);
 894                 if (do_swap_account) {
 895                         mem = lookup_swap_cgroup(ent);
 896                         if (mem && mem->obsolete)
 897                                 mem = NULL;
 898                         if (mem)
 899                                 mm = NULL;
 900                 }
 901                 ret = mem_cgroup_charge_common(page, mm, mask,
 902                                 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
 903
 904                 if (!ret && do_swap_account) {
 905                         /* avoid double counting */
 906                         mem = swap_cgroup_record(ent, NULL);
 907                         if (mem) {
 908                                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 909                                 mem_cgroup_put(mem);
 910                         }
 911                 }
 912         }
 913         if (!locked)
 914                 unlock_page(page);
 915
 916         return ret;
 917 }
 918 #endif
 919
 920 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 921 {
 922         struct page_cgroup *pc;
 923
 924         if (mem_cgroup_subsys.disabled)
 925                 return;
 926         if (!ptr)
 927                 return;
 928         pc = lookup_page_cgroup(page);
 929         __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 930         /*
 931          * Now swap is on-memory. This means this page may be
 932          * counted both as mem and swap....double count.
 933          * Fix it by uncharging from memsw. This SwapCache is stable
 934          * because we're still under lock_page().
 935          */
 936         if (do_swap_account) {
 937                 swp_entry_t ent = {.val = page_private(page)};
 938                 struct mem_cgroup *memcg;
 939                 memcg = swap_cgroup_record(ent, NULL);
 940                 if (memcg) {
 941                         /* If memcg is obsolete, memcg can be != ptr */
 942                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 943                         mem_cgroup_put(memcg);
 944                 }
 945
 946         }
 947 }
 948
 949 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 950 {
 951         if (mem_cgroup_subsys.disabled)
 952                 return;
 953         if (!mem)
 954                 return;
 955         res_counter_uncharge(&mem->res, PAGE_SIZE);
 956         if (do_swap_account)
 957                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 958         css_put(&mem->css);
 959 }
 960
 961
 962 /*
 963  * uncharge if !page_mapped(page)
 964  */
 965 static struct mem_cgroup *
 966 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 967 {
 968         struct page_cgroup *pc;
 969         struct mem_cgroup *mem = NULL;
 970         struct mem_cgroup_per_zone *mz;
 971         unsigned long flags;
 972
 973         if (mem_cgroup_subsys.disabled)
 974                 return NULL;
 975
 976         if (PageSwapCache(page))
 977                 return NULL;
 978
 979         /*
 980          * Check if our page_cgroup is valid
 981          */
 982         pc = lookup_page_cgroup(page);
 983         if (unlikely(!pc || !PageCgroupUsed(pc)))
 984                 return NULL;
 985
 986         lock_page_cgroup(pc);
 987
 988         mem = pc->mem_cgroup;
 989
 990         if (!PageCgroupUsed(pc))
 991                 goto unlock_out;
 992
 993         switch (ctype) {
 994         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
 995                 if (page_mapped(page))
 996                         goto unlock_out;
 997                 break;
 998         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
 999                 if (!PageAnon(page)) {  /* Shared memory */
1000                         if (page->mapping && !page_is_file_cache(page))
1001                                 goto unlock_out;
1002                 } else if (page_mapped(page)) /* Anon */
1003                                 goto unlock_out;
1004                 break;
1005         default:
1006                 break;
1007         }
1008
1009         res_counter_uncharge(&mem->res, PAGE_SIZE);
1010         if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1011                 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1012
1013         ClearPageCgroupUsed(pc);
1014
1015         mz = page_cgroup_zoneinfo(pc);
1016         spin_lock_irqsave(&mz->lru_lock, flags);
1017         __mem_cgroup_remove_list(mz, pc);
1018         spin_unlock_irqrestore(&mz->lru_lock, flags);
1019         unlock_page_cgroup(pc);
1020
1021         css_put(&mem->css);
1022
1023         return mem;
1024
1025 unlock_out:
1026         unlock_page_cgroup(pc);
1027         return NULL;
1028 }
1029
1030 void mem_cgroup_uncharge_page(struct page *page)
1031 {
1032         /* early check. */
1033         if (page_mapped(page))
1034                 return;
1035         if (page->mapping && !PageAnon(page))
1036                 return;
1037         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1038 }
1039
1040 void mem_cgroup_uncharge_cache_page(struct page *page)
1041 {
1042         VM_BUG_ON(page_mapped(page));
1043         VM_BUG_ON(page->mapping);
1044         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
1045 }
1046
1047 /*
1048  * called from __delete_from_swap_cache() and drop "page" account.
1049  * memcg information is recorded to swap_cgroup of "ent"
1050  */
1051 void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1052 {
1053         struct mem_cgroup *memcg;
1054
1055         memcg = __mem_cgroup_uncharge_common(page,
1056                                         MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1057         /* record memcg information */
1058         if (do_swap_account && memcg) {
1059                 swap_cgroup_record(ent, memcg);
1060                 mem_cgroup_get(memcg);
1061         }
1062 }
1063
1064 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1065 /*
1066  * called from swap_entry_free(). remove record in swap_cgroup and
1067  * uncharge "memsw" account.
1068  */
1069 void mem_cgroup_uncharge_swap(swp_entry_t ent)
1070 {
1071         struct mem_cgroup *memcg;
1072
1073         if (!do_swap_account)
1074                 return;
1075
1076         memcg = swap_cgroup_record(ent, NULL);
1077         if (memcg) {
1078                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1079                 mem_cgroup_put(memcg);
1080         }
1081 }
1082 #endif
1083
1084 /*
1085  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1086  * page belongs to.
1087  */
1088 int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1089 {
1090         struct page_cgroup *pc;
1091         struct mem_cgroup *mem = NULL;
1092         int ret = 0;
1093
1094         if (mem_cgroup_subsys.disabled)
1095                 return 0;
1096
1097         pc = lookup_page_cgroup(page);
1098         lock_page_cgroup(pc);
1099         if (PageCgroupUsed(pc)) {
1100                 mem = pc->mem_cgroup;
1101                 css_get(&mem->css);
1102         }
1103         unlock_page_cgroup(pc);
1104
1105         if (mem) {
1106                 ret = mem_cgroup_try_charge(NULL, GFP_HIGHUSER_MOVABLE, &mem);
1107                 css_put(&mem->css);
1108         }
1109         *ptr = mem;
1110         return ret;
1111 }
1112
1113 /* remove redundant charge if migration failed*/
1114 void mem_cgroup_end_migration(struct mem_cgroup *mem,
1115                 struct page *oldpage, struct page *newpage)
1116 {
1117         struct page *target, *unused;
1118         struct page_cgroup *pc;
1119         enum charge_type ctype;
1120
1121         if (!mem)
1122                 return;
1123
1124         /* at migration success, oldpage->mapping is NULL. */
1125         if (oldpage->mapping) {
1126                 target = oldpage;
1127                 unused = NULL;
1128         } else {
1129                 target = newpage;
1130                 unused = oldpage;
1131         }
1132
1133         if (PageAnon(target))
1134                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1135         else if (page_is_file_cache(target))
1136                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1137         else
1138                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1139
1140         /* unused page is not on radix-tree now. */
1141         if (unused)
1142                 __mem_cgroup_uncharge_common(unused, ctype);
1143
1144         pc = lookup_page_cgroup(target);
1145         /*
1146          * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
1147          * So, double-counting is effectively avoided.
1148          */
1149         __mem_cgroup_commit_charge(mem, pc, ctype);
1150
1151         /*
1152          * Both of oldpage and newpage are still under lock_page().
1153          * Then, we don't have to care about race in radix-tree.
1154          * But we have to be careful that this page is unmapped or not.
1155          *
1156          * There is a case for !page_mapped(). At the start of
1157          * migration, oldpage was mapped. But now, it's zapped.
1158          * But we know *target* page is not freed/reused under us.
1159          * mem_cgroup_uncharge_page() does all necessary checks.
1160          */
1161         if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1162                 mem_cgroup_uncharge_page(target);
1163 }
1164
1165 /*
1166  * A call to try to shrink memory usage under specified resource controller.
1167  * This is typically used for page reclaiming for shmem for reducing side
1168  * effect of page allocation from shmem, which is used by some mem_cgroup.
1169  */
1170 int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
1171 {
1172         struct mem_cgroup *mem;
1173         int progress = 0;
1174         int retry = MEM_CGROUP_RECLAIM_RETRIES;
1175
1176         if (mem_cgroup_subsys.disabled)
1177                 return 0;
1178         if (!mm)
1179                 return 0;
1180
1181         rcu_read_lock();
1182         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1183         if (unlikely(!mem)) {
1184                 rcu_read_unlock();
1185                 return 0;
1186         }
1187         css_get(&mem->css);
1188         rcu_read_unlock();
1189
1190         do {
1191                 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
1192                 progress += res_counter_check_under_limit(&mem->res);
1193         } while (!progress && --retry);
1194
1195         css_put(&mem->css);
1196         if (!retry)
1197                 return -ENOMEM;
1198         return 0;
1199 }
1200
1201 static DEFINE_MUTEX(set_limit_mutex);
1202
1203 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1204                                 unsigned long long val)
1205 {
1206
1207         int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1208         int progress;
1209         u64 memswlimit;
1210         int ret = 0;
1211
1212         while (retry_count) {
1213                 if (signal_pending(current)) {
1214                         ret = -EINTR;
1215                         break;
1216                 }
1217                 /*
1218                  * Rather than hide all in some function, I do this in
1219                  * open coded manner. You see what this really does.
1220                  * We have to guarantee mem->res.limit < mem->memsw.limit.
1221                  */
1222                 mutex_lock(&set_limit_mutex);
1223                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1224                 if (memswlimit < val) {
1225                         ret = -EINVAL;
1226                         mutex_unlock(&set_limit_mutex);
1227                         break;
1228                 }
1229                 ret = res_counter_set_limit(&memcg->res, val);
1230                 mutex_unlock(&set_limit_mutex);
1231
1232                 if (!ret)
1233                         break;
1234
1235                 progress = try_to_free_mem_cgroup_pages(memcg,
1236                                 GFP_HIGHUSER_MOVABLE, false);
1237                 if (!progress)                  retry_count--;
1238         }
1239         return ret;
1240 }
1241
1242 int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1243                                 unsigned long long val)
1244 {
1245         int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1246         u64 memlimit, oldusage, curusage;
1247         int ret;
1248
1249         if (!do_swap_account)
1250                 return -EINVAL;
1251
1252         while (retry_count) {
1253                 if (signal_pending(current)) {
1254                         ret = -EINTR;
1255                         break;
1256                 }
1257                 /*
1258                  * Rather than hide all in some function, I do this in
1259                  * open coded manner. You see what this really does.
1260                  * We have to guarantee mem->res.limit < mem->memsw.limit.
1261                  */
1262                 mutex_lock(&set_limit_mutex);
1263                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1264                 if (memlimit > val) {
1265                         ret = -EINVAL;
1266                         mutex_unlock(&set_limit_mutex);
1267                         break;
1268                 }
1269                 ret = res_counter_set_limit(&memcg->memsw, val);
1270                 mutex_unlock(&set_limit_mutex);
1271
1272                 if (!ret)
1273                         break;
1274
1275                 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1276                 try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
1277                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1278                 if (curusage >= oldusage)
1279                         retry_count--;
1280         }
1281         return ret;
1282 }
1283
1284
1285 /*
1286  * This routine traverse page_cgroup in given list and drop them all.
1287  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
1288  */
1289 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
1290                             struct mem_cgroup_per_zone *mz,
1291                             enum lru_list lru)
1292 {
1293         struct page_cgroup *pc, *busy;
1294         unsigned long flags;
1295         unsigned long loop;
1296         struct list_head *list;
1297         int ret = 0;
1298
1299         list = &mz->lists[lru];
1300
1301         loop = MEM_CGROUP_ZSTAT(mz, lru);
1302         /* give some margin against EBUSY etc...*/
1303         loop += 256;
1304         busy = NULL;
1305         while (loop--) {
1306                 ret = 0;
1307                 spin_lock_irqsave(&mz->lru_lock, flags);
1308                 if (list_empty(list)) {
1309                         spin_unlock_irqrestore(&mz->lru_lock, flags);
1310                         break;
1311                 }
1312                 pc = list_entry(list->prev, struct page_cgroup, lru);
1313                 if (busy == pc) {
1314                         list_move(&pc->lru, list);
1315                         busy = 0;
1316                         spin_unlock_irqrestore(&mz->lru_lock, flags);
1317                         continue;
1318                 }
1319                 spin_unlock_irqrestore(&mz->lru_lock, flags);
1320
1321                 ret = mem_cgroup_move_parent(pc, mem, GFP_HIGHUSER_MOVABLE);
1322                 if (ret == -ENOMEM)
1323                         break;
1324
1325                 if (ret == -EBUSY || ret == -EINVAL) {
1326                         /* found lock contention or "pc" is obsolete. */
1327                         busy = pc;
1328                         cond_resched();
1329                 } else
1330                         busy = NULL;
1331         }
1332         if (!ret && !list_empty(list))
1333                 return -EBUSY;
1334         return ret;
1335 }
1336
1337 /*
1338  * make mem_cgroup's charge to be 0 if there is no task.
1339  * This enables deleting this mem_cgroup.
1340  */
1341 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
1342 {
1343         int ret;
1344         int node, zid, shrink;
1345         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1346         struct cgroup *cgrp = mem->css.cgroup;
1347
1348         css_get(&mem->css);
1349
1350         shrink = 0;
1351         /* should free all ? */
1352         if (free_all)
1353                 goto try_to_free;
1354 move_account:
1355         while (mem->res.usage > 0) {
1356                 ret = -EBUSY;
1357                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1358                         goto out;
1359                 ret = -EINTR;
1360                 if (signal_pending(current))
1361                         goto out;
1362                 /* This is for making all *used* pages to be on LRU. */
1363                 lru_add_drain_all();
1364                 ret = 0;
1365                 for_each_node_state(node, N_POSSIBLE) {
1366                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
1367                                 struct mem_cgroup_per_zone *mz;
1368                                 enum lru_list l;
1369                                 mz = mem_cgroup_zoneinfo(mem, node, zid);
1370                                 for_each_lru(l) {
1371                                         ret = mem_cgroup_force_empty_list(mem,
1372                                                                   mz, l);
1373                                         if (ret)
1374                                                 break;
1375                                 }
1376                         }
1377                         if (ret)
1378                                 break;
1379                 }
1380                 /* it seems parent cgroup doesn't have enough mem */
1381                 if (ret == -ENOMEM)
1382                         goto try_to_free;
1383                 cond_resched();
1384         }
1385         ret = 0;
1386 out:
1387         css_put(&mem->css);
1388         return ret;
1389
1390 try_to_free:
1391         /* returns EBUSY if there is a task or if we come here twice. */
1392         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1393                 ret = -EBUSY;
1394                 goto out;
1395         }
1396         /* we call try-to-free pages for make this cgroup empty */
1397         lru_add_drain_all();
1398         /* try to free all pages in this cgroup */
1399         shrink = 1;
1400         while (nr_retries && mem->res.usage > 0) {
1401                 int progress;
1402
1403                 if (signal_pending(current)) {
1404                         ret = -EINTR;
1405                         goto out;
1406                 }
1407                 progress = try_to_free_mem_cgroup_pages(mem,
1408                                                   GFP_HIGHUSER_MOVABLE, false);
1409                 if (!progress) {
1410                         nr_retries--;
1411                         /* maybe some writeback is necessary */
1412                         congestion_wait(WRITE, HZ/10);
1413                 }
1414
1415         }
1416         /* try move_account...there may be some *locked* pages. */
1417         if (mem->res.usage)
1418                 goto move_account;
1419         ret = 0;
1420         goto out;
1421 }
1422
1423 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1424 {
1425         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1426 }
1427
1428
1429 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
1430 {
1431         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1432         u64 val = 0;
1433         int type, name;
1434
1435         type = MEMFILE_TYPE(cft->private);
1436         name = MEMFILE_ATTR(cft->private);
1437         switch (type) {
1438         case _MEM:
1439                 val = res_counter_read_u64(&mem->res, name);
1440                 break;
1441         case _MEMSWAP:
1442                 if (do_swap_account)
1443                         val = res_counter_read_u64(&mem->memsw, name);
1444                 break;
1445         default:
1446                 BUG();
1447                 break;
1448         }
1449         return val;
1450 }
1451 /*
1452  * The user of this function is...
1453  * RES_LIMIT.
1454  */
1455 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
1456                             const char *buffer)
1457 {
1458         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1459         int type, name;
1460         unsigned long long val;
1461         int ret;
1462
1463         type = MEMFILE_TYPE(cft->private);
1464         name = MEMFILE_ATTR(cft->private);
1465         switch (name) {
1466         case RES_LIMIT:
1467                 /* This function does all necessary parse...reuse it */
1468                 ret = res_counter_memparse_write_strategy(buffer, &val);
1469                 if (ret)
1470                         break;
1471                 if (type == _MEM)
1472                         ret = mem_cgroup_resize_limit(memcg, val);
1473                 else
1474                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
1475                 break;
1476         default:
1477                 ret = -EINVAL; /* should be BUG() ? */
1478                 break;
1479         }
1480         return ret;
1481 }
1482
1483 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1484 {
1485         struct mem_cgroup *mem;
1486         int type, name;
1487
1488         mem = mem_cgroup_from_cont(cont);
1489         type = MEMFILE_TYPE(event);
1490         name = MEMFILE_ATTR(event);
1491         switch (name) {
1492         case RES_MAX_USAGE:
1493                 if (type == _MEM)
1494                         res_counter_reset_max(&mem->res);
1495                 else
1496                         res_counter_reset_max(&mem->memsw);
1497                 break;
1498         case RES_FAILCNT:
1499                 if (type == _MEM)
1500                         res_counter_reset_failcnt(&mem->res);
1501                 else
1502                         res_counter_reset_failcnt(&mem->memsw);
1503                 break;
1504         }
1505         return 0;
1506 }
1507
1508 static const struct mem_cgroup_stat_desc {
1509         const char *msg;
1510         u64 unit;
1511 } mem_cgroup_stat_desc[] = {
1512         [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
1513         [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
1514         [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, },
1515         [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, },
1516 };
1517
1518 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1519                                  struct cgroup_map_cb *cb)
1520 {
1521         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1522         struct mem_cgroup_stat *stat = &mem_cont->stat;
1523         int i;
1524
1525         for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
1526                 s64 val;
1527
1528                 val = mem_cgroup_read_stat(stat, i);
1529                 val *= mem_cgroup_stat_desc[i].unit;
1530                 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1531         }
1532         /* showing # of active pages */
1533         {
1534                 unsigned long active_anon, inactive_anon;
1535                 unsigned long active_file, inactive_file;
1536                 unsigned long unevictable;
1537
1538                 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1539                                                 LRU_INACTIVE_ANON);
1540                 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1541                                                 LRU_ACTIVE_ANON);
1542                 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1543                                                 LRU_INACTIVE_FILE);
1544                 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1545                                                 LRU_ACTIVE_FILE);
1546                 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1547                                                         LRU_UNEVICTABLE);
1548
1549                 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1550                 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1551                 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1552                 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1553                 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1554
1555         }
1556         return 0;
1557 }
1558
1559
1560 static struct cftype mem_cgroup_files[] = {
1561         {
1562                 .name = "usage_in_bytes",
1563                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
1564                 .read_u64 = mem_cgroup_read,
1565         },
1566         {
1567                 .name = "max_usage_in_bytes",
1568                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
1569                 .trigger = mem_cgroup_reset,
1570                 .read_u64 = mem_cgroup_read,
1571         },
1572         {
1573                 .name = "limit_in_bytes",
1574                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1575                 .write_string = mem_cgroup_write,
1576                 .read_u64 = mem_cgroup_read,
1577         },
1578         {
1579                 .name = "failcnt",
1580                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1581                 .trigger = mem_cgroup_reset,
1582                 .read_u64 = mem_cgroup_read,
1583         },
1584         {
1585                 .name = "stat",
1586                 .read_map = mem_control_stat_show,
1587         },
1588         {
1589                 .name = "force_empty",
1590                 .trigger = mem_cgroup_force_empty_write,
1591         },
1592 };
1593
1594 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1595 static struct cftype memsw_cgroup_files[] = {
1596         {
1597                 .name = "memsw.usage_in_bytes",
1598                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
1599                 .read_u64 = mem_cgroup_read,
1600         },
1601         {
1602                 .name = "memsw.max_usage_in_bytes",
1603                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
1604                 .trigger = mem_cgroup_reset,
1605                 .read_u64 = mem_cgroup_read,
1606         },
1607         {
1608                 .name = "memsw.limit_in_bytes",
1609                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
1610                 .write_string = mem_cgroup_write,
1611                 .read_u64 = mem_cgroup_read,
1612         },
1613         {
1614                 .name = "memsw.failcnt",
1615                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
1616                 .trigger = mem_cgroup_reset,
1617                 .read_u64 = mem_cgroup_read,
1618         },
1619 };
1620
1621 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1622 {
1623         if (!do_swap_account)
1624                 return 0;
1625         return cgroup_add_files(cont, ss, memsw_cgroup_files,
1626                                 ARRAY_SIZE(memsw_cgroup_files));
1627 };
1628 #else
1629 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
1630 {
1631         return 0;
1632 }
1633 #endif
1634
1635 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1636 {
1637         struct mem_cgroup_per_node *pn;
1638         struct mem_cgroup_per_zone *mz;
1639         enum lru_list l;
1640         int zone, tmp = node;
1641         /*
1642          * This routine is called against possible nodes.
1643          * But it's BUG to call kmalloc() against offline node.
1644          *
1645          * TODO: this routine can waste much memory for nodes which will
1646          *       never be onlined. It's better to use memory hotplug callback
1647          *       function.
1648          */
1649         if (!node_state(node, N_NORMAL_MEMORY))
1650                 tmp = -1;
1651         pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
1652         if (!pn)
1653                 return 1;
1654
1655         mem->info.nodeinfo[node] = pn;
1656         memset(pn, 0, sizeof(*pn));
1657
1658         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1659                 mz = &pn->zoneinfo[zone];
1660                 spin_lock_init(&mz->lru_lock);
1661                 for_each_lru(l)
1662                         INIT_LIST_HEAD(&mz->lists[l]);
1663         }
1664         return 0;
1665 }
1666
1667 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1668 {
1669         kfree(mem->info.nodeinfo[node]);
1670 }
1671
1672 static int mem_cgroup_size(void)
1673 {
1674         int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
1675         return sizeof(struct mem_cgroup) + cpustat_size;
1676 }
1677
1678 static struct mem_cgroup *mem_cgroup_alloc(void)
1679 {
1680         struct mem_cgroup *mem;
1681         int size = mem_cgroup_size();
1682
1683         if (size < PAGE_SIZE)
1684                 mem = kmalloc(size, GFP_KERNEL);
1685         else
1686                 mem = vmalloc(size);
1687
1688         if (mem)
1689                 memset(mem, 0, size);
1690         return mem;
1691 }
1692
1693 /*
1694  * At destroying mem_cgroup, references from swap_cgroup can remain.
1695  * (scanning all at force_empty is too costly...)
1696  *
1697  * Instead of clearing all references at force_empty, we remember
1698  * the number of reference from swap_cgroup and free mem_cgroup when
1699  * it goes down to 0.
1700  *
1701  * When mem_cgroup is destroyed, mem->obsolete will be set to 0 and
1702  * entry which points to this memcg will be ignore at swapin.
1703  *
1704  * Removal of cgroup itself succeeds regardless of refs from swap.
1705  */
1706
1707 static void mem_cgroup_free(struct mem_cgroup *mem)
1708 {
1709         if (atomic_read(&mem->refcnt) > 0)
1710                 return;
1711         if (mem_cgroup_size() < PAGE_SIZE)
1712                 kfree(mem);
1713         else
1714                 vfree(mem);
1715 }
1716
1717 static void mem_cgroup_get(struct mem_cgroup *mem)
1718 {
1719         atomic_inc(&mem->refcnt);
1720 }
1721
1722 static void mem_cgroup_put(struct mem_cgroup *mem)
1723 {
1724         if (atomic_dec_and_test(&mem->refcnt)) {
1725                 if (!mem->obsolete)
1726                         return;
1727                 mem_cgroup_free(mem);
1728         }
1729 }
1730
1731
1732 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1733 static void __init enable_swap_cgroup(void)
1734 {
1735         if (!mem_cgroup_subsys.disabled && really_do_swap_account)
1736                 do_swap_account = 1;
1737 }
1738 #else
1739 static void __init enable_swap_cgroup(void)
1740 {
1741 }
1742 #endif
1743
1744 static struct cgroup_subsys_state *
1745 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1746 {
1747         struct mem_cgroup *mem;
1748         int node;
1749
1750         mem = mem_cgroup_alloc();
1751         if (!mem)
1752                 return ERR_PTR(-ENOMEM);
1753
1754         res_counter_init(&mem->res);
1755         res_counter_init(&mem->memsw);
1756
1757         for_each_node_state(node, N_POSSIBLE)
1758                 if (alloc_mem_cgroup_per_zone_info(mem, node))
1759                         goto free_out;
1760         /* root ? */
1761         if (cont->parent == NULL)
1762                 enable_swap_cgroup();
1763
1764         return &mem->css;
1765 free_out:
1766         for_each_node_state(node, N_POSSIBLE)
1767                 free_mem_cgroup_per_zone_info(mem, node);
1768         mem_cgroup_free(mem);
1769         return ERR_PTR(-ENOMEM);
1770 }
1771
1772 static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1773                                         struct cgroup *cont)
1774 {
1775         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1776         mem->obsolete = 1;
1777         mem_cgroup_force_empty(mem, false);
1778 }
1779
1780 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1781                                 struct cgroup *cont)
1782 {
1783         int node;
1784         struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1785
1786         for_each_node_state(node, N_POSSIBLE)
1787                 free_mem_cgroup_per_zone_info(mem, node);
1788
1789         mem_cgroup_free(mem_cgroup_from_cont(cont));
1790 }
1791
1792 static int mem_cgroup_populate(struct cgroup_subsys *ss,
1793                                 struct cgroup *cont)
1794 {
1795         int ret;
1796
1797         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
1798                                 ARRAY_SIZE(mem_cgroup_files));
1799
1800         if (!ret)
1801                 ret = register_memsw_files(cont, ss);
1802         return ret;
1803 }
1804
1805 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1806                                 struct cgroup *cont,
1807                                 struct cgroup *old_cont,
1808                                 struct task_struct *p)
1809 {
1810         struct mm_struct *mm;
1811         struct mem_cgroup *mem, *old_mem;
1812
1813         mm = get_task_mm(p);
1814         if (mm == NULL)
1815                 return;
1816
1817         mem = mem_cgroup_from_cont(cont);
1818         old_mem = mem_cgroup_from_cont(old_cont);
1819
1820         /*
1821          * Only thread group leaders are allowed to migrate, the mm_struct is
1822          * in effect owned by the leader
1823          */
1824         if (!thread_group_leader(p))
1825                 goto out;
1826
1827 out:
1828         mmput(mm);
1829 }
1830
1831 struct cgroup_subsys mem_cgroup_subsys = {
1832         .name = "memory",
1833         .subsys_id = mem_cgroup_subsys_id,
1834         .create = mem_cgroup_create,
1835         .pre_destroy = mem_cgroup_pre_destroy,
1836         .destroy = mem_cgroup_destroy,
1837         .populate = mem_cgroup_populate,
1838         .attach = mem_cgroup_move_task,
1839         .early_init = 0,
1840 };
1841
1842 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1843
1844 static int __init disable_swap_account(char *s)
1845 {
1846         really_do_swap_account = 0;
1847         return 1;
1848 }
1849 __setup("noswapaccount", disable_swap_account);
1850 #endif