From c84c24daf590138844d66151616b12dde7617c63 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 17 Dec 2008 17:02:38 -0800 Subject: [PATCH] Fix bugs in dealing with low-memory situations when the system has run out of swap or has no swap. * Fix an error where the system started killing processes before it needed to. * Continue propagating pages from the active queue to the inactive queue when the system has run out of swap or has no swap, even though the inactive queue has become bloated. This occurs because the inactive queue may be unable to drain due to an excess of dirty pages which cannot be swapped out. * Use the active queue to detect excessive stress which combined with an out-of-swap or no-swap situation means the system has run out of memory. THEN start killing processes. * This also allows the system to recycle nearly all the clean pages available when it has no swap space left, to try to keep things going, leaving only dirty pages in the VM page queues. --- sys/vm/swap_pager.c | 10 ++++++- sys/vm/swap_pager.h | 1 + sys/vm/vm_pageout.c | 77 ++++++++++++++++++++++++++++++++++++++++------------- sys/vm/vm_swap.c | 1 + 4 files changed, 69 insertions(+), 20 deletions(-) diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index ebf5d6b642..9a8d7511d4 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -1557,13 +1557,20 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, } } +void +swap_pager_newswap(void) +{ + swp_sizecheck(); +} + /* * swap_pager_sync_iodone: * * Completion routine for synchronous reads and writes from/to swap. * We just mark the bp is complete and wake up anyone waiting on it. * - * This routine may not block. This routine is called at splbio() or better. + * This routine may not block. This routine is called at splbio() + * or better. */ static void @@ -1697,6 +1704,7 @@ swp_pager_async_iodone(struct bio *bio) * then finish the I/O. 
*/ vm_page_dirty(m); + kprintf("f"); vm_page_activate(m); vm_page_io_finish(m); } diff --git a/sys/vm/swap_pager.h b/sys/vm/swap_pager.h index fd4413bf1e..9d24f3473d 100644 --- a/sys/vm/swap_pager.h +++ b/sys/vm/swap_pager.h @@ -99,6 +99,7 @@ void swap_pager_copy (vm_object_t, vm_object_t, vm_pindex_t, int); void swap_pager_freespace (vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_dmzspace (vm_object_t, vm_pindex_t, vm_size_t); void swap_pager_swap_init (void); +void swap_pager_newswap (void); int swap_pager_reserve (vm_object_t, vm_pindex_t, vm_size_t); /* diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index e6d825597d..ed58182e00 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -439,11 +439,15 @@ vm_pageout_flush(vm_page_t *mc, int count, int flags) case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* - * If page couldn't be paged out, then reactivate the - * page so it doesn't clog the inactive list. (We - * will try paging out it again later). + * A page typically cannot be paged out when we + * have run out of swap. We leave the page + * marked inactive and will try to page it out + * again later. + * + * Starvation of the active page list is used to + * determine when the system is massively memory + * starved. */ - vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; @@ -700,6 +704,7 @@ vm_pageout_scan(int pass) vm_object_t object; int actcount; int vnodes_skipped = 0; + int pages_freed = 0; int maxlaunder; /* @@ -857,10 +862,6 @@ rescan0: */ if (m->dirty == 0) { vm_page_test_dirty(m); -#if 0 - if (m->dirty == 0 && (m->flags & PG_WRITEABLE) != 0) - pmap_remove_all(m); -#endif } else { vm_page_dirty(m); } @@ -872,6 +873,7 @@ rescan0: vm_pageout_page_free(m); mycpu->gd_cnt.v_dfree++; --page_shortage; + ++pages_freed; } else if (m->dirty == 0) { /* * Clean pages can be placed onto the cache queue. 
@@ -879,6 +881,7 @@ rescan0: */ vm_page_cache(m); --page_shortage; + ++pages_freed; } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { /* * Dirty pages need to be paged out, but flushing @@ -1016,7 +1019,9 @@ rescan0: if (vm_pageout_clean(m) != 0) { --page_shortage; --maxlaunder; - } + } else { + addl_page_shortage++; + } next = TAILQ_NEXT(&marker, pageq); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); if (vp != NULL) @@ -1029,10 +1034,20 @@ rescan0: * active queue to the inactive queue. */ page_shortage = vm_paging_target() + - vmstats.v_inactive_target - vmstats.v_inactive_count; + vmstats.v_inactive_target - vmstats.v_inactive_count; page_shortage += addl_page_shortage; /* + * If the system is running out of swap or has none a large backlog + * can accumulate in the inactive list. Continue moving pages to + * the inactive list even though its 'target' has been met due to + * being unable to drain. We can then use a low active count to + * measure stress and out-of-memory conditions. + */ + if (page_shortage < addl_page_shortage) + page_shortage = addl_page_shortage; + + /* * Scan the active queue for things we can deactivate. We nominally * track the per-page activity counter and use it to locate * deactivation candidates. @@ -1112,10 +1127,12 @@ rescan0: vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_wakeup(m); - if (m->dirty == 0) + if (m->dirty == 0) { + ++pages_freed; vm_page_cache(m); - else + } else { vm_page_deactivate(m); + } } else { vm_page_deactivate(m); } @@ -1134,6 +1151,9 @@ rescan0: * does not effect other calculations. * * NOTE: we are still in a critical section. + * + * Pages moved from PQ_CACHE to totally free are not counted in the + * pages_freed counter. */ while (vmstats.v_free_count < vmstats.v_free_reserved) { @@ -1191,14 +1211,33 @@ rescan0: } /* - * If we are out of swap and were not able to reach our paging - * target, kill the largest process. 
+ * If we are out of swap space (or have no swap) then we + * can detect when the system has completely run out of + * memory by observing several variables. + * + * - swap_pager_full is set if insufficient swap was + * available to satisfy a requested pageout. + * + * - vm_page_count_min() means we could not recover + * enough pages to meet bare minimum needs. + * + * - vm_paging_target() measures how far we are from + * the minimum free page count target. A memory + * starved system will have a reduced active count, + * so we compare the paging target against the + * active count to detect excessive stress. + * + * If under these circumstances our paging target exceeds + * 1/4 the number of active pages we have a very serious + * problem that the deactivation of pages failed to solve + * and must start killing things. */ - if ((vm_swap_size < 64 && vm_page_count_min()) || - (swap_pager_full && vm_paging_target() > 0)) { -#if 0 - if ((vm_swap_size < 64 || swap_pager_full) && vm_page_count_min()) { -#endif + if (swap_pager_full && vm_page_count_min()) + kprintf("Warning: system low on memory+swap!\n"); + if (swap_pager_full && vm_page_count_min() && + vm_paging_target() > vmstats.v_active_count / 4) { info.bigproc = NULL; info.bigsize = 0; allproc_scan(vm_pageout_scan_callback, &info); diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c index 10c29f265f..2af5228d88 100644 --- a/sys/vm/vm_swap.c +++ b/sys/vm/vm_swap.c @@ -337,6 +337,7 @@ swaponvp(struct thread *td, struct vnode *vp, u_long nblks) blist_free(swapblist, vsbase, blk); vm_swap_size += blk; } + swap_pager_newswap(); return (0); } -- 2.11.4.GIT