patch-2.4.0-test9 linux/mm/vmscan.c

diff -u --recursive --new-file v2.4.0-test8/linux/mm/vmscan.c linux/mm/vmscan.c
@@ -9,6 +9,7 @@
  *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
  *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ *  Multiqueue VM started 5.8.00, Rik van Riel.
  */
 
 #include <linux/slab.h>
@@ -40,6 +41,7 @@
 	swp_entry_t entry;
 	struct page * page;
 	int (*swapout)(struct page *, struct file *);
+	int onlist;
 
 	pte = *page_table;
 	if (!pte_present(pte))
@@ -51,16 +53,37 @@
 	if (mm->swap_cnt)
 		mm->swap_cnt--;
 
+	onlist = PageActive(page);
 	/* Don't look at this pte if it's been accessed recently. */
 	if (pte_young(pte)) {
-		/*
-		 * Transfer the "accessed" bit from the page
-		 * tables to the global page map.
-		 */
 		set_pte(page_table, pte_mkold(pte));
-                SetPageReferenced(page);
+		if (onlist) {
+			/*
+			 * Transfer the "accessed" bit from the page
+			 * tables to the global page map. Page aging
+			 * will be done by refill_inactive_scan().
+			 */
+			SetPageReferenced(page);
+		} else {
+			/*
+			 * The page is not on the active list, so
+			 * we have to do the page aging ourselves.
+			 */
+			age_page_up(page);
+		}
 		goto out_failed;
 	}
+	if (!onlist)
+		/* The page is still mapped, so it can't be freeable... */
+		age_page_down_ageonly(page);
+
+	/*
+	 * If the page is in active use by us, or if the page
+	 * is in active use by others, don't unmap it or
+	 * (worse) start unneeded IO.
+	 */
+	if (page->age > 0)
+		goto out_failed;
 
 	if (TryLockPage(page))
 		goto out_failed;
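
The aging helpers this hunk relies on live in mm/swap.c. As a rough
sketch of their assumed behaviour (the exact constants come from
include/linux/swap.h; treat these bodies as illustration, not as the
patched source):

	/* Sketch: assumed behaviour of the page aging helpers. */
	void age_page_up(struct page *page)
	{
		/* (the real helper may also move an inactive
		 * page back onto the active list) */
		page->age += PAGE_AGE_ADV;	/* linear rise when referenced */
		if (page->age > PAGE_AGE_MAX)
			page->age = PAGE_AGE_MAX;
	}

	void age_page_down_ageonly(struct page *page)
	{
		page->age /= 2;			/* exponential decline when idle */
	}

The net effect: a page's age climbs while it keeps being referenced and
halves on every scan where it was not, so only pages that decay to age 0
become candidates for unmapping in try_to_swap_out().
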
@@ -79,8 +102,9 @@
 		set_pte(page_table, swp_entry_to_pte(entry));
 drop_pte:
 		UnlockPage(page);
-		vma->vm_mm->rss--;
+		mm->rss--;
 		flush_tlb_page(vma, address);
+		deactivate_page(page);
 		page_cache_release(page);
 		goto out_failed;
 	}
@@ -96,7 +120,7 @@
 	 * our scan.
 	 *
 	 * Basically, this just makes it possible for us to do
-	 * some real work in the future in "shrink_mmap()".
+	 * some real work in the future in "refill_inactive()".
 	 */
 	if (!pte_dirty(pte)) {
 		flush_cache_page(vma, address);
@@ -116,7 +140,9 @@
 	 * Don't do any of the expensive stuff if
 	 * we're not really interested in this zone.
 	 */
-	if (page->zone->free_pages > page->zone->pages_high)
+	if (page->zone->free_pages + page->zone->inactive_clean_pages
+					+ page->zone->inactive_dirty_pages
+		      	> page->zone->pages_high + inactive_target)
 		goto out_unlock;
 
 	/*
@@ -134,7 +160,7 @@
 	 * NOTE NOTE NOTE! This should just set a
 	 * dirty bit in 'page', and just drop the
 	 * pte. All the hard work would be done by
-	 * shrink_mmap().
+	 * refill_inactive().
 	 *
 	 * That would get rid of a lot of problems.
 	 */
@@ -144,14 +170,15 @@
 		struct file *file = vma->vm_file;
 		if (file) get_file(file);
 		pte_clear(page_table);
-		vma->vm_mm->rss--;
+		mm->rss--;
 		flush_tlb_page(vma, address);
-		vmlist_access_unlock(vma->vm_mm);
+		vmlist_access_unlock(mm);
 		error = swapout(page, file);
 		UnlockPage(page);
 		if (file) fput(file);
 		if (!error)
 			goto out_free_success;
+		deactivate_page(page);
 		page_cache_release(page);
 		return error;
 	}
@@ -175,13 +202,14 @@
 	add_to_swap_cache(page, entry);
 
 	/* Put the swap entry into the pte after the page is in swapcache */
-	vma->vm_mm->rss--;
+	mm->rss--;
 	set_pte(page_table, swp_entry_to_pte(entry));
 	flush_tlb_page(vma, address);
-	vmlist_access_unlock(vma->vm_mm);
+	vmlist_access_unlock(mm);
 
 	/* OK, do a physical asynchronous write to swap.  */
 	rw_swap_page(WRITE, page, 0);
+	deactivate_page(page);
 
 out_free_success:
 	page_cache_release(page);
@@ -230,7 +258,7 @@
 
 	do {
 		int result;
-		vma->vm_mm->swap_address = address + PAGE_SIZE;
+		mm->swap_address = address + PAGE_SIZE;
 		result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
 		if (result)
 			return result;
@@ -282,7 +310,7 @@
 	if (vma->vm_flags & VM_LOCKED)
 		return 0;
 
-	pgdir = pgd_offset(vma->vm_mm, address);
+	pgdir = pgd_offset(mm, address);
 
 	end = vma->vm_end;
 	if (address >= end)
@@ -323,17 +351,22 @@
 			int result = swap_out_vma(mm, vma, address, gfp_mask);
 			if (result)
 				return result;
+			if (!mm->swap_cnt)
+				goto out_unlock;
 			vma = vma->vm_next;
 			if (!vma)
 				break;
 			address = vma->vm_start;
 		}
 	}
+	/* Reset to 0 when we reach the end of address space */
+	mm->swap_address = 0;
+	mm->swap_cnt = 0;
+
+out_unlock:
 	vmlist_access_unlock(mm);
 
 	/* We didn't find anything for the process */
-	mm->swap_cnt = 0;
-	mm->swap_address = 0;
 	return 0;
 }
 
@@ -342,7 +375,10 @@
  * N.B. This function returns only 0 or 1.  Return values != 1 from
  * the lower level routines result in continued processing.
  */
-static int swap_out(unsigned int priority, int gfp_mask)
+#define SWAP_SHIFT 5
+#define SWAP_MIN 8
+
+static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
 {
 	struct task_struct * p;
 	int counter;
@@ -363,7 +399,7 @@
 	 * Think of swap_cnt as a "shadow rss" - it tells us which process
 	 * we want to page out (always try largest first).
 	 */
-	counter = (nr_threads << 2) >> (priority >> 2);
+	counter = (nr_threads << SWAP_SHIFT) >> priority;
 	if (counter < 1)
 		counter = 1;
 
@@ -372,6 +408,7 @@
 		struct mm_struct *best = NULL;
 		int pid = 0;
 		int assign = 0;
+		int found_task = 0;
 	select:
 		read_lock(&tasklist_lock);
 		p = init_task.next_task;
@@ -381,9 +418,17 @@
 				continue;
 	 		if (mm->rss <= 0)
 				continue;
+			/* Skip tasks which haven't slept long enough yet when idle-swapping. */
+			if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
+					time_after(p->sleep_time + idle_time * HZ, jiffies)))
+				continue;
+			found_task++;
 			/* Refresh swap_cnt? */
-			if (assign == 1)
-				mm->swap_cnt = mm->rss;
+			if (assign == 1) {
+				mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
+				if (mm->swap_cnt < SWAP_MIN)
+					mm->swap_cnt = SWAP_MIN;
+			}
 			if (mm->swap_cnt > max_cnt) {
 				max_cnt = mm->swap_cnt;
 				best = mm;
@@ -392,7 +437,7 @@
 		}
 		read_unlock(&tasklist_lock);
 		if (!best) {
-			if (!assign) {
+			if (!assign && found_task > 0) {
 				assign = 1;
 				goto select;
 			}
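
Two small sums make the new scanning quanta concrete. The counter
formula (nr_threads << SWAP_SHIFT) >> priority works out, with
SWAP_SHIFT at 5 and 100 threads, to (100 << 5) >> 6 = 50 process
selections at the initial priority of 6, growing to 3200 at priority 0;
the old formula peaked at nr_threads * 4. Likewise, swap_cnt is now a
slice of rss/32 with a floor of SWAP_MIN: a process with 16384 resident
pages is scanned 512 pages per selection, while a 100-page process
still gets the 8-page minimum, so large processes no longer have their
whole rss scanned in one go. (Numbers purely illustrative.)
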
@@ -418,50 +463,409 @@
 	return __ret;
 }
 
-/*
- * Check if there is any memory pressure (free_pages < pages_low)
+
+/**
+ * reclaim_page -	reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean list can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
  */
-static inline int memory_pressure(void)
+struct page * reclaim_page(zone_t * zone)
 {
-	pg_data_t *pgdat = pgdat_list;
+	struct page * page = NULL;
+	struct list_head * page_lru;
+	int maxscan;
 
-	do {
-		int i;
-		for(i = 0; i < MAX_NR_ZONES; i++) {
-			zone_t *zone = pgdat->node_zones+ i;
-			if (zone->size &&
-			    zone->free_pages < zone->pages_low)
-				return 1;
+	/*
+	 * We only need the pagemap_lru_lock if we don't reclaim the page,
+	 * but we have to grab the pagecache_lock before the pagemap_lru_lock
+	 * to avoid deadlocks, and most of the time we'll succeed anyway.
+	 */
+	spin_lock(&pagecache_lock);
+	spin_lock(&pagemap_lru_lock);
+	maxscan = zone->inactive_clean_pages;
+	while ((page_lru = zone->inactive_clean_list.prev) !=
+			&zone->inactive_clean_list && maxscan--) {
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageInactiveClean(page)) {
+			printk("VM: reclaim_page, wrong page on list.\n");
+			list_del(page_lru);
+			page->zone->inactive_clean_pages--;
+			continue;
 		}
-		pgdat = pgdat->node_next;
-	} while (pgdat);
 
-	return 0;
+		/* Page is or was in use?  Move it to the active list. */
+		if (PageTestandClearReferenced(page) || page->age > 0 ||
+				(!page->buffers && page_count(page) > 1)) {
+			del_page_from_inactive_clean_list(page);
+			add_page_to_active_list(page);
+			continue;
+		}
+
+		/* The page is dirty or locked: move it to the inactive_dirty list. */
+		if (page->buffers || TryLockPage(page)) {
+			del_page_from_inactive_clean_list(page);
+			add_page_to_inactive_dirty_list(page);
+			continue;
+		}
+
+		/* OK, remove the page from the caches. */
+		if (PageSwapCache(page)) {
+			__delete_from_swap_cache(page);
+			goto found_page;
+		}
+
+		if (page->mapping) {
+			__remove_inode_page(page);
+			goto found_page;
+		}
+
+		/* We should never ever get here. */
+		printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+		list_del(page_lru);
+		zone->inactive_clean_pages--;
+		UnlockPage(page);
+	}
+	/* Reset page pointer, maybe we encountered an unfreeable page. */
+	page = NULL;
+	goto out;
+
+found_page:
+	del_page_from_inactive_clean_list(page);
+	UnlockPage(page);
+	page->age = PAGE_AGE_START;
+	if (page_count(page) != 1)
+		printk("VM: reclaim_page, found page with count %d!\n",
+				page_count(page));
+out:
+	spin_unlock(&pagemap_lru_lock);
+	spin_unlock(&pagecache_lock);
+	memory_pressure++;
+	return page;
+}
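
A page returned by reclaim_page() is clean, unmapped, off every LRU
list and has its age reset, so the caller can hand it out directly.
A hypothetical caller, sketched (the real call sites are kreclaimd()
below and the allocator in page_alloc.c):

	/* Hypothetical caller - not part of this patch. */
	struct page *page = reclaim_page(zone);
	if (page)
		return page;	/* count == 1, age == PAGE_AGE_START */
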
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: should we wait synchronously for the cleaning of pages
+ *
+ * When this function is called, we are most likely low on free +
+ * inactive_clean pages. Since we want to refill those pages as
+ * soon as possible, we'll make two loops over the inactive_dirty list,
+ * one to move the already cleaned pages to the inactive_clean lists
+ * and one to (often asynchronously) clean the dirty inactive pages.
+ *
+ * In situations where kswapd cannot keep up, user processes will
+ * end up calling this function. Since the user process needs to
+ * have a page before it can continue with its allocation, we'll
+ * do synchronous page flushing in that case.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define MAX_LAUNDER 		(4 * (1 << page_cluster))
+int page_launder(int gfp_mask, int sync)
+{
+	int launder_loop, maxscan, cleaned_pages, maxlaunder;
+	int can_get_io_locks;
+	struct list_head * page_lru;
+	struct page * page;
+
+	/*
+	 * We can only grab the IO locks (eg. for flushing dirty
+	 * buffers to disk) if __GFP_IO is set.
+	 */
+	can_get_io_locks = gfp_mask & __GFP_IO;
+
+	launder_loop = 0;
+	maxlaunder = 0;
+	cleaned_pages = 0;
+
+dirty_page_rescan:
+	spin_lock(&pagemap_lru_lock);
+	maxscan = nr_inactive_dirty_pages;
+	while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+				maxscan-- > 0) {
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageInactiveDirty(page)) {
+			printk("VM: page_launder, wrong page on list.\n");
+			list_del(page_lru);
+			nr_inactive_dirty_pages--;
+			page->zone->inactive_dirty_pages--;
+			continue;
+		}
+
+		/* Page is or was in use?  Move it to the active list. */
+		if (PageTestandClearReferenced(page) || page->age > 0 ||
+				(!page->buffers && page_count(page) > 1) ||
+				page_ramdisk(page)) {
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			continue;
+		}
+
+		/*
+		 * The page is locked. IO in progress?
+		 * Move it to the back of the list.
+		 */
+		if (TryLockPage(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &inactive_dirty_list);
+			continue;
+		}
+
+		/*
+		 * If the page has buffers, try to free the buffer mappings
+		 * associated with this page. If we succeed we either free
+		 * the page (in case it was a buffercache only page) or we
+		 * move the page to the inactive_clean list.
+		 *
+		 * On the first round, we should free all previously cleaned
+		 * buffer pages.
+		 */
+		if (page->buffers) {
+			int wait, clearedbuf;
+			int freed_page = 0;
+			/*
+			 * Since we might be doing disk IO, we have to
+			 * drop the spinlock and take an extra reference
+			 * on the page so it doesn't go away from under us.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			page_cache_get(page);
+			spin_unlock(&pagemap_lru_lock);
+
+			/* Will we do (asynchronous) IO? */
+			if (launder_loop && maxlaunder == 0 && sync)
+				wait = 2;	/* Synchronous IO */
+			else if (launder_loop && maxlaunder-- > 0)
+				wait = 1;	/* Async IO */
+			else
+				wait = 0;	/* No IO */
+
+			/* Try to free the page buffers. */
+			clearedbuf = try_to_free_buffers(page, wait);
+
+			/*
+			 * Re-take the spinlock. Note that we cannot
+			 * unlock the page yet since we're still
+			 * accessing the page_struct here...
+			 */
+			spin_lock(&pagemap_lru_lock);
+
+			/* The buffers were not freed. */
+			if (!clearedbuf) {
+				add_page_to_inactive_dirty_list(page);
+
+			/* The page was only in the buffer cache. */
+			} else if (!page->mapping) {
+				atomic_dec(&buffermem_pages);
+				freed_page = 1;
+				cleaned_pages++;
+
+			/* The page has more users besides the cache and us. */
+			} else if (page_count(page) > 2) {
+				add_page_to_active_list(page);
+
+			/* OK, we "created" a freeable page. */
+			} else /* page->mapping && page_count(page) == 2 */ {
+				add_page_to_inactive_clean_list(page);
+				cleaned_pages++;
+			}
+
+			/*
+			 * Unlock the page and drop the extra reference.
+			 * We can only do this here because until now we
+			 * were still accessing the page struct above.
+			 */
+			UnlockPage(page);
+			page_cache_release(page);
+
+			/* 
+			 * If we're freeing buffer cache pages, stop when
+			 * we've got enough free memory.
+			 */
+			if (freed_page && !free_shortage())
+				break;
+			continue;
+		} else if (page->mapping && !PageDirty(page)) {
+			/*
+			 * If a page had an extra reference in
+			 * deactivate_page(), we will find it here.
+			 * Now the page is really freeable, so we
+			 * move it to the inactive_clean list.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_inactive_clean_list(page);
+			UnlockPage(page);
+			cleaned_pages++;
+		} else {
+			/*
+			 * OK, we don't know what to do with the page.
+			 * It's no use keeping it here, so we move it to
+			 * the active list.
+			 */
+			del_page_from_inactive_dirty_list(page);
+			add_page_to_active_list(page);
+			UnlockPage(page);
+		}
+	}
+	spin_unlock(&pagemap_lru_lock);
+
+	/*
+	 * If we don't have enough free pages, we loop back once
+	 * to queue the dirty pages for writeout. When we were called
+	 * by a user process (that /needs/ a free page) and we didn't
+	 * free anything yet, we wait synchronously on the writeout
+	 * once the MAX_LAUNDER budget is exhausted.
+	 *
+	 * We also wake up bdflush, since bdflush should, under most
+	 * loads, flush out the dirty pages before we have to wait on
+	 * IO.
+	 */
+	if (can_get_io_locks && !launder_loop && free_shortage()) {
+		launder_loop = 1;
+		/* If we cleaned pages, never do synchronous IO. */
+		if (cleaned_pages)
+			sync = 0;
+		/* We only do a few "out of order" flushes. */
+		maxlaunder = MAX_LAUNDER;
+		/* Kflushd takes care of the rest. */
+		wakeup_bdflush(0);
+		goto dirty_page_rescan;
+	}
+
+	/* Return the number of pages moved to the inactive_clean list. */
+	return cleaned_pages;
+}
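
The wait level handed to try_to_free_buffers() above encodes three IO
policies (annotation, not patch text):

	wait == 0:	not in the launder loop, or the async budget is
			spent without sync set - free buffers only if no
			IO is needed.
	wait == 1:	launder loop - start asynchronous writeout, for
			at most MAX_LAUNDER pages.
	wait == 2:	launder loop, sync requested and the async budget
			spent - wait for the writeout to complete.

MAX_LAUNDER is 4 * (1 << page_cluster); assuming the common
page_cluster value of 4 (set by swap_setup() from the memory size),
that allows 64 asynchronous flushes per call.
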
+
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @oneshot: exit after deactivating one page
+ *
+ * This function will scan a portion of the active list to find
+ * unused pages; those pages are then moved to the inactive list.
+ */
+int refill_inactive_scan(unsigned int priority, int oneshot)
+{
+	struct list_head * page_lru;
+	struct page * page;
+	int maxscan, page_active = 0;
+	int ret = 0;
+
+	/* Take the lock while messing with the list... */
+	spin_lock(&pagemap_lru_lock);
+	maxscan = nr_active_pages >> priority;
+	while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+		page = list_entry(page_lru, struct page, lru);
+
+		/* Wrong page on list?! (list corruption, should not happen) */
+		if (!PageActive(page)) {
+			printk("VM: refill_inactive, wrong page on list.\n");
+			list_del(page_lru);
+			nr_active_pages--;
+			continue;
+		}
+
+		/* Do aging on the pages. */
+		if (PageTestandClearReferenced(page)) {
+			age_page_up_nolock(page);
+			page_active = 1;
+		} else {
+			age_page_down_ageonly(page);
+			/*
+			 * Since we don't hold a reference on the page
+			 * ourselves, we have to do our test a bit more
+			 * strictly than deactivate_page() does. This is needed
+			 * since otherwise the system could hang shuffling
+			 * unfreeable pages from the active list to the
+			 * inactive_dirty list and back again...
+			 *
+			 * SUBTLE: we can have buffer pages with count 1.
+			 */
+			if (page_count(page) <= (page->buffers ? 2 : 1)) {
+				deactivate_page_nolock(page);
+				page_active = 0;
+			} else {
+				page_active = 1;
+			}
+		}
+		/*
+		 * If the page is still on the active list, move it
+		 * to the other end of the list. Otherwise it was
+		 * deactivated above and we exit successfully.
+		 */
+		if (page_active || PageActive(page)) {
+			list_del(page_lru);
+			list_add(page_lru, &active_list);
+		} else {
+			ret = 1;
+			if (oneshot)
+				break;
+		}
+	}
+	spin_unlock(&pagemap_lru_lock);
+
+	return ret;
 }
 
 /*
- * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
  */
-static inline int keep_kswapd_awake(void)
+int free_shortage(void)
 {
-	int all_recent = 1;
 	pg_data_t *pgdat = pgdat_list;
+	int sum = 0;
+	int freeable = nr_free_pages() + nr_inactive_clean_pages();
+	int freetarget = freepages.high + inactive_target / 3;
+
+	/* Are we low on free pages globally? */
+	if (freeable < freetarget)
+		return freetarget - freeable;
 
+	/* If not, are we very low on any particular zone? */
 	do {
 		int i;
 		for(i = 0; i < MAX_NR_ZONES; i++) {
 			zone_t *zone = pgdat->node_zones+ i;
-			if (zone->size) {
-				if (zone->free_pages < zone->pages_min)
-					return 1;
-				if (!zone->zone_wake_kswapd)
-					all_recent = 0;
+			if (zone->size && (zone->inactive_clean_pages +
+					zone->free_pages < zone->pages_min)) {
+				sum += zone->pages_min;
+				sum -= zone->free_pages;
+				sum -= zone->inactive_clean_pages;
 			}
 		}
 		pgdat = pgdat->node_next;
 	} while (pgdat);
 
-	return all_recent;
+	return sum;
+}
+
+/*
+ * How many inactive pages are we short?
+ */
+int inactive_shortage(void)
+{
+	int shortage = 0;
+
+	shortage += freepages.high;
+	shortage += inactive_target;
+	shortage -= nr_free_pages();
+	shortage -= nr_inactive_clean_pages();
+	shortage -= nr_inactive_dirty_pages;
+
+	if (shortage > 0)
+		return shortage;
+
+	return 0;
 }
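
Both shortage functions now return a page count instead of a boolean,
so callers can size their work by the deficit. A purely illustrative
example: with freepages.high = 1024, inactive_target = 3072, 512 free
pages and 256 inactive_clean pages, free_shortage() computes
freetarget = 1024 + 3072/3 = 2048 against freeable = 768 and returns a
shortage of 1280 pages.
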
 
 /*
@@ -472,96 +876,140 @@
  * We want to try to free "count" pages, and we want to 
  * cluster them so that we get good swap-out behaviour.
  *
- * Don't try _too_ hard, though. We don't want to have bad
- * latency.
- *
- * Note: only called by kswapd and try_to_free_pages
- *       both can WAIT at top level.
+ * OTOH, if we're a user process (and not kswapd), we
+ * really care about latency. In that case we don't try
+ * to free too many pages.
  */
-#define FREE_COUNT	8
-#define SWAP_COUNT	16
-static int do_try_to_free_pages(unsigned int gfp_mask)
-{
-	int priority;
-	int count = FREE_COUNT;
-	int swap_count;
+static int refill_inactive(unsigned int gfp_mask, int user)
+{
+	int priority, count, start_count, made_progress;
+	unsigned long idle_time;
+
+	count = inactive_shortage() + free_shortage();
+	if (user)
+		count = (1 << page_cluster);
+	start_count = count;
 
 	/* Always trim SLAB caches when memory gets low. */
 	kmem_cache_reap(gfp_mask);
 
-	priority = 64;
+	/*
+	 * Calculate the minimum time (in seconds) a process must
+	 * have slept before we consider it for idle swapping.
+	 * This must be the number of seconds it takes to go through
+	 * all of the cache. Doing this idle swapping makes the VM
+	 * smoother once we start hitting swap.
+	 */
+	idle_time = atomic_read(&page_cache_size);
+	idle_time += atomic_read(&buffermem_pages);
+	idle_time /= (inactive_target + 1);
+
+	priority = 6;
 	do {
+		made_progress = 0;
+
 		if (current->need_resched) {
+			__set_current_state(TASK_RUNNING);
 			schedule();
-			/* time has passed - pressure too? */
-			if (!memory_pressure())
-				goto done;
 		}
 
-		while (shrink_mmap(priority, gfp_mask)) {
-			if (!--count)
+		while (refill_inactive_scan(priority, 1) ||
+				swap_out(priority, gfp_mask, idle_time)) {
+			made_progress = 1;
+			if (--count <= 0)
 				goto done;
 		}
 
-		/* check if mission completed */
-		if (!keep_kswapd_awake())
-			goto done;
+		/*
+		 * Don't be too light on the dentry/inode caches, since
+		 * refill_inactive() almost never fails when there's
+		 * really plenty of memory free.
+		 */
+		shrink_dcache_memory(priority, gfp_mask);
+		shrink_icache_memory(priority, gfp_mask);
 
 		/* Try to get rid of some shared memory pages.. */
-		if (gfp_mask & __GFP_IO) {
-			/*
-			 * don't be too light against the d/i cache since
-		   	 * shrink_mmap() almost never fail when there's
-		   	 * really plenty of memory free. 
-			 */
-			count -= shrink_dcache_memory(priority, gfp_mask);
-			count -= shrink_icache_memory(priority, gfp_mask);
-			/*
-			 * Not currently working, see fixme in shrink_?cache_memory
-			 * In the inner funtions there is a comment:
-			 * "To help debugging, a zero exit status indicates
-			 *  all slabs were released." (-arca?)
-			 * lets handle it in a primitive but working way...
-			 *	if (count <= 0)
-			 *		goto done;
-			 */
-			if (!keep_kswapd_awake())
+		while (shm_swap(priority, gfp_mask)) {
+			made_progress = 1;
+			if (--count <= 0)
 				goto done;
-
-			while (shm_swap(priority, gfp_mask)) {
-				if (!--count)
-					goto done;
-			}
 		}
 
 		/*
 		 * Then, try to page stuff out..
-		 *
-		 * This will not actually free any pages (they get
-		 * put in the swap cache), so we must not count this
-		 * as a "count" success.
-		 */
-		swap_count = SWAP_COUNT;
-		while (swap_out(priority, gfp_mask))
-			if (--swap_count < 0)
-				break;
+		 */
+		while (swap_out(priority, gfp_mask, 0)) {
+			made_progress = 1;
+			if (--count <= 0)
+				goto done;
+		}
 
-	} while (--priority >= 0);
+		/*
+		 * If we either have enough free memory, or if
+		 * page_launder() will be able to make enough
+		 * free memory, then stop.
+		 */
+		if (!inactive_shortage() || !free_shortage())
+			goto done;
 
-	/* Always end on a shrink_mmap.., may sleep... */
-	while (shrink_mmap(0, gfp_mask)) {
-		if (!--count)
+		/*
+		 * Only switch to a lower "priority" if we
+		 * didn't make any useful progress in the
+		 * last loop.
+		 */
+		if (!made_progress)
+			priority--;
+	} while (priority >= 0);
+
+	/* Always end on a refill_inactive.., may sleep... */
+	while (refill_inactive_scan(0, 1)) {
+		if (--count <= 0)
 			goto done;
 	}
-	/* Return 1 if any page is freed, or
-	 * there are no more memory pressure   */
-	return (count < FREE_COUNT || !keep_kswapd_awake());
- 
+
 done:
-	return 1;
+	return (count < start_count);
+}
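
The idle_time computed at the top of refill_inactive() is
(page_cache_size + buffermem_pages) / (inactive_target + 1), roughly
the number of seconds it takes to cycle the cache through the inactive
list. Illustrative numbers: with page cache plus buffer memory
totalling 65536 pages and an inactive_target of 2047, idle_time comes
to 32, so only tasks that have been interruptibly asleep for more than
32 seconds are eligible for idle swapping; the later swap_out() call in
the same loop passes idle_time 0, lifting that restriction.
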
+
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+{
+	int ret = 0;
+
+	/*
+	 * If we're low on free pages, move pages from the
+	 * inactive_dirty list to the inactive_clean list.
+	 *
+	 * Usually bdflush will have pre-cleaned the pages
+	 * before we get around to moving them to the other
+	 * list, so this is a relatively cheap operation.
+	 */
+	if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
+			nr_inactive_clean_pages())
+		ret += page_launder(gfp_mask, user);
+
+	/*
+	 * If needed, we move pages from the active list
+	 * to the inactive list. We also "eat" pages from
+	 * the inode and dentry cache whenever we do this.
+	 */
+	if (free_shortage() || inactive_shortage()) {
+		shrink_dcache_memory(6, gfp_mask);
+		shrink_icache_memory(6, gfp_mask);
+		ret += refill_inactive(gfp_mask, user);
+	} else {
+		/*
+		 * Reclaim unused slab cache memory.
+		 */
+		kmem_cache_reap(gfp_mask);
+		ret = 1;
+	}
+
+	return ret;
 }
 
 DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+struct task_struct *kswapd_task;
 
 /*
  * The background pageout daemon, started as a kernel thread
@@ -584,6 +1032,7 @@
 	tsk->pgrp = 1;
 	strcpy(tsk->comm, "kswapd");
 	sigfillset(&tsk->blocked);
+	kswapd_task = tsk;
 	
 	/*
 	 * Tell the memory management that we're a "memory allocator",
@@ -599,54 +1048,166 @@
 	 */
 	tsk->flags |= PF_MEMALLOC;
 
+	/*
+	 * Kswapd main loop.
+	 */
 	for (;;) {
-		if (!keep_kswapd_awake()) {
-			interruptible_sleep_on(&kswapd_wait);
+		static unsigned long recalc = 0;
+
+		/* If needed, try to free some memory. */
+		if (inactive_shortage() || free_shortage()) {
+			int wait = 0;
+			/* Do we need to do some synchronous flushing? */
+			if (waitqueue_active(&kswapd_done))
+				wait = 1;
+			do_try_to_free_pages(GFP_KSWAPD, wait);
+		}
+
+		/*
+		 * Do some (very minimal) background scanning. This
+		 * will scan all pages on the active list once
+		 * every minute. This clears old referenced bits
+		 * and moves unused pages to the inactive list.
+		 */
+		refill_inactive_scan(6, 0);
+
+		/* Once a second, recalculate some VM stats. */
+		if (time_after(jiffies, recalc + HZ)) {
+			recalc = jiffies;
+			recalculate_vm_stats();
 		}
 
-		do_try_to_free_pages(GFP_KSWAPD);
+		/*
+		 * Wake up everybody waiting for free memory
+		 * and unplug the disk queue.
+		 */
+		wake_up_all(&kswapd_done);
+		run_task_queue(&tq_disk);
+
+		/* 
+		 * We go to sleep if either the free page shortage
+		 * or the inactive page shortage is gone. We do this
+		 * because:
+		 * 1) we don't need any more free pages, or
+		 * 2) the inactive pages need to be flushed to disk first;
+		 *    burning CPU time now wouldn't help.
+		 *
+		 * We go to sleep for one second, but if it's needed
+		 * we'll be woken up earlier...
+		 */
+		if (!free_shortage() || !inactive_shortage())
+			interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+		/*
+		 * TODO: insert out of memory check & oom killer
+		 * invocation in an else branch here.
+		 */
+	}
+}
+
+void wakeup_kswapd(int block)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	if (current == kswapd_task)
+		return;
+
+	if (!block) {
+		if (waitqueue_active(&kswapd_wait))
+			wake_up(&kswapd_wait);
+		return;
 	}
+
+	/*
+	 * Kswapd could wake us up before we get a chance
+	 * to sleep, so we have to be very careful here to
+	 * prevent SMP races...
+	 */
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&kswapd_done, &wait);
+
+	if (waitqueue_active(&kswapd_wait))
+		wake_up(&kswapd_wait);
+	schedule();
+
+	remove_wait_queue(&kswapd_done, &wait);
+	__set_current_state(TASK_RUNNING);
 }
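
The block argument selects between a cheap kick and a full wait. A
hypothetical caller sketch (the real call sites are in page_alloc.c;
this is illustration only):

	/* Hypothetical callers - not part of this patch. */
	wakeup_kswapd(0);	/* kick kswapd, return immediately */

	wakeup_kswapd(1);	/* queue on kswapd_done and sleep until
				 * kswapd finishes a freeing pass */

Queueing on kswapd_done before waking kswapd is what closes the SMP
race the comment mentions: the task is already on the wait queue by the
time kswapd can issue its wake_up_all().
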
 
 /*
  * Called by non-kswapd processes when they want more
- * memory.
- *
- * In a perfect world, this should just wake up kswapd
- * and return. We don't actually want to swap stuff out
- * from user processes, because the locking issues are
- * nasty to the extreme (file write locks, and MM locking)
- *
- * One option might be to let kswapd do all the page-out
- * and VM page table scanning that needs locking, and this
- * process thread could do just the mmap shrink stage that
- * can be done by just dropping cached pages without having
- * any deadlock issues.
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
  */
 int try_to_free_pages(unsigned int gfp_mask)
 {
-	int retval = 1;
+	int ret = 1;
 
 	if (gfp_mask & __GFP_WAIT) {
-		current->state = TASK_RUNNING;
 		current->flags |= PF_MEMALLOC;
-		retval = do_try_to_free_pages(gfp_mask);
+		ret = do_try_to_free_pages(gfp_mask, 1);
 		current->flags &= ~PF_MEMALLOC;
 	}
 
-	/* someone needed memory that kswapd had not provided
-	 * make sure kswapd runs, should not happen often */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	return ret;
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances, even when kswapd is blocked on IO.
+ */
+int kreclaimd(void *unused)
+{
+	struct task_struct *tsk = current;
+	pg_data_t *pgdat;
 
-	return retval;
+	tsk->session = 1;
+	tsk->pgrp = 1;
+	strcpy(tsk->comm, "kreclaimd");
+	sigfillset(&tsk->blocked);
+	current->flags |= PF_MEMALLOC;
+
+	while (1) {
+
+		/*
+		 * We sleep until someone wakes us up from
+		 * page_alloc.c::__alloc_pages().
+		 */
+		interruptible_sleep_on(&kreclaimd_wait);
+
+		/*
+		 * Move some pages from the inactive_clean lists to
+		 * the free lists, if it is needed.
+		 */
+		pgdat = pgdat_list;
+		do {
+			int i;
+			for(i = 0; i < MAX_NR_ZONES; i++) {
+				zone_t *zone = pgdat->node_zones + i;
+				if (!zone->size)
+					continue;
+
+				while (zone->free_pages < zone->pages_low) {
+					struct page * page;
+					page = reclaim_page(zone);
+					if (!page)
+						break;
+					__free_page(page);
+				}
+			}
+			pgdat = pgdat->node_next;
+		} while (pgdat);
+	}
 }
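
kreclaimd only ever sleeps on kreclaimd_wait, so the allocator has to
wake it. A sketch of the assumed waker in page_alloc.c::__alloc_pages()
(illustration, not part of this file):

	/* Assumed waker, sketched - see page_alloc.c */
	if (zone->free_pages < zone->pages_min &&
	    waitqueue_active(&kreclaimd_wait))
		wake_up_interruptible(&kreclaimd_wait);

Because kreclaimd runs with PF_MEMALLOC set and only takes pages that
reclaim_page() can free without doing IO, it can refill zones up to
pages_low even while kswapd itself is blocked on disk.
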
 
+
 static int __init kswapd_init(void)
 {
-	printk("Starting kswapd v1.7\n");
+	printk("Starting kswapd v1.8\n");
 	swap_setup();
 	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
 	return 0;
 }
 
