patch-2.4.0-test9 linux/mm/vmscan.c
- Lines: 1070
- Date: Mon Oct 2 12:02:20 2000
- Orig file: v2.4.0-test8/linux/mm/vmscan.c
- Orig date: Fri Sep 1 14:25:26 2000
diff -u --recursive --new-file v2.4.0-test8/linux/mm/vmscan.c linux/mm/vmscan.c
@@ -9,6 +9,7 @@
* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
* Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
+ * Multiqueue VM started 5.8.00, Rik van Riel.
*/
#include <linux/slab.h>
@@ -40,6 +41,7 @@
swp_entry_t entry;
struct page * page;
int (*swapout)(struct page *, struct file *);
+ int onlist;
pte = *page_table;
if (!pte_present(pte))
@@ -51,16 +53,37 @@
if (mm->swap_cnt)
mm->swap_cnt--;
+ onlist = PageActive(page);
/* Don't look at this pte if it's been accessed recently. */
if (pte_young(pte)) {
- /*
- * Transfer the "accessed" bit from the page
- * tables to the global page map.
- */
set_pte(page_table, pte_mkold(pte));
- SetPageReferenced(page);
+ if (onlist) {
+ /*
+ * Transfer the "accessed" bit from the page
+ * tables to the global page map. Page aging
+ * will be done by refill_inactive_scan().
+ */
+ SetPageReferenced(page);
+ } else {
+ /*
+ * The page is not on the active list, so
+ * we have to do the page aging ourselves.
+ */
+ age_page_up(page);
+ }
goto out_failed;
}
+ if (!onlist)
+ /* The page is still mapped, so it can't be freeable... */
+ age_page_down_ageonly(page);
+
+ /*
+ * If the page is in active use by us, or if the page
+ * is in active use by others, don't unmap it or
+ * (worse) start unneeded IO.
+ */
+ if (page->age > 0)
+ goto out_failed;
if (TryLockPage(page))
goto out_failed;
@@ -79,8 +102,9 @@
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
UnlockPage(page);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
+ deactivate_page(page);
page_cache_release(page);
goto out_failed;
}
@@ -96,7 +120,7 @@
* our scan.
*
* Basically, this just makes it possible for us to do
- * some real work in the future in "shrink_mmap()".
+ * some real work in the future in "refill_inactive()".
*/
if (!pte_dirty(pte)) {
flush_cache_page(vma, address);
@@ -116,7 +140,9 @@
* Don't do any of the expensive stuff if
* we're not really interested in this zone.
*/
- if (page->zone->free_pages > page->zone->pages_high)
+ if (page->zone->free_pages + page->zone->inactive_clean_pages
+ + page->zone->inactive_dirty_pages
+ > page->zone->pages_high + inactive_target)
goto out_unlock;
/*
@@ -134,7 +160,7 @@
* NOTE NOTE NOTE! This should just set a
* dirty bit in 'page', and just drop the
* pte. All the hard work would be done by
- * shrink_mmap().
+ * refill_inactive().
*
* That would get rid of a lot of problems.
*/
@@ -144,14 +170,15 @@
struct file *file = vma->vm_file;
if (file) get_file(file);
pte_clear(page_table);
- vma->vm_mm->rss--;
+ mm->rss--;
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
error = swapout(page, file);
UnlockPage(page);
if (file) fput(file);
if (!error)
goto out_free_success;
+ deactivate_page(page);
page_cache_release(page);
return error;
}
@@ -175,13 +202,14 @@
add_to_swap_cache(page, entry);
/* Put the swap entry into the pte after the page is in swapcache */
- vma->vm_mm->rss--;
+ mm->rss--;
set_pte(page_table, swp_entry_to_pte(entry));
flush_tlb_page(vma, address);
- vmlist_access_unlock(vma->vm_mm);
+ vmlist_access_unlock(mm);
/* OK, do a physical asynchronous write to swap. */
rw_swap_page(WRITE, page, 0);
+ deactivate_page(page);
out_free_success:
page_cache_release(page);
@@ -230,7 +258,7 @@
do {
int result;
- vma->vm_mm->swap_address = address + PAGE_SIZE;
+ mm->swap_address = address + PAGE_SIZE;
result = try_to_swap_out(mm, vma, address, pte, gfp_mask);
if (result)
return result;
@@ -282,7 +310,7 @@
if (vma->vm_flags & VM_LOCKED)
return 0;
- pgdir = pgd_offset(vma->vm_mm, address);
+ pgdir = pgd_offset(mm, address);
end = vma->vm_end;
if (address >= end)
@@ -323,17 +351,22 @@
int result = swap_out_vma(mm, vma, address, gfp_mask);
if (result)
return result;
+ if (!mm->swap_cnt)
+ goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
address = vma->vm_start;
}
}
+ /* Reset to 0 when we reach the end of address space */
+ mm->swap_address = 0;
+ mm->swap_cnt = 0;
+
+out_unlock:
vmlist_access_unlock(mm);
/* We didn't find anything for the process */
- mm->swap_cnt = 0;
- mm->swap_address = 0;
return 0;
}
@@ -342,7 +375,10 @@
* N.B. This function returns only 0 or 1. Return values != 1 from
* the lower level routines result in continued processing.
*/
-static int swap_out(unsigned int priority, int gfp_mask)
+#define SWAP_SHIFT 5
+#define SWAP_MIN 8
+
+static int swap_out(unsigned int priority, int gfp_mask, unsigned long idle_time)
{
struct task_struct * p;
int counter;
@@ -363,7 +399,7 @@
* Think of swap_cnt as a "shadow rss" - it tells us which process
* we want to page out (always try largest first).
*/
- counter = (nr_threads << 2) >> (priority >> 2);
+ counter = (nr_threads << SWAP_SHIFT) >> priority;
if (counter < 1)
counter = 1;
@@ -372,6 +408,7 @@
struct mm_struct *best = NULL;
int pid = 0;
int assign = 0;
+ int found_task = 0;
select:
read_lock(&tasklist_lock);
p = init_task.next_task;
@@ -381,9 +418,17 @@
continue;
if (mm->rss <= 0)
continue;
+ /* Skip tasks which haven't slept long enough yet when idle-swapping. */
+ if (idle_time && !assign && (!(p->state & TASK_INTERRUPTIBLE) ||
+ time_after(p->sleep_time + idle_time * HZ, jiffies)))
+ continue;
+ found_task++;
/* Refresh swap_cnt? */
- if (assign == 1)
- mm->swap_cnt = mm->rss;
+ if (assign == 1) {
+ mm->swap_cnt = (mm->rss >> SWAP_SHIFT);
+ if (mm->swap_cnt < SWAP_MIN)
+ mm->swap_cnt = SWAP_MIN;
+ }
if (mm->swap_cnt > max_cnt) {
max_cnt = mm->swap_cnt;
best = mm;
@@ -392,7 +437,7 @@
}
read_unlock(&tasklist_lock);
if (!best) {
- if (!assign) {
+ if (!assign && found_task > 0) {
assign = 1;
goto select;
}
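
The new quotas are easier to judge with concrete numbers. Below is a minimal standalone sketch of the arithmetic the hunk above introduces; only SWAP_SHIFT and SWAP_MIN come from the patch, the thread count and RSS figures are invented:

#include <stdio.h>

#define SWAP_SHIFT 5
#define SWAP_MIN   8

/* Number of victim-selection attempts swap_out() makes per invocation. */
static int victim_attempts(int nr_threads, unsigned int priority)
{
	int counter = (nr_threads << SWAP_SHIFT) >> priority;
	return counter < 1 ? 1 : counter;
}

/* Per-pass scan quota (mm->swap_cnt) given to the chosen process. */
static unsigned long scan_quota(unsigned long rss)
{
	unsigned long cnt = rss >> SWAP_SHIFT;
	return cnt < SWAP_MIN ? SWAP_MIN : cnt;
}

int main(void)
{
	/* Invented workload: 200 threads, one 64000-page mm, one 100-page mm. */
	printf("attempts at priority 6: %d\n", victim_attempts(200, 6)); /* 100  */
	printf("attempts at priority 0: %d\n", victim_attempts(200, 0)); /* 6400 */
	printf("quota for rss=64000:    %lu\n", scan_quota(64000));      /* 2000 */
	printf("quota for rss=100:      %lu\n", scan_quota(100));        /* 8    */
	return 0;
}

So at the initial priority a chosen mm only has about 1/32nd of its resident pages examined before swap_out() moves on, which keeps any single process from being drained in one pass.
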
@@ -418,50 +463,409 @@
return __ret;
}
-/*
- * Check if there is any memory pressure (free_pages < pages_low)
+
+/**
+ * reclaim_page - reclaims one page from the inactive_clean list
+ * @zone: reclaim a page from this zone
+ *
+ * The pages on the inactive_clean list can be instantly reclaimed.
+ * The tests look impressive, but most of the time we'll grab
+ * the first page of the list and exit successfully.
*/
-static inline int memory_pressure(void)
+struct page * reclaim_page(zone_t * zone)
{
- pg_data_t *pgdat = pgdat_list;
+ struct page * page = NULL;
+ struct list_head * page_lru;
+ int maxscan;
- do {
- int i;
- for(i = 0; i < MAX_NR_ZONES; i++) {
- zone_t *zone = pgdat->node_zones+ i;
- if (zone->size &&
- zone->free_pages < zone->pages_low)
- return 1;
+ /*
+ * We only need the pagemap_lru_lock if we don't reclaim the page,
+ * but we have to grab the pagecache_lock before the pagemap_lru_lock
+ * to avoid deadlocks and most of the time we'll succeed anyway.
+ */
+ spin_lock(&pagecache_lock);
+ spin_lock(&pagemap_lru_lock);
+ maxscan = zone->inactive_clean_pages;
+ while ((page_lru = zone->inactive_clean_list.prev) !=
+ &zone->inactive_clean_list && maxscan--) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveClean(page)) {
+ printk("VM: reclaim_page, wrong page on list.\n");
+ list_del(page_lru);
+ page->zone->inactive_clean_pages--;
+ continue;
}
- pgdat = pgdat->node_next;
- } while (pgdat);
- return 0;
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /* The page is dirty or locked; move it to the inactive_dirty list. */
+ if (page->buffers || TryLockPage(page)) {
+ del_page_from_inactive_clean_list(page);
+ add_page_to_inactive_dirty_list(page);
+ continue;
+ }
+
+ /* OK, remove the page from the caches. */
+ if (PageSwapCache(page)) {
+ __delete_from_swap_cache(page);
+ goto found_page;
+ }
+
+ if (page->mapping) {
+ __remove_inode_page(page);
+ goto found_page;
+ }
+
+ /* We should never ever get here. */
+ printk(KERN_ERR "VM: reclaim_page, found unknown page\n");
+ list_del(page_lru);
+ zone->inactive_clean_pages--;
+ UnlockPage(page);
+ }
+ /* Reset page pointer, maybe we encountered an unfreeable page. */
+ page = NULL;
+ goto out;
+
+found_page:
+ del_page_from_inactive_clean_list(page);
+ UnlockPage(page);
+ page->age = PAGE_AGE_START;
+ if (page_count(page) != 1)
+ printk("VM: reclaim_page, found page with count %d!\n",
+ page_count(page));
+out:
+ spin_unlock(&pagemap_lru_lock);
+ spin_unlock(&pagecache_lock);
+ memory_pressure++;
+ return page;
+}
+
+/**
+ * page_launder - clean dirty inactive pages, move to inactive_clean list
+ * @gfp_mask: what operations we are allowed to do
+ * @sync: should we wait synchronously for the cleaning of pages
+ *
+ * When this function is called, we are most likely low on free +
+ * inactive_clean pages. Since we want to refill those pages as
+ * soon as possible, we'll make two loops over the inactive list,
+ * one to move the already cleaned pages to the inactive_clean lists
+ * and one to (often asynchronously) clean the dirty inactive pages.
+ *
+ * In situations where kswapd cannot keep up, user processes will
+ * end up calling this function. Since the user process needs to
+ * have a page before it can continue with its allocation, we'll
+ * do synchronous page flushing in that case.
+ *
+ * This code is heavily inspired by the FreeBSD source code. Thanks
+ * go out to Matthew Dillon.
+ */
+#define MAX_LAUNDER (4 * (1 << page_cluster))
+int page_launder(int gfp_mask, int sync)
+{
+ int launder_loop, maxscan, cleaned_pages, maxlaunder;
+ int can_get_io_locks;
+ struct list_head * page_lru;
+ struct page * page;
+
+ /*
+ * We can only grab the IO locks (eg. for flushing dirty
+ * buffers to disk) if __GFP_IO is set.
+ */
+ can_get_io_locks = gfp_mask & __GFP_IO;
+
+ launder_loop = 0;
+ maxlaunder = 0;
+ cleaned_pages = 0;
+
+dirty_page_rescan:
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_inactive_dirty_pages;
+ while ((page_lru = inactive_dirty_list.prev) != &inactive_dirty_list &&
+ maxscan-- > 0) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageInactiveDirty(page)) {
+ printk("VM: page_launder, wrong page on list.\n");
+ list_del(page_lru);
+ nr_inactive_dirty_pages--;
+ page->zone->inactive_dirty_pages--;
+ continue;
+ }
+
+ /* Page is or was in use? Move it to the active list. */
+ if (PageTestandClearReferenced(page) || page->age > 0 ||
+ (!page->buffers && page_count(page) > 1) ||
+ page_ramdisk(page)) {
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ continue;
+ }
+
+ /*
+ * The page is locked. IO in progress?
+ * Move it to the back of the list.
+ */
+ if (TryLockPage(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &inactive_dirty_list);
+ continue;
+ }
+
+ /*
+ * If the page has buffers, try to free the buffer mappings
+ * associated with this page. If we succeed we either free
+ * the page (in case it was a buffercache only page) or we
+ * move the page to the inactive_clean list.
+ *
+ * On the first round, we should free all previously cleaned
+ * buffer pages
+ */
+ if (page->buffers) {
+ int wait, clearedbuf;
+ int freed_page = 0;
+ /*
+ * Since we might be doing disk IO, we have to
+ * drop the spinlock and take an extra reference
+ * on the page so it doesn't go away from under us.
+ */
+ del_page_from_inactive_dirty_list(page);
+ page_cache_get(page);
+ spin_unlock(&pagemap_lru_lock);
+
+ /* Will we do (asynchronous) IO? */
+ if (launder_loop && maxlaunder == 0 && sync)
+ wait = 2; /* Synchronous IO */
+ else if (launder_loop && maxlaunder-- > 0)
+ wait = 1; /* Async IO */
+ else
+ wait = 0; /* No IO */
+
+ /* Try to free the page buffers. */
+ clearedbuf = try_to_free_buffers(page, wait);
+
+ /*
+ * Re-take the spinlock. Note that we cannot
+ * unlock the page yet since we're still
+ * accessing the page_struct here...
+ */
+ spin_lock(&pagemap_lru_lock);
+
+ /* The buffers were not freed. */
+ if (!clearedbuf) {
+ add_page_to_inactive_dirty_list(page);
+
+ /* The page was only in the buffer cache. */
+ } else if (!page->mapping) {
+ atomic_dec(&buffermem_pages);
+ freed_page = 1;
+ cleaned_pages++;
+
+ /* The page has more users besides the cache and us. */
+ } else if (page_count(page) > 2) {
+ add_page_to_active_list(page);
+
+ /* OK, we "created" a freeable page. */
+ } else /* page->mapping && page_count(page) == 2 */ {
+ add_page_to_inactive_clean_list(page);
+ cleaned_pages++;
+ }
+
+ /*
+ * Unlock the page and drop the extra reference.
+ * We can only do it here because we are accessing
+ * the page struct above.
+ */
+ UnlockPage(page);
+ page_cache_release(page);
+
+ /*
+ * If we're freeing buffer cache pages, stop when
+ * we've got enough free memory.
+ */
+ if (freed_page && !free_shortage())
+ break;
+ continue;
+ } else if (page->mapping && !PageDirty(page)) {
+ /*
+ * If a page had an extra reference in
+ * deactivate_page(), we will find it here.
+ * Now the page is really freeable, so we
+ * move it to the inactive_clean list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_inactive_clean_list(page);
+ UnlockPage(page);
+ cleaned_pages++;
+ } else {
+ /*
+ * OK, we don't know what to do with the page.
+ * It's no use keeping it here, so we move it to
+ * the active list.
+ */
+ del_page_from_inactive_dirty_list(page);
+ add_page_to_active_list(page);
+ UnlockPage(page);
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ /*
+ * If we don't have enough free pages, we loop back once
+ * to queue the dirty pages for writeout. When we were called
+ * by a user process (that /needs/ a free page) and we didn't
+ * free anything yet, we wait synchronously on the writeout of
+ * MAX_SYNC_LAUNDER pages.
+ *
+ * We also wake up bdflush, since bdflush should, under most
+ * loads, flush out the dirty pages before we have to wait on
+ * IO.
+ */
+ if (can_get_io_locks && !launder_loop && free_shortage()) {
+ launder_loop = 1;
+ /* If we cleaned pages, never do synchronous IO. */
+ if (cleaned_pages)
+ sync = 0;
+ /* We only do a few "out of order" flushes. */
+ maxlaunder = MAX_LAUNDER;
+ /* Kflushd takes care of the rest. */
+ wakeup_bdflush(0);
+ goto dirty_page_rescan;
+ }
+
+ /* Return the number of pages moved to the inactive_clean list. */
+ return cleaned_pages;
+}
+
+/**
+ * refill_inactive_scan - scan the active list and find pages to deactivate
+ * @priority: the priority at which to scan
+ * @oneshot: exit after deactivating one page
+ *
+ * This function will scan a portion of the active list to find
+ * unused pages, those pages will then be moved to the inactive list.
+ */
+int refill_inactive_scan(unsigned int priority, int oneshot)
+{
+ struct list_head * page_lru;
+ struct page * page;
+ int maxscan, page_active = 0;
+ int ret = 0;
+
+ /* Take the lock while messing with the list... */
+ spin_lock(&pagemap_lru_lock);
+ maxscan = nr_active_pages >> priority;
+ while (maxscan-- > 0 && (page_lru = active_list.prev) != &active_list) {
+ page = list_entry(page_lru, struct page, lru);
+
+ /* Wrong page on list?! (list corruption, should not happen) */
+ if (!PageActive(page)) {
+ printk("VM: refill_inactive, wrong page on list.\n");
+ list_del(page_lru);
+ nr_active_pages--;
+ continue;
+ }
+
+ /* Do aging on the pages. */
+ if (PageTestandClearReferenced(page)) {
+ age_page_up_nolock(page);
+ page_active = 1;
+ } else {
+ age_page_down_ageonly(page);
+ /*
+ * Since we don't hold a reference on the page
+ * ourselves, we have to do our test a bit more
+ * strict than deactivate_page(). This is needed
+ * since otherwise the system could hang shuffling
+ * unfreeable pages from the active list to the
+ * inactive_dirty list and back again...
+ *
+ * SUBTLE: we can have buffer pages with count 1.
+ */
+ if (page_count(page) <= (page->buffers ? 2 : 1)) {
+ deactivate_page_nolock(page);
+ page_active = 0;
+ } else {
+ page_active = 1;
+ }
+ }
+ /*
+ * If the page is still on the active list, move it
+ * to the other end of the list. Otherwise it was
+ * deactivated by age_page_down and we exit successfully.
+ */
+ if (page_active || PageActive(page)) {
+ list_del(page_lru);
+ list_add(page_lru, &active_list);
+ } else {
+ ret = 1;
+ if (oneshot)
+ break;
+ }
+ }
+ spin_unlock(&pagemap_lru_lock);
+
+ return ret;
}
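
The reference-count test in the scan above is the subtle part: a page held only by the page cache is expected to have count 1, and one that also carries buffer heads may have count 2, so anything above that means someone else still uses the page and it must not be deactivated. A standalone restatement of the predicate (the struct and field names are placeholders, not the real struct page):

#include <stdio.h>

struct fake_page {
	int count;        /* stands in for page_count(page)      */
	int has_buffers;  /* stands in for page->buffers != NULL */
};

/* Mirrors the check: page_count(page) <= (page->buffers ? 2 : 1) */
static int only_cache_holds_it(const struct fake_page *p)
{
	return p->count <= (p->has_buffers ? 2 : 1);
}

int main(void)
{
	struct fake_page cache_only  = { 1, 0 };  /* page cache reference only  */
	struct fake_page buffer_page = { 2, 1 };  /* page cache + buffer heads  */
	struct fake_page mapped_page = { 3, 0 };  /* somebody else holds it too */

	printf("%d %d %d\n",
	       only_cache_holds_it(&cache_only),   /* 1: may be deactivated */
	       only_cache_holds_it(&buffer_page),  /* 1: may be deactivated */
	       only_cache_holds_it(&mapped_page)); /* 0: keep it active     */
	return 0;
}
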
/*
- * Check if all zones have recently had memory_pressure (zone_wake_kswapd)
+ * Check if there are zones with a severe shortage of free pages,
+ * or if all zones have a minor shortage.
*/
-static inline int keep_kswapd_awake(void)
+int free_shortage(void)
{
- int all_recent = 1;
pg_data_t *pgdat = pgdat_list;
+ int sum = 0;
+ int freeable = nr_free_pages() + nr_inactive_clean_pages();
+ int freetarget = freepages.high + inactive_target / 3;
+
+ /* Are we low on free pages globally? */
+ if (freeable < freetarget)
+ return freetarget - freeable;
+ /* If not, are we very low on any particular zone? */
do {
int i;
for(i = 0; i < MAX_NR_ZONES; i++) {
zone_t *zone = pgdat->node_zones+ i;
- if (zone->size) {
- if (zone->free_pages < zone->pages_min)
- return 1;
- if (!zone->zone_wake_kswapd)
- all_recent = 0;
+ if (zone->size && (zone->inactive_clean_pages +
+ zone->free_pages < zone->pages_min)) {
+ sum += zone->pages_min;
+ sum -= zone->free_pages;
+ sum -= zone->inactive_clean_pages;
}
}
pgdat = pgdat->node_next;
} while (pgdat);
- return all_recent;
+ return sum;
+}
+
+/*
+ * How many inactive pages are we short?
+ */
+int inactive_shortage(void)
+{
+ int shortage = 0;
+
+ shortage += freepages.high;
+ shortage += inactive_target;
+ shortage -= nr_free_pages();
+ shortage -= nr_inactive_clean_pages();
+ shortage -= nr_inactive_dirty_pages;
+
+ if (shortage > 0)
+ return shortage;
+
+ return 0;
}
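
For a feel of how the two shortage functions interact, here is a tiny standalone model of the same arithmetic; freepages.high, inactive_target and the page counts are invented placeholders, not values from a real machine:

#include <stdio.h>

/* Placeholder tunables standing in for the kernel's freepages.high
 * and inactive_target. */
static int freepages_high  = 1024;
static int inactive_target = 3072;

/* Global part of free_shortage(): free + inactive_clean vs. the target. */
static int model_free_shortage(int nr_free, int nr_inactive_clean)
{
	int freeable   = nr_free + nr_inactive_clean;
	int freetarget = freepages_high + inactive_target / 3;
	return freeable < freetarget ? freetarget - freeable : 0;
}

/* inactive_shortage(): how many more free/inactive pages are wanted. */
static int model_inactive_shortage(int nr_free, int nr_inactive_clean,
				   int nr_inactive_dirty)
{
	int shortage = freepages_high + inactive_target
		     - nr_free - nr_inactive_clean - nr_inactive_dirty;
	return shortage > 0 ? shortage : 0;
}

int main(void)
{
	/* Invented state: 500 free, 300 inactive_clean, 900 inactive_dirty. */
	printf("free shortage:     %d\n", model_free_shortage(500, 300));
	printf("inactive shortage: %d\n",
	       model_inactive_shortage(500, 300, 900));
	return 0;
}

(The real free_shortage() additionally sums per-zone deficits when any zone drops below its pages_min, which the model above leaves out.)
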
/*
@@ -472,96 +876,140 @@
* We want to try to free "count" pages, and we want to
* cluster them so that we get good swap-out behaviour.
*
- * Don't try _too_ hard, though. We don't want to have bad
- * latency.
- *
- * Note: only called by kswapd and try_to_free_pages
- * both can WAIT at top level.
+ * OTOH, if we're a user process (and not kswapd), we
+ * really care about latency. In that case we don't try
+ * to free too many pages.
*/
-#define FREE_COUNT 8
-#define SWAP_COUNT 16
-static int do_try_to_free_pages(unsigned int gfp_mask)
-{
- int priority;
- int count = FREE_COUNT;
- int swap_count;
+static int refill_inactive(unsigned int gfp_mask, int user)
+{
+ int priority, count, start_count, made_progress;
+ unsigned long idle_time;
+
+ count = inactive_shortage() + free_shortage();
+ if (user)
+ count = (1 << page_cluster);
+ start_count = count;
/* Always trim SLAB caches when memory gets low. */
kmem_cache_reap(gfp_mask);
- priority = 64;
+ /*
+ * Calculate the minimum time (in seconds) a process must
+ * have slept before we consider it for idle swapping.
+ * This must be the number of seconds it takes to go through
+ * all of the cache. Doing this idle swapping makes the VM
+ * smoother once we start hitting swap.
+ */
+ idle_time = atomic_read(&page_cache_size);
+ idle_time += atomic_read(&buffermem_pages);
+ idle_time /= (inactive_target + 1);
+
+ priority = 6;
do {
+ made_progress = 0;
+
if (current->need_resched) {
+ __set_current_state(TASK_RUNNING);
schedule();
- /* time has passed - pressure too? */
- if (!memory_pressure())
- goto done;
}
- while (shrink_mmap(priority, gfp_mask)) {
- if (!--count)
+ while (refill_inactive_scan(priority, 1) ||
+ swap_out(priority, gfp_mask, idle_time)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
}
- /* check if mission completed */
- if (!keep_kswapd_awake())
- goto done;
+ /*
+ * don't be too light against the d/i cache since
+ * refill_inactive() almost never fails when there's
+ * really plenty of memory free.
+ */
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
/* Try to get rid of some shared memory pages.. */
- if (gfp_mask & __GFP_IO) {
- /*
- * don't be too light against the d/i cache since
- * shrink_mmap() almost never fail when there's
- * really plenty of memory free.
- */
- count -= shrink_dcache_memory(priority, gfp_mask);
- count -= shrink_icache_memory(priority, gfp_mask);
- /*
- * Not currently working, see fixme in shrink_?cache_memory
- * In the inner funtions there is a comment:
- * "To help debugging, a zero exit status indicates
- * all slabs were released." (-arca?)
- * lets handle it in a primitive but working way...
- * if (count <= 0)
- * goto done;
- */
- if (!keep_kswapd_awake())
+ while (shm_swap(priority, gfp_mask)) {
+ made_progress = 1;
+ if (--count <= 0)
goto done;
-
- while (shm_swap(priority, gfp_mask)) {
- if (!--count)
- goto done;
- }
}
/*
* Then, try to page stuff out..
- *
- * This will not actually free any pages (they get
- * put in the swap cache), so we must not count this
- * as a "count" success.
- */
- swap_count = SWAP_COUNT;
- while (swap_out(priority, gfp_mask))
- if (--swap_count < 0)
- break;
+ */
+ while (swap_out(priority, gfp_mask, 0)) {
+ made_progress = 1;
+ if (--count <= 0)
+ goto done;
+ }
- } while (--priority >= 0);
+ /*
+ * If we either have enough free memory, or if
+ * page_launder() will be able to make enough
+ * free memory, then stop.
+ */
+ if (!inactive_shortage() || !free_shortage())
+ goto done;
- /* Always end on a shrink_mmap.., may sleep... */
- while (shrink_mmap(0, gfp_mask)) {
- if (!--count)
+ /*
+ * Only switch to a lower "priority" if we
+ * didn't make any useful progress in the
+ * last loop.
+ */
+ if (!made_progress)
+ priority--;
+ } while (priority >= 0);
+
+ /* Always end on a refill_inactive.., may sleep... */
+ while (refill_inactive_scan(0, 1)) {
+ if (--count <= 0)
goto done;
}
- /* Return 1 if any page is freed, or
- * there are no more memory pressure */
- return (count < FREE_COUNT || !keep_kswapd_awake());
-
+
done:
- return 1;
+ return (count < start_count);
+}
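
The idle-swapping threshold computed at the top of refill_inactive() is simply the cache size divided by the inactive target, i.e. roughly the time it takes to cycle once through the cache. A small sketch with made-up page counts (the cache and target figures below are hypothetical):

#include <stdio.h>

/*
 * Seconds a task must have been sleeping before refill_inactive()
 * will consider it for idle swapping.
 */
static unsigned long idle_swap_threshold(unsigned long page_cache_pages,
					 unsigned long buffer_pages,
					 unsigned long inactive_target)
{
	return (page_cache_pages + buffer_pages) / (inactive_target + 1);
}

int main(void)
{
	/* Hypothetical: 100000 page-cache pages, 20000 buffer pages and an
	 * inactive target of 4000 pages -> only tasks idle for ~30s qualify. */
	printf("idle threshold: %lu seconds\n",
	       idle_swap_threshold(100000, 20000, 4000));
	return 0;
}

In swap_out() a task then only counts as idle if it is sleeping interruptibly and has been asleep for at least that many seconds.
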
+
+static int do_try_to_free_pages(unsigned int gfp_mask, int user)
+{
+ int ret = 0;
+
+ /*
+ * If we're low on free pages, move pages from the
+ * inactive_dirty list to the inactive_clean list.
+ *
+ * Usually bdflush will have pre-cleaned the pages
+ * before we get around to moving them to the other
+ * list, so this is a relatively cheap operation.
+ */
+ if (free_shortage() || nr_inactive_dirty_pages > nr_free_pages() +
+ nr_inactive_clean_pages())
+ ret += page_launder(gfp_mask, user);
+
+ /*
+ * If needed, we move pages from the active list
+ * to the inactive list. We also "eat" pages from
+ * the inode and dentry cache whenever we do this.
+ */
+ if (free_shortage() || inactive_shortage()) {
+ shrink_dcache_memory(6, gfp_mask);
+ shrink_icache_memory(6, gfp_mask);
+ ret += refill_inactive(gfp_mask, user);
+ } else {
+ /*
+ * Reclaim unused slab cache memory.
+ */
+ kmem_cache_reap(gfp_mask);
+ ret = 1;
+ }
+
+ return ret;
}
DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
+DECLARE_WAIT_QUEUE_HEAD(kswapd_done);
+struct task_struct *kswapd_task;
/*
* The background pageout daemon, started as a kernel thread
@@ -584,6 +1032,7 @@
tsk->pgrp = 1;
strcpy(tsk->comm, "kswapd");
sigfillset(&tsk->blocked);
+ kswapd_task = tsk;
/*
* Tell the memory management that we're a "memory allocator",
@@ -599,54 +1048,166 @@
*/
tsk->flags |= PF_MEMALLOC;
+ /*
+ * Kswapd main loop.
+ */
for (;;) {
- if (!keep_kswapd_awake()) {
- interruptible_sleep_on(&kswapd_wait);
+ static int recalc = 0;
+
+ /* If needed, try to free some memory. */
+ if (inactive_shortage() || free_shortage()) {
+ int wait = 0;
+ /* Do we need to do some synchronous flushing? */
+ if (waitqueue_active(&kswapd_done))
+ wait = 1;
+ do_try_to_free_pages(GFP_KSWAPD, wait);
+ }
+
+ /*
+ * Do some (very minimal) background scanning. This
+ * will scan all pages on the active list once
+ * every minute. This clears old referenced bits
+ * and moves unused pages to the inactive list.
+ */
+ refill_inactive_scan(6, 0);
+
+ /* Once a second, recalculate some VM stats. */
+ if (time_after(jiffies, recalc + HZ)) {
+ recalc = jiffies;
+ recalculate_vm_stats();
}
- do_try_to_free_pages(GFP_KSWAPD);
+ /*
+ * Wake up everybody waiting for free memory
+ * and unplug the disk queue.
+ */
+ wake_up_all(&kswapd_done);
+ run_task_queue(&tq_disk);
+
+ /*
+ * We go to sleep if either the free page shortage
+ * or the inactive page shortage is gone. We do this
+ * because:
+ * 1) we need no more free pages or
+ * 2) the inactive pages need to be flushed to disk,
+ * it wouldn't help to eat CPU time now ...
+ *
+ * We go to sleep for one second, but if it's needed
+ * we'll be woken up earlier...
+ */
+ if (!free_shortage() || !inactive_shortage())
+ interruptible_sleep_on_timeout(&kswapd_wait, HZ);
+ /*
+ * TODO: insert out of memory check & oom killer
+ * invocation in an else branch here.
+ */
+ }
+}
+
+void wakeup_kswapd(int block)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ if (current == kswapd_task)
+ return;
+
+ if (!block) {
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ return;
}
+
+ /*
+ * Kswapd could wake us up before we get a chance
+ * to sleep, so we have to be very careful here to
+ * prevent SMP races...
+ */
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ add_wait_queue(&kswapd_done, &wait);
+
+ if (waitqueue_active(&kswapd_wait))
+ wake_up(&kswapd_wait);
+ schedule();
+
+ remove_wait_queue(&kswapd_done, &wait);
+ __set_current_state(TASK_RUNNING);
}
/*
* Called by non-kswapd processes when they want more
- * memory.
- *
- * In a perfect world, this should just wake up kswapd
- * and return. We don't actually want to swap stuff out
- * from user processes, because the locking issues are
- * nasty to the extreme (file write locks, and MM locking)
- *
- * One option might be to let kswapd do all the page-out
- * and VM page table scanning that needs locking, and this
- * process thread could do just the mmap shrink stage that
- * can be done by just dropping cached pages without having
- * any deadlock issues.
+ * memory but are unable to sleep on kswapd because
+ * they might be holding some IO locks ...
*/
int try_to_free_pages(unsigned int gfp_mask)
{
- int retval = 1;
+ int ret = 1;
if (gfp_mask & __GFP_WAIT) {
- current->state = TASK_RUNNING;
current->flags |= PF_MEMALLOC;
- retval = do_try_to_free_pages(gfp_mask);
+ ret = do_try_to_free_pages(gfp_mask, 1);
current->flags &= ~PF_MEMALLOC;
}
- /* someone needed memory that kswapd had not provided
- * make sure kswapd runs, should not happen often */
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ return ret;
+}
+
+DECLARE_WAIT_QUEUE_HEAD(kreclaimd_wait);
+/*
+ * Kreclaimd will move pages from the inactive_clean list to the
+ * free list, in order to keep atomic allocations possible under
+ * all circumstances. Even when kswapd is blocked on IO.
+ */
+int kreclaimd(void *unused)
+{
+ struct task_struct *tsk = current;
+ pg_data_t *pgdat;
- return retval;
+ tsk->session = 1;
+ tsk->pgrp = 1;
+ strcpy(tsk->comm, "kreclaimd");
+ sigfillset(&tsk->blocked);
+ current->flags |= PF_MEMALLOC;
+
+ while (1) {
+
+ /*
+ * We sleep until someone wakes us up from
+ * page_alloc.c::__alloc_pages().
+ */
+ interruptible_sleep_on(&kreclaimd_wait);
+
+ /*
+ * Move some pages from the inactive_clean lists to
+ * the free lists, if it is needed.
+ */
+ pgdat = pgdat_list;
+ do {
+ int i;
+ for(i = 0; i < MAX_NR_ZONES; i++) {
+ zone_t *zone = pgdat->node_zones + i;
+ if (!zone->size)
+ continue;
+
+ while (zone->free_pages < zone->pages_low) {
+ struct page * page;
+ page = reclaim_page(zone);
+ if (!page)
+ break;
+ __free_page(page);
+ }
+ }
+ pgdat = pgdat->node_next;
+ } while (pgdat);
+ }
}
+
static int __init kswapd_init(void)
{
- printk("Starting kswapd v1.7\n");
+ printk("Starting kswapd v1.8\n");
swap_setup();
kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+ kernel_thread(kreclaimd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
return 0;
}
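
Taken together, the patch replaces the single page queue with three: pages age on the active list, drift to inactive_dirty when unused, are cleaned onto inactive_clean by page_launder(), and are finally handed back to the free lists by reclaim_page() (driven by kreclaimd or the allocator). A deliberately simplified state model of that flow, as an illustration only, not kernel code:

#include <stdio.h>

enum lru_list { LRU_ACTIVE, LRU_INACTIVE_DIRTY, LRU_INACTIVE_CLEAN, LRU_FREE };

/*
 * Simplified transition rule: referenced pages are promoted back to the
 * active list, otherwise an unused page drifts
 * active -> inactive_dirty -> inactive_clean -> free.
 */
static enum lru_list next_list(enum lru_list cur, int referenced, int clean)
{
	if (referenced)
		return LRU_ACTIVE;                    /* re-activation on use       */
	switch (cur) {
	case LRU_ACTIVE:
		return LRU_INACTIVE_DIRTY;            /* refill_inactive_scan()     */
	case LRU_INACTIVE_DIRTY:
		return clean ? LRU_INACTIVE_CLEAN
			     : LRU_INACTIVE_DIRTY;    /* page_launder()             */
	case LRU_INACTIVE_CLEAN:
		return LRU_FREE;                      /* reclaim_page() / kreclaimd */
	default:
		return cur;
	}
}

int main(void)
{
	enum lru_list l = LRU_ACTIVE;

	/* An unreferenced page that eventually gets cleaned ends up free. */
	l = next_list(l, 0, 0);  /* -> inactive_dirty */
	l = next_list(l, 0, 1);  /* -> inactive_clean */
	l = next_list(l, 0, 1);  /* -> free           */
	printf("final list: %d (3 == LRU_FREE)\n", l);
	return 0;
}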