patch-2.4.0-test9 linux/mm/page_alloc.c
- Lines: 531
- Date: Mon Oct 2 12:02:20 2000
- Orig file: v2.4.0-test8/linux/mm/page_alloc.c
- Orig date: Thu Sep 7 08:44:50 2000
diff -u --recursive --new-file v2.4.0-test8/linux/mm/page_alloc.c linux/mm/page_alloc.c
@@ -25,7 +25,8 @@
#endif
int nr_swap_pages;
-int nr_lru_pages;
+int nr_active_pages;
+int nr_inactive_dirty_pages;
pg_data_t *pgdat_list;
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -33,6 +34,8 @@
static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
+struct list_head active_list;
+struct list_head inactive_dirty_list;
/*
* Free_page() adds the page to the free lists. This is optimized for
* fast normal cases (no error jumps taken normally).
@@ -96,7 +99,16 @@
BUG();
if (PageDirty(page))
BUG();
+ if (PageActive(page))
+ BUG();
+ if (PageInactiveDirty(page))
+ BUG();
+ if (PageInactiveClean(page))
+ BUG();
+ page->flags &= ~(1<<PG_referenced);
+ page->age = PAGE_AGE_START;
+
zone = page->zone;
mask = (~0UL) << order;
@@ -142,10 +154,13 @@
spin_unlock_irqrestore(&zone->lock, flags);
- if (zone->free_pages > zone->pages_high) {
- zone->zone_wake_kswapd = 0;
- zone->low_on_memory = 0;
- }
+ /*
+ * We don't want to protect this variable from race conditions
+ * since it's nothing important, but we do want to make sure
+ * it never gets negative.
+ */
+ if (memory_pressure > NR_CPUS)
+ memory_pressure--;
}
#define MARK_USED(index, order, area) \
@@ -203,6 +218,7 @@
set_page_count(page, 1);
if (BAD_RANGE(zone,page))
BUG();
+ DEBUG_ADD_PAGE
return page;
}
curr_order++;
@@ -213,13 +229,77 @@
return NULL;
}
+#define PAGES_MIN 0
+#define PAGES_LOW 1
+#define PAGES_HIGH 2
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+ unsigned long order, int limit, int direct_reclaim)
+{
+ zone_t **zone = zonelist->zones;
+
+ for (;;) {
+ zone_t *z = *(zone++);
+ unsigned long water_mark;
+
+ if (!z)
+ break;
+ if (!z->size)
+ BUG();
+
+ /*
+ * We allocate if the number of free + inactive_clean
+ * pages is above the watermark.
+ */
+ switch (limit) {
+ default:
+ case PAGES_MIN:
+ water_mark = z->pages_min;
+ break;
+ case PAGES_LOW:
+ water_mark = z->pages_low;
+ break;
+ case PAGES_HIGH:
+ water_mark = z->pages_high;
+ }
+
+ if (z->free_pages + z->inactive_clean_pages > water_mark) {
+ struct page *page = NULL;
+ /* If possible, reclaim a page directly. */
+ if (direct_reclaim && z->free_pages < z->pages_min + 8)
+ page = reclaim_page(z);
+ /* If that fails, fall back to rmqueue. */
+ if (!page)
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+
+ /* Found nothing. */
+ return NULL;
+}
+
+
/*
* This is the 'heart' of the zoned buddy allocator:
*/
struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
{
zone_t **zone;
- extern wait_queue_head_t kswapd_wait;
+ int direct_reclaim = 0;
+ unsigned int gfp_mask = zonelist->gfp_mask;
+ struct page * page = NULL;
+
+ /*
+ * Allocations put pressure on the VM subsystem.
+ */
+ memory_pressure++;
/*
* (If anyone calls gfp from interrupts nonatomically then it
@@ -229,6 +309,36 @@
* in a higher zone fails.
*/
+ /*
+ * Can we take pages directly from the inactive_clean
+ * list?
+ */
+ if (order == 0 && (gfp_mask & __GFP_WAIT) &&
+ !(current->flags & PF_MEMALLOC))
+ direct_reclaim = 1;
+
+ /*
+ * If we are about to get low on free pages and we also have
+ * an inactive page shortage, wake up kswapd.
+ */
+ if (inactive_shortage() > inactive_target / 2 && free_shortage())
+ wakeup_kswapd(0);
+ /*
+ * If we are about to get low on free pages and cleaning
+ * the inactive_dirty pages would fix the situation,
+ * wake up bdflush.
+ */
+ else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+ && nr_inactive_dirty_pages > freepages.high)
+ wakeup_bdflush(0);
+
+try_again:
+ /*
+ * First, see if we have any zones with lots of free memory.
+ *
+ * We allocate free memory first because it doesn't contain
+ * any data ... DUH!
+ */
zone = zonelist->zones;
for (;;) {
zone_t *z = *(zone++);
@@ -237,82 +347,193 @@
if (!z->size)
BUG();
- /* Are we supposed to free memory? Don't make it worse.. */
- if (!z->zone_wake_kswapd) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_low) {
- z->zone_wake_kswapd = 1;
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
- }
+ if (z->free_pages > z->pages_low) {
+ page = rmqueue(z, order);
if (page)
return page;
+ } else if (z->free_pages < z->pages_min &&
+ waitqueue_active(&kreclaimd_wait)) {
+ wake_up_interruptible(&kreclaimd_wait);
}
}
- /* Three possibilities to get here
- * - Previous alloc_pages resulted in last zone set to have
- * zone_wake_kswapd and start it. kswapd has not been able
- * to release enough pages so that one zone does not have
- * zone_wake_kswapd set.
- * - Different sets of zones (zonelist)
- * previous did not have all zones with zone_wake_kswapd but
- * this one has... should kswapd be woken up? it will run once.
- * - SMP race, kswapd went to sleep slightly after it as running
- * in 'if (waitqueue_active(...))' above.
- * + anyway the test is very cheap to do...
+ /*
+ * Try to allocate a page from a zone with a HIGH
+ * amount of free + inactive_clean pages.
+ *
+ * If there is a lot of activity, inactive_target
+ * will be high and we'll have a good chance of
+ * finding a page using the HIGH limit.
*/
- if (waitqueue_active(&kswapd_wait))
- wake_up_interruptible(&kswapd_wait);
+ page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+ if (page)
+ return page;
/*
- * Ok, we don't have any zones that don't need some
- * balancing.. See if we have any that aren't critical..
+ * Then try to allocate a page from a zone with more
+ * than zone->pages_low free + inactive_clean pages.
+ *
+ * When the working set is very large and VM activity
+ * is low, we're most likely to have our allocation
+ * succeed here.
*/
- zone = zonelist->zones;
- for (;;) {
- zone_t *z = *(zone++);
- if (!z)
- break;
- if (!z->low_on_memory) {
- struct page *page = rmqueue(z, order);
- if (z->free_pages < z->pages_min)
- z->low_on_memory = 1;
- if (page)
- return page;
- }
+ page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * OK, none of the zones on our zonelist has lots
+ * of pages free.
+ *
+ * We wake up kswapd, in the hope that kswapd will
+ * resolve this situation before memory gets tight.
+ *
+ * We also yield the CPU, because that:
+ * - gives kswapd a chance to do something
+ * - slows down allocations, in particular the
+ * allocations from the fast allocator that's
+ * causing the problems ...
+ * - ... which minimises the impact the "bad guys"
+ * have on the rest of the system
+ * - if we don't have __GFP_IO set, kswapd may be
+ * able to free some memory we can't free ourselves
+ */
+ wakeup_kswapd(0);
+ if (gfp_mask & __GFP_WAIT) {
+ __set_current_state(TASK_RUNNING);
+ current->policy |= SCHED_YIELD;
+ schedule();
}
/*
- * Uhhuh. All the zones have been critical, which means that
- * we'd better do some synchronous swap-out. kswapd has not
- * been able to cope..
+ * After waking up kswapd, we try to allocate a page
+ * from any zone which isn't critical yet.
+ *
+ * Kswapd should, in most situations, bring the situation
+ * back to normal in no time.
+ */
+ page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+ if (page)
+ return page;
+
+ /*
+ * Damn, we didn't succeed.
+ *
+ * This can be due to 2 reasons:
+ * - we're doing a higher-order allocation
+ * --> move pages to the free list until we succeed
+ * - we're /really/ tight on memory
+ * --> wait on the kswapd waitqueue until memory is freed
*/
if (!(current->flags & PF_MEMALLOC)) {
- int gfp_mask = zonelist->gfp_mask;
- if (!try_to_free_pages(gfp_mask)) {
- if (!(gfp_mask & __GFP_HIGH))
- goto fail;
+ /*
+ * Are we dealing with a higher order allocation?
+ *
+ * Move pages from the inactive_clean to the free list
+ * in the hope of creating a large, physically contiguous
+ * piece of free memory.
+ */
+ if (order > 0 && (gfp_mask & __GFP_WAIT)) {
+ zone = zonelist->zones;
+ /* First, clean some dirty pages. */
+ page_launder(gfp_mask, 1);
+ for (;;) {
+ zone_t *z = *(zone++);
+ if (!z)
+ break;
+ if (!z->size)
+ continue;
+ while (z->inactive_clean_pages) {
+ struct page * page;
+ /* Move one page to the free list. */
+ page = reclaim_page(z);
+ if (!page)
+ break;
+ __free_page(page);
+ /* Try if the allocation succeeds. */
+ page = rmqueue(z, order);
+ if (page)
+ return page;
+ }
+ }
+ }
+ /*
+ * When we arrive here, we are really tight on memory.
+ *
+ * We wake up kswapd and sleep until kswapd wakes us
+ * up again. After that we loop back to the start.
+ *
+ * We have to do this because something else might eat
+ * the memory kswapd frees for us and we need to be
+ * reliable. Note that we don't loop back for higher
+ * order allocations since it is possible that kswapd
+ * simply cannot free a large enough contiguous area
+ * of memory *ever*.
+ */
+ if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
+ wakeup_kswapd(1);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
+ /*
+ * If __GFP_IO isn't set, we can't wait on kswapd because
+ * kswapd just might need some IO locks /we/ are holding ...
+ *
+ * SUBTLE: The scheduling point above makes sure that
+ * kswapd does get the chance to free memory we can't
+ * free ourselves...
+ */
+ } else if (gfp_mask & __GFP_WAIT) {
+ try_to_free_pages(gfp_mask);
+ memory_pressure++;
+ if (!order)
+ goto try_again;
}
+
}
/*
* Final phase: allocate anything we can!
+ *
+ * Higher order allocations, GFP_ATOMIC allocations and
+ * recursive allocations (PF_MEMALLOC) end up here.
+ *
+ * Only recursive allocations can use the very last pages
+ * in the system, otherwise it would be just too easy to
+ * deadlock the system...
*/
zone = zonelist->zones;
for (;;) {
- struct page *page;
-
zone_t *z = *(zone++);
+ struct page * page = NULL;
if (!z)
break;
- page = rmqueue(z, order);
+ if (!z->size)
+ BUG();
+
+ /*
+ * SUBTLE: direct_reclaim is only possible if the task
+ * becomes PF_MEMALLOC while looping above. This will
+ * happen when the OOM killer selects this task for
+ * instant execution...
+ */
+ if (direct_reclaim)
+ page = reclaim_page(z);
+ if (page)
+ return page;
+
+ /* XXX: is pages_min/4 a good amount to reserve for this? */
+ if (z->free_pages < z->pages_min / 4 &&
+ !(current->flags & PF_MEMALLOC))
+ continue;
+ if (!page)
+ page = rmqueue(z, order);
if (page)
return page;
}
-fail:
/* No luck.. */
+ printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
return NULL;
}
@@ -377,18 +598,46 @@
}
/*
- * Amount of free RAM allocatable as buffer memory:
+ * Total amount of inactive_clean (allocatable) RAM:
*/
-unsigned int nr_free_buffer_pages (void)
+unsigned int nr_inactive_clean_pages (void)
{
unsigned int sum;
zone_t *zone;
int i;
- sum = nr_lru_pages / 3;
+ sum = 0;
for (i = 0; i < NUMNODES; i++)
- for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
- sum += zone->free_pages;
+ for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+ sum += zone->inactive_clean_pages;
+ return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+ unsigned int sum;
+
+ sum = nr_free_pages();
+ sum += nr_inactive_clean_pages();
+ sum += nr_inactive_dirty_pages;
+
+ /*
+ * Keep our write behind queue filled, even if
+ * kswapd lags a bit right now.
+ */
+ if (sum < freepages.high + inactive_target)
+ sum = freepages.high + inactive_target;
+ /*
+ * We don't want dirty page writebehind to put too
+ * much pressure on the working set, but we want it
+ * to be possible to have some dirty pages in the
+ * working set without upsetting the writebehind logic.
+ */
+ sum += nr_active_pages >> 4;
+
return sum;
}
@@ -418,9 +667,11 @@
nr_free_pages() << (PAGE_SHIFT-10),
nr_free_highpages() << (PAGE_SHIFT-10));
- printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
+ printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
+ nr_active_pages,
+ nr_inactive_dirty_pages,
+ nr_inactive_clean_pages(),
nr_free_pages(),
- nr_lru_pages,
freepages.min,
freepages.low,
freepages.high);
@@ -430,17 +681,6 @@
zone_t *zone = NODE_DATA(nid)->node_zones + type;
unsigned long nr, total, flags;
- printk(" %c%d%d %s: ",
- (zone->free_pages > zone->pages_low
- ? (zone->free_pages > zone->pages_high
- ? ' '
- : 'H')
- : (zone->free_pages > zone->pages_min
- ? 'M'
- : 'L')),
- zone->zone_wake_kswapd, zone->low_on_memory,
- zone->name);
-
total = 0;
if (zone->size) {
spin_lock_irqsave(&zone->lock, flags);
@@ -570,7 +810,8 @@
freepages.min += i;
freepages.low += i * 2;
freepages.high += i * 3;
- memlist_init(&lru_cache);
+ memlist_init(&active_list);
+ memlist_init(&inactive_dirty_list);
/*
* Some architectures (with lots of mem and discontinous memory
@@ -618,6 +859,9 @@
zone->lock = SPIN_LOCK_UNLOCKED;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
+ zone->inactive_clean_pages = 0;
+ zone->inactive_dirty_pages = 0;
+ memlist_init(&zone->inactive_clean_list);
if (!size)
continue;
@@ -631,8 +875,6 @@
zone->pages_min = mask;
zone->pages_low = mask*2;
zone->pages_high = mask*3;
- zone->low_on_memory = 0;
- zone->zone_wake_kswapd = 0;
zone->zone_mem_map = mem_map + offset;
zone->zone_start_mapnr = offset;
zone->zone_start_paddr = zone_start_paddr;
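
The allocation fallback this patch introduces can be hard to follow when read as a diff. The sketch below is a minimal, self-contained user-space model of the watermark cascade (plain free pages first, then free + inactive_clean checked against the HIGH, LOW and MIN marks), under the assumption that this simplification captures the ordering shown in the hunks above. The types and helpers (toy_zone, toy_rmqueue, toy_alloc_limit, toy_alloc) are hypothetical stand-ins, not the 2.4 kernel's zone_t, rmqueue() or __alloc_pages_limit(), and the sketch deliberately omits direct reclaim, the kswapd/bdflush wakeups and the PF_MEMALLOC last-resort path that the real __alloc_pages layers on top of this skeleton.

/*
 * Minimal model of the watermark cascade added to __alloc_pages()
 * in this patch.  Simplified stand-in types, not kernel structures.
 */
#include <stddef.h>
#include <stdio.h>

struct toy_zone {
	const char   *name;
	unsigned long free_pages;
	unsigned long inactive_clean_pages;
	unsigned long pages_min, pages_low, pages_high;
};

/* Stand-in for rmqueue(): pretend the buddy allocator hands out a page. */
static int toy_rmqueue(struct toy_zone *z)
{
	if (z->free_pages == 0)
		return 0;
	z->free_pages--;
	return 1;
}

/*
 * Stand-in for __alloc_pages_limit(): allocate from the first zone whose
 * free + inactive_clean total is above the selected watermark
 * (2 = HIGH, 1 = LOW, 0 = MIN).
 */
static struct toy_zone *toy_alloc_limit(struct toy_zone **zones, int limit)
{
	for (size_t i = 0; zones[i]; i++) {
		struct toy_zone *z = zones[i];
		unsigned long mark =
			limit == 2 ? z->pages_high :
			limit == 1 ? z->pages_low  : z->pages_min;

		if (z->free_pages + z->inactive_clean_pages > mark &&
		    toy_rmqueue(z))
			return z;
	}
	return NULL;
}

/* The cascade: plain free pages first, then the HIGH, LOW and MIN limits. */
static struct toy_zone *toy_alloc(struct toy_zone **zones)
{
	for (size_t i = 0; zones[i]; i++)
		if (zones[i]->free_pages > zones[i]->pages_low &&
		    toy_rmqueue(zones[i]))
			return zones[i];

	for (int limit = 2; limit >= 0; limit--) {
		struct toy_zone *z = toy_alloc_limit(zones, limit);
		if (z)
			return z;
		/* The real code wakes kswapd and yields between these steps. */
	}
	return NULL;	/* would fall through to the last-resort loop */
}

int main(void)
{
	struct toy_zone normal = {
		"Normal", 40, 100, /* min */ 32, /* low */ 64, /* high */ 96
	};
	struct toy_zone *zones[] = { &normal, NULL };
	struct toy_zone *got = toy_alloc(zones);

	printf("allocated from: %s\n", got ? got->name : "(nowhere)");
	return 0;
}

With the example values above, the zone fails the plain free-page test (40 <= pages_low) but passes the HIGH check because free + inactive_clean = 140 exceeds pages_high, which mirrors why the patch counts inactive_clean pages as allocatable in __alloc_pages_limit().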