patch-2.4.0-test9 linux/mm/page_alloc.c

diff -u --recursive --new-file v2.4.0-test8/linux/mm/page_alloc.c linux/mm/page_alloc.c
@@ -25,7 +25,8 @@
 #endif
 
 int nr_swap_pages;
-int nr_lru_pages;
+int nr_active_pages;
+int nr_inactive_dirty_pages;
 pg_data_t *pgdat_list;
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -33,6 +34,8 @@
 static int zone_balance_min[MAX_NR_ZONES] = { 10 , 10, 10, };
 static int zone_balance_max[MAX_NR_ZONES] = { 255 , 255, 255, };
 
+struct list_head active_list;
+struct list_head inactive_dirty_list;
 /*
  * Free_page() adds the page to the free lists. This is optimized for
  * fast normal cases (no error jumps taken normally).
@@ -96,7 +99,16 @@
 		BUG();
 	if (PageDirty(page))
 		BUG();
+	if (PageActive(page))
+		BUG();
+	if (PageInactiveDirty(page))
+		BUG();
+	if (PageInactiveClean(page))
+		BUG();
 
+	page->flags &= ~(1<<PG_referenced);
+	page->age = PAGE_AGE_START;
+	
 	zone = page->zone;
 
 	mask = (~0UL) << order;
@@ -142,10 +154,13 @@
 
 	spin_unlock_irqrestore(&zone->lock, flags);
 
-	if (zone->free_pages > zone->pages_high) {
-		zone->zone_wake_kswapd = 0;
-		zone->low_on_memory = 0;
-	}
+	/*
+	 * We don't want to protect this variable from race conditions
+	 * since it's nothing important, but we do want to make sure
+	 * it never gets negative.
+	 */
+	if (memory_pressure > NR_CPUS)
+		memory_pressure--;
 }
 
 #define MARK_USED(index, order, area) \
@@ -203,6 +218,7 @@
 			set_page_count(page, 1);
 			if (BAD_RANGE(zone,page))
 				BUG();
+			DEBUG_ADD_PAGE
 			return page;	
 		}
 		curr_order++;
@@ -213,13 +229,77 @@
 	return NULL;
 }
 
+#define PAGES_MIN	0
+#define PAGES_LOW	1
+#define PAGES_HIGH	2
+
+/*
+ * This function does the dirty work for __alloc_pages
+ * and is separated out to keep the code size smaller.
+ * (suggested by Davem at 1:30 AM, typed by Rik at 6 AM)
+ */
+static struct page * __alloc_pages_limit(zonelist_t *zonelist,
+			unsigned long order, int limit, int direct_reclaim)
+{
+	zone_t **zone = zonelist->zones;
+
+	for (;;) {
+		zone_t *z = *(zone++);
+		unsigned long water_mark;
+
+		if (!z)
+			break;
+		if (!z->size)
+			BUG();
+
+		/*
+		 * We allocate if the number of free + inactive_clean
+		 * pages is above the watermark.
+		 */
+		switch (limit) {
+			default:
+			case PAGES_MIN:
+				water_mark = z->pages_min;
+				break;
+			case PAGES_LOW:
+				water_mark = z->pages_low;
+				break;
+			case PAGES_HIGH:
+				water_mark = z->pages_high;
+		}
+
+		if (z->free_pages + z->inactive_clean_pages > water_mark) {
+			struct page *page = NULL;
+			/* If possible, reclaim a page directly. */
+			if (direct_reclaim && z->free_pages < z->pages_min + 8)
+				page = reclaim_page(z);
+			/* If that fails, fall back to rmqueue. */
+			if (!page)
+				page = rmqueue(z, order);
+			if (page)
+				return page;
+		}
+	}
+
+	/* Found nothing. */
+	return NULL;
+}
+
+
 /*
  * This is the 'heart' of the zoned buddy allocator:
  */
 struct page * __alloc_pages(zonelist_t *zonelist, unsigned long order)
 {
 	zone_t **zone;
-	extern wait_queue_head_t kswapd_wait;
+	int direct_reclaim = 0;
+	unsigned int gfp_mask = zonelist->gfp_mask;
+	struct page * page = NULL;
+
+	/*
+	 * Allocations put pressure on the VM subsystem.
+	 */
+	memory_pressure++;
 
 	/*
 	 * (If anyone calls gfp from interrupts nonatomically then it
@@ -229,6 +309,36 @@
 	 * in a higher zone fails.
 	 */
 
+	/*
+	 * Can we take pages directly from the inactive_clean
+	 * list?
+	 */
+	if (order == 0 && (gfp_mask & __GFP_WAIT) &&
+			!(current->flags & PF_MEMALLOC))
+		direct_reclaim = 1;
+
+	/*
+	 * If we are about to get low on free pages and we also have
+	 * an inactive page shortage, wake up kswapd.
+	 */
+	if (inactive_shortage() > inactive_target / 2 && free_shortage())
+		wakeup_kswapd(0);
+	/*
+	 * If we are about to get low on free pages and cleaning
+	 * the inactive_dirty pages would fix the situation,
+	 * wake up bdflush.
+	 */
+	else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
+			&& nr_inactive_dirty_pages > freepages.high)
+		wakeup_bdflush(0);
+
+try_again:
+	/*
+	 * First, see if we have any zones with lots of free memory.
+	 *
+	 * We allocate free memory first because it doesn't contain
+	 * any data ... DUH!
+	 */
 	zone = zonelist->zones;
 	for (;;) {
 		zone_t *z = *(zone++);
@@ -237,82 +347,193 @@
 		if (!z->size)
 			BUG();
 
-		/* Are we supposed to free memory? Don't make it worse.. */
-		if (!z->zone_wake_kswapd) {
-			struct page *page = rmqueue(z, order);
-			if (z->free_pages < z->pages_low) {
-				z->zone_wake_kswapd = 1;
-				if (waitqueue_active(&kswapd_wait))
-					wake_up_interruptible(&kswapd_wait);
-			}
+		if (z->free_pages > z->pages_low) {
+			page = rmqueue(z, order);
 			if (page)
 				return page;
+		} else if (z->free_pages < z->pages_min &&
+					waitqueue_active(&kreclaimd_wait)) {
+				wake_up_interruptible(&kreclaimd_wait);
 		}
 	}
 
-	/* Three possibilities to get here
-	 * - Previous alloc_pages resulted in last zone set to have
-	 *   zone_wake_kswapd and start it. kswapd has not been able
-	 *   to release enough pages so that one zone does not have
-	 *   zone_wake_kswapd set.
-	 * - Different sets of zones (zonelist)
-	 *   previous did not have all zones with zone_wake_kswapd but
-	 *   this one has... should kswapd be woken up? it will run once.
-	 * - SMP race, kswapd went to sleep slightly after it as running
-	 *   in 'if (waitqueue_active(...))' above.
-	 * + anyway the test is very cheap to do...
+	/*
+	 * Try to allocate a page from a zone with a HIGH
+	 * amount of free + inactive_clean pages.
+	 *
+	 * If there is a lot of activity, inactive_target
+	 * will be high and we'll have a good chance of
+	 * finding a page using the HIGH limit.
 	 */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	page = __alloc_pages_limit(zonelist, order, PAGES_HIGH, direct_reclaim);
+	if (page)
+		return page;
 
 	/*
-	 * Ok, we don't have any zones that don't need some
-	 * balancing.. See if we have any that aren't critical..
+	 * Then try to allocate a page from a zone with more
+	 * than zone->pages_low free + inactive_clean pages.
+	 *
+	 * When the working set is very large and VM activity
+	 * is low, we're most likely to have our allocation
+	 * succeed here.
 	 */
-	zone = zonelist->zones;
-	for (;;) {
-		zone_t *z = *(zone++);
-		if (!z)
-			break;
-		if (!z->low_on_memory) {
-			struct page *page = rmqueue(z, order);
-			if (z->free_pages < z->pages_min)
-				z->low_on_memory = 1;
-			if (page)
-				return page;
-		}
+	page = __alloc_pages_limit(zonelist, order, PAGES_LOW, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * OK, none of the zones on our zonelist has lots
+	 * of pages free.
+	 *
+	 * We wake up kswapd, in the hope that kswapd will
+	 * resolve this situation before memory gets tight.
+	 *
+	 * We also yield the CPU, because that:
+	 * - gives kswapd a chance to do something
+	 * - slows down allocations, in particular the
+	 *   allocations from the fast allocator that's
+	 *   causing the problems ...
+	 * - ... which minimises the impact the "bad guys"
+	 *   have on the rest of the system
+	 * - if we don't have __GFP_IO set, kswapd may be
+	 *   able to free some memory we can't free ourselves
+	 */
+	wakeup_kswapd(0);
+	if (gfp_mask & __GFP_WAIT) {
+		__set_current_state(TASK_RUNNING);
+		current->policy |= SCHED_YIELD;
+		schedule();
 	}
 
 	/*
-	 * Uhhuh. All the zones have been critical, which means that
-	 * we'd better do some synchronous swap-out. kswapd has not
-	 * been able to cope..
+	 * After waking up kswapd, we try to allocate a page
+	 * from any zone which isn't critical yet.
+	 *
+	 * Kswapd should, in most situations, bring the situation
+	 * back to normal in no time.
+	 */
+	page = __alloc_pages_limit(zonelist, order, PAGES_MIN, direct_reclaim);
+	if (page)
+		return page;
+
+	/*
+	 * Damn, we didn't succeed.
+	 *
+	 * This can be due to 2 reasons:
+	 * - we're doing a higher-order allocation
+	 * 	--> move pages to the free list until we succeed
+	 * - we're /really/ tight on memory
+	 * 	--> wait on the kswapd waitqueue until memory is freed
 	 */
 	if (!(current->flags & PF_MEMALLOC)) {
-		int gfp_mask = zonelist->gfp_mask;
-		if (!try_to_free_pages(gfp_mask)) {
-			if (!(gfp_mask & __GFP_HIGH))
-				goto fail;
+		/*
+		 * Are we dealing with a higher order allocation?
+		 *
+		 * Move pages from the inactive_clean to the free list
+		 * in the hope of creating a large, physically contiguous
+		 * piece of free memory.
+		 */
+		if (order > 0 && (gfp_mask & __GFP_WAIT)) {
+			zone = zonelist->zones;
+			/* First, clean some dirty pages. */
+			page_launder(gfp_mask, 1);
+			for (;;) {
+				zone_t *z = *(zone++);
+				if (!z)
+					break;
+				if (!z->size)
+					continue;
+				while (z->inactive_clean_pages) {
+					struct page * page;
+					/* Move one page to the free list. */
+					page = reclaim_page(z);
+					if (!page)
+						break;
+					__free_page(page);
+					/* Try if the allocation succeeds. */
+					page = rmqueue(z, order);
+					if (page)
+						return page;
+				}
+			}
+		}
+		/*
+		 * When we arrive here, we are really tight on memory.
+		 *
+		 * We wake up kswapd and sleep until kswapd wakes us
+		 * up again. After that we loop back to the start.
+		 *
+		 * We have to do this because something else might eat
+		 * the memory kswapd frees for us and we need to be
+		 * reliable. Note that we don't loop back for higher
+		 * order allocations since it is possible that kswapd
+		 * simply cannot free a large enough contiguous area
+		 * of memory *ever*.
+		 */
+		if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) {
+			wakeup_kswapd(1);
+			memory_pressure++;
+			if (!order)
+				goto try_again;
+		/*
+		 * If __GFP_IO isn't set, we can't wait on kswapd because
+		 * kswapd just might need some IO locks /we/ are holding ...
+		 *
+		 * SUBTLE: The scheduling point above makes sure that
+		 * kswapd does get the chance to free memory we can't
+		 * free ourselves...
+		 */
+		} else if (gfp_mask & __GFP_WAIT) {
+			try_to_free_pages(gfp_mask);
+			memory_pressure++;
+			if (!order)
+				goto try_again;
 		}
+
 	}
 
 	/*
 	 * Final phase: allocate anything we can!
+	 *
+	 * Higher order allocations, GFP_ATOMIC allocations and
+	 * recursive allocations (PF_MEMALLOC) end up here.
+	 *
+	 * Only recursive allocations can use the very last pages
+	 * in the system, otherwise it would be just too easy to
+	 * deadlock the system...
 	 */
 	zone = zonelist->zones;
 	for (;;) {
-		struct page *page;
-
 		zone_t *z = *(zone++);
+		struct page * page = NULL;
 		if (!z)
 			break;
-		page = rmqueue(z, order);
+		if (!z->size)
+			BUG();
+
+		/*
+		 * SUBTLE: direct_reclaim is only possible if the task
+		 * becomes PF_MEMALLOC while looping above. This will
+		 * happen when the OOM killer selects this task for
+		 * instant execution...
+		 */
+		if (direct_reclaim)
+			page = reclaim_page(z);
+		if (page)
+			return page;
+
+		/* XXX: is pages_min/4 a good amount to reserve for this? */
+		if (z->free_pages < z->pages_min / 4 &&
+				!(current->flags & PF_MEMALLOC))
+			continue;
+		if (!page)
+			page = rmqueue(z, order);
 		if (page)
 			return page;
 	}
 
-fail:
 	/* No luck.. */
+	printk(KERN_ERR "__alloc_pages: %lu-order allocation failed.\n", order);
 	return NULL;
 }
 
@@ -377,18 +598,46 @@
 }
 
 /*
- * Amount of free RAM allocatable as buffer memory:
+ * Total amount of inactive_clean (allocatable) RAM:
  */
-unsigned int nr_free_buffer_pages (void)
+unsigned int nr_inactive_clean_pages (void)
 {
 	unsigned int sum;
 	zone_t *zone;
 	int i;
 
-	sum = nr_lru_pages / 3;
+	sum = 0;
 	for (i = 0; i < NUMNODES; i++)
-		for (zone = NODE_DATA(i)->node_zones; zone <= NODE_DATA(i)->node_zones+ZONE_NORMAL; zone++)
-			sum += zone->free_pages;
+		for (zone = NODE_DATA(i)->node_zones; zone < NODE_DATA(i)->node_zones + MAX_NR_ZONES; zone++)
+			sum += zone->inactive_clean_pages;
+	return sum;
+}
+
+/*
+ * Amount of free RAM allocatable as buffer memory:
+ */
+unsigned int nr_free_buffer_pages (void)
+{
+	unsigned int sum;
+
+	sum = nr_free_pages();
+	sum += nr_inactive_clean_pages();
+	sum += nr_inactive_dirty_pages;
+
+	/*
+	 * Keep our write behind queue filled, even if
+	 * kswapd lags a bit right now.
+	 */
+	if (sum < freepages.high + inactive_target)
+		sum = freepages.high + inactive_target;
+	/*
+	 * We don't want dirty page writebehind to put too
+	 * much pressure on the working set, but we want it
+	 * to be possible to have some dirty pages in the
+	 * working set without upsetting the writebehind logic.
+	 */
+	sum += nr_active_pages >> 4;
+
 	return sum;
 }
 
@@ -418,9 +667,11 @@
 		nr_free_pages() << (PAGE_SHIFT-10),
 		nr_free_highpages() << (PAGE_SHIFT-10));
 
-	printk("( Free: %d, lru_cache: %d (%d %d %d) )\n",
+	printk("( Active: %d, inactive_dirty: %d, inactive_clean: %d, free: %d (%d %d %d) )\n",
+		nr_active_pages,
+		nr_inactive_dirty_pages,
+		nr_inactive_clean_pages(),
 		nr_free_pages(),
-		nr_lru_pages,
 		freepages.min,
 		freepages.low,
 		freepages.high);
@@ -430,17 +681,6 @@
 		zone_t *zone = NODE_DATA(nid)->node_zones + type;
  		unsigned long nr, total, flags;
 
-		printk("  %c%d%d %s: ",
-		       (zone->free_pages > zone->pages_low
-			? (zone->free_pages > zone->pages_high
-			   ? ' '
-			   : 'H')
-			: (zone->free_pages > zone->pages_min
-			   ? 'M'
-			   : 'L')),
-		       zone->zone_wake_kswapd, zone->low_on_memory,
-		       zone->name);
-
 		total = 0;
 		if (zone->size) {
 			spin_lock_irqsave(&zone->lock, flags);
@@ -570,7 +810,8 @@
 	freepages.min += i;
 	freepages.low += i * 2;
 	freepages.high += i * 3;
-	memlist_init(&lru_cache);
+	memlist_init(&active_list);
+	memlist_init(&inactive_dirty_list);
 
 	/*
 	 * Some architectures (with lots of mem and discontinous memory
@@ -618,6 +859,9 @@
 		zone->lock = SPIN_LOCK_UNLOCKED;
 		zone->zone_pgdat = pgdat;
 		zone->free_pages = 0;
+		zone->inactive_clean_pages = 0;
+		zone->inactive_dirty_pages = 0;
+		memlist_init(&zone->inactive_clean_list);
 		if (!size)
 			continue;
 
@@ -631,8 +875,6 @@
 		zone->pages_min = mask;
 		zone->pages_low = mask*2;
 		zone->pages_high = mask*3;
-		zone->low_on_memory = 0;
-		zone->zone_wake_kswapd = 0;
 		zone->zone_mem_map = mem_map + offset;
 		zone->zone_start_mapnr = offset;
 		zone->zone_start_paddr = zone_start_paddr;

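A standalone sketch (not kernel code) of the watermark cascade that the patched __alloc_pages()/__alloc_pages_limit() walk through: a zone is usable only while its free + inactive_clean pages stay above the requested watermark, and the caller retries with progressively lower limits (PAGES_HIGH, then PAGES_LOW, then PAGES_MIN) before falling back to reclaim. All struct names, field values, and the zone_above_limit() helper below are invented for illustration; only the ordering of the checks mirrors the patch above.

/*
 * Illustrative sketch, assuming simplified stand-in types.  Compile as a
 * normal user-space C program; it is not part of the patch.
 */
#include <stdio.h>
#include <stddef.h>

enum limit { PAGES_MIN, PAGES_LOW, PAGES_HIGH };

struct fake_zone {
	const char *name;
	unsigned long free_pages;
	unsigned long inactive_clean_pages;
	unsigned long pages_min, pages_low, pages_high;
};

/*
 * Loosely mirrors __alloc_pages_limit(): a zone qualifies only if its
 * free + inactive_clean page count is above the requested watermark.
 */
static struct fake_zone *zone_above_limit(struct fake_zone *zones, size_t n,
					  enum limit limit)
{
	size_t i;

	for (i = 0; i < n; i++) {
		struct fake_zone *z = &zones[i];
		unsigned long water_mark;

		switch (limit) {
		case PAGES_HIGH: water_mark = z->pages_high; break;
		case PAGES_LOW:  water_mark = z->pages_low;  break;
		default:         water_mark = z->pages_min;  break;
		}
		if (z->free_pages + z->inactive_clean_pages > water_mark)
			return z;
	}
	return NULL;
}

int main(void)
{
	/* Made-up numbers: watermarks chosen so only PAGES_MIN succeeds. */
	struct fake_zone zones[] = {
		{ "DMA",    12,  4, 10, 20, 30 },
		{ "Normal", 40, 25, 32, 64, 96 },
	};
	const enum limit tries[] = { PAGES_HIGH, PAGES_LOW, PAGES_MIN };
	const char *names[]     = { "PAGES_HIGH", "PAGES_LOW", "PAGES_MIN" };
	size_t i;

	/*
	 * Same HIGH -> LOW -> MIN cascade as the patch: each step tolerates
	 * a smaller reserve before the allocator resorts to waking kswapd,
	 * waking bdflush, or reclaiming pages directly.
	 */
	for (i = 0; i < 3; i++) {
		struct fake_zone *z = zone_above_limit(zones, 2, tries[i]);
		if (z) {
			printf("allocated from %s at %s\n", z->name, names[i]);
			return 0;
		}
		printf("no zone above %s, falling back\n", names[i]);
	}
	printf("all watermarks failed: would wake kswapd / reclaim directly\n");
	return 0;
}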