diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/arch/i386/Kconfig 500-user_text_replication/arch/i386/Kconfig
--- 490-amd_sysrq_t/arch/i386/Kconfig	2003-12-11 17:29:34.000000000 -0800
+++ 500-user_text_replication/arch/i386/Kconfig	2003-12-12 16:42:18.000000000 -0800
@@ -792,6 +792,17 @@ comment "NUMA (NUMA-Q) requires SMP, 64G
 comment "NUMA (Summit) requires SMP, 64GB highmem support, full ACPI"
 	depends on X86_SUMMIT && (!HIGHMEM64G || !ACPI || ACPI_HT_ONLY)
 
+config MAPPING_REPLICATE
+	bool "  NUMA user text replication"
+	depends on NUMA
+	default y
+	help
+	  Selecting this option will allow the NUMA code to make node-local
+	  copies of the pages backing some kinds of read-only files, such as
+	  executables and shared libraries.
+
+	  If unsure, say "n".
+
 config DISCONTIGMEM
 	bool
 	depends on NUMA
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/fs/inode.c 500-user_text_replication/fs/inode.c
--- 490-amd_sysrq_t/fs/inode.c	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/fs/inode.c	2003-12-12 16:42:18.000000000 -0800
@@ -196,6 +196,9 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_shared);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
+#ifdef CONFIG_MAPPING_REPLICATE
+	atomic_set(&inode->i_data.replicate, 0);
+#endif
 }
 
 EXPORT_SYMBOL(inode_init_once);
@@ -993,6 +996,7 @@ void generic_delete_inode(struct inode *
 
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
+	clear_replication(inode);
 
 	security_inode_delete(inode);
 
@@ -1039,6 +1043,8 @@ static void generic_forget_inode(struct 
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
 		truncate_inode_pages(&inode->i_data, 0);
+
+	clear_replication(inode);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/fs/namei.c 500-user_text_replication/fs/namei.c
--- 490-amd_sysrq_t/fs/namei.c	2003-10-14 15:50:29.000000000 -0700
+++ 500-user_text_replication/fs/namei.c	2003-12-12 16:42:18.000000000 -0800
@@ -241,29 +241,76 @@ int permission(struct inode * inode,int 
  * who will try to move it in struct inode - just leave it here.
  */
 static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED;
+/*
+ * If get_write_access() is failing because replication is going
+ * on, collapse the replication and try again.
+ */
+static int inode_try_replication_disable(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	if (unlikely(mapping_replicate(inode->i_mapping))) {
+		spin_unlock(&arbitration_lock);
+
+		/* the collapsing is like truncating, and is protected
+		 * by i_sem */
+		down(&inode->i_sem);
+		collapse_replication(mapping, NULL);
+		spin_lock(&arbitration_lock);
+		up(&inode->i_sem);
+		
+		return 1;
+	}
+	return 0;
+}
 int get_write_access(struct inode * inode)
 {
 	spin_lock(&arbitration_lock);
+retry:
 	if (atomic_read(&inode->i_writecount) < 0) {
+		/* this can drop and reacquire the arbitration_lock */
+		if (inode_try_replication_disable(inode))
+			goto retry;
+
 		spin_unlock(&arbitration_lock);
 		return -ETXTBSY;
 	}
 	atomic_inc(&inode->i_writecount);
+	BUG_ON(mapping_replicate(inode->i_mapping));
 	spin_unlock(&arbitration_lock);
 	return 0;
 }
-int deny_write_access(struct file * file)
+int __deny_write_access(struct file * file, int set_replicate)
 {
+	struct inode *inode = file->f_dentry->d_inode;
+
 	spin_lock(&arbitration_lock);
-	if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) {
+	if (atomic_read(&inode->i_writecount) > 0) {
 		spin_unlock(&arbitration_lock);
 		return -ETXTBSY;
 	}
-	atomic_dec(&file->f_dentry->d_inode->i_writecount);
+	atomic_dec(&inode->i_writecount);
+
+	/*
+	 * This is done under the arbitration_lock to prevent races
+	 * where a potential writer fails to see that writes are being
+	 * denied because of replication rather than an ordinary
+	 * write denial.
+	 */
+#ifdef CONFIG_MAPPING_REPLICATE
+	if (set_replicate && !mapping_replicate(inode->i_mapping))
+		atomic_inc(&inode->i_data.replicate);
+#endif
+	
 	spin_unlock(&arbitration_lock);
 	return 0;
 }
 
+int deny_write_access(struct file * file)
+{
+	return __deny_write_access(file, 0);
+}
+
 void path_release(struct nameidata *nd)
 {
 	dput(nd->dentry);
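
The arbitration changes above make a replicated mapping look exactly like one
whose writers have been denied: i_writecount goes negative, and
mapping->replicate records that one of those denials belongs to replication.
A minimal sketch of the intended lifecycle, pieced together from this file,
include/linux/fs.h and mm/filemap.c below (file_try_replicate() and
collapse_replication() are introduced later in this patch); illustrative
only, not part of the patch:

	/* mmap of a read-only file (see mm/mmap.c below) */
	file_try_replicate(file);	/* __deny_write_access(file, 1):
					 * i_writecount 0 -> -1, replicate 0 -> 1 */

	/* a later writer: instead of returning -ETXTBSY, get_write_access()
	 * sees mapping_replicate(), collapses the replicas
	 * (replicate 1 -> 0, i_writecount -1 -> 0) and retries,
	 * taking i_writecount to +1 as usual */
	error = get_write_access(inode);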
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/asm-i386/mmzone.h 500-user_text_replication/include/asm-i386/mmzone.h
--- 490-amd_sysrq_t/include/asm-i386/mmzone.h	2003-12-11 17:16:48.000000000 -0800
+++ 500-user_text_replication/include/asm-i386/mmzone.h	2003-12-12 16:42:18.000000000 -0800
@@ -149,5 +149,7 @@ static inline void get_memcfg_numa(void)
 	get_memcfg_numa_flat();
 }
 
+#define page_is_local(page)	(page_to_nid(page) == numa_node_id())
+
 #endif /* CONFIG_DISCONTIGMEM */
 #endif /* _ASM_MMZONE_H_ */
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/fs.h 500-user_text_replication/include/linux/fs.h
--- 490-amd_sysrq_t/include/linux/fs.h	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/include/linux/fs.h	2003-12-12 16:42:18.000000000 -0800
@@ -339,8 +339,22 @@ struct address_space {
 #ifdef CONFIG_NUMA
 	struct binding		*binding;	/* for memory bindings */
 #endif
+#ifdef CONFIG_MAPPING_REPLICATE
+	atomic_t		replicate;
+#endif
 };
 
+#ifdef CONFIG_MAPPING_REPLICATE
+ #define mapping_replicate(mapping)    (atomic_read(&(mapping)->replicate) > 0)
+ #define clear_replication(inode) do {			\
+	if (atomic_read(&inode->i_data.replicate))	\
+		atomic_dec(&inode->i_data.replicate);	\
+	} while (0)
+#else
+ #define mapping_replicate(mapping)    (0)
+ #define clear_replication(inode) do {} while(0)
+#endif
+
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
@@ -1202,9 +1216,11 @@ static inline void invalidate_remote_ino
 }
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
+extern int file_try_replicate(struct file *file);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern void collapse_replication(struct address_space *mapping, struct file *file);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
@@ -1218,6 +1234,7 @@ extern int permission(struct inode *, in
 extern int vfs_permission(struct inode *, int);
 extern int get_write_access(struct inode *);
 extern int deny_write_access(struct file *);
+extern int __deny_write_access(struct file *, int);
 static inline void put_write_access(struct inode * inode)
 {
 	atomic_dec(&inode->i_writecount);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/pagemap.h 500-user_text_replication/include/linux/pagemap.h
--- 490-amd_sysrq_t/include/linux/pagemap.h	2003-12-11 17:28:01.000000000 -0800
+++ 500-user_text_replication/include/linux/pagemap.h	2003-12-12 16:42:18.000000000 -0800
@@ -96,6 +96,9 @@ extern struct page * find_or_create_page
 extern unsigned int find_get_pages(struct address_space *mapping,
 				pgoff_t start, unsigned int nr_pages,
 				struct page **pages);
+extern int find_get_replica_pages(struct address_space *mapping,
+				pgoff_t start, unsigned int nr_pages,
+				struct page **pages);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
@@ -118,7 +121,10 @@ int add_to_page_cache(struct page *page,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				unsigned long index, int gfp_mask);
 extern void remove_from_page_cache(struct page *page);
+extern int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+				pgoff_t offset);
 extern void __remove_from_page_cache(struct page *page);
+extern struct page *__page_cache_lookup(struct address_space *mapping, pgoff_t offset);
 
 extern atomic_t nr_pagecache;
 
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/pagevec.h 500-user_text_replication/include/linux/pagevec.h
--- 490-amd_sysrq_t/include/linux/pagevec.h	2002-12-09 18:46:25.000000000 -0800
+++ 500-user_text_replication/include/linux/pagevec.h	2003-12-12 16:42:18.000000000 -0800
@@ -24,6 +24,8 @@ void __pagevec_lru_add_active(struct pag
 void pagevec_strip(struct pagevec *pvec);
 unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned int nr_pages);
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec, 
+		struct address_space *mapping, unsigned int nr_pages);
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/include/linux/radix-tree.h 500-user_text_replication/include/linux/radix-tree.h
--- 490-amd_sysrq_t/include/linux/radix-tree.h	2003-04-21 14:14:50.000000000 -0700
+++ 500-user_text_replication/include/linux/radix-tree.h	2003-12-12 16:42:18.000000000 -0800
@@ -41,7 +41,7 @@ do {					\
 	(root)->rnode = NULL;		\
 } while (0)
 
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+extern void *radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 extern unsigned int
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/init/main.c 500-user_text_replication/init/main.c
--- 490-amd_sysrq_t/init/main.c	2003-12-11 17:16:53.000000000 -0800
+++ 500-user_text_replication/init/main.c	2003-12-12 16:42:18.000000000 -0800
@@ -83,6 +83,7 @@ extern void pidhash_init(void);
 extern void pidmap_init(void);
 extern void pte_chain_init(void);
 extern void radix_tree_init(void);
+extern void page_cache_leaf_init(void);
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
@@ -456,6 +457,7 @@ asmlinkage void __init start_kernel(void
 	security_scaffolding_startup();
 	vfs_caches_init(num_physpages);
 	radix_tree_init();
+	page_cache_leaf_init();
 	signals_init();
 	/* rootfs populating might need page-writeback */
 	page_writeback_init();
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/lib/radix-tree.c 500-user_text_replication/lib/radix-tree.c
--- 490-amd_sysrq_t/lib/radix-tree.c	2003-04-21 14:14:53.000000000 -0700
+++ 500-user_text_replication/lib/radix-tree.c	2003-12-12 16:42:18.000000000 -0800
@@ -18,6 +18,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -168,8 +169,11 @@ static int radix_tree_extend(struct radi
  *	@item:		item to insert
  *
  *	Insert an item into the radix tree at position @index.
+ *
+ *	If the insertion fails because a duplicate element is present,
+ *	return that element.
  */
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+void *radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL, *tmp, **slot;
 	unsigned int height, shift;
@@ -179,7 +183,7 @@ int radix_tree_insert(struct radix_tree_
 	if (index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
-			return error;
+			return ERR_PTR(error);
 	}
     
 	slot = &root->rnode;
@@ -190,7 +194,7 @@ int radix_tree_insert(struct radix_tree_
 		if (*slot == NULL) {
 			/* Have to add a child node.  */
 			if (!(tmp = radix_tree_node_alloc(root)))
-				return -ENOMEM;
+				return ERR_PTR(-ENOMEM);
 			*slot = tmp;
 			if (node)
 				node->count++;
@@ -205,7 +209,7 @@ int radix_tree_insert(struct radix_tree_
 	}
 
 	if (*slot != NULL)
-		return -EEXIST;
+		return *slot; /* used to be -EEXIST */
 	if (node)
 		node->count++;
 
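
With this change radix_tree_insert() no longer returns 0 or -EEXIST: it
returns an ERR_PTR() on failure, the element that is already present on a
duplicate insert, and (since the function's final "return 0" is untouched by
this patch) NULL on success.  A minimal sketch of the caller-side checks,
modelled on __insert_into_page_cache() in mm/filemap.c below; illustrative
only:

	leaf = radix_tree_insert(&mapping->page_tree, offset, newleaf);
	if (IS_ERR(leaf))
		return PTR_ERR(leaf);	/* e.g. -ENOMEM */
	if (leaf != NULL)
		return -EEXIST;		/* 'leaf' is the entry already in the tree */
	leaf = newleaf;			/* NULL: newleaf was inserted */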
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/filemap.c 500-user_text_replication/mm/filemap.c
--- 490-amd_sysrq_t/mm/filemap.c	2003-12-11 17:16:05.000000000 -0800
+++ 500-user_text_replication/mm/filemap.c	2003-12-12 16:42:18.000000000 -0800
@@ -10,6 +10,7 @@
  * the NFS filesystem used to do this differently, for example)
  */
 #include <linux/config.h>
+#include <linux/err.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/compiler.h>
@@ -91,6 +92,254 @@
  */
 
 /*
+ * If replication is on, only the node-local page will be returned.  If
+ * there is no local page, nothing will be found.
+ *
+ * If PAGE_ANY is passed, a search across all nodes will be done even if
+ * replication is on.  This is useful when we're trying to make a
+ * local copy of the page and we just want any old copy of it.
+ */
+enum page_search {
+	PAGE_LOCAL,
+	PAGE_ANY
+};
+
+#ifndef CONFIG_MAPPING_REPLICATE
+/*
+ * This is an attempt to keep the overhead when not doing replication
+ * to a bare minimum.  Instead of storing a real page_cache_leaf in
+ * the radix tree, a plain page pointer is stored.  
+ *
+ * This abstraction allows more common code to be used for both the
+ * replicated and non-replicated cases.
+ */
+struct page_cache_leaf {
+	struct page page;
+};
+
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf, 
+		struct address_space *mapping, enum page_search search_type)
+{
+	return &leaf->page;
+}
+
+#define leaf_free(leaf)	do {} while (0)
+#define leaf_preload(gfpflags)	(0)
+
+static inline struct page *make_local_replica_lock(struct address_space *mapping, 
+		struct page *page)
+{
+	return page;
+}
+
+static inline void drop_replica_pages(struct address_space *mapping)
+{
+}
+void collapse_replication(struct address_space *mapping,
+				struct file *file)
+{
+}
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+	return page;
+}
+
+#else /* CONFIG_MAPPING_REPLICATE */
+
+struct page_cache_leaf {
+	struct page* pages[MAX_NUMNODES];
+	/*
+	 * This doesn't need to be an atomic because it's always 
+	 * modified under mapping->page_lock
+	 */
+	int count;
+	/*
+	 * The duplicate_lock is not here to prevent any harmful races;
+	 * it keeps collision overhead to a minimum.
+	 *
+	 * When 2 CPUs on the same node get into find_get_page() together,
+	 * they can both try to make a copy at the same time.  One is bound
+	 * to get -EEXIST and back off properly, but copying that page is
+	 * expensive.  Better to just spin on this and wait for the other
+	 * CPU to do the copy.
+	 *
+	 * This lock could be per-node.
+	 */
+	spinlock_t duplicate_lock;
+};
+
+DEFINE_PER_CPU(struct page_cache_leaf *, page_cache_leaf_preloads) = { 0, };
+static kmem_cache_t *page_cache_leaf_cachep;
+
+static inline void leaf_free(struct page_cache_leaf *leaf)
+{
+	struct page_cache_leaf **preload;
+	preload = &get_cpu_var(page_cache_leaf_preloads);
+	if (!*preload)
+		*preload = leaf;
+	else
+		kmem_cache_free(page_cache_leaf_cachep, leaf);
+	put_cpu_var(page_cache_leaf_preloads);
+}
+
+void page_cache_leaf_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
+{
+	struct page_cache_leaf *leaf = node;
+	memset(node, 0, sizeof(struct page_cache_leaf));
+	spin_lock_init(&leaf->duplicate_lock);
+}
+
+int leaf_preload(int gfp_mask)
+{
+	struct page_cache_leaf **preload;
+	int error = 0;
+	
+	preload = &get_cpu_var(page_cache_leaf_preloads);
+	if (!*preload)
+		*preload = kmem_cache_alloc(page_cache_leaf_cachep, gfp_mask);
+	if (!*preload)
+		error = -ENOMEM;
+
+	put_cpu_var(page_cache_leaf_preloads);
+
+	return error;
+}
+
+/*
+ * In the non-replicated case this just casts *leaf to a page; here we
+ * must pick one of the per-node copies.
+ */
+struct page *page_cache_leaf_to_page(struct page_cache_leaf *leaf, 
+		struct address_space *mapping, enum page_search search_type)
+{
+	struct page *page = NULL;
+	int nid = numa_node_id();
+	
+	/* Always look for a local copy first */
+	if (mapping_replicate(mapping))
+		page = leaf->pages[nid];
+	
+	if (!page && (!mapping_replicate(mapping) || (search_type == PAGE_ANY))) 
+		for (nid = 0; nid < numnodes; nid++) {
+			page = leaf->pages[nid];
+			if (page)
+				break;
+		}
+	return page;
+}
+#endif
+
+void __init page_cache_leaf_init(void)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	page_cache_leaf_cachep = kmem_cache_create("page_cache_leaf",
+			sizeof(struct page_cache_leaf), 0,
+			0, page_cache_leaf_ctor, NULL);
+	if (!page_cache_leaf_cachep)
+		panic("Failed to create page_cache_leaf cache\n");
+#endif
+}
+
+#ifndef CONFIG_MAPPING_REPLICATE
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset)
+{
+	struct page_cache_leaf *leaf, *errptr;
+	int error = 0;
+
+	leaf = container_of(page, struct page_cache_leaf, page);
+	errptr = radix_tree_insert(&mapping->page_tree, offset, leaf);
+
+	if (IS_ERR(errptr))
+		error = PTR_ERR(errptr);
+
+	return error;
+}
+#else
+int __insert_into_page_cache(struct page *page, struct address_space *mapping,
+		pgoff_t offset)
+{
+	int error = 0;
+	int nid;
+	struct page_cache_leaf *leaf, **newleaf;
+	
+	nid = page_to_nid(page);
+	
+	/*
+	 * If the leaf preload allocation failed, then at least check the
+	 * tree to see if a leaf is already present.  If one is present, 
+	 * then we got lucky and didn't really need to allocate anything.
+	 *
+	 * If that lookup *fails*, then we really were out of memory, so
+	 * error out.
+	 */
+	newleaf = &get_cpu_var(page_cache_leaf_preloads);
+	if (*newleaf)
+		leaf = radix_tree_insert(&mapping->page_tree, offset, *newleaf);
+	else {
+		leaf = radix_tree_lookup(&mapping->page_tree, offset);
+		if (!leaf)
+			leaf = ERR_PTR(-ENOMEM);
+	}
+
+	if (IS_ERR(leaf)) {
+		error = PTR_ERR(leaf);
+		goto out;
+	}
+	
+	/* there's already a leaf node there */
+	if (!mapping_replicate(mapping) && leaf) {
+		error = -EEXIST;
+		goto out;
+	}
+	
+	/* successful insertion, absorb the preloaded leaf */
+	if (!leaf) {
+		leaf = *newleaf;
+		*newleaf = NULL;
+	}
+	
+	/*
+	 * A !PageUptodate() page will have some I/O done on it shortly.
+	 * The readahead code puts pages like that in here.  If
+	 * there's a replica available, don't bother putting the
+	 * page in, because the I/O would be a duplicate.
+	 */
+	if (leaf->pages[nid]) {
+		error = -EEXIST;
+	} else {
+		/* 
+		 * Instead of -EEXIST, we could look for an 
+		 * Uptodate copy, and use that to make this
+		 * page Uptodate, making a local replica.
+		 */
+		if (leaf->count > 1 && !PageUptodate(page)) {
+			error =  -EEXIST;
+		} else {
+			leaf->pages[nid] = page;
+			leaf->count++;
+		}
+	}
+out:
+	put_cpu_var(page_cache_leaf_preloads);
+	return error;
+}
+#endif
+
+struct page* 
+__page_cache_lookup(struct address_space *mapping, pgoff_t offset)
+{
+	struct page *page = NULL;
+	struct page_cache_leaf *leaf;
+
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	if (!leaf)
+		goto out;
+
+	page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+out:	
+	return page;	
+}
+
+/*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
  * is safe.  The caller must hold a write_lock on the mapping's page_lock.
@@ -98,8 +347,21 @@
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
-
-	radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE	
+	struct page_cache_leaf *leaf;
+	leaf = radix_tree_lookup(&mapping->page_tree, page->index);
+	leaf->pages[page_to_nid(page)] = NULL;
+	if (--leaf->count == 0) {
+#endif
+		radix_tree_delete(&mapping->page_tree, page->index);
+#ifdef CONFIG_MAPPING_REPLICATE
+		/* 
+		 * if there is a free preload slot for this CPU, put the
+		 * leaf back there instead of freeing it
+		 */
+		leaf_free(leaf);
+	}
+#endif
 	list_del(&page->list);
 	page->mapping = NULL;
 
@@ -128,6 +390,22 @@ static inline int sync_page(struct page 
 	return 0;
 }
 
+#ifdef CONFIG_MAPPING_REPLICATE
+/*
+ * synchronized by i_sem
+ */
+extern void drop_replica_pages(struct address_space *mapping);
+inline void collapse_replication(struct address_space *mapping,
+		struct file *file)
+{
+	if (mapping_replicate(mapping)) {
+		atomic_dec(&mapping->replicate);
+		drop_replica_pages(mapping);
+		atomic_inc(&mapping->host->i_writecount);
+	} 
+}
+#endif
+
 /**
  * filemap_fdatawrite - start writeback against all of a mapping's dirty pages
  * @mapping: address space structure to write
@@ -251,10 +529,16 @@ int add_to_page_cache(struct page *page,
 {
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
+	if (error != 0)
+		goto err;
+	
+	/* this benefits from the radix_tree_preload()'s preempt_disable() */
+	error = leaf_preload(gfp_mask & ~__GFP_HIGHMEM);
+	
 	if (error == 0) {
 		page_cache_get(page);
 		spin_lock(&mapping->page_lock);
-		error = radix_tree_insert(&mapping->page_tree, offset, page);
+		error = __insert_into_page_cache(page, mapping, offset);
 		if (!error) {
 			SetPageLocked(page);
 			___add_to_page_cache(page, mapping, offset);
@@ -264,11 +548,17 @@ int add_to_page_cache(struct page *page,
 		spin_unlock(&mapping->page_lock);
 		radix_tree_preload_end();
 	}
+err:
 	return error;
 }
 
 EXPORT_SYMBOL(add_to_page_cache);
 
+/*
+ * The pages will *not* be added to the LRU immediately.  They're only
+ * added after the entire pagevec is filled up.  Don't worry, they'll
+ * get there eventually.
+ */
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, int gfp_mask)
 {
@@ -385,24 +675,236 @@ void __lock_page(struct page *page)
 
 EXPORT_SYMBOL(__lock_page);
 
+#ifdef CONFIG_MAPPING_REPLICATE
 /*
- * a rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * This is fairly lazy with preemption to make the code simpler.  It doesn't
+ * need to be perfect.  Making a local replica is by no means required.  If the
+ * replica page allocation fails, one of two things happens:
+ * 1. The page cache returns a non-local page, which gets mapped in
+ *    somewhere; things are slightly slower.
+ * 2. The page cache returns NULL even though there was a page in the cache;
+ *    I/O is resubmitted for the page, and a replica is made with
+ *    the new data.
+ */
+DEFINE_PER_CPU(struct page *, replica_preloads) = { NULL, };
+void refill_replica_page_cpu(void)
+{
+	int cpu = get_cpu();
+	int nid = cpu_to_node(cpu);
+	unsigned int gfp_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NODE_STRICT;
+	struct page **page = &__get_cpu_var(replica_preloads);
+	
+	if (!*page)
+		*page = alloc_pages_node(nid, gfp_mask, 0);
+
+	put_cpu();
+}
+
+/* I want to see this in the profiles */
+void make_local_replica_copy(struct page *dst, struct page *src)
+{
+	if (!page_is_local(dst)) {
+		printk("%s(): %d dst not local: %08lx src: %08lx\n",
+				__func__, smp_processor_id(),
+				page_to_pfn(dst), page_to_pfn(src));
+	}
+	BUG_ON(!PageUptodate(src));
+	copy_highpage(dst, src);
+}
+
+static struct page *__make_local_replica(struct address_space *mapping,
+		struct page *page)
+{
+	struct page *copy = page;
+	struct page **prealloc;
+	int err;
+
+	if (!page)
+		goto out;
+
+	if (!mapping_replicate(mapping))
+		goto out;
+
+	/* Something is probably writing into the source page;
+	 * do *not* wait for this to get unlocked.  We're under
+	 * a lock here.  Just punt on the copy. */
+	if (TestSetPageLocked(page))
+		goto out;
+
+	/* the old page got unhashed since we pulled it out */
+	if (page->mapping != mapping) {
+		unlock_page(page);
+		goto out;
+	}
+	
+	prealloc = &get_cpu_var(replica_preloads);
+	if (*prealloc) {
+		copy = *prealloc;
+		*prealloc = NULL;
+	}
+	put_cpu_var(replica_preloads);
+
+	if (!copy)
+		goto out;
+	
+	make_local_replica_copy(copy, page);
+	/*
+	 * Do this now so that add_to_page_cache_lru() won't confuse this
+	 * with a readahead page that should get -EEXIST instead of just
+	 * getting added.
+	 */
+	SetPageUptodate(copy);
+
+	/*
+	 * This should never actually have to allocate memory.  It will
+	 * be able to add the page to the already existing leaf.  The
+	 * leaf can't go away because we hold a ref count on the source 
+	 * page.
+	 */
+	err = add_to_page_cache_lru(copy, mapping, page->index, GFP_ATOMIC);
+	unlock_page(page);
+	switch (err) {
+		case 0:
+			unlock_page(copy);
+			break;
+		case -EEXIST:
+			page_cache_release(copy);
+			goto out;
+		default:
+			printk("%s(): ?? %d\n", __FUNCTION__, err);
+			page_cache_release(copy);
+			dump_stack();
+			goto out;
+	}
+	return copy;
+out:
+	return page;
+}
+
+
+/*
+ * We cannot be making copies of pages that aren't up to date yet,
+ * so this function makes sure of that.
+ *
+ * Instead of just returning the information that the page is
+ * unusable, it could go looking for other sources for the page, perhaps
+ * another node.
+ *
+ * The logic for this was taken from read_cache_page().
  */
-struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+static inline int replica_source_uptodate(struct address_space *mapping, struct page *page)
 {
-	struct page *page;
+	int ret = 1;
+
+	if (likely(PageUptodate(page)))
+		goto out;
+
+	lock_page(page);
+	if (!PageUptodate(page) || !page->mapping)
+		ret = 0;
+	unlock_page(page);
+out:
+	return ret;
+}
+/*
+ * This needs to be called without mapping->page_lock held
+ */
+static inline struct page *make_local_replica(struct address_space *mapping, struct page *page, struct page_cache_leaf *leaf)
+{
+	struct page *copy;
+	
+	if (page_is_local(page))
+		return page;
+
+	/*
+	 * if there's a problem with the source page, don't make a copy
+	 * of it.  The caller will fix this up.
+	 */
+	if (!replica_source_uptodate(mapping, page))
+		return page;
+	
+	refill_replica_page_cpu();
+
+	spin_lock(&leaf->duplicate_lock);
+	/* 
+	 * now that we have the lock, do a crude check to see if anyone
+	 * else has filled in the page we were looking for
+	 */
+	if (mapping_replicate(mapping) &&
+	    leaf->pages[numa_node_id()]) {
+		spin_unlock(&leaf->duplicate_lock);
+		page_cache_release(page);
+		return NULL;
+	}
+	copy = __make_local_replica(mapping, page);
+	spin_unlock(&leaf->duplicate_lock);
+	
+	if (copy != page) {
+		page_cache_release(page);
+		return copy;
+	}
+
+	return page;
+}
+
+
+static struct page *make_local_replica_lock(struct address_space *mapping,
+		struct page *page)
+{
+	struct page *copy;
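+	/* NB: the unconditional return below disables the rest of this function */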
+	return page;
+
+	refill_replica_page_cpu();
+	copy = __make_local_replica(mapping, page);
+	
+	/* 
+	 * this is the cowardly way to do it.  Add the new copy, and pray
+	 * that it shows up :)  If the replication appears to have worked,
+	 * drop the references to the source page.  If the new page
+	 * got removed in the meantime, find_lock_page() will just
+	 * redo the locking anyway.
+	 */
+	if (copy != page) {
+		unlock_page(page);
+		page_cache_release(page);
+		copy = find_lock_page(mapping, page->index);
+	}
+	
+	return copy;
+}
+#endif
 
+/*
+ * With no page replication, this is a rather lightweight function for
+ * finding and getting a reference to a hashed page atomically.
+ *
+ * When replicating pages, this becomes the place where the source for
+ * copies is found and the new copy is made.
+ */
+struct page * find_get_page(struct address_space *mapping, unsigned long offset)
+{
+	struct page_cache_leaf *leaf;
+	struct page *page, *copy;
 	/*
 	 * We scan the hash list read-only. Addition to and removal from
 	 * the hash-list needs a held write-lock.
 	 */
+
+repeat:
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
-	if (page)
-		page_cache_get(page);
+	leaf = radix_tree_lookup(&mapping->page_tree, offset);
+	/* nothing found */
+	if (!leaf) {
+		spin_unlock(&mapping->page_lock);
+		return NULL;
+	}
+
+	page = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
+	page_cache_get(page);
 	spin_unlock(&mapping->page_lock);
-	return page;
+
+	/* A NULL in this context is like -EEXIST. Try again. */
+	copy = make_local_replica(mapping, page, leaf);
+	if (!copy)
+		goto repeat;
+
+	return copy;
 }
 
 EXPORT_SYMBOL(find_get_page);
@@ -415,10 +917,11 @@ struct page *find_trylock_page(struct ad
 	struct page *page;
 
 	spin_lock(&mapping->page_lock);
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
 	spin_unlock(&mapping->page_lock);
+	page = make_local_replica_lock(mapping, page);
 	return page;
 }
 
@@ -442,12 +945,13 @@ struct page *find_lock_page(struct addre
 
 	spin_lock(&mapping->page_lock);
 repeat:
-	page = radix_tree_lookup(&mapping->page_tree, offset);
+	page = __page_cache_lookup(mapping, offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
 			spin_unlock(&mapping->page_lock);
 			lock_page(page);
+			page = make_local_replica_lock(mapping, page);
 			spin_lock(&mapping->page_lock);
 
 			/* Has the page been truncated while we slept? */
@@ -489,6 +993,8 @@ struct page *find_or_create_page(struct 
 	int err;
 repeat:
 	page = find_lock_page(mapping, index);
+	/* this only locks if a replica is made */
+	page = make_local_replica_lock(mapping, page);
 	if (!page) {
 		if (!cached_page) {
 			cached_page = alloc_page(gfp_mask);
@@ -526,22 +1032,85 @@ EXPORT_SYMBOL(find_or_create_page);
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
-			    unsigned int nr_pages, struct page **pages)
+unsigned int find_get_pages(struct address_space *mapping,
+			    pgoff_t start, unsigned int nr_pages,
+			    struct page **pages)
 {
-	unsigned int i;
 	unsigned int ret;
+	int i;
 
 	spin_lock(&mapping->page_lock);
+	
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
-	for (i = 0; i < ret; i++)
+	
+	for (i = 0; i < ret; i++) {
+		/* 
+		 * The radix tree lookups return leaves, which 
+		 * must be converted to pages 
+		 */
+		struct page_cache_leaf * leaf = (struct page_cache_leaf *)pages[i];
+		pages[i] = page_cache_leaf_to_page(leaf, mapping, PAGE_ANY);
 		page_cache_get(pages[i]);
+	}
 	spin_unlock(&mapping->page_lock);
 	return ret;
 }
 
 /*
+ * This is used to find _just_ the replica pages.  It's needed
+ * when we are about to write to something where replication
+ * is active.
+ */
+int find_get_replica_pages(struct address_space *mapping,
+			    pgoff_t start, unsigned int nr_pages,
+			    struct page **pages)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	unsigned int nid = numa_node_id();
+	unsigned int nr_leaves;
+	struct page_cache_leaf *leaf;
+	struct page_cache_leaf **leaves = (struct page_cache_leaf **)pages;
+	int pages_seen;
+	int i, j;
+	
+	/*
+	 * This is the number of result slots that have been converted
+	 * from leaves into pages to be returned.  Any array index below
+	 * this number holds a page; any at or above it still holds a leaf.
+	 */
+	int nr_ret_pages = 0;
+
+	spin_lock(&mapping->page_lock);
+	
+	nr_leaves = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)leaves, start, nr_pages);
+	for (i = 0; i < nr_leaves; i++) {
+		leaf = leaves[i];
+		if (leaf->count <= 1)
+			continue;
+		
+		for (j=0, pages_seen = 0; 
+		     j < MAX_NUMNODES && pages_seen < leaf->count; 
+		     j++) {
+			if (j == nid || !leaf->pages[j])
+				continue;
+			pages[nr_ret_pages] = leaf->pages[j];
+			page_cache_get(pages[nr_ret_pages]);
+			pages_seen++;
+			nr_ret_pages++;
+		}
+		if (i < nr_ret_pages)
+			i = nr_ret_pages; /* don't forget i++ */
+	}
+	spin_unlock(&mapping->page_lock);
+	return nr_ret_pages;
+#else
+	return 0;
+#endif
+}
+
+/*
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
  * be regenerated if the page couldn't be grabbed.  This routine should
@@ -1814,6 +2383,7 @@ generic_file_aio_write_nolock(struct kio
 		 */
 		fault_in_pages_readable(buf, bytes);
 
+		collapse_replication(mapping, file);
 		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
 		if (!page) {
 			status = -ENOMEM;
@@ -1995,3 +2565,38 @@ out:
 }
 
 EXPORT_SYMBOL_GPL(generic_file_direct_IO);
+
+/*
+ * Some of this code is a bit redundant in the case where we're replicating
+ * an executable: the caller has already done a deny_write_access() just
+ * before this is called, so the error checking on the deny_write_access()
+ * here is unnecessary in that case.
+ *
+ * For overall reduction of code and cleanliness, we do a little extra here.
+ */
+ 
+int file_try_replicate(struct file *file)
+{
+#ifdef CONFIG_MAPPING_REPLICATE
+	struct inode *inode = file ? file->f_dentry->d_inode : NULL;
+	int error = 1;
+	down(&inode->i_sem);
+	if (!mapping_replicate(inode->i_mapping)) {
+		error = __deny_write_access(file, 1);
+		if (error)
+			goto out_fail;
+
+		/*
+		 * There used to be a check here for dirty pages.  It
+		 * was incorrect: dirty pages are allowed, the only
+		 * real problem is !Uptodate pages.
+		 */
+		BUG_ON(atomic_read(&inode->i_writecount) >= 0);
+		up(&inode->i_sem);
+		return 1;
+	}
+
+out_fail:
+	up(&inode->i_sem);
+#endif
+	return 0;
+}
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/memory.c 500-user_text_replication/mm/memory.c
--- 490-amd_sysrq_t/mm/memory.c	2003-12-11 17:16:39.000000000 -0800
+++ 500-user_text_replication/mm/memory.c	2003-12-12 16:42:18.000000000 -0800
@@ -1496,8 +1496,11 @@ retry:
 			inc_rss(mm, new_page);
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
-		if (write_access)
+		if (write_access) {
 			entry = pte_mkwrite(pte_mkdirty(entry));
+			BUG_ON(new_page->mapping && 
+				mapping_replicate(new_page->mapping));
+		}
 		set_pte(page_table, entry);
 		pte_chain = page_add_rmap(new_page, page_table, pte_chain);
 		pte_unmap(page_table);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/mmap.c 500-user_text_replication/mm/mmap.c
--- 490-amd_sysrq_t/mm/mmap.c	2003-12-11 17:29:48.000000000 -0800
+++ 500-user_text_replication/mm/mmap.c	2003-12-12 16:42:18.000000000 -0800
@@ -543,6 +543,7 @@ unsigned long do_mmap_pgoff(struct file 
 	inode = file ? file->f_dentry->d_inode : NULL;
 
 	if (file) {
+		int try_to_replicate = 1;
 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -564,11 +565,19 @@ unsigned long do_mmap_pgoff(struct file 
 			vm_flags |= VM_SHARED | VM_MAYSHARE;
 			if (!(file->f_mode & FMODE_WRITE))
 				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
-
+			/*
+			 * If this is set, there is a possibility of a conversion
+			 * to a writable area later.  Do not replicate.
+			 */
+			if (vm_flags & VM_MAYWRITE)
+				try_to_replicate = 0;
+				
 			/* fall through */
 		case MAP_PRIVATE:
 			if (!(file->f_mode & FMODE_READ))
 				return -EACCES;
+			if (try_to_replicate)
+				file_try_replicate(file);
 			break;
 
 		default:
@@ -661,6 +670,7 @@ munmap_back:
 			if (error)
 				goto free_vma;
 			correct_wcount = 1;
+			file_try_replicate(file);
 		}
 		vma->vm_file = file;
 		get_file(file);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/readahead.c 500-user_text_replication/mm/readahead.c
--- 490-amd_sysrq_t/mm/readahead.c	2003-12-11 17:10:40.000000000 -0800
+++ 500-user_text_replication/mm/readahead.c	2003-12-12 16:42:18.000000000 -0800
@@ -236,7 +236,7 @@ __do_page_cache_readahead(struct address
 		if (page_offset > end_index)
 			break;
 
-		page = radix_tree_lookup(&mapping->page_tree, page_offset);
+		page = __page_cache_lookup(mapping, page_offset);
 		if (page)
 			continue;
 
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/swap.c 500-user_text_replication/mm/swap.c
--- 490-amd_sysrq_t/mm/swap.c	2003-11-24 16:12:33.000000000 -0800
+++ 500-user_text_replication/mm/swap.c	2003-12-12 16:42:18.000000000 -0800
@@ -357,6 +357,12 @@ unsigned int pagevec_lookup(struct pagev
 	return pagevec_count(pvec);
 }
 
+unsigned int pagevec_lookup_replicas(struct pagevec *pvec, struct address_space *mapping, unsigned int nr_pages)
+{
+	pvec->nr = find_get_replica_pages(mapping, 0, nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 
 #ifdef CONFIG_SMP
 /*
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/swap_state.c 500-user_text_replication/mm/swap_state.c
--- 490-amd_sysrq_t/mm/swap_state.c	2003-10-01 11:35:37.000000000 -0700
+++ 500-user_text_replication/mm/swap_state.c	2003-12-12 16:42:18.000000000 -0800
@@ -38,6 +38,9 @@ struct address_space swapper_space = {
 	.truncate_count  = ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
+#ifdef CONFIG_MAPPING_REPLICATE
+	.replicate	= ATOMIC_INIT(0),
+#endif
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -198,7 +201,7 @@ int move_to_swap_cache(struct page *page
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	err = __insert_into_page_cache(page, &swapper_space, entry.val);
 	if (!err) {
 		__remove_from_page_cache(page);
 		___add_to_page_cache(page, &swapper_space, entry.val);
@@ -234,7 +237,7 @@ int move_from_swap_cache(struct page *pa
 	spin_lock(&swapper_space.page_lock);
 	spin_lock(&mapping->page_lock);
 
-	err = radix_tree_insert(&mapping->page_tree, index, page);
+	err = __insert_into_page_cache(page, mapping, index);
 	if (!err) {
 		__delete_from_swap_cache(page);
 		___add_to_page_cache(page, mapping, index);
diff -purN -X /home/mbligh/.diff.exclude 490-amd_sysrq_t/mm/truncate.c 500-user_text_replication/mm/truncate.c
--- 490-amd_sysrq_t/mm/truncate.c	2003-10-14 15:50:36.000000000 -0700
+++ 500-user_text_replication/mm/truncate.c	2003-12-12 16:42:18.000000000 -0800
@@ -178,6 +178,33 @@ void truncate_inode_pages(struct address
 
 EXPORT_SYMBOL(truncate_inode_pages);
 
+
+/**
+ * drop_replica_pages - remove all replicated pages from a mapping
+ * @mapping: mapping to remove replication from
+ *
+ * Called under (and serialised by) inode->i_sem.
+ */
+void drop_replica_pages(struct address_space *mapping)
+{
+	struct pagevec pvec;
+	int num;
+	int i;
+
+	pagevec_init(&pvec, 0);
+	while ((num = pagevec_lookup_replicas(&pvec, mapping, PAGEVEC_SIZE))) {
+		for (i=0; i<num; i++) {
+			struct page *page = pvec.pages[i];
+			
+			lock_page(page);
+			wait_on_page_writeback(page);
+			truncate_complete_page(mapping, page);
+			unlock_page(page);
+		}
+		pagevec_release(&pvec);
+	}
+}
+
 /**
  * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
  * @mapping: the address_space which holds the pages to invalidate