diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/Makefile 320-kcg/Makefile
--- 310-irqbal_fast/Makefile	2003-10-21 11:45:19.000000000 -0700
+++ 320-kcg/Makefile	2003-10-21 11:52:15.000000000 -0700
@@ -439,6 +439,10 @@ ifndef CONFIG_FRAME_POINTER
 CFLAGS		+= -fomit-frame-pointer
 endif
 
+ifeq ($(CONFIG_MCOUNT),y)
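+# -pg makes gcc emit a call to mcount() on entry to every C function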
+CFLAGS		+= -pg
+endif
+
 ifdef CONFIG_DEBUG_INFO
 CFLAGS		+= -g
 endif
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/Kconfig 320-kcg/arch/i386/Kconfig
--- 310-irqbal_fast/arch/i386/Kconfig	2003-10-21 11:45:19.000000000 -0700
+++ 320-kcg/arch/i386/Kconfig	2003-10-21 11:52:15.000000000 -0700
@@ -1363,6 +1363,14 @@ config X86_MPPARSE
 	depends on X86_LOCAL_APIC && !X86_VISWS
 	default y
 
+config MCOUNT
+	bool "Generate function call graph"
+	depends on FRAME_POINTER
+	help
+	  This option instruments the kernel to generate a deterministic
+	  function call graph.  Answering Y here will make your kernel run
+	  ???% slower.
+
 endmenu
 
 source "security/Kconfig"
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/boot/compressed/Makefile 320-kcg/arch/i386/boot/compressed/Makefile
--- 310-irqbal_fast/arch/i386/boot/compressed/Makefile	2003-10-21 11:45:19.000000000 -0700
+++ 320-kcg/arch/i386/boot/compressed/Makefile	2003-10-21 11:52:15.000000000 -0700
@@ -10,6 +10,11 @@ EXTRA_AFLAGS	:= -traditional
 CFLAGS := $(CFLAGS_NOGCOV)
 LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32
 
+ifeq ($(CONFIG_MCOUNT),y)
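+# The decompressor is a standalone image that does not link against
+# mcount(), so compile misc.c with -pg stripped from CFLAGS.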
+$(obj)/misc.o: $(src)/misc.c
+	$(CC) $(subst -pg,,$(CFLAGS)) -c $(src)/misc.c -o $(obj)/misc.o
+endif
+
 $(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/kernel/i386_ksyms.c 320-kcg/arch/i386/kernel/i386_ksyms.c
--- 310-irqbal_fast/arch/i386/kernel/i386_ksyms.c	2003-10-14 15:50:11.000000000 -0700
+++ 320-kcg/arch/i386/kernel/i386_ksyms.c	2003-10-21 11:52:15.000000000 -0700
@@ -186,6 +186,11 @@ extern void * memcpy(void *,const void *
 EXPORT_SYMBOL_NOVERS(memcpy);
 EXPORT_SYMBOL_NOVERS(memset);
 
+#ifdef CONFIG_MCOUNT
+extern void mcount(void);
+EXPORT_SYMBOL_NOVERS(mcount);
+#endif
+
 #ifdef CONFIG_HAVE_DEC_LOCK
 EXPORT_SYMBOL(atomic_dec_and_lock);
 #endif
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/kernel/semaphore.c 320-kcg/arch/i386/kernel/semaphore.c
--- 310-irqbal_fast/arch/i386/kernel/semaphore.c	2002-12-09 18:45:55.000000000 -0800
+++ 320-kcg/arch/i386/kernel/semaphore.c	2003-10-21 11:52:15.000000000 -0700
@@ -191,6 +191,7 @@ asm(
 ".align 4\n"
 ".globl __down_failed\n"
 "__down_failed:\n\t"
+//	"call mcount_stext_lock\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
@@ -214,6 +215,7 @@ asm(
 ".align 4\n"
 ".globl __down_failed_interruptible\n"
 "__down_failed_interruptible:\n\t"
+//	"call mcount_stext_lock\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
@@ -235,6 +237,7 @@ asm(
 ".align 4\n"
 ".globl __down_failed_trylock\n"
 "__down_failed_trylock:\n\t"
+//	"call mcount_stext_lock\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
@@ -256,6 +259,7 @@ asm(
 ".align 4\n"
 ".globl __up_wakeup\n"
 "__up_wakeup:\n\t"
+//	"call mcount_stext_lock\n\t"
 	"pushl %eax\n\t"
 	"pushl %edx\n\t"
 	"pushl %ecx\n\t"
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/lib/Makefile 320-kcg/arch/i386/lib/Makefile
--- 310-irqbal_fast/arch/i386/lib/Makefile	2003-06-19 14:41:15.000000000 -0700
+++ 320-kcg/arch/i386/lib/Makefile	2003-10-21 11:52:15.000000000 -0700
@@ -10,3 +10,4 @@ lib-y = checksum.o delay.o \
 lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
 lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
 lib-$(CONFIG_DEBUG_IOVIRT)  += iodebug.o
+lib-$(CONFIG_MCOUNT) += mcount.o
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/arch/i386/lib/mcount.S 320-kcg/arch/i386/lib/mcount.S
--- 310-irqbal_fast/arch/i386/lib/mcount.S	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/arch/i386/lib/mcount.S	2003-10-21 11:52:15.000000000 -0700
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2000 SGI
+ *
+ * Written by Dimitris Michailidis dimitris@sgi.com
+ *
+ * This file implements mcount(), which is used to collect profiling data.
+ * We provide several variants to accommodate different types of callers at
+ * the lowest possible overhead.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+
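+/*
+ * MCOUNT_HEAD saves %ecx (it may carry a FASTCALL argument), bails out
+ * if no handler is installed, and otherwise saves %eax/%edx and loads
+ * the instrumented function's address ("mcount's parent") into %edx.
+ * MCOUNT_TAIL calls the handler with the caller's PC in %eax and the
+ * instrumented function's PC in %edx (cg_record_arc() takes its two
+ * arguments regparm(2) in exactly those registers), then restores the
+ * saved registers.
+ */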
+#define MCOUNT_HEAD  \
+	pushl %ecx          /* We must protect the arguments of FASTCALLs */; \
+	movl mcount_hook, %ecx;  \
+	testl %ecx, %ecx;  \
+	jz 1f;  \
+	pushl %eax;  \
+	pushl %edx;  \
+        movl 12(%esp), %edx  /* mcount()'s parent */
+
+#define MCOUNT_TAIL \
+	call *%ecx;  \
+	popl %edx;  \
+	popl %eax;  \
+1:	popl %ecx
+
+/*
+ * This is the main variant and is called by C code.  GCC's -pg option
+ * automatically instruments every C function with a call to this.
+ */
+ENTRY(mcount)
+#if defined(CONFIG_MCOUNT)
+	MCOUNT_HEAD
+#ifdef CONFIG_FRAME_POINTER
+        movl 4(%ebp), %eax  /* mcount()'s parent's parent */
+#endif
+	MCOUNT_TAIL
+#endif
+	ret
+
+/*
+ * This variant is used by assembly functions.  Must be inserted by hand.
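+ * Place the call first, before the function pushes anything, so that
+ * 16(%esp) below is still the caller's return address, e.g. (foo stands
+ * for any assembly function):
+ *	ENTRY(foo)
+ *		call mcount_asm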
+ */
+ENTRY(mcount_asm)
+#if defined(CONFIG_MCOUNT)
+	MCOUNT_HEAD
+        movl 16(%esp), %eax  /* mcount()'s parent's parent */
+	MCOUNT_TAIL
+#endif
+	ret
+/*
+ * This variant is used by assembly functions in section .stext.lock.
+ * Must be inserted by hand.
+ */
+ENTRY(mcount_stext_lock)
+#if defined(CONFIG_MCOUNT)
+	MCOUNT_HEAD
+        movl 16(%esp), %eax  /* mcount()'s parent's parent */
+	addl 1(%eax), %eax   /* %eax points at the jmp back to the real */
+	leal 5(%eax), %eax   /* caller: target = jmp addr + disp32 + 5  */
+	MCOUNT_TAIL
+#endif
+	ret
+
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/fs/proc/proc_misc.c 320-kcg/fs/proc/proc_misc.c
--- 310-irqbal_fast/fs/proc/proc_misc.c	2003-10-21 11:19:06.000000000 -0700
+++ 320-kcg/fs/proc/proc_misc.c	2003-10-21 11:52:15.000000000 -0700
@@ -51,6 +51,10 @@
 #include <asm/tlb.h>
 #include <asm/div64.h>
 
+#ifdef CONFIG_MCOUNT
+#include <linux/mcount.h>
+#endif
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 /*
@@ -854,4 +858,13 @@ void __init proc_misc_init(void)
 			entry->proc_fops = &ppc_htab_operations;
 	}
 #endif
+#ifdef CONFIG_MCOUNT
+	{
+		extern struct file_operations mcount_operations;
+		extern struct proc_dir_entry *mcount_pde;
+		mcount_pde = create_proc_entry("mcount", S_IRUGO|S_IWUSR, NULL);
+		if (mcount_pde)
+			mcount_pde->proc_fops = &mcount_operations;
+	}
+#endif
 }
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/include/linux/mcount.h 320-kcg/include/linux/mcount.h
--- 310-irqbal_fast/include/linux/mcount.h	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/include/linux/mcount.h	2003-10-21 11:52:15.000000000 -0700
@@ -0,0 +1,77 @@
+/*
+ * include/linux/mcount.h
+ *
+ * Implementation of kernel mcount handler and supporting functions.
+ * 
+ * Code based on kernprof http://oss.sgi.com/projects/kernprof/
+ * Copyright (C) SGI 1999, 2000, 2001
+ * Written by Dimitris Michailidis (dimitris@engr.sgi.com)
+ * Modified by John Hawkes (hawkes@engr.sgi.com)
+ * Contributions from Niels Christiansen (nchr@us.ibm.com)
+ * Adapted for stand-alone call graphing by Adam Litke (agl@us.ibm.com)
+ */
+
+#ifndef __MCOUNT_H
+#define __MCOUNT_H
+
+#include <linux/kernel.h> 
+#include <linux/config.h> 
+#include <linux/proc_fs.h>
+
+#define DFL_PC_RES 4  /* default PC resolution for this platform */
+#define CG_MAX_ARCS (1 << (8 * sizeof(short)))
+#define FUNCTIONPC(func)        (*(unsigned long *)&(func))
+
+#define pc_out_of_range(pc)    \
+        ((pc) < (unsigned long) &_stext || (pc) >= (unsigned long) &_etext)
+
+#if CPU != 386
+/* Like the above but also returns the result */
+static __inline__ int atomic_add_return(int i, atomic_t *v)
+{
+	register int oldval;
+	__asm__ __volatile__(
+		LOCK "xaddl %2,%0"
+		:"=m" (v->counter), "=r" (oldval)
+		:"1" (i), "m" (v->counter) : "memory");
+	return oldval + i;
+}
+#endif
+
+struct prof_mem_map
+{
+	unsigned long     kernel_buckets;   /* number of kernel buckets */
+	//unsigned long     module_buckets;   /* number of module buckets */
+	unsigned long     nr_cpus;          /* number of processors whether profiled or not */
+	unsigned long     cg_from_size;     /* size of one cg_from array */
+	unsigned long     cg_to_size;       /* size of one cg_to array */
+	unsigned long     cg_to_offset;     /* offset of cg_to array */
+	unsigned long     kernel_start;     /* lowest text address in kernel */
+	unsigned long     kernel_end;       /* highest text address in kernel */
+	//unsigned long     module_start;     /* lowest text address in all modules */
+	//unsigned long     module_end;       /* highest text address in all modules */
+};
+	
+struct cg_arc_dest {
+	unsigned long address;
+	atomic_t count;
+	unsigned short link;
+	unsigned short pad;
+};
+
+void cg_record_arc(unsigned long frompc, unsigned long selfpc) __attribute__((regparm(2)));
+
+int mcount_init(void); 
+
+ssize_t mcount_write(struct file * file, const char * buf,
+		       size_t count, loff_t *ppos);
+
+ssize_t mcount_read(struct file * file, char * buf,
+		       size_t count, loff_t *ppos);
+
+static struct file_operations mcount_operations = {
+	.write	= mcount_write,
+	.read	= mcount_read,
+};
+
+#endif
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/kernel/Makefile 320-kcg/kernel/Makefile
--- 310-irqbal_fast/kernel/Makefile	2003-10-21 11:45:20.000000000 -0700
+++ 320-kcg/kernel/Makefile	2003-10-21 11:53:37.000000000 -0700
@@ -27,6 +27,13 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_X86_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_MCOUNT) += mcount.o
+
+ifeq ($(CONFIG_MCOUNT),y)
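+# mcount.c implements the mcount handler itself; strip -pg so the
+# profiler is not instrumented and cannot recurse into itself.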
+$(obj)/mcount.o: $(src)/mcount.c
+	$(CC) $(subst -pg,,$(CFLAGS)) -c -o $(obj)/mcount.o $(src)/mcount.c
+endif
+
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/kernel/mcount.c 320-kcg/kernel/mcount.c
--- 310-irqbal_fast/kernel/mcount.c	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/kernel/mcount.c	2003-10-21 11:52:15.000000000 -0700
@@ -0,0 +1,197 @@
+/*
+ * kernel/mcount.c
+ *
+ * Implementation of kernel mcount handler and supporting functions.
+ * 
+ * Code based on kernprof http://oss.sgi.com/projects/kernprof/
+ * Copyright (C) SGI 1999, 2000, 2001
+ * Written by Dimitris Michailidis (dimitris@engr.sgi.com)
+ * Modified by John Hawkes (hawkes@engr.sgi.com)
+ * Contributions from Niels Christiansen (nchr@us.ibm.com)
+ * Adapted for stand-alone call graphing by Adam Litke (agl@us.ibm.com)
+ */
+
+#include <linux/mcount.h>
+#include <linux/vmalloc.h>
+#include <asm/uaccess.h>
+#include <asm/percpu.h>
+#include <linux/kallsyms.h>
+
+void UNKNOWN_KERNEL(void) {} /* Dummy functions to make profiles more */
+void UNKNOWN_MODULE(void) {} /* descriptive */
+
+unsigned int mcount_shift, PC_resolution = DFL_PC_RES;
+
+char* memory_start = NULL;
+unsigned short *cg_from_base = NULL;
+struct cg_arc_dest *cg_to_base = NULL;
+int cg_arc_overflow = 0; /* set when no new arcs can be added to the call graph */
+int n_buckets = 0;
+size_t mem_needed;   /* space needed for the call graph buffers */
+extern char _stext, _etext, _sinittext, _einittext;
+
+void (*mcount_hook)(unsigned long, unsigned long) = NULL;
+struct proc_dir_entry *mcount_pde;
+
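+/*
+ * Profiling buffer layout: a prof_mem_map header, then one cg_from
+ * index array per CPU, then one cg_to arc array per CPU.  The whole
+ * buffer is exported to user space through /proc/mcount.
+ */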
+static int mcount_alloc_mem(void)
+{
+	unsigned long cg_from_size, cg_to_size;
+	size_t text_size = (unsigned long) &_etext - (unsigned long) &_stext;
+	struct prof_mem_map *memory_map;
+	
+	for (mcount_shift = 0; (1 << mcount_shift) < PC_resolution; mcount_shift++);
+	n_buckets = text_size >> mcount_shift;
+	cg_from_size = n_buckets * sizeof(short);
+	cg_to_size = CG_MAX_ARCS * sizeof(struct cg_arc_dest);
+	mem_needed = sizeof(struct prof_mem_map) + 
+		((cg_from_size + cg_to_size) * num_online_cpus());
+	if ((memory_start = vmalloc(mem_needed)) == NULL) {
+		return -ENOMEM;
+	}
+	memset(memory_start, 0, mem_needed);
+	
+	cg_from_base = (unsigned short *) (memory_start + sizeof(struct prof_mem_map));
+	cg_to_base = (struct cg_arc_dest *) (memory_start + sizeof(struct prof_mem_map) +
+			(cg_from_size * num_online_cpus()));
+
+	memory_map = (struct prof_mem_map*) memory_start;
+	memory_map->kernel_buckets = n_buckets;
+	memory_map->nr_cpus = num_online_cpus();
+	memory_map->cg_from_size = cg_from_size;
+	memory_map->cg_to_size = cg_to_size;
+	memory_map->cg_to_offset = cg_from_size * num_online_cpus();
+	memory_map->kernel_start = (unsigned long)&_stext;
+	memory_map->kernel_end = (unsigned long)&_etext;
+	return 0;
+}
+
+static void mcount_free_mem(void)
+{
+	vfree(memory_start);
+	memory_start = NULL;
+}
+
+/* Record an arc traversal in the call graph.  Called by mcount().  SMP safe */
+void cg_record_arc(unsigned long frompc, unsigned long selfpc)
+{
+#ifndef __HAVE_ARCH_CMPXCHG
+	static spinlock_t cg_record_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long flags;
+#endif
+	int toindex, fromindex, cpu;
+	unsigned short *q, *cg_from;
+	struct cg_arc_dest *p, *cg_to;
+
+	cpu = smp_processor_id();
+	
+	cg_from = &cg_from_base[n_buckets * cpu];
+	cg_to   = &cg_to_base[CG_MAX_ARCS * cpu];
+	
+	if (pc_out_of_range(frompc))
+		fromindex = (FUNCTIONPC(UNKNOWN_KERNEL) - (unsigned long) &_stext)
+				>> mcount_shift;
+	else 
+		fromindex = (frompc - (unsigned long) &_stext) >> mcount_shift;
+	q = &cg_from[fromindex];
+	
+	/* Easy case: the arc is already in the call graph */
+	for (toindex = *q; toindex != 0; ) {
+		p = &cg_to[toindex];
+		if (p->address == selfpc) {
+			atomic_inc(&p->count);
+			return;
+		}
+		toindex = p->link;
+	}
+	/*
+	 * No luck.  We need to add a new arc.  Since cg_to[0] is unused,
+	 * we use cg_to[0].count to keep track of the next available arc.
+	 */
+	if (cg_arc_overflow) {
+		return;
+	}
+	toindex = atomic_add_return(1, &cg_to->count);
+	if (toindex >= CG_MAX_ARCS) {
+		/*
+		 * We have run out of space for arcs.  We'll keep incrementing
+		 * the existing ones but we won't try to add any more.
+		 */
+		cg_arc_overflow = 1;
+		atomic_set(&cg_to->count, CG_MAX_ARCS - 1);
+		return;
+	}
+	/*
+	 * We have a secured slot for a new arc and all we need to do is
+	 * initialize it and add it to a hash bucket.  We use compare&swap, if
+	 * possible, to avoid any spinlocks whatsoever.
+	 */
+	p = &cg_to[toindex];
+	p->address = selfpc;
+	atomic_set(&p->count, 1);
+#ifdef __HAVE_ARCH_CMPXCHG
+	do {
+		p->link = *q;
+	} while (cmpxchg(q, p->link, toindex) != p->link);
+#else
+	spin_lock_irqsave(&cg_record_lock, flags);
+	p->link = *q;
+	*q = toindex;
+	spin_unlock_irqrestore(&cg_record_lock, flags);
+#endif
+	return;
+}
+
+int mcount_start(void)
+{
+	if (!memory_start) {
+		if(mcount_alloc_mem())
+			return -ENOMEM;
+		mcount_pde->size = mem_needed;
+	}
+	mcount_hook = cg_record_arc;
+	return 0;
+}
+
+int mcount_stop(void)
+{
+	mcount_hook = NULL;
+	return 0;
+}
+
+int mcount_cleanup(void)
+{
+	mcount_stop();
+	mcount_pde->size = 0;
+	mcount_free_mem();
+	return 0;
+}
+
+ssize_t mcount_read(struct file * file, char * buf,
+			size_t count, loff_t *ppos)
+{
+	if (*ppos >= mcount_pde->size)
+		return 0;
+	if (count > mcount_pde->size - *ppos)
+		count = mcount_pde->size - *ppos;
+	if (copy_to_user(buf, memory_start + *ppos, count))
+		return -EFAULT;
+	*ppos += count;
+	return count;
+}
+
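+/* /proc/mcount control: '0' clears and frees, '1' stops, '2' starts */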
+ssize_t mcount_write(struct file * file, const char * buf,
+		       size_t count, loff_t *ppos)
+{
+	char cmd;
+	int ret;
+
+	if (get_user(cmd, buf))
+		return -EFAULT;
+
+	switch (cmd) {
+		case '0':
+			ret = mcount_cleanup();
+			break;
+		case '1':
+			ret = mcount_stop();
+			break;
+		case '2':
+			ret = mcount_start();
+			break;
+		default:
+			ret = -EINVAL;
+	}
+	return (ret == 0) ? count : ret;
+}
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/scripts/readcg-0.1/Makefile 320-kcg/scripts/readcg-0.1/Makefile
--- 310-irqbal_fast/scripts/readcg-0.1/Makefile	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/scripts/readcg-0.1/Makefile	2003-10-09 10:14:18.000000000 -0700
@@ -0,0 +1,10 @@
+# Makefile for readcg - Kernel Call Graph
+# Adam Litke (agl@us.ibm.com)
+
+CC = /usr/bin/gcc
+
+readcg: clean
+	$(CC) -Wall -o readcg readcg.c
+
+clean:
+	rm -f readcg
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/scripts/readcg-0.1/README 320-kcg/scripts/readcg-0.1/README
--- 310-irqbal_fast/scripts/readcg-0.1/README	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/scripts/readcg-0.1/README	2003-10-14 13:55:32.000000000 -0700
@@ -0,0 +1,27 @@
+README - readcg-0.1
+
+10-14-03: Initial Release
+
+readcg is the userspace utility for printing kernel call graphs.
+This tool is in its early stages and most certainly has bugs.  
+Please email me <aglitke@us.ibm.com> to report bugs or to submit 
+patches.
+
+QUICK START:
+
+(1.) Obtain readcg-xx.tar.gz and kcg-X.X.X-V.patch from the lse
+     project page (www.sf.net/projects/lse).
+
+(2.) Patch your kernel and build it.  Make sure 'Generate 
+     function call graph' in the 'Kernel Hacking' section is 
+     turned on.  Install your kernel and boot it.
+
+(3.) Unpack readcg and descend into the readcg directory.
+     Build the tool by typing 'make'.
+
+(4.) To generate a call graph do the following:
+        readcg -c                # Clear counters
+        readcg -e                # Enable profiling
+           <Run benchmark, etc.>
+        readcg -d                # Disable profiling
+        readcg -m <System.map>   # Generate call graph
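+
+readcg drives the profiler through /proc/mcount; the same commands can
+be issued by hand (as root):
+        echo 2 > /proc/mcount    # enable profiling  (readcg -e)
+        echo 1 > /proc/mcount    # disable profiling (readcg -d)
+        echo 0 > /proc/mcount    # clear data and free buffers (readcg -c)
+
+readcg -m prints one block per function, listing its callers (with arc
+traversal counts) above it and its callees below it.  Schematically,
+with foo standing in for a real function name:
+        ========================================
+           <count> a_caller_of_foo
+        foo
+           <count> a_function_called_by_foo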
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/scripts/readcg-0.1/readcg.c 320-kcg/scripts/readcg-0.1/readcg.c
--- 310-irqbal_fast/scripts/readcg-0.1/readcg.c	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/scripts/readcg-0.1/readcg.c	2003-10-14 11:26:10.000000000 -0700
@@ -0,0 +1,451 @@
+/*
+ *  readcg.c - control kernel call graphing
+ *
+ *  Copyright (C) 1999-2002 SGI
+ *
+ *  Written by Dimitris Michailidis (dimitris@engr.sgi.com)
+ *  Modifications by John Hawkes (hawkes@engr.sgi.com)
+ *  Contributions from Niels Christiansen (nchr@us.ibm.com)
+ *  Contributions from Ethan Solomita (ethan@cs.columbia.edu)
+ *  Adapted by Adam Litke (agl@us.ibm.com)
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation; either version 2 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/gmon.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <sched.h>
+#include "readcg.h"
+
+
+static char optstring[]="edcm:";
+char *prgname;
+struct prof_mem_map *memory_map;
+struct addr_to_name_map *a2n;
+unsigned long num_entries;
+
+void err_exit(const char *s) 
+{
+	perror(s);
+	exit(1);
+}
+
+/*
+ * Create a address -> name mapping using a System.map file
+ */
+void parse_map(char *map_file) {
+	FILE * map;
+	char mapline[S_LEN];
+	unsigned long address, i=0;
+	char mode[2], fn_name[NAME_LEN];
+
+	map = fopen(map_file, "r");
+
+	if (!map)
+		err_exit("Cannot open map file");
+	
+	while(fgets(mapline, S_LEN, map)) {
+		sscanf(mapline, "%lx %s %s", &address, mode, fn_name);
+		if (*mode == 'T' || *mode == 't' ||
+		    *mode == 'W' || *mode == 'w') 
+			++i;
+	}
+	num_entries = i+1;
+	a2n = calloc(i+1, sizeof(struct addr_to_name_map));
+	fseek(map, 0, SEEK_SET);		
+	
+	i = 0;
+	while(fgets(mapline, S_LEN, map) && i < num_entries) {
+		sscanf(mapline, "%lx %s %s", &address, mode, fn_name);
+		if (*mode != 'T' && *mode != 't' &&
+		    *mode != 'W' && *mode != 'w') continue;
+		a2n[i].addr = address;
+		strncpy(a2n[i].name, fn_name, NAME_LEN);
+		a2n[i].name[NAME_LEN - 1] = '\0';
+		i++;
+	}
+}
+
+/*
+ * Simple array binary search to retrieve function name given an address
+ */
+void search_map(struct addr_to_name_map *map, unsigned long address, char *name){
+	long low, high, bar;
+
+	low = 0;
+	high = num_entries - 2;	/* the final entry is a zeroed sentinel */
+
+	while(low <= high) {
+		bar = low + ((high - low) / 2);
+		if (map[bar].addr == address ||
+		    (map[bar].addr < address &&
+		     map[bar+1].addr > address)) {
+			strcpy(name, map[bar].name);
+			return;
+		} else if (map[bar].addr > address)
+			high = bar - 1;
+		else
+			low = bar + 1;
+	}
+	*name = '\0';
+	return;
+}	
+
+/*
+ * Read the contents of a file into a dynamically allocated buffer.
+ * Also return the size of the buffer.
+ */
+char *read_file(const char *name, size_t off, size_t *lenp)
+{
+	char *buf;
+	int fd;
+	off_t end;
+	size_t len;
+
+	if ((fd = open(name, O_RDONLY)) < 0 || 
+	    (end = lseek(fd, off, SEEK_END)) < 0 || 
+	    lseek(fd, 0, SEEK_SET) < 0)
+		err_exit(name);
+
+	len = (size_t) end;
+	if ((buf = malloc(len)) == NULL)
+		err_exit("malloc");
+
+	lseek(fd, off, SEEK_SET);
+	read(fd, buf, len);
+
+	close(fd);
+	if (lenp) *lenp = len;
+	return buf;
+}
+
+/*
+ * Read header from beginning of call graph data
+ */
+void get_memory_map(struct prof_mem_map *map)
+{
+	int fd;
+	off_t end;
+	size_t mapsz = sizeof(struct prof_mem_map);
+	
+	if ((fd = open(PROC_FILE, O_RDONLY)) < 0 || 
+	    (end = lseek(fd, 0, SEEK_END)) < 0 || 
+	    lseek(fd, 0, SEEK_SET) < 0)
+		err_exit(PROC_FILE);
+	if(read(fd, map, mapsz) != mapsz)
+		err_exit(PROC_FILE);
+	close(fd);
+}
+
+/*
+ * Insert a destination address record into list
+ */
+struct cg_entry *insert_to_record(struct cg_entry *list, 
+				  kaddress_t addr, 
+				  unsigned long count,
+				  int merge)
+{
+	struct cg_entry *cur, *prev, *new;
+	char fn_name[NAME_LEN];
+
+	new = malloc(sizeof(struct cg_entry));
+	if (!new)
+		err_exit("malloc");
+
+	search_map(a2n, addr, fn_name);
+	strncpy(new->name, fn_name, NAME_LEN);
+	new->count = count;
+	new->next = NULL;
+	
+	if (list == NULL) {
+		list = new;
+		return list;
+	}
+	
+	if (new->count > list->count) {
+		new->next = list;
+		list = new;
+		return list;
+	}		
+	
+	cur = list->next;
+	prev = list;
+	while (cur && new->count < cur->count) {
+		prev = cur;
+		cur = cur->next;
+	}
+	new->next = cur;
+	prev->next = new;
+	return list;
+}
+
+/*
+ * Return the correct from address record if it exists, else create one
+ */
+struct cg_entry *get_from_record(struct cg_entry *list, char *name)
+{
+	struct cg_entry *prev, *cur;
+	
+	prev = NULL;
+	cur = list;
+	while (cur && strcmp(cur->name, name)) {
+		prev = cur;
+		cur = cur->next;
+	}
+	
+	if (!cur) {
+		cur = malloc(sizeof(struct cg_entry));
+		if (!cur)
+			err_exit("malloc");
+		
+		strncpy(cur->name, name, NAME_LEN);
+		cur->next = NULL;
+		cur->list = NULL;
+		if (prev)
+			prev->next = cur;
+	}
+	return cur;
+}	
+
+/*
+ * Print the call graph according to the following format:
+ * 			nnnn parent_to_base_function
+ * base_function
+ * 			nnnn child_of_base_function
+ */
+void output_cg_data(struct cg_entry *from_head, struct cg_entry *to_head)
+{
+	struct cg_entry *from;
+	struct cg_entry *to;
+
+	for(from = from_head; from; from = from->next) {
+		printf("%s\n", "========================================");
+		/* Print parent functions */
+		to = get_from_record(to_head, from->name);
+		for (to = to->list; to; to = to->next)
+			printf("   %10lu %s\n", to->count, to->name);			
+		/* Current function */
+		printf("%s\n", from->name);
+
+		/* Print children */
+		for(to = from->list; to; to = to->next) {
+			printf("   %10lu %s\n", to->count, to->name);
+		}
+	}
+}
+
+/*
+ * Get the call graph from /proc.  The function has two phases:
+ * 	(1) Combine per-cpu graphs into a single graph
+ * 	(2) Create src->dst mapping and a reverse mapping
+ */
+struct cg_entry *get_gprof_call_graph()
+{
+   int from_index, from_len, step;
+   unsigned short *fromc, *froms;
+   struct cg_arc_dest *toc, *tos;
+   char *buf = NULL;
+   char *b;
+   char *p;
+   int cpu;
+   int to_index;
+   int i;
+   int curix;
+   address_t frompc, lowpc, highpc;
+   long total_count = 0L;
+   long cg_input = 0L;
+   long cg_merged = 0L;
+   long cg_base = 0L;
+   long cg_records = 0L;
+   long lost_ones = 0L;
+   char name[NAME_LEN];
+   struct cg_entry *fcur = NULL;
+   struct cg_entry *from_head, *to_head;
+   
+   lowpc = memory_map->kernel_start;
+   highpc = memory_map->kernel_end;
+   
+   step = PROF_GET_PC_RES;
+
+   buf = read_file(PROC_FILE, sizeof (struct prof_mem_map), NULL);
+   froms = (unsigned short *) buf;
+   from_len = (highpc - lowpc) / step;
+   p = buf + memory_map->cg_to_offset;
+   tos = (struct cg_arc_dest *)p;
+
+   b = buf;
+   curix = tos[0].count + 1;
+
+   for (cpu = 0; cpu < memory_map->nr_cpus; cpu++)
+   {
+      b = buf + memory_map->cg_from_size * cpu;
+      fromc = (unsigned short *)b;
+      p = buf + memory_map->cg_to_offset + memory_map->cg_to_size * cpu;
+      toc = (struct cg_arc_dest *)p;
+      for (from_index = 0; from_index < from_len; ++from_index)
+      {
+         frompc = lowpc + from_index * step;
+         for (to_index = fromc[from_index]; to_index != 0; to_index = toc[to_index].link)
+         {
+	    cg_input++;
+            if (!cpu)
+            {
+               cg_base++;
+               continue;
+            }
+            for (i = froms[from_index]; i != 0; i = tos[i].link)
+            {
+               if (tos[i].address == toc[to_index].address)
+               {
+                  cg_merged++;
+                  tos[i].count += toc[to_index].count;
+                  break;
+               }
+            }
+            if (i == 0)
+            {
+               if (curix >= CG_MAX_ARCS)
+                  lost_ones++;
+               else
+               {
+                  tos[curix].link = froms[from_index];
+                  tos[curix].address = toc[to_index].address;
+                  tos[curix].count = toc[to_index].count;
+                  froms[from_index] = curix++;
+               }
+            }
+         }
+      }
+   }
+
+// Merge completed
+
+   from_head = NULL;
+   to_head = NULL;
+   for (from_index = 0; from_index < from_len; ++from_index)
+   {
+      kaddress_t src, dst;
+      
+      frompc = lowpc + from_index * step;
+      src = (kaddress_t)frompc;
+      search_map(a2n, src, name);
+      
+      for (to_index = froms[from_index]; to_index != 0;
+           to_index = tos[to_index].link)
+      {
+	 
+         total_count += tos[to_index].count;
+         cg_records++;
+	 dst = (kaddress_t)tos[to_index].address;
+	 if (dst) {
+		char name2[NAME_LEN];
+		/* Create src->dst mapping */
+		search_map(a2n, src, name);
+		fcur = get_from_record(from_head, name);
+		if (!from_head) from_head = fcur;
+		fcur->list = insert_to_record(fcur->list, dst, tos[to_index].count, 1);
+		/* Create dst->src mapping */
+		search_map(a2n, dst, name2);
+		fcur = get_from_record(to_head, name2);
+		if (!to_head) to_head = fcur;
+		fcur->list = insert_to_record(fcur->list, src, tos[to_index].count, 1);
+		
+	 }
+      }
+   }
+   
+   free(buf);
+   from_head->list->list = to_head; /* Dirty dirty hack to return both lists */
+   return from_head;
+}
+
+/* generate call graph information from /proc/mcount */
+void output_cg_profile()
+{
+	struct cg_entry *from_head, *to_head;
+
+	memory_map = malloc(sizeof(struct prof_mem_map) + 4);
+	if (!memory_map)
+		err_exit("malloc");
+	
+	get_memory_map(memory_map);
+
+	from_head = get_gprof_call_graph();
+	to_head = from_head->list->list;   /* See dirty dirty hack above */
+	output_cg_data(from_head, to_head);
+}
+
+void profile_cmd(char cmd)
+{
+	FILE * file;
+
+	file = fopen(PROC_FILE, "r+");
+	if (!file)
+		err_exit(PROC_FILE);
+	
+	if (fputc(cmd, file) != cmd)
+		err_exit(PROC_FILE);
+	
+	fclose(file);
+	exit(0);
+}
+
+void usage()
+{
+	fprintf(stderr, "Usage: %s [ -e | -d | -c | -m <System.map>]\n", prgname);
+	fprintf(stderr, "\t-m <System.map>  Print profile\n");
+	fprintf(stderr, "\t-e  Enable profiler\n");
+	fprintf(stderr, "\t-d  Disable profiler\n");
+	fprintf(stderr, "\t-c  Clear profile data\n");
+	exit(1);
+}
+
+int main (int argc, char **argv)
+{
+	int c;
+	char *mapfile = NULL;
+	prgname = argv[0];
+	
+	while ((c = getopt(argc, argv, optstring)) != -1) {
+		switch(c) {
+		case 'm':
+			mapfile = optarg;
+			break;
+		case 'e':
+			profile_cmd(MCOUNT_ENABLE);
+		case 'd':
+			profile_cmd(MCOUNT_DISABLE);
+		case 'c':
+			profile_cmd(MCOUNT_CLEAR);
+		default:
+			usage();
+		}
+	}
+	if (!mapfile) usage();
+	
+	parse_map(mapfile);
+	output_cg_profile();
+	return 0;
+}
diff -purN -X /home/mbligh/.diff.exclude 310-irqbal_fast/scripts/readcg-0.1/readcg.h 320-kcg/scripts/readcg-0.1/readcg.h
--- 310-irqbal_fast/scripts/readcg-0.1/readcg.h	1969-12-31 16:00:00.000000000 -0800
+++ 320-kcg/scripts/readcg-0.1/readcg.h	2003-10-09 10:30:17.000000000 -0700
@@ -0,0 +1,65 @@
+#ifndef __READCG_H
+#define __READCG_H
+
+#define VERSION "0.1"
+
+#define CG_MAX_ARCS (1 << (8 * sizeof(short))) 
+#define PROF_GET_PC_RES 4
+#define S_LEN 128
+#define NAME_LEN 50
+
+//#define PTR_ALIGNED  __attribute__ ((aligned (__alignof__ (char *))))
+
+#define PROC_FILE      "/proc/mcount"
+#define MCOUNT_ENABLE  '2'
+#define MCOUNT_DISABLE '1'
+#define MCOUNT_CLEAR   '0'
+
+/* User space address */
+typedef unsigned long address_t;
+
+/* Kernel address */
+#if defined(__ppc64__)
+typedef unsigned long long kaddress_t;
+#else
+typedef address_t kaddress_t;
+#endif
+
+/*
+ * Stores memory mapping of profiler buckets
+ */
+struct prof_mem_map
+{
+   unsigned long     kernel_buckets;   /* number of kernel buckets */
+   //unsigned long     module_buckets;   /* number of module buckets */
+   unsigned long     nr_cpus;          /* number of processors whether profiled or not */
+   unsigned long     cg_from_size;     /* size of one cg_from array */
+   unsigned long     cg_to_size;       /* size of one cg_to array */
+   unsigned long     cg_to_offset;     /* offset of cg_to array */
+   unsigned long     kernel_start;     /* lowest text address in kernel */
+   unsigned long     kernel_end;       /* highest text address in kernel */
+   //unsigned long     module_start;     /* lowest text address in all modules */
+   //unsigned long     module_end;       /* highest text address in all modules */
+};
+
+struct addr_to_name_map {
+	unsigned long addr;
+	char name[NAME_LEN];
+};
+
+/* This must match the kernel's definition. */
+struct cg_arc_dest {
+   kaddress_t address;
+   int count;
+   unsigned short link;
+   unsigned short pad;
+};
+
+struct cg_entry {
+	char name[NAME_LEN];
+	unsigned long count;
+	struct cg_entry *list;
+	struct cg_entry *next;
+};
+
+#endif /* __READCG_H */