From d6790ee609a62386c2803cbe74b84354af99bb73 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 20 Jun 2013 12:49:18 +0200
Subject: [PATCH 001/119] Add LITMUS^RT directory

Hook up litmus/ with the kernel and add extra version.
---
 Makefile         | 2 +-
 arch/arm/Kconfig | 3 +++
 arch/x86/Kconfig | 2 ++
 litmus/Kconfig   | 3 +++
 litmus/Makefile  | 3 +++
 5 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 litmus/Kconfig
 create mode 100644 litmus/Makefile

diff --git a/Makefile b/Makefile
index 3071428..dd0cb2e 100644
--- a/Makefile
+++ b/Makefile
@@ -733,7 +733,7 @@ export mod_sign_cmd
 
 
 ifeq ($(KBUILD_EXTMOD),)
-core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ea6ec7e..131ec84 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2269,3 +2269,6 @@ source "crypto/Kconfig"
 source "lib/Kconfig"
 
 source "arch/arm/kvm/Kconfig"
+
+source "litmus/Kconfig"
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe120da..bd67fd1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2346,3 +2346,5 @@ source "crypto/Kconfig"
 source "arch/x86/kvm/Kconfig"
 
 source "lib/Kconfig"
+
+source "litmus/Kconfig"
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 0000000..382b2e4
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,3 @@
+menu "LITMUS^RT"
+
+endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 0000000..f0ed31f
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,3 @@
+#
+# Makefile for LITMUS^RT
+#
-- 
1.8.1.2


From efbaae0016a8bc98cc6d24e17ee242a52b356f17 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 11:41:27 +0200
Subject: [PATCH 002/119] Feather-Trace: add platform independent
 implementation

This patch adds the simple fallback implementation and creates dummy
hooks in the x86 and ARM Kconfig files.
---
 arch/arm/Kconfig                |   3 +
 arch/x86/Kconfig                |   3 +
 include/litmus/feather_buffer.h | 118 ++++++++++++++++++++++++++++++++++++++++
 include/litmus/feather_trace.h  |  69 +++++++++++++++++++++++
 litmus/Kconfig                  |  25 +++++++++
 litmus/Makefile                 |   2 +
 litmus/ft_event.c               |  43 +++++++++++++++
 7 files changed, 263 insertions(+)
 create mode 100644 include/litmus/feather_buffer.h
 create mode 100644 include/litmus/feather_trace.h
 create mode 100644 litmus/ft_event.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 131ec84..ecfd735 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2270,5 +2270,8 @@ source "lib/Kconfig"
 
 source "arch/arm/kvm/Kconfig"
 
+config ARCH_HAS_FEATHER_TRACE
+	def_bool n
+
 source "litmus/Kconfig"
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bd67fd1..0216c93 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2347,4 +2347,7 @@ source "arch/x86/kvm/Kconfig"
 
 source "lib/Kconfig"
 
+config ARCH_HAS_FEATHER_TRACE
+	def_bool n
+
 source "litmus/Kconfig"
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 0000000..38de95b
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,118 @@
+#ifndef _FEATHER_BUFFER_H_
+#define _FEATHER_BUFFER_H_
+
+/* requires UINT_MAX and memcpy */
+
+#define SLOT_FREE	0
+#define	SLOT_BUSY 	1
+#define	SLOT_READY	2
+
+struct ft_buffer {
+	unsigned int	slot_count;
+	unsigned int	slot_size;
+
+	int 		free_count;
+	unsigned int 	write_idx;
+	unsigned int 	read_idx;
+
+	char*		slots;
+	void*		buffer_mem;
+	unsigned int	failed_writes;
+};
+
+static inline int init_ft_buffer(struct ft_buffer*	buf,
+				 unsigned int 		slot_count,
+				 unsigned int 		slot_size,
+				 char*			slots,
+				 void* 			buffer_mem)
+{
+	int i = 0;
+	if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
+		/* The slot count must divide UINT_MAX + 1 so that when it
+		 * wraps around the index correctly points to 0.
+		 */
+		return 0;
+	} else {
+		buf->slot_count    = slot_count;
+		buf->slot_size     = slot_size;
+		buf->slots         = slots;
+		buf->buffer_mem    = buffer_mem;
+		buf->free_count    = slot_count;
+		buf->write_idx     = 0;
+		buf->read_idx      = 0;
+		buf->failed_writes = 0;
+		for (i = 0; i < slot_count; i++)
+			buf->slots[i] = SLOT_FREE;
+		return 1;
+	}
+}
+
+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
+{
+	int free = fetch_and_dec(&buf->free_count);
+	unsigned int idx;
+	if (free <= 0) {
+		fetch_and_inc(&buf->free_count);
+		*ptr = 0;
+		fetch_and_inc(&buf->failed_writes);
+		return 0;
+	} else {
+		idx  = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
+		buf->slots[idx] = SLOT_BUSY;
+		*ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
+		return 1;
+	}
+}
+
+/* For single writer scenarios, with fewer atomic ops. */
+static inline int ft_buffer_start_single_write(struct ft_buffer* buf, void **ptr)
+{
+	unsigned int idx;
+
+	if (buf->free_count <= 0) {
+		*ptr = 0;
+		/* single writer: no atomicity needed */
+		buf->failed_writes++;
+		return 0;
+	} else {
+		/* free_count is positive, and can only increase since we are
+		 * (by assumption) the only writer accessing the buffer.
+		 */
+
+		idx  = buf->write_idx++ % buf->slot_count;
+		buf->slots[idx] = SLOT_BUSY;
+		*ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
+
+		ft_atomic_dec(&buf->free_count);
+		return 1;
+	}
+}
+
+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
+{
+	unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
+	buf->slots[idx]  = SLOT_READY;
+}
+
+
+/* exclusive reader access is assumed */
+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
+{
+	unsigned int idx;
+	if (buf->free_count == buf->slot_count)
+		/* nothing available */
+		return 0;
+	idx = buf->read_idx % buf->slot_count;
+	if (buf->slots[idx] == SLOT_READY) {
+		memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
+		       buf->slot_size);
+		buf->slots[idx] = SLOT_FREE;
+		buf->read_idx++;
+		fetch_and_inc(&buf->free_count);
+		return 1;
+	} else
+		return 0;
+}
+
+
+#endif
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 0000000..dbeca46
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,69 @@
+#ifndef _FEATHER_TRACE_H_
+#define _FEATHER_TRACE_H_
+
+#include <asm/atomic.h>
+
+int ft_enable_event(unsigned long id);
+int ft_disable_event(unsigned long id);
+int ft_is_event_enabled(unsigned long id);
+int ft_disable_all_events(void);
+
+/* atomic_* functions are inline anyway */
+static inline int fetch_and_inc(int *val)
+{
+	return atomic_add_return(1, (atomic_t*) val) - 1;
+}
+
+static inline int fetch_and_dec(int *val)
+{
+	return atomic_sub_return(1, (atomic_t*) val) + 1;
+}
+
+static inline void ft_atomic_dec(int *val)
+{
+	atomic_sub(1, (atomic_t*) val);
+}
+
+/* Don't use rewriting implementation if kernel text pages are read-only.
+ * Ftrace gets around this by using the identity mapping, but that's more
+ * effort than is warranted right now for Feather-Trace.
+ * Eventually, it may make sense to replace Feather-Trace with ftrace.
+ */
+#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA)
+
+#include <asm/feather_trace.h>
+
+#else /* !__ARCH_HAS_FEATHER_TRACE */
+
+/* provide default implementation */
+#include <linux/timex.h> /* for get_cycles() */
+
+static inline unsigned long long ft_timestamp(void)
+{
+	return get_cycles();
+}
+
+#define feather_callback
+
+#define MAX_EVENTS 1024
+
+extern int ft_events[MAX_EVENTS];
+
+#define ft_event(id, callback) \
+	if (ft_events[id]) callback();
+
+#define ft_event0(id, callback) \
+	if (ft_events[id]) callback(id);
+
+#define ft_event1(id, callback, param) \
+	if (ft_events[id]) callback(id, param);
+
+#define ft_event2(id, callback, param, param2) \
+	if (ft_events[id]) callback(id, param, param2);
+
+#define ft_event3(id, callback, p, p2, p3) \
+	if (ft_events[id]) callback(id, p, p2, p3);
+
+#endif /* __ARCH_HAS_FEATHER_TRACE */
+
+#endif
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 382b2e4..70ddbad 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -1,3 +1,28 @@
 menu "LITMUS^RT"
 
+menu "Tracing"
+
+config FEATHER_TRACE
+	bool "Feather-Trace Infrastructure"
+	default y
+	help
+	  Feather-Trace basic tracing infrastructure. Includes device file
+	  driver and instrumentation point support.
+
+	  There are actually two implementations of Feather-Trace.
+	  1) A slower, but portable, default implementation.
+	  2) Architecture-specific implementations that rewrite kernel .text at runtime.
+
+	  If enabled, Feather-Trace will be based on 2) if available (currently only for x86).
+	  However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case
+	  to avoid problems with write-protected .text pages.
+
+	  Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n.
+
+	  Note that this option only enables the basic Feather-Trace infrastructure;
+	  you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
+	  actually enable any events.
+
+endmenu
+
 endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index f0ed31f..4c6130b 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -1,3 +1,5 @@
 #
 # Makefile for LITMUS^RT
 #
+
+obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 0000000..399a07b
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA)
+/* provide dummy implementation */
+
+int ft_events[MAX_EVENTS];
+
+int ft_enable_event(unsigned long id)
+{
+	if (id < MAX_EVENTS) {
+		ft_events[id]++;
+		return 1;
+	} else
+		return 0;
+}
+
+int ft_disable_event(unsigned long id)
+{
+	if (id < MAX_EVENTS && ft_events[id]) {
+		ft_events[id]--;
+		return 1;
+	} else
+		return 0;
+}
+
+int ft_disable_all_events(void)
+{
+	int i;
+
+	for (i = 0; i < MAX_EVENTS; i++)
+		ft_events[i] = 0;
+
+	return MAX_EVENTS;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+	return 	id < MAX_EVENTS && ft_events[id];
+}
+
+#endif
-- 
1.8.1.2


From 32f9c06ee90e860a7c6fbe8d27b2c219804bfee2 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 11:46:23 +0200
Subject: [PATCH 003/119] Feather-Trace: add x86 binary rewriting
 implementation

This patch adds the x86-specific implementation of Feather-Trace
triggers that works by rewriting jump instructions.
---
 arch/x86/Kconfig                        |   2 +-
 arch/x86/include/asm/feather_trace.h    |  17 +++++
 arch/x86/include/asm/feather_trace_32.h | 115 +++++++++++++++++++++++++++++
 arch/x86/include/asm/feather_trace_64.h | 124 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/Makefile                |   2 +
 arch/x86/kernel/ft_event.c              | 118 ++++++++++++++++++++++++++++++
 litmus/Kconfig                          |   1 +
 7 files changed, 378 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/include/asm/feather_trace.h
 create mode 100644 arch/x86/include/asm/feather_trace_32.h
 create mode 100644 arch/x86/include/asm/feather_trace_64.h
 create mode 100644 arch/x86/kernel/ft_event.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0216c93..171cdc9 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2348,6 +2348,6 @@ source "arch/x86/kvm/Kconfig"
 source "lib/Kconfig"
 
 config ARCH_HAS_FEATHER_TRACE
-	def_bool n
+	def_bool y
 
 source "litmus/Kconfig"
diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h
new file mode 100644
index 0000000..4fd3163
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace.h
@@ -0,0 +1,17 @@
+#ifndef _ARCH_FEATHER_TRACE_H
+#define _ARCH_FEATHER_TRACE_H
+
+#include <asm/msr.h>
+
+static inline unsigned long long ft_timestamp(void)
+{
+	return __native_read_tsc();
+}
+
+#ifdef CONFIG_X86_32
+#include "feather_trace_32.h"
+#else
+#include "feather_trace_64.h"
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h
new file mode 100644
index 0000000..75e81a9
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace_32.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2007-2012 Björn Brandenburg, <bbb@mpi-sws.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Do not directly include this file. Include feather_trace.h instead */
+
+#define feather_callback __attribute__((regparm(3)))  __attribute__((used))
+
+/*
+ * Make the compiler reload any register that is not saved in a cdecl function
+ * call (minus the registers that we explicitly clobber as output registers).
+ */
+#define __FT_CLOBBER_LIST0 "memory", "cc", "eax", "edx", "ecx"
+#define __FT_CLOBBER_LIST1 "memory", "cc", "eax", "ecx"
+#define __FT_CLOBBER_LIST2 "memory", "cc", "eax"
+#define __FT_CLOBBER_LIST3 "memory", "cc", "eax"
+
+#define __FT_TMP1(x) "=d" (x)
+#define __FT_ARG1(x) "0" ((long) (x))
+#define __FT_TMP2(x) "=c" (x)
+#define __FT_ARG2(x) "1" ((long) (x))
+
+#define __FT_ARG3(x) "r" ((long) (x))
+
+#define ft_event(id, callback)                                  \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " call " #callback "                          \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+	    : : : __FT_CLOBBER_LIST0)
+
+#define ft_event0(id, callback)                                 \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+            " movl $" #id  ", %%eax                       \n\t" \
+	    " call " #callback "                          \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+	    : : : __FT_CLOBBER_LIST0)
+
+#define ft_event1(id, callback, param)				\
+	do {							\
+		long __ft_tmp1;					\
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+            " movl $" #id  ", %%eax                       \n\t" \
+	    " call " #callback "                          \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1)				\
+	    : __FT_ARG1(param)					\
+	    : __FT_CLOBBER_LIST1);				\
+	} while (0);
+
+#define ft_event2(id, callback, param, param2)                  \
+	do {							\
+		long __ft_tmp1, __ft_tmp2;			\
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+            " movl $" #id  ", %%eax                       \n\t" \
+	    " call " #callback "                          \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2)	\
+	    : __FT_ARG1(param), __FT_ARG2(param2)		\
+	    : __FT_CLOBBER_LIST2);				\
+	} while (0);
+
+
+#define ft_event3(id, callback, param, param2, param3)		\
+	do {							\
+		long __ft_tmp1, __ft_tmp2;			\
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " subl $4, %%esp                              \n\t" \
+            " movl $" #id  ", %%eax                       \n\t" \
+	    " movl %2, (%%esp)                            \n\t" \
+	    " call " #callback "                          \n\t" \
+	    " addl $4, %%esp                              \n\t" \
+            ".section __event_table, \"aw\"               \n\t" \
+            ".long " #id  ", 0, 1b, 2f                    \n\t" \
+            ".previous                                    \n\t" \
+            "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2)	\
+	    : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3)	\
+	    : __FT_CLOBBER_LIST3);				\
+	} while (0);
diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h
new file mode 100644
index 0000000..5ce49e2
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace_64.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2010 Andrea Bastoni, <bastoni@cs.unc.edu>
+ * Copyright (c) 2012 Björn Brandenburg, <bbb@mpi-sws.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Do not directly include this file. Include feather_trace.h instead */
+
+/* regparm is the default on x86_64 */
+#define feather_callback  __attribute__((used))
+
+#define __FT_EVENT_TABLE(id,from,to) \
+            ".section __event_table, \"aw\"\n\t" \
+	    ".balign 8\n\t" \
+            ".quad " #id  ", 0, " #from ", " #to " \n\t" \
+            ".previous \n\t"
+
+/*
+ * x86_64 caller only owns rbp, rbx, r12-r15;
+ * the callee can freely modify the others.
+ */
+#define __FT_CLOBBER_LIST0	"memory", "cc", "rdi", "rsi", "rdx", "rcx", \
+			"r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST1	"memory", "cc", "rdi", "rdx", "rcx", \
+			"r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST2	"memory", "cc", "rdi", "rcx", \
+			"r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST3	"memory", "cc", "rdi", \
+			"r8", "r9", "r10", "r11", "rax"
+
+/* The registers RDI, RSI, RDX, RCX, R8 and R9 are used for integer and pointer
+ * arguments. */
+
+/* RSI */
+#define __FT_TMP1(x) "=S" (x)
+#define __FT_ARG1(x) "0" ((long) (x))
+
+/* RDX */
+#define __FT_TMP2(x) "=d" (x)
+#define __FT_ARG2(x) "1" ((long) (x))
+
+/* RCX */
+#define __FT_TMP3(x) "=c" (x)
+#define __FT_ARG3(x) "2" ((long) (x))
+
+#define ft_event(id, callback)                                  \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " call " #callback "                          \n\t" \
+            __FT_EVENT_TABLE(id,1b,2f)				\
+            "2:                                           \n\t" \
+        : : : __FT_CLOBBER_LIST0)
+
+#define ft_event0(id, callback)                                 \
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " movq $" #id ", %%rdi			  \n\t" \
+	    " call " #callback "                          \n\t" \
+	    __FT_EVENT_TABLE(id,1b,2f)				\
+            "2:                                           \n\t" \
+        : :  : __FT_CLOBBER_LIST0)
+
+#define ft_event1(id, callback, param)                          \
+	do {							\
+		long __ft_tmp1;					\
+	__asm__ __volatile__(                                   \
+	    "1: jmp 2f                                    \n\t" \
+	    " movq $" #id ", %%rdi			  \n\t" \
+	    " call " #callback "                          \n\t" \
+	    __FT_EVENT_TABLE(id,1b,2f)				\
+	    "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1)				\
+	    : __FT_ARG1(param)					\
+	    : __FT_CLOBBER_LIST1);				\
+	} while (0);
+
+#define ft_event2(id, callback, param, param2)                  \
+	do {							\
+		long __ft_tmp1, __ft_tmp2;			\
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " movq $" #id ", %%rdi			  \n\t" \
+	    " call " #callback "                          \n\t" \
+            __FT_EVENT_TABLE(id,1b,2f)				\
+            "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2)	\
+	    : __FT_ARG1(param), __FT_ARG2(param2)		\
+	    : __FT_CLOBBER_LIST2);				\
+	} while (0);
+
+#define ft_event3(id, callback, param, param2, param3)		\
+	do {							\
+		long __ft_tmp1, __ft_tmp2, __ft_tmp3;		\
+        __asm__ __volatile__(                                   \
+            "1: jmp 2f                                    \n\t" \
+	    " movq $" #id ", %%rdi			  \n\t" \
+	    " call " #callback "                          \n\t" \
+            __FT_EVENT_TABLE(id,1b,2f)				\
+            "2:                                           \n\t" \
+	    : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2), __FT_TMP3(__ft_tmp3) \
+	    : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3)	\
+	    : __FT_CLOBBER_LIST3);				\
+	} while (0);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7bd3bd3..d38a5a7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -103,6 +103,8 @@ obj-$(CONFIG_UPROBES)			+= uprobes.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 
+obj-$(CONFIG_FEATHER_TRACE)	+= ft_event.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c
new file mode 100644
index 0000000..37cc332
--- /dev/null
+++ b/arch/x86/kernel/ft_event.c
@@ -0,0 +1,118 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+/* the feather trace management functions assume
+ * exclusive access to the event table
+ */
+
+#ifndef CONFIG_DEBUG_RODATA
+
+#define BYTE_JUMP      0xeb
+#define BYTE_JUMP_LEN  0x02
+
+/* for each event, there is an entry in the event table */
+struct trace_event {
+	long 	id;
+	long	count;
+	long	start_addr;
+	long	end_addr;
+};
+
+extern struct trace_event  __start___event_table[];
+extern struct trace_event  __stop___event_table[];
+
+/* Workaround: if no events are defined, then the event_table section does not
+ * exist and the above references cause linker errors. This could probably be
+ * fixed by adjusting the linker script, but it is easier to maintain for us if
+ * we simply create a dummy symbol in the event table section.
+ */
+int __event_table_dummy[0] __attribute__ ((section("__event_table")));
+
+int ft_enable_event(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->id == id && ++te->count == 1) {
+			instr  = (unsigned char*) te->start_addr;
+			/* make sure we don't clobber something wrong */
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr) + 1);
+				*delta = 0;
+			}
+		}
+		if (te->id == id)
+			count++;
+		te++;
+	}
+
+	printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count);
+	return count;
+}
+
+int ft_disable_event(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->id == id && --te->count == 0) {
+			instr  = (unsigned char*) te->start_addr;
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr) + 1);
+				*delta = te->end_addr - te->start_addr -
+					BYTE_JUMP_LEN;
+			}
+		}
+		if (te->id == id)
+			count++;
+		te++;
+	}
+
+	printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count);
+	return count;
+}
+
+int ft_disable_all_events(void)
+{
+	struct trace_event* te = __start___event_table;
+	int count = 0;
+	char* delta;
+	unsigned char* instr;
+
+	while (te < __stop___event_table) {
+		if (te->count) {
+			instr  = (unsigned char*) te->start_addr;
+			if (*instr == BYTE_JUMP) {
+				delta  = (((unsigned char*) te->start_addr)
+					  + 1);
+				*delta = te->end_addr - te->start_addr -
+					BYTE_JUMP_LEN;
+				te->count = 0;
+				count++;
+			}
+		}
+		te++;
+	}
+	return count;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+	struct trace_event* te = __start___event_table;
+
+	while (te < __stop___event_table) {
+		if (te->id == id)
+			return te->count;
+		te++;
+	}
+	return 0;
+}
+
+#endif
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 70ddbad..7456eb2 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -4,6 +4,7 @@ menu "Tracing"
 
 config FEATHER_TRACE
 	bool "Feather-Trace Infrastructure"
+	depends on !RELOCATABLE
 	default y
 	help
 	  Feather-Trace basic tracing infrastructure. Includes device file
-- 
1.8.1.2


From e46b5c3c4264a15b363502bcb980e3587131d826 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 12:00:35 +0200
Subject: [PATCH 004/119] Feather-Trace: add generic ftdev device driver

This patch adds the ftdev device driver, which is used to export
samples collected with Feather-Trace to userspace.
---
 include/litmus/ftdev.h |  58 +++++++
 litmus/Makefile        |   2 +-
 litmus/ftdev.c         | 439 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 498 insertions(+), 1 deletion(-)
 create mode 100644 include/litmus/ftdev.h
 create mode 100644 litmus/ftdev.c

diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h
new file mode 100644
index 0000000..a566b0b
--- /dev/null
+++ b/include/litmus/ftdev.h
@@ -0,0 +1,58 @@
+#ifndef _LITMUS_FTDEV_H_
+#define	_LITMUS_FTDEV_H_
+
+#include <litmus/feather_trace.h>
+#include <litmus/feather_buffer.h>
+#include <linux/mutex.h>
+#include <linux/cdev.h>
+
+#define FTDEV_ENABLE_CMD 	0
+#define FTDEV_DISABLE_CMD 	1
+#define FTDEV_CALIBRATE		0x1410
+
+struct ftdev;
+
+/* return 0 if buffer can be opened, otherwise -$REASON */
+typedef int  (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no);
+/* return 0 on success, otherwise -$REASON */
+typedef int  (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no);
+typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no);
+typedef long (*ftdev_calibrate_t)(struct ftdev* dev, unsigned int buf_no, unsigned long user_arg);
+/* Let devices handle writes from userspace. No synchronization provided. */
+typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from);
+
+struct ftdev_event;
+
+struct ftdev_minor {
+	struct ft_buffer*	buf;
+	unsigned int		readers;
+	struct mutex		lock;
+	/* FIXME: filter for authorized events */
+	struct ftdev_event*	events;
+	struct device*		device;
+	struct ftdev*		ftdev;
+};
+
+struct ftdev {
+	dev_t			major;
+	struct cdev		cdev;
+	struct class*		class;
+	const char*		name;
+	struct ftdev_minor*	minor;
+	unsigned int		minor_cnt;
+	ftdev_alloc_t		alloc;
+	ftdev_free_t		free;
+	ftdev_can_open_t	can_open;
+	ftdev_write_t		write;
+	ftdev_calibrate_t	calibrate;
+};
+
+struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size);
+void free_ft_buffer(struct ft_buffer* buf);
+
+int ftdev_init(	struct ftdev* ftdev, struct module* owner,
+		const int minor_cnt, const char* name);
+void ftdev_exit(struct ftdev* ftdev);
+int register_ftdev(struct ftdev* ftdev);
+
+#endif
diff --git a/litmus/Makefile b/litmus/Makefile
index 4c6130b..bca61e6 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -2,4 +2,4 @@
 # Makefile for LITMUS^RT
 #
 
-obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
+obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
new file mode 100644
index 0000000..13f1d48
--- /dev/null
+++ b/litmus/ftdev.c
@@ -0,0 +1,439 @@
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/vmalloc.h>
+
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+
+struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
+{
+	struct ft_buffer* buf;
+	size_t total = (size + 1) * count;
+	char* mem;
+
+	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+
+	mem = vmalloc(total);
+
+	if (!mem) {
+		kfree(buf);
+		return NULL;
+	}
+
+	if (!init_ft_buffer(buf, count, size,
+			    mem + (count * size),  /* markers at the end */
+			    mem)) {                /* buffer objects     */
+		vfree(mem);
+		kfree(buf);
+		return NULL;
+	}
+	return buf;
+}
+
+void free_ft_buffer(struct ft_buffer* buf)
+{
+	if (buf) {
+		vfree(buf->buffer_mem);
+		kfree(buf);
+	}
+}
+
+struct ftdev_event {
+	int id;
+	struct ftdev_event* next;
+};
+
+static int activate(struct ftdev_event** chain, int id)
+{
+	struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
+	if (ev) {
+		printk(KERN_INFO
+		       "Enabling feather-trace event %d.\n", (int) id);
+		ft_enable_event(id);
+		ev->id = id;
+		ev->next = *chain;
+		*chain    = ev;
+	}
+	return ev ? 0 : -ENOMEM;
+}
+
+static void deactivate(struct ftdev_event** chain, int id)
+{
+	struct ftdev_event **cur = chain;
+	struct ftdev_event *nxt;
+	while (*cur) {
+		if ((*cur)->id == id) {
+			nxt   = (*cur)->next;
+			kfree(*cur);
+			*cur  = nxt;
+			printk(KERN_INFO
+			       "Disabling feather-trace event %d.\n", (int) id);
+			ft_disable_event(id);
+			break;
+		}
+		cur = &(*cur)->next;
+	}
+}
+
+static int ftdev_open(struct inode *in, struct file *filp)
+{
+	struct ftdev* ftdev;
+	struct ftdev_minor* ftdm;
+	unsigned int buf_idx = iminor(in);
+	int err = 0;
+
+	ftdev = container_of(in->i_cdev, struct ftdev, cdev);
+
+	if (buf_idx >= ftdev->minor_cnt) {
+		err = -ENODEV;
+		goto out;
+	}
+	if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
+		goto out;
+
+	ftdm = ftdev->minor + buf_idx;
+	ftdm->ftdev = ftdev;
+	filp->private_data = ftdm;
+
+	if (mutex_lock_interruptible(&ftdm->lock)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	if (!ftdm->readers && ftdev->alloc)
+		err = ftdev->alloc(ftdev, buf_idx);
+	if (0 == err)
+		ftdm->readers++;
+
+	mutex_unlock(&ftdm->lock);
+out:
+	return err;
+}
+
+static int ftdev_release(struct inode *in, struct file *filp)
+{
+	struct ftdev* ftdev;
+	struct ftdev_minor* ftdm;
+	unsigned int buf_idx = iminor(in);
+	int err = 0;
+
+	ftdev = container_of(in->i_cdev, struct ftdev, cdev);
+
+	if (buf_idx >= ftdev->minor_cnt) {
+		err = -ENODEV;
+		goto out;
+	}
+	ftdm = ftdev->minor + buf_idx;
+
+	/* ->release() is never retried and its return value is ignored, so
+	 * bailing out on a pending signal would leak the reader count (and
+	 * with it the buffer and enabled events). Lock unconditionally. */
+	mutex_lock(&ftdm->lock);
+
+	if (ftdm->readers == 1) {
+		while (ftdm->events)
+			deactivate(&ftdm->events, ftdm->events->id);
+
+		/* wait for any pending events to complete */
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ);
+
+		printk(KERN_ALERT "Failed trace writes: %u\n",
+		       ftdm->buf->failed_writes);
+
+		if (ftdev->free)
+			ftdev->free(ftdev, buf_idx);
+	}
+
+	ftdm->readers--;
+	mutex_unlock(&ftdm->lock);
+out:
+	return err;
+}
+
+/* based on ft_buffer_read
+ * @returns < 0 : page fault (-EFAULT; copy_to_user() itself yields a count)
+ *          = 0 : no data available
+ *          = 1 : one slot copied
+ */
+static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
+{
+	unsigned int idx;
+	int err = 0;
+	if (buf->free_count != buf->slot_count) {
+		/* data available */
+		idx = buf->read_idx % buf->slot_count;
+		if (buf->slots[idx] == SLOT_READY) {
+			if (copy_to_user(dest, ((char*) buf->buffer_mem) +
+					 idx * buf->slot_size,
+					 buf->slot_size)) {
+				err = -EFAULT;
+			} else {
+				buf->slots[idx] = SLOT_FREE;
+				buf->read_idx++;
+				fetch_and_inc(&buf->free_count);
+				err = 1;
+			}
+		}
+	}
+	return err;
+}
+
+static ssize_t ftdev_read(struct file *filp,
+			  char __user *to, size_t len, loff_t *f_pos)
+{
+	/* 	we ignore f_pos, this is strictly sequential */
+
+	ssize_t err = 0;
+	size_t chunk;
+	int copied;
+	struct ftdev_minor* ftdm = filp->private_data;
+
+	if (mutex_lock_interruptible(&ftdm->lock)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+
+	chunk = ftdm->buf->slot_size;
+	while (len >= chunk) {
+		copied = ft_buffer_copy_to_user(ftdm->buf, to);
+		if (copied == 1) {
+			len    -= chunk;
+			to     += chunk;
+			err    += chunk;
+	        } else if (err == 0 && copied == 0 && ftdm->events) {
+			/* Only wait if there are any events enabled and only
+			 * if we haven't copied some data yet. We cannot wait
+			 * here with copied data because that data would get
+			 * lost if the task is interrupted (e.g., killed).
+			 */
+			mutex_unlock(&ftdm->lock);
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			schedule_timeout(50);
+
+			if (signal_pending(current)) {
+				if (err == 0)
+					/* nothing read yet, signal problem */
+					err = -ERESTARTSYS;
+				goto out;
+			}
+			if (mutex_lock_interruptible(&ftdm->lock)) {
+				err = -ERESTARTSYS;
+				goto out;
+			}
+		} else if (copied < 0) {
+			/* page fault */
+			err = copied;
+			break;
+		} else
+			/* nothing left to get, return to user space */
+			break;
+	}
+	mutex_unlock(&ftdm->lock);
+out:
+	return err;
+}
+
+static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	long err = -ENOIOCTLCMD;
+	struct ftdev_minor* ftdm = filp->private_data;
+
+	if (mutex_lock_interruptible(&ftdm->lock)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* FIXME: check id against list of acceptable events */
+
+	switch (cmd) {
+	case  FTDEV_ENABLE_CMD:
+		if (activate(&ftdm->events, arg))
+			err = -ENOMEM;
+		else
+			err = 0;
+		break;
+
+	case FTDEV_DISABLE_CMD:
+		deactivate(&ftdm->events, arg);
+		err = 0;
+		break;
+
+	case FTDEV_CALIBRATE:
+		if (ftdm->ftdev->calibrate) {
+			err = ftdm->ftdev->calibrate(ftdm->ftdev, iminor(filp->f_dentry->d_inode), arg);
+		}
+		break;
+
+	default:
+		printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg);
+	};
+
+	mutex_unlock(&ftdm->lock);
+out:
+	return err;
+}
+
+static ssize_t ftdev_write(struct file *filp, const char __user *from,
+			   size_t len, loff_t *f_pos)
+{
+	struct ftdev_minor* ftdm = filp->private_data;
+	ssize_t err = -EINVAL;
+	struct ftdev* ftdev = ftdm->ftdev;
+
+	/* dispatch write to buffer-specific code, if available */
+	if (ftdev->write)
+		err = ftdev->write(ftdm->buf, len, from);
+
+	return err;
+}
+
+struct file_operations ftdev_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ftdev_open,
+	.release = ftdev_release,
+	.write   = ftdev_write,
+	.read    = ftdev_read,
+	.unlocked_ioctl = ftdev_ioctl,
+};
+
+int ftdev_init(	struct ftdev* ftdev, struct module* owner,
+		const int minor_cnt, const char* name)
+{
+	int i, err;
+
+	BUG_ON(minor_cnt < 1);
+
+	cdev_init(&ftdev->cdev, &ftdev_fops);
+	ftdev->name = name;
+	ftdev->minor_cnt = minor_cnt;
+	ftdev->cdev.owner = owner;
+	ftdev->cdev.ops = &ftdev_fops;
+	ftdev->alloc    = NULL;
+	ftdev->free     = NULL;
+	ftdev->can_open = NULL;
+	ftdev->write	= NULL;
+	ftdev->calibrate = NULL;
+
+	ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
+			GFP_KERNEL);
+	if (!ftdev->minor) {
+		printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n",
+			ftdev->name);
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	for (i = 0; i < ftdev->minor_cnt; i++) {
+		mutex_init(&ftdev->minor[i].lock);
+		ftdev->minor[i].readers = 0;
+		ftdev->minor[i].buf     = NULL;
+		ftdev->minor[i].events  = NULL;
+	}
+
+	ftdev->class = class_create(owner, ftdev->name);
+	if (IS_ERR(ftdev->class)) {
+		err = PTR_ERR(ftdev->class);
+		printk(KERN_WARNING "ftdev(%s): "
+			"Could not create device class.\n", ftdev->name);
+		goto err_dealloc;
+	}
+
+	return 0;
+
+err_dealloc:
+	kfree(ftdev->minor);
+err_out:
+	return err;
+}
+
+/*
+ * Destroy minors [0, up_to); 0 or out-of-range (e.g. -1) means all minors.
+ */
+static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to)
+{
+	dev_t minor_cntr;
+
+	if (up_to < 1 || up_to > ftdev->minor_cnt)
+		up_to = ftdev->minor_cnt;
+
+	for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr)
+		device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr));
+}
+
+void ftdev_exit(struct ftdev* ftdev)
+{
+	printk("ftdev(%s): Exiting\n", ftdev->name);
+	ftdev_device_destroy(ftdev, -1);
+	cdev_del(&ftdev->cdev);
+	unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
+	class_destroy(ftdev->class);
+	kfree(ftdev->minor);
+}
+
+int register_ftdev(struct ftdev* ftdev)
+{
+	struct device **device;
+	dev_t trace_dev_tmp, minor_cntr;
+	int err;
+
+	err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt,
+			ftdev->name);
+	if (err) {
+		printk(KERN_WARNING "ftdev(%s): "
+		       "Could not allocate char. device region (%d minors)\n",
+		       ftdev->name, ftdev->minor_cnt);
+		goto err_out;
+	}
+
+	ftdev->major = MAJOR(trace_dev_tmp);
+
+	err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt);
+	if (err) {
+		printk(KERN_WARNING "ftdev(%s): "
+		       "Could not add cdev for major %u with %u minor(s).\n",
+		       ftdev->name, ftdev->major, ftdev->minor_cnt);
+		goto err_unregister;
+	}
+
+	/* create the minor device(s) */
+	for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr)
+	{
+		trace_dev_tmp = MKDEV(ftdev->major, minor_cntr);
+		device = &ftdev->minor[minor_cntr].device;
+
+		*device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL,
+				"litmus/%s%d", ftdev->name, minor_cntr);
+		if (IS_ERR(*device)) {
+			err = PTR_ERR(*device);
+			printk(KERN_WARNING "ftdev(%s): "
+				"Could not create device major/minor number "
+				"%u/%u\n", ftdev->name, ftdev->major,
+				minor_cntr);
+			printk(KERN_WARNING "ftdev(%s): "
+				"will attempt deletion of allocated devices.\n",
+				ftdev->name);
+			goto err_minors;
+		}
+	}
+
+	return 0;
+
+err_minors:
+	ftdev_device_destroy(ftdev, minor_cntr);
+	cdev_del(&ftdev->cdev);
+err_unregister:
+	unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
+err_out:
+	return err;
+}
-- 
1.8.1.2


From 7b317d9036b3fc0280327586ef52dfa6cb6dd250 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 14:40:52 +0200
Subject: [PATCH 005/119] Feather-Trace: add LITMUS^RT overhead tracing
 infrastructure

This patch adds the main infrastructure for tracing overheads in
LITMUS^RT.  It does not yet introduce any tracepoints into the kernel.
---
 include/litmus/trace.h | 142 +++++++++++++
 litmus/Kconfig         |  25 +++
 litmus/Makefile        |   1 +
 litmus/trace.c         | 562 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 730 insertions(+)
 create mode 100644 include/litmus/trace.h
 create mode 100644 litmus/trace.c

diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 0000000..6017872
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,142 @@
+#ifndef _SYS_TRACE_H_
+#define	_SYS_TRACE_H_
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+
+#include <litmus/feather_trace.h>
+#include <litmus/feather_buffer.h>
+
+
+/*********************** TIMESTAMPS ************************/
+
+enum task_type_marker {
+	TSK_BE,
+	TSK_RT,
+	TSK_UNKNOWN
+};
+
+struct timestamp {
+	uint64_t		timestamp:48;
+	uint64_t		pid:16;
+	uint32_t		seq_no;
+	uint8_t			cpu;
+	uint8_t			event;
+	uint8_t			task_type:2;
+	uint8_t			irq_flag:1;
+	uint8_t			irq_count:5;
+};
+
+/* tracing callbacks */
+feather_callback void msg_sent(unsigned long event, unsigned long to);
+feather_callback void msg_received(unsigned long event);
+
+#define MSG_TIMESTAMP_SENT(id, to) \
+	ft_event1(id, msg_sent, (unsigned long) to);
+
+#define MSG_TIMESTAMP_RECEIVED(id) \
+	ft_event0(id, msg_received);
+
+feather_callback void save_cpu_timestamp(unsigned long event);
+feather_callback void save_cpu_timestamp_time(unsigned long event, unsigned long time_ptr);
+feather_callback void save_cpu_timestamp_irq(unsigned long event, unsigned long irq_count_ptr);
+feather_callback void save_cpu_timestamp_task(unsigned long event, unsigned long t_ptr);
+feather_callback void save_cpu_timestamp_def(unsigned long event, unsigned long type);
+feather_callback void save_cpu_task_latency(unsigned long event, unsigned long when_ptr);
+
+#define CPU_TIMESTAMP_TIME(id, time_ptr) \
+	ft_event1(id, save_cpu_timestamp_time, (unsigned long) time_ptr)
+
+#define CPU_TIMESTAMP_IRQ(id, irq_count_ptr) \
+	ft_event1(id, save_cpu_timestamp_irq, (unsigned long) irq_count_ptr)
+
+#define CPU_TIMESTAMP(id) ft_event0(id, save_cpu_timestamp)
+
+#define CPU_DTIMESTAMP(id, def)  ft_event1(id, save_cpu_timestamp_def, (unsigned long) def)
+
+#define CPU_TIMESTAMP_CUR(id) CPU_DTIMESTAMP(id, is_realtime(current) ? TSK_RT : TSK_BE)
+
+#define CPU_TTIMESTAMP(id, task) \
+	ft_event1(id, save_cpu_timestamp_task, (unsigned long) task)
+
+#define CPU_LTIMESTAMP(id, task) \
+	ft_event1(id, save_cpu_task_latency, (unsigned long) task)
+
+#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
+
+#define MSG_TIMESTAMP_SENT(id, to)
+#define MSG_TIMESTAMP_RECEIVED(id)
+
+#define CPU_TIMESTAMP_TIME(id, time_ptr)
+#define CPU_TIMESTAMP_IRQ(id, irq_count_ptr)
+#define CPU_TIMESTAMP(id)
+#define CPU_DTIMESTAMP(id, def)
+#define CPU_TIMESTAMP_CUR(id)
+#define CPU_TTIMESTAMP(id, task)
+#define CPU_LTIMESTAMP(id, task)
+
+#endif
+
+
+/* Convention for timestamps
+ * =========================
+ *
+ * In order to process the trace files with a common tool, we use the following
+ * convention to measure execution times: The end time id of a code segment is
+ * always the next number after the start time event id.
+ */
+
+#define __TS_SYSCALL_IN_START(p)	CPU_TIMESTAMP_TIME(10, p)
+#define __TS_SYSCALL_IN_END(p)		CPU_TIMESTAMP_IRQ(11, p)
+
+#define TS_SYSCALL_OUT_START		CPU_TIMESTAMP_CUR(20)
+#define TS_SYSCALL_OUT_END		CPU_TIMESTAMP_CUR(21)
+
+#define TS_LOCK_START			CPU_TIMESTAMP_CUR(30)
+#define TS_LOCK_END			CPU_TIMESTAMP_CUR(31)
+
+#define TS_LOCK_SUSPEND			CPU_TIMESTAMP_CUR(38)
+#define TS_LOCK_RESUME			CPU_TIMESTAMP_CUR(39)
+
+#define TS_UNLOCK_START			CPU_TIMESTAMP_CUR(40)
+#define TS_UNLOCK_END			CPU_TIMESTAMP_CUR(41)
+
+#define TS_SCHED_START			CPU_DTIMESTAMP(100, TSK_UNKNOWN) /* we only
+								      * care
+								      * about
+								      * next */
+#define TS_SCHED_END(t)			CPU_TTIMESTAMP(101, t)
+#define TS_SCHED2_START(t) 		CPU_TTIMESTAMP(102, t)
+#define TS_SCHED2_END(t)       		CPU_TTIMESTAMP(103, t)
+
+#define TS_CXS_START(t)			CPU_TTIMESTAMP(104, t)
+#define TS_CXS_END(t)			CPU_TTIMESTAMP(105, t)
+
+#define TS_RELEASE_START		CPU_DTIMESTAMP(106, TSK_RT)
+#define TS_RELEASE_END			CPU_DTIMESTAMP(107, TSK_RT)
+
+#define TS_TICK_START(t)		CPU_TTIMESTAMP(110, t)
+#define TS_TICK_END(t) 			CPU_TTIMESTAMP(111, t)
+
+#define TS_QUANTUM_BOUNDARY_START	CPU_TIMESTAMP_CUR(112)
+#define TS_QUANTUM_BOUNDARY_END		CPU_TIMESTAMP_CUR(113)
+
+
+#define TS_PLUGIN_SCHED_START		/* TIMESTAMP(120) */  /* currently unused */
+#define TS_PLUGIN_SCHED_END		/* TIMESTAMP(121) */
+
+#define TS_PLUGIN_TICK_START		/* TIMESTAMP(130) */
+#define TS_PLUGIN_TICK_END		/* TIMESTAMP(131) */
+
+#define TS_ENTER_NP_START		CPU_TIMESTAMP(140)
+#define TS_ENTER_NP_END			CPU_TIMESTAMP(141)
+
+#define TS_EXIT_NP_START		CPU_TIMESTAMP(150)
+#define TS_EXIT_NP_END			CPU_TIMESTAMP(151)
+
+#define TS_SEND_RESCHED_START(c)	MSG_TIMESTAMP_SENT(190, c)
+#define TS_SEND_RESCHED_END		MSG_TIMESTAMP_RECEIVED(191)
+
+#define TS_RELEASE_LATENCY(when)	CPU_LTIMESTAMP(208, &(when))
+
+#endif /* !_SYS_TRACE_H_ */
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 7456eb2..0c7e06b 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -24,6 +24,31 @@ config FEATHER_TRACE
 	  you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
 	  actually enable any events.
 
+config SCHED_OVERHEAD_TRACE
+	bool "Record timestamps for overhead measurements"
+	depends on FEATHER_TRACE
+	default y
+	help
+	  Export event stream for overhead tracing.
+	  Say Yes for overhead tracing.
+
+config SCHED_OVERHEAD_TRACE_SHIFT
+       int "Buffer size for Feather-Trace overhead data"
+       depends on SCHED_OVERHEAD_TRACE
+       range 15 32
+       default 22
+       help
+
+         Select the buffer size for the Feather-Trace overhead tracing
+         infrastructure (/dev/litmus/ft_trace0 & ftcat) as a power of two.  The
+         larger the buffer, the less likely the chance of buffer overflows if
+         the ftcat process is starved by real-time activity. In machines with
+         large memories, large buffer sizes are recommended.
+
+	 Examples: 16 =>   2 MB
+		   24 => 512 MB
+		   26 =>   2 GB
+
 endmenu
 
 endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index bca61e6..99f90c3 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
+obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 0000000..a378623
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,562 @@
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include <litmus/ftdev.h>
+#include <litmus/trace.h>
+
+/* dummy definition of is_realtime() */
+#define is_realtime(t) (0)
+
+/******************************************************************************/
+/*                          Allocation                                        */
+/******************************************************************************/
+
+static struct ftdev cpu_overhead_dev;
+static struct ftdev msg_overhead_dev;
+
+#define cpu_trace_ts_buf(cpu) cpu_overhead_dev.minor[(cpu)].buf
+#define msg_trace_ts_buf(cpu) msg_overhead_dev.minor[(cpu)].buf
+/* Per-CPU interrupt counters; both are incremented by ft_irq_fired(). */
+DEFINE_PER_CPU(atomic_t, irq_fired_count);
+DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, cpu_irq_fired_count);
+
+static DEFINE_PER_CPU(unsigned int, cpu_ts_seq_no);
+static DEFINE_PER_CPU(unsigned int, msg_ts_seq_no);
+
+static int64_t cycle_offset[NR_CPUS][NR_CPUS];
+
+void ft_irq_fired(void)
+{
+	/* Only called with preemptions disabled.  */
+	atomic_inc(&__get_cpu_var(irq_fired_count));
+	atomic_inc(&__get_cpu_var(cpu_irq_fired_count));
+}
+
+static inline void clear_irq_fired(void)
+{
+	atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
+}
+
+static inline unsigned int get_and_clear_irq_fired(void)
+{
+	/* This is potentially not atomic  since we might migrate if
+	 * preemptions are not disabled. As a tradeoff between
+	 * accuracy and tracing overheads, this seems acceptable.
+	 * If it proves to be a problem, then one could add a callback
+	 * from the migration code to invalidate irq_fired_count.
+	 */
+	return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
+}
+
+static inline unsigned int get_and_clear_irq_fired_for_cpu(int cpu)
+{
+	return atomic_xchg(&per_cpu(irq_fired_count, cpu), 0);
+}
+
+static inline void cpu_clear_irq_fired(void)
+{
+	atomic_set(&__raw_get_cpu_var(cpu_irq_fired_count), 0);
+}
+
+static inline unsigned int cpu_get_and_clear_irq_fired(void)
+{
+	return atomic_xchg(&__raw_get_cpu_var(cpu_irq_fired_count), 0);
+}
+
+static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
+{
+	/* Store how many interrupts occurred. */
+	ts->irq_count = irq_count;
+	/* Extra flag because ts->irq_count overflows quickly. */
+	ts->irq_flag  = irq_count > 0;
+}
+
+#define NO_IRQ_COUNT 0
+#define LOCAL_IRQ_COUNT 1
+#define REMOTE_IRQ_COUNT 2
+
+#define DO_NOT_RECORD_TIMESTAMP 0
+#define RECORD_LOCAL_TIMESTAMP 1
+#define RECORD_OFFSET_TIMESTAMP 2
+
+static inline void __write_record(
+	uint8_t event,
+	uint8_t type,
+	uint16_t pid_fragment,
+	unsigned int irq_count,
+	int record_irq,
+	int hide_irq,
+	uint64_t timestamp,
+	int record_timestamp,
+
+	int only_single_writer,
+	int is_cpu_timestamp,
+	int local_cpu,
+	uint8_t other_cpu)
+{
+	unsigned long flags;
+	unsigned int seq_no;
+	struct timestamp *ts;
+	int cpu;
+	struct ft_buffer* buf;
+
+	/* Avoid preemptions while recording the timestamp. This reduces the
+	 * number of "out of order" timestamps in the stream and makes
+	 * post-processing easier. */
+
+	local_irq_save(flags);
+
+	if (local_cpu)
+		cpu = smp_processor_id();
+	else
+		cpu = other_cpu;
+
+	/* resolved during function inlining */
+	if (is_cpu_timestamp) {
+		seq_no = __get_cpu_var(cpu_ts_seq_no)++;
+		buf = cpu_trace_ts_buf(cpu);
+	} else {
+		seq_no = fetch_and_inc((int *) &per_cpu(msg_ts_seq_no, cpu));
+		buf = msg_trace_ts_buf(cpu);
+	}
+
+	/* If buf is non-NULL here, then the buffer cannot be deallocated until
+	 * we turn interrupts on again. This is because free_timestamp_buffer()
+	 * indirectly causes TLB invalidations due to modifications of the
+	 * kernel address space, namely via vfree() in free_ft_buffer(), which
+	 * cannot be processed until we turn on interrupts again.
+	 */
+
+	if (buf &&
+	    (only_single_writer /* resolved during function inlining */
+	     ? ft_buffer_start_single_write(buf, (void**)  &ts)
+	     : ft_buffer_start_write(buf, (void**) &ts))) {
+		ts->event     = event;
+		ts->seq_no    = seq_no;
+
+		ts->task_type = type;
+		ts->pid	      = pid_fragment;
+
+		ts->cpu       = cpu;
+
+		if (record_irq) {
+			if (local_cpu)
+				irq_count = cpu_get_and_clear_irq_fired();
+			else
+				irq_count = get_and_clear_irq_fired_for_cpu(cpu);
+		}
+
+		save_irq_flags(ts, irq_count - hide_irq);
+
+		if (record_timestamp)
+			timestamp = ft_timestamp();
+		if (record_timestamp == RECORD_OFFSET_TIMESTAMP)
+			timestamp += cycle_offset[smp_processor_id()][cpu];
+
+		ts->timestamp = timestamp;
+		ft_buffer_finish_write(buf, ts);
+	}
+
+	local_irq_restore(flags);
+}
+
+
+static inline void write_cpu_timestamp(
+	uint8_t event,
+	uint8_t type,
+	uint16_t pid_fragment,
+	unsigned int irq_count,
+	int record_irq,
+	int hide_irq,
+	uint64_t timestamp,
+	int record_timestamp)
+{
+	__write_record(event, type,
+		       pid_fragment,
+		       irq_count, record_irq, hide_irq,
+		       timestamp, record_timestamp,
+		       1 /* only_single_writer */,
+		       1 /* is_cpu_timestamp */,
+		       1 /* local_cpu */,
+		       0xff /* other_cpu */);
+}
+
+static inline void save_msg_timestamp(
+	uint8_t event,
+	int hide_irq)
+{
+	struct task_struct *t  = current;
+	__write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
+		       t->pid,
+		       0, LOCAL_IRQ_COUNT, hide_irq,
+		       0, RECORD_LOCAL_TIMESTAMP,
+		       0 /* only_single_writer */,
+		       0 /* is_cpu_timestamp */,
+		       1 /* local_cpu */,
+		       0xff /* other_cpu */);
+}
+
+static inline void save_remote_msg_timestamp(
+	uint8_t event,
+	uint8_t remote_cpu)
+{
+	struct task_struct *t  = current;
+	__write_record(event, is_realtime(t) ? TSK_RT : TSK_BE,
+		       t->pid,
+		       0, REMOTE_IRQ_COUNT, 0,
+		       0, RECORD_OFFSET_TIMESTAMP,
+		       0 /* only_single_writer */,
+		       0 /* is_cpu_timestamp */,
+		       0 /* local_cpu */,
+		       remote_cpu);
+}
+
+feather_callback void save_cpu_timestamp_def(unsigned long event,
+					     unsigned long type)
+{
+	write_cpu_timestamp(event, type,
+			    current->pid,
+			    0, LOCAL_IRQ_COUNT, 0,
+			    0, RECORD_LOCAL_TIMESTAMP);
+}
+
+feather_callback void save_cpu_timestamp_task(unsigned long event,
+					      unsigned long t_ptr)
+{
+	struct task_struct *t = (struct task_struct *) t_ptr;
+	int rt = is_realtime(t);
+
+	write_cpu_timestamp(event, rt ? TSK_RT : TSK_BE,
+			    t->pid,
+			    0, LOCAL_IRQ_COUNT, 0,
+			    0, RECORD_LOCAL_TIMESTAMP);
+}
+
+feather_callback void save_cpu_task_latency(unsigned long event,
+					    unsigned long when_ptr)
+{
+	lt_t now = litmus_clock();
+	lt_t *when = (lt_t*) when_ptr;
+
+	write_cpu_timestamp(event, TSK_RT,
+			    0,
+			    0, LOCAL_IRQ_COUNT, 0,
+			    now - *when, DO_NOT_RECORD_TIMESTAMP);
+}
+
+/* fake timestamp to user-reported time */
+feather_callback void save_cpu_timestamp_time(unsigned long event,
+			 unsigned long ptr)
+{
+	uint64_t* time = (uint64_t*) ptr;
+
+	write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
+			    current->pid,
+			    0, LOCAL_IRQ_COUNT, 0,
+			    *time, DO_NOT_RECORD_TIMESTAMP);
+}
+
+/* Record user-reported IRQ count */
+feather_callback void save_cpu_timestamp_irq(unsigned long event,
+			unsigned long irq_counter_ptr)
+{
+	uint64_t* irqs = (uint64_t*) irq_counter_ptr;
+
+	write_cpu_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
+			    current->pid,
+			    *irqs, NO_IRQ_COUNT, 0,
+			    0, RECORD_LOCAL_TIMESTAMP);
+}
+
+
+feather_callback void msg_sent(unsigned long event, unsigned long to)
+{
+	save_remote_msg_timestamp(event, to);
+}
+
+/* Suppresses one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
+ * called from within an interrupt that is expected. */
+feather_callback void msg_received(unsigned long event)
+{
+	save_msg_timestamp(event, 1);
+}
+
+static void __add_timestamp_user(struct timestamp *pre_recorded)
+{
+	unsigned long flags;
+	unsigned int seq_no;
+	struct timestamp *ts;
+	struct ft_buffer* buf;
+	int cpu;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	buf = cpu_trace_ts_buf(cpu);
+
+	seq_no = __get_cpu_var(cpu_ts_seq_no)++;
+	if (buf && ft_buffer_start_single_write(buf, (void**)  &ts)) {
+		*ts = *pre_recorded;
+		ts->seq_no = seq_no;
+		ts->cpu	   = raw_smp_processor_id();
+	        save_irq_flags(ts, get_and_clear_irq_fired());
+		ft_buffer_finish_write(buf, ts);
+	}
+
+	local_irq_restore(flags);
+}
+
+/******************************************************************************/
+/*                        DEVICE FILE DRIVER                                  */
+/******************************************************************************/
+
+struct calibrate_info {
+	atomic_t ready;
+
+	uint64_t cycle_count;
+};
+
+static void calibrate_helper(void *_info)
+{
+	struct calibrate_info *info = _info;
+	/* check in with master */
+	atomic_inc(&info->ready);
+
+	/* wait for master to signal start */
+	while (atomic_read(&info->ready))
+		cpu_relax();
+
+	/* report time stamp */
+	info->cycle_count = ft_timestamp();
+
+	/* tell master that we are done */
+	atomic_inc(&info->ready);
+}
+
+
+static int64_t calibrate_cpu(int cpu)
+{
+	uint64_t cycles;
+	struct calibrate_info info;
+	unsigned long flags;
+	int64_t  delta;
+
+	atomic_set(&info.ready, 0);
+	info.cycle_count = 0;
+	smp_wmb();
+
+	smp_call_function_single(cpu, calibrate_helper, &info, 0);
+
+	/* wait for helper to become active */
+	while (!atomic_read(&info.ready))
+		cpu_relax();
+
+	/* avoid interrupt interference */
+	local_irq_save(flags);
+
+	/* take measurement */
+	atomic_set(&info.ready, 0);
+	smp_wmb();
+	cycles = ft_timestamp();
+
+	/* wait for helper reading */
+	while (!atomic_read(&info.ready))
+		cpu_relax();
+
+	/* positive offset: the other guy is ahead of us */
+	delta  = (int64_t) info.cycle_count;
+	delta -= (int64_t) cycles;
+
+	local_irq_restore(flags);
+
+	return delta;
+}
+
+#define NUM_SAMPLES 10
+
+static long calibrate_tsc_offsets(struct ftdev* ftdev, unsigned int idx,
+				  unsigned long uarg)
+{
+	int cpu, self, i;
+	int64_t delta, sample;
+
+	preempt_disable();
+	self = smp_processor_id();
+
+	if (uarg)
+		printk(KERN_INFO "Feather-Trace: determining TSC offsets for P%d\n", self);
+
+	for_each_online_cpu(cpu)
+		if (cpu != self) {
+			delta = calibrate_cpu(cpu);
+			for (i = 1; i < NUM_SAMPLES; i++) {
+			        sample = calibrate_cpu(cpu);
+				delta = sample < delta ? sample : delta;
+			}
+
+			cycle_offset[self][cpu] = delta;
+
+			if (uarg)
+				printk(KERN_INFO "Feather-Trace: TSC offset for P%d->P%d is %lld cycles.\n",
+				       self, cpu, cycle_offset[self][cpu]);
+		}
+
+	preempt_enable();
+	return 0;
+}
+
+#define NO_TIMESTAMPS (2 << CONFIG_SCHED_OVERHEAD_TRACE_SHIFT)
+
+static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+	unsigned int count = NO_TIMESTAMPS;
+
+	/* An overhead-tracing timestamp should be exactly 16 bytes long. */
+	BUILD_BUG_ON(sizeof(struct timestamp) != 16);
+
+	while (count && !ftdev->minor[idx].buf) {
+		printk("time stamp buffer: trying to allocate %u time stamps for minor=%u.\n", count, idx);
+		ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
+		count /= 2;
+	}
+	return ftdev->minor[idx].buf ? 0 : -ENOMEM;
+}
+
+static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+	struct ft_buffer* tmp = ftdev->minor[idx].buf;
+	ftdev->minor[idx].buf = NULL;
+	/* Order the NULL store before the free; free via the saved pointer. */
+	smp_wmb();
+	free_ft_buffer(tmp);
+}
+
+static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
+					 const char __user *from)
+{
+	ssize_t consumed = 0;
+	struct timestamp ts;
+
+	/* don't give us partial timestamps */
+	if (len % sizeof(ts))
+		return -EINVAL;
+
+	while (len >= sizeof(ts)) {
+		if (copy_from_user(&ts, from, sizeof(ts))) {
+			consumed = -EFAULT;
+			goto out;
+		}
+		len  -= sizeof(ts);
+		from += sizeof(ts);
+		consumed += sizeof(ts);
+
+		/* Note: this always adds to the buffer of the CPU-local
+		 * device, not necessarily to the device that the system call
+		 * was invoked on. This is admittedly a bit ugly, but requiring
+		 * tasks to only write to the appropriate device would make
+		 * tracing from userspace under global and clustered scheduling
+		 * exceedingly difficult. Writing to remote buffers would
+		 * require to not use ft_buffer_start_single_write(), which we
+		 * want to do to reduce the number of atomic ops in the common
+		 * case (which is the recording of CPU-local scheduling
+		 * overheads).
+		 */
+		__add_timestamp_user(&ts);
+	}
+
+out:
+	return consumed;
+}
+
+static int __init init_cpu_ft_overhead_trace(void)
+{
+	int err, cpu;
+
+	printk("Initializing Feather-Trace per-cpu overhead tracing device.\n");
+	err = ftdev_init(&cpu_overhead_dev, THIS_MODULE,
+			 num_online_cpus(), "ft_cpu_trace");
+	if (err)
+		goto err_out;
+
+	cpu_overhead_dev.alloc = alloc_timestamp_buffer;
+	cpu_overhead_dev.free  = free_timestamp_buffer;
+	cpu_overhead_dev.write = write_timestamp_from_user;
+
+	err = register_ftdev(&cpu_overhead_dev);
+	if (err)
+		goto err_dealloc;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		per_cpu(cpu_ts_seq_no, cpu) = 0;
+	}
+
+	return 0;
+
+err_dealloc:
+	ftdev_exit(&cpu_overhead_dev);
+err_out:
+	printk(KERN_WARNING "Could not register per-cpu ft_trace device.\n");
+	return err;
+}
+
+static int __init init_msg_ft_overhead_trace(void)
+{
+	int err, cpu;
+
+	printk("Initializing Feather-Trace per-cpu message overhead tracing device.\n");
+	err = ftdev_init(&msg_overhead_dev, THIS_MODULE,
+			 num_online_cpus(), "ft_msg_trace");
+	if (err)
+		goto err_out;
+
+	msg_overhead_dev.alloc = alloc_timestamp_buffer;
+	msg_overhead_dev.free  = free_timestamp_buffer;
+	msg_overhead_dev.calibrate = calibrate_tsc_offsets;
+
+	err = register_ftdev(&msg_overhead_dev);
+	if (err)
+		goto err_dealloc;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		per_cpu(msg_ts_seq_no, cpu) = 0;
+	}
+
+	return 0;
+
+err_dealloc:
+	ftdev_exit(&msg_overhead_dev);
+err_out:
+	printk(KERN_WARNING "Could not register message ft_trace device.\n");
+	return err;
+}
+
+/* Module init: zero the cycle offset table, then set up both trace devices. */
+static int __init init_ft_overhead_trace(void)
+{
+	int err, i, j;
+
+	for (i = 0; i < NR_CPUS; i++)
+		for (j = 0; j < NR_CPUS; j++)
+			cycle_offset[i][j] = 0;
+
+	err = init_cpu_ft_overhead_trace();
+	if (err)
+		return err;
+
+	err = init_msg_ft_overhead_trace();
+	if (err) {
+		ftdev_exit(&cpu_overhead_dev);
+		return err;
+	}
+	return 0;
+}
+
+static void __exit exit_ft_overhead_trace(void)
+{
+	ftdev_exit(&cpu_overhead_dev);
+	ftdev_exit(&msg_overhead_dev);
+}
+
+module_init(init_ft_overhead_trace);
+module_exit(exit_ft_overhead_trace);
-- 
1.8.1.2


From 530f3c252277104613501e10cbfa63c09e4ca9c0 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 14:51:57 +0200
Subject: [PATCH 006/119] Integrate ft_irq_fired() with Linux

This patch hooks up Feather-Trace's ft_irq_fired() handler with
Linux's interrupt handling infrastructure.
---
 include/linux/hardirq.h    |  4 ++++
 include/litmus/trace_irq.h | 14 ++++++++++++++
 kernel/sched/core.c        |  5 +++++
 kernel/softirq.c           |  3 +++
 4 files changed, 26 insertions(+)
 create mode 100644 include/litmus/trace_irq.h

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index c1d6555..7ad5fd8 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -7,6 +7,8 @@
 #include <linux/vtime.h>
 #include <asm/hardirq.h>
 
+#include <litmus/trace_irq.h>
+
 /*
  * We put the hardirq and softirq counter into the preemption
  * counter. The bitmask has the following meaning:
@@ -154,6 +156,7 @@ extern void rcu_nmi_exit(void);
 		account_irq_enter_time(current);	\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
+		ft_irq_fired();				\
 	} while (0)
 
 /*
@@ -184,6 +187,7 @@ extern void irq_exit(void);
 		add_preempt_count(NMI_OFFSET + HARDIRQ_OFFSET);	\
 		rcu_nmi_enter();				\
 		trace_hardirq_enter();				\
+		ft_irq_fired();					\
 	} while (0)
 
 #define nmi_exit()						\
diff --git a/include/litmus/trace_irq.h b/include/litmus/trace_irq.h
new file mode 100644
index 0000000..0d0c042
--- /dev/null
+++ b/include/litmus/trace_irq.h
@@ -0,0 +1,14 @@
+#ifndef _LITMUS_TRACE_IRQ_H_
+#define	_LITMUS_TRACE_IRQ_H_
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+void ft_irq_fired(void);
+
+#else
+
+#define ft_irq_fired() /* nothing to do */
+
+#endif
+
+#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b4308d7..7eefaab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1408,7 +1408,12 @@ void scheduler_ipi(void)
 	if (llist_empty(&this_rq()->wake_list)
 			&& !tick_nohz_full_cpu(smp_processor_id())
 			&& !got_nohz_idle_kick())
+	{
+		/* If we don't call irq_enter(), we need to trigger the IRQ
+		 * tracing manually. */
+		ft_irq_fired();
 		return;
+	}
 
 	/*
 	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 787b3a0..fe8890b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -226,6 +226,9 @@ asmlinkage void __do_softirq(void)
 	 */
 	current->flags &= ~PF_MEMALLOC;
 
+	/* Mark Feather-Trace samples as "disturbed". */
+	ft_irq_fired();
+
 	pending = local_softirq_pending();
 	account_irq_enter_time(current);
 
-- 
1.8.1.2


From 93919aeeb7b30971603a3d67b3d12091a45e9fbf Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 26 Jun 2013 18:45:30 +0200
Subject: [PATCH 007/119] Add SCHED, SCHED2, TICK, and CXS overhead tracepoints

This patch integrates the overhead tracepoints into the Linux
scheduler that are compatible with plain vanilla Linux (i.e., not
specific to LITMUS^RT plugins).  This can be used to measure the
overheads of an otherwise unmodified kernel.
---
 kernel/sched/core.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7eefaab..3a471d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,6 +86,8 @@
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
+#include <litmus/trace.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
@@ -2748,6 +2750,8 @@ void scheduler_tick(void)
 
 	sched_clock_tick();
 
+	TS_TICK_START(current);
+
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
 	update_cpu_load_active(rq);
@@ -2761,6 +2765,8 @@ void scheduler_tick(void)
 	trigger_load_balance(rq, cpu);
 #endif
 	rq_last_tick_reset(rq);
+
+	TS_TICK_END(current);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2972,6 +2978,8 @@ need_resched:
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 
+	TS_SCHED_START;
+
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3024,7 +3032,10 @@ need_resched:
 		rq->curr = next;
 		++*switch_count;
 
+		TS_SCHED_END(next);
+		TS_CXS_START(next);
 		context_switch(rq, prev, next); /* unlocks the rq */
+		TS_CXS_END(current);
 		/*
 		 * The context switch have flipped the stack from under us
 		 * and restored the local variables which were saved when
@@ -3033,12 +3044,19 @@ need_resched:
 		 */
 		cpu = smp_processor_id();
 		rq = cpu_rq(cpu);
-	} else
+	} else {
+		TS_SCHED_END(prev);
 		raw_spin_unlock_irq(&rq->lock);
+	}
+
+	TS_SCHED2_START(prev);
 
 	post_schedule(rq);
 
 	sched_preempt_enable_no_resched();
+
+	TS_SCHED2_END(prev);
+
 	if (need_resched())
 		goto need_resched;
 }
-- 
1.8.1.2


From 5ce8d2c4a1fa8e5a4bca25891f2e892ef55fc89f Mon Sep 17 00:00:00 2001
From: Felipe Cerqueira <felipec@mpi-sws.org>
Date: Mon, 11 Feb 2013 18:10:50 +0100
Subject: [PATCH 008/119] Export x86 cache topology

This patch adds get_shared_cpu_map(), which allows the caller to infer
which CPUs share a cache at a given level.
---
 arch/x86/include/asm/processor.h      |  4 ++++
 arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 22224b3..254dd2b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -173,6 +173,10 @@ void print_cpu_msr(struct cpuinfo_x86 *);
 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
+#ifdef CONFIG_SYSFS
+extern int get_shared_cpu_map(cpumask_var_t mask,
+			       unsigned int cpu, int index);
+#endif
 
 extern void detect_extended_topology(struct cpuinfo_x86 *c);
 extern void detect_ht(struct cpuinfo_x86 *c);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 7c6f7d5..033939b 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -744,6 +744,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))
 
+/* Copy into @mask the CPUs that share cache leaf @index with @cpu. */
+int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
+{
+	int ret = 0;	/* stays 0 unless @index has to be clamped below */
+	struct _cpuid4_info *this_leaf;
+
+	if (index >= num_cache_leaves) {	/* NOTE(review): negative @index is not rejected */
+		index = num_cache_leaves - 1;	/* fall back to the last leaf */
+		ret = index;	/* report which leaf was used instead */
+	}
+
+	this_leaf = CPUID4_INFO_IDX(cpu,index);
+	cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map));
+
+	return ret;
+}
+
 #ifdef CONFIG_SMP
 
 static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
-- 
1.8.1.2


From 74a89132e046e7a35f16f6eab9c6884679d48f27 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 15:26:01 +0200
Subject: [PATCH 009/119] Add object list to inodes

This patch adds a list of arbitrary objects to inodes.

This is used by Linux's locking API to attach lock objects to inodes
(which represent namespaces in Linux's locking API).
---
 fs/inode.c         | 2 ++
 include/linux/fs.h | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3..a80e326 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -370,6 +370,8 @@ void inode_init_once(struct inode *inode)
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
 #endif
+	INIT_LIST_HEAD(&inode->i_obj_list);
+	mutex_init(&inode->i_obj_mutex);
 }
 EXPORT_SYMBOL(inode_init_once);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65c2be2..8a0ac17 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -508,6 +508,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 
 struct posix_acl;
 #define ACL_NOT_CACHED ((void *)(-1))
+struct inode_obj_id_table;
 
 #define IOP_FASTPERM	0x0001
 #define IOP_LOOKUP	0x0002
@@ -606,6 +607,10 @@ struct inode {
 #ifdef CONFIG_IMA
 	atomic_t		i_readcount; /* struct files open RO */
 #endif
+
+	struct list_head	i_obj_list;
+	struct mutex		i_obj_mutex;
+
 	void			*i_private; /* fs or device private pointer */
 };
 
-- 
1.8.1.2


From 7fb0ac2758b6e277de7bd753fdbe8596048d156c Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 16:29:07 +0200
Subject: [PATCH 010/119] Add TRACE() debug tracing support

This patch adds the infrastructure for the TRACE() debug macro.
---
 include/litmus/debug_trace.h |  40 +++++++
 kernel/printk.c              |  13 ++-
 litmus/Kconfig               |  53 +++++++++
 litmus/Makefile              |   1 +
 litmus/sched_trace.c         | 251 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 357 insertions(+), 1 deletion(-)
 create mode 100644 include/litmus/debug_trace.h
 create mode 100644 litmus/sched_trace.c

diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h
new file mode 100644
index 0000000..1266ac6
--- /dev/null
+++ b/include/litmus/debug_trace.h
@@ -0,0 +1,40 @@
+#ifndef LITMUS_DEBUG_TRACE_H
+#define LITMUS_DEBUG_TRACE_H
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+void sched_trace_log_message(const char* fmt, ...);
+void dump_trace_buffer(int max);
+#else
+
+#define sched_trace_log_message(fmt, ...)
+
+#endif
+
+extern atomic_t __log_seq_no;
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER
+#define TRACE_PREFIX "%d P%d [%s@%s:%d]: "
+#define TRACE_ARGS  atomic_add_return(1, &__log_seq_no),	\
+		raw_smp_processor_id(),				\
+		__FUNCTION__, __FILE__, __LINE__
+#else
+#define TRACE_PREFIX "%d P%d: "
+#define TRACE_ARGS  atomic_add_return(1, &__log_seq_no), \
+		raw_smp_processor_id()
+#endif
+
+#define TRACE(fmt, args...)						\
+	sched_trace_log_message(TRACE_PREFIX fmt,			\
+				TRACE_ARGS,  ## args)
+
+/* TRACE() with a "(comm/pid:job_no)" prefix; t may be NULL, is evaluated 3x. */
+#define TRACE_TASK(t, fmt, args...)				\
+	TRACE("(%s/%d:%d) " fmt,				\
+	      (t) ? (t)->comm : "null",				\
+	      (t) ? (t)->pid : 0,				\
+	      (t) ? (t)->rt_param.job_params.job_no : 0, ##args)
+
+#define TRACE_CUR(fmt, args...) \
+	TRACE_TASK(current, fmt, ## args)
+
+#endif
diff --git a/kernel/printk.c b/kernel/printk.c
index d37d45c..5616e59 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -66,6 +66,13 @@ int console_printk[4] = {
 };
 
 /*
+ * divert printk() messages when there is a LITMUS^RT debug listener
+ */
+#include <litmus/debug_trace.h>
+int trace_override = 0;
+int trace_recurse  = 0;
+
+/*
  * Low level drivers may need that to know if they can schedule in
  * their unblank() callback or not. So let's export it.
  */
@@ -1552,6 +1559,10 @@ asmlinkage int vprintk_emit(int facility, int level,
 	 */
 	text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
 
+	/* if LITMUS^RT tracer is active divert printk() msgs */
+	if (trace_override && !trace_recurse)
+		TRACE("%s", text);
+
 	/* mark and strip a trailing newline */
 	if (text_len && text[text_len-1] == '\n') {
 		text_len--;
@@ -2478,7 +2489,7 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 void wake_up_klogd(void)
 {
 	preempt_disable();
-	if (waitqueue_active(&log_wait)) {
+	if (!trace_override && waitqueue_active(&log_wait)) {
 		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
 		irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
 	}
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 0c7e06b..e4624ee 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -49,6 +49,59 @@ config SCHED_OVERHEAD_TRACE_SHIFT
 		   24 => 512 MB
 		   26 =>  2G MB
 
+config SCHED_DEBUG_TRACE
+	bool "TRACE() debugging"
+	default n
+	help
+	  Include support for sched_trace_log_message(), which is used to
+	  implement TRACE(). If disabled, no TRACE() messages will be included
+	  in the kernel, and no overheads due to debugging statements will be
+	  incurred by the scheduler. Disable if the overhead is not acceptable
+	  (e.g. benchmarking).
+
+	  Say Yes for debugging.
+	  Say No for overhead tracing.
+
+config SCHED_DEBUG_TRACE_SHIFT
+       int "Buffer size for TRACE() buffer"
+       depends on SCHED_DEBUG_TRACE
+       range 14 22
+       default 18
+       help
+
+	Select the amount of memory needed for the TRACE() buffer, as a
+	power of two. The TRACE() buffer is global and statically allocated. If
+	the buffer is too small, there will be holes in the TRACE() log if the
+	buffer-flushing task is starved.
+
+	The default should be sufficient for most systems. Increase the buffer
+	size if the log contains holes. Reduce the buffer size when running on
+	a memory-constrained system.
+
+	Examples: 14 =>  16KB
+		  18 => 256KB
+		  20 =>   1MB
+
+        This buffer is exported to userspace using a misc device as
+        'litmus/log'. On a system with default udev rules, a corresponding
+        character device node should be created at /dev/litmus/log. The buffer
+        can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'.
+
+config SCHED_DEBUG_TRACE_CALLER
+       bool "Include [function@file:line] tag in TRACE() log"
+       depends on SCHED_DEBUG_TRACE
+       default n
+       help
+         With this option enabled, TRACE() prepends
+
+	      "[<function name>@<filename>:<line number>]"
+
+	 to each message in the debug log. Enable this to aid in figuring out
+         what was called in which order. The downside is that it adds a lot of
+         clutter.
+
+	 If unsure, say No.
+
 endmenu
 
 endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index 99f90c3..07f065f 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -3,4 +3,5 @@
 #
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
+obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
 obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 0000000..426a9dd
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,251 @@
+/*
+ * sched_trace.c -- record scheduling events to a byte stream.
+ */
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/sysrq.h>
+#include <linux/sched.h>
+#include <linux/kfifo.h>
+
+atomic_t __log_seq_no = ATOMIC_INIT(0);
+
+#define SCHED_TRACE_NAME "litmus/log"
+
+/* Compute size of TRACE() buffer */
+#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT)
+
+/* Max length of one read from the buffer */
+#define MAX_READ_LEN (64 * 1024)
+
+/* Max length for one write --- by TRACE() --- to the buffer. This is used to
+ * allocate a per-cpu buffer for printf() formatting. */
+#define MSG_SIZE 255
+
+
+static DEFINE_MUTEX(reader_mutex);
+static atomic_t reader_cnt = ATOMIC_INIT(0);
+static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE);
+
+
+static DEFINE_RAW_SPINLOCK(log_buffer_lock);
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/*
+ * sched_trace_log_message - format a message and append it to debug_buffer
+ *
+ * Does nothing unless at least one reader has the log device open.
+ * The message is formatted into this CPU's fmt_buffer with interrupts
+ * disabled; writers are serialized by log_buffer_lock.
+ * Messages are truncated to MSG_SIZE - 1 characters, and the return value
+ * of kfifo_in() is not checked, so a full buffer drops data silently.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+	unsigned long 	flags;
+	va_list 	args;
+	size_t		len;
+	char*		buf;
+
+	if (!atomic_read(&reader_cnt))
+		/* early exit if nobody is listening */
+		return;
+
+	va_start(args, fmt);
+	local_irq_save(flags);
+
+	/* format message into the per-cpu scratch buffer */
+	buf = __get_cpu_var(fmt_buffer);
+	len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+	raw_spin_lock(&log_buffer_lock);
+	/* Don't copy the trailing null byte, we don't want null bytes in a
+	 * text file.
+	 */
+	kfifo_in(&debug_buffer, buf, len);
+	raw_spin_unlock(&log_buffer_lock);
+
+	local_irq_restore(flags);
+	va_end(args);
+}
+
+
+/*
+ * log_read - copy buffered TRACE() output to userspace
+ *
+ * Polls every 110 jiffies until data arrives or a signal is pending; at most
+ * MAX_READ_LEN bytes per call, f_pos ignored, serialized by reader_mutex.
+ */
+static ssize_t log_read(struct file *filp,
+			char __user *to, size_t len,
+			loff_t *f_pos)
+{
+	ssize_t error;
+	char *mem;
+
+	/* a zero-length read can never make progress; don't poll forever */
+	if (!len)
+		return 0;
+
+	if (mutex_lock_interruptible(&reader_mutex))
+		return -ERESTARTSYS;
+
+	if (len > MAX_READ_LEN)
+		len = MAX_READ_LEN;
+
+	mem = kmalloc(len, GFP_KERNEL);
+	if (!mem) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	error = kfifo_out(&debug_buffer, mem, len);
+	while (!error) {
+		/* buffer is empty: sleep a little, then try again */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(110);
+		if (signal_pending(current))
+			error = -ERESTARTSYS;
+		else
+			error = kfifo_out(&debug_buffer, mem, len);
+	}
+
+	if (error > 0 && copy_to_user(to, mem, error))
+		error = -EFAULT;
+
+	kfree(mem);
+ out_unlock:
+	mutex_unlock(&reader_mutex);
+	return error;
+}
+
+/*
+ * Enable redirection of printk() messages to the trace buffer.
+ * Defined in kernel/printk.c
+ */
+extern int trace_override;
+extern int trace_recurse;
+
+/*
+ * log_open - register a new reader of the TRACE() log device.
+ *
+ * Bumps reader_cnt (which enables sched_trace_log_message()) and
+ * trace_override (which diverts printk() output into the log).
+ * Returns 0, or -ERESTARTSYS if interrupted while taking reader_mutex.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+	if (mutex_lock_interruptible(&reader_mutex))
+		return -ERESTARTSYS;
+
+	/* writers only log while reader_cnt is non-zero */
+	atomic_inc(&reader_cnt);
+
+	printk(KERN_DEBUG
+	       "sched_trace kfifo with buffer starting at: 0x%p\n",
+	       debug_buffer.buf);
+
+	/* override printk() */
+	trace_override++;
+
+	mutex_unlock(&reader_mutex);
+
+	return 0;
+}
+
+/*
+ * log_release - drop a reader; undoes the counters taken in log_open().
+ * Must not fail: bailing out on a signal would leave reader_cnt and
+ * trace_override elevated forever, so take the mutex uninterruptibly.
+ */
+static int log_release(struct inode *in, struct file *filp)
+{
+	mutex_lock(&reader_mutex);
+
+	atomic_dec(&reader_cnt);
+
+	/* release printk() overriding */
+	trace_override--;
+
+	printk(KERN_DEBUG "sched_trace kfifo released\n");
+
+	mutex_unlock(&reader_mutex);
+
+	return 0;
+}
+
+/*
+ * log_fops  - The file operations for accessing the global LITMUS log message
+ *             buffer.
+ *
+ * Read-only device: only open, release, and read are provided.
+ */
+static const struct file_operations log_fops = {
+	.owner   = THIS_MODULE,
+	.open    = log_open,
+	.release = log_release,
+	.read    = log_read,
+};
+
+static struct miscdevice litmus_log_dev = {
+	.name    = SCHED_TRACE_NAME,
+	.minor   = MISC_DYNAMIC_MINOR,
+	.fops    = &log_fops,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/* Drain debug_buffer to printk() in up to max chunks (max == 0: no limit). */
+void dump_trace_buffer(int max)
+{
+	char line[80];
+	int len, count = 0;
+
+	/* stop printk() from re-logging our output; racy, but very unlikely */
+	trace_recurse = 1;
+	while ((max == 0 || count++ < max) &&
+	       (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) {
+		line[len] = '\0';	/* slot reserved by the "- 1" above */
+		printk("%s", line);
+	}
+	trace_recurse = 0;
+}
+
+static void sysrq_dump_trace_buffer(int key)
+{
+	dump_trace_buffer(100);	/* print at most 100 chunks per key press */
+}
+
+static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
+	.handler	= sysrq_dump_trace_buffer,
+	.help_msg	= "dump-trace-buffer(Y)",
+	.action_msg	= "writing content of TRACE() buffer",
+};
+#endif
+
+static int __init init_sched_trace(void)
+{
+	printk("Initializing TRACE() device\n");
+
+#ifdef CONFIG_MAGIC_SYSRQ
+	/* SysRq-Y dumps the TRACE() buffer via printk(); failure is not fatal */
+	if (register_sysrq_key('y', &sysrq_dump_trace_buffer_op) != 0)
+		printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
+	else
+		printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
+#endif
+
+	return misc_register(&litmus_log_dev);
+}
+
+static void __exit exit_sched_trace(void)
+{
+	misc_deregister(&litmus_log_dev);	/* NOTE(review): the sysrq key from init is not unregistered */
+}
+
+module_init(init_sched_trace);
+module_exit(exit_sched_trace);
-- 
1.8.1.2


From 39587f773d6b64c8f7ab82d7e222de45899cb36f Mon Sep 17 00:00:00 2001
From: Felipe Cerqueira <felipec@mpi-sws.org>
Date: Mon, 11 Feb 2013 16:36:35 +0100
Subject: [PATCH 011/119] Add hrtimer_start_on() support

This patch adds hrtimer_start_on(), which allows arming timers on
remote CPUs.  This is needed to avoid timer interrupts on "shielded"
CPUs and is also useful for implementing semi-partitioned schedulers.
---
 arch/arm/Kconfig                   |  3 ++
 arch/x86/Kconfig                   |  3 ++
 arch/x86/include/asm/entry_arch.h  |  1 +
 arch/x86/include/asm/hw_irq.h      |  3 ++
 arch/x86/include/asm/irq_vectors.h |  6 +++
 arch/x86/kernel/entry_64.S         |  2 +
 arch/x86/kernel/irqinit.c          |  3 ++
 arch/x86/kernel/smp.c              | 23 +++++++++
 include/linux/hrtimer.h            | 32 +++++++++++++
 include/linux/smp.h                |  5 ++
 kernel/hrtimer.c                   | 95 ++++++++++++++++++++++++++++++++++++++
 11 files changed, 176 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index ecfd735..81dddd7 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2273,5 +2273,8 @@ source "arch/arm/kvm/Kconfig"
 config ARCH_HAS_FEATHER_TRACE
 	def_bool n
 
+config ARCH_HAS_SEND_PULL_TIMERS
+	def_bool n
+
 source "litmus/Kconfig"
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 171cdc9..b069526 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2350,4 +2350,7 @@ source "lib/Kconfig"
 config ARCH_HAS_FEATHER_TRACE
 	def_bool y
 
+config ARCH_HAS_SEND_PULL_TIMERS
+	def_bool y
+
 source "litmus/Kconfig"
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 9bd4eca..3a3c2f1 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -13,6 +13,7 @@
 BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR)
 BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 #endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1da97ef..672de93 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -77,6 +77,8 @@ extern void threshold_interrupt(void);
 extern void call_function_interrupt(void);
 extern void call_function_single_interrupt(void);
 
+extern void pull_timers_interrupt(void);
+
 /* IOAPIC */
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
 extern unsigned long io_apic_irqs;
@@ -166,6 +168,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 extern void smp_reschedule_interrupt(struct pt_regs *);
 extern void smp_call_function_interrupt(struct pt_regs *);
 extern void smp_call_function_single_interrupt(struct pt_regs *);
+extern void smp_pull_timers_interrupt(struct pt_regs *);
 #ifdef CONFIG_X86_32
 extern void smp_invalidate_interrupt(struct pt_regs *);
 #else
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5702d7e..224116b 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -124,6 +124,12 @@
  */
 #define LOCAL_TIMER_VECTOR		0xef
 
+/*
+ * LITMUS^RT pull timers IRQ vector.
+ * Make sure it's not used by Linux.
+ */
+#define PULL_TIMERS_VECTOR		0xdf
+
 #define NR_VECTORS			 256
 
 #define FPU_IRQ				  13
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7ac938a..2a54337 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1183,6 +1183,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \
 	call_function_interrupt smp_call_function_interrupt
 apicinterrupt RESCHEDULE_VECTOR \
 	reschedule_interrupt smp_reschedule_interrupt
+apicinterrupt PULL_TIMERS_VECTOR \
+	pull_timers_interrupt smp_pull_timers_interrupt
 #endif
 
 apicinterrupt ERROR_APIC_VECTOR \
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc..77979d9 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -145,6 +145,9 @@ static void __init smp_intr_init(void)
 	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
 			call_function_single_interrupt);
 
+	/* IPI for hrtimer pulling on remote cpus */
+	alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt);
+
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 48d2b7d..a52ef7f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -24,6 +24,8 @@
 #include <linux/cpu.h>
 #include <linux/gfp.h>
 
+#include <litmus/debug_trace.h>
+
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -163,6 +165,16 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
 	return NMI_HANDLED;
 }
 
+/* send the pull-timers IPI to @cpu so that it runs its pull handler */
+void smp_send_pull_timers(int cpu)
+{
+	/* an offline CPU cannot take the IPI: warn and drop the request */
+	if (WARN_ON(cpu_is_offline(cpu)))
+		return;
+
+	apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR);
+}
+
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
@@ -285,6 +297,17 @@ static int __init nonmi_ipi_setup(char *str)
 
 __setup("nonmi_ipi", nonmi_ipi_setup);
 
+extern void hrtimer_pull(void);	/* defined in kernel/hrtimer.c */
+
+void smp_pull_timers_interrupt(struct pt_regs *regs)
+{
+	ack_APIC_irq();
+	irq_enter();
+	TRACE("pull timer interrupt\n");
+	hrtimer_pull();	/* start the timers queued on this CPU's to_pull list */
+	irq_exit();
+}
+
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus	= native_smp_prepare_cpus,
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d19a5c2..93def50 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -176,6 +176,7 @@ enum  hrtimer_base_type {
  * @nr_hangs:		Total number of hrtimer interrupt hangs
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @clock_base:		array of clock bases for this cpu
+ * @to_pull:		LITMUS^RT list of timers to be pulled on this cpu
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
@@ -191,8 +192,32 @@ struct hrtimer_cpu_base {
 	ktime_t				max_hang_time;
 #endif
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
+	struct list_head		to_pull;
 };
 
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+
+#define HRTIMER_START_ON_INACTIVE	0
+#define HRTIMER_START_ON_QUEUED		1
+
+/*
+ * struct hrtimer_start_on_info - request to arm a timer on a remote cpu
+ * @list:	link in the target cpu's hrtimer_cpu_base.to_pull list
+ * @timer:	timer to be started on the remote cpu
+ * @time:	expiry time passed to hrtimer_start()
+ * @mode:	timer mode passed to hrtimer_start()
+ * @state:	HRTIMER_START_ON_INACTIVE or HRTIMER_START_ON_QUEUED
+ */
+struct hrtimer_start_on_info {
+	struct list_head	list;
+	struct hrtimer		*timer;
+	ktime_t			time;
+	enum hrtimer_mode	mode;
+	atomic_t		state;
+};
+
+#endif
+
 static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
 {
 	timer->node.expires = time;
@@ -366,6 +391,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			 unsigned long delta_ns,
 			 const enum hrtimer_mode mode, int wakeup);
 
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info);
+extern int hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
+			struct hrtimer *timer, ktime_t time,
+			const enum hrtimer_mode mode);
+#endif
+
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index c848876..4f78ea7 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -84,6 +84,11 @@ int smp_call_function_any(const struct cpumask *mask,
 void kick_all_cpus_sync(void);
 
 /*
+ * sends a 'pull timer' event to a remote CPU
+ */
+extern void smp_send_pull_timers(int cpu);
+
+/*
  * Generic and arch helpers
  */
 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2288fbd..c7f0c79 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,6 +48,8 @@
 #include <linux/sched/rt.h>
 #include <linux/timer.h>
 
+#include <litmus/debug_trace.h>
+
 #include <asm/uaccess.h>
 
 #include <trace/events/timer.h>
@@ -1064,6 +1066,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
+#if defined(CONFIG_ARCH_HAS_SEND_PULL_TIMERS) && defined(CONFIG_SMP)
+
+/**
+ * hrtimer_start_on_info_init - zero @info and mark it as not queued
+ */
+void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	atomic_set(&info->state, HRTIMER_START_ON_INACTIVE);
+}
+
+/**
+ *  hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
+ *  Detaches to_pull under base->lock, then starts each timer locally.
+ */
+void hrtimer_pull(void)
+{
+	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+	struct hrtimer_start_on_info *info, *next;
+	struct list_head list;
+
+	raw_spin_lock(&base->lock);
+	list_replace_init(&base->to_pull, &list);
+	raw_spin_unlock(&base->lock);
+
+	list_for_each_entry_safe(info, next, &list, list) {
+		TRACE("pulled timer 0x%p\n", info->timer);
+		list_del(&info->list);
+		hrtimer_start(info->timer, info->time, info->mode);
+	}
+}
+
+/**
+ *  hrtimer_start_on - trigger timer arming on remote cpu
+ *  @cpu:	cpu on which to arm the timer (may be the local cpu)
+ *  @info:	caller-provided request record; ignored unless inactive
+ *  @timer:	timer to be pulled
+ *  @time:	expire time
+ *  @mode:	timer mode
+ */
+int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
+		struct hrtimer *timer, ktime_t time,
+		const enum hrtimer_mode mode)
+{
+	unsigned long flags;
+	struct hrtimer_cpu_base* base;
+	int in_use = 0, was_empty;
+
+	/* serialize access to info through the timer base */
+	lock_hrtimer_base(timer, &flags);
+
+	in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
+	if (!in_use) {
+		INIT_LIST_HEAD(&info->list);
+		info->timer = timer;
+		info->time  = time;
+		info->mode  = mode;
+		/* mark as in use; NOTE(review): nothing visible here resets it */
+		atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
+	}
+
+	unlock_hrtimer_base(timer, &flags);
+
+	if (!in_use) {
+		/* initiate pull; stay on this cpu while comparing against it */
+		preempt_disable();
+		if (cpu == smp_processor_id()) {
+			/* start timer locally; we may get called
+			 * with rq->lock held, do not wake up anything
+			 */
+			TRACE("hrtimer_start_on: starting on local CPU\n");
+			__hrtimer_start_range_ns(info->timer, info->time,
+						 0, info->mode, 0);
+		} else {
+			TRACE("hrtimer_start_on: pulling to remote CPU\n");
+			base = &per_cpu(hrtimer_bases, cpu);
+			raw_spin_lock_irqsave(&base->lock, flags);
+			was_empty = list_empty(&base->to_pull);
+			list_add(&info->list, &base->to_pull);
+			raw_spin_unlock_irqrestore(&base->lock, flags);
+			if (was_empty)
+				/* only send an IPI if no one else
+				 * has done so already
+				 */
+				smp_send_pull_timers(cpu);
+		}
+		preempt_enable();
+	}
+	return in_use;	/* non-zero: @info was already queued, nothing was armed */
+}
+
+#endif
 
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
@@ -1667,6 +1761,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	}
 
 	hrtimer_init_hres(cpu_base);
+	INIT_LIST_HEAD(&cpu_base->to_pull);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
1.8.1.2


From 0014dd7899cb1e7109516d70c6db29223b3bdbdd Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 25 Jun 2013 06:22:06 +0200
Subject: [PATCH 012/119] Extend task_struct with rt_param

This patch adds the PCB extensions required for LITMUS^RT.
---
 include/linux/sched.h     |   5 +
 include/litmus/rt_param.h | 285 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 290 insertions(+)
 create mode 100644 include/litmus/rt_param.h

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 597c8ab..164bb0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -55,6 +55,8 @@ struct sched_param {
 
 #include <asm/processor.h>
 
+#include <litmus/rt_param.h>
+
 struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
@@ -1369,6 +1371,9 @@ struct task_struct {
 	int nr_dirtied_pause;
 	unsigned long dirty_paused_when; /* start of a write-and-pause period */
 
+	/* LITMUS RT parameters and state */
+	struct rt_param rt_param;
+
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 0000000..ce76faa
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,285 @@
+/*
+ * Definition of the LITMUS^RT per-task real-time parameters and state.
+ *
+ */
+#ifndef _LINUX_RT_PARAM_H_
+#define _LINUX_RT_PARAM_H_
+
+/* Litmus time type. */
+typedef unsigned long long lt_t;
+
+static inline int lt_after(lt_t a, lt_t b)
+{
+	return ((long long) b) - ((long long) a) < 0;
+}
+#define lt_before(a, b) lt_after(b, a)
+
+static inline int lt_after_eq(lt_t a, lt_t b)
+{
+	return ((long long) a) - ((long long) b) >= 0;
+}
+#define lt_before_eq(a, b) lt_after_eq(b, a)
+
+/* different types of clients */
+typedef enum {
+	RT_CLASS_HARD,
+	RT_CLASS_SOFT,
+	RT_CLASS_BEST_EFFORT
+} task_class_t;
+
+typedef enum {
+	NO_ENFORCEMENT,      /* job may overrun unhindered */
+	QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */
+	PRECISE_ENFORCEMENT  /* budgets are enforced with hrtimers */
+} budget_policy_t;
+
+/* Release behaviors for jobs. PERIODIC and EARLY jobs
+   must end by calling sys_complete_job() (or equivalent)
+   to set up their next release and deadline. */
+typedef enum {
+	/* Jobs are released sporadically (provided job precedence
+       constraints are met). */
+	TASK_SPORADIC,
+
+	/* Jobs are released periodically (provided job precedence
+       constraints are met). */
+	TASK_PERIODIC,
+
+    /* Jobs are released immediately after meeting precedence
+       constraints. Beware this can peg your CPUs if used in
+       the wrong applications. Only supported by EDF schedulers. */
+	TASK_EARLY
+} release_policy_t;
+
+/* We use the common priority interpretation "lower index == higher priority",
+ * which is commonly used in fixed-priority schedulability analysis papers.
+ * So, a numerically lower priority value implies higher scheduling priority,
+ * with priority 1 being the highest priority. Priority 0 is reserved for
+ * priority boosting. LITMUS_MAX_PRIORITY denotes the maximum priority value
+ * range.
+ */
+
+#define LITMUS_MAX_PRIORITY     512
+#define LITMUS_HIGHEST_PRIORITY   1
+#define LITMUS_LOWEST_PRIORITY    (LITMUS_MAX_PRIORITY - 1)
+
+/* Provide generic comparison macros for userspace, in case that we change
+ * this later (lower value == higher priority; arguments are parenthesized). */
+#define litmus_higher_fixed_prio(a, b)	((a) < (b))
+#define litmus_lower_fixed_prio(a, b)	((a) > (b))
+#define litmus_is_valid_fixed_prio(p)		\
+	((p) >= LITMUS_HIGHEST_PRIORITY &&	\
+	 (p) <= LITMUS_LOWEST_PRIORITY)
+
+struct rt_task {
+	lt_t 		exec_cost;
+	lt_t 		period;
+	lt_t		relative_deadline;
+	lt_t		phase;
+	unsigned int	cpu;
+	unsigned int	priority;
+	task_class_t	cls;
+	budget_policy_t  budget_policy;  /* ignored by pfair */
+	release_policy_t release_policy;
+};
+
+union np_flag {
+	uint64_t raw;
+	struct {
+		/* Is the task currently in a non-preemptive section? */
+		uint64_t flag:31;
+		/* Should the task call into the scheduler? */
+		uint64_t preempt:1;
+	} np;
+};
+
+/* The definition of the data that is shared between the kernel and real-time
+ * tasks via a shared page (see litmus/ctrldev.c).
+ *
+ * WARNING: User space can write to this, so don't trust
+ * the correctness of the fields!
+ *
+ * This serves two purposes: to enable efficient signaling
+ * of non-preemptive sections (user->kernel) and
+ * delayed preemptions (kernel->user), and to export
+ * some real-time relevant statistics such as preemption and
+ * migration data to user space. We can't use a device to export
+ * statistics because we want to avoid system call overhead when
+ * determining preemption/migration overheads.
+ */
+struct control_page {
+	/* This flag is used by userspace to communicate non-preemptive
+	 * sections. */
+	volatile union np_flag sched;
+
+	volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is
+				      * handled. */
+
+	/* Locking overhead tracing: userspace records here the time stamp
+	 * and IRQ counter prior to starting the system call. */
+	uint64_t ts_syscall_start;  /* Feather-Trace cycles */
+	uint64_t irq_syscall_start; /* Snapshot of irq_count when the syscall
+				     * started. */
+
+	/* to be extended */
+};
+
+/* Expected offsets within the control page. */
+
+#define LITMUS_CP_OFFSET_SCHED		0
+#define LITMUS_CP_OFFSET_IRQ_COUNT	8
+#define LITMUS_CP_OFFSET_TS_SC_START	16
+#define LITMUS_CP_OFFSET_IRQ_SC_START	24
+
+/* don't export internal data structures to user space (liblitmus) */
+#ifdef __KERNEL__
+
+struct _rt_domain;
+struct bheap_node;
+struct release_heap;
+
+struct rt_job {
+	/* Time instant the job was or will be released.  */
+	lt_t	release;
+
+	/* What is the current deadline? */
+	lt_t   	deadline;
+
+	/* How much service has this job received so far? */
+	lt_t	exec_time;
+
+	/* By how much did the prior job miss its deadline?
+	 * Value differs from tardiness in that lateness may
+	 * be negative (when job finishes before its deadline).
+	 */
+	long long	lateness;
+
+	/* Which job is this. This is used to let user space
+	 * specify which job to wait for, which is important if jobs
+	 * overrun. If we just call sys_sleep_next_period() then we
+	 * will unintentionally miss jobs after an overrun.
+	 *
+	 * Increase this sequence number when a job is released.
+	 */
+	unsigned int    job_no;
+};
+
+struct pfair_param;
+
+/*	RT task parameters for scheduling extensions
+ *	These parameters are inherited during clone and therefore must
+ *	be explicitly set up before the task set is launched.
+ */
+struct rt_param {
+	/* Generic flags available for plugin-internal use. */
+	unsigned int 		flags:8;
+
+	/* do we need to check for srp blocking? */
+	unsigned int		srp_non_recurse:1;
+
+	/* is the task present? (true if it can be scheduled) */
+	unsigned int		present:1;
+
+	/* has the task completed? */
+	unsigned int		completed:1;
+
+#ifdef CONFIG_LITMUS_LOCKING
+	/* Is the task being priority-boosted by a locking protocol? */
+	unsigned int		priority_boosted:1;
+	/* If so, when did this start? */
+	lt_t			boost_start_time;
+
+	/* How many LITMUS^RT locks does the task currently hold/wait for? */
+	unsigned int		num_locks_held;
+	/* How many PCP/SRP locks does the task currently hold/wait for? */
+	unsigned int		num_local_locks_held;
+#endif
+
+	/* user controlled parameters */
+	struct rt_task 		task_params;
+
+	/* timing parameters */
+	struct rt_job 		job_params;
+
+	/* Should the next job be released at some time other than
+	 * just period time units after the last release?
+	 */
+	unsigned int		sporadic_release:1;
+	lt_t			sporadic_release_time;
+
+
+	/* task representing the current "inherited" task
+	 * priority, assigned by inherit_priority and
+	 * return priority in the scheduler plugins.
+	 * could point to self if PI does not result in
+	 * an increased task priority.
+	 */
+	 struct task_struct*	inh_task;
+
+#ifdef CONFIG_NP_SECTION
+	/* For the FMLP under PSN-EDF, it is required to make the task
+	 * non-preemptive from kernel space. In order not to interfere with
+	 * user space, this counter indicates the kernel space np setting.
+	 * kernel_np > 0 => task is non-preemptive
+	 */
+	unsigned int	kernel_np;
+#endif
+
+	/* This field can be used by plugins to store where the task
+	 * is currently scheduled. It is the responsibility of the
+	 * plugin to avoid race conditions.
+	 *
+	 * This is used by GSN-EDF and PFAIR.
+	 */
+	volatile int		scheduled_on;
+
+	/* Is the stack of the task currently in use? This is updated by
+	 * the LITMUS core.
+	 *
+	 * Be careful to avoid deadlocks!
+	 */
+	volatile int		stack_in_use;
+
+	/* This field can be used by plugins to store where the task
+	 * is currently linked. It is the responsibility of the plugin
+	 * to avoid race conditions.
+	 *
+	 * Used by GSN-EDF.
+	 */
+	volatile int		linked_on;
+
+	/* PFAIR/PD^2 state. Allocated on demand. */
+	struct pfair_param*	pfair;
+
+	/* Fields saved before BE->RT transition.
+	 */
+	int old_policy;
+	int old_prio;
+
+	/* ready queue for this task */
+	struct _rt_domain* domain;
+
+	/* heap element for this task
+	 *
+	 * Warning: Don't statically allocate this node. The heap
+	 *          implementation swaps these between tasks, thus after
+	 *          dequeuing from a heap you may end up with a different node
+	 *          than the one you had when enqueuing the task.  For the same
+	 *          reason, don't obtain and store references to this node
+	 *          other than this pointer (which is updated by the heap
+	 *          implementation).
+	 */
+	struct bheap_node*	heap_node;
+	struct release_heap*	rel_heap;
+
+	/* Used by rt_domain to queue task in release list.
+	 */
+	struct list_head list;
+
+	/* Pointer to the page shared between userspace and kernel. */
+	struct control_page * ctrl_page;
+};
+
+#endif
+
+#endif
-- 
1.8.1.2


From c158cd6f9b7928a5ab4a514f9edc044d31f3913b Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 25 Jun 2013 06:31:46 +0200
Subject: [PATCH 013/119] Introduce main LITMUS^RT header

This patch adds a basic litmus/litmus.h, which is required for basic
LITMUS^RT infrastructure to compile.
---
 include/litmus/litmus.h    | 61 ++++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/sched.h |  1 +
 2 files changed, 62 insertions(+)
 create mode 100644 include/litmus/litmus.h

diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 0000000..c87863c
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,61 @@
+/*
+ * Constant definitions related to
+ * scheduling policy.
+ */
+
+#ifndef _LINUX_LITMUS_H_
+#define _LINUX_LITMUS_H_
+
+#define is_realtime(t) 		((t)->policy == SCHED_LITMUS)
+
+#define tsk_rt(t)		(&(t)->rt_param)
+
+/*	Realtime utility macros */
+#ifdef CONFIG_LITMUS_LOCKING
+#define is_priority_boosted(t)  (tsk_rt(t)->priority_boosted)
+#define get_boost_start(t)  (tsk_rt(t)->boost_start_time)
+#else
+#define is_priority_boosted(t)  0
+#define get_boost_start(t)      0
+#endif
+
+
+/* task_params macros */
+#define get_exec_cost(t)  	(tsk_rt(t)->task_params.exec_cost)
+#define get_rt_period(t)	(tsk_rt(t)->task_params.period)
+#define get_rt_relative_deadline(t)	(tsk_rt(t)->task_params.relative_deadline)
+#define get_rt_phase(t)		(tsk_rt(t)->task_params.phase)
+#define get_partition(t) 	(tsk_rt(t)->task_params.cpu)
+#define get_priority(t) 	(tsk_rt(t)->task_params.priority)
+#define get_class(t)        (tsk_rt(t)->task_params.cls)
+
+/* job_param macros */
+#define get_exec_time(t)    (tsk_rt(t)->job_params.exec_time)
+#define get_deadline(t)		(tsk_rt(t)->job_params.deadline)
+#define get_release(t)		(tsk_rt(t)->job_params.release)
+#define get_lateness(t)		(tsk_rt(t)->job_params.lateness)
+
+#define is_hrt(t)     		\
+	(tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
+#define is_srt(t)     		\
+	(tsk_rt(t)->task_params.cls == RT_CLASS_SOFT)
+#define is_be(t)      		\
+	(tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT)
+
+/* Our notion of time within LITMUS: kernel monotonic time. */
+static inline lt_t litmus_clock(void)
+{
+	return ktime_to_ns(ktime_get());
+}
+
+static inline struct control_page* get_control_page(struct task_struct *t)
+{
+	return tsk_rt(t)->ctrl_page;
+}
+
+static inline int has_control_page(struct task_struct* t)
+{
+	return tsk_rt(t)->ctrl_page != NULL;
+}
+
+#endif
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5a0f945..6a7b1b7 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -39,6 +39,7 @@
 #define SCHED_BATCH		3
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE		5
+#define SCHED_LITMUS		6
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
 
-- 
1.8.1.2


From fd09632251e9988dc2d064b80d69f73a3b50e4e9 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 14:42:21 +0200
Subject: [PATCH 014/119] Feather-Trace: use proper definition of is_realtime()

Remove dummy implementation of is_realtime() in trace.c.
---
 litmus/trace.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/litmus/trace.c b/litmus/trace.c
index a378623..7eacf2e 100644
--- a/litmus/trace.c
+++ b/litmus/trace.c
@@ -3,11 +3,9 @@
 #include <linux/uaccess.h>
 
 #include <litmus/ftdev.h>
+#include <litmus/litmus.h>
 #include <litmus/trace.h>
 
-/* dummy definition of is_realtime() */
-#define is_realtime(t) (0)
-
 /******************************************************************************/
 /*                          Allocation                                        */
 /******************************************************************************/
-- 
1.8.1.2


From f2c8dedbf8d03df6f0a977f9e8aee41227cd8893 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 23 Jun 2013 14:43:54 +0200
Subject: [PATCH 015/119] Feather-Trace: write interrupt counts to control page

This patch exports the interrupt counter to userspace via the control
page.
---
 litmus/trace.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/litmus/trace.c b/litmus/trace.c
index 7eacf2e..6f2e295 100644
--- a/litmus/trace.c
+++ b/litmus/trace.c
@@ -29,6 +29,9 @@ void ft_irq_fired(void)
 	/* Only called with preemptions disabled.  */
 	atomic_inc(&__get_cpu_var(irq_fired_count));
 	atomic_inc(&__get_cpu_var(cpu_irq_fired_count));
+
+	if (has_control_page(current))
+		get_control_page(current)->irq_count++;
 }
 
 static inline void clear_irq_fired(void)
-- 
1.8.1.2


From b8512db6218d2c86320c7ec0d07a6efa465e3fcd Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 13:45:47 +0100
Subject: [PATCH 016/119] Add schedule tracing support

This patch introduces the sched_trace infrastructure, which in
principle allows tracing the generated schedule.  However, this patch
does not yet integrate the callbacks with the kernel.
---
 include/litmus/sched_trace.h | 251 +++++++++++++++++++++++++++++++++++++++++++
 litmus/Kconfig               |  34 ++++++
 litmus/Makefile              |   1 +
 litmus/sched_task_trace.c    | 241 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 527 insertions(+)
 create mode 100644 include/litmus/sched_trace.h
 create mode 100644 litmus/sched_task_trace.c

diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 0000000..6044d9f
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,251 @@
+/*
+ * sched_trace.h -- record scheduler events to a byte stream for offline analysis.
+ */
+#ifndef _LINUX_SCHED_TRACE_H_
+#define _LINUX_SCHED_TRACE_H_
+
+/* all times in nanoseconds */
+
+struct st_trace_header {
+	u8	type;		/* Of what type is this record?  */
+	u8	cpu;		/* On which CPU was it recorded? */
+	u16	pid;		/* PID of the task.              */
+	u32	job;		/* The job sequence number.      */
+};
+
+#define ST_NAME_LEN 16
+struct st_name_data {
+	char	cmd[ST_NAME_LEN];/* The name of the executable of this process. */
+};
+
+struct st_param_data {		/* regular params */
+	u32	wcet;
+	u32	period;
+	u32	phase;
+	u8	partition;
+	u8	class;
+	u8	__unused[2];
+};
+
+struct st_release_data {	/* A job was/is going to be released. */
+	u64	release;	/* What's the release time?              */
+	u64	deadline;	/* By when must it finish?		 */
+};
+
+struct st_assigned_data {	/* A job was assigned to a CPU.		 */
+	u64	when;
+	u8	target;		/* Where should it execute?	         */
+	u8	__unused[7];
+};
+
+struct st_switch_to_data {	/* A process was switched to on a given CPU.   */
+	u64	when;		/* When did this occur?                        */
+	u32	exec_time;	/* Time the current job has executed.          */
+	u8	__unused[4];
+
+};
+
+struct st_switch_away_data {	/* A process was switched away from on a given CPU. */
+	u64	when;
+	u64	exec_time;
+};
+
+struct st_completion_data {	/* A job completed. */
+	u64	when;
+	u8	forced:1; 	/* Set to 1 if job overran and kernel advanced to the
+				 * next task automatically; set to 0 otherwise.
+				 */
+	u8	__uflags:7;
+	u8	__unused[7];
+};
+
+struct st_block_data {		/* A task blocks. */
+	u64	when;
+	u64	__unused;
+};
+
+struct st_resume_data {		/* A task resumes. */
+	u64	when;
+	u64	__unused;
+};
+
+struct st_action_data {
+	u64	when;
+	u8	action;
+	u8	__unused[7];
+};
+
+struct st_sys_release_data {
+	u64	when;
+	u64	release;
+};
+
+#define DATA(x) struct st_ ## x ## _data x /* union member; ';' comes from the use site */
+
+typedef enum {
+        ST_NAME = 1,		/* Start at one, so that we can spot
+				 * uninitialized records. */
+	ST_PARAM,
+	ST_RELEASE,
+	ST_ASSIGNED,
+	ST_SWITCH_TO,
+	ST_SWITCH_AWAY,
+	ST_COMPLETION,
+	ST_BLOCK,
+	ST_RESUME,
+	ST_ACTION,
+	ST_SYS_RELEASE
+} st_event_record_type_t;
+
+struct st_event_record {
+	struct st_trace_header hdr;
+	union {
+		u64 raw[2];
+
+		DATA(name);
+		DATA(param);
+		DATA(release);
+		DATA(assigned);
+		DATA(switch_to);
+		DATA(switch_away);
+		DATA(completion);
+		DATA(block);
+		DATA(resume);
+		DATA(action);
+		DATA(sys_release);
+	} data;
+};
+
+#undef DATA
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+#include <litmus/feather_trace.h>
+
+#ifdef CONFIG_SCHED_TASK_TRACE
+
+#define SCHED_TRACE(id, callback, task) \
+	ft_event1(id, callback, task)
+#define SCHED_TRACE2(id, callback, task, xtra) \
+	ft_event2(id, callback, task, xtra)
+
+/* provide prototypes; needed on sparc64 */
+#ifndef NO_TASK_TRACE_DECLS
+feather_callback void do_sched_trace_task_name(unsigned long id,
+					       struct task_struct* task);
+feather_callback void do_sched_trace_task_param(unsigned long id,
+						struct task_struct* task);
+feather_callback void do_sched_trace_task_release(unsigned long id,
+						  struct task_struct* task);
+feather_callback void do_sched_trace_task_switch_to(unsigned long id,
+						    struct task_struct* task);
+feather_callback void do_sched_trace_task_switch_away(unsigned long id,
+						      struct task_struct* task);
+feather_callback void do_sched_trace_task_completion(unsigned long id,
+						     struct task_struct* task,
+						     unsigned long forced);
+feather_callback void do_sched_trace_task_block(unsigned long id,
+						struct task_struct* task);
+feather_callback void do_sched_trace_task_resume(unsigned long id,
+						 struct task_struct* task);
+feather_callback void do_sched_trace_action(unsigned long id,
+					    struct task_struct* task,
+					    unsigned long action);
+feather_callback void do_sched_trace_sys_release(unsigned long id,
+						 lt_t* start);
+
+#endif
+
+#else
+
+#define SCHED_TRACE(id, callback, task)        /* no tracing */
+#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
+
+#endif
+
+#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
+
+#include <trace/events/litmus.h>
+
+#else
+
+/* Override trace macros to actually do nothing */
+#define trace_litmus_task_param(t)
+#define trace_litmus_task_release(t)
+#define trace_litmus_switch_to(t)
+#define trace_litmus_switch_away(prev)
+#define trace_litmus_task_completion(t, forced)
+#define trace_litmus_task_block(t)
+#define trace_litmus_task_resume(t)
+#define trace_litmus_sys_release(start)
+
+#endif
+
+
+#define SCHED_TRACE_BASE_ID 500
+
+
+#define sched_trace_task_name(t)					\
+	SCHED_TRACE(SCHED_TRACE_BASE_ID + 1,				\
+			do_sched_trace_task_name, t)
+
+#define sched_trace_task_param(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 2,			\
+				do_sched_trace_task_param, t);		\
+	} while (0)
+
+#define sched_trace_task_release(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 3,			\
+				do_sched_trace_task_release, t);	\
+	} while (0)
+
+#define sched_trace_task_switch_to(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 4,			\
+			do_sched_trace_task_switch_to, t);		\
+	} while (0)
+
+#define sched_trace_task_switch_away(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 5,			\
+			do_sched_trace_task_switch_away, t);		\
+	} while (0)
+
+#define sched_trace_task_completion(t, forced)				\
+	do {								\
+		SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6,			\
+				do_sched_trace_task_completion, t,	\
+				(unsigned long) forced);		\
+	} while (0)
+
+#define sched_trace_task_block(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 7,			\
+			do_sched_trace_task_block, t);			\
+	} while (0)
+
+#define sched_trace_task_resume(t)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 8,			\
+				do_sched_trace_task_resume, t);		\
+	} while (0)
+
+#define sched_trace_action(t, action)					\
+	SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, /* event id 509 */	\
+		do_sched_trace_action, t, (unsigned long) (action)) /* caller supplies ';' */
+
+/* when is a pointer, it does not need an explicit cast to unsigned long */
+#define sched_trace_sys_release(when)					\
+	do {								\
+		SCHED_TRACE(SCHED_TRACE_BASE_ID + 10,			\
+			do_sched_trace_sys_release, when);		\
+	} while (0)
+
+#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
+
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/litmus/Kconfig b/litmus/Kconfig
index e4624ee..19211ac 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -24,6 +24,40 @@ config FEATHER_TRACE
 	  you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
 	  actually enable any events.
 
+config SCHED_TASK_TRACE
+	bool "Trace real-time tasks"
+	depends on FEATHER_TRACE
+	default y
+	help
+	  Include support for the sched_trace_XXX() tracing functions. This
+          allows the collection of real-time task events such as job
+	  completions, job releases, early completions, etc. This results in  a
+	  small overhead in the scheduling code. Disable if the overhead is not
+	  acceptable (e.g., benchmarking).
+
+	  Say Yes for debugging.
+	  Say No for overhead tracing.
+
+config SCHED_TASK_TRACE_SHIFT
+       int "Buffer size for sched_trace_xxx() events"
+       depends on SCHED_TASK_TRACE
+       range 8 13
+       default 9
+       help
+
+         Select the buffer size of sched_trace_xxx() events as a power of two.
+	 These buffers are statically allocated as per-CPU data. Each event
+	 requires 24 bytes storage plus one additional flag byte. Too large
+	 buffers can cause issues with the per-cpu allocator (and waste
+	 memory). Too small buffers can cause scheduling events to be lost. The
+	 "right" size is workload dependent and depends on the number of tasks,
+	 each task's period, each task's number of suspensions, and how often
+	 the buffer is flushed.
+
+	 Examples: 12 =>   4k events
+		   10 =>   1k events
+		    8 =>  256 events
+
 config SCHED_OVERHEAD_TRACE
 	bool "Record timestamps for overhead measurements"
 	depends on FEATHER_TRACE
diff --git a/litmus/Makefile b/litmus/Makefile
index 07f065f..6318f1c 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -3,5 +3,6 @@
 #
 
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
+obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
 obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
 obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
new file mode 100644
index 0000000..5ef8d09
--- /dev/null
+++ b/litmus/sched_task_trace.c
@@ -0,0 +1,241 @@
+/*
+ * sched_task_trace.c -- record scheduling events to a byte stream
+ */
+
+#define NO_TASK_TRACE_DECLS
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include <litmus/ftdev.h>
+#include <litmus/litmus.h>
+
+#include <litmus/sched_trace.h>
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+
+
+#define NO_EVENTS		(1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
+
+#define now() litmus_clock()
+
+struct local_buffer {
+	struct st_event_record record[NO_EVENTS];
+	char   flag[NO_EVENTS];
+	struct ft_buffer ftbuf;
+};
+
+DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
+
+static struct ftdev st_dev;
+
+static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
+{
+	return cpu_online(cpu) ? 0 : -ENODEV;
+}
+
+static int __init init_sched_task_trace(void)
+{
+	struct local_buffer* buf;
+	int i, ok = 0, err;
+	printk("Allocated %u sched_trace_xxx() events per CPU "
+	       "(buffer size: %d bytes)\n",
+	       NO_EVENTS, (int) sizeof(struct local_buffer));
+
+	err = ftdev_init(&st_dev, THIS_MODULE,
+			num_online_cpus(), "sched_trace");
+	if (err)
+		goto err_out;
+
+	for (i = 0; i < st_dev.minor_cnt; i++) {
+		buf = &per_cpu(st_event_buffer, i);
+		ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
+				     sizeof(struct st_event_record),
+				     buf->flag,
+				     buf->record);
+		st_dev.minor[i].buf = &buf->ftbuf;
+	}
+	if (ok == st_dev.minor_cnt) {
+		st_dev.can_open = st_dev_can_open;
+		err = register_ftdev(&st_dev);
+		if (err)
+			goto err_dealloc;
+	} else {
+		err = -EINVAL;
+		goto err_dealloc;
+	}
+
+	return 0;
+
+err_dealloc:
+	ftdev_exit(&st_dev);
+err_out:
+	printk(KERN_WARNING "Could not register sched_trace module\n");
+	return err;
+}
+
+static void __exit exit_sched_task_trace(void)
+{
+	ftdev_exit(&st_dev);
+}
+
+module_init(init_sched_task_trace);
+module_exit(exit_sched_task_trace);
+
+
+static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
+{
+	struct st_event_record* rec = NULL;
+	struct local_buffer* buf;
+
+	buf = &get_cpu_var(st_event_buffer); /* released below or in put_record() */
+	if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
+		rec->hdr.type = type;
+		rec->hdr.cpu  = smp_processor_id();
+		rec->hdr.pid  = t ? t->pid : 0; /* t may be NULL (no task) */
+		rec->hdr.job  = t ? t->rt_param.job_params.job_no : 0;
+	} else {
+		put_cpu_var(st_event_buffer); /* no slot obtained: release right away */
+	}
+	/* rec is NULL on failure; on success the caller must call put_record() */
+	return rec;
+}
+
+static inline void put_record(struct st_event_record* rec)
+{
+	struct local_buffer* buf;
+	buf = &__get_cpu_var(st_event_buffer);
+	ft_buffer_finish_write(&buf->ftbuf, rec); /* finish the write begun in get_record() */
+	put_cpu_var(st_event_buffer); /* pairs with get_cpu_var() in get_record() */
+}
+
+feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_NAME, t);
+	int i;
+	if (rec) {
+		for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
+			rec->data.name.cmd[i] = t->comm[i];
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_PARAM, t);
+	if (rec) {
+		rec->data.param.wcet      = get_exec_cost(t);
+		rec->data.param.period    = get_rt_period(t);
+		rec->data.param.phase     = get_rt_phase(t);
+		rec->data.param.partition = get_partition(t);
+		rec->data.param.class     = get_class(t);
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_RELEASE, t);
+	if (rec) {
+		rec->data.release.release  = get_release(t);
+		rec->data.release.deadline = get_deadline(t);
+		put_record(rec);
+	}
+}
+
+/* skipped: st_assigned_data, we don't use it atm */
+
+feather_callback void do_sched_trace_task_switch_to(unsigned long id,
+						    unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec;
+	if (is_realtime(t)) {
+		rec = get_record(ST_SWITCH_TO, t);
+		if (rec) {
+			rec->data.switch_to.when      = now();
+			rec->data.switch_to.exec_time = get_exec_time(t);
+			put_record(rec);
+		}
+	}
+}
+
+feather_callback void do_sched_trace_task_switch_away(unsigned long id,
+						      unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec;
+	if (is_realtime(t)) {
+		rec = get_record(ST_SWITCH_AWAY, t);
+		if (rec) {
+			rec->data.switch_away.when      = now();
+			rec->data.switch_away.exec_time = get_exec_time(t);
+			put_record(rec);
+		}
+	}
+}
+
+feather_callback void do_sched_trace_task_completion(unsigned long id,
+						     unsigned long _task,
+						     unsigned long forced)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_COMPLETION, t);
+	if (rec) {
+		rec->data.completion.when   = now();
+		rec->data.completion.forced = forced;
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_task_block(unsigned long id,
+						unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_BLOCK, t);
+	if (rec) {
+		rec->data.block.when      = now();
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_task_resume(unsigned long id,
+						 unsigned long _task)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_RESUME, t);
+	if (rec) {
+		rec->data.resume.when      = now();
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_sys_release(unsigned long id,
+						 unsigned long _start)
+{
+	lt_t *start = (lt_t*) _start;
+	struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
+	if (rec) {
+		rec->data.sys_release.when    = now();
+		rec->data.sys_release.release = *start;
+		put_record(rec);
+	}
+}
+
+feather_callback void do_sched_trace_action(unsigned long id,
+					    unsigned long _task,
+					    unsigned long action)
+{
+	struct task_struct *t = (struct task_struct*) _task;
+	struct st_event_record* rec = get_record(ST_ACTION, t);
+
+	if (rec) {
+		rec->data.action.when   = now();
+		rec->data.action.action = action;
+		put_record(rec);
+	}
+}
-- 
1.8.1.2


From bac300c4dc3c9c0ed4f317514e1f8496ebe10cac Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 16:31:10 +0100
Subject: [PATCH 017/119] Add tracepoint support

This patch integrates LITMUS^RT's sched_trace_XXX() macros with
Linux's notion of tracepoints. This is useful to visualize schedules
in kernel shark and similar tools. Historically, LITMUS^RT's
sched_trace predates Linux's tracepoint infrastructure.
---
 include/litmus/sched_trace.h  |   8 ++
 include/trace/events/litmus.h | 231 ++++++++++++++++++++++++++++++++++++++++++
 litmus/Kconfig                |  18 ++++
 litmus/sched_task_trace.c     |   4 +
 4 files changed, 261 insertions(+)
 create mode 100644 include/trace/events/litmus.h

diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
index 6044d9f..82bde82 100644
--- a/include/litmus/sched_trace.h
+++ b/include/litmus/sched_trace.h
@@ -194,24 +194,28 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 2,			\
 				do_sched_trace_task_param, t);		\
+		trace_litmus_task_param(t);				\
 	} while (0)
 
 #define sched_trace_task_release(t)					\
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 3,			\
 				do_sched_trace_task_release, t);	\
+		trace_litmus_task_release(t);				\
 	} while (0)
 
 #define sched_trace_task_switch_to(t)					\
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 4,			\
 			do_sched_trace_task_switch_to, t);		\
+		trace_litmus_switch_to(t);				\
 	} while (0)
 
 #define sched_trace_task_switch_away(t)					\
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 5,			\
 			do_sched_trace_task_switch_away, t);		\
+		trace_litmus_switch_away(t);				\
 	} while (0)
 
 #define sched_trace_task_completion(t, forced)				\
@@ -219,18 +223,21 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
 		SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6,			\
 				do_sched_trace_task_completion, t,	\
 				(unsigned long) forced);		\
+		trace_litmus_task_completion(t, forced);		\
 	} while (0)
 
 #define sched_trace_task_block(t)					\
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 7,			\
 			do_sched_trace_task_block, t);			\
+		trace_litmus_task_block(t);				\
 	} while (0)
 
 #define sched_trace_task_resume(t)					\
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 8,			\
 				do_sched_trace_task_resume, t);		\
+		trace_litmus_task_resume(t);				\
 	} while (0)
 
 #define sched_trace_action(t, action)					\
@@ -242,6 +249,7 @@ feather_callback void do_sched_trace_sys_release(unsigned long id,
 	do {								\
 		SCHED_TRACE(SCHED_TRACE_BASE_ID + 10,			\
 			do_sched_trace_sys_release, when);		\
+		trace_litmus_sys_release(when);				\
 	} while (0)
 
 #define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
diff --git a/include/trace/events/litmus.h b/include/trace/events/litmus.h
new file mode 100644
index 0000000..0fffcee
--- /dev/null
+++ b/include/trace/events/litmus.h
@@ -0,0 +1,231 @@
+/*
+ * LITMUS^RT kernel style scheduling tracepoints
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM litmus
+
+#if !defined(_SCHED_TASK_TRACEPOINT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _SCHED_TASK_TRACEPOINT_H
+
+#include <linux/tracepoint.h>
+
+#include <litmus/litmus.h>
+#include <litmus/rt_param.h>
+
+/*
+ * Tracing task admission
+ */
+TRACE_EVENT(litmus_task_param,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		wcet	)
+		__field( lt_t,		period	)
+		__field( lt_t,		phase	)
+		__field( int,		partition )
+	),
+
+	TP_fast_assign(
+		__entry->pid	= t ? t->pid : 0;
+		__entry->job	= t ? t->rt_param.job_params.job_no : 0;
+		__entry->wcet	= get_exec_cost(t);
+		__entry->period	= get_rt_period(t);
+		__entry->phase	= get_rt_phase(t);
+		__entry->partition = get_partition(t);
+	),
+
+	TP_printk("period(%d, %Lu).\nwcet(%d, %Lu).\n",
+		__entry->pid, __entry->period,
+		__entry->pid, __entry->wcet)
+);
+
+/*
+ * Tracing jobs release
+ */
+TRACE_EVENT(litmus_task_release,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		release	)
+		__field( lt_t,		deadline	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= t ? t->pid : 0;
+		__entry->job	= t ? t->rt_param.job_params.job_no : 0;
+		__entry->release	= get_release(t);
+		__entry->deadline	= get_deadline(t);
+	),
+
+	TP_printk("release(job(%u, %u)): %Lu\ndeadline(job(%u, %u)): %Lu\n",
+			__entry->pid, __entry->job, __entry->release,
+			__entry->pid, __entry->job, __entry->deadline)
+);
+
+/*
+ * Tracepoint for switching to new task
+ */
+TRACE_EVENT(litmus_switch_to,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		when	)
+		__field( lt_t,		exec_time	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= is_realtime(t) ? t->pid : 0;
+		__entry->job	= is_realtime(t) ? t->rt_param.job_params.job_no : 0;
+		__entry->when		= litmus_clock();
+		__entry->exec_time	= get_exec_time(t);
+	),
+
+	TP_printk("switch_to(job(%u, %u)): %Lu (exec: %Lu)\n",
+			__entry->pid, __entry->job,
+			__entry->when, __entry->exec_time)
+);
+
+/*
+ * Tracepoint for switching away previous task
+ */
+TRACE_EVENT(litmus_switch_away,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		when	)
+		__field( lt_t,		exec_time	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= is_realtime(t) ? t->pid : 0;
+		__entry->job	= is_realtime(t) ? t->rt_param.job_params.job_no : 0;
+		__entry->when		= litmus_clock();
+		__entry->exec_time	= get_exec_time(t);
+	),
+
+	TP_printk("switch_away(job(%u, %u)): %Lu (exec: %Lu)\n",
+			__entry->pid, __entry->job,
+			__entry->when, __entry->exec_time)
+);
+
+/*
+ * Tracing jobs completion
+ */
+TRACE_EVENT(litmus_task_completion,
+
+	TP_PROTO(struct task_struct *t, unsigned long forced),
+
+	TP_ARGS(t, forced),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		when	)
+		__field( unsigned long,	forced	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= t ? t->pid : 0;
+		__entry->job	= t ? t->rt_param.job_params.job_no : 0;
+		__entry->when	= litmus_clock();
+		__entry->forced	= forced;
+	),
+
+	TP_printk("completed(job(%u, %u)): %Lu (forced: %lu)\n",
+			__entry->pid, __entry->job,
+			__entry->when, __entry->forced)
+);
+
+/*
+ * Trace blocking tasks.
+ */
+TRACE_EVENT(litmus_task_block,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( lt_t,		when	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= t ? t->pid : 0;
+		__entry->when	= litmus_clock();
+	),
+
+	TP_printk("(%u) blocks: %Lu\n", __entry->pid, __entry->when)
+);
+
+/*
+ * Tracing jobs resume
+ */
+TRACE_EVENT(litmus_task_resume,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__field( pid_t,		pid	)
+		__field( unsigned int,	job	)
+		__field( lt_t,		when	)
+	),
+
+	TP_fast_assign(
+		__entry->pid	= t ? t->pid : 0;
+		__entry->job	= t ? t->rt_param.job_params.job_no : 0;
+		__entry->when	= litmus_clock();
+	),
+
+	TP_printk("resume(job(%u, %u)): %Lu\n",
+			__entry->pid, __entry->job, __entry->when)
+);
+
+/*
+ * Trace synchronous release
+ */
+TRACE_EVENT(litmus_sys_release,
+
+	TP_PROTO(lt_t *start),
+
+	TP_ARGS(start),
+
+	TP_STRUCT__entry(
+		__field( lt_t,		rel	)
+		__field( lt_t,		when	)
+	),
+
+	TP_fast_assign(
+		__entry->rel	= *start;
+		__entry->when	= litmus_clock();
+	),
+
+	TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when)
+);
+
+#endif /* _SCHED_TASK_TRACEPOINT_H */
+
+/* Must stay outside the protection */
+#include <trace/define_trace.h>
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 19211ac..5408ef6 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -58,6 +58,24 @@ config SCHED_TASK_TRACE_SHIFT
 		   10 =>   1k events
 		    8 =>  512 events
 
+config SCHED_LITMUS_TRACEPOINT
+	bool "Enable Event/Tracepoint Tracing for real-time task tracing"
+	depends on TRACEPOINTS
+	default n
+	help
+	  Enable kernel-style events (tracepoint) for Litmus. Litmus events
+	  trace the same functions as the above sched_trace_XXX(), but can
+	  be enabled independently.
+	  Litmus tracepoints can be recorded and analyzed together (single
+	  time reference) with all other kernel tracing events (e.g.,
+	  sched:sched_switch, etc.).
+
+	  This also enables a quick way to visualize schedule traces using
+	  trace-cmd utility and kernelshark visualizer.
+
+	  Say Yes for debugging and visualization purposes.
+	  Say No for overhead tracing.
+
 config SCHED_OVERHEAD_TRACE
 	bool "Record timestamps for overhead measurements"
 	depends on FEATHER_TRACE
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index 5ef8d09..2bdfbbd 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -15,6 +15,10 @@
 #include <litmus/feather_trace.h>
 #include <litmus/ftdev.h>
 
+#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
+#define CREATE_TRACE_POINTS
+#include <trace/events/litmus.h>
+#endif
 
 #define NO_EVENTS		(1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
 
-- 
1.8.1.2


From ce3268cf0ebd29c52dd5130f8430f3a3b61b0cec Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 25 Jun 2013 07:32:20 +0200
Subject: [PATCH 018/119] Add object descriptor table to Linux's task_struct

This table is similar to a file descriptor table. It keeps track of
which "objects" (locks) a real-time task holds a handle to.
---
 include/linux/sched.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 164bb0d..cbb3b44 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1029,6 +1029,7 @@ struct sched_rt_entity {
 
 
 struct rcu_node;
+struct od_table_entry;
 
 enum perf_event_task_context {
 	perf_invalid_context = -1,
@@ -1374,6 +1375,9 @@ struct task_struct {
 	/* LITMUS RT parameters and state */
 	struct rt_param rt_param;
 
+	/* references to PI semaphores, etc. */
+	struct od_table_entry *od_table;
+
 #ifdef CONFIG_LATENCYTOP
 	int latency_record_count;
 	struct latency_record latency_record[LT_SAVECOUNT];
-- 
1.8.1.2


From 134e525e8a1e90e002e5b51bd6fcfc65dd7c1615 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 27 Jun 2013 09:54:55 +0200
Subject: [PATCH 019/119] Export num_cache_leaves in asm/processor.h

This is required to make litmus/affinity.c compile.
---
 arch/x86/include/asm/processor.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 254dd2b..add9c71 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -174,6 +174,11 @@ extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
 #ifdef CONFIG_SYSFS
+/* Note: commit 04a1541828ea223169eb44a336bfad8ec0dfb46a hid num_cache_leaves,
+ * but LITMUS^RT currently still depends on it.
+ * FIXME: port LITMUS^RT's affinity-aware scheduling to use proper interfaces.
+ */
+extern unsigned short num_cache_leaves;
 extern int get_shared_cpu_map(cpumask_var_t mask,
 			       unsigned int cpu, int index);
 #endif
-- 
1.8.1.2


From beda58fbd93b3cb07becd08696a09437dc95561c Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 25 Jun 2013 07:27:07 +0200
Subject: [PATCH 020/119] Add LITMUS^RT core implementation

This patch adds the core of LITMUS^RT:

 - library functionality (heaps, rt_domain, prioritization, etc.)
 - budget enforcement logic
 - job management
 - system call backends
 - virtual devices (control page, etc.)
 - scheduler plugin API (and dummy plugin)

This code compiles, but is not yet integrated with the rest of Linux.
---
 include/litmus/affinity.h     |  80 ++++++
 include/litmus/bheap.h        |  77 +++++
 include/litmus/binheap.h      | 205 ++++++++++++++
 include/litmus/budget.h       |  36 +++
 include/litmus/clustered.h    |  44 +++
 include/litmus/edf_common.h   |  25 ++
 include/litmus/fdso.h         |  78 ++++++
 include/litmus/fp_common.h    | 105 +++++++
 include/litmus/fpmath.h       | 147 ++++++++++
 include/litmus/jobs.h         |  10 +
 include/litmus/litmus.h       | 268 ++++++++++++++++++
 include/litmus/litmus_proc.h  |  63 +++++
 include/litmus/locking.h      |  28 ++
 include/litmus/preempt.h      | 164 +++++++++++
 include/litmus/rt_domain.h    | 182 ++++++++++++
 include/litmus/rt_param.h     |  12 +-
 include/litmus/sched_plugin.h | 128 +++++++++
 include/litmus/srp.h          |  28 ++
 include/litmus/unistd_32.h    |  21 ++
 include/litmus/unistd_64.h    |  33 +++
 include/litmus/wait.h         |  57 ++++
 kernel/sched/litmus.c         | 340 ++++++++++++++++++++++
 litmus/Kconfig                | 193 +++++++++++++
 litmus/Makefile               |  20 ++
 litmus/affinity.c             |  41 +++
 litmus/bheap.c                | 316 +++++++++++++++++++++
 litmus/binheap.c              | 387 +++++++++++++++++++++++++
 litmus/budget.c               | 116 ++++++++
 litmus/clustered.c            | 111 ++++++++
 litmus/ctrldev.c              | 160 +++++++++++
 litmus/edf_common.c           | 200 +++++++++++++
 litmus/fdso.c                 | 308 ++++++++++++++++++++
 litmus/fp_common.c            | 119 ++++++++
 litmus/jobs.c                 |  77 +++++
 litmus/litmus.c               | 639 ++++++++++++++++++++++++++++++++++++++++++
 litmus/litmus_proc.c          | 576 +++++++++++++++++++++++++++++++++++++
 litmus/locking.c              | 188 +++++++++++++
 litmus/preempt.c              | 137 +++++++++
 litmus/rt_domain.c            | 353 +++++++++++++++++++++++
 litmus/sched_plugin.c         | 238 ++++++++++++++++
 litmus/srp.c                  | 313 +++++++++++++++++++++
 litmus/sync.c                 | 152 ++++++++++
 litmus/uncachedev.c           | 102 +++++++
 43 files changed, 6871 insertions(+), 6 deletions(-)
 create mode 100644 include/litmus/affinity.h
 create mode 100644 include/litmus/bheap.h
 create mode 100644 include/litmus/binheap.h
 create mode 100644 include/litmus/budget.h
 create mode 100644 include/litmus/clustered.h
 create mode 100644 include/litmus/edf_common.h
 create mode 100644 include/litmus/fdso.h
 create mode 100644 include/litmus/fp_common.h
 create mode 100644 include/litmus/fpmath.h
 create mode 100644 include/litmus/jobs.h
 create mode 100644 include/litmus/litmus_proc.h
 create mode 100644 include/litmus/locking.h
 create mode 100644 include/litmus/preempt.h
 create mode 100644 include/litmus/rt_domain.h
 create mode 100644 include/litmus/sched_plugin.h
 create mode 100644 include/litmus/srp.h
 create mode 100644 include/litmus/unistd_32.h
 create mode 100644 include/litmus/unistd_64.h
 create mode 100644 include/litmus/wait.h
 create mode 100644 kernel/sched/litmus.c
 create mode 100644 litmus/affinity.c
 create mode 100644 litmus/bheap.c
 create mode 100644 litmus/binheap.c
 create mode 100644 litmus/budget.c
 create mode 100644 litmus/clustered.c
 create mode 100644 litmus/ctrldev.c
 create mode 100644 litmus/edf_common.c
 create mode 100644 litmus/fdso.c
 create mode 100644 litmus/fp_common.c
 create mode 100644 litmus/jobs.c
 create mode 100644 litmus/litmus.c
 create mode 100644 litmus/litmus_proc.c
 create mode 100644 litmus/locking.c
 create mode 100644 litmus/preempt.c
 create mode 100644 litmus/rt_domain.c
 create mode 100644 litmus/sched_plugin.c
 create mode 100644 litmus/srp.c
 create mode 100644 litmus/sync.c
 create mode 100644 litmus/uncachedev.c

diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h
new file mode 100644
index 0000000..ca2e442
--- /dev/null
+++ b/include/litmus/affinity.h
@@ -0,0 +1,80 @@
+#ifndef __LITMUS_AFFINITY_H
+#define __LITMUS_AFFINITY_H
+
+#include <linux/cpumask.h>
+
+/*
+  L1 (instr) = depth 0
+  L1 (data)  = depth 1
+  L2 = depth 2
+  L3 = depth 3
+ */
+#define NUM_CACHE_LEVELS 4
+
+struct neighborhood
+{
+	unsigned int size[NUM_CACHE_LEVELS];
+	cpumask_var_t neighbors[NUM_CACHE_LEVELS];
+};
+
+/* topology info is stored redundantly in a big array for fast lookups */
+extern struct neighborhood neigh_info[NR_CPUS];
+
+void init_topology(void); /* called by Litmus module's _init_litmus() */
+
+/* Works like:
+void get_nearest_available_cpu(
+	cpu_entry_t **nearest,
+	cpu_entry_t *start,
+	cpu_entry_t *entries,
+	int release_master)
+
+Set release_master = NO_CPU for no Release Master.
+
+We use a macro here to exploit the fact that C-EDF and G-EDF
+have similar structures for their cpu_entry_t structs, even though
+they do not share a common base-struct.  The macro allows us to
+avoid code duplication.
+
+TODO: Factor out the job-to-processor linking from C/G-EDF into
+a reusable "processor mapping".  (See B.B.'s RTSS'09 paper &
+dissertation.)
+ */
+#define get_nearest_available_cpu(nearest, start, entries, release_master) \
+{ \
+	(nearest) = NULL; \
+	if (!(start)->linked) { \
+		(nearest) = (start); \
+	} else { \
+		int __level; \
+		int __cpu; \
+		int __release_master = ((release_master) == NO_CPU) ? -1 : (release_master); \
+		struct neighborhood *__neighbors = &neigh_info[(start)->cpu]; \
+		\
+		for (__level = 0; (__level < NUM_CACHE_LEVELS) && !(nearest); ++__level) { \
+			if (__neighbors->size[__level] > 1) { \
+				for_each_cpu(__cpu, __neighbors->neighbors[__level]) { \
+					if (__cpu != __release_master) { \
+						cpu_entry_t *__entry = &per_cpu((entries), __cpu); \
+						if (!__entry->linked) { \
+							(nearest) = __entry; \
+							break; \
+						} \
+					} \
+				} \
+			} else if (__neighbors->size[__level] == 0) { \
+				break; \
+			} \
+		} \
+	} \
+	\
+	if ((nearest)) { \
+		TRACE("P%d is closest available CPU to P%d\n", \
+				(nearest)->cpu, (start)->cpu); \
+	} else { \
+		TRACE("Could not find an available CPU close to P%d\n", \
+				(start)->cpu); \
+	} \
+}
+
+#endif
diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
new file mode 100644
index 0000000..cf4864a
--- /dev/null
+++ b/include/litmus/bheap.h
@@ -0,0 +1,77 @@
+/* bheap.h -- Binomial Heaps
+ *
+ * (c) 2008, 2009 Bjoern Brandenburg
+ */
+
+#ifndef BHEAP_H
+#define BHEAP_H
+
+#define NOT_IN_HEAP UINT_MAX
+
+struct bheap_node {
+	struct bheap_node* 	parent;
+	struct bheap_node* 	next;
+	struct bheap_node* 	child;
+
+	unsigned int 		degree;
+	void*			value;
+	struct bheap_node**	ref;
+};
+
+struct bheap {
+	struct bheap_node* 	head;
+	/* We cache the minimum of the heap.
+	 * This speeds up repeated peek operations.
+	 */
+	struct bheap_node*	min;
+};
+
+typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
+
+void bheap_init(struct bheap* heap);
+void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
+
+static inline int bheap_node_in_heap(struct bheap_node* h)
+{
+	return h->degree != NOT_IN_HEAP;
+}
+
+static inline int bheap_empty(struct bheap* heap)
+{
+	return heap->head == NULL && heap->min == NULL;
+}
+
+/* insert (and reinitialize) a node into the heap */
+void bheap_insert(bheap_prio_t higher_prio,
+		 struct bheap* heap,
+		 struct bheap_node* node);
+
+/* merge addition into target */
+void bheap_union(bheap_prio_t higher_prio,
+		struct bheap* target,
+		struct bheap* addition);
+
+struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
+			    struct bheap* heap);
+
+struct bheap_node* bheap_take(bheap_prio_t higher_prio,
+			    struct bheap* heap);
+
+void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
+int  bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
+
+void bheap_delete(bheap_prio_t higher_prio,
+		 struct bheap* heap,
+		 struct bheap_node* node);
+
+/* allocate from memcache */
+struct bheap_node* bheap_node_alloc(int gfp_flags);
+void bheap_node_free(struct bheap_node* hn);
+
+/* allocate a heap node for value and insert into the heap */
+int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
+	     void* value, int gfp_flags);
+
+void* bheap_take_del(bheap_prio_t higher_prio,
+		    struct bheap* heap);
+#endif
diff --git a/include/litmus/binheap.h b/include/litmus/binheap.h
new file mode 100644
index 0000000..1cf3647
--- /dev/null
+++ b/include/litmus/binheap.h
@@ -0,0 +1,205 @@
+#ifndef LITMUS_BINARY_HEAP_H
+#define LITMUS_BINARY_HEAP_H
+
+#include <linux/kernel.h>
+
+/**
+ * Simple binary heap with add, arbitrary delete, delete_root, and top
+ * operations.
+ *
+ * Style meant to conform with list.h.
+ *
+ * Motivation: Linux's prio_heap.h is of fixed size. Litmus's binomial
+ * heap may be overkill (and perhaps not general enough) for some applications.
+ *
+ * Note: In order to make node swaps fast, a node inserted with a data pointer
+ * may not always hold said data pointer. This is similar to the binomial heap
+ * implementation. This does make node deletion tricky since we have to
+ * (1) locate the node that holds the data pointer to delete, and (2) the
+ * node that was originally inserted with said data pointer. These have to be
+ * coalesced into a single node before removal (see usage of
+ * __binheap_safe_swap()). We have to track node references to accomplish this.
+ */
+
+struct binheap_node {
+	void	*data;
+	struct binheap_node *parent;
+	struct binheap_node *left;
+	struct binheap_node *right;
+
+	/* pointer to binheap_node that holds *data for which this binheap_node
+	 * was originally inserted.  (*data "owns" this node)
+	 */
+	struct binheap_node *ref;
+	struct binheap_node **ref_ptr;
+};
+
+/**
+ * Signature of comparator function.  Assumed 'less-than' (min-heap).
+ * Pass in 'greater-than' for max-heap.
+ *
+ * TODO: Consider macro-based implementation that allows comparator to be
+ * inlined (similar to Linux red/black tree) for greater efficiency.
+ */
+typedef int (*binheap_order_t)(struct binheap_node *a,
+				struct binheap_node *b);
+
+
+struct binheap {
+	struct binheap_node *root;
+
+	/* pointer to node to take next inserted child */
+	struct binheap_node *next;
+
+	/* pointer to last node in complete binary tree */
+	struct binheap_node *last;
+
+	/* comparator function pointer */
+	binheap_order_t compare;
+};
+
+
+/* Initialized heap nodes not in a heap have parent
+ * set to BINHEAP_POISON.
+ */
+#define BINHEAP_POISON	((void*)(0xdeadbeef))
+
+
+/**
+ * binheap_entry - get the struct for this heap node.
+ *  Only valid when called upon heap nodes other than the root handle.
+ * @ptr:	the heap node.
+ * @type:	the type of struct pointed to by binheap_node::data.
+ * @member:	unused.
+ */
+#define binheap_entry(ptr, type, member) \
+((type *)((ptr)->data))
+
+/**
+ * binheap_node_container - get the struct that contains this node.
+ *  Only valid when called upon heap nodes other than the root handle.
+ * @ptr:	the heap node.
+ * @type:	the type of struct the node is embedded in.
+ * @member:	the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_node_container(ptr, type, member) \
+container_of((ptr), type, member)
+
+/**
+ * binheap_top_entry - get the struct for the node at the top of the heap.
+ *  Only valid when called upon the heap handle node.
+ * @ptr:    the special heap-handle node.
+ * @type:   the type of the struct the head is embedded in.
+ * @member:	the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_top_entry(ptr, type, member) \
+binheap_entry((ptr)->root, type, member)
+
+/**
+ * binheap_delete_root - remove the root element from the heap.
+ * @handle:	 handle to the heap.
+ * @type:    the type of the struct the head is embedded in.
+ * @member:	 the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_delete_root(handle, type, member) \
+__binheap_delete_root((handle), &((type *)((handle)->root->data))->member)
+
+/**
+ * binheap_delete - remove an arbitrary element from the heap.
+ * @to_delete:  pointer to node to be removed.
+ * @handle:	 handle to the heap.
+ */
+#define binheap_delete(to_delete, handle) \
+__binheap_delete((to_delete), (handle))
+
+/**
+ * binheap_add - insert an element to the heap
+ * @new_node: node to add.
+ * @handle:	 handle to the heap.
+ * @type:    the type of the struct the head is embedded in.
+ * @member:	 the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_add(new_node, handle, type, member) \
+__binheap_add((new_node), (handle), container_of((new_node), type, member))
+
+/**
+ * binheap_decrease - re-eval the position of a node (based upon its
+ * original data pointer).
+ * @handle: handle to the heap.
+ * @orig_node: node that was associated with the data pointer
+ *             (whose value has changed) when said pointer was
+ *             added to the heap.
+ */
+#define binheap_decrease(orig_node, handle) \
+__binheap_decrease((orig_node), (handle))
+
+#define BINHEAP_NODE_INIT() { NULL, BINHEAP_POISON, NULL, NULL , NULL, NULL}
+
+#define BINHEAP_NODE(name) \
+	struct binheap_node name = BINHEAP_NODE_INIT()
+
+
+static inline void INIT_BINHEAP_NODE(struct binheap_node *n)
+{
+	n->data = NULL;
+	n->parent = BINHEAP_POISON;
+	n->left = NULL;
+	n->right = NULL;
+	n->ref = NULL;
+	n->ref_ptr = NULL;
+}
+
+static inline void INIT_BINHEAP_HANDLE(struct binheap *handle,
+				binheap_order_t compare)
+{
+	handle->root = NULL;
+	handle->next = NULL;
+	handle->last = NULL;
+	handle->compare = compare;
+}
+
+/* Returns true if binheap is empty. */
+static inline int binheap_empty(struct binheap *handle)
+{
+	return(handle->root == NULL);
+}
+
+/* Returns true if binheap node is in a heap. */
+static inline int binheap_is_in_heap(struct binheap_node *node)
+{
+	return (node->parent != BINHEAP_POISON);
+}
+
+/* Returns true if binheap node is in given heap. */
+int binheap_is_in_this_heap(struct binheap_node *node, struct binheap* heap);
+
+/* Add a node to a heap */
+void __binheap_add(struct binheap_node *new_node,
+				struct binheap *handle,
+				void *data);
+
+/**
+ * Removes the root node from the heap. The node is removed after coalescing
+ * the binheap_node with its original data pointer at the root of the tree.
+ *
+ * The 'last' node in the tree is then swapped up to the root and bubbled
+ * down.
+ */
+void __binheap_delete_root(struct binheap *handle,
+				struct binheap_node *container);
+
+/**
+ * Delete an arbitrary node.  Bubble node to delete up to the root,
+ * and then delete the root.
+ */
+void __binheap_delete(struct binheap_node *node_to_delete,
+				struct binheap *handle);
+
+/**
+ * Bubble up a node whose pointer has decreased in value.
+ */
+void __binheap_decrease(struct binheap_node *orig_node,
+						struct binheap *handle);
+
+
+#endif
diff --git a/include/litmus/budget.h b/include/litmus/budget.h
new file mode 100644
index 0000000..bd2d5c9
--- /dev/null
+++ b/include/litmus/budget.h
@@ -0,0 +1,36 @@
+#ifndef _LITMUS_BUDGET_H_
+#define _LITMUS_BUDGET_H_
+
+/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
+ * the next task. */
+void update_enforcement_timer(struct task_struct* t);
+
+inline static int budget_exhausted(struct task_struct* t)
+{
+	return get_exec_time(t) >= get_exec_cost(t);
+}
+
+inline static lt_t budget_remaining(struct task_struct* t)
+{
+	/* Clamp to zero once the budget is used up so that the
+	 * subtraction below is only done when cost exceeds exec time. */
+	if (budget_exhausted(t))
+		return 0;
+	return get_exec_cost(t) - get_exec_time(t);
+}
+
+#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
+
+#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
+				      == PRECISE_ENFORCEMENT)
+
+static inline int requeue_preempted_job(struct task_struct* t)
+{
+	/* Nothing to requeue for a NULL task or an already completed job. */
+	if (!t || is_completed(t))
+		return 0;
+	/* Otherwise requeue unless an enforced budget has run out. */
+	return !budget_exhausted(t) || !budget_enforced(t);
+}
+
+#endif
diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h
new file mode 100644
index 0000000..0c18dcb
--- /dev/null
+++ b/include/litmus/clustered.h
@@ -0,0 +1,44 @@
+#ifndef CLUSTERED_H
+#define CLUSTERED_H
+
+/* Which cache level should be used to group CPUs into clusters?
+ * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under
+ * global scheduling).
+ */
+enum cache_level {
+	GLOBAL_CLUSTER = 0,
+	L1_CLUSTER     = 1,
+	L2_CLUSTER     = 2,
+	L3_CLUSTER     = 3
+};
+
+int parse_cache_level(const char *str, enum cache_level *level);
+const char* cache_level_name(enum cache_level level);
+
+/* expose a cache level in a /proc dir */
+struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
+					   enum cache_level* level);
+
+
+
+struct scheduling_cluster {
+	unsigned int id;
+	/* list of CPUs that are part of this cluster */
+	struct list_head cpus;
+};
+
+struct cluster_cpu {
+	unsigned int id; /* which CPU is this? */
+	struct list_head cluster_list; /* List of the CPUs in this cluster. */
+	struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. */
+};
+
+int get_cluster_size(enum cache_level level);
+
+int assign_cpus_to_clusters(enum cache_level level,
+			    struct scheduling_cluster* clusters[],
+			    unsigned int num_clusters,
+			    struct cluster_cpu* cpus[],
+			    unsigned int num_cpus);
+
+#endif
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 0000000..bbaf22e
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,25 @@
+/*
+ * EDF common data structures and utility functions shared by all EDF
+ * based scheduler plugins
+ */
+
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_EDF_COMMON_H__
+#define __UNC_EDF_COMMON_H__
+
+#include <litmus/rt_domain.h>
+
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+		     release_jobs_t release);
+
+int edf_higher_prio(struct task_struct* first,
+		    struct task_struct* second);
+
+int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
+
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
+
+#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 0000000..fd9b30d
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,78 @@
+/* fdso.h - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ */
+
+#ifndef _LINUX_FDSO_H_
+#define _LINUX_FDSO_H_
+
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#define MAX_OBJECT_DESCRIPTORS 85
+
+typedef enum  {
+	MIN_OBJ_TYPE 	= 0,
+
+	FMLP_SEM	= 0,
+	SRP_SEM		= 1,
+
+	MPCP_SEM	= 2,
+	MPCP_VS_SEM	= 3,
+	DPCP_SEM	= 4,
+	PCP_SEM         = 5,
+
+	DFLP_SEM	= 6,
+
+	MAX_OBJ_TYPE	= 6
+} obj_type_t;
+
+struct inode_obj_id {
+	struct list_head	list;
+	atomic_t		count;
+	struct inode*		inode;
+
+	obj_type_t 		type;
+	void*			obj;
+	unsigned int		id;
+};
+
+struct fdso_ops;
+
+struct od_table_entry {
+	unsigned int		used;
+
+	struct inode_obj_id*	obj;
+	const struct fdso_ops*	class;
+};
+
+struct fdso_ops {
+	int   (*create)(void** obj_ref, obj_type_t type, void* __user);
+	void  (*destroy)(obj_type_t type, void*);
+	int   (*open)	(struct od_table_entry*, void* __user);
+	int   (*close)	(struct od_table_entry*);
+};
+
+/* translate a userspace supplied od into the raw table entry
+ * returns NULL if od is invalid
+ */
+struct od_table_entry* get_entry_for_od(int od);
+
+/* translate a userspace supplied od into the associated object
+ * returns NULL if od is invalid or names an object of a different type
+ */
+static inline void* od_lookup(int od, obj_type_t type)
+{
+	struct od_table_entry* entry = get_entry_for_od(od);
+	return (!entry || entry->obj->type != type) ? NULL : entry->obj->obj;
+}
+
+#define lookup_fmlp_sem(od)((struct pi_semaphore*)  od_lookup(od, FMLP_SEM))
+#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
+#define lookup_ics(od)     ((struct ics*)           od_lookup(od, ICS_ID))
+
+
+#endif
diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h
new file mode 100644
index 0000000..19356c0
--- /dev/null
+++ b/include/litmus/fp_common.h
@@ -0,0 +1,105 @@
+/* Fixed-priority scheduler support.
+ */
+
+#ifndef __FP_COMMON_H__
+#define __FP_COMMON_H__
+
+#include <litmus/rt_domain.h>
+
+#include <asm/bitops.h>
+
+
+void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+		    release_jobs_t release);
+
+int fp_higher_prio(struct task_struct* first,
+		   struct task_struct* second);
+
+int fp_ready_order(struct bheap_node* a, struct bheap_node* b);
+
+#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG)
+
+#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG)
+#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG
+#endif
+
+/* bitmask-indexed priority queue */
+struct fp_prio_queue {
+	unsigned long	bitmask[FP_PRIO_BIT_WORDS];
+	struct bheap	queue[LITMUS_MAX_PRIORITY];
+};
+
+void fp_prio_queue_init(struct fp_prio_queue* q);
+
+static inline void fpq_set(struct fp_prio_queue* q, unsigned int index)
+{
+	unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
+	__set_bit(index % BITS_PER_LONG, word);
+}
+
+static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index)
+{
+	unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
+	__clear_bit(index % BITS_PER_LONG, word);
+}
+
+static inline unsigned int fpq_find(struct fp_prio_queue* q)
+{
+	int word = 0;
+
+	/* scan the words in order; the optimizer is expected to unroll this */
+	for (; word < FP_PRIO_BIT_WORDS; word++)
+		if (q->bitmask[word] != 0)
+			return __ffs(q->bitmask[word]) + word * BITS_PER_LONG;
+
+	return LITMUS_MAX_PRIORITY; /* no bit set in any word */
+}
+
+static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
+{
+	BUG_ON(index >= LITMUS_MAX_PRIORITY);
+	BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
+
+	fpq_set(q, index);
+	bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
+}
+
+static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
+{
+	BUG_ON(!is_queued(t));
+
+	bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
+	if (likely(bheap_empty(&q->queue[index])))
+		fpq_clear(q, index);
+}
+
+static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q)
+{
+	struct bheap_node* top;
+	unsigned int level = fpq_find(q);
+
+	/* fpq_find() reports LITMUS_MAX_PRIORITY when no bit is set */
+	if (level >= LITMUS_MAX_PRIORITY)
+		return NULL;
+	top = bheap_peek(fp_ready_order, &q->queue[level]);
+	return bheap2task(top);
+}
+
+static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q)
+{
+	struct bheap_node* top;
+	unsigned int level = fpq_find(q);
+
+	if (level >= LITMUS_MAX_PRIORITY)
+		return NULL;
+	top = bheap_take(fp_ready_order, &q->queue[level]);
+	/* keep the bitmask in sync: clear the bit once this level is empty */
+	if (likely(bheap_empty(&q->queue[level])))
+		fpq_clear(q, level);
+	return bheap2task(top);
+}
+
+int fp_preemption_needed(struct fp_prio_queue*  q, struct task_struct *t);
+
+
+#endif
diff --git a/include/litmus/fpmath.h b/include/litmus/fpmath.h
new file mode 100644
index 0000000..642de98
--- /dev/null
+++ b/include/litmus/fpmath.h
@@ -0,0 +1,147 @@
+#ifndef __FP_MATH_H__
+#define __FP_MATH_H__
+
+#include <linux/math64.h>
+
+#ifndef __KERNEL__
+#include <stdint.h>
+#define abs(x) (((x) < 0) ? -(x) : (x)) /* fallback for non-kernel builds */
+#endif
+
+// Use 64-bit because we want to track things at the nanosecond scale.
+// This can lead to very large numbers.
+typedef int64_t fpbuf_t;
+typedef struct
+{
+	fpbuf_t val;
+} fp_t;
+
+#define FP_SHIFT 10
+#define ROUND_BIT (FP_SHIFT - 1)
+
+#define _fp(x) ((fp_t) {x})
+
+#ifdef __KERNEL__
+static const fp_t LITMUS_FP_ZERO = {.val = 0};
+static const fp_t LITMUS_FP_ONE = {.val = (1 << FP_SHIFT)};
+#endif
+
+static inline fp_t FP(fpbuf_t x)
+{
+	return _fp(((fpbuf_t) x) << FP_SHIFT);
+}
+
+/* divide two integers to obtain a fixed point value  */
+static inline fp_t _frac(fpbuf_t a, fpbuf_t b)
+{
+	return _fp(div64_s64(FP(a).val, (b)));
+}
+
+static inline fpbuf_t _point(fp_t x)
+{
+	return (x.val % (1 << FP_SHIFT));
+
+}
+
+#define fp2str(x) ((long long) (x).val)
+/* fp2str() yields the raw value as long long; print it with _FP_ ("<raw>/1024", i.e., raw / 2^FP_SHIFT) */
+#define _FP_  "%lld/1024"
+
+static inline fpbuf_t _floor(fp_t x)
+{
+	return x.val >> FP_SHIFT;
+}
+
+/* FIXME: negative rounding */
+static inline fpbuf_t _round(fp_t x)
+{
+	return _floor(x) + ((x.val >> ROUND_BIT) & 1);
+}
+
+/* multiply two fixed point values */
+static inline fp_t _mul(fp_t a, fp_t b)
+{
+	return _fp((a.val * b.val) >> FP_SHIFT);
+}
+
+static inline fp_t _div(fp_t a, fp_t b)
+{
+#if !defined(__KERNEL__) && !defined(unlikely)
+#define unlikely(x) (x)
+#define DO_UNDEF_UNLIKELY
+#endif
+	/* a.val << FP_SHIFT overflows once a.val >= 2^(63 - FP_SHIFT): divide first then, dropping the fraction */
+	if (unlikely(a.val >= ((fpbuf_t) 1 << ((sizeof(fpbuf_t) * 8) - FP_SHIFT - 1))))
+		return _fp((a.val / b.val) << FP_SHIFT);
+	else
+		return _fp((a.val << FP_SHIFT) / b.val);
+#ifdef DO_UNDEF_UNLIKELY
+#undef unlikely
+#undef DO_UNDEF_UNLIKELY
+#endif
+}
+
+static inline fp_t _add(fp_t a, fp_t b)
+{
+	return _fp(a.val + b.val);
+}
+
+static inline fp_t _sub(fp_t a, fp_t b)
+{
+	return _fp(a.val - b.val);
+}
+
+static inline fp_t _neg(fp_t x)
+{
+	return _fp(-x.val);
+}
+
+static inline fp_t _abs(fp_t x)
+{
+	return _fp(abs(x.val));
+}
+
+/* works the same as casting float/double to integer */
+static inline fpbuf_t _fp_to_integer(fp_t x)
+{
+	return _floor(_abs(x)) * ((x.val > 0) ? 1 : -1);
+}
+
+static inline fp_t _integer_to_fp(fpbuf_t x)
+{
+	return _frac(x,1);
+}
+
+static inline int _leq(fp_t a, fp_t b)
+{
+	return a.val <= b.val;
+}
+
+static inline int _geq(fp_t a, fp_t b)
+{
+	return a.val >= b.val;
+}
+
+static inline int _lt(fp_t a, fp_t b)
+{
+	return a.val < b.val;
+}
+
+static inline int _gt(fp_t a, fp_t b)
+{
+	return a.val > b.val;
+}
+
+static inline int _eq(fp_t a, fp_t b)
+{
+	return a.val == b.val;
+}
+
+static inline fp_t _max(fp_t a, fp_t b)
+{
+	if (a.val < b.val)
+		return b;
+	else
+		return a;
+}
+#endif
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 0000000..24771df
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,10 @@
+#ifndef __LITMUS_JOBS_H__
+#define __LITMUS_JOBS_H__
+
+void prepare_for_next_period(struct task_struct *t);
+void release_at(struct task_struct *t, lt_t start);
+
+long default_wait_for_release_at(lt_t release_time);
+long complete_job(void);
+
+#endif
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
index c87863c..0519831 100644
--- a/include/litmus/litmus.h
+++ b/include/litmus/litmus.h
@@ -6,7 +6,49 @@
 #ifndef _LINUX_LITMUS_H_
 #define _LINUX_LITMUS_H_
 
+#include <litmus/debug_trace.h>
+
+#ifdef CONFIG_RELEASE_MASTER
+extern atomic_t release_master_cpu;
+#endif
+
+/* in_list - is a given list_head queued on some list?
+ */
+static inline int in_list(struct list_head* list)
+{
+	return !(  /* case 1: deleted */
+		   (list->next == LIST_POISON1 &&
+		    list->prev == LIST_POISON2)
+		 ||
+		   /* case 2: initialized */
+		   (list->next == list &&
+		    list->prev == list)
+		);
+}
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
+
+#define NO_CPU			0xffffffff
+
+void litmus_fork(struct task_struct *tsk);
+void litmus_exec(void);
+/* clean up real-time state of a task */
+void litmus_clear_state(struct task_struct *dead_tsk);
+void exit_litmus(struct task_struct *dead_tsk);
+
+/* Prevent the plugin from being switched-out from underneath a code
+ * path. Might sleep, so may be called only from non-atomic context. */
+void litmus_plugin_switch_disable(void);
+void litmus_plugin_switch_enable(void);
+
+long litmus_admit_task(struct task_struct *tsk);
+void litmus_exit_task(struct task_struct *tsk);
+void litmus_dealloc(struct task_struct *tsk);
+void litmus_do_exit(struct task_struct *tsk);
+
 #define is_realtime(t) 		((t)->policy == SCHED_LITMUS)
+#define rt_transition_pending(t) \
+	((t)->rt_param.transition_pending)
 
 #define tsk_rt(t)		(&(t)->rt_param)
 
@@ -28,6 +70,7 @@
 #define get_partition(t) 	(tsk_rt(t)->task_params.cpu)
 #define get_priority(t) 	(tsk_rt(t)->task_params.priority)
 #define get_class(t)        (tsk_rt(t)->task_params.cls)
+#define get_release_policy(t) (tsk_rt(t)->task_params.release_policy)
 
 /* job_param macros */
 #define get_exec_time(t)    (tsk_rt(t)->job_params.exec_time)
@@ -35,6 +78,15 @@
 #define get_release(t)		(tsk_rt(t)->job_params.release)
 #define get_lateness(t)		(tsk_rt(t)->job_params.lateness)
 
+/* release policy macros */
+#define is_periodic(t)		(get_release_policy(t) == TASK_PERIODIC)
+#define is_sporadic(t)		(get_release_policy(t) == TASK_SPORADIC)
+#ifdef CONFIG_ALLOW_EARLY_RELEASE
+#define is_early_releasing(t)	(get_release_policy(t) == TASK_EARLY)
+#else
+#define is_early_releasing(t)	(0)
+#endif
+
 #define is_hrt(t)     		\
 	(tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
 #define is_srt(t)     		\
@@ -48,6 +100,196 @@ static inline lt_t litmus_clock(void)
 	return ktime_to_ns(ktime_get());
 }
 
+/* A macro to convert from nanoseconds to ktime_t. */
+#define ns_to_ktime(t)		ktime_add_ns(ktime_set(0, 0), t)
+
+#define get_domain(t) (tsk_rt(t)->domain)
+
+/* Honor the flag in the preempt_count variable that is set
+ * when scheduling is in progress.
+ */
+#define is_running(t) 			\
+	((t)->state == TASK_RUNNING || 	\
+	 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
+
+#define is_blocked(t)       \
+	(!is_running(t))
+#define is_released(t, now)	\
+	(lt_before_eq(get_release(t), now))
+#define is_tardy(t, now)    \
+	(lt_before_eq(tsk_rt(t)->job_params.deadline, now))
+
+/* real-time comparison macros */
+#define earlier_deadline(a, b) (lt_before(\
+	(a)->rt_param.job_params.deadline,\
+	(b)->rt_param.job_params.deadline))
+#define earlier_release(a, b)  (lt_before(\
+	(a)->rt_param.job_params.release,\
+	(b)->rt_param.job_params.release))
+
+void preempt_if_preemptable(struct task_struct* t, int on_cpu);
+
+#ifdef CONFIG_LITMUS_LOCKING
+void srp_ceiling_block(void);
+#else
+#define srp_ceiling_block() /* nothing */
+#endif
+
+#define bheap2task(hn) ((struct task_struct*) (hn)->value) /* heap node's value, cast to its task */
+
+#ifdef CONFIG_NP_SECTION
+
+static inline int is_kernel_np(struct task_struct *t)
+{
+	return tsk_rt(t)->kernel_np;
+}
+
+static inline int is_user_np(struct task_struct *t)
+{
+	return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0;
+}
+
+static inline void request_exit_np(struct task_struct *t)
+{
+	if (is_user_np(t)) {
+		/* Set the flag that tells user space to call
+		 * into the kernel at the end of a critical section. */
+		if (likely(tsk_rt(t)->ctrl_page)) {
+			TRACE_TASK(t, "setting delayed_preemption flag\n");
+			tsk_rt(t)->ctrl_page->sched.np.preempt = 1;
+		}
+	}
+}
+
+static inline void make_np(struct task_struct *t)
+{
+	tsk_rt(t)->kernel_np++;
+}
+
+/* Caller should check if preemption is necessary when
+ * the function returns 0.
+ */
+static inline int take_np(struct task_struct *t)
+{
+	return --tsk_rt(t)->kernel_np;
+}
+
+/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */
+static inline int request_exit_np_atomic(struct task_struct *t)
+{
+	union np_flag old, new;
+
+	if (tsk_rt(t)->ctrl_page) {
+		old.raw = tsk_rt(t)->ctrl_page->sched.raw;
+		if (old.np.flag == 0) {
+			/* no longer non-preemptive */
+			return 0;
+		} else if (old.np.preempt) {
+			/* already set, nothing for us to do */
+			return 1;
+		} else {
+			/* non preemptive and flag not set */
+			new.raw = old.raw;
+			new.np.preempt = 1;
+			/* if we get old back, then we atomically set the flag */
+			return cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw;
+			/* If we raced with a concurrent change, then so be
+			 * it. Deliver it by IPI.  We don't want an unbounded
+			 * retry loop here since tasks might exploit that to
+			 * keep the kernel busy indefinitely. */
+		}
+	} else
+		return 0;
+}
+
+#else
+
+static inline int is_kernel_np(struct task_struct* t)
+{
+	return 0;
+}
+
+static inline int is_user_np(struct task_struct* t)
+{
+	return 0;
+}
+
+static inline void request_exit_np(struct task_struct *t)
+{
+	/* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
+	BUG();
+}
+
+static inline int request_exit_np_atomic(struct task_struct *t)
+{
+	return 0;
+}
+
+#endif
+
+static inline void clear_exit_np(struct task_struct *t)
+{
+	if (likely(tsk_rt(t)->ctrl_page))
+		tsk_rt(t)->ctrl_page->sched.np.preempt = 0;
+}
+
+static inline int is_np(struct task_struct *t)
+{
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+	int kernel, user;
+	kernel = is_kernel_np(t);
+	user   = is_user_np(t);
+	if (kernel || user)
+		TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
+
+			   kernel, user);
+	return kernel || user;
+#else
+	return unlikely(is_kernel_np(t) || is_user_np(t));
+#endif
+}
+
+static inline int is_present(struct task_struct* t)
+{
+	return t && tsk_rt(t)->present;
+}
+
+static inline int is_completed(struct task_struct* t)
+{
+	return t && tsk_rt(t)->completed;
+}
+
+
+/* Used to convert ns-specified execution costs and periods into
+ * integral quanta equivalents.
+ */
+#define LITMUS_QUANTUM_LENGTH_NS (CONFIG_LITMUS_QUANTUM_LENGTH_US * 1000ULL)
+
+/* make the unit explicit */
+typedef unsigned long quanta_t;
+
+enum round {
+	FLOOR,
+	CEIL
+};
+
+static inline quanta_t time2quanta(lt_t time, enum round round)
+{
+	s64  quantum_length = LITMUS_QUANTUM_LENGTH_NS;
+
+	if (do_div(time, quantum_length) && round == CEIL)
+		time++;
+	return (quanta_t) time;
+}
+
+static inline lt_t quanta2time(quanta_t quanta)
+{
+	return quanta * LITMUS_QUANTUM_LENGTH_NS;
+}
+
+/* By how much is cpu staggered behind CPU 0? */
+u64 cpu_stagger_offset(int cpu);
+
 static inline struct control_page* get_control_page(struct task_struct *t)
 {
 	return tsk_rt(t)->ctrl_page;
@@ -58,4 +300,30 @@ static inline int has_control_page(struct task_struct* t)
 	return tsk_rt(t)->ctrl_page != NULL;
 }
 
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+#define TS_SYSCALL_IN_START						\
+	if (has_control_page(current)) {				\
+		__TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); \
+	}
+
+#define TS_SYSCALL_IN_END						\
+	if (has_control_page(current)) {				\
+		unsigned long flags;					\
+		uint64_t irqs;						\
+		local_irq_save(flags);					\
+		irqs = get_control_page(current)->irq_count -		\
+			get_control_page(current)->irq_syscall_start;	\
+		__TS_SYSCALL_IN_END(&irqs);				\
+		local_irq_restore(flags);				\
+	}
+
+#else
+
+#define TS_SYSCALL_IN_START
+#define TS_SYSCALL_IN_END
+
+#endif
+
 #endif
diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h
new file mode 100644
index 0000000..a5db24c
--- /dev/null
+++ b/include/litmus/litmus_proc.h
@@ -0,0 +1,63 @@
+#include <litmus/sched_plugin.h>
+#include <linux/proc_fs.h>
+
+int __init init_litmus_proc(void);
+void exit_litmus_proc(void);
+
+struct cd_mapping
+{
+	int id;
+	cpumask_var_t mask;
+	struct proc_dir_entry *proc_file;
+};
+
+struct domain_proc_info
+{
+	int num_cpus;
+	int num_domains;
+
+	struct cd_mapping *cpu_to_domains;
+	struct cd_mapping *domain_to_cpus;
+};
+
+/*
+ * On success, returns 0 and sets the pointer to the location of the new
+ * proc dir entry, otherwise returns an error code and sets pde to NULL.
+ */
+long make_plugin_proc_dir(struct sched_plugin* plugin,
+		struct proc_dir_entry** pde);
+
+/*
+ * Plugins should deallocate all child proc directory entries before
+ * calling this, to avoid memory leaks.
+ */
+void remove_plugin_proc_dir(struct sched_plugin* plugin);
+
+/*
+ * Setup the CPU <-> sched domain mappings in proc
+ */
+long activate_domain_proc(struct domain_proc_info* map);
+
+/*
+ * Remove the CPU <-> sched domain mappings from proc
+ */
+long deactivate_domain_proc(void);
+
+/*
+ * Alloc memory for the mapping
+ * Note: Does not set up proc files. Use make_sched_domain_maps for that.
+ */
+long init_domain_proc_info(struct domain_proc_info* map,
+	int num_cpus, int num_domains);
+
+/*
+ * Free memory of the mapping
+ * Note: Does not clean up proc files. Use deactivate_domain_proc for that.
+ */
+void destroy_domain_proc_info(struct domain_proc_info* map);
+
+/* Copy at most ksize-1 bytes from ubuf into kbuf, null-terminate kbuf, and
+ * remove a '\n' if present. Returns the number of bytes that were read or
+ * -EFAULT. */
+int copy_and_chomp(char *kbuf, unsigned long ksize,
+		   __user const char* ubuf, unsigned long ulength);
diff --git a/include/litmus/locking.h b/include/litmus/locking.h
new file mode 100644
index 0000000..4d7b870
--- /dev/null
+++ b/include/litmus/locking.h
@@ -0,0 +1,28 @@
+#ifndef LITMUS_LOCKING_H
+#define LITMUS_LOCKING_H
+
+struct litmus_lock_ops;
+
+/* Generic base struct for LITMUS^RT userspace semaphores.
+ * This structure should be embedded in protocol-specific semaphores.
+ */
+struct litmus_lock {
+	struct litmus_lock_ops *ops;
+	int type;
+};
+
+struct litmus_lock_ops {
+	/* Current task tries to obtain / drop a reference to a lock.
+	 * Optional methods, allowed by default. */
+	int (*open)(struct litmus_lock*, void* __user);
+	int (*close)(struct litmus_lock*);
+
+	/* Current tries to lock/unlock this lock (mandatory methods). */
+	int (*lock)(struct litmus_lock*);
+	int (*unlock)(struct litmus_lock*);
+
+	/* The lock is no longer being referenced (mandatory method). */
+	void (*deallocate)(struct litmus_lock*);
+};
+
+#endif
diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h
new file mode 100644
index 0000000..4fd108a
--- /dev/null
+++ b/include/litmus/preempt.h
@@ -0,0 +1,164 @@
+#ifndef LITMUS_PREEMPT_H
+#define LITMUS_PREEMPT_H
+
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <asm/atomic.h>
+
+#include <litmus/debug_trace.h>
+
+DECLARE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
+
+#ifdef CONFIG_PREEMPT_STATE_TRACE
+const char* sched_state_name(int s);
+#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args)
+#else
+#define TRACE_STATE(fmt, args...) /* ignore */
+#endif
+
+#define VERIFY_SCHED_STATE(x) /* trace a complaint if the local state is not in mask x */ \
+	do { int __s = get_sched_state();				\
+		if ((__s & (x)) == 0)					\
+			TRACE_STATE("INVALID s=0x%x (%s) not "		\
+				    "in 0x%x (%s) [%s]\n",		\
+				    __s, sched_state_name(__s),		\
+				    (x), #x, __FUNCTION__);		\
+	} while (0)
+
+#define TRACE_SCHED_STATE_CHANGE(x, y, cpu)				\
+	TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n",			\
+		    cpu,  (x), sched_state_name(x),			\
+		    (y), sched_state_name(y))
+
+
+typedef enum scheduling_state {
+	TASK_SCHEDULED    = (1 << 0),  /* The currently scheduled task is the one that
+					* should be scheduled, and the processor does not
+					* plan to invoke schedule(). */
+	SHOULD_SCHEDULE   = (1 << 1),  /* A remote processor has determined that the
+					* processor should reschedule, but this has not
+					* been communicated yet (IPI still pending). */
+	WILL_SCHEDULE     = (1 << 2),  /* The processor has noticed that it has to
+					* reschedule and will do so shortly. */
+	TASK_PICKED       = (1 << 3),  /* The processor is currently executing schedule(),
+					* has selected a new task to schedule, but has not
+					* yet performed the actual context switch. */
+	PICKED_WRONG_TASK = (1 << 4),  /* The processor has not yet performed the context
+					* switch, but a remote processor has already
+					* determined that a higher-priority task became
+					* eligible after the task was picked. */
+} sched_state_t;
+
+static inline sched_state_t get_sched_state_on(int cpu)
+{
+	return atomic_read(&per_cpu(resched_state, cpu));
+}
+
+static inline sched_state_t get_sched_state(void)
+{
+	return atomic_read(&__get_cpu_var(resched_state));
+}
+
+static inline int is_in_sched_state(int possible_states)
+{
+	return get_sched_state() & possible_states;
+}
+
+static inline int cpu_is_in_sched_state(int cpu, int possible_states)
+{
+	return get_sched_state_on(cpu) & possible_states;
+}
+
+static inline void set_sched_state(sched_state_t s)
+{
+	TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id());
+	atomic_set(&__get_cpu_var(resched_state), s);
+}
+
+static inline int sched_state_transition(sched_state_t from, sched_state_t to)
+{
+	sched_state_t old_state;
+
+	old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to);
+	if (old_state == from) {
+		TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id());
+		return 1;
+	} else
+		return 0;
+}
+
+static inline int sched_state_transition_on(int cpu,
+					    sched_state_t from,
+					    sched_state_t to)
+{
+	sched_state_t old_state;
+
+	old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to);
+	if (old_state == from) {
+		TRACE_SCHED_STATE_CHANGE(from, to, cpu);
+		return 1;
+	} else
+		return 0;
+}
+
+/* Plugins must call this function after they have decided which job to
+ * schedule next.  IMPORTANT: this function must be called while still holding
+ * the lock that is used to serialize scheduling decisions.
+ *
+ * (Ideally, we would like to use runqueue locks for this purpose, but that
+ * would lead to deadlocks with the migration code.)
+ */
+static inline void sched_state_task_picked(void)
+{
+	VERIFY_SCHED_STATE(WILL_SCHEDULE);
+
+	/* WILL_SCHEDULE has only a local transition => simple store is ok */
+	set_sched_state(TASK_PICKED);
+}
+
+static inline void sched_state_entered_schedule(void)
+{
+	/* Update state for the case that we entered schedule() not due to
+	 * set_tsk_need_resched() */
+	set_sched_state(WILL_SCHEDULE);
+}
+
+/* Called by schedule() to check if the scheduling decision is still valid
+ * after a context switch. Returns 1 if the CPU needs to reschedule. */
+static inline int sched_state_validate_switch(void)
+{
+	int left_state_ok = 0;
+
+	VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED);
+
+	if (is_in_sched_state(TASK_PICKED)) {
+		/* Might be good; let's try to transition out of this
+		 * state. This must be done atomically since remote processors
+		 * may try to change the state, too. */
+		left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED);
+	}
+
+	if (!left_state_ok) {
+		/* We raced with a higher-priority task arrival => not
+		 * valid. The CPU needs to reschedule. */
+		set_sched_state(WILL_SCHEDULE);
+		return 1;
+	} else
+		return 0;
+}
+
+/* State transition events. See litmus/preempt.c for details. */
+void sched_state_will_schedule(struct task_struct* tsk);
+void sched_state_ipi(void);
+/* Cause a CPU (remote or local) to reschedule. */
+void litmus_reschedule(int cpu);
+void litmus_reschedule_local(void);
+
+#ifdef CONFIG_DEBUG_KERNEL
+void sched_state_plugin_check(void);
+#else
+#define sched_state_plugin_check() /* no check */
+#endif
+
+#endif
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 0000000..ac24929
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,182 @@
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_RT_DOMAIN_H__
+#define __UNC_RT_DOMAIN_H__
+
+#include <litmus/bheap.h>
+
+#define RELEASE_QUEUE_SLOTS 127 /* prime */
+
+struct _rt_domain;
+
+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
+typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
+
+struct release_queue {
+	/* each slot maintains a list of release heaps sorted
+	 * by release time */
+	struct list_head		slot[RELEASE_QUEUE_SLOTS];
+};
+
+typedef struct _rt_domain {
+	/* runnable rt tasks are in here */
+	raw_spinlock_t 			ready_lock;
+	struct bheap	 		ready_queue;
+
+	/* real-time tasks waiting for release are in here */
+	raw_spinlock_t 			release_lock;
+	struct release_queue 		release_queue;
+
+#ifdef CONFIG_RELEASE_MASTER
+	int				release_master;
+#endif
+
+	/* for moving tasks to the release queue */
+	raw_spinlock_t			tobe_lock;
+	struct list_head		tobe_released;
+
+	/* how do we check if we need to kick another CPU? */
+	check_resched_needed_t		check_resched;
+
+	/* how do we release jobs? */
+	release_jobs_t			release_jobs;
+
+	/* how are tasks ordered in the ready queue? */
+	bheap_prio_t			order;
+} rt_domain_t;
+
+struct release_heap {
+	/* list_head for per-time-slot list */
+	struct list_head		list;
+	lt_t				release_time;
+	/* all tasks to be released at release_time */
+	struct bheap			heap;
+	/* used to trigger the release */
+	struct hrtimer			timer;
+
+#ifdef CONFIG_RELEASE_MASTER
+	/* used to delegate releases */
+	struct hrtimer_start_on_info	info;
+#endif
+	/* required for the timer callback */
+	rt_domain_t*			dom;
+};
+
+
+static inline struct task_struct* __next_ready(rt_domain_t* rt)
+{
+	struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
+	if (hn)
+		return bheap2task(hn);
+	else
+		return NULL;
+}
+
+void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
+		    check_resched_needed_t check,
+		    release_jobs_t relase);
+
+void __add_ready(rt_domain_t* rt, struct task_struct *new);
+void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
+void __add_release(rt_domain_t* rt, struct task_struct *task);
+
+static inline struct task_struct* __take_ready(rt_domain_t* rt)
+{
+	struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
+	if (hn)
+		return bheap2task(hn);
+	else
+		return NULL;
+}
+
+static inline struct task_struct* __peek_ready(rt_domain_t* rt)
+{
+	struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
+	if (hn)
+		return bheap2task(hn);
+	else
+		return NULL;
+}
+
+static inline int  is_queued(struct task_struct *t)
+{
+	BUG_ON(!tsk_rt(t)->heap_node);
+	return bheap_node_in_heap(tsk_rt(t)->heap_node);
+}
+
+static inline void remove(rt_domain_t* rt, struct task_struct *t)
+{
+	bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
+}
+
+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+	unsigned long flags;
+	/* first we need the write lock for rt_ready_queue */
+	raw_spin_lock_irqsave(&rt->ready_lock, flags);
+	__add_ready(rt, new);
+	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+}
+
+static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
+{
+	unsigned long flags;
+	raw_spin_lock_irqsave(&rt->ready_lock, flags);
+	__merge_ready(rt, tasks);
+	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+}
+
+static inline struct task_struct* take_ready(rt_domain_t* rt)
+{
+	unsigned long flags;
+	struct task_struct* ret;
+	/* first we need the write lock for rt_ready_queue */
+	raw_spin_lock_irqsave(&rt->ready_lock, flags);
+	ret = __take_ready(rt);
+	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+	return ret;
+}
+
+
+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
+{
+	unsigned long flags;
+	raw_spin_lock_irqsave(&rt->tobe_lock, flags);
+	__add_release(rt, task);
+	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
+}
+
+#ifdef CONFIG_RELEASE_MASTER
+void __add_release_on(rt_domain_t* rt, struct task_struct *task,
+		      int target_cpu);
+
+static inline void add_release_on(rt_domain_t* rt,
+				  struct task_struct *task,
+				  int target_cpu)
+{
+	unsigned long flags;
+	raw_spin_lock_irqsave(&rt->tobe_lock, flags);
+	__add_release_on(rt, task, target_cpu);
+	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
+}
+#endif
+
+static inline int __jobs_pending(rt_domain_t* rt)
+{
+	return !bheap_empty(&rt->ready_queue);
+}
+
+static inline int jobs_pending(rt_domain_t* rt)
+{
+	unsigned long flags;
+	int ret;
+	/* locked variant of __jobs_pending(): sample the ready queue under ready_lock */
+	raw_spin_lock_irqsave(&rt->ready_lock, flags);
+	ret = __jobs_pending(rt);
+	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+	return ret;
+}
+
+#endif
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index ce76faa..e26535b 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -84,12 +84,12 @@ struct rt_task {
 };
 
 union np_flag {
-	uint64_t raw;
+	uint32_t raw;
 	struct {
 		/* Is the task currently in a non-preemptive section? */
-		uint64_t flag:31;
+		uint32_t flag:31;
 		/* Should the task call into the scheduler? */
-		uint64_t preempt:1;
+		uint32_t preempt:1;
 	} np;
 };
 
@@ -110,10 +110,10 @@ union np_flag {
 struct control_page {
 	/* This flag is used by userspace to communicate non-preempive
 	 * sections. */
-	volatile union np_flag sched;
+	volatile __attribute__ ((aligned (8))) union np_flag sched;
 
-	volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is
-				      * handled. */
+	/* Incremented by the kernel each time an IRQ is handled. */
+	volatile __attribute__ ((aligned (8))) uint64_t irq_count;
 
 	/* Locking overhead tracing: userspace records here the time stamp
 	 * and IRQ counter prior to starting the system call. */
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 0000000..0ccccd6
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,128 @@
+/*
+ * Definition of the scheduler plugin interface.
+ *
+ */
+#ifndef _LINUX_SCHED_PLUGIN_H_
+#define _LINUX_SCHED_PLUGIN_H_
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_LITMUS_LOCKING
+#include <litmus/locking.h>
+#endif
+
+/************************ setup/tear down ********************/
+
+typedef long (*activate_plugin_t) (void);
+typedef long (*deactivate_plugin_t) (void);
+
+struct domain_proc_info;
+typedef long (*get_domain_proc_info_t) (struct domain_proc_info **info);
+
+
+/********************* scheduler invocation ******************/
+/* The main scheduling function, called to select the next task to dispatch. */
+typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
+/* Clean up after the task switch has occurred.
+ * This function is called after every (even non-rt) task switch.
+ */
+typedef void (*finish_switch_t)(struct task_struct *prev);
+
+
+/********************* task state changes ********************/
+
+/* Called to setup a new real-time task.
+ * Release the first job, enqueue, etc.
+ * Task may already be running.
+ */
+typedef void (*task_new_t) (struct task_struct *task,
+			    int on_rq,
+			    int running);
+
+/* Called to re-introduce a task after blocking.
+ * Can potentially be called multiple times.
+ */
+typedef void (*task_wake_up_t) (struct task_struct *task);
+/* called to notify the plugin of a blocking real-time task
+ * it will only be called for real-time tasks and before schedule is called */
+typedef void (*task_block_t)  (struct task_struct *task);
+/* Called when a real-time task exits or changes to a different scheduling
+ * class.
+ * Free any allocated resources
+ */
+typedef void (*task_exit_t)    (struct task_struct *);
+
+/* task_exit() is called with interrupts disabled and runqueue locks held, and
+ * thus cannot block or spin.  task_cleanup() is called sometime later
+ * without any locks being held.
+ */
+typedef void (*task_cleanup_t)	(struct task_struct *);
+
+#ifdef CONFIG_LITMUS_LOCKING
+/* Called when the current task attempts to create a new lock of a given
+ * protocol type. */
+typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
+				 void* __user config);
+#endif
+
+
+/********************* sys call backends  ********************/
+/* This function causes the caller to sleep until the next release */
+typedef long (*complete_job_t) (void);
+
+typedef long (*admit_task_t)(struct task_struct* tsk);
+
+typedef long (*wait_for_release_at_t)(lt_t release_time);
+
+/* Informs the plugin when a synchronous release takes place. */
+typedef void (*synchronous_release_at_t)(lt_t time_zero);
+
+/************************ misc routines ***********************/
+
+
+struct sched_plugin {
+	struct list_head	list;
+	/* 	basic info 		*/
+	char 			*plugin_name;
+
+	/*	setup			*/
+	activate_plugin_t	activate_plugin;
+	deactivate_plugin_t	deactivate_plugin;
+	get_domain_proc_info_t	get_domain_proc_info;
+
+	/* 	scheduler invocation 	*/
+	schedule_t 		schedule;
+	finish_switch_t 	finish_switch;
+
+	/*	syscall backend 	*/
+	complete_job_t 		complete_job;
+	wait_for_release_at_t	wait_for_release_at;
+	synchronous_release_at_t synchronous_release_at;
+
+	/*	task state changes 	*/
+	admit_task_t		admit_task;
+
+        task_new_t 		task_new;
+	task_wake_up_t		task_wake_up;
+	task_block_t		task_block;
+
+	task_exit_t 		task_exit;
+	task_cleanup_t		task_cleanup;
+
+#ifdef CONFIG_LITMUS_LOCKING
+	/*	locking protocols	*/
+	allocate_lock_t		allocate_lock;
+#endif
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+
+extern struct sched_plugin *litmus;
+
+int register_sched_plugin(struct sched_plugin* plugin);
+struct sched_plugin* find_sched_plugin(const char* name);
+void print_sched_plugins(struct seq_file *m);
+
+
+extern struct sched_plugin linux_sched_plugin;
+
+#endif
diff --git a/include/litmus/srp.h b/include/litmus/srp.h
new file mode 100644
index 0000000..c9a4552
--- /dev/null
+++ b/include/litmus/srp.h
@@ -0,0 +1,28 @@
+#ifndef LITMUS_SRP_H
+#define LITMUS_SRP_H
+
+struct srp_semaphore;
+
+struct srp_priority {
+	struct list_head	list;
+        unsigned int 		priority;
+	pid_t			pid;
+};
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+
+/* struct for uniprocessor SRP "semaphore" */
+struct srp_semaphore {
+	struct litmus_lock litmus_lock;
+	struct srp_priority ceiling;
+	struct task_struct* owner;
+	int cpu; /* cpu associated with this "semaphore" and resource */
+};
+
+/* map a task to its SRP preemption level priority */
+typedef unsigned int (*srp_prioritization_t)(struct task_struct* t);
+/* Must be updated by each plugin that uses SRP.*/
+extern srp_prioritization_t get_srp_prio;
+
+struct srp_semaphore* allocate_srp_semaphore(void);
+
+#endif
diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
new file mode 100644
index 0000000..94264c2
--- /dev/null
+++ b/include/litmus/unistd_32.h
@@ -0,0 +1,21 @@
+/*
+ * included from arch/x86/include/asm/unistd_32.h
+ *
+ * LITMUS^RT syscalls with "relative" numbers
+ */
+#define __LSC(x) (__NR_LITMUS + x)
+
+#define __NR_set_rt_task_param	__LSC(0)
+#define __NR_get_rt_task_param	__LSC(1)
+#define __NR_complete_job	__LSC(2)
+#define __NR_od_open		__LSC(3)
+#define __NR_od_close		__LSC(4)
+#define __NR_litmus_lock       	__LSC(5)
+#define __NR_litmus_unlock	__LSC(6)
+#define __NR_query_job_no	__LSC(7)
+#define __NR_wait_for_job_release __LSC(8)
+#define __NR_wait_for_ts_release __LSC(9)
+#define __NR_release_ts		__LSC(10)
+#define __NR_null_call		__LSC(11)
+
+#define NR_litmus_syscalls 12
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
new file mode 100644
index 0000000..d5ced0d
--- /dev/null
+++ b/include/litmus/unistd_64.h
@@ -0,0 +1,33 @@
+/*
+ * included from arch/x86/include/asm/unistd_64.h
+ *
+ * LITMUS^RT syscalls with "relative" numbers
+ */
+#define __LSC(x) (__NR_LITMUS + x)
+
+#define __NR_set_rt_task_param			__LSC(0)
+__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param)
+#define __NR_get_rt_task_param			__LSC(1)
+__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param)
+#define __NR_complete_job	  		__LSC(2)
+__SYSCALL(__NR_complete_job, sys_complete_job)
+#define __NR_od_open				__LSC(3)
+__SYSCALL(__NR_od_open, sys_od_open)
+#define __NR_od_close				__LSC(4)
+__SYSCALL(__NR_od_close, sys_od_close)
+#define __NR_litmus_lock	       		__LSC(5)
+__SYSCALL(__NR_litmus_lock, sys_litmus_lock)
+#define __NR_litmus_unlock	       		__LSC(6)
+__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock)
+#define __NR_query_job_no			__LSC(7)
+__SYSCALL(__NR_query_job_no, sys_query_job_no)
+#define __NR_wait_for_job_release		__LSC(8)
+__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release)
+#define __NR_wait_for_ts_release		__LSC(9)
+__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
+#define __NR_release_ts				__LSC(10)
+__SYSCALL(__NR_release_ts, sys_release_ts)
+#define __NR_null_call				__LSC(11)
+__SYSCALL(__NR_null_call, sys_null_call)
+
+#define NR_litmus_syscalls 12
diff --git a/include/litmus/wait.h b/include/litmus/wait.h
new file mode 100644
index 0000000..ce1347c
--- /dev/null
+++ b/include/litmus/wait.h
@@ -0,0 +1,57 @@
+#ifndef _LITMUS_WAIT_H_
+#define _LITMUS_WAIT_H_
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
+
+/* A regular wait_queue_t entry extended with a priority and a tie-breaker. */
+struct __prio_wait_queue {
+	wait_queue_t wq;	/* embedded plain wait-queue entry */
+
+	/* some priority point */
+	lt_t priority;
+	/* break ties in priority by lower tie_breaker */
+	unsigned int tie_breaker;
+};
+
+typedef struct __prio_wait_queue prio_wait_queue_t;
+/* Initialize pwq as a waiter for task t; the tie-breaker defaults to 0. */
+static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq,
+					     struct task_struct* t,
+					     lt_t priority)
+{
+	init_waitqueue_entry(&pwq->wq, t);
+	pwq->priority    = priority;
+	pwq->tie_breaker = 0;
+}
+/* Initialize pwq as a waiter for task t with an explicit tie-breaker. */
+static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq,
+						 struct task_struct* t,
+						 lt_t priority,
+						 unsigned int tie_breaker)
+{
+	init_waitqueue_entry(&pwq->wq, t);
+	pwq->priority    = priority;
+	pwq->tie_breaker = tie_breaker;
+}
+/* Unlocked insert; NOTE(review): presumably needs head->lock held - confirm. */
+unsigned int __add_wait_queue_prio_exclusive(
+	wait_queue_head_t* head,
+	prio_wait_queue_t *new);
+/* Locked variant: runs __add_wait_queue_prio_exclusive() under head->lock. */
+static inline unsigned int add_wait_queue_prio_exclusive(
+	wait_queue_head_t* head,
+	prio_wait_queue_t *new)
+{
+	unsigned long flags;
+	unsigned int passed;
+
+	spin_lock_irqsave(&head->lock, flags);
+	passed = __add_wait_queue_prio_exclusive(head, new);
+
+	spin_unlock_irqrestore(&head->lock, flags);
+
+	return passed;	/* whatever the unlocked helper returned */
+}
+
+
+#endif
diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c
new file mode 100644
index 0000000..ad88a14
--- /dev/null
+++ b/kernel/sched/litmus.c
@@ -0,0 +1,340 @@
+/* This file is included from kernel/sched.c */
+
+#include "sched.h"
+
+#include <litmus/trace.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/litmus.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+/* Charge the time elapsed since p->se.exec_start (per rq->clock) to p. */
+static void update_time_litmus(struct rq *rq, struct task_struct *p)
+{
+	u64 delta = rq->clock - p->se.exec_start;
+	if (unlikely((s64)delta < 0))	/* exec_start is ahead of rq->clock */
+		delta = 0;
+	/* per job counter */
+	p->rt_param.job_params.exec_time += delta;
+	/* task counter */
+	p->se.sum_exec_runtime += delta;
+	/* restart the measurement interval at the current rq clock */
+	p->se.exec_start = rq->clock;
+	cpuacct_charge(p, delta);
+}
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
+/* Let the active plugin pick the next task; on SMP, migrate it to this rq. */
+static struct task_struct *
+litmus_schedule(struct rq *rq, struct task_struct *prev)
+{
+	struct task_struct *next;
+
+#ifdef CONFIG_SMP
+	struct rq* other_rq;
+	long was_running;
+	lt_t _maybe_deadlock = 0;
+#endif
+
+	/* let the plugin schedule */
+	next = litmus->schedule(prev);
+
+	sched_state_plugin_check();
+
+#ifdef CONFIG_SMP
+	/* check if a global plugin pulled a task from a different RQ */
+	if (next && task_rq(next) != rq) {
+		/* we need to migrate the task */
+		other_rq = task_rq(next);
+		TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
+
+		/* while we drop the lock, the prev task could change its
+		 * state
+		 */
+		was_running = is_running(prev);
+		mb();
+		raw_spin_unlock(&rq->lock);
+
+		/* Don't race with a concurrent switch.  This could deadlock in
+		 * the case of cross or circular migrations.  It's the job of
+		 * the plugin to make sure that doesn't happen.
+		 */
+		TRACE_TASK(next, "stack_in_use=%d\n",
+			   next->rt_param.stack_in_use);
+		if (next->rt_param.stack_in_use != NO_CPU) {
+			TRACE_TASK(next, "waiting to deschedule\n");
+			_maybe_deadlock = litmus_clock();
+		}
+		while (next->rt_param.stack_in_use != NO_CPU) {
+			cpu_relax();
+			mb();
+			if (next->rt_param.stack_in_use == NO_CPU)
+				TRACE_TASK(next,"descheduled. Proceeding.\n");
+
+			if (lt_before(_maybe_deadlock + 1000000000L,
+				      litmus_clock())) {
+				/* We've been spinning for 1s.
+				 * Something can't be right!
+				 * Let's abandon the task and bail out; at least
+				 * we will have debug info instead of a hard
+				 * deadlock.
+				 */
+#ifdef CONFIG_BUG_ON_MIGRATION_DEADLOCK
+				BUG();
+#else
+				TRACE_TASK(next,"stack too long in use. "
+					   "Deadlock?\n");
+				next = NULL;
+
+				/* bail out; NOTE(review): skips update_enforcement_timer() */
+				raw_spin_lock(&rq->lock);
+				return next;
+#endif
+			}
+		}
+#ifdef  __ARCH_WANT_UNLOCKED_CTXSW
+		if (next->on_cpu)
+			TRACE_TASK(next, "waiting for !oncpu");
+		while (next->on_cpu) {
+			cpu_relax();
+			mb();
+		}
+#endif
+		double_rq_lock(rq, other_rq);
+		mb();
+		if (is_realtime(prev) && is_running(prev) != was_running) {
+			TRACE_TASK(prev,
+				   "state changed while we dropped"
+				   " the lock: is_running=%d, was_running=%d\n",
+				   is_running(prev), was_running);
+			if (is_running(prev) && !was_running) {
+				/* prev task became unblocked
+				 * we need to simulate normal sequence of events
+				 * to scheduler plugins.
+				 */
+				litmus->task_block(prev);
+				litmus->task_wake_up(prev);
+			}
+		}
+
+		set_task_cpu(next, smp_processor_id());
+
+		/* DEBUG: now that we have the lock we need to make sure a
+		 *  couple of things still hold:
+		 *  - it is still a real-time task
+		 *  - it is still runnable (could have been stopped)
+		 * If either is violated, then the active plugin is
+		 * doing something wrong.
+		 */
+		if (!is_realtime(next) || !is_running(next)) {
+			/* BAD BAD BAD */
+			TRACE_TASK(next,"BAD: migration invariant FAILED: "
+				   "rt=%d running=%d\n",
+				   is_realtime(next),
+				   is_running(next));
+			/* drop the task */
+			next = NULL;
+		}
+		/* release the other CPU's runqueue, but keep ours */
+		raw_spin_unlock(&other_rq->lock);
+	}
+#endif
+
+	if (next) {
+#ifdef CONFIG_SMP
+		next->rt_param.stack_in_use = rq->cpu;
+#else
+		next->rt_param.stack_in_use = 0;
+#endif
+		update_rq_clock(rq);
+		next->se.exec_start = rq->clock;
+	}
+
+	update_enforcement_timer(next);
+	return next;	/* may be NULL */
+}
+/* Wake-up enqueue: mark p present and running, then notify the plugin. */
+static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
+				int flags)
+{
+	if (flags & ENQUEUE_WAKEUP) {
+		sched_trace_task_resume(p);
+		tsk_rt(p)->present = 1;
+		/* LITMUS^RT plugins need to update the state
+		 * _before_ making it available in global structures.
+		 * Linux gets away with being lazy about the task state
+		 * update. We can't do that, hence we update the task
+		 * state already here.
+		 *
+		 * WARNING: this needs to be re-evaluated when porting
+		 *          to newer kernel versions.
+		 */
+		p->state = TASK_RUNNING;
+		litmus->task_wake_up(p);
+
+		rq->litmus.nr_running++;
+	} else	/* anything but a wake-up is only traced */
+		TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
+}
+/* Sleep dequeue: tell the plugin that p blocks and mark p not present. */
+static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
+				int flags)
+{
+	if (flags & DEQUEUE_SLEEP) {
+		litmus->task_block(p);
+		tsk_rt(p)->present = 0;
+		sched_trace_task_block(p);
+
+		rq->litmus.nr_running--;
+	} else	/* anything but going to sleep is only traced */
+		TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
+}
+/* Yield hook: calls clear_exit_np() on current and requests a local resched. */
+static void yield_task_litmus(struct rq *rq)
+{
+	TS_SYSCALL_IN_START;
+	TS_SYSCALL_IN_END;
+
+	BUG_ON(rq->curr != current);
+	/* sched_yield() is called to trigger delayed preemptions.
+	 * Thus, mark the current task as needing to be rescheduled.
+	 * This will cause the scheduler plugin to be invoked, which can
+	 * then determine if a preemption is still required.
+	 */
+	clear_exit_np(current);
+	litmus_reschedule_local();
+
+	TS_SYSCALL_OUT_START;
+}
+
+/* Empty on purpose: plugins are responsible for preemption checks.
+ */
+static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+/* No-op: this class does nothing when the previous task is put back. */
+static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+/* SMP only: refresh the rq clock and charge prev's execution time. */
+#ifdef CONFIG_SMP
+static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
+{
+	update_rq_clock(rq);
+	/* tell update_rq_clock() that we just did that */
+	rq->skip_clock_update = 1;
+	update_time_litmus(rq, prev);
+	if (!is_running(prev))	/* prev is no longer running: not present */
+		tsk_rt(prev)->present = 0;
+}
+#endif
+
+/* pick_next_task_litmus() - wrapper around litmus_schedule()
+ *
+ * Returns litmus_schedule()'s pick; NULL right away if rq->litmus.prev is unset.
+ */
+static struct task_struct *pick_next_task_litmus(struct rq *rq)
+{
+	/* get the to-be-switched-out task (prev) */
+	struct task_struct *prev = rq->litmus.prev;
+	struct task_struct *next;
+
+	/* if not called from schedule() but from somewhere
+	 * else (e.g., migration), return now!
+	 */
+	if(!rq->litmus.prev)
+		return NULL;
+
+	rq->litmus.prev = NULL;	/* consume prev: one plugin call per hand-off */
+
+	TS_PLUGIN_SCHED_START;
+	next = litmus_schedule(rq, prev);
+	TS_PLUGIN_SCHED_END;
+
+	return next;
+}
+/* Tick hook: charge p's run time; resched if an enforced budget ran out. */
+static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
+{
+	if (is_realtime(p) && !queued) {
+		update_time_litmus(rq, p);
+		/* budget check for QUANTUM_ENFORCEMENT tasks */
+		if (budget_enforced(p) && budget_exhausted(p)) {
+			litmus_reschedule_local();
+		}
+	}
+}
+/* No-op: nothing is done here when p switches to this scheduling class. */
+static void switched_to_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+/* No-op: oldprio is ignored and no state is touched. */
+static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
+				int oldprio)
+{
+}
+/* Report the round-robin interval of p: always 0, whatever rq and p are. */
+unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
+{
+	/* 0 is meant as "infinity" here */
+	return 0;
+}
+
+/* Restart exec-time accounting for rq->curr at the current rq clock.
+ * Original note: this is called when a task became a real-time task, either
+ * due to a SCHED_* class transition or due to PI mutex inheritance. Linux PI
+ * mutex inheritance is not handled yet (and probably never will be). Use
+ * LITMUS provided synchronization primitives instead. */
+static void set_curr_task_litmus(struct rq *rq)
+{
+	rq->curr->se.exec_start = rq->clock;
+}
+
+
+#ifdef CONFIG_SMP
+/* execve tries to rebalance the task within its scheduling domain.
+ * We don't care about the scheduling domain; this can get called from
+ * exec, fork, and wakeup. Always answers with the CPU p is already on.
+ */
+static int
+select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
+{
+	/* preemption is already disabled.
+	 * We don't want to change cpu here
+	 */
+	return task_cpu(p);
+}
+#endif
+
+const struct sched_class litmus_sched_class = {
+	/* From 34f971f6 the stop/migrate worker threads have a class of
+	 * their own, which is the highest prio class. We don't support
+	 * cpu-hotplug or cpu throttling. This allows LITMUS to use up to
+	 * 1.0 CPU capacity.
+	 */
+	.next			= &rt_sched_class,
+	.enqueue_task		= enqueue_task_litmus,
+	.dequeue_task		= dequeue_task_litmus,
+	.yield_task		= yield_task_litmus,
+
+	.check_preempt_curr	= check_preempt_curr_litmus,
+
+	.pick_next_task		= pick_next_task_litmus,
+	.put_prev_task		= put_prev_task_litmus,
+
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_litmus,
+
+	.pre_schedule		= pre_schedule_litmus,
+#endif
+
+	.set_curr_task          = set_curr_task_litmus,
+	.task_tick		= task_tick_litmus,
+
+	.get_rr_interval	= get_rr_interval_litmus,
+
+	.prio_changed		= prio_changed_litmus,
+	.switched_to		= switched_to_litmus,
+};
diff --git a/litmus/Kconfig b/litmus/Kconfig
index 5408ef6..fdf31f3 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -1,5 +1,184 @@
 menu "LITMUS^RT"
 
+menu "Scheduling"
+
+config RELEASE_MASTER
+        bool "Release-master Support"
+	depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
+	default n
+	help
+           Allow one processor to act as a dedicated interrupt processor
+           that services all timer interrupts, but that does not schedule
+           real-time tasks. See RTSS'09 paper for details
+	   (http://www.cs.unc.edu/~anderson/papers.html).
+
+config PREFER_LOCAL_LINKING
+       bool "Link newly arrived tasks locally if possible"
+       depends on SMP
+       default y
+       help
+          In linking-based schedulers such as GSN-EDF, if an idle CPU processes
+	  a job arrival (i.e., when a job resumed or was released), it can
+	  either link the task to itself and schedule it immediately (to avoid
+	  unnecessary scheduling latency) or it can try to link it to the CPU
+	  where it executed previously (to maximize cache affinity, at the
+	  expense of increased latency due to the need to send an IPI).
+
+	  In lightly loaded systems, this option can significantly reduce
+	  scheduling latencies. In heavily loaded systems (where CPUs are
+	  rarely idle), it will likely make hardly a difference.
+
+	  If unsure, say yes.
+
+config LITMUS_QUANTUM_LENGTH_US
+    int "quantum length (in us)"
+    default 1000
+    range 500 10000
+    help 
+      Determine the desired quantum length, in microseconds, which 
+      is used to determine the granularity of scheduling in
+      quantum-driven plugins (primarily PFAIR). This parameter does not
+      affect event-driven plugins (such as the EDF-based plugins and P-FP).
+      Default: 1000us = 1ms.
+
+config BUG_ON_MIGRATION_DEADLOCK
+       bool "Panic on suspected migration deadlock"
+       default y
+       help
+          This is a debugging option. The LITMUS^RT migration support code for
+	  global scheduling contains a simple heuristic to detect when the
+	  system deadlocks due to circular stack dependencies.
+
+	  For example, such a deadlock exists if CPU 0 waits for task A's stack
+	  to become available while using task B's stack, and CPU 1 waits for
+	  task B's stack to become available while using task A's stack. Such
+	  a situation can arise in (buggy) global scheduling plugins.
+
+	  With this option enabled, such a scenario will result in a BUG().
+	  You can turn off this option when debugging on real hardware (e.g.,
+	  to rescue traces, etc. that would be hard to get after a panic).
+
+	  Only turn this off if you really know what you are doing. If this
+	  BUG() triggers, the scheduler is broken and turning off this option
+	  won't fix it.
+
+
+endmenu
+
+menu "Real-Time Synchronization"
+
+config NP_SECTION
+        bool "Non-preemptive section support"
+	default y
+	help
+	  Allow tasks to become non-preemptable.
+          Note that plugins still need to explicitly support non-preemptivity.
+          Currently, only the GSN-EDF, PSN-EDF, and P-FP plugins have such support.
+
+	  This is required to support locking protocols such as the FMLP.
+	  If disabled, all tasks will be considered preemptable at all times.
+
+config LITMUS_LOCKING
+        bool "Support for real-time locking protocols"
+	depends on NP_SECTION
+	default y
+	help
+	  Enable LITMUS^RT's multiprocessor real-time locking protocols with
+	  predictable maximum blocking times.
+
+	  Say Yes if you want to include locking protocols such as the FMLP and
+	  Baker's SRP.
+
+endmenu
+
+menu "Performance Enhancements"
+
+config SCHED_CPU_AFFINITY
+	bool "Local Migration Affinity"
+	depends on X86 && SYSFS
+	default y
+	help
+	  Rescheduled tasks prefer CPUs near to their previously used CPU.
+	  This may improve cache performance through possible preservation of
+	  cache affinity, at the expense of (slightly) more involved scheduling
+	  logic.
+
+	  Warning: May make bugs harder to find since tasks may migrate less often.
+
+	  NOTES:
+		* Feature is not utilized by PFair/PD^2.
+
+	  Say Yes if unsure.
+
+config ALLOW_EARLY_RELEASE
+	bool "Allow Early Releasing"
+	default y
+	help
+	  Allow tasks to release jobs early (while still maintaining job
+	  precedence constraints). Only supported by EDF schedulers. Early
+	  releasing must be explicitly requested by real-time tasks via
+	  the task_params passed to sys_set_rt_task_param().
+
+	  Early releasing can improve job response times while maintaining
+	  real-time correctness. However, it can easily peg your CPUs
+	  since tasks never suspend to wait for their next job. As such, early
+	  releasing is really only useful in the context of implementing
+	  bandwidth servers, interrupt handling threads, or short-lived
+	  computations.
+
+	  Beware that early releasing may affect real-time analysis
+	  if using locking protocols or I/O.
+
+	  Say Yes if unsure.
+
+choice
+	prompt "EDF Tie-Break Behavior"
+	default EDF_TIE_BREAK_LATENESS_NORM
+	help
+	  Allows the configuration of tie-breaking behavior when the deadlines
+	  of two EDF-scheduled tasks are equal.
+
+	config EDF_TIE_BREAK_LATENESS
+	bool "Lateness-based Tie Break"
+	help
+	  Break ties between two jobs, A and B, based upon the lateness of their
+	  prior jobs. The job with the greatest lateness has priority. Note that
+	  lateness has a negative value if the prior job finished before its
+	  deadline.
+
+	config EDF_TIE_BREAK_LATENESS_NORM
+	bool "Normalized Lateness-based Tie Break"
+	help
+	  Break ties between two jobs, A and B, based upon the lateness, normalized
+	  by relative deadline, of their prior jobs. The job with the greatest
+	  normalized lateness has priority. Note that lateness has a negative value
+	  if the prior job finished before its deadline.
+
+	  Normalized lateness tie-breaks are likely desirable over non-normalized
+	  tie-breaks if the execution times and/or relative deadlines of tasks in a
+	  task set vary greatly.
+
+	config EDF_TIE_BREAK_HASH
+	bool "Hash-based Tie Breaks"
+	help
+	  Break ties between two jobs, A and B, with equal deadlines by using a
+	  uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
+	  A has ~50% chance of winning a given tie-break.
+
+	config EDF_PID_TIE_BREAK
+	bool "PID-based Tie Breaks"
+	help
+	  Break ties based upon OS-assigned thread IDs. Use this option if
+	  required by algorithm's real-time analysis or per-task response-time
+	  jitter must be minimized.
+
+	  NOTES:
+	    * This tie-breaking method was default in Litmus 2012.2 and before.
+
+endchoice
+
+endmenu
+
 menu "Tracing"
 
 config FEATHER_TRACE
@@ -154,6 +333,20 @@ config SCHED_DEBUG_TRACE_CALLER
 
 	 If unsure, say No.
 
+config PREEMPT_STATE_TRACE
+       bool "Trace preemption state machine transitions"
+       depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
+       default n
+       help
+         With this option enabled, each CPU will log when it transitions
+	 states in the preemption state machine. This state machine is
+	 used to determine how to react to IPIs (avoid races with in-flight IPIs).
+
+	 Warning: this creates a lot of information in the debug trace. Only
+	 recommended when you are debugging preemption-related races.
+
+	 If unsure, say No.
+
 endmenu
 
 endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
index 6318f1c..f7ceabc 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -2,6 +2,26 @@
 # Makefile for LITMUS^RT
 #
 
+obj-y     = sched_plugin.o litmus.o \
+	    preempt.o \
+	    litmus_proc.o \
+	    budget.o \
+	    clustered.o \
+	    jobs.o \
+	    sync.o \
+	    rt_domain.o \
+	    edf_common.o \
+	    fp_common.o \
+	    fdso.o \
+	    locking.o \
+	    srp.o \
+	    bheap.o \
+	    binheap.o \
+	    ctrldev.o \
+	    uncachedev.o
+
+obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
+
 obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
 obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 0000000..a5b437b
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,41 @@
+#include <linux/cpu.h>
+
+#include <litmus/affinity.h>
+
+struct neighborhood neigh_info[NR_CPUS];
+
+/* Fill neigh_info[] for every online CPU; called by _init_litmus() */
+void init_topology(void) {
+	int cpu;
+	int i;
+	int chk;
+	int depth = num_cache_leaves;
+	/* clamp depth to the NUM_CACHE_LEVELS slots used per CPU below */
+	if (depth > NUM_CACHE_LEVELS)
+		depth = NUM_CACHE_LEVELS;
+
+	for_each_online_cpu(cpu) {
+		for (i = 0; i < depth; ++i) {
+			chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
+			if (chk) {
+				/* failed: record zero neighbors at this level */
+				neigh_info[cpu].size[i] = 0;
+			} else {
+				/* size = num bits in mask */
+				neigh_info[cpu].size[i] =
+					cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
+			}
+			printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
+							cpu, neigh_info[cpu].size[i], i,
+							*cpumask_bits(neigh_info[cpu].neighbors[i]));
+		}
+		/* NOTE(review): "%lx" above presumably shows only the mask's first word */
+		/* set data for non-existent levels */
+		for (; i < NUM_CACHE_LEVELS; ++i) {
+			neigh_info[cpu].size[i] = 0;
+
+			printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
+						cpu, neigh_info[cpu].size[i], i, 0lu);
+		}
+	}
+}
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 0000000..2707e01
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,316 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <litmus/bheap.h>
+/* Reset heap to empty: no root list (head) and no cached minimum (min). */
+void bheap_init(struct bheap* heap)
+{
+	heap->head = NULL;
+	heap->min  = NULL;
+}
+/* Initialize node *_h as detached (NOT_IN_HEAP) and carrying value. */
+void bheap_node_init(struct bheap_node** _h, void* value)
+{
+	struct bheap_node* h = *_h;
+	h->parent = NULL;
+	h->next   = NULL;
+	h->child  = NULL;
+	h->degree = NOT_IN_HEAP;
+	h->value  = value;
+	h->ref    = _h;	/* rewritten when values are swapped between nodes */
+}
+
+
+/* make child the first subtree of root and bump root's degree */
+static void __bheap_link(struct bheap_node* root,
+			struct bheap_node* child)
+{
+	child->parent = root;
+	child->next   = root->child;
+	root->child   = child;
+	root->degree++;
+}
+
+/* merge root lists a and b by degree (on equal degree, b's node goes first) */
+static  struct bheap_node* __bheap_merge(struct bheap_node* a,
+					     struct bheap_node* b)
+{
+	struct bheap_node* head = NULL;
+	struct bheap_node** pos = &head;	/* link to fill in next */
+
+	while (a && b) {
+		if (a->degree < b->degree) {
+			*pos = a;
+			a = a->next;
+		} else {
+			*pos = b;
+			b = b->next;
+		}
+		pos = &(*pos)->next;
+	}
+	if (a)	/* append whatever is left of the longer list */
+		*pos = a;
+	else
+		*pos = b;
+	return head;
+}
+
+/* reverse a linked list of nodes in place and return its new head (NULL for an empty list); also clears each parent pointer */
+static  struct bheap_node* __bheap_reverse(struct bheap_node* h)
+{
+	struct bheap_node* tail = NULL;
+	struct bheap_node* next;
+
+	if (!h)
+		return h;
+
+	h->parent = NULL;
+	while (h->next) {
+		next    = h->next;
+		h->next = tail;
+		tail    = h;
+		h       = next;
+		h->parent = NULL;
+	}
+	h->next = tail;
+	return h;
+}
+/* Scan the roots: *node = highest-priority root, *prev = its predecessor. */
+static  void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
+			      struct bheap_node** prev, struct bheap_node** node)
+{
+	struct bheap_node *_prev, *cur;
+	*prev = NULL;	/* stays NULL if the winner is the list head */
+
+	if (!heap->head) {
+		*node = NULL;
+		return;
+	}
+
+	*node = heap->head;
+	_prev = heap->head;
+	cur   = heap->head->next;
+	while (cur) {
+		if (higher_prio(cur, *node)) {
+			*node = cur;
+			*prev = _prev;
+		}
+		_prev = cur;
+		cur   = cur->next;
+	}
+}
+/* Merge root list h2 into heap, linking adjacent trees of equal degree. */
+static  void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
+				struct bheap_node* h2)
+{
+	struct bheap_node* h1;
+	struct bheap_node *prev, *x, *next;
+	if (!h2)
+		return;
+	h1 = heap->head;
+	if (!h1) {
+		heap->head = h2;
+		return;
+	}
+	h1 = __bheap_merge(h1, h2);
+	prev = NULL;
+	x    = h1;
+	next = x->next;
+	while (next) {
+		if (x->degree != next->degree ||
+		    (next->next && next->next->degree == x->degree)) {
+			/* nothing to do, advance */
+			prev = x;
+			x    = next;
+		} else if (higher_prio(x, next)) {
+			/* x becomes the root of next */
+			x->next = next->next;
+			__bheap_link(x, next);
+		} else {
+			/* next becomes the root of x */
+			if (prev)
+				prev->next = next;
+			else
+				h1 = next;
+			__bheap_link(next, x);
+			x = next;
+		}
+		next = x->next;
+	}
+	heap->head = h1;
+}
+/* Unlink the highest-priority root, re-merge its children; NULL if empty. */
+static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
+					    struct bheap* heap)
+{
+	struct bheap_node *prev, *node;
+	__bheap_min(higher_prio, heap, &prev, &node);
+	if (!node)
+		return NULL;
+	if (prev)
+		prev->next = node->next;
+	else
+		heap->head = node->next;
+	__bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+	return node;
+}
+
+/* insert (and reinitialize) a node; it takes over the min cache if it beats the cached min */
+void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
+		 struct bheap_node* node)
+{
+	struct bheap_node *min;
+	node->child  = NULL;
+	node->parent = NULL;
+	node->next   = NULL;
+	node->degree = 0;
+	if (heap->min && higher_prio(node, heap->min)) {
+		/* swap min cache: old cached min goes into the heap, node is cached */
+		min = heap->min;
+		min->child  = NULL;
+		min->parent = NULL;
+		min->next   = NULL;
+		min->degree = 0;
+		__bheap_union(higher_prio, heap, min);
+		heap->min   = node;
+	} else
+		__bheap_union(higher_prio, heap, node);
+}
+/* Push the cached minimum (if any) back into the heap and empty the cache. */
+void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
+{
+	struct bheap_node* min;
+	if (heap->min) {
+		min = heap->min;
+		heap->min = NULL;
+		bheap_insert(higher_prio, heap, min);
+	}
+}
+
+/* merge addition into target; addition is left without any nodes */
+void bheap_union(bheap_prio_t higher_prio,
+		struct bheap* target, struct bheap* addition)
+{
+	/* first insert any cached minima, if necessary */
+	bheap_uncache_min(higher_prio, target);
+	bheap_uncache_min(higher_prio, addition);
+	__bheap_union(higher_prio, target, addition->head);
+	/* this is a destructive merge */
+	addition->head = NULL;
+}
+/* Return the highest-priority node (NULL if empty); it stays cached in min. */
+struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
+			    struct bheap* heap)
+{
+	if (!heap->min)
+		heap->min = __bheap_extract_min(higher_prio, heap);
+	return heap->min;
+}
+/* Remove and return the highest-priority node, or NULL if the heap is empty. */
+struct bheap_node* bheap_take(bheap_prio_t higher_prio,
+			    struct bheap* heap)
+{
+	struct bheap_node *node;
+	if (!heap->min)
+		heap->min = __bheap_extract_min(higher_prio, heap);
+	node = heap->min;
+	heap->min = NULL;
+	if (node)
+		node->degree = NOT_IN_HEAP;
+	return node;
+}
+/* Bubble node's value towards the root; returns 0 iff it ends up at a root. */
+int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
+{
+	struct bheap_node  *parent;
+	struct bheap_node** tmp_ref;
+	void* tmp;
+
+	/* bubble up */
+	parent = node->parent;
+	while (parent && higher_prio(node, parent)) {
+		/* swap parent and node */
+		tmp           = parent->value;
+		parent->value = node->value;
+		node->value   = tmp;
+		/* swap references */
+		*(parent->ref) = node;
+		*(node->ref)   = parent;
+		tmp_ref        = parent->ref;
+		parent->ref    = node->ref;
+		node->ref      = tmp_ref;
+		/* step up */
+		node   = parent;
+		parent = node->parent;
+	}
+
+	return parent != NULL;
+}
+/* Remove node's value from heap; the unlinked node is marked NOT_IN_HEAP. */
+void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
+		 struct bheap_node* node)
+{
+	struct bheap_node *parent, *prev, *pos;
+	struct bheap_node** tmp_ref;
+	void* tmp;
+
+	if (heap->min != node) {
+		/* bubble up unconditionally, all the way to a root */
+		parent = node->parent;
+		while (parent) {
+			/* swap parent and node */
+			tmp           = parent->value;
+			parent->value = node->value;
+			node->value   = tmp;
+			/* swap references */
+			*(parent->ref) = node;
+			*(node->ref)   = parent;
+			tmp_ref        = parent->ref;
+			parent->ref    = node->ref;
+			node->ref      = tmp_ref;
+			/* step up */
+			node   = parent;
+			parent = node->parent;
+		}
+		/* now delete:
+		 * first find prev */
+		prev = NULL;
+		pos  = heap->head;
+		while (pos != node) {
+			BUG_ON(!pos); /* fell off the list -> deleted from wrong heap */
+			prev = pos;
+			pos  = pos->next;
+		}
+		/* we have prev, now remove node */
+		if (prev)
+			prev->next = node->next;
+		else
+			heap->head = node->next;
+		__bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+	} else	/* node is the cached minimum: just empty the cache */
+		heap->min = NULL;
+	node->degree = NOT_IN_HEAP;
+}
+/* NOTE(review): hn->ref ends up pointing at the local hn, dead after return. */
+/* allocate a heap node for value and insert it; returns 1 on success, else 0 */
+int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
+	     void* value, int gfp_flags)
+{
+	struct bheap_node* hn = bheap_node_alloc(gfp_flags);
+	if (likely(hn)) {
+		bheap_node_init(&hn, value);
+		bheap_insert(higher_prio, heap, hn);
+	}
+	return hn != NULL;
+}
+/* Take the highest-priority node, free it, and return its value (or NULL). */
+void* bheap_take_del(bheap_prio_t higher_prio,
+		    struct bheap* heap)
+{
+	struct bheap_node* hn = bheap_take(higher_prio, heap);
+	void* ret = NULL;
+	if (hn) {
+		ret = hn->value;
+		bheap_node_free(hn);
+	}
+	return ret;
+}
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 0000000..d3ab34b
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,387 @@
+#include <litmus/binheap.h>
+
+/* Returns true if the root ancestor of node is the root of the given heap. */
+int binheap_is_in_this_heap(struct binheap_node *node,
+	struct binheap* heap)
+{
+	if(!binheap_is_in_heap(node)) {
+		return 0;
+	}
+
+	while(node->parent != NULL) {	/* climb to the top of node's tree */
+		node = node->parent;
+	}
+
+	return (node == heap->root);
+}
+
+
+/* Cross-update what the two ref_ptrs point at, then swap the ref_ptrs themselves.  Same logic as Litmus binomial heap. */
+static void __update_ref(struct binheap_node *parent,
+				struct binheap_node *child)
+{
+	*(parent->ref_ptr) = child;
+	*(child->ref_ptr) = parent;
+
+	swap(parent->ref_ptr, child->ref_ptr);
+}
+
+
+/* Swaps data (and reference pointers) between two nodes; tree links are untouched. */
+static void __binheap_swap(struct binheap_node *parent,
+				struct binheap_node *child)
+{
+	swap(parent->data, child->data);
+	__update_ref(parent, child);
+}
+
+
+/* Swaps the node structures of two tree positions (data is swapped as well,
+ * so each position keeps its data).  Needed when we delete nodes.
+ */
+static void __binheap_swap_safe(struct binheap *handle,
+				struct binheap_node *a,
+				struct binheap_node *b)
+{
+	swap(a->data, b->data);
+	__update_ref(a, b);
+
+	if((a->parent != NULL) && (a->parent == b->parent)) {
+		/* special case: shared parent */
+		swap(a->parent->left, a->parent->right);
+	}
+	else {
+		/* Update pointers to swap parents. */
+
+		if(a->parent) {
+			if(a == a->parent->left) {
+				a->parent->left = b;
+			}
+			else {
+				a->parent->right = b;
+			}
+		}
+
+		if(b->parent) {
+			if(b == b->parent->left) {
+				b->parent->left = a;
+			}
+			else {
+				b->parent->right = a;
+			}
+		}
+
+		swap(a->parent, b->parent);
+	}
+
+	/* swap children: re-parent them first, then exchange the child links */
+
+	if(a->left) {
+		a->left->parent = b;
+
+		if(a->right) {
+			a->right->parent = b;
+		}
+	}
+
+	if(b->left) {
+		b->left->parent = a;
+
+		if(b->right) {
+			b->right->parent = a;
+		}
+	}
+
+	swap(a->left, b->left);
+	swap(a->right, b->right);
+
+
+	/* update next/last/root pointers */
+
+	if(a == handle->next) {
+		handle->next = b;
+	}
+	else if(b == handle->next) {
+		handle->next = a;
+	}
+
+	if(a == handle->last) {
+		handle->last = b;
+	}
+	else if(b == handle->last) {
+		handle->last = a;
+	}
+
+	if(a == handle->root) {
+		handle->root = b;
+	}
+	else if(b == handle->root) {
+		handle->root = a;
+	}
+}
+
+
+/**
+ * Update the pointer to the last node in the complete binary tree.
+ * Called by __binheap_delete_root() before the old last node is disconnected.
+ */
+static void __binheap_update_last(struct binheap *handle)
+{
+	struct binheap_node *temp = handle->last;
+
+	/* find a "bend" in the tree. */
+	while(temp->parent && (temp == temp->parent->left)) {
+		temp = temp->parent;
+	}
+
+	/* step over to sibling if we're not at root */
+	if(temp->parent != NULL) {
+		temp = temp->parent->left;
+	}
+
+	/* now travel right as far as possible. */
+	while(temp->right != NULL) {
+		temp = temp->right;
+	}
+
+	/* take one step to the left if we're not at the bottom-most level. */
+	if(temp->left != NULL) {
+		temp = temp->left;
+	}
+
+	handle->last = temp;
+}
+
+
+/**
+ * Update the pointer to the node that will take the next inserted node.
+ * Called by __binheap_add() once a node has filled next's right child slot.
+ */
+static void __binheap_update_next(struct binheap *handle)
+{
+	struct binheap_node *temp = handle->next;
+
+	/* find a "bend" in the tree. */
+	while(temp->parent && (temp == temp->parent->right)) {
+		temp = temp->parent;
+	}
+
+	/* step over to sibling if we're not at root */
+	if(temp->parent != NULL) {
+		temp = temp->parent->right;
+	}
+
+	/* now travel left as far as possible. */
+	while(temp->left != NULL) {
+		temp = temp->left;
+	}
+
+	handle->next = temp;
+}
+
+
+
+/* bubble node's data up towards root while handle->compare(node, parent) holds */
+static void __binheap_bubble_up(struct binheap *handle,
+				struct binheap_node *node)
+{
+	/* let BINHEAP_POISON data bubble to the top (no comparison is made for it) */
+
+	while((node->parent != NULL) &&
+		  ((node->data == BINHEAP_POISON) ||
+		   handle->compare(node, node->parent))) {
+			  __binheap_swap(node->parent, node);
+			  node = node->parent;
+	}
+}
+
+
+/* bubble the root's data down, swapping with the child that compare() ranks first */
+static void __binheap_bubble_down(struct binheap *handle)
+{
+	struct binheap_node *node = handle->root;
+
+	while(node->left != NULL) {
+		if(node->right && handle->compare(node->right, node->left)) {
+			if(handle->compare(node->right, node)) {
+				__binheap_swap(node, node->right);
+				node = node->right;
+			}
+			else {
+				break;	/* heap order already holds here */
+			}
+		}
+		else {
+			if(handle->compare(node->left, node)) {
+				__binheap_swap(node, node->left);
+				node = node->left;
+			}
+			else {
+				break;	/* heap order already holds here */
+			}
+		}
+	}
+}
+
+/* Attach new_node (carrying data) below handle->next and bubble its data up. */
+void __binheap_add(struct binheap_node *new_node,
+				struct binheap *handle,
+				void *data)
+{
+	new_node->data = data;
+	new_node->ref = new_node;
+	new_node->ref_ptr = &(new_node->ref);
+
+	if(!binheap_empty(handle)) {
+		/* insert left side first */
+		if(handle->next->left == NULL) {
+			handle->next->left = new_node;
+			new_node->parent = handle->next;
+			new_node->left = NULL;
+			new_node->right = NULL;
+
+			handle->last = new_node;
+
+			__binheap_bubble_up(handle, new_node);
+		}
+		else {
+			/* left occupied. insert right. */
+			handle->next->right = new_node;
+			new_node->parent = handle->next;
+			new_node->left = NULL;
+			new_node->right = NULL;
+
+			handle->last = new_node;
+
+			__binheap_update_next(handle);	/* next is full now */
+			__binheap_bubble_up(handle, new_node);
+		}
+	}
+	else {
+		/* first node in heap */
+
+		new_node->parent = NULL;
+		new_node->left = NULL;
+		new_node->right = NULL;
+
+		handle->root = new_node;
+		handle->next = new_node;
+		handle->last = new_node;
+	}
+}
+
+
+/**
+ * Removes the root node from the heap. The node is removed after coalescing
+ * the binheap_node with its original data pointer at the root of the tree.
+ *
+ * The 'last' node in the tree is then swapped up to the root and bubbled
+ * down. Finally, container is marked removed (parent = BINHEAP_POISON).
+ */
+void __binheap_delete_root(struct binheap *handle,
+				struct binheap_node *container)
+{
+	struct binheap_node *root = handle->root;
+
+	if(root != container) {
+		/* coalesce */
+		__binheap_swap_safe(handle, root, container);
+		root = container;
+	}
+
+	if(handle->last != root) {
+		/* swap 'last' node up to root and bubble it down. */
+
+		struct binheap_node *to_move = handle->last;
+
+		if(to_move->parent != root) {
+			handle->next = to_move->parent;
+
+			if(handle->next->right == to_move) {
+				/* disconnect from parent */
+				to_move->parent->right = NULL;
+				handle->last = handle->next->left;
+			}
+			else {
+				/* find new 'last' before we disconnect */
+				__binheap_update_last(handle);
+
+				/* disconnect from parent */
+				to_move->parent->left = NULL;
+			}
+		}
+		else {
+			/* 'last' is direct child of root */
+
+			handle->next = to_move;
+
+			if(to_move == to_move->parent->right) {
+				to_move->parent->right = NULL;
+				handle->last = to_move->parent->left;
+			}
+			else {
+				to_move->parent->left = NULL;
+				handle->last = to_move;
+			}
+		}
+		to_move->parent = NULL;
+
+		/* reconnect as root.  We can't just swap data ptrs since root node
+		 * may be freed after this function returns.
+		 */
+		to_move->left = root->left;
+		to_move->right = root->right;
+		if(to_move->left != NULL) {
+			to_move->left->parent = to_move;
+		}
+		if(to_move->right != NULL) {
+			to_move->right->parent = to_move;
+		}
+
+		handle->root = to_move;
+
+		/* bubble down */
+		__binheap_bubble_down(handle);
+	}
+	else {
+		/* removing last node in tree */
+		handle->root = NULL;
+		handle->next = NULL;
+		handle->last = NULL;
+	}
+
+	/* mark as removed */
+	container->parent = BINHEAP_POISON;
+}
+
+
+/**
+ * Delete an arbitrary node.  Bubble node to delete up to the root,
+ * and then delete the root.
+ */
+void __binheap_delete(struct binheap_node *node_to_delete,
+				struct binheap *handle)
+{
+	struct binheap_node *target = node_to_delete->ref;
+	void *temp_data = target->data;
+
+	/* temporarily poison the data to allow node to bubble up to the top. */
+	target->data = BINHEAP_POISON;
+
+	__binheap_bubble_up(handle, target);
+	__binheap_delete_root(handle, node_to_delete);
+
+	node_to_delete->data = temp_data;  /* restore node data pointer */
+}
+
+
+/**
+ * Bubble up a node whose pointer has decreased in value (starts at orig_node->ref).
+ */
+void __binheap_decrease(struct binheap_node *orig_node,
+				struct binheap *handle)
+{
+	struct binheap_node *target = orig_node->ref;
+
+	__binheap_bubble_up(handle, target);
+}
diff --git a/litmus/budget.c b/litmus/budget.c
new file mode 100644
index 0000000..1ffb8e3
--- /dev/null
+++ b/litmus/budget.c
@@ -0,0 +1,116 @@
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+
+#include <litmus/litmus.h>
+#include <litmus/preempt.h>
+
+#include <litmus/budget.h>
+
+struct enforcement_timer {
+	/* The enforcement timer is used to accurately police
+	 * slice budgets. */
+	struct hrtimer		timer;
+	int			armed;	/* set when started, cleared on fire/cancel */
+};
+/* one enforcement timer per CPU */
+DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
+
+static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
+{
+	/* the hrtimer is embedded in an enforcement timer */
+	struct enforcement_timer *expired;
+	unsigned long irq_state;
+
+	expired = container_of(timer, struct enforcement_timer, timer);
+
+	local_irq_save(irq_state);
+	TRACE("enforcement timer fired.\n");
+	expired->armed = 0;
+	litmus_reschedule_local(); /* activate scheduler */
+	local_irq_restore(irq_state);
+
+	return HRTIMER_NORESTART; /* the timer is not restarted from here */
+}
+
+/* assumes called with IRQs off */
+static void cancel_enforcement_timer(struct enforcement_timer* et)
+{
+	int status;
+
+	TRACE("cancelling enforcement timer.\n");
+
+	/* Interrupts are off and et->armed is only ever modified
+	 * locally, hence no locking is required here.
+	 */
+	if (!et->armed)
+		return;
+
+	status = hrtimer_try_to_cancel(&et->timer);
+	/* 0: the timer was inactive -- must never happen while armed. */
+	BUG_ON(status == 0);
+	/* -1: the callback is running concurrently -- must never happen. */
+	BUG_ON(status == -1);
+
+	et->armed = 0;
+}
+
+/* assumes called with IRQs off */
+static void arm_enforcement_timer(struct enforcement_timer* et,
+				  struct task_struct* t)
+{
+	lt_t expiry;
+
+	TRACE_TASK(t, "arming enforcement timer.\n");
+	WARN_ONCE(!hrtimer_is_hres_active(&et->timer),
+		KERN_ERR "WARNING: no high resolution timers available!?\n");
+
+	/* Arming the timer for a task without remaining budget is only
+	 * meaningful if that task is currently non-preemptive. */
+	BUG_ON(budget_exhausted(t) && (!is_np(t)));
+
+	/* No timer is started for non-preemptive tasks. */
+	if (unlikely(is_np(t)))
+		return;
+
+	/* There is no need to check whether the timer is still armed:
+	 * __hrtimer_start_range_ns() cancels the timer anyway. */
+	expiry = litmus_clock() + budget_remaining(t);
+	__hrtimer_start_range_ns(&et->timer, ns_to_ktime(expiry),
+				 0 /* delta */,
+				 HRTIMER_MODE_ABS_PINNED,
+				 0 /* no wakeup */);
+	et->armed = 1;
+}
+
+
+/* expects to be called with IRQs off */
+void update_enforcement_timer(struct task_struct* t)
+{
+	struct enforcement_timer* cpu_timer = &__get_cpu_var(budget_timer);
+
+	if (t && budget_precisely_enforced(t)) {
+		/* Invoke the scheduler as soon as this budget expires. */
+		arm_enforcement_timer(cpu_timer, t);
+		return;
+	}
+	/* No enforcement needed: don't cause unnecessary interrupts. */
+	if (cpu_timer->armed)
+		cancel_enforcement_timer(cpu_timer);
+}
+
+
+/* Boot-time setup: initialize the enforcement timer of each possible CPU. */
+static int __init init_budget_enforcement(void)
+{
+	int cpu;
+	/* per_cpu() is only valid for possible CPUs, not for all of NR_CPUS */
+	for_each_possible_cpu(cpu) {
+		struct enforcement_timer *et = &per_cpu(budget_timer, cpu);
+		hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		et->timer.function = on_enforcement_timeout;
+	}
+	return 0;
+}
+
+module_init(init_budget_enforcement);
diff --git a/litmus/clustered.c b/litmus/clustered.c
new file mode 100644
index 0000000..979fac6
--- /dev/null
+++ b/litmus/clustered.c
@@ -0,0 +1,111 @@
+#include <linux/gfp.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+
+#include <litmus/clustered.h>
+
+#if !defined(CONFIG_X86) || !defined(CONFIG_SYSFS)
+/* fake get_shared_cpu_map() on non-x86 architectures */
+
+int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
+{
+	/* only a (fake) level 1 exists; anything else is reported as 1 */
+	if (index != 1)
+		return 1;
+
+	/* Fake L1: the mask contains nothing but the CPU itself. */
+	cpumask_clear(mask);
+	cpumask_set_cpu(cpu, mask);
+	return 0;
+}
+
+#endif
+
+/* Determine the number of CPUs per cluster for the given cache level.
+ * Returns the cluster size, -ENOMEM if no cpumask could be allocated,
+ * or -EINVAL if the cache level is invalid. */
+int get_cluster_size(enum cache_level level)
+{
+	cpumask_var_t shared;
+	int size = -EINVAL;
+
+	/* a global cluster spans all online CPUs */
+	if (level == GLOBAL_CLUSTER)
+		return num_online_cpus();
+
+	if (!zalloc_cpumask_var(&shared, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* Assumes CPU 0 is representative of all CPUs.  A result of zero
+	 * means we got the map; otherwise it's an invalid cache level and
+	 * size keeps its -EINVAL default. */
+	if (get_shared_cpu_map(shared, 0, level) == 0)
+		size = cpumask_weight(shared);
+
+	free_cpumask_var(shared);
+	return size;
+}
+
+/* Partition the online CPUs into clusters that share a cache at 'level'.
+ * Returns 0 on success, -ENOMEM or -EINVAL (invalid level) otherwise. */
+int assign_cpus_to_clusters(enum cache_level level,
+			    struct scheduling_cluster* clusters[],
+			    unsigned int num_clusters,
+			    struct cluster_cpu* cpus[],
+			    unsigned int num_cpus)
+{
+	cpumask_var_t mask;
+	unsigned int i, free_cluster = 0, low_cpu;
+	int err = 0;
+
+	if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* clear cluster pointers */
+	for (i = 0; i < num_cpus; i++) {
+		cpus[i]->id      = i;
+		cpus[i]->cluster = NULL;
+	}
+
+	/* initialize clusters */
+	for (i = 0; i < num_clusters; i++) {
+		clusters[i]->id = i;
+		INIT_LIST_HEAD(&clusters[i]->cpus);
+	}
+
+	/* Assign each CPU. Two assumptions are made:
+	 * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
+	 * 2) All cpus that belong to some cluster are online. */
+	for_each_online_cpu(i) {
+		/* get lowest-id CPU in cluster */
+		if (level != GLOBAL_CLUSTER) {
+			err = get_shared_cpu_map(mask, cpus[i]->id, level);
+			if (err != 0) {
+				/* Wrong cache level: either the caller screwed up
+				 * or the CPU topology is weird. */
+				printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
+				       level, err);
+				err = -EINVAL;
+				goto out;
+			}
+			low_cpu = cpumask_first(mask);
+		} else
+			low_cpu = 0;
+		if (low_cpu == i) {
+			/* caller must provide an appropriate number of clusters */
+			BUG_ON(free_cluster >= num_clusters);
+			/* create new cluster */
+			cpus[i]->cluster = clusters[free_cluster++];
+		} else {
+			/* low_cpu points to the right cluster
+			 * Assumption: low_cpu is actually online and was processed earlier. */
+			cpus[i]->cluster = cpus[low_cpu]->cluster;
+		}
+		/* enqueue in cpus list */
+		list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
+		printk(KERN_INFO "Assigning CPU%u to cluster %u.\n", i, cpus[i]->cluster->id);
+	}
+out:
+	free_cpumask_var(mask);
+	return err;
+}
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 0000000..877f278
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,160 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+
+/* only one page for now, but we might want to add a RO version at some point */
+
+#define CTRL_NAME        "litmus/ctrl"
+
+/* Allocate t->rt_param.ctrl_page unless the task already has one. */
+static int alloc_ctrl_page(struct task_struct *t)
+{
+	/* nothing to do if a control page was allocated earlier */
+	if (tsk_rt(t)->ctrl_page)
+		return 0;
+
+	/* the page will get de-allocated in task teardown */
+	tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
+
+	TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
+		   tsk_rt(t)->ctrl_page);
+
+	/* a NULL page pointer means that the allocation failed */
+	return tsk_rt(t)->ctrl_page ? 0 : -ENOMEM;
+}
+
+/* Insert the control page of task t at the start of vma; 0 on success. */
+static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
+{
+	struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
+	int err;
+
+	/* pgprot_t may be a struct: extract its value for the %lx conversion */
+	TRACE_CUR(CTRL_NAME
+		  ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
+		  tsk_rt(t)->ctrl_page, page_to_pfn(ctrl), vma->vm_start,
+		  (unsigned long) pgprot_val(vma->vm_page_prot));
+
+	/* Map it into the vma. */
+	err = vm_insert_page(vma, vma->vm_start, ctrl);
+	if (err)
+		TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
+
+	return err;
+}
+
+static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
+{
+	/* Trace-only close hook; vm_flags is an unsigned long (hence %lx). */
+	TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__, vma->vm_flags,
+		  (unsigned long) pgprot_val(vma->vm_page_prot));
+	TRACE_CUR(CTRL_NAME
+		  ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
+		  (void*) vma->vm_start, (void*) vma->vm_end, vma,
+		  vma->vm_private_data);
+}
+
+static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
+				      struct vm_fault* vmf)
+{
+	/* vm_flags and pgoff are both unsigned long values */
+	TRACE_CUR("%s flags=0x%lx (off:%lu)\n", __FUNCTION__,
+		  vma->vm_flags, vmf->pgoff);
+	/* This function should never be called, since all pages should have
+	 * been mapped by mmap() already. */
+	WARN_ONCE(1, "Page faults should be impossible in the control page\n");
+
+	return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct litmus_ctrl_vm_ops = {
+	.close = litmus_ctrl_vm_close, /* only traces the unmapping */
+	.fault = litmus_ctrl_vm_fault, /* unexpected: warns, returns SIGBUS */
+};
+
+/* mmap() handler: maps the control page of the calling task into vma */
+static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
+{
+	int err = 0;
+
+	/* first make sure mapper knows what he's doing */
+	/* you can only get one page */
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	/* you can only map the "first" page */
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	/* you can't share it with anyone */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return -EINVAL;
+
+	vma->vm_ops = &litmus_ctrl_vm_ops;
+	/* This mapping should not be kept across forks,
+	 * cannot be expanded, and is not a "normal" page. */
+	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_READ | VM_WRITE;
+
+	/* We don't want the first write access to trigger a "minor" page fault
+	 * to mark the page as dirty.  This is transient, private memory, we
+	 * don't care if it was touched or not. PAGE_SHARED means RW access, but
+	 * not execute, and avoids copy-on-write behavior.
+	 * See protection_map in mmap.c.  */
+	vma->vm_page_prot = PAGE_SHARED;
+
+	err = alloc_ctrl_page(current);
+	if (!err)
+		err = map_ctrl_page(current, vma);
+
+	TRACE_CUR("%s flags=0x%lx prot=0x%lx\n", __FUNCTION__, vma->vm_flags,
+		  (unsigned long) pgprot_val(vma->vm_page_prot));
+
+	return err;
+}
+
+static struct file_operations litmus_ctrl_fops = {
+	.owner = THIS_MODULE,
+	.mmap  = litmus_ctrl_mmap, /* the only file operation provided */
+};
+/* misc device through which the control page is mapped (see CTRL_NAME) */
+static struct miscdevice litmus_ctrl_dev = {
+	.name  = CTRL_NAME,
+	.minor = MISC_DYNAMIC_MINOR,
+	.fops  = &litmus_ctrl_fops,
+};
+
+static int __init init_litmus_ctrl_dev(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
+	/* the np flag union must be exactly as large as a uint32_t */
+	BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
+	/* field offsets must agree with the LITMUS_CP_OFFSET_* constants */
+	BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
+		     != LITMUS_CP_OFFSET_SCHED);
+	BUILD_BUG_ON(offsetof(struct control_page, irq_count)
+		     != LITMUS_CP_OFFSET_IRQ_COUNT);
+	BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
+		     != LITMUS_CP_OFFSET_TS_SC_START);
+	BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
+		     != LITMUS_CP_OFFSET_IRQ_SC_START);
+	/* register the control device as a misc device */
+	printk("Initializing LITMUS^RT control device.\n");
+	err = misc_register(&litmus_ctrl_dev);
+	if (err)
+		printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
+	return err;
+}
+
+static void __exit exit_litmus_ctrl_dev(void)
+{
+	misc_deregister(&litmus_ctrl_dev); /* undo the misc_register() call */
+}
+
+module_init(init_litmus_ctrl_dev);
+module_exit(exit_litmus_ctrl_dev);
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 0000000..5aca293
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,200 @@
+/*
+ * litmus/edf_common.c
+ *
+ * Common functions for EDF based scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/edf_common.h>
+
+#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
+#include <litmus/fpmath.h>
+#endif
+
+#ifdef CONFIG_EDF_TIE_BREAK_HASH
+#include <linux/hash.h>
+static inline long edf_hash(struct task_struct *t)
+{
+	/* pid is 32 bits, so normally we would shove that into the
+	 * upper 32-bits and put the job number in the bottom
+	 * and hash the 64-bit number with hash_64(). Sadly,
+	 * in testing, hash_64() doesn't distribute keys where the
+	 * upper bits are close together (as would be the case with
+	 * pids) and job numbers are equal (as would be the case with
+	 * synchronous task sets with all relative deadlines equal).
+	 *
+	 * A 2006 Linux patch proposed the following solution
+	 * (but for some reason it wasn't accepted...).
+	 *
+	 * At least this workaround works for 32-bit systems as well.
+	 */
+	return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
+}
+#endif
+
+
+/* edf_higher_prio -  returns true if first has a higher EDF priority
+ *                    than second. Deadline ties are broken by the configured
+ *                    tie-break method and ultimately by PID.
+ * both first and second may be NULL
+ */
+int edf_higher_prio(struct task_struct* first,
+		    struct task_struct* second)
+{
+	struct task_struct *first_task = first;
+	struct task_struct *second_task = second;
+
+	/* There is no point in comparing a task to itself. */
+	if (first && first == second) {
+		TRACE_TASK(first,
+			   "WARNING: pointless edf priority comparison.\n");
+		return 0;
+	}
+
+
+	/* check for NULL tasks: an existing task beats a missing one */
+	if (!first || !second)
+		return first && !second;
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+	/* Check for inherited priorities. Change task
+	 * used for comparison in such a case.
+	 */
+	if (unlikely(first->rt_param.inh_task))
+		first_task = first->rt_param.inh_task;
+	if (unlikely(second->rt_param.inh_task))
+		second_task = second->rt_param.inh_task;
+
+	/* Check for priority boosting. Tie-break by start of boosting.
+	 */
+	if (unlikely(is_priority_boosted(first_task))) {
+		/* first_task is boosted, how about second_task? */
+		if (!is_priority_boosted(second_task) ||
+		    lt_before(get_boost_start(first_task),
+			      get_boost_start(second_task)))
+			return 1;
+		else
+			return 0;
+	} else if (unlikely(is_priority_boosted(second_task)))
+		/* second_task is boosted, first is not*/
+		return 0;
+
+#endif
+
+	if (earlier_deadline(first_task, second_task)) {
+		return 1;
+	}
+	else if (get_deadline(first_task) == get_deadline(second_task)) {
+		/* Need to tie break. Each method either returns 1 (first_task
+		 * wins) or sets pid_break: 1 = still tied, 0 = second_task wins.
+		 */
+		int pid_break;
+
+
+#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
+		/* Tie break by lateness. Jobs with greater lateness get
+		 * priority. This should spread tardiness across all tasks,
+		 * especially in task sets where all tasks have the same
+		 * period and relative deadlines.
+		 */
+		if (get_lateness(first_task) > get_lateness(second_task)) {
+			return 1;
+		}
+		pid_break = (get_lateness(first_task) == get_lateness(second_task));
+
+
+#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
+		/* Tie break by lateness, normalized by relative deadline. Jobs with
+		 * greater normalized lateness get priority.
+		 *
+		 * Note: Considered using the algebraically equivalent
+		 *	lateness(first)*relative_deadline(second) >
+		 *		lateness(second)*relative_deadline(first)
+		 * to avoid fixed-point math, but values are prone to overflow if inputs
+		 * are on the order of several seconds, even in 64-bit.
+		 */
+		fp_t fnorm = _frac(get_lateness(first_task),
+						   get_rt_relative_deadline(first_task));
+		fp_t snorm = _frac(get_lateness(second_task),
+						   get_rt_relative_deadline(second_task));
+		if (_gt(fnorm, snorm)) {
+			return 1;
+		}
+		pid_break = _eq(fnorm, snorm);
+
+
+#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
+		/* Tie break by comparing hashes of (pid, job#) tuple.  There should be
+		 * a 50% chance that first_task has a higher priority than second_task.
+		 */
+		long fhash = edf_hash(first_task);
+		long shash = edf_hash(second_task);
+		if (fhash < shash) {
+			return 1;
+		}
+		pid_break = (fhash == shash);
+#else
+
+
+		/* CONFIG_EDF_PID_TIE_BREAK */
+		pid_break = 1; /* fall through to tie-break by pid */
+#endif
+
+		/* Tie break by pid */
+		if(pid_break) {
+			if (first_task->pid < second_task->pid) {
+				return 1;
+			}
+			else if (first_task->pid == second_task->pid) {
+				/* If the PIDs are the same then the task with the
+				 * inherited priority wins.
+				 */
+				if (!second->rt_param.inh_task) {
+					return 1;
+				}
+			}
+		}
+	}
+	return 0; /* fall-through. prio(second_task) > prio(first_task) */
+}
+
+int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+	return edf_higher_prio(bheap2task(a), bheap2task(b)); /* a before b? */
+}
+/* set up rt as a domain whose ready queue is ordered by edf_ready_order() */
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+		      release_jobs_t release)
+{
+	rt_domain_init(rt,  edf_ready_order, resched, release);
+}
+
+/* edf_preemption_needed - check whether the task t needs to be preempted
+ *                   call only with irqs disabled and with  ready_lock acquired
+ *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
+ */
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
+{
+	/* Nothing is pending in the ready queue (which the caller must
+	 * have locked): there is no need to preempt. */
+	if (!__jobs_pending(rt))
+		return 0;
+
+	/* A job is pending: reschedule if t doesn't exist or if only
+	 * non-rt stuff is in the way. */
+	if (!t || !is_realtime(t))
+		return 1;
+
+	/* NOTE: We cannot check for non-preemptibility since we
+	 *       don't know what address space we're currently in.
+	 */
+	return edf_higher_prio(__next_ready(rt), t);
+}
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 0000000..31d7028
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,308 @@
+/* fdso.c - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ *
+ * Notes:
+ *   - object descriptor (OD) tables are not cloned during a fork.
+ *   - objects are created on-demand, and freed after the last reference
+ *     is dropped.
+ *   - for now, object types are hard coded.
+ *   - As long as we have live objects, we keep a reference to the inode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+
+#include <litmus/fdso.h>
+
+extern struct fdso_ops generic_lock_ops;
+/* per-type operations, indexed by obj_type_t (one entry per object type) */
+static const struct fdso_ops* fdso_ops[] = {
+	&generic_lock_ops, /* FMLP_SEM */
+	&generic_lock_ops, /* SRP_SEM */
+	&generic_lock_ops, /* MPCP_SEM */
+	&generic_lock_ops, /* MPCP_VS_SEM */
+	&generic_lock_ops, /* DPCP_SEM */
+	&generic_lock_ops, /* PCP_SEM */
+	&generic_lock_ops, /* DFLP_SEM */
+};
+
+static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
+{
+	const struct fdso_ops *ops = fdso_ops[type];
+
+	/* every object type must have an entry in the ops table */
+	BUILD_BUG_ON(ARRAY_SIZE(fdso_ops) != MAX_OBJ_TYPE + 1);
+	/* types without a create hook cannot be instantiated */
+	return ops->create ? ops->create(obj_ref, type, config) : -EINVAL;
+}
+
+static void fdso_destroy(obj_type_t type, void* obj)
+{
+	fdso_ops[type]->destroy(type, obj); /* no NULL check: hook required */
+}
+
+static int fdso_open(struct od_table_entry* entry, void* __user config)
+{
+	const struct fdso_ops *ops = fdso_ops[entry->obj->type];
+
+	/* the open hook is optional; without one, opening always succeeds */
+	return ops->open ? ops->open(entry, config) : 0;
+}
+
+static int fdso_close(struct od_table_entry* entry)
+{
+	const struct fdso_ops *ops = fdso_ops[entry->obj->type];
+
+	/* the close hook is optional; without one, closing always succeeds */
+	return ops->close ? ops->close(entry) : 0;
+}
+
+/* Create a new object of the given type and attach it to the inode.
+ * The inode must be locked already.  Returns 0 or an error code. */
+static int alloc_inode_obj(struct inode_obj_id** obj_ref,
+			   struct inode* inode,
+			   obj_type_t type,
+			   unsigned int id,
+			   void* __user config)
+{
+	struct inode_obj_id* obj;
+	void* raw_obj;
+	int err;
+
+	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	err = fdso_create(&raw_obj, type, config);
+	if (err != 0) {
+		kfree(obj);
+		return err;
+	}
+
+	INIT_LIST_HEAD(&obj->list);
+	atomic_set(&obj->count, 1); /* the initial reference goes to the caller */
+	obj->type  = type;
+	obj->id    = id;
+	obj->obj   = raw_obj;
+	obj->inode = inode;
+
+	list_add(&obj->list, &inode->i_obj_list);
+	/* live objects pin the inode; released by iput() in put_inode_obj() */
+	ihold(inode);
+
+	printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %u): object created\n", inode, type, id);
+	*obj_ref = obj;
+	return 0;
+}
+
+/* Look up the object (type, id) attached to the inode and take a reference
+ * to it.  Returns NULL if there is none.  inode must be locked already. */
+static struct inode_obj_id* get_inode_obj(struct inode* inode,
+					  obj_type_t type,
+					  unsigned int id)
+{
+	struct inode_obj_id* cur;
+
+	list_for_each_entry(cur, &inode->i_obj_list, list) {
+		if (cur->id != id || cur->type != type)
+			continue;
+		/* match: account for the reference handed to the caller */
+		atomic_inc(&cur->count);
+		return cur;
+	}
+	printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
+	return NULL;
+}
+
+
+static void put_inode_obj(struct inode_obj_id* obj)
+{
+	struct inode* inode;
+	int let_go = 0;
+	/* remember the inode: obj itself may be freed below */
+	inode = obj->inode;
+	if (atomic_dec_and_test(&obj->count)) {
+		/* the count dropped to zero: tear down under the inode's mutex */
+		mutex_lock(&inode->i_obj_mutex);
+		/* no new references can be obtained now; re-check the count */
+		if (!atomic_read(&obj->count)) {
+			list_del(&obj->list);
+			fdso_destroy(obj->type, obj->obj);
+			kfree(obj);
+			let_go = 1; /* drop the inode reference after unlocking */
+		}
+		mutex_unlock(&inode->i_obj_mutex);
+		if (let_go)
+			iput(inode);
+	}
+}
+
+/* Reserve a free object descriptor of task t; the zeroed table is allocated
+ * on first use.  Returns NULL if the table is full or cannot be allocated. */
+static struct od_table_entry*  get_od_entry(struct task_struct* t)
+{
+	struct od_table_entry* table = t->od_table;
+	int i;
+
+	if (!table) {
+		table = kcalloc(MAX_OBJECT_DESCRIPTORS, sizeof(*table),
+				GFP_KERNEL);
+		t->od_table = table;
+	}
+
+	for (i = 0; table &&  i < MAX_OBJECT_DESCRIPTORS; i++)
+		if (!table[i].used) {
+			table[i].used = 1;
+			return table + i;
+		}
+	return NULL;
+}
+
+static int put_od_entry(struct od_table_entry* od)
+{
+	put_inode_obj(od->obj); /* drop the reference held by this descriptor */
+	od->used = 0; /* the slot may be handed out again */
+	return 0;
+}
+
+static long close_od_entry(struct od_table_entry *od)
+{
+	/* The class gets a chance to reject the close, in which case the
+	 * descriptor stays in use and the result is passed on unchanged. */
+	long status = fdso_close(od);
+
+	if (status != 0)
+		return status;
+
+	return put_od_entry(od);
+}
+
+void exit_od_table(struct task_struct* t)
+{
+	int od;
+
+	if (!t->od_table)
+		return; /* no table, nothing to clean up */
+	for (od = 0; od < MAX_OBJECT_DESCRIPTORS; od++)
+		if (t->od_table[od].used)
+			close_od_entry(&t->od_table[od]);
+	kfree(t->od_table);
+	t->od_table = NULL;
+}
+
+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
+			  void* __user config)
+{
+	int idx = 0, err = 0;
+	struct inode* inode;
+	struct inode_obj_id* obj = NULL;
+	struct od_table_entry* entry;
+	/* Returns the new descriptor (index into od_table) or an error code. */
+	inode = file->f_dentry->d_inode;
+	/* reserve a free slot in the descriptor table of the calling task */
+	entry = get_od_entry(current);
+	if (!entry)
+		return -ENOMEM;
+	/* look up the object under the inode's mutex; create it if missing */
+	mutex_lock(&inode->i_obj_mutex);
+	obj = get_inode_obj(inode, type, id);
+	if (!obj)
+		err = alloc_inode_obj(&obj, inode, type, id, config);
+	if (err != 0) {
+		obj = NULL;
+		idx = err;
+		entry->used = 0; /* give the reserved slot back */
+	} else {
+		entry->obj   = obj;
+		entry->class = fdso_ops[type];
+		idx = entry - current->od_table;
+	}
+
+	mutex_unlock(&inode->i_obj_mutex);
+
+	/* open only if creation succeeded */
+	if (!err)
+		err = fdso_open(entry, config);
+	if (err < 0) {
+		/* Creation failed (obj is NULL) or the class rejected the
+		 * open call. Clean up (if needed) and tell user space.
+		 */
+		if (obj)
+			put_od_entry(entry);
+		idx = err;
+	}
+
+	return idx;
+}
+
+
+struct od_table_entry* get_entry_for_od(int od)
+{
+	struct od_table_entry *table = current->od_table;
+
+	/* A valid descriptor indexes an in-use slot of an existing table. */
+	if (table && 0 <= od && od < MAX_OBJECT_DESCRIPTORS &&
+	    table[od].used)
+		return &table[od];
+
+	/* no table, descriptor out of range, or slot not in use */
+	return NULL;
+}
+
+
+asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
+{
+	struct file *backing_file;
+	long od;
+
+	/*
+	 * Overview of the steps carried out here and in do_sys_od_open():
+	 *  1) validate the object type
+	 *  2) get file from fd, get inode from file
+	 *  3) alloc od_table_entry
+	 *  4) lock inode
+	 *  5) try to look up the object (increments its refcnt)
+	 *  6) if not present create and enqueue object, inc inode refcnt
+	 *  7) set up od_table_entry ptrs, unlock inode
+	 *  8) give the object class a chance to reject the open
+	 *  9) return offset in od_table as OD
+	 */
+
+	/* only object types with an fdso_ops entry are acceptable */
+	if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE)
+		return -EINVAL;
+
+	/* fd must refer to an open file */
+	backing_file = fget(fd);
+	if (!backing_file)
+		return -EBADF;
+
+	od = do_sys_od_open(backing_file, type, obj_id, config);
+
+	/* drop the reference obtained by fget() */
+	fput(backing_file);
+
+	return od;
+}
+
+
+asmlinkage long sys_od_close(int od)
+{
+	struct od_table_entry *table = current->od_table;
+	int status;
+
+	/* the descriptor must lie within the bounds of the table... */
+	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+		return -EINVAL;
+
+	/* ...and must refer to a slot that is currently in use */
+	if (!table || !table[od].used)
+		return -EINVAL;
+
+	status = close_od_entry(&table[od]);
+	return status;
+}
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 0000000..964a472
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,119 @@
+/*
+ * litmus/fp_common.c
+ *
+ * Common functions for fixed-priority scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/fp_common.h>
+
+/* fp_higher_prio -  returns true if first has a higher static priority
+ *                   than second. Ties are broken by PID.
+ *
+ * both first and second may be NULL
+ */
+int fp_higher_prio(struct task_struct* first,
+		   struct task_struct* second)
+{
+	struct task_struct *first_task = first;
+	struct task_struct *second_task = second;
+
+	/* There is no point in comparing a task to itself. */
+	if (unlikely(first && first == second)) {
+		TRACE_TASK(first,
+			   "WARNING: pointless FP priority comparison.\n");
+		return 0;
+	}
+
+
+	/* check for NULL tasks: an existing task beats a missing one */
+	if (!first || !second)
+		return first && !second;
+	/* second loses outright if it is not a real-time task */
+	if (!is_realtime(second_task))
+		return 1;
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+	/* Check for inherited priorities. Change task
+	 * used for comparison in such a case.
+	 */
+	if (unlikely(first->rt_param.inh_task))
+		first_task = first->rt_param.inh_task;
+	if (unlikely(second->rt_param.inh_task))
+		second_task = second->rt_param.inh_task;
+
+	/* Check for priority boosting. Tie-break by start of boosting.
+	 */
+	if (unlikely(is_priority_boosted(first_task))) {
+		/* first_task is boosted, how about second_task? */
+		if (is_priority_boosted(second_task))
+			/* both boosted: the earlier boost start wins */
+			return lt_before(get_boost_start(first_task),
+					 get_boost_start(second_task));
+		else
+			/* priority boosting wins. */
+			return 1;
+	} else if (unlikely(is_priority_boosted(second_task)))
+		/* second_task is boosted, first is not*/
+		return 0;
+
+#endif
+
+	/* Comparisons to itself are not expected; priority inheritance
+	 * should also not cause this to happen. */
+	BUG_ON(first_task == second_task);
+
+	if (get_priority(first_task) < get_priority(second_task))
+		return 1;
+	else if (get_priority(first_task) == get_priority(second_task))
+		/* Break by PID. */
+		return first_task->pid < second_task->pid;
+	else
+		return 0;
+}
+
+int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+	return fp_higher_prio(bheap2task(a), bheap2task(b)); /* a before b? */
+}
+/* set up rt as a domain whose ready queue is ordered by fp_ready_order() */
+void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+		    release_jobs_t release)
+{
+	rt_domain_init(rt,  fp_ready_order, resched, release);
+}
+
+/* fp_preemption_needed - check whether the task t needs to be preempted
+ *                        in favor of the task returned by fp_prio_peek(q) */
+int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
+{
+	struct task_struct *head = fp_prio_peek(q);
+
+	/* nothing is pending: no reason to preempt */
+	if (!head)
+		return 0;
+
+	/* nothing scheduled, or only non-rt stuff in the way: preempt */
+	if (!t || !is_realtime(t))
+		return 1;
+
+	return fp_higher_prio(head, t);
+}
+
+void fp_prio_queue_init(struct fp_prio_queue* q)
+{
+	int word, prio;
+
+	for (word = 0; word < FP_PRIO_BIT_WORDS; word++)
+		q->bitmask[word] = 0; /* clear every word of the bitmask */
+	for (prio = 0; prio < LITMUS_MAX_PRIORITY; prio++)
+		bheap_init(&q->queue[prio]); /* one heap per priority */
+}
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 0000000..2d9f8aa
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,77 @@
+/* litmus/jobs.c - common job control code
+ */
+
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+
+static inline void setup_release(struct task_struct *t, lt_t release)
+{
+	/* prepare next release: it is due one relative deadline after
+	 * 'release' and starts with no recorded execution time */
+	tsk_rt(t)->job_params.release   = release;
+	tsk_rt(t)->job_params.deadline  = release + get_rt_relative_deadline(t);
+	tsk_rt(t)->job_params.exec_time = 0;
+	/* update job sequence number */
+	tsk_rt(t)->job_params.job_no++;
+}
+
+void prepare_for_next_period(struct task_struct *t)
+{
+	BUG_ON(!t);
+
+	/* The lateness (possibly negative) of the finished job must be
+	 * recorded first since setup_release() overwrites the deadline.
+	 */
+	tsk_rt(t)->job_params.lateness = (long long)litmus_clock() -
+		(long long)tsk_rt(t)->job_params.deadline;
+
+	if (!tsk_rt(t)->sporadic_release) {
+		/* periodic release => add period */
+		setup_release(t, get_release(t) + get_rt_period(t));
+		return;
+	}
+
+	/* sporadic release: use the requested time and clear the request */
+	TRACE_TASK(t, "sporadic release at %llu\n",
+		   tsk_rt(t)->sporadic_release_time);
+	setup_release(t, tsk_rt(t)->sporadic_release_time);
+	tsk_rt(t)->sporadic_release = 0;
+}
+
+void release_at(struct task_struct *t, lt_t start)
+{
+	BUG_ON(!t);
+	setup_release(t, start); /* the next job is released at 'start' */
+	tsk_rt(t)->completed = 0; /* that job has not completed yet */
+}
+
+long default_wait_for_release_at(lt_t release_time)
+{
+	struct task_struct *t = current;
+	unsigned long flags;
+	/* store the release time before raising the sporadic_release flag */
+	local_irq_save(flags);
+	tsk_rt(t)->sporadic_release_time = release_time;
+	smp_wmb(); /* order the two stores */
+	tsk_rt(t)->sporadic_release = 1;
+	local_irq_restore(flags);
+	/* the flag is consumed by prepare_for_next_period() */
+	return complete_job();
+}
+
+
+/*
+ *	Deactivate current task until the beginning of the next period.
+ */
+long complete_job(void)
+{
+	/* Mark that we do not execute anymore */
+	tsk_rt(current)->completed = 1;
+	/* call schedule, this will return when a new job arrives
+	 * it also takes care of preparing for the next release
+	 */
+	schedule();
+	return 0;
+}
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 0000000..9c419cd
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,639 @@
+/*
+ * litmus.c -- Implementation of the LITMUS syscalls,
+ *             the LITMUS initialization code,
+ *             and the procfs interface.
+ */
+#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/sysrq.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+#include <linux/stop_machine.h>
+#include <linux/sched/rt.h>
+#include <linux/rwsem.h>
+
+#include <litmus/litmus.h>
+#include <litmus/bheap.h>
+#include <litmus/trace.h>
+#include <litmus/rt_domain.h>
+#include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+/* Number of RT tasks that exist in the system.
+ * Incremented in litmus_admit_task(), decremented in litmus_clear_state().
+ */
+atomic_t rt_task_count 		= ATOMIC_INIT(0);
+
+#ifdef CONFIG_RELEASE_MASTER
+/* current master CPU for handling timer IRQs */
+atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
+#endif
+
+/* Slab caches; both are created in _init_litmus(). */
+static struct kmem_cache * bheap_node_cache;
+extern struct kmem_cache * release_heap_cache;
+
+/* Allocate a bheap node from bheap_node_cache using the given allocation
+ * flags. Returns whatever kmem_cache_alloc() returns.
+ */
+struct bheap_node* bheap_node_alloc(int gfp_flags)
+{
+	return kmem_cache_alloc(bheap_node_cache, gfp_flags);
+}
+
+/* Return a node obtained from bheap_node_alloc() to bheap_node_cache. */
+void bheap_node_free(struct bheap_node* hn)
+{
+	kmem_cache_free(bheap_node_cache, hn);
+}
+
+struct release_heap* release_heap_alloc(int gfp_flags);
+void release_heap_free(struct release_heap* rh);
+
+/**
+ * Quantum alignment, selectable as a cmdline option.
+ * Staggered quanta are the default, as this results in lower overheads.
+ */
+static bool aligned_quanta = 0;
+module_param(aligned_quanta, bool, 0644);
+
+/* Offset of the quantum boundary of the given CPU.
+ *
+ * With aligned quanta the offset is always 0. Otherwise the quantum length
+ * is divided by the number of possible CPUs and the result is multiplied
+ * by the CPU number.
+ */
+u64 cpu_stagger_offset(int cpu)
+{
+	u64 slice;
+
+	if (aligned_quanta)
+		return 0;
+
+	slice = LITMUS_QUANTUM_LENGTH_NS;
+	do_div(slice, num_possible_cpus());
+
+	return slice * cpu;
+}
+
+/*
+ * sys_set_rt_task_param
+ * @pid: Pid of the task which scheduling parameters must be changed
+ * @param: New real-time extension parameters such as the execution cost and
+ *         period
+ * Syscall for manipulating with task rt extension params
+ * Returns EINVAL  if pid is negative or param is NULL,
+ *                 if either period or execution cost is <=0,
+ *                 if the execution cost exceeds
+ *                 min(relative deadline, period),
+ *                 or if the task class or budget policy is not recognized
+ *         EFAULT  if param cannot be copied from user space
+ *         ESRCH   if pid is not corresponding
+ *	           to a valid task.
+ *	   EBUSY   if pid is a real-time task
+ *	   0       if success
+ *
+ * A relative deadline of 0 is treated as "implicit" and replaced by the
+ * period before the checks are made.
+ *
+ * Only non-real-time tasks may be configured with this system call
+ * to avoid races with the scheduler. In practice, this means that a
+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
+ *
+ * find_task_by_vpid() assumes that we are in the same namespace of the
+ * target.
+ */
+asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+	struct rt_task tp;
+	struct task_struct *target;
+	int retval = -EINVAL;
+
+	printk("Setting up rt task parameters for process %d.\n", pid);
+
+	if (pid < 0 || param == 0) {
+		goto out;
+	}
+	if (copy_from_user(&tp, param, sizeof(tp))) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/* Task search and manipulation must be protected */
+	read_lock_irq(&tasklist_lock);
+	if (!(target = find_task_by_vpid(pid))) {
+		retval = -ESRCH;
+		goto out_unlock;
+	}
+
+	if (is_realtime(target)) {
+		/* The task is already a real-time task.
+		 * We cannot allow parameter changes at this point.
+		 */
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+
+	/* set relative deadline to be implicit if left unspecified */
+	if (tp.relative_deadline == 0)
+		tp.relative_deadline = tp.period;
+
+	/* all rejections below leave retval at its initial -EINVAL */
+	if (tp.exec_cost <= 0)
+		goto out_unlock;
+	if (tp.period <= 0)
+		goto out_unlock;
+	if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
+	{
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+		       "because task density > 1.0\n", pid);
+		goto out_unlock;
+	}
+	if (tp.cls != RT_CLASS_HARD &&
+	    tp.cls != RT_CLASS_SOFT &&
+	    tp.cls != RT_CLASS_BEST_EFFORT)
+	{
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+				 "because its class is invalid\n", pid);
+		goto out_unlock;
+	}
+	if (tp.budget_policy != NO_ENFORCEMENT &&
+	    tp.budget_policy != QUANTUM_ENFORCEMENT &&
+	    tp.budget_policy != PRECISE_ENFORCEMENT)
+	{
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+		       "because unsupported budget enforcement policy "
+		       "specified (%d)\n",
+		       pid, tp.budget_policy);
+		goto out_unlock;
+	}
+
+	/* all checks passed: install the validated parameters */
+	target->rt_param.task_params = tp;
+
+	retval = 0;
+      out_unlock:
+	read_unlock_irq(&tasklist_lock);
+      out:
+	return retval;
+}
+
+/*
+ * Copy the RT parameters of the task identified by pid to user space.
+ *   returns EINVAL if param is NULL or pid is negative
+ *   returns ESRCH  if pid does not correspond to a valid task
+ *   returns EFAULT if copying of parameters has failed.
+ *   returns 0      on success
+ *
+ *   find_task_by_vpid() assumes that we are in the same namespace of the
+ *   target.
+ */
+asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+	struct rt_task snapshot;
+	struct task_struct *source;
+
+	if (pid < 0 || param == 0)
+		return -EINVAL;
+
+	/* Take a private copy of the parameters while holding the lock... */
+	read_lock(&tasklist_lock);
+	source = find_task_by_vpid(pid);
+	if (!source) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	snapshot = source->rt_param.task_params;
+	read_unlock(&tasklist_lock);
+
+	/* ...and do the copying to user space outside the lock. */
+	if (copy_to_user(param, &snapshot, sizeof(snapshot)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ *	Signal the completion of the current job of the calling task.
+ *	The actual work is delegated to the active plugin, which has to put
+ *	the task into an appropriate queue and call schedule.
+ *
+ *	returns EINVAL if the caller is not a real-time task or if its
+ *	               period is not positive
+ *	otherwise returns the result of the plugin's complete_job() callback
+ */
+asmlinkage long sys_complete_job(void)
+{
+	if (!is_realtime(current))
+		return -EINVAL;
+
+	/* Task with negative or zero period cannot sleep */
+	if (get_rt_period(current) <= 0)
+		return -EINVAL;
+
+	return litmus->complete_job();
+}
+
+/*	Variant of sys_complete_job that addresses the problem of
+ *      unintentionally missing a job after an overrun: the caller names the
+ *      job sequence number it wants to wait for.
+ *
+ *	returns 0 once the requested job number has been reached
+ *	returns EINVAL if the caller is not a real-time task or if its
+ *	               period is not positive
+ *	otherwise returns the first non-zero result of the plugin's
+ *	complete_job() callback
+ */
+asmlinkage long sys_wait_for_job_release(unsigned int job)
+{
+	int err = 0;
+
+	if (!is_realtime(current))
+		return -EINVAL;
+
+	/* Task with negative or zero period cannot sleep */
+	if (get_rt_period(current) <= 0)
+		return -EINVAL;
+
+	/* Wait until we have "reached" the desired job.
+	 *
+	 * If the last job overran then job <= job_no and the task is not
+	 * sent to sleep at all.
+	 *
+	 * This implementation has at least two problems:
+	 *
+	 * 1) It doesn't gracefully handle the wrap around of
+	 *    job_no. Since LITMUS is a prototype, this is not much
+	 *    of a problem right now.
+	 *
+	 * 2) It is theoretically racy if a job release occurs
+	 *    between checking job_no and calling sleep_next_period().
+	 *    A proper solution would require adding another callback
+	 *    in the plugin structure and testing the condition with
+	 *    interrupts disabled.
+	 *
+	 * FIXME: At least problem 2 should be taken care of eventually.
+	 */
+	while (job > tsk_rt(current)->job_params.job_no) {
+		err = litmus->complete_job();
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/*	Helper syscall that reports the current job sequence number of the
+ *	calling task.
+ *
+ *	returns EPERM if the caller is not a real-time task
+ *	otherwise returns the result of put_user() for the job number
+ *	(0 on success)
+ */
+asmlinkage long sys_query_job_no(unsigned int __user *job)
+{
+	if (!is_realtime(current))
+		return -EPERM;
+
+	return put_user(current->rt_param.job_params.job_no, job);
+}
+
+/* sys_null_call() exists only to measure raw system call overheads
+ * (kernel entry, kernel exit); it has no useful side effects.
+ * If ts is non-NULL, the current cycle count is written to it and the
+ * result of put_user() is returned; otherwise 0 is returned.
+ */
+asmlinkage long sys_null_call(cycles_t __user *ts)
+{
+	cycles_t now;
+
+	if (!ts)
+		return 0;
+
+	now = get_cycles();
+	return put_user(now, ts);
+}
+
+/* p is a real-time task. Re-init its state as a best-effort task.
+ *
+ * The whole rt_param structure of p is zeroed. If restore is non-zero, the
+ * user-provided task parameters and the control page pointer survive the
+ * reset; otherwise nothing is preserved.
+ */
+static void reinit_litmus_state(struct task_struct* p, int restore)
+{
+	struct rt_task  saved_params = {};
+	void*  saved_ctrl_page = NULL;
+
+	/* Save the user-space provided configuration data
+	 * and the allocated page. */
+	if (restore) {
+		saved_params    = p->rt_param.task_params;
+		saved_ctrl_page = p->rt_param.ctrl_page;
+	}
+
+	/* We probably should not be inheriting any task's priority
+	 * at this point in time.
+	 */
+	WARN_ON(p->rt_param.inh_task);
+
+	/* Wipe everything. */
+	memset(&p->rt_param, 0, sizeof(p->rt_param));
+
+	if (!restore)
+		return;
+
+	/* Put the preserved fields back. */
+	p->rt_param.task_params = saved_params;
+	p->rt_param.ctrl_page   = saved_ctrl_page;
+}
+
+/* Admit tsk as a real-time task under the active plugin.
+ *
+ * Validates the task parameters, allocates the per-task heap node and
+ * release heap, and asks the active plugin (litmus->admit_task) for
+ * admission. On success the task is traced and rt_task_count is
+ * incremented.
+ *
+ * Returns 0 on success,
+ *         -EINVAL if the relative deadline is zero or the execution cost
+ *                 exceeds min(relative deadline, period),
+ *         -ENOMEM if the heap node or the release heap cannot be allocated,
+ *         or the error returned by the plugin's admit_task().
+ *
+ * On failure everything allocated here is released again and the task's
+ * heap_node / rel_heap pointers are reset to NULL, so that no dangling
+ * pointers are left behind in the task.
+ */
+long litmus_admit_task(struct task_struct* tsk)
+{
+	long retval = 0;
+
+	BUG_ON(is_realtime(tsk));
+
+	tsk_rt(tsk)->heap_node = NULL;
+	tsk_rt(tsk)->rel_heap = NULL;
+
+	if (get_rt_relative_deadline(tsk) == 0 ||
+	    get_exec_cost(tsk) >
+			min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
+		/* Cast explicitly so that the arguments are guaranteed to
+		 * match the %llu conversions on every architecture.
+		 */
+		TRACE_TASK(tsk,
+			"litmus admit: invalid task parameters "
+			"(e = %llu, p = %llu, d = %llu)\n",
+			(unsigned long long) get_exec_cost(tsk),
+			(unsigned long long) get_rt_period(tsk),
+			(unsigned long long) get_rt_relative_deadline(tsk));
+		retval = -EINVAL;
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&tsk_rt(tsk)->list);
+
+	/* allocate heap node for this task */
+	tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
+	tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
+
+	if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
+		printk(KERN_WARNING "litmus: no more heap node memory!?\n");
+
+		retval = -ENOMEM;
+		goto out;
+	} else {
+		bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
+	}
+
+	preempt_disable();
+
+	retval = litmus->admit_task(tsk);
+
+	if (!retval) {
+		sched_trace_task_name(tsk);
+		sched_trace_task_param(tsk);
+		atomic_inc(&rt_task_count);
+	}
+
+	preempt_enable();
+
+out:
+	if (retval) {
+		/* Undo the allocations and forget the freed objects. */
+		if (tsk_rt(tsk)->heap_node) {
+			bheap_node_free(tsk_rt(tsk)->heap_node);
+			tsk_rt(tsk)->heap_node = NULL;
+		}
+		if (tsk_rt(tsk)->rel_heap) {
+			release_heap_free(tsk_rt(tsk)->rel_heap);
+			tsk_rt(tsk)->rel_heap = NULL;
+		}
+	}
+	return retval;
+}
+
+/* Release the per-task heap node and release heap of tsk, account for one
+ * real-time task less, and reset the task's LITMUS state while preserving
+ * the user-provided parameters and the control page.
+ * The heap node must not be queued in a heap anymore.
+ */
+void litmus_clear_state(struct task_struct* tsk)
+{
+	BUG_ON(bheap_node_in_heap(tsk->rt_param.heap_node));
+
+	bheap_node_free(tsk->rt_param.heap_node);
+	release_heap_free(tsk->rt_param.rel_heap);
+
+	atomic_dec(&rt_task_count);
+
+	reinit_litmus_state(tsk, 1);
+}
+
+/* called from sched_setscheduler()
+ *
+ * For a real-time task, trace its completion and hand it to the active
+ * plugin's task_exit() callback. Nothing happens for other tasks.
+ */
+void litmus_exit_task(struct task_struct* tsk)
+{
+	if (!is_realtime(tsk))
+		return;
+
+	sched_trace_task_completion(tsk, 1);
+	litmus->task_exit(tsk);
+}
+
+/* Serializes plugin switches against code that must not run concurrently
+ * with a switch: switch_sched_plugin() holds it for writing, the two
+ * helpers below take and release it for reading.
+ */
+static DECLARE_RWSEM(plugin_switch_mutex);
+
+/* Block plugin switches until litmus_plugin_switch_enable() is called. */
+void litmus_plugin_switch_disable(void)
+{
+	down_read(&plugin_switch_mutex);
+}
+
+/* Counterpart of litmus_plugin_switch_disable(). */
+void litmus_plugin_switch_enable(void)
+{
+	up_read(&plugin_switch_mutex);
+}
+
+/* Perform the actual plugin switch; invoked through stop_machine() by
+ * switch_sched_plugin(). _plugin is the struct sched_plugin to activate.
+ *
+ * Returns -EBUSY if real-time tasks exist, the error of the current
+ * plugin's deactivate_plugin() if that fails (nothing is switched then),
+ * or the result of the new plugin's activate_plugin(). If activation
+ * fails, the Linux plugin is installed instead and the activation error
+ * is still returned.
+ */
+static int do_plugin_switch(void *_plugin)
+{
+	struct sched_plugin* next = _plugin;
+	int ret;
+
+	/* don't switch if there are active real-time tasks */
+	if (atomic_read(&rt_task_count) != 0)
+		return -EBUSY;
+
+	ret = litmus->deactivate_plugin();
+	if (ret != 0)
+		return ret;
+
+	ret = next->activate_plugin();
+	if (ret != 0) {
+		printk(KERN_INFO "Can't activate %s (%d).\n",
+		       next->plugin_name, ret);
+		/* fall back to the Linux plugin */
+		next = &linux_sched_plugin;
+	}
+
+	printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", next->plugin_name);
+	litmus = next;
+
+	return ret;
+}
+
+/* Switching a plugin in use is tricky.
+ * We must watch out that no real-time tasks exists
+ * (and that none is created in parallel) and that the plugin is not
+ * currently in use on any processor (in theory).
+ *
+ * Returns -EBUSY if real-time tasks exist, otherwise the result of
+ * do_plugin_switch() as reported by stop_machine(). The domain proc
+ * entries are torn down before the switch and re-created afterwards for
+ * whichever plugin is active then, provided it supplies domain info.
+ */
+int switch_sched_plugin(struct sched_plugin* plugin)
+{
+	struct domain_proc_info* domain_info;
+	int err;
+
+	BUG_ON(!plugin);
+
+	if (atomic_read(&rt_task_count) != 0)
+		return -EBUSY;
+
+	down_write(&plugin_switch_mutex);
+
+	deactivate_domain_proc();
+
+	err =  stop_machine(do_plugin_switch, plugin, NULL);
+
+	if (litmus->get_domain_proc_info(&domain_info) == 0)
+		activate_domain_proc(domain_info);
+
+	up_write(&plugin_switch_mutex);
+
+	return err;
+}
+
+/* Called upon fork.
+ * p is the newly forked task.
+ */
+void litmus_fork(struct task_struct* p)
+{
+	if (!is_realtime(p)) {
+		/* non-rt tasks might have ctrl_page set */
+		tsk_rt(p)->ctrl_page = NULL;
+	} else {
+		/* wipe all litmus related state, preserving nothing */
+		reinit_litmus_state(p, 0);
+		/* The child must not be a real-time task. */
+		p->sched_reset_on_fork = 1;
+	}
+
+	/* od tables are never inherited across a fork */
+	p->od_table = NULL;
+}
+
+/* Called upon execve().
+ * current is doing the exec.
+ * Don't let address space specific stuff leak: a real-time task that owns
+ * a control page has it freed here.
+ */
+void litmus_exec(void)
+{
+	struct task_struct* p = current;
+
+	if (!is_realtime(p))
+		return;
+
+	WARN_ON(p->rt_param.inh_task);
+
+	if (!tsk_rt(p)->ctrl_page)
+		return;
+
+	free_page((unsigned long) tsk_rt(p)->ctrl_page);
+	tsk_rt(p)->ctrl_page = NULL;
+}
+
+/* Called when dead_tsk is being deallocated
+ */
+void exit_litmus(struct task_struct *dead_tsk)
+{
+	void *page = tsk_rt(dead_tsk)->ctrl_page;
+
+	/* Non-RT tasks may allocate control pages as well (to allow
+	 * measurements with non-RT tasks), so the page is checked for and
+	 * freed regardless of the kind of task.
+	 */
+	if (page) {
+		TRACE_TASK(dead_tsk,
+			   "freeing ctrl_page %p\n",
+			   page);
+		free_page((unsigned long) page);
+	}
+
+	/* Tasks should not be real-time tasks any longer at this point. */
+	BUG_ON(is_realtime(dead_tsk));
+}
+
+/* exiting_tsk called do_exit(), but is still a real-time task. To avoid
+ * complications later, it is forced to be a non-real-time task now by
+ * switching it to SCHED_FIFO at priority MAX_RT_PRIO - 1.
+ */
+void litmus_do_exit(struct task_struct *exiting_tsk)
+{
+	struct sched_param fifo_param = {
+		.sched_priority = MAX_RT_PRIO - 1,
+	};
+
+	TRACE_TASK(exiting_tsk, "exiting, demoted to SCHED_FIFO\n");
+	sched_setscheduler_nocheck(exiting_tsk, SCHED_FIFO, &fifo_param);
+}
+
+/* Release the real-time data of tsk: let the active plugin clean up via
+ * its task_cleanup() callback first, then free the generic per-task state
+ * with litmus_clear_state().
+ */
+void litmus_dealloc(struct task_struct *tsk)
+{
+	/* tsk is no longer a real-time task */
+	TRACE_TASK(tsk, "Deallocating real-time task data\n");
+	litmus->task_cleanup(tsk);
+	litmus_clear_state(tsk);
+}
+
+#ifdef CONFIG_MAGIC_SYSRQ
+int sys_kill(int pid, int sig);
+
+/* Magic SysRq handler: send SIGKILL to every process that is a real-time
+ * task. The process list is walked with tasklist_lock held for reading.
+ */
+static void sysrq_handle_kill_rt_tasks(int key)
+{
+	struct task_struct *t;
+
+	read_lock(&tasklist_lock);
+	for_each_process(t) {
+		if (!is_realtime(t))
+			continue;
+		sys_kill(t->pid, SIGKILL);
+	}
+	read_unlock(&tasklist_lock);
+}
+
+/* SysRq operation that kills all real-time tasks; registered for the
+ * key 'x' in _init_litmus().
+ */
+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
+	.handler	= sysrq_handle_kill_rt_tasks,
+	.help_msg	= "quit-rt-tasks(X)",
+	.action_msg	= "sent SIGKILL to all LITMUS^RT real-time tasks",
+};
+#endif
+
+extern struct sched_plugin linux_sched_plugin;
+
+/* Reboot notifier callback: attempt to switch back to regular Linux
+ * scheduling, which forces the active plugin to clean up. All arguments
+ * are ignored. Always returns NOTIFY_DONE; a failed switch is only logged.
+ */
+static int litmus_shutdown_nb(struct notifier_block *unused1,
+				unsigned long unused2, void *unused3)
+{
+	/* nothing to do if the Linux plugin is already active */
+	if (litmus == &linux_sched_plugin)
+		return NOTIFY_DONE;
+
+	if (switch_sched_plugin(&linux_sched_plugin))
+		printk("Auto-shutdown of active Litmus plugin failed.\n");
+
+	return NOTIFY_DONE;
+}
+
+/* Registered as a reboot notifier in _init_litmus() and unregistered in
+ * _exit_litmus().
+ */
+static struct notifier_block shutdown_notifier = {
+	.notifier_call = litmus_shutdown_nb,
+};
+
+/* Module init: register the Linux plugin, create the slab caches for heap
+ * nodes and release heaps, register the 'x' magic sysrq (if enabled), set
+ * up the procfs interface and the topology information (if enabled), and
+ * install the reboot notifier. Always returns 0.
+ */
+static int __init _init_litmus(void)
+{
+	/*      Common initializers,
+	 *      mode change lock is used to enforce single mode change
+	 *      operation.
+	 */
+	printk("Starting LITMUS^RT kernel\n");
+
+	register_sched_plugin(&linux_sched_plugin);
+
+	/* SLAB_PANIC is passed for both caches */
+	bheap_node_cache    = KMEM_CACHE(bheap_node, SLAB_PANIC);
+	release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+	/* offer some debugging help */
+	if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
+		printk("Registered kill rt tasks magic sysrq.\n");
+	else
+		printk("Could not register kill rt tasks magic sysrq.\n");
+#endif
+
+	/* NOTE(review): the return value of init_litmus_proc() is ignored,
+	 * so a failure to create the procfs entries does not fail the init.
+	 */
+	init_litmus_proc();
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+	init_topology();
+#endif
+
+	register_reboot_notifier(&shutdown_notifier);
+
+	return 0;
+}
+
+/* Module exit: unregister the reboot notifier, remove the procfs
+ * interface, and destroy the two slab caches created in _init_litmus().
+ */
+static void _exit_litmus(void)
+{
+	unregister_reboot_notifier(&shutdown_notifier);
+
+	exit_litmus_proc();
+	kmem_cache_destroy(bheap_node_cache);
+	kmem_cache_destroy(release_heap_cache);
+}
+
+module_init(_init_litmus);
+module_exit(_exit_litmus);
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
new file mode 100644
index 0000000..4db3fe2
--- /dev/null
+++ b/litmus/litmus_proc.c
@@ -0,0 +1,576 @@
+/*
+ * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+
+#include <litmus/litmus.h>
+#include <litmus/litmus_proc.h>
+
+#include <litmus/clustered.h>
+
+/* in litmus/litmus.c */
+extern atomic_t rt_task_count;
+
+/* Handles of the procfs entries created by init_litmus_proc() and removed
+ * again by exit_litmus_proc(). A NULL handle means "not created".
+ */
+static struct proc_dir_entry *litmus_dir = NULL,
+	*curr_file = NULL,
+	*stat_file = NULL,
+	*plugs_dir = NULL,
+#ifdef CONFIG_RELEASE_MASTER
+	*release_master_file = NULL,
+#endif
+	*plugs_file = NULL,
+	*domains_dir = NULL,
+	*cpus_dir = NULL;
+
+
+/* in litmus/sync.c */
+int count_tasks_waiting_for_release(void);
+
+/* seq_file show callback of /proc/litmus/stats: prints the number of
+ * real-time tasks and the number of tasks waiting for a release.
+ */
+static int litmus_stats_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m,
+		   "real-time tasks   = %d\n"
+		   "ready for release = %d\n",
+		   atomic_read(&rt_task_count),
+		   count_tasks_waiting_for_release());
+
+	return 0;
+}
+
+/* open() of /proc/litmus/stats: bind the seq_file to
+ * litmus_stats_proc_show().
+ */
+static int litmus_stats_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_stats_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of the read-only "stats" entry. */
+static const struct file_operations litmus_stats_proc_fops = {
+	.open		= litmus_stats_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
+/* seq_file show callback of /proc/litmus/plugins/loaded: the output is
+ * produced by print_sched_plugins().
+ */
+static int litmus_loaded_proc_show(struct seq_file *m, void *v)
+{
+	print_sched_plugins(m);
+	return 0;
+}
+
+/* open() of /proc/litmus/plugins/loaded: bind the seq_file to
+ * litmus_loaded_proc_show().
+ */
+static int litmus_loaded_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_loaded_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of the read-only "loaded" entry. */
+static const struct file_operations litmus_loaded_proc_fops = {
+	.open		= litmus_loaded_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+
+
+
+/* in litmus/litmus.c */
+int switch_sched_plugin(struct sched_plugin*);
+
+/* write() of /proc/litmus/active_plugin: the written text (at most 64
+ * characters, one trailing newline stripped) names the plugin to switch
+ * to.
+ *
+ * Returns the number of bytes consumed on success, the error of
+ * copy_and_chomp() if the name cannot be read, -ESRCH if no plugin of
+ * that name is found, or the error of switch_sched_plugin().
+ */
+static ssize_t litmus_active_proc_write(struct file *file,
+					const char __user *buffer, size_t count,
+					loff_t *ppos)
+{
+	char name[65];
+	struct sched_plugin* found;
+	ssize_t consumed;
+	int err;
+
+	consumed = copy_and_chomp(name, sizeof(name), buffer, count);
+	if (consumed < 0)
+		return consumed;
+
+	found = find_sched_plugin(name);
+	if (!found) {
+		printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
+		return -ESRCH;
+	}
+
+	err = switch_sched_plugin(found);
+	if (err) {
+		printk(KERN_INFO "Could not switch plugin: %d\n", err);
+		return err;
+	}
+
+	return consumed;
+}
+
+/* seq_file show callback of /proc/litmus/active_plugin: prints the name
+ * of the currently active plugin.
+ */
+static int litmus_active_proc_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "%s\n", litmus->plugin_name);
+	return 0;
+}
+
+/* open() of /proc/litmus/active_plugin: bind the seq_file to
+ * litmus_active_proc_show().
+ */
+static int litmus_active_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_active_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of "active_plugin": reading shows the active plugin,
+ * writing requests a plugin switch.
+ */
+static const struct file_operations litmus_active_proc_fops = {
+	.open		= litmus_active_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= litmus_active_proc_write,
+};
+
+
+#ifdef CONFIG_RELEASE_MASTER
+/* write() of /proc/litmus/release_master: select the release master CPU.
+ *
+ * Accepts either the literal "NO_CPU" or the decimal number of an online
+ * CPU (at most 63 characters, one trailing newline stripped).
+ *
+ * Returns the number of bytes consumed on success, the error of
+ * copy_and_chomp() if the input cannot be read, or -EINVAL if the input
+ * is not a number, is out of range, or names a CPU that is not online.
+ */
+static ssize_t litmus_release_master_proc_write(
+	struct file *file,
+	const char __user *buffer, size_t count,
+	loff_t *ppos)
+{
+	/* cpu gets a defined value so that the trace message below never
+	 * prints an uninitialized variable when parsing fails.
+	 */
+	int cpu = -1, err, online = 0;
+	char msg[64];
+	ssize_t len;
+
+	len = copy_and_chomp(msg, sizeof(msg), buffer, count);
+
+	if (len < 0)
+		return len;
+
+	if (strcmp(msg, "NO_CPU") == 0)
+		atomic_set(&release_master_cpu, NO_CPU);
+	else {
+		err = sscanf(msg, "%d", &cpu);
+		/* The number comes from user space: make sure it is a valid
+		 * CPU index before cpu_online() looks it up in the online
+		 * mask.
+		 */
+		if (err == 1 && cpu >= 0 && cpu < nr_cpu_ids &&
+		    (online = cpu_online(cpu))) {
+			atomic_set(&release_master_cpu, cpu);
+		} else {
+			TRACE("invalid release master: '%s' "
+			      "(err:%d cpu:%d online:%d)\n",
+			      msg, err, cpu, online);
+			len = -EINVAL;
+		}
+	}
+	return len;
+}
+
+/* seq_file show callback of /proc/litmus/release_master: prints the number
+ * of the release master CPU, or "NO_CPU" if none is selected.
+ */
+static int litmus_release_master_proc_show(struct seq_file *m, void *v)
+{
+	int master = atomic_read(&release_master_cpu);
+
+	if (master != NO_CPU)
+		seq_printf(m, "%d\n", master);
+	else
+		seq_printf(m, "NO_CPU\n");
+
+	return 0;
+}
+
+/* open() of /proc/litmus/release_master: bind the seq_file to
+ * litmus_release_master_proc_show().
+ */
+static int litmus_release_master_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_release_master_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of "release_master": reading shows the current release
+ * master, writing selects a new one.
+ */
+static const struct file_operations litmus_release_master_proc_fops = {
+	.open		= litmus_release_master_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= litmus_release_master_proc_write,
+};
+#endif
+
+/* Create the /proc/litmus directory tree:
+ *
+ *   litmus/active_plugin    (rw)
+ *   litmus/release_master   (rw, only with CONFIG_RELEASE_MASTER)
+ *   litmus/stats            (ro)
+ *   litmus/plugins/loaded   (ro)
+ *   litmus/domains/
+ *   litmus/cpus/
+ *
+ * Returns 0 on success and -ENOMEM as soon as one of the checked entries
+ * cannot be created. Entries created before the failure are left in
+ * place; exit_litmus_proc() removes whatever exists.
+ */
+int __init init_litmus_proc(void)
+{
+	litmus_dir = proc_mkdir("litmus", NULL);
+	if (!litmus_dir) {
+		printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
+		return -ENOMEM;
+	}
+
+	curr_file = proc_create("active_plugin", 0644, litmus_dir,
+				&litmus_active_proc_fops);
+
+	if (!curr_file) {
+		printk(KERN_ERR "Could not allocate active_plugin "
+		       "procfs entry.\n");
+		return -ENOMEM;
+	}
+
+#ifdef CONFIG_RELEASE_MASTER
+	release_master_file = proc_create("release_master", 0644, litmus_dir,
+					  &litmus_release_master_proc_fops);
+	if (!release_master_file) {
+		printk(KERN_ERR "Could not allocate release_master "
+		       "procfs entry.\n");
+		return -ENOMEM;
+	}
+#endif
+
+	/* NOTE(review): unlike the other entries, the result of creating
+	 * "stats" is not checked.
+	 */
+	stat_file = proc_create("stats", 0444, litmus_dir, &litmus_stats_proc_fops);
+
+	plugs_dir = proc_mkdir("plugins", litmus_dir);
+	if (!plugs_dir){
+		printk(KERN_ERR "Could not allocate plugins directory "
+				"procfs entry.\n");
+		return -ENOMEM;
+	}
+
+	/* NOTE(review): the result of creating "loaded" is not checked
+	 * either.
+	 */
+	plugs_file = proc_create("loaded", 0444, plugs_dir,
+				 &litmus_loaded_proc_fops);
+
+	domains_dir = proc_mkdir("domains", litmus_dir);
+	if (!domains_dir) {
+		printk(KERN_ERR "Could not allocate domains directory "
+				"procfs entry.\n");
+		return -ENOMEM;
+	}
+
+	cpus_dir = proc_mkdir("cpus", litmus_dir);
+	if (!cpus_dir) {
+		printk(KERN_ERR "Could not allocate cpus directory "
+				"procfs entry.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/* Remove the /proc/litmus directory tree. Only entries whose handle is
+ * non-NULL are removed, and entries are removed before the directory that
+ * contains them.
+ */
+void exit_litmus_proc(void)
+{
+	if (cpus_dir || domains_dir) {
+		/* drop the per-CPU / per-domain files before their directories */
+		deactivate_domain_proc();
+		if (cpus_dir)
+			remove_proc_entry("cpus", litmus_dir);
+		if (domains_dir)
+			remove_proc_entry("domains", litmus_dir);
+	}
+	if (plugs_file)
+		remove_proc_entry("loaded", plugs_dir);
+	if (plugs_dir)
+		remove_proc_entry("plugins", litmus_dir);
+	if (stat_file)
+		remove_proc_entry("stats", litmus_dir);
+	if (curr_file)
+		remove_proc_entry("active_plugin", litmus_dir);
+#ifdef CONFIG_RELEASE_MASTER
+	if (release_master_file)
+		remove_proc_entry("release_master", litmus_dir);
+#endif
+	/* the top-level directory goes last */
+	if (litmus_dir)
+		remove_proc_entry("litmus", NULL);
+}
+
+/* Create the directory /proc/litmus/plugins/<plugin name>.
+ *
+ * On success, 0 is returned and *pde_in refers to the new directory.
+ * On failure, *pde_in is set to NULL and the return value is
+ *   -EINVAL if plugin is NULL or has no name,
+ *   -ENOENT if the "plugins" directory does not exist,
+ *   -ENOMEM if the directory cannot be created.
+ */
+long make_plugin_proc_dir(struct sched_plugin* plugin,
+		struct proc_dir_entry** pde_in)
+{
+	struct proc_dir_entry *dir = NULL;
+	long rv = 0;
+
+	if (!plugin || !plugin->plugin_name) {
+		printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
+				__func__);
+		rv = -EINVAL;
+	} else if (!plugs_dir) {
+		printk(KERN_ERR "Could not make plugin sub-directory, because "
+				"/proc/litmus/plugins does not exist.\n");
+		rv = -ENOENT;
+	} else {
+		dir = proc_mkdir(plugin->plugin_name, plugs_dir);
+		if (!dir) {
+			printk(KERN_ERR "Could not make plugin sub-directory: "
+					"out of memory?.\n");
+			rv = -ENOMEM;
+		}
+	}
+
+	/* dir is still NULL on every failure path */
+	*pde_in = dir;
+
+	return rv;
+}
+
+/* Remove the directory /proc/litmus/plugins/<plugin name>. An invalid
+ * plugin (NULL or without a name) is reported and otherwise ignored.
+ */
+void remove_plugin_proc_dir(struct sched_plugin* plugin)
+{
+	if (plugin && plugin->plugin_name) {
+		remove_proc_entry(plugin->plugin_name, plugs_dir);
+		return;
+	}
+
+	printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
+			__func__);
+}
+
+
+
+/* misc. I/O helper functions */
+
+/* Copy at most ksize - 1 bytes of the user buffer ubuf (of length ulength)
+ * into kbuf, terminate the result with a null byte, and strip one trailing
+ * newline if there is one.
+ *
+ * Returns the number of bytes copied (the stripped newline included), or
+ * -EFAULT if the user buffer cannot be read. ksize must not be 0.
+ */
+int copy_and_chomp(char *kbuf, unsigned long ksize,
+		   __user const char* ubuf, unsigned long ulength)
+{
+	unsigned long n;
+
+	/* caller must provide buffer space */
+	BUG_ON(!ksize);
+
+	/* one byte is reserved for the terminating null byte */
+	n = (ksize - 1 > ulength) ? ulength : ksize - 1;
+
+	if (copy_from_user(kbuf, ubuf, n))
+		return -EFAULT;
+
+	kbuf[n] = '\0';
+
+	/* chomp */
+	if (n > 0 && kbuf[n - 1] == '\n')
+		kbuf[n - 1] = '\0';
+
+	return n;
+}
+
+/* helper functions for clustered plugins */
+
+/* Textual names of the cache levels; indexed with enum cache_level values
+ * in the range GLOBAL_CLUSTER..L3_CLUSTER by the two helpers below.
+ */
+static const char* cache_level_names[] = {
+	"ALL",
+	"L1",
+	"L2",
+	"L3",
+};
+
+/* Translate the textual cache level cache_name into its enum value.
+ * Returns 0 and stores the value in *level if the name is known;
+ * returns -EINVAL and leaves *level untouched otherwise.
+ */
+int parse_cache_level(const char *cache_name, enum cache_level *level)
+{
+	int lvl;
+
+	/* quick and dirty linear search over the known names */
+	for (lvl = GLOBAL_CLUSTER; lvl <= L3_CLUSTER; lvl++) {
+		if (strcmp(cache_name, cache_level_names[lvl]) == 0) {
+			*level = (enum cache_level) lvl;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/* Textual name of the given cache level, or "INVALID" if level is outside
+ * the range GLOBAL_CLUSTER..L3_CLUSTER.
+ */
+const char* cache_level_name(enum cache_level level)
+{
+	int idx = level;
+
+	if (idx < GLOBAL_CLUSTER || idx > L3_CLUSTER)
+		return "INVALID";
+
+	return cache_level_names[idx];
+}
+
+
+
+
+/* proc file interface to configure the cluster size */
+
+/* write() of a "cluster" file: parse the written cache level name (at most
+ * 7 characters, one trailing newline stripped) and store the result in the
+ * enum cache_level variable attached to the proc entry.
+ *
+ * Returns the number of bytes consumed, the result of copy_and_chomp() if
+ * that is zero or negative, or -EINVAL if the name is unknown.
+ */
+static ssize_t litmus_cluster_proc_write(struct file *file,
+					const char __user *buffer, size_t count,
+					loff_t *ppos)
+{
+	enum cache_level *level = (enum cache_level *) PDE_DATA(file_inode(file));
+	char cache_name[8];
+	ssize_t len;
+
+	len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
+	if (len <= 0)
+		return len;
+
+	if (parse_cache_level(cache_name, level) != 0) {
+		printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
+		return -EINVAL;
+	}
+
+	return len;
+}
+
+/* seq_file show callback of a "cluster" file: prints the name of the cache
+ * level stored in the variable attached to the proc entry.
+ */
+static int litmus_cluster_proc_show(struct seq_file *m, void *v)
+{
+	enum cache_level *level = (enum cache_level *)  m->private;
+
+	seq_printf(m, "%s\n", cache_level_name(*level));
+	return 0;
+}
+
+/* open() of a "cluster" file: bind the seq_file to
+ * litmus_cluster_proc_show() and pass the entry's data pointer along.
+ */
+static int litmus_cluster_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_cluster_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of a "cluster" file: reading shows the configured cache
+ * level, writing changes it.
+ */
+static const struct file_operations litmus_cluster_proc_fops = {
+	.open		= litmus_cluster_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= litmus_cluster_proc_write,
+};
+
+/* Create a read/write "cluster" file below parent that exposes the cache
+ * level variable level points to. Returns the new entry, or NULL (after
+ * logging an error) if it cannot be created.
+ */
+struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
+					   enum cache_level* level)
+{
+	struct proc_dir_entry* entry = proc_create_data("cluster", 0644,
+							parent,
+							&litmus_cluster_proc_fops,
+							(void *) level);
+
+	if (entry == NULL)
+		printk(KERN_ERR
+		       "Could not cluster procfs entry.\n");
+
+	return entry;
+}
+
+/* Mapping whose per-CPU / per-domain proc files currently exist; set by
+ * activate_domain_proc() and cleared by deactivate_domain_proc().
+ */
+static struct domain_proc_info* active_mapping = NULL;
+
+/* seq_file show callback of a per-CPU / per-domain mapping file: prints
+ * the CPU mask of the mapping attached to the proc entry. Prints nothing
+ * if no mapping is attached.
+ */
+static int litmus_mapping_proc_show(struct seq_file *m, void *v)
+{
+	struct cd_mapping *mapping = (struct cd_mapping*) m->private;
+	char buf[256];
+
+	if (mapping) {
+		cpumask_scnprintf(buf, sizeof(buf), mapping->mask);
+		buf[sizeof(buf) - 1] = '\0'; /* just in case... */
+		seq_printf(m, "%s\n", buf);
+	}
+
+	return 0;
+}
+
+/* open() of a mapping file: bind the seq_file to
+ * litmus_mapping_proc_show() and pass the entry's data pointer along.
+ */
+static int litmus_mapping_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, litmus_mapping_proc_show, PDE_DATA(inode));
+}
+
+/* File operations of the read-only files created by
+ * activate_domain_proc().
+ */
+static const struct file_operations litmus_domain_proc_fops = {
+	.open		= litmus_mapping_proc_open,
+	.read		= seq_read,
+	.llseek 	= seq_lseek,
+	.release 	= single_release,
+};
+
+/* Publish map in procfs: one read-only file per entry of cpu_to_domains
+ * below "cpus" and one per entry of domain_to_cpus below "domains", each
+ * named after the id of its entry. A previously active mapping is
+ * deactivated first.
+ *
+ * Returns 0 on success, or -EINVAL if map is NULL or one of the two
+ * directories does not exist.
+ */
+long activate_domain_proc(struct domain_proc_info* map)
+{
+	struct cd_mapping* entry;
+	char name[8];
+	int idx;
+
+	if (!map || cpus_dir == NULL || domains_dir == NULL)
+		return -EINVAL;
+
+	if (active_mapping)
+		deactivate_domain_proc();
+
+	active_mapping = map;
+
+	for (idx = 0; idx < map->num_cpus; idx++) {
+		entry = &map->cpu_to_domains[idx];
+		snprintf(name, sizeof(name), "%d", entry->id);
+		entry->proc_file = proc_create_data(name, 0444, cpus_dir,
+			&litmus_domain_proc_fops, (void*)entry);
+	}
+
+	for (idx = 0; idx < map->num_domains; idx++) {
+		entry = &map->domain_to_cpus[idx];
+		snprintf(name, sizeof(name), "%d", entry->id);
+		entry->proc_file = proc_create_data(name, 0444, domains_dir,
+			&litmus_domain_proc_fops, (void*)entry);
+	}
+
+	return 0;
+}
+
+/* Remove the proc files of the currently active mapping and forget the
+ * mapping. Returns 0 on success, or -EINVAL if no mapping is active.
+ */
+long deactivate_domain_proc()
+{
+	struct domain_proc_info* map = active_mapping;
+	struct cd_mapping* entry;
+	char name[65];
+	int idx;
+
+	if (map == NULL)
+		return -EINVAL;
+
+	/* files below "cpus" */
+	for (idx = 0; idx < map->num_cpus; idx++) {
+		entry = &map->cpu_to_domains[idx];
+		snprintf(name, sizeof(name), "%d", entry->id);
+		remove_proc_entry(name, cpus_dir);
+		entry->proc_file = NULL;
+	}
+
+	/* files below "domains" */
+	for (idx = 0; idx < map->num_domains; idx++) {
+		entry = &map->domain_to_cpus[idx];
+		snprintf(name, sizeof(name), "%d", entry->id);
+		remove_proc_entry(name, domains_dir);
+		entry->proc_file = NULL;
+	}
+
+	active_mapping = NULL;
+
+	return 0;
+}
+
+/* Allocate the two mapping arrays of m (num_cpus entries in cpu_to_domains,
+ * num_domains entries in domain_to_cpus) and a zeroed CPU mask for every
+ * entry. The arrays themselves are zero-initialized.
+ *
+ * Only the arrays and masks are set up here; no other field of m is
+ * written.
+ *
+ * Returns 0 on success. On failure, -ENOMEM is returned, everything that
+ * was allocated is released again, and both array pointers in m are NULL.
+ */
+long init_domain_proc_info(struct domain_proc_info* m,
+				int num_cpus, int num_domains)
+{
+	int i;
+	int num_alloced_cpu_masks = 0;
+	int num_alloced_domain_masks = 0;
+
+	/* Start from a known state: the failure path below inspects both
+	 * pointers, also when the very first allocation fails.
+	 */
+	m->cpu_to_domains = NULL;
+	m->domain_to_cpus = NULL;
+
+	/* kcalloc() checks the size multiplication for overflow */
+	m->cpu_to_domains =
+		kcalloc(num_cpus, sizeof(*(m->cpu_to_domains)),
+			GFP_ATOMIC);
+	if(!m->cpu_to_domains)
+		goto failure;
+
+	m->domain_to_cpus =
+		kcalloc(num_domains, sizeof(*(m->domain_to_cpus)),
+			GFP_ATOMIC);
+	if(!m->domain_to_cpus)
+		goto failure;
+
+	for(i = 0; i < num_cpus; ++i) {
+		if(!zalloc_cpumask_var(&m->cpu_to_domains[i].mask, GFP_ATOMIC))
+			goto failure;
+		++num_alloced_cpu_masks;
+	}
+	for(i = 0; i < num_domains; ++i) {
+		if(!zalloc_cpumask_var(&m->domain_to_cpus[i].mask, GFP_ATOMIC))
+			goto failure;
+		++num_alloced_domain_masks;
+	}
+
+	return 0;
+
+failure:
+	/* the counters are only non-zero if the corresponding array exists */
+	for(i = 0; i < num_alloced_cpu_masks; ++i)
+		free_cpumask_var(m->cpu_to_domains[i].mask);
+	for(i = 0; i < num_alloced_domain_masks; ++i)
+		free_cpumask_var(m->domain_to_cpus[i].mask);
+
+	/* kfree(NULL) is a no-op */
+	kfree(m->cpu_to_domains);
+	kfree(m->domain_to_cpus);
+
+	/* do not leave dangling pointers behind */
+	m->cpu_to_domains = NULL;
+	m->domain_to_cpus = NULL;
+
+	return -ENOMEM;
+}
+
+/* Release everything init_domain_proc_info() allocated for m (the CPU mask
+ * of every entry and the two mapping arrays) and reset the whole
+ * structure to zero.
+ *
+ * m->num_cpus and m->num_domains must reflect the sizes of the arrays.
+ */
+void destroy_domain_proc_info(struct domain_proc_info* m)
+{
+	int i;
+	for(i = 0; i < m->num_cpus; ++i)
+		free_cpumask_var(m->cpu_to_domains[i].mask);
+	for(i = 0; i < m->num_domains; ++i)
+		free_cpumask_var(m->domain_to_cpus[i].mask);
+	kfree(m->cpu_to_domains);
+	kfree(m->domain_to_cpus);
+	/* memset() takes the fill value second and the length third */
+	memset(m, 0, sizeof(*m));
+}
diff --git a/litmus/locking.c b/litmus/locking.c
new file mode 100644
index 0000000..43d9aec
--- /dev/null
+++ b/litmus/locking.c
@@ -0,0 +1,188 @@
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/fdso.h>
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/trace.h>
+#include <litmus/wait.h>
+
+static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
+static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
+static int close_generic_lock(struct od_table_entry* entry);
+static void destroy_generic_lock(obj_type_t type, void* sem);
+
+struct fdso_ops generic_lock_ops = {
+	.create  = create_generic_lock,
+	.open    = open_generic_lock,
+	.close   = close_generic_lock,
+	.destroy = destroy_generic_lock
+};
+
+static inline bool is_lock(struct od_table_entry* entry)
+{
+	return entry->class == &generic_lock_ops;
+}
+
+static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
+{
+	BUG_ON(!is_lock(entry));
+	return (struct litmus_lock*) entry->obj->obj;
+}
+
+static  int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
+{
+	struct litmus_lock* lock;
+	int err;
+
+	err = litmus->allocate_lock(&lock, type, arg);
+	if (err == 0)
+		*obj_ref = lock;
+	return err;
+}
+
+static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
+{
+	struct litmus_lock* l = get_lock(entry);
+	/* without an open() hook, any task may open the lock */
+	if (!l->ops->open)
+		return 0;
+	return l->ops->open(l, arg);
+}
+
+static int close_generic_lock(struct od_table_entry* entry)
+{
+	struct litmus_lock* l = get_lock(entry);
+	/* without a close() hook, closing trivially succeeds */
+	if (!l->ops->close)
+		return 0;
+	return l->ops->close(l);
+}
+
+static void destroy_generic_lock(obj_type_t type, void* obj)
+{
+	struct litmus_lock* lock = (struct litmus_lock*) obj;
+	lock->ops->deallocate(lock);
+}
+
+asmlinkage long sys_litmus_lock(int lock_od)
+{
+	long err = -EINVAL; /* returned when lock_od does not name a lock */
+	struct od_table_entry* entry;
+	struct litmus_lock* l;
+
+	TS_SYSCALL_IN_START;
+
+	TS_SYSCALL_IN_END;
+
+	TS_LOCK_START;
+
+	entry = get_entry_for_od(lock_od);
+	if (entry && is_lock(entry)) {
+		l = get_lock(entry);
+		TRACE_CUR("attempts to lock 0x%p\n", l);
+		err = l->ops->lock(l); /* result of the lock's own lock() op */
+	}
+
+	/* Note: task may have been suspended or preempted in between!  Take
+	 * this into account when computing overheads. */
+	TS_LOCK_END;
+
+	TS_SYSCALL_OUT_START;
+
+	return err;
+}
+
+asmlinkage long sys_litmus_unlock(int lock_od)
+{
+	long err = -EINVAL; /* returned when lock_od does not name a lock */
+	struct od_table_entry* entry;
+	struct litmus_lock* l;
+
+	TS_SYSCALL_IN_START;
+
+	TS_SYSCALL_IN_END;
+
+	TS_UNLOCK_START;
+
+	entry = get_entry_for_od(lock_od);
+	if (entry && is_lock(entry)) {
+		l = get_lock(entry);
+		TRACE_CUR("attempts to unlock 0x%p\n", l);
+		err = l->ops->unlock(l); /* result of the lock's own unlock() op */
+	}
+
+	/* Note: task may have been preempted in between!  Take this into
+	 * account when computing overheads. */
+	TS_UNLOCK_END;
+
+	TS_SYSCALL_OUT_START;
+
+	return err;
+}
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
+{
+	wait_queue_t* first;
+	struct task_struct* tsk;
+
+	/* an empty queue is reported with NULL */
+	if (!waitqueue_active(wq))
+		return NULL;
+	first = list_entry(wq->task_list.next, wait_queue_t, task_list);
+	tsk = (struct task_struct*) first->private;
+	__remove_wait_queue(wq, first);
+	return tsk;
+}
+
+unsigned int __add_wait_queue_prio_exclusive(
+	wait_queue_head_t* head,
+	prio_wait_queue_t *new)
+{
+	struct list_head *pos;
+	unsigned int passed = 0; /* entries skipped before the insertion point */
+
+	new->wq.flags |= WQ_FLAG_EXCLUSIVE;
+
+	/* find a spot where the new entry is less than the next */
+	list_for_each(pos, &head->task_list) {
+		prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
+						       wq.task_list);
+
+		if (unlikely(lt_before(new->priority, queued->priority) ||
+			     (new->priority == queued->priority &&
+			      new->tie_breaker < queued->tie_breaker))) {
+			/* pos is not less than new, thus insert here */
+			__list_add(&new->wq.task_list, pos->prev, pos);
+			goto out;
+		}
+		passed++;
+	}
+
+	/* If we get to this point, either the list is empty or every
+	 * queued element is less than new.
+	 * Let's add new to the end. */
+	list_add_tail(&new->wq.task_list, &head->task_list);
+out:
+	return passed;
+}
+
+
+#else
+
+struct fdso_ops generic_lock_ops = {};
+
+asmlinkage long sys_litmus_lock(int sem_od)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_litmus_unlock(int sem_od)
+{
+	return -ENOSYS;
+}
+
+#endif
diff --git a/litmus/preempt.c b/litmus/preempt.c
new file mode 100644
index 0000000..6be2f26
--- /dev/null
+++ b/litmus/preempt.c
@@ -0,0 +1,137 @@
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/preempt.h>
+#include <litmus/trace.h>
+
+/* The rescheduling state of each processor.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
+
+void sched_state_will_schedule(struct task_struct* tsk)
+{
+	/* Litmus hack: we only care about processor-local invocations of
+	 * set_tsk_need_resched(). We can't reliably set the flag remotely
+	 * since it might race with other updates to the scheduling state.  We
+	 * can't rely on the runqueue lock protecting updates to the sched
+	 * state since processors do not acquire the runqueue locks for all
+	 * updates to the sched state (to avoid acquiring two runqueue locks at
+	 * the same time). Further, if tsk is residing on a remote processor,
+	 * then that processor doesn't actually know yet that it is going to
+	 * reschedule; it still must receive an IPI (unless a local invocation
+	 * races).
+	 */
+	if (likely(task_cpu(tsk) == smp_processor_id())) {
+		VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
+		if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
+			set_sched_state(PICKED_WRONG_TASK);
+		else
+			set_sched_state(WILL_SCHEDULE);
+	} else
+		/* Litmus tasks should never be subject to a remote
+		 * set_tsk_need_resched(). */
+		BUG_ON(is_realtime(tsk));
+#ifdef CONFIG_PREEMPT_STATE_TRACE
+	TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
+		   __builtin_return_address(0));
+#endif
+}
+
+/* Called by the IPI handler after another CPU called smp_send_resched(). */
+void sched_state_ipi(void)
+{
+	/* If the IPI was slow, we might be in any state right now. The IPI is
+	 * only meaningful if we are in SHOULD_SCHEDULE. */
+	if (is_in_sched_state(SHOULD_SCHEDULE)) {
+		/* Cause scheduler to be invoked.
+		 * This will cause a transition to WILL_SCHEDULE. */
+		set_tsk_need_resched(current);
+		TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
+			    current->comm, current->pid);
+		TS_SEND_RESCHED_END;
+	} else {
+		/* ignore */
+		TRACE_STATE("ignoring IPI in state %x (%s)\n",
+			    get_sched_state(),
+			    sched_state_name(get_sched_state()));
+	}
+}
+
+/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
+ * hold the lock that is used to serialize scheduling decisions. */
+void litmus_reschedule(int cpu)
+{
+	int picked_transition_ok = 0;
+	int scheduled_transition_ok = 0;
+
+	/* The (remote) CPU could be in any state. */
+
+	/* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
+	 * is not aware of the need to reschedule at this point. */
+
+	/* is a context switch in progress? */
+	if (cpu_is_in_sched_state(cpu, TASK_PICKED))
+		picked_transition_ok = sched_state_transition_on(
+			cpu, TASK_PICKED, PICKED_WRONG_TASK);
+
+	if (!picked_transition_ok &&
+	    cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
+		/* We either raced with the end of the context switch, or the
+		 * CPU was in TASK_SCHEDULED anyway. */
+		scheduled_transition_ok = sched_state_transition_on(
+			cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
+	}
+
+	/* If the CPU was in state TASK_SCHEDULED, then we need to cause the
+	 * scheduler to be invoked. */
+	if (scheduled_transition_ok) {
+		if (smp_processor_id() == cpu)
+			set_tsk_need_resched(current);
+		else {
+			TS_SEND_RESCHED_START(cpu);
+			smp_send_reschedule(cpu);
+		}
+	}
+
+	TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
+		    __FUNCTION__,
+		    picked_transition_ok,
+		    scheduled_transition_ok);
+}
+
+void litmus_reschedule_local(void)
+{
+	if (is_in_sched_state(TASK_PICKED))
+		set_sched_state(PICKED_WRONG_TASK);
+	else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
+		set_sched_state(WILL_SCHEDULE);
+		set_tsk_need_resched(current);
+	}
+}
+
+#ifdef CONFIG_DEBUG_KERNEL
+
+void sched_state_plugin_check(void)
+{
+	if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
+		TRACE("!!!! plugin did not call sched_state_task_picked()! "
+		      "Calling sched_state_task_picked() is mandatory---fix this.\n");
+		set_sched_state(TASK_PICKED); /* set the state the plugin omitted */
+	}
+}
+
+#define NAME_CHECK(x) case x:  return #x
+const char* sched_state_name(int s)
+{
+	switch (s) {
+		NAME_CHECK(TASK_SCHEDULED);
+		NAME_CHECK(SHOULD_SCHEDULE);
+		NAME_CHECK(WILL_SCHEDULE);
+		NAME_CHECK(TASK_PICKED);
+		NAME_CHECK(PICKED_WRONG_TASK);
+	default:
+		return "UNKNOWN";
+	};
+}
+
+#endif
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 0000000..e5dec0b
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,353 @@
+/*
+ * litmus/rt_domain.c
+ *
+ * LITMUS real-time infrastructure. This file contains the
+ * functions that manipulate RT domains. RT domains are an abstraction
+ * of a ready queue and a release queue.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/rt_domain.h>
+
+#include <litmus/trace.h>
+
+#include <litmus/bheap.h>
+
+/* Uncomment when debugging timer races... */
+#if 0
+#define VTRACE_TASK TRACE_TASK
+#define VTRACE TRACE
+#else
+#define VTRACE_TASK(t, fmt, args...) /* shut up */
+#define VTRACE(fmt, args...) /* be quiet already */
+#endif
+
+static int dummy_resched(rt_domain_t *rt)
+{
+	return 0;
+}
+
+static int dummy_order(struct bheap_node* a, struct bheap_node* b)
+{
+	return 0;
+}
+
+/* default implementation: use default lock */
+static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+	merge_ready(rt, tasks);
+}
+
+static unsigned int time2slot(lt_t time)
+{
+	return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
+}
+
+static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
+{
+	unsigned long flags;
+	struct release_heap* rh;
+	rh = container_of(timer, struct release_heap, timer);
+
+	TS_RELEASE_LATENCY(rh->release_time);
+
+	VTRACE("on_release_timer(0x%p) starts.\n", timer);
+
+	TS_RELEASE_START;
+
+
+	raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
+	VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
+	/* remove from release queue */
+	list_del(&rh->list);
+	raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
+	VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
+
+	/* call release callback */
+	rh->dom->release_jobs(rh->dom, &rh->heap);
+	/* WARNING: rh can be referenced from other CPUs from now on. */
+
+	TS_RELEASE_END;
+
+	VTRACE("on_release_timer(0x%p) ends.\n", timer);
+
+	return  HRTIMER_NORESTART;
+}
+
+/* allocated in litmus.c */
+struct kmem_cache * release_heap_cache;
+
+struct release_heap* release_heap_alloc(int gfp_flags)
+{
+	struct release_heap* heap = kmem_cache_alloc(release_heap_cache,
+						      gfp_flags);
+	/* allocation failed: hand the NULL straight back */
+	if (!heap)
+		return NULL;
+	hrtimer_init(&heap->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	heap->timer.function = on_release_timer;
+	return heap;
+}
+
+void release_heap_free(struct release_heap* rh)
+{
+	/* make sure timer is no longer in use */
+	hrtimer_cancel(&rh->timer);
+	kmem_cache_free(release_heap_cache, rh);
+}
+
+/* Caller must hold release lock.
+ * Will return heap for given time. If no such heap exists prior to
+ * the invocation it will be created.
+ */
+static struct release_heap* get_release_heap(rt_domain_t *rt,
+					     struct task_struct* t,
+					     int use_task_heap)
+{
+	struct list_head* pos;
+	struct release_heap* heap = NULL;
+	struct release_heap* rh;
+	lt_t release_time = get_release(t);
+	unsigned int slot = time2slot(release_time);
+
+	/* list_for_each() re-initializes pos; on an empty slot it ends at the head */
+	pos = rt->release_queue.slot[slot].next;
+	list_for_each(pos, &rt->release_queue.slot[slot]) {
+		rh = list_entry(pos, struct release_heap, list);
+		if (release_time == rh->release_time) {
+			/* perfect match -- this happens on hyperperiod
+			 * boundaries
+			 */
+			heap = rh;
+			break;
+		} else if (lt_before(release_time, rh->release_time)) {
+			/* we need to insert a new node since rh is
+			 * already in the future
+			 */
+			break;
+		}
+	}
+	if (!heap && use_task_heap) {
+		/* use pre-allocated release heap */
+		rh = tsk_rt(t)->rel_heap;
+
+		rh->dom = rt;
+		rh->release_time = release_time;
+
+		/* add to release queue, right before pos */
+		list_add(&rh->list, pos->prev);
+		heap = rh;
+	}
+	return heap;
+}
+
+static void reinit_release_heap(struct task_struct* t)
+{
+	struct release_heap* rh;
+
+	/* use pre-allocated release heap */
+	rh = tsk_rt(t)->rel_heap;
+
+	/* Make sure it is safe to use.  The timer callback could still
+	 * be executing on another CPU; hrtimer_cancel() will wait
+	 * until the timer callback has completed.  However, under no
+	 * circumstances should the timer be active (= yet to be
+	 * triggered).
+	 *
+	 * WARNING: If the CPU still holds the release_lock at this point,
+	 *          deadlock may occur!
+	 */
+	BUG_ON(hrtimer_cancel(&rh->timer));
+
+	/* initialize */
+	bheap_init(&rh->heap);
+#ifdef CONFIG_RELEASE_MASTER
+	atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
+#endif
+}
+/* arm_release_timer() - start local release timer or trigger
+ *     remote timer (pull timer)
+ *
+ * Called by add_release() with:
+ * - tobe_lock taken
+ * - IRQ disabled
+ */
+#ifdef CONFIG_RELEASE_MASTER
+#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
+static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
+#else
+static void arm_release_timer(rt_domain_t *_rt)
+#endif
+{
+	rt_domain_t *rt = _rt;
+	struct list_head list;
+	struct list_head *pos, *safe;
+	struct task_struct* t;
+	struct release_heap* rh;
+
+	VTRACE("arm_release_timer() at %llu\n", litmus_clock());
+	list_replace_init(&rt->tobe_released, &list);
+
+	list_for_each_safe(pos, safe, &list) {
+		/* pick task of work list */
+		t = list_entry(pos, struct task_struct, rt_param.list);
+		sched_trace_task_release(t);
+		list_del(pos);
+
+		/* put into release heap while holding release_lock */
+		raw_spin_lock(&rt->release_lock);
+		VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
+
+		rh = get_release_heap(rt, t, 0);
+		if (!rh) {
+			/* need to use our own, but drop lock first */
+			raw_spin_unlock(&rt->release_lock);
+			VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
+				    &rt->release_lock);
+
+			reinit_release_heap(t);
+			VTRACE_TASK(t, "release_heap ready\n");
+
+			raw_spin_lock(&rt->release_lock);
+			VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
+				    &rt->release_lock);
+
+			rh = get_release_heap(rt, t, 1);
+		}
+		bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
+		VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
+
+		raw_spin_unlock(&rt->release_lock);
+		VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
+
+		/* To avoid arming the timer multiple times, we only let the
+		 * owner do the arming (which is the "first" task to reference
+		 * this release_heap anyway).
+		 */
+		if (rh == tsk_rt(t)->rel_heap) {
+			VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
+
+			if (!hrtimer_is_hres_active(&rh->timer)) {
+				TRACE_TASK(t, "WARNING: no hires timer!!!\n");
+			}
+
+			/* we cannot arm the timer using hrtimer_start()
+			 * as it may deadlock on rq->lock
+			 *
+			 * PINNED mode is ok on both local and remote CPU
+			 */
+#ifdef CONFIG_RELEASE_MASTER
+			if (rt->release_master == NO_CPU &&
+			    target_cpu == NO_CPU)
+#endif
+				__hrtimer_start_range_ns(&rh->timer,
+						ns_to_ktime(rh->release_time),
+						0, HRTIMER_MODE_ABS_PINNED, 0);
+#ifdef CONFIG_RELEASE_MASTER
+			else
+				hrtimer_start_on(
+					/* target_cpu overrides release master */
+					(target_cpu != NO_CPU ?
+					 target_cpu : rt->release_master),
+					&rh->info, &rh->timer,
+					ns_to_ktime(rh->release_time),
+					HRTIMER_MODE_ABS_PINNED);
+#endif
+		} else
+			VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
+	}
+}
+
+void rt_domain_init(rt_domain_t *rt,
+		    bheap_prio_t order,
+		    check_resched_needed_t check,
+		    release_jobs_t release
+		   )
+{
+	int i;
+
+	BUG_ON(!rt);
+	if (!check)
+		check = dummy_resched;
+	if (!release)
+		release = default_release_jobs;
+	if (!order)
+		order = dummy_order;
+
+#ifdef CONFIG_RELEASE_MASTER
+	rt->release_master = NO_CPU;
+#endif
+
+	bheap_init(&rt->ready_queue);
+	INIT_LIST_HEAD(&rt->tobe_released);
+	for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
+		INIT_LIST_HEAD(&rt->release_queue.slot[i]);
+
+	raw_spin_lock_init(&rt->ready_lock);
+	raw_spin_lock_init(&rt->release_lock);
+	raw_spin_lock_init(&rt->tobe_lock);
+
+	rt->check_resched 	= check;
+	rt->release_jobs	= release;
+	rt->order		= order;
+}
+
+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
+ * @new:       the newly released task
+ */
+void __add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+	TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
+		"to ready queue at %llu\n",
+		new->comm, new->pid,
+		get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
+		get_release(new), litmus_clock());
+
+	BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
+
+	bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
+	rt->check_resched(rt);
+}
+
+/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
+ * @tasks      - the newly released tasks
+ */
+void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
+{
+	bheap_union(rt->order, &rt->ready_queue, tasks);
+	rt->check_resched(rt);
+}
+
+
+#ifdef CONFIG_RELEASE_MASTER
+void __add_release_on(rt_domain_t* rt, struct task_struct *task,
+		      int target_cpu)
+{
+	TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
+		   get_release(task), target_cpu);
+	list_add(&tsk_rt(task)->list, &rt->tobe_released);
+	task->rt_param.domain = rt;
+
+	arm_release_timer_on(rt, target_cpu);
+}
+#endif
+
+/* add_release - add a real-time task to the rt release queue.
+ * @task:        the sleeping task
+ */
+void __add_release(rt_domain_t* rt, struct task_struct *task)
+{
+	TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
+	list_add(&tsk_rt(task)->list, &rt->tobe_released);
+	task->rt_param.domain = rt;
+
+	arm_release_timer(rt);
+}
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 0000000..edd91e9
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,238 @@
+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
+ *
+ * This file includes the initialization of the plugin system, the no-op Linux
+ * scheduler plugin, some dummy functions, and some helper functions.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+#include <litmus/jobs.h>
+
+/*
+ * Generic function to trigger preemption on either local or remote cpu
+ * from scheduler plugins. The key feature is that this function is
+ * non-preemptive section aware and does not invoke the scheduler / send
+ * IPIs if the to-be-preempted task is actually non-preemptive.
+ */
+void preempt_if_preemptable(struct task_struct* t, int cpu)
+{
+	/* t is the real-time task executing on CPU cpu. If t is NULL, then
+	 * cpu is currently scheduling background work.
+	 */
+
+	int reschedule = 0;
+
+	if (!t)
+		/* move non-real-time task out of the way */
+		reschedule = 1;
+	else {
+		if (smp_processor_id() == cpu) {
+			/* local CPU case */
+			/* check if we need to poke userspace */
+			if (is_user_np(t))
+				/* Yes, poke it. This doesn't have to be atomic since
+				 * the task is definitely not executing. */
+				request_exit_np(t);
+			else if (!is_kernel_np(t))
+				/* only if we are allowed to preempt the
+				 * currently-executing task */
+				reschedule = 1;
+		} else {
+			/* Remote CPU case.  Only notify if it's not a kernel
+			 * NP section and if we didn't set the userspace
+			 * flag. */
+			reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
+		}
+	}
+	if (likely(reschedule))
+		litmus_reschedule(cpu);
+}
+
+
+/*************************************************************
+ *                   Dummy plugin functions                  *
+ *************************************************************/
+
+static void litmus_dummy_finish_switch(struct task_struct * prev)
+{
+}
+
+static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
+{
+	sched_state_task_picked();
+	return NULL;
+}
+
+static long litmus_dummy_admit_task(struct task_struct* tsk)
+{
+	printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
+		tsk->comm, tsk->pid);
+	return -EINVAL;
+}
+
+static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
+{
+}
+
+static void litmus_dummy_task_wake_up(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_block(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_exit(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_cleanup(struct task_struct *task)
+{
+}
+
+static long litmus_dummy_complete_job(void)
+{
+	return -ENOSYS;
+}
+
+static long litmus_dummy_activate_plugin(void)
+{
+	return 0;
+}
+
+static long litmus_dummy_deactivate_plugin(void)
+{
+	return 0;
+}
+
+static long litmus_dummy_get_domain_proc_info(struct domain_proc_info **d)
+{
+	*d = NULL;
+	return 0;
+}
+
+static void litmus_dummy_synchronous_release_at(lt_t time_zero)
+{
+	/* ignore */
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
+				       void* __user config)
+{
+	return -ENXIO;
+}
+
+#endif
+
+
+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
+ * job.
+ */
+struct sched_plugin linux_sched_plugin = {
+	.plugin_name = "Linux",
+	.task_new   = litmus_dummy_task_new,
+	.task_exit = litmus_dummy_task_exit,
+	.task_wake_up = litmus_dummy_task_wake_up,
+	.task_block = litmus_dummy_task_block,
+	.complete_job = litmus_dummy_complete_job,
+	.schedule = litmus_dummy_schedule,
+	.finish_switch = litmus_dummy_finish_switch,
+	.activate_plugin = litmus_dummy_activate_plugin,
+	.deactivate_plugin = litmus_dummy_deactivate_plugin,
+	.get_domain_proc_info = litmus_dummy_get_domain_proc_info,
+	.synchronous_release_at = litmus_dummy_synchronous_release_at,
+#ifdef CONFIG_LITMUS_LOCKING
+	.allocate_lock = litmus_dummy_allocate_lock,
+#endif
+	.admit_task = litmus_dummy_admit_task
+};
+
+/*
+ *	The reference to current plugin that is used to schedule tasks within
+ *	the system. It stores references to actual function implementations
+ *	Should be initialized by calling "init_***_plugin()"
+ */
+struct sched_plugin *litmus = &linux_sched_plugin;
+
+/* the list of registered scheduling plugins */
+static LIST_HEAD(sched_plugins);
+static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
+
+#define CHECK(func) {\
+	if (!plugin->func) \
+		plugin->func = litmus_dummy_ ## func;}
+
+/* FIXME: get reference to module  */
+int register_sched_plugin(struct sched_plugin* plugin)
+{
+	printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
+	       plugin->plugin_name);
+
+	/* make sure we don't trip over null pointers later */
+	CHECK(finish_switch);
+	CHECK(schedule);
+	CHECK(task_wake_up);
+	CHECK(task_exit);
+	CHECK(task_cleanup);
+	CHECK(task_block);
+	CHECK(task_new);
+	CHECK(complete_job);
+	CHECK(activate_plugin);
+	CHECK(deactivate_plugin);
+	CHECK(get_domain_proc_info);
+#ifdef CONFIG_LITMUS_LOCKING
+	CHECK(allocate_lock);
+#endif
+	CHECK(admit_task);
+	CHECK(synchronous_release_at);
+
+	if (!plugin->wait_for_release_at)
+		plugin->wait_for_release_at = default_wait_for_release_at;
+
+	raw_spin_lock(&sched_plugins_lock);
+	list_add(&plugin->list, &sched_plugins);
+	raw_spin_unlock(&sched_plugins_lock);
+
+	return 0;
+}
+
+
+/* FIXME: reference counting, etc. */
+struct sched_plugin* find_sched_plugin(const char* name)
+{
+	struct sched_plugin *cur;
+	struct sched_plugin *found = NULL;
+
+	raw_spin_lock(&sched_plugins_lock);
+	list_for_each_entry(cur, &sched_plugins, list) {
+		/* first plugin whose name matches exactly wins */
+		if (strcmp(cur->plugin_name, name) == 0) {
+			found = cur;
+			break;
+		}
+	}
+	raw_spin_unlock(&sched_plugins_lock);
+
+	return found;
+}
+
+/* Write the names of all registered plugins to @m, one per line.
+ * The registry lock is held while the list is walked.
+ */
+void print_sched_plugins(struct seq_file *m)
+{
+	struct sched_plugin *p;
+
+	raw_spin_lock(&sched_plugins_lock);
+	list_for_each_entry(p, &sched_plugins, list)
+		seq_printf(m, "%s\n", p->plugin_name);
+	raw_spin_unlock(&sched_plugins_lock);
+}
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 0000000..e4e3811
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,313 @@
+/* ************************************************************************** */
+/*                          STACK RESOURCE POLICY                             */
+/* ************************************************************************** */
+
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/fdso.h>
+#include <litmus/trace.h>
+
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/srp.h>
+
+srp_prioritization_t get_srp_prio;
+
+struct srp {
+	struct list_head	ceiling;
+	wait_queue_head_t	ceiling_blocked;
+};
+#define system_ceiling(srp) list2prio(srp->ceiling.next)
+#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
+
+#define UNDEF_SEM -2
+
+atomic_t srp_objects_in_use = ATOMIC_INIT(0);
+
+DEFINE_PER_CPU(struct srp, srp);
+
+/* Initialize SRP semaphores at boot time. */
+static int __init srp_init(void)
+{
+	int i;
+
+	printk("Initializing SRP per-CPU ceilings...");
+	/* only possible CPUs have a valid per-CPU area to initialize */
+	for_each_possible_cpu(i) {
+		init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
+		INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
+	}
+	printk(" done!\n");
+	return 0;
+}
+module_init(srp_init);
+
+/* SRP task priority comparison function. Smaller numeric values have higher
+ * priority, tie-break is PID. Special case: priority == 0 <=> no priority
+ */
+static int srp_higher_prio(struct srp_priority* first,
+			   struct srp_priority* second)
+{
+	/* a side without priority (0) can never be the higher one */
+	if (!first->priority || !second->priority)
+		return first->priority != 0;
+	/* otherwise the smaller number wins; equal numbers fall back to pid */
+	if (first->priority != second->priority)
+		return first->priority < second->priority;
+	return first->pid < second->pid;
+}
+
+
+static int srp_exceeds_ceiling(struct task_struct* first,
+			       struct srp* srp)
+{
+	struct srp_priority task_prio;
+
+	/* an empty ceiling list blocks nobody */
+	if (list_empty(&srp->ceiling))
+		return 1;
+	task_prio.pid = first->pid;
+	task_prio.priority = get_srp_prio(first);
+	/* pass if above the ceiling, or if we own the ceiling's semaphore */
+	return srp_higher_prio(&task_prio, system_ceiling(srp)) ||
+		ceiling2sem(system_ceiling(srp))->owner == first;
+}
+
+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
+{
+	struct list_head *pos;
+	if (in_list(&prio->list)) {
+		printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
+		       "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
+		return;
+	}
+	list_for_each(pos, &srp->ceiling)
+		if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
+			__list_add(&prio->list, pos->prev, pos);
+			return;
+		}
+
+	list_add_tail(&prio->list, &srp->ceiling);
+}
+
+
+static int lock_srp_semaphore(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent acquisition of local locks in global critical sections */
+	if (tsk_rt(t)->num_locks_held)
+		return -EBUSY;
+
+	preempt_disable();
+
+	/* Update ceiling. */
+	srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
+
+	/* SRP invariant: all resources available */
+	BUG_ON(sem->owner != NULL);
+
+	sem->owner = t;
+	TRACE_CUR("acquired srp 0x%p\n", sem);
+
+	tsk_rt(t)->num_local_locks_held++;
+
+	preempt_enable();
+
+	return 0;
+}
+
+static int unlock_srp_semaphore(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+	int err = 0;
+
+	preempt_disable();
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+	} else {
+		/* The current owner should be executing on the correct CPU.
+		 *
+		 * FIXME: if the owner transitioned out of RT mode or is
+		 * exiting, then it might have already been migrated away by
+		 * the best-effort scheduler and we just have to deal with
+		 * it. This is currently not supported. */
+		BUG_ON(sem->cpu != smp_processor_id());
+
+		/* Determine new system priority ceiling for this CPU. */
+		BUG_ON(!in_list(&sem->ceiling.list));
+
+		list_del(&sem->ceiling.list);
+		sem->owner = NULL;
+
+		/* Wake tasks on this CPU, if they exceed current ceiling. */
+		TRACE_CUR("released srp 0x%p\n", sem);
+		wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
+
+		tsk_rt(t)->num_local_locks_held--;
+	}
+
+	preempt_enable();
+	return err;
+}
+
+static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
+{
+	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+	int err = 0;
+	struct task_struct* t = current;
+	struct srp_priority t_prio;
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
+
+	preempt_disable();
+
+	if (sem->owner != NULL)
+		err = -EBUSY;
+
+	if (err == 0) {
+		if (sem->cpu == UNDEF_SEM)
+			sem->cpu = get_partition(t);
+		else if (sem->cpu != get_partition(t))
+			err = -EPERM;
+	}
+
+	if (err == 0) {
+		t_prio.priority = get_srp_prio(t);
+		t_prio.pid      = t->pid;
+		if (srp_higher_prio(&t_prio, &sem->ceiling)) {
+			sem->ceiling.priority = t_prio.priority;
+			sem->ceiling.pid      = t_prio.pid;
+		}
+	}
+
+	preempt_enable();
+
+	return err;
+}
+
+static int close_srp_semaphore(struct litmus_lock* l)
+{
+	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+	int err = 0;
+
+	preempt_disable();
+
+	if (sem->owner == current)
+		unlock_srp_semaphore(l);
+
+	preempt_enable();
+
+	return err;
+}
+
+static void deallocate_srp_semaphore(struct litmus_lock* l)
+{
+	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+	atomic_dec(&srp_objects_in_use);
+	kfree(sem);
+}
+
+static struct litmus_lock_ops srp_lock_ops = {
+	.open   = open_srp_semaphore,
+	.close  = close_srp_semaphore,
+	.lock   = lock_srp_semaphore,
+	.unlock = unlock_srp_semaphore,
+	.deallocate = deallocate_srp_semaphore,
+};
+
+struct srp_semaphore* allocate_srp_semaphore(void)
+{
+	struct srp_semaphore* sem;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	INIT_LIST_HEAD(&sem->ceiling.list);
+	sem->ceiling.priority = 0;
+	sem->cpu     = UNDEF_SEM;
+	sem->owner   = NULL;
+
+	sem->litmus_lock.ops = &srp_lock_ops;
+
+	atomic_inc(&srp_objects_in_use);
+	return sem;
+}
+
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+		       void *key)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *tsk = wait->private;
+	if (cpu != get_partition(tsk))
+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
+			   get_partition(tsk));
+	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+		return default_wake_function(wait, mode, sync, key);
+	return 0; /* not woken: wrong CPU or still below the ceiling */
+}
+
+static void do_ceiling_block(struct task_struct *tsk)
+{
+	wait_queue_t wait = {
+		.private   = tsk,
+		.func      = srp_wake_up,
+		.task_list = {NULL, NULL}
+	};
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+	tsk->rt_param.srp_non_recurse = 1;
+	preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+	tsk->rt_param.srp_non_recurse = 0;
+	remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+}
+
+/* Wait for current task priority to exceed system-wide priority ceiling.
+ * FIXME: the hotpath should be inline.
+ */
+void srp_ceiling_block(void)
+{
+	struct task_struct *tsk = current;
+
+	/* Only applies to real-time tasks, but optimize for RT tasks. */
+	if (unlikely(!is_realtime(tsk)))
+		return;
+
+	/* Avoid recursive ceiling blocking. */
+	if (unlikely(tsk->rt_param.srp_non_recurse))
+		return;
+
+	/* Bail out early if there aren't any SRP resources around. */
+	if (likely(!atomic_read(&srp_objects_in_use)))
+		return;
+
+	preempt_disable();
+	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+		TRACE_CUR("is priority ceiling blocked.\n");
+		while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+			do_ceiling_block(tsk);
+		TRACE_CUR("finally exceeds system ceiling.\n");
+	} else
+		TRACE_CUR("is not priority ceiling blocked\n");
+	preempt_enable();
+}
+
+#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 0000000..5d18060
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,152 @@
+/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
+ *
+ *
+ */
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/jobs.h>
+
+#include <litmus/sched_trace.h>
+
+struct ts_release_wait {
+	struct list_head list;		/* entry in task_release_list */
+	struct completion completion;	/* completed by do_release_ts() */
+	lt_t ts_release_time;		/* start time from do_release_ts() */
+};
+
+/* Declare and initialize an on-stack ts_release_wait named symb. */
+#define DECLARE_TS_RELEASE_WAIT(symb)					\
+	struct ts_release_wait symb = {					\
+		.list = LIST_HEAD_INIT(symb.list),			\
+		.completion = COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
+		.ts_release_time = 0,					\
+	}
+
+static LIST_HEAD(task_release_list);
+static DEFINE_MUTEX(task_release_lock);
+
+/* Enqueue the caller on task_release_list and sleep until released. */
+static long do_wait_for_ts_release(void)
+{
+	DECLARE_TS_RELEASE_WAIT(wait);
+	long ret = -ERESTARTSYS;
+
+	if (mutex_lock_interruptible(&task_release_lock))
+		goto out;
+
+	list_add(&wait.list, &task_release_list);
+
+	mutex_unlock(&task_release_lock);
+
+	/* We are enqueued, now we wait for someone to wake us up. */
+	ret = wait_for_completion_interruptible(&wait.completion);
+
+	if (!ret) {
+		/* Completion succeeded, setup release time. */
+		ret = litmus->wait_for_release_at(
+			wait.ts_release_time + get_rt_phase(current));
+	} else {
+		/* Interrupted: unlink unless already released (list reset). */
+		mutex_lock(&task_release_lock);
+		if (!wait.completion.done)
+			list_del(&wait.list);
+		mutex_unlock(&task_release_lock);
+	}
+
+out:
+	return ret;
+}
+
+/* Count the tasks currently queued on task_release_list, i.e., those
+ * blocked in do_wait_for_ts_release() that have not been released yet.
+ * Takes task_release_lock, so the caller must be allowed to sleep.
+ */
+int count_tasks_waiting_for_release(void)
+{
+	struct list_head *entry;
+	int waiting = 0;
+
+	mutex_lock(&task_release_lock);
+	list_for_each(entry, &task_release_list)
+		waiting++;
+	mutex_unlock(&task_release_lock);
+
+	return waiting;
+}
+
+/* Release every task queued on task_release_list at time @start.
+ *
+ * Notifies the tracing layer and the active plugin, then records @start in
+ * each waiter and completes it. Returns the number of tasks released, or
+ * -ERESTARTSYS if acquiring task_release_lock was interrupted.
+ */
+static long do_release_ts(lt_t start)
+{
+	struct ts_release_wait *waiter;
+	struct list_head *cur, *next;
+	long released = 0;
+
+	if (mutex_lock_interruptible(&task_release_lock))
+		return -ERESTARTSYS;
+
+	TRACE("<<<<<< synchronous task system release >>>>>>\n");
+	sched_trace_sys_release(&start);
+	litmus->synchronous_release_at(start);
+
+	list_for_each_safe(cur, next, &task_release_list) {
+		waiter = list_entry(cur, struct ts_release_wait, list);
+		waiter->ts_release_time = start;
+		complete(&waiter->completion);
+		released++;
+	}
+
+	/* The waiters live on their owners' stacks and may vanish once
+	 * completed, so drop all links at once instead of list_del(). */
+	INIT_LIST_HEAD(&task_release_list);
+
+	mutex_unlock(&task_release_lock);
+
+	return released;
+}
+
+
+/* Syscall: block the calling real-time task until the task system is
+ * released; callers that are not real-time tasks get -EPERM.
+ */
+asmlinkage long sys_wait_for_ts_release(void)
+{
+	if (!is_realtime(current))
+		return -EPERM;
+
+	return do_wait_for_ts_release();
+}
+
+#define ONE_MS 1000000
+
+/* Release all waiting tasks at (now, truncated to a whole ms) + *__delay.
+ * Returns the result of do_release_ts(), or -EFAULT on a bad pointer.
+ */
+asmlinkage long sys_release_ts(lt_t __user *__delay)
+{
+	lt_t delay;
+	lt_t start_time;
+
+	/* FIXME: check capabilities... */
+	/* copy_from_user() returns the number of bytes NOT copied */
+	if (copy_from_user(&delay, __delay, sizeof(delay)))
+		return -EFAULT;
+
+	/* do_div() truncates: round down to an integral millisecond */
+	start_time = litmus_clock();
+	do_div(start_time, ONE_MS);
+	start_time *= ONE_MS;
+	return do_release_ts(start_time + delay);
+}
diff --git a/litmus/uncachedev.c b/litmus/uncachedev.c
new file mode 100644
index 0000000..06a6a7c
--- /dev/null
+++ b/litmus/uncachedev.c
@@ -0,0 +1,102 @@
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <asm/page.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+
+/* device for allocating pages not cached by the CPU */
+
+#define UNCACHE_NAME        "litmus/uncache"
+
+void litmus_uncache_vm_open(struct vm_area_struct *vma)
+{ /* intentionally empty */
+}
+
+void litmus_uncache_vm_close(struct vm_area_struct *vma)
+{ /* intentionally empty */
+}
+
+int litmus_uncache_vm_fault(struct vm_area_struct* vma,
+							struct vm_fault* vmf)
+{
+	/* modeled after SG DMA video4linux, but without DMA. */
+	/* (see drivers/media/video/videobuf-dma-sg.c) */
+	struct page *page;
+	/* each fault is served with a newly allocated page */
+	page = alloc_page(GFP_USER);
+	if (!page)
+		return VM_FAULT_OOM;
+	/* zero the page before it becomes visible at the faulting address */
+	clear_user_highpage(page, (unsigned long)vmf->virtual_address);
+	vmf->page = page;
+
+	return 0;
+}
+
+static struct vm_operations_struct litmus_uncache_vm_ops = {
+	.open = litmus_uncache_vm_open,
+	.close = litmus_uncache_vm_close,
+	.fault = litmus_uncache_vm_fault, /* allocates one page per fault */
+};
+
+/* mmap: allow only private, offset-0 mappings; map them non-cached. */
+static int litmus_uncache_mmap(struct file* filp, struct vm_area_struct* vma)
+{
+	/* first make sure the mapper knows what they are doing */
+	/* you can only map the "first" page */
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	/* you can't share it with anyone */
+	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+		return -EINVAL;
+
+	/* cannot be expanded, and is not a "normal" page. */
+	vma->vm_flags |= VM_DONTEXPAND;
+
+	/* noncached pages are not explicitly locked in memory (for now). */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	/* pages are provided on demand by litmus_uncache_vm_fault() */
+	vma->vm_ops = &litmus_uncache_vm_ops;
+
+	return 0;
+}
+
+static struct file_operations litmus_uncache_fops = {
+	.owner = THIS_MODULE,
+	.mmap  = litmus_uncache_mmap, /* only mmap() is implemented here */
+};
+
+static struct miscdevice litmus_uncache_dev = {
+	.name  = UNCACHE_NAME,
+	.minor = MISC_DYNAMIC_MINOR,
+	.fops  = &litmus_uncache_fops,
+	/* pages are not locked, so there is no reason to
+	   prevent anyone from allocating uncached pages */
+	.mode  = (S_IRUGO | S_IWUGO),
+};
+
+static int __init init_litmus_uncache_dev(void)
+{
+	int err;
+
+	printk("Initializing LITMUS^RT uncache device.\n");
+	err = misc_register(&litmus_uncache_dev);
+	if (err)
+		printk("Could not allocate %s device (%d).\n", UNCACHE_NAME, err);
+	return err; /* 0, or the error reported by misc_register() */
+}
+
+static void __exit exit_litmus_uncache_dev(void)
+{
+	misc_deregister(&litmus_uncache_dev); /* undo init-time registration */
+}
+
+module_init(init_litmus_uncache_dev);
+module_exit(exit_litmus_uncache_dev);
-- 
1.8.1.2


From 469c9f7ad36d105b31f478855f9c45ff376d3582 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 18:57:35 +0100
Subject: [PATCH 021/119] Add LITMUS^RT syscalls for ARM

---
 arch/arm/include/asm/unistd.h      |  3 ++-
 arch/arm/include/uapi/asm/unistd.h |  3 +++
 arch/arm/kernel/calls.S            | 13 +++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 141baa3..8b26b32 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,8 @@
 
 #include <uapi/asm/unistd.h>
 
-#define __NR_syscalls  (380)
+#define __NR_syscalls  (380 + NR_litmus_syscalls)
+
 #define __ARM_NR_cmpxchg		(__ARM_NR_BASE+0x00fff0)
 
 #define __ARCH_WANT_STAT64
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index af33b44..1a767bf 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -407,6 +407,9 @@
 #define __NR_kcmp			(__NR_SYSCALL_BASE+378)
 #define __NR_finit_module		(__NR_SYSCALL_BASE+379)
 
+#define __NR_LITMUS (__NR_SYSCALL_BASE+380)
+#include <litmus/unistd_32.h>
+
 /*
  * This may need to be greater than __NR_last_syscall+1 in order to
  * account for the padding in the syscall table
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index c6ca7e3..2da776a 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -389,6 +389,19 @@
 		CALL(sys_process_vm_writev)
 		CALL(sys_kcmp)
 		CALL(sys_finit_module)
+/* 380 */	CALL(sys_set_rt_task_param)
+		CALL(sys_get_rt_task_param)
+		CALL(sys_complete_job)
+		CALL(sys_od_open)
+                CALL(sys_od_close)
+/* 385 */	CALL(sys_litmus_lock)
+		CALL(sys_litmus_unlock)
+		CALL(sys_query_job_no)
+		CALL(sys_wait_for_job_release)
+        	CALL(sys_wait_for_ts_release)
+/* 390 */	CALL(sys_release_ts)
+		CALL(sys_null_call)
+
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
-- 
1.8.1.2


From 0072c939b32ecc73a15dc2cc28185f4324d28536 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:01:01 +0100
Subject: [PATCH 022/119] Add LITMUS^RT syscalls for x86

---
 arch/x86/syscalls/syscall_32.tbl | 13 +++++++++++++
 arch/x86/syscalls/syscall_64.tbl | 13 +++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index aabfb83..ffe39dd 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -357,3 +357,16 @@
 348	i386	process_vm_writev	sys_process_vm_writev		compat_sys_process_vm_writev
 349	i386	kcmp			sys_kcmp
 350	i386	finit_module		sys_finit_module
+
+351	i386	set_rt_task_param	sys_set_rt_task_param
+352	i386	get_rt_task_param	sys_get_rt_task_param
+353	i386	complete_job		sys_complete_job
+354	i386	od_open			sys_od_open
+355	i386	od_close		sys_od_close
+356	i386	litmus_lock		sys_litmus_lock
+357	i386	litmus_unlock		sys_litmus_unlock
+358	i386	query_job_no		sys_query_job_no
+359	i386	wait_for_job_release	sys_wait_for_job_release
+360	i386	wait_for_ts_release	sys_wait_for_ts_release
+361	i386	release_ts		sys_release_ts
+362	i386	null_call		sys_null_call
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 38ae65d..cde714e 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -321,6 +321,19 @@
 312	common	kcmp			sys_kcmp
 313	common	finit_module		sys_finit_module
 
+351	common	set_rt_task_param	sys_set_rt_task_param
+352	common	get_rt_task_param	sys_get_rt_task_param
+353	common	complete_job		sys_complete_job
+354	common	od_open			sys_od_open
+355	common	od_close		sys_od_close
+356	common	litmus_lock		sys_litmus_lock
+357	common	litmus_unlock		sys_litmus_unlock
+358	common	query_job_no		sys_query_job_no
+359	common	wait_for_job_release	sys_wait_for_job_release
+360	common	wait_for_ts_release	sys_wait_for_ts_release
+361	common	release_ts		sys_release_ts
+362	common	null_call		sys_null_call
+
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
-- 
1.8.1.2


From a1b5e6c8f816c6c28dfbce7e16d3977f96e380f2 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sun, 21 Jul 2013 13:51:38 +0200
Subject: [PATCH 023/119] Move trace point definition to litmus/litmus.c

If !CONFIG_SCHED_TASK_TRACE, but CONFIG_SCHED_LITMUS_TRACEPOINT, then
we still need to define the tracepoint structures.

This patch should be integrated with the earlier sched_task_trace.c
patches during one of the next major rebasing efforts.
---
 litmus/litmus.c           | 5 +++++
 litmus/sched_task_trace.c | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/litmus/litmus.c b/litmus/litmus.c
index 9c419cd..a061343 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -25,6 +25,11 @@
 #include <litmus/affinity.h>
 #endif
 
+#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
+#define CREATE_TRACE_POINTS
+#include <trace/events/litmus.h>
+#endif
+
 /* Number of RT tasks that exist in the system */
 atomic_t rt_task_count 		= ATOMIC_INIT(0);
 
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index 2bdfbbd..933e7e4 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -15,11 +15,6 @@
 #include <litmus/feather_trace.h>
 #include <litmus/ftdev.h>
 
-#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
-#define CREATE_TRACE_POINTS
-#include <trace/events/litmus.h>
-#endif
-
 #define NO_EVENTS		(1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
 
 #define now() litmus_clock()
-- 
1.8.1.2


From 751073bc996ac77c219c2031282ee7ce0f473af5 Mon Sep 17 00:00:00 2001
From: Glenn Elliott <gelliott@cs.unc.edu>
Date: Mon, 16 Sep 2013 17:26:56 -0400
Subject: [PATCH 024/119] Record LITMUS^RT timestamp in ftrace records

Patch updates ftrace.h to record a litmus_clock() time stamp
in ftrace records.
---
 include/trace/ftrace.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 66dba42..7571012 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,6 +18,9 @@
 
 #include <linux/ftrace_event.h>
 
+/* for litmus_clock() */
+#include <litmus/litmus.h>
+
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
  * handlers for events. That is, if all events have the same
@@ -54,7 +57,7 @@
 #define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
+#define TP_STRUCT__entry(args...) args __field( unsigned long long, __rt_ts )
 
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)	\
@@ -502,7 +505,7 @@ static inline notrace int ftrace_get_offsets_##call(			\
 	strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)");
 
 #undef TP_fast_assign
-#define TP_fast_assign(args...) args
+#define TP_fast_assign(args...) args; __entry->__rt_ts = litmus_clock();
 
 #undef TP_perf_assign
 #define TP_perf_assign(args...)
-- 
1.8.1.2


From d40f1cc129917e9478d48658073e46462968b973 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 25 Jun 2013 07:30:56 +0200
Subject: [PATCH 025/119] Integrate preemption state machine with Linux
 scheduler

Track when a processor is going to schedule "soon".
---
 arch/arm/kernel/smp.c |  4 ++++
 arch/x86/kernel/smp.c |  6 ++++++
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   | 21 ++++++++++++++++++++-
 4 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 5919eb4..1a945e2 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -46,6 +46,8 @@
 #include <asm/virt.h>
 #include <asm/mach/arch.h>
 
+#include <litmus/preempt.h>
+
 /*
  * as from 2.5, kernels no longer have an init_tasks structure
  * so we need some other way of telling a new secondary core
@@ -617,6 +619,8 @@ void handle_IPI(int ipinr, struct pt_regs *regs)
 #endif
 
 	case IPI_RESCHEDULE:
+		/* LITMUS^RT: take action based on scheduler state */
+		sched_state_ipi();
 		scheduler_ipi();
 		break;
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index a52ef7f..becf5c3 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -24,6 +24,7 @@
 #include <linux/cpu.h>
 #include <linux/gfp.h>
 
+#include <litmus/preempt.h>
 #include <litmus/debug_trace.h>
 
 #include <asm/mtrr.h>
@@ -269,6 +270,11 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
+
+	/* LITMUS^RT: this IPI might need to trigger the sched state machine.
+	 * Starting from 3.0 scheduler_ipi() actually does something.  This may
+	 * increase IPI latencies compared with previous versions. */
+	sched_state_ipi();
 }
 
 void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb3b44..5dc3e5b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -56,6 +56,7 @@ struct sched_param {
 #include <asm/processor.h>
 
 #include <litmus/rt_param.h>
+#include <litmus/preempt.h>
 
 struct exec_domain;
 struct futex_pi_state;
@@ -2375,6 +2376,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
 static inline void set_tsk_need_resched(struct task_struct *tsk)
 {
 	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+	sched_state_will_schedule(tsk);
 }
 
 static inline void clear_tsk_need_resched(struct task_struct *tsk)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3a471d6..17992b2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1966,8 +1966,12 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
+	struct rq *rq;
+
 
+	preempt_disable();
+
+	rq = this_rq();
 	finish_task_switch(rq, prev);
 
 	/*
@@ -1976,6 +1980,11 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	 */
 	post_schedule(rq);
 
+	if (sched_state_validate_switch())
+		litmus_reschedule_local();
+
+	preempt_enable();
+
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
@@ -2973,11 +2982,16 @@ static void __sched __schedule(void)
 
 need_resched:
 	preempt_disable();
+	sched_state_entered_schedule();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 
+	/* LITMUS^RT: quickly re-evaluate the scheduling decision
+	 * if the previous one is no longer valid after context switch.
+	 */
+litmus_need_resched_nonpreemptible:
 	TS_SCHED_START;
 
 	schedule_debug(prev);
@@ -3053,6 +3067,11 @@ need_resched:
 
 	post_schedule(rq);
 
+	if (sched_state_validate_switch()) {
+		TS_SCHED2_END(prev);
+		goto litmus_need_resched_nonpreemptible;
+	}
+
 	sched_preempt_enable_no_resched();
 
 	TS_SCHED2_END(prev);
-- 
1.8.1.2


From 1042b270f038a2c654d93aa3fd8b9ae9abe542d9 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 10 Jul 2013 18:34:34 +0200
Subject: [PATCH 026/119] Call sched_state_task_picked() from
 pick_next_task_stop()

Otherwise, the scheduler state machine becomes confused (and goes into
a rescheduling loop) when stop-machine is triggered.
---
 kernel/sched/stop_task.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index da5eb5b..6835d31 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,5 +1,7 @@
 #include "sched.h"
 
+#include <litmus/preempt.h>
+
 /*
  * stop-task scheduling class.
  *
@@ -29,6 +31,12 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
 
 	if (stop && stop->on_rq) {
 		stop->se.exec_start = rq->clock_task;
+		/* Let the LITMUS^RT scheduler state machine know
+		 * that a task was picked. This is needed because the
+		 * LITMUS^RT scheduling plugin will not be called
+		 * if the stop-task class picks a task.
+		 */
+		sched_state_task_picked();
 		return stop;
 	}
 
-- 
1.8.1.2


From 87c71e1c704021c7381821a6c654096db4f07b20 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 17:45:17 +0100
Subject: [PATCH 027/119] Hook into fork(), exec(), and exit()

Allow LITMUS^RT to do some work when a process is created or
terminated.
---
 fs/exec.c     | 3 +++
 kernel/exit.c | 4 ++++
 kernel/fork.c | 9 +++++++++
 3 files changed, 16 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index bb60cda..d84259a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -56,6 +56,8 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 
+#include <litmus/litmus.h>
+
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
@@ -1506,6 +1508,7 @@ static int do_execve_common(const char *filename,
 		goto out_unmark;
 
 	sched_exec();
+	litmus_exec();
 
 	bprm->file = file;
 	bprm->filename = filename;
diff --git a/kernel/exit.c b/kernel/exit.c
index 7bb73f9..ab36666 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,6 +59,8 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+extern void exit_od_table(struct task_struct *t);
+
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -781,6 +783,8 @@ void do_exit(long code)
 		tty_audit_exit();
 	audit_free(tsk);
 
+	exit_od_table(tsk);
+
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index ff7be9d..b8aa56b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -81,6 +81,9 @@
 
 #include <trace/events/sched.h>
 
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/task.h>
 
@@ -238,6 +241,9 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(tsk == current);
 
 	security_task_free(tsk);
+
+	exit_litmus(tsk);
+
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
@@ -312,6 +318,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	tsk->stack = ti;
 
+	/* Don't let the new task be a real-time task. */
+	litmus_fork(tsk);
+
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
 	clear_tsk_need_resched(tsk);
-- 
1.8.1.2


From e44cd07ea0123cac05852b00f3c9d514a8999933 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sat, 8 Jun 2013 18:22:35 +0200
Subject: [PATCH 028/119] Augment rt_task() with is_realtime()

Whenever the kernel checks for rt_task() to avoid delaying real-time
tasks, we want it to also not delay LITMUS^RT tasks.  Hence, most
calls to rt_task() should be matched by an equivalent call to
is_realtime().

Notably, this affects the implementations of select() and nanosleep(),
which use timer_slack_ns when setting up timers for non-real-time
tasks.
---
 fs/select.c         | 4 +++-
 kernel/hrtimer.c    | 3 ++-
 kernel/mutex.c      | 5 ++++-
 mm/page-writeback.c | 6 ++++--
 mm/page_alloc.c     | 5 ++++-
 5 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index 8c1c96c..f53b3e4 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -28,6 +28,8 @@
 #include <linux/hrtimer.h>
 #include <linux/sched/rt.h>
 
+#include <litmus/litmus.h> /* for is_realtime() */
+
 #include <asm/uaccess.h>
 
 
@@ -77,7 +79,7 @@ long select_estimate_accuracy(struct timespec *tv)
 	 * Realtime tasks get a slack of 0 for obvious reasons.
 	 */
 
-	if (rt_task(current))
+	if (rt_task(current) || is_realtime(current))
 		return 0;
 
 	ktime_get_ts(&now);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c7f0c79..60b6329 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -49,6 +49,7 @@
 #include <linux/timer.h>
 
 #include <litmus/debug_trace.h>
+#include <litmus/litmus.h>
 
 #include <asm/uaccess.h>
 
@@ -1701,7 +1702,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	unsigned long slack;
 
 	slack = current->timer_slack_ns;
-	if (rt_task(current))
+	if (rt_task(current) || is_realtime(current))
 		slack = 0;
 
 	hrtimer_init_on_stack(&t.timer, clockid, mode);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index ad53a66..a60d05e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -25,6 +25,8 @@
 #include <linux/interrupt.h>
 #include <linux/debug_locks.h>
 
+#include <litmus/litmus.h>
+
 /*
  * In the DEBUG case we are using the "NULL fastpath" for mutexes,
  * which forces all calls into the slowpath:
@@ -325,7 +327,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		 * we're an RT task that will live-lock because we won't let
 		 * the owner complete.
 		 */
-		if (!owner && (need_resched() || rt_task(task)))
+		if (!owner && (need_resched() ||
+			       rt_task(task) || is_realtime(task)))
 			break;
 
 		/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 73cbc5d..1f0073b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -38,6 +38,8 @@
 #include <linux/sched/rt.h>
 #include <trace/events/writeback.h>
 
+#include <litmus/litmus.h> /* for is_realtime() */
+
 /*
  * Sleep at most 200ms at a time in balance_dirty_pages().
  */
@@ -300,7 +302,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 	if (background >= dirty)
 		background = dirty / 2;
 	tsk = current;
-	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk) || is_realtime(tsk)) {
 		background += background / 4;
 		dirty += dirty / 4;
 	}
@@ -328,7 +330,7 @@ static unsigned long zone_dirty_limit(struct zone *zone)
 	else
 		dirty = vm_dirty_ratio * zone_memory / 100;
 
-	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk) || is_realtime(tsk))
 		dirty += dirty / 4;
 
 	return dirty;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ee0fd3..6529939 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,8 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 
+#include <litmus/litmus.h> /* for is_realtime() */
+
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
@@ -2362,7 +2364,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(current)) && !in_interrupt())
+	} else if (unlikely(rt_task(current) || is_realtime(current))
+		   && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-- 
1.8.1.2


From bb196a3537cf825b18aa46cdf962fb9422bc2a8f Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:06:19 +0200
Subject: [PATCH 029/119] Hookup sched_trace_XXX() tracing in Linux scheduler

This patch adds context switch tracing to the main Linux scheduler.
---
 kernel/sched/core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17992b2..3d37e2a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -87,6 +87,7 @@
 #include "../smpboot.h"
 
 #include <litmus/trace.h>
+#include <litmus/sched_trace.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -1974,6 +1975,8 @@ asmlinkage void schedule_tail(struct task_struct *prev)
 	rq = this_rq();
 	finish_task_switch(rq, prev);
 
+	sched_trace_task_switch_to(current);
+
 	/*
 	 * FIXME: do we need to worry about rq being invalidated by the
 	 * task_switch?
@@ -2993,6 +2996,7 @@ need_resched:
 	 */
 litmus_need_resched_nonpreemptible:
 	TS_SCHED_START;
+	sched_trace_task_switch_away(prev);
 
 	schedule_debug(prev);
 
@@ -3064,6 +3068,7 @@ litmus_need_resched_nonpreemptible:
 	}
 
 	TS_SCHED2_START(prev);
+	sched_trace_task_switch_to(current);
 
 	post_schedule(rq);
 
-- 
1.8.1.2


From b23a712f8a488189cee3ef373d878c39f4a1dab4 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:15:48 +0200
Subject: [PATCH 030/119] Integrate SRP ceiling blocking callback with Linux
 scheduler

Check whether a suspension is required at end of schedule().
---
 kernel/sched/core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d37e2a..403aa9e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -86,6 +86,7 @@
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
 
+#include <litmus/litmus.h>
 #include <litmus/trace.h>
 #include <litmus/sched_trace.h>
 
@@ -3083,6 +3084,8 @@ litmus_need_resched_nonpreemptible:
 
 	if (need_resched())
 		goto need_resched;
+
+	srp_ceiling_block();
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
-- 
1.8.1.2


From 55d08a9fb5f361e38a8b21dc601e9037b0a3a98d Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:30:19 +0200
Subject: [PATCH 031/119] Introduce LITMUS^RT runqueue dummy into struct rq

---
 kernel/sched/sched.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dfa31d5..62f508b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -358,6 +358,11 @@ struct rt_rq {
 #endif
 };
 
+struct litmus_rq {
+	unsigned long nr_running;
+	struct task_struct *prev;
+};
+
 #ifdef CONFIG_SMP
 
 /*
@@ -422,6 +427,7 @@ struct rq {
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
+	struct litmus_rq litmus;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
-- 
1.8.1.2


From 4d892c962033fe3b959ce4cffe68f0f27304a436 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:31:47 +0200
Subject: [PATCH 032/119] Add LITMUS^RT scheduling class in
 kernel/sched/Makefile

---
 kernel/sched/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e..7002348 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,3 +17,7 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+
+
+# LITMUS^RT scheduling class
+obj-y += litmus.o
\ No newline at end of file
-- 
1.8.1.2


From 7c23bd3e9b9744e746222f3f4b77f4cf7978c746 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:38:20 +0200
Subject: [PATCH 033/119] Make LITMUS^RT scheduling class the highest-priority
 scheduling class

Needs to be above stop_sched_class for legacy reasons; the main
plugins were developed before stop_sched_class was introduced and
assume that they are the highest-priority scheduling class.
---
 kernel/sched/litmus.c | 2 +-
 kernel/sched/sched.h  | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c
index ad88a14..b84361f 100644
--- a/kernel/sched/litmus.c
+++ b/kernel/sched/litmus.c
@@ -314,7 +314,7 @@ const struct sched_class litmus_sched_class = {
 	 * cpu-hotplug or cpu throttling. Allows Litmus to use up to 1.0
 	 * CPU capacity.
 	 */
-	.next			= &rt_sched_class,
+	.next			= &stop_sched_class,
 	.enqueue_task		= enqueue_task_litmus,
 	.dequeue_task		= dequeue_task_litmus,
 	.yield_task		= yield_task_litmus,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 62f508b..91b0641 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1017,11 +1017,12 @@ struct sched_class {
 #endif
 };
 
-#define sched_class_highest (&stop_sched_class)
+#define sched_class_highest (&litmus_sched_class)
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
 extern const struct sched_class stop_sched_class;
+extern const struct sched_class litmus_sched_class;
 extern const struct sched_class rt_sched_class;
 extern const struct sched_class fair_sched_class;
 extern const struct sched_class idle_sched_class;
-- 
1.8.1.2


From 685b84e0e122117b1c38e06bc24378d757967545 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 22:50:05 +0200
Subject: [PATCH 034/119] Integrate LITMUS^RT with try_to_wake_up() path

---
 kernel/sched/core.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 403aa9e..210d42d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1463,7 +1463,12 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 
 #if defined(CONFIG_SMP)
-	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
+	/*
+	 * LITMUS^RT: whether to send an IPI to the remote CPU is plugin
+	 * specific.
+	 */
+	if (!is_realtime(p) &&
+	    sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
 		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
@@ -1496,6 +1501,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	unsigned long flags;
 	int cpu, success = 0;
 
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
+
 	/*
 	 * If we are going to wake up a thread waiting for CONDITION we
 	 * need to ensure that CONDITION=1 done by the caller can not be
@@ -1525,6 +1533,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
+	/* LITMUS^RT: once the task can be safely referenced by this
+	 * CPU, don't mess with Linux load balancing stuff.
+	 */
+	if (is_realtime(p))
+		goto litmus_out_activate;
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
 	p->state = TASK_WAKING;
 
@@ -1536,12 +1550,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
 	}
+
+litmus_out_activate:
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu);
 stat:
 	ttwu_stat(p, cpu, wake_flags);
 out:
+	if (is_realtime(p))
+		TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return success;
-- 
1.8.1.2


From d2d9bd3ce3b2af5edb9ac2a5b01fc6db4589c885 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:45:08 +0200
Subject: [PATCH 035/119] Disable cut-to-CFS optimization in Linux scheduler

Global plugins require that the plugin be called even if there
currently is no real-time task executing on the local core.
---
 kernel/sched/core.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 210d42d..d54b6d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2942,12 +2942,19 @@ pick_next_task(struct rq *rq)
 	/*
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
-	 */
+
+	 * NOT IN LITMUS^RT!
+
+	 * This breaks many assumptions in the plugins.
+	 * Do not uncomment without thinking long and hard
+	 * about how this affects global plugins such as GSN-EDF.
+
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq);
 		if (likely(p))
 			return p;
 	}
+	*/
 
 	for_each_class(class) {
 		p = class->pick_next_task(rq);
-- 
1.8.1.2


From 5d0d1599bdcd8e35b3f23777234e0c9243fd4498 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:46:56 +0200
Subject: [PATCH 036/119] Integrate LITMUS^RT scheduling class with
 sched_setscheduler

---
 kernel/sched/core.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d54b6d6..afc134d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,7 @@
 #include <litmus/litmus.h>
 #include <litmus/trace.h>
 #include <litmus/sched_trace.h>
+#include <litmus/sched_plugin.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -3909,7 +3910,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	if (rt_prio(p->prio))
+	if (p->policy == SCHED_LITMUS)
+		p->sched_class = &litmus_sched_class;
+	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
@@ -3940,6 +3943,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
 	const struct sched_class *prev_class;
 	struct rq *rq;
 	int reset_on_fork;
+	int litmus_task = 0;
 
 	/* may grab non-irq protected spin_locks */
 	BUG_ON(in_interrupt());
@@ -3954,7 +3958,7 @@ recheck:
 
 		if (policy != SCHED_FIFO && policy != SCHED_RR &&
 				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
-				policy != SCHED_IDLE)
+				policy != SCHED_IDLE && policy != SCHED_LITMUS)
 			return -EINVAL;
 	}
 
@@ -3969,6 +3973,8 @@ recheck:
 		return -EINVAL;
 	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
+	if (policy == SCHED_LITMUS && policy == p->policy)
+		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
@@ -4012,6 +4018,12 @@ recheck:
 			return retval;
 	}
 
+	if (policy == SCHED_LITMUS) {
+		retval = litmus_admit_task(p);
+		if (retval)
+			return retval;
+	}
+
 	/*
 	 * make sure no PI-waiters arrive (or leave) while we are
 	 * changing the priority of the task:
@@ -4068,10 +4080,25 @@ recheck:
 
 	p->sched_reset_on_fork = reset_on_fork;
 
+	if (p->policy == SCHED_LITMUS) {
+		litmus_exit_task(p);
+		litmus_task = 1;
+	}
+
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	__setscheduler(rq, p, policy, param->sched_priority);
 
+	if (policy == SCHED_LITMUS) {
+#ifdef CONFIG_SMP
+		p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+#else
+		p->rt_param.stack_in_use = running ? 0 : NO_CPU;
+#endif
+		p->rt_param.present = running;
+		litmus->task_new(p, on_rq, running);
+	}
+
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
@@ -4082,6 +4109,9 @@ recheck:
 
 	rt_mutex_adjust_pi(p);
 
+	if (litmus_task)
+		litmus_dealloc(p);
+
 	return 0;
 }
 
-- 
1.8.1.2


From c728e26abc70530c62b794b017e6135ecd8df8f0 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:48:22 +0200
Subject: [PATCH 037/119] Block sched_setaffinity() for SCHED_LITMUS tasks

---
 kernel/sched/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index afc134d..eb32fd0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4269,10 +4269,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
-	if (!p) {
+	if (!p || is_realtime(p)) {
 		rcu_read_unlock();
 		put_online_cpus();
-		return -ESRCH;
+		return p ? -EPERM : -ESRCH;
 	}
 
 	/* Prevent p going away */
-- 
1.8.1.2


From 4fd5fc39aeed8941623be74fcadc3742055f459d Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:51:06 +0200
Subject: [PATCH 038/119] Reset SCHED_LITMUS scheduling class on fork

---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb32fd0..8a421e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1721,7 +1721,7 @@ void sched_fork(struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (task_has_rt_policy(p)) {
+		if (task_has_rt_policy(p) || p->policy == SCHED_LITMUS) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
-- 
1.8.1.2


From e07b27e05418f939cb559e96733aa2e96de71592 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:52:14 +0200
Subject: [PATCH 039/119] Hook into finish_switch()

To keep track of stack usage and to notify plugin, if necessary.
---
 kernel/sched/core.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8a421e9..b50b3c2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1925,6 +1925,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	finish_arch_switch(prev);
+	litmus->finish_switch(prev);
+	prev->rt_param.stack_in_use = NO_CPU;
 	perf_event_task_sched_in(prev, current);
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
-- 
1.8.1.2


From f40461bc0f6f2b7d76e6c24aef758862e3cbd0c1 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:55:09 +0200
Subject: [PATCH 040/119] Cache 'prev' in LITMUS^RT runqueue

LITMUS^RT plugins like to know who 'prev' is. pick_next_task() doesn't
expose that info, so we just cache prev in the runqueue. Could probably
be replaced by looking at 'current' instead.
---
 kernel/sched/core.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b50b3c2..ad6ba36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1953,6 +1953,14 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
 {
 	if (prev->sched_class->pre_schedule)
 		prev->sched_class->pre_schedule(rq, prev);
+
+       /* LITMUS^RT not very clean hack: we need to save the prev task as our
+        * scheduling decisions rely on it (as we drop the rq lock something in
+        * prev can change...); there is no way to escape this hack apart from
+        * modifying pick_next_task(rq, _prev_) or falling back on the previous
+        * solution of decoupling scheduling decisions.
+        */
+	rq->litmus.prev = prev;
 }
 
 /* rq->lock is NOT held, but preemption is disabled */
-- 
1.8.1.2


From 0aea465f37878c328da2361607b4670725be1139 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:55:58 +0200
Subject: [PATCH 041/119] Don't trigger load balancer in scheduler tick for
 LITMUS^RT

---
 kernel/sched/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad6ba36..1be8a35 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2804,7 +2804,8 @@ void scheduler_tick(void)
 
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	if (!is_realtime(current))
+		trigger_load_balance(rq, cpu);
 #endif
 	rq_last_tick_reset(rq);
 
-- 
1.8.1.2


From e801a3104619e8dbee8e1fed05402f160d8f790d Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 1 Jul 2013 23:56:31 +0200
Subject: [PATCH 042/119] Hook into rt scheduling class to protect LITMUS^RT
 tasks

The rt scheduling class thinks it's the highest-priority scheduling
class around. It is not in LITMUS^RT. Don't go preempting remote cores
that run SCHED_LITMUS tasks.
---
 kernel/sched/rt.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 15334e6..dbe21ae 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
 
 #include <linux/slab.h>
 
+#include <litmus/litmus.h>
+
 int sched_rr_timeslice = RR_TIMESLICE;
 
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
@@ -436,7 +438,9 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 	if (rt_rq->rt_nr_running) {
 		if (rt_se && !on_rt_rq(rt_se))
 			enqueue_rt_entity(rt_se, false);
-		if (rt_rq->highest_prio.curr < curr->prio)
+		if (rt_rq->highest_prio.curr < curr->prio
+		    /* Don't subject LITMUS^RT tasks to remote reschedules. */
+		    && !is_realtime(curr))
 			resched_task(curr);
 	}
 }
@@ -530,7 +534,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 
 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
-	if (rt_rq->rt_nr_running)
+	if (rt_rq->rt_nr_running
+	    && !is_realtime(rq_of_rt_rq(rt_rq)->curr))
 		resched_task(rq_of_rt_rq(rt_rq)->curr);
 }
 
-- 
1.8.1.2


From 2c85fa0767ce6355759868583a9eeedb1feb384f Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 25 Jul 2013 22:56:55 +0200
Subject: [PATCH 043/119] Don't call set_tsk_need_resched() on remote LITMUS^RT
 task

This patch fixes a BUG_ON() in litmus/preempt.c:33 reported by Felipe
Cerqueira & Manohar Vanga.
---
 kernel/sched/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1be8a35..6bced0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -526,9 +526,14 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
+
+	/* Cannot call set_tsk_need_resched() on LITMUS^RT task
+	 * on remote core. Only policy plugins may do this via
+	 * litmus_reschedule(). */
+	if (!is_realtime(p) || cpu == smp_processor_id())
+		set_tsk_need_resched(p);
+
 	if (cpu == smp_processor_id())
 		return;
 
-- 
1.8.1.2


From 232ab01e5abd3bf3b494f46d558898122d57f6d8 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 25 Jul 2013 23:00:29 +0200
Subject: [PATCH 044/119] Protect LITMUS^RT tasks from re-nicing

Assigning a nice value to LITMUS^RT tasks is meaningless. Bail out
early.
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6bced0e..46ceebb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3770,7 +3770,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * it wont have any effect on scheduling until the task is
 	 * SCHED_FIFO/SCHED_RR:
 	 */
-	if (task_has_rt_policy(p)) {
+	if (task_has_rt_policy(p) || is_realtime(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-- 
1.8.1.2


From d14164377790b4c15fd1fa4665329f6e6febe78f Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 2 Jun 2014 14:22:02 +0200
Subject: [PATCH 045/119] Hook into kernel/exit.c to force exiting RT tasks
 into best-effort mode

---
 kernel/exit.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/exit.c b/kernel/exit.c
index ab36666..2fc678f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,6 +59,8 @@
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+#include <litmus/litmus.h>
+
 extern void exit_od_table(struct task_struct *t);
 
 static void exit_mm(struct task_struct * tsk);
@@ -720,6 +722,15 @@ void do_exit(long code)
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
 
+	if (unlikely(is_realtime(tsk))) {
+		/* We would like the task to be polite
+		 * and transition out of RT mode first.
+		 * Let's give it a little help.
+		 */
+		litmus_do_exit(tsk);
+		BUG_ON(is_realtime(tsk));
+	}
+
 	/*
 	 * If do_exit is called because this processes oopsed, it's possible
 	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
-- 
1.8.1.2


From ad5164aa251591c954a084a51aaa866c1380e7b3 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:15:27 +0100
Subject: [PATCH 046/119] Add PSN-EDF scheduler plugin

---
 litmus/Makefile        |   4 +-
 litmus/sched_psn_edf.c | 689 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 692 insertions(+), 1 deletion(-)
 create mode 100644 litmus/sched_psn_edf.c

diff --git a/litmus/Makefile b/litmus/Makefile
index f7ceabc..0db695e 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -18,7 +18,9 @@ obj-y     = sched_plugin.o litmus.o \
 	    bheap.o \
 	    binheap.o \
 	    ctrldev.o \
-	    uncachedev.o
+	    uncachedev.o \
+	    sched_psn_edf.o
+
 
 obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
 
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 0000000..dd042db
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,689 @@
+/*
+ * litmus/sched_psn_edf.c
+ *
+ * Implementation of the PSN-EDF scheduler plugin.
+ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
+ *
+ * Suspensions and non-preemptable sections are supported.
+ * Priority inheritance is not supported.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+/* to set up domain/cpu mappings */
+#include <litmus/litmus_proc.h>
+
+typedef struct {
+	rt_domain_t 		domain;
+	int          		cpu;
+	struct task_struct* 	scheduled; /* only RT tasks */
+/*
+ * scheduling lock slock
+ * protects the domain and serializes scheduling decisions
+ */
+#define slock domain.ready_lock
+
+} psnedf_domain_t;
+
+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
+
+#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
+#define local_pedf		(&__get_cpu_var(psnedf_domains))
+#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
+#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
+#define task_edf(task)		remote_edf(get_partition(task))
+#define task_pedf(task)		remote_pedf(get_partition(task))
+
+
+static void psnedf_domain_init(psnedf_domain_t* pedf,
+			       check_resched_needed_t check,
+			       release_jobs_t release,
+			       int cpu)
+{
+	edf_domain_init(&pedf->domain, check, release);
+	pedf->cpu      		= cpu;
+	pedf->scheduled		= NULL;
+}
+
+static void requeue(struct task_struct* t, rt_domain_t *edf)
+{
+	if (t->state != TASK_RUNNING)
+		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
+
+	tsk_rt(t)->completed = 0;
+	if (is_early_releasing(t) || is_released(t, litmus_clock()))
+		__add_ready(edf, t);
+	else
+		add_release(edf, t); /* it has got to wait */
+}
+
+/* we assume the lock is being held */
+static void preempt(psnedf_domain_t *pedf)
+{
+	preempt_if_preemptable(pedf->scheduled, pedf->cpu);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+static void boost_priority(struct task_struct* t)
+{
+	unsigned long		flags;
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	lt_t			now;
+
+	raw_spin_lock_irqsave(&pedf->slock, flags);
+	now = litmus_clock();
+
+	TRACE_TASK(t, "priority boosted at %llu\n", now);
+
+	tsk_rt(t)->priority_boosted = 1;
+	tsk_rt(t)->boost_start_time = now;
+
+	if (pedf->scheduled != t) {
+		/* holder may be queued: first stop queue changes */
+		raw_spin_lock(&pedf->domain.release_lock);
+		if (is_queued(t) &&
+		    /* If it is queued, then we need to re-order. */
+		    bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
+		    /* If we bubbled to the top, then we need to check for preemptions. */
+		    edf_preemption_needed(&pedf->domain, pedf->scheduled))
+				preempt(pedf);
+		raw_spin_unlock(&pedf->domain.release_lock);
+	} /* else: nothing to do since the job is not queued while scheduled */
+
+	raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+static void unboost_priority(struct task_struct* t)
+{
+	unsigned long		flags;
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	lt_t			now;
+
+	raw_spin_lock_irqsave(&pedf->slock, flags);
+	now = litmus_clock();
+
+	/* Assumption: this only happens when the job is scheduled.
+	 * Exception: If t transitioned to non-real-time mode, we no longer
+	 * care about it. */
+	BUG_ON(pedf->scheduled != t && is_realtime(t));
+
+	TRACE_TASK(t, "priority restored at %llu\n", now);
+
+	tsk_rt(t)->priority_boosted = 0;
+	tsk_rt(t)->boost_start_time = 0;
+
+	/* check if this changes anything */
+	if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
+		preempt(pedf);
+
+	raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+#endif
+
+static int psnedf_preempt_check(psnedf_domain_t *pedf)
+{
+	if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
+		preempt(pedf);
+		return 1;
+	} else
+		return 0;
+}
+
+/* This check is trivial in partitioned systems as we only have to consider
+ * the CPU of the partition.
+ */
+static int psnedf_check_resched(rt_domain_t *edf)
+{
+	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
+
+	/* because this is a callback from rt_domain_t we already hold
+	 * the necessary lock for the ready queue
+	 */
+	return psnedf_preempt_check(pedf);
+}
+
+static void job_completion(struct task_struct* t, int forced)
+{
+	sched_trace_task_completion(t,forced);
+	TRACE_TASK(t, "job_completion().\n");
+
+	tsk_rt(t)->completed = 0;
+	prepare_for_next_period(t);
+}
+
+static struct task_struct* psnedf_schedule(struct task_struct * prev)
+{
+	psnedf_domain_t* 	pedf = local_pedf;
+	rt_domain_t*		edf  = &pedf->domain;
+	struct task_struct*	next;
+
+	int 			out_of_time, sleep, preempt,
+				np, exists, blocks, resched;
+
+	raw_spin_lock(&pedf->slock);
+
+	/* Sanity checking:
+	 * unlike in GSN-EDF, when a task exits (dead),
+	 * pedf->scheduled may be NULL while prev _is_ real-time.
+	 */
+	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
+	BUG_ON(pedf->scheduled && !is_realtime(prev));
+
+	/* (0) Determine state */
+	exists      = pedf->scheduled != NULL;
+	blocks      = exists && !is_running(pedf->scheduled);
+	out_of_time = exists &&
+				  budget_enforced(pedf->scheduled) &&
+				  budget_exhausted(pedf->scheduled);
+	np 	    = exists && is_np(pedf->scheduled);
+	sleep	    = exists && is_completed(pedf->scheduled);
+	preempt     = edf_preemption_needed(edf, prev);
+
+	/* If we need to preempt do so.
+	 * The following checks set resched to 1 in case of special
+	 * circumstances.
+	 */
+	resched = preempt;
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		resched = 1;
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * Multiple calls to request_exit_np() don't hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep))
+		request_exit_np(pedf->scheduled);
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this.
+	 */
+	if (!np && (out_of_time || sleep) && !blocks) {
+		job_completion(pedf->scheduled, !sleep);
+		resched = 1;
+	}
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * Switch if we are in RT mode and have no task or if we need to
+	 * resched.
+	 */
+	next = NULL;
+	if ((!np || blocks) && (resched || !exists)) {
+		/* When preempting a task that does not block, then
+		 * re-insert it into either the ready queue or the
+		 * release queue (if it completed). requeue() picks
+		 * the appropriate queue.
+		 */
+		if (pedf->scheduled && !blocks)
+			requeue(pedf->scheduled, edf);
+		next = __take_ready(edf);
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	if (next) {
+		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+	} else {
+		TRACE("becoming idle at %llu\n", litmus_clock());
+	}
+
+	pedf->scheduled = next;
+	sched_state_task_picked();
+	raw_spin_unlock(&pedf->slock);
+
+	return next;
+}
+
+
+/*	Prepare a task for running in RT mode
+ */
+static void psnedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+	rt_domain_t* 		edf  = task_edf(t);
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	unsigned long		flags;
+
+	TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
+		   t->rt_param.task_params.cpu);
+
+	/* setup job parameters */
+	release_at(t, litmus_clock());
+
+	/* The task should be running in the queue, otherwise signal
+	 * code will try to wake it up with fatal consequences.
+	 */
+	raw_spin_lock_irqsave(&pedf->slock, flags);
+	if (is_scheduled) {
+		/* there shouldn't be anything else scheduled at the time */
+		BUG_ON(pedf->scheduled);
+		pedf->scheduled = t;
+	} else {
+		/* !is_scheduled means it is not scheduled right now, but it
+		 * does not mean that it is suspended. If it is not suspended,
+		 * it still needs to be requeued. If it is suspended, there is
+		 * nothing that we need to do as it will be handled by the
+		 * wake_up() handler. */
+		if (is_running(t)) {
+			requeue(t, edf);
+			/* maybe we have to reschedule */
+			psnedf_preempt_check(pedf);
+		}
+	}
+	raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+static void psnedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long		flags;
+	psnedf_domain_t* 	pedf = task_pedf(task);
+	rt_domain_t* 		edf  = task_edf(task);
+	lt_t			now;
+
+	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+	raw_spin_lock_irqsave(&pedf->slock, flags);
+	BUG_ON(is_queued(task));
+	now = litmus_clock();
+	if (is_sporadic(task) && is_tardy(task, now)
+#ifdef CONFIG_LITMUS_LOCKING
+	/* We need to take suspensions because of semaphores into
+	 * account! If a job resumes after being suspended due to acquiring
+	 * a semaphore, it should never be treated as a new job release.
+	 */
+	    && !is_priority_boosted(task)
+#endif
+		) {
+		/* new sporadic release */
+		release_at(task, now);
+		sched_trace_task_release(task);
+	}
+
+	/* Only add to ready queue if it is not the currently-scheduled
+	 * task. This could be the case if a task was woken up concurrently
+	 * on a remote CPU before the executing CPU got around to actually
+	 * de-scheduling the task, i.e., wake_up() raced with schedule()
+	 * and won.
+	 */
+	if (pedf->scheduled != task) {
+		requeue(task, edf);
+		psnedf_preempt_check(pedf);
+	}
+
+	raw_spin_unlock_irqrestore(&pedf->slock, flags);
+	TRACE_TASK(task, "wake up done\n");
+}
+
+static void psnedf_task_block(struct task_struct *t)
+{
+	/* only running tasks can block, thus t is in no queue */
+	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
+
+	BUG_ON(!is_realtime(t));
+	BUG_ON(is_queued(t));
+}
+
+static void psnedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+	psnedf_domain_t* 	pedf = task_pedf(t);
+	rt_domain_t*		edf;
+
+	raw_spin_lock_irqsave(&pedf->slock, flags);
+	if (is_queued(t)) {
+		/* dequeue */
+		edf  = task_edf(t);
+		remove(edf, t);
+	}
+	if (pedf->scheduled == t)
+		pedf->scheduled = NULL;
+
+	TRACE_TASK(t, "RIP, now reschedule\n");
+
+	preempt(pedf);
+	raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+#include <litmus/srp.h>
+
+/* ******************** SRP support ************************ */
+
+static unsigned int psnedf_get_srp_prio(struct task_struct* t)
+{
+	return get_rt_relative_deadline(t);
+}
+
+/* ******************** FMLP support ********************** */
+
+/* struct for semaphore with priority inheritance */
+struct fmlp_semaphore {
+	struct litmus_lock litmus_lock;
+
+	/* current resource holder */
+	struct task_struct *owner;
+
+	/* FIFO queue of waiting tasks */
+	wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{
+	return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+int psnedf_fmlp_lock(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	wait_queue_t wait;
+	unsigned long flags;
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition --- not supported by FMLP */
+	if (tsk_rt(t)->num_locks_held ||
+	    tsk_rt(t)->num_local_locks_held)
+		return -EBUSY;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner) {
+		/* resource is not free => must suspend and wait */
+
+		init_waitqueue_entry(&wait, t);
+
+		/* FIXME: interruptible would be nice some day */
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+		TS_LOCK_SUSPEND;
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		/* We depend on the FIFO order.  Thus, we don't need to recheck
+		 * when we wake up; we are guaranteed to have the lock since
+		 * there is only one wake up per release.
+		 */
+
+		schedule();
+
+		TS_LOCK_RESUME;
+
+		/* Since we hold the lock, no other task will change
+		 * ->owner. We can thus check it without acquiring the spin
+		 * lock. */
+		BUG_ON(sem->owner != t);
+	} else {
+		/* it's ours now */
+		sem->owner = t;
+
+		/* mark the task as priority-boosted. */
+		boost_priority(t);
+
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int psnedf_fmlp_unlock(struct litmus_lock* l)
+{
+	struct task_struct *t = current, *next;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	unsigned long flags;
+	int err = 0;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	tsk_rt(t)->num_locks_held--;
+
+	/* we lose the benefit of priority boosting */
+
+	unboost_priority(t);
+
+	/* check if there are jobs waiting for this resource */
+	next = __waitqueue_remove_first(&sem->wait);
+	if (next) {
+		/* boost next job */
+		boost_priority(next);
+
+		/* next becomes the resource holder */
+		sem->owner = next;
+
+		/* wake up next */
+		wake_up_process(next);
+	} else
+		/* resource becomes available */
+		sem->owner = NULL;
+
+out:
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	return err;
+}
+
+int psnedf_fmlp_close(struct litmus_lock* l)
+{
+	struct task_struct *t = current;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	unsigned long flags;
+
+	int owner;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	owner = sem->owner == t;
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	if (owner)
+		psnedf_fmlp_unlock(l);
+
+	return 0;
+}
+
+void psnedf_fmlp_free(struct litmus_lock* lock)
+{
+	kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
+	.close  = psnedf_fmlp_close,
+	.lock   = psnedf_fmlp_lock,
+	.unlock = psnedf_fmlp_unlock,
+	.deallocate = psnedf_fmlp_free,
+};
+
+static struct litmus_lock* psnedf_new_fmlp(void)
+{
+	struct fmlp_semaphore* sem;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	sem->owner   = NULL;
+	init_waitqueue_head(&sem->wait);
+	sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;
+
+	return &sem->litmus_lock;
+}
+
+/* **** lock constructor **** */
+
+
+static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
+				 void* __user unused)
+{
+	int err = -ENXIO;
+	struct srp_semaphore* srp;
+
+	/* PSN-EDF currently supports the SRP for local resources and the FMLP
+	 * for global resources. */
+	switch (type) {
+	case FMLP_SEM:
+		/* Flexible Multiprocessor Locking Protocol */
+		*lock = psnedf_new_fmlp();
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case SRP_SEM:
+		/* Baker's Stack Resource Policy */
+		srp = allocate_srp_semaphore();
+		if (srp) {
+			*lock = &srp->litmus_lock;
+			err = 0;
+		} else
+			err = -ENOMEM;
+		break;
+	};
+
+	return err;
+}
+
+#endif
+
+static struct domain_proc_info psnedf_domain_proc_info;
+static long psnedf_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &psnedf_domain_proc_info;
+	return 0;
+}
+
+static void psnedf_setup_domain_proc(void)
+{
+	int i, cpu;
+	int release_master =
+#ifdef CONFIG_RELEASE_MASTER
+		atomic_read(&release_master_cpu);
+#else
+		NO_CPU;
+#endif
+	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+	struct cd_mapping *cpu_map, *domain_map;
+
+	memset(&psnedf_domain_proc_info, 0, sizeof(psnedf_domain_proc_info));
+	init_domain_proc_info(&psnedf_domain_proc_info, num_rt_cpus, num_rt_cpus);
+	psnedf_domain_proc_info.num_cpus = num_rt_cpus;
+	psnedf_domain_proc_info.num_domains = num_rt_cpus;
+
+	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+		if (cpu == release_master)
+			continue;
+		cpu_map = &psnedf_domain_proc_info.cpu_to_domains[i];
+		domain_map = &psnedf_domain_proc_info.domain_to_cpus[i];
+
+		cpu_map->id = cpu;
+		domain_map->id = i; /* enumerate w/o counting the release master */
+		cpumask_set_cpu(i, cpu_map->mask);
+		cpumask_set_cpu(cpu, domain_map->mask);
+		++i;
+	}
+}
+
+static long psnedf_activate_plugin(void)
+{
+#ifdef CONFIG_RELEASE_MASTER
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
+	}
+#endif
+
+#ifdef CONFIG_LITMUS_LOCKING
+	get_srp_prio = psnedf_get_srp_prio;
+#endif
+
+	psnedf_setup_domain_proc();
+
+	return 0;
+}
+
+static long psnedf_deactivate_plugin(void)
+{
+	destroy_domain_proc_info(&psnedf_domain_proc_info);
+	return 0;
+}
+
+static long psnedf_admit_task(struct task_struct* tsk)
+{
+	if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
+#ifdef CONFIG_RELEASE_MASTER
+	    /* don't allow tasks on release master CPU */
+	     && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
+#endif
+		)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "PSN-EDF",
+	.task_new		= psnedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= psnedf_task_exit,
+	.schedule		= psnedf_schedule,
+	.task_wake_up		= psnedf_task_wake_up,
+	.task_block		= psnedf_task_block,
+	.admit_task		= psnedf_admit_task,
+	.activate_plugin	= psnedf_activate_plugin,
+	.deactivate_plugin	= psnedf_deactivate_plugin,
+	.get_domain_proc_info	= psnedf_get_domain_proc_info,
+#ifdef CONFIG_LITMUS_LOCKING
+	.allocate_lock		= psnedf_allocate_lock,
+#endif
+};
+
+
+static int __init init_psn_edf(void)
+{
+	int i;
+
+	/* We do not really want to support cpu hotplug, do we? ;)
+	 * However, if we are crazy enough to do so,
+	 * we cannot use num_online_cpus()
+	 */
+	for (i = 0; i < num_online_cpus(); i++) {
+		psnedf_domain_init(remote_pedf(i),
+				   psnedf_check_resched,
+				   NULL, i);
+	}
+	return register_sched_plugin(&psn_edf_plugin);
+}
+
+module_init(init_psn_edf);
-- 
1.8.1.2


From 8fcdf62f4db13de12ae638c8e7e3535858fb8d95 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:16:03 +0100
Subject: [PATCH 047/119] Add GSN-EDF scheduler plugin

---
 litmus/Makefile        |    1 +
 litmus/sched_gsn_edf.c | 1069 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 1070 insertions(+)
 create mode 100644 litmus/sched_gsn_edf.c

diff --git a/litmus/Makefile b/litmus/Makefile
index 0db695e..c01ce3e 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -19,6 +19,7 @@ obj-y     = sched_plugin.o litmus.o \
 	    binheap.o \
 	    ctrldev.o \
 	    uncachedev.o \
+	    sched_gsn_edf.o \
 	    sched_psn_edf.o
 
 
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 0000000..9307d0b
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,1069 @@
+/*
+ * litmus/sched_gsn_edf.c
+ *
+ * Implementation of the GSN-EDF scheduling algorithm.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+
+#include <litmus/bheap.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+/* to set up domain/cpu mappings */
+#include <litmus/litmus_proc.h>
+
+#include <linux/module.h>
+
+/* Overview of GSN-EDF operations.
+ *
+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
+ * description only covers how the individual operations are implemented in
+ * LITMUS.
+ *
+ * link_task_to_cpu(T, cpu) 	- Low-level operation to update the linkage
+ *                                structure (NOT the actually scheduled
+ *                                task). If there is another linked task To
+ *                                already it will set To->linked_on = NO_CPU
+ *                                (thereby removing its association with this
+ *                                CPU). However, it will not requeue the
+ *                                previously linked task (if any). It will set
+ *                                T's state to not completed and check whether
+ *                                it is already running somewhere else. If T
+ *                                is scheduled somewhere else it will link
+ *                                it to that CPU instead (and pull the linked
+ *                                task to cpu). T may be NULL.
+ *
+ * unlink(T)			- Unlink removes T from all scheduler data
+ *                                structures. If it is linked to some CPU it
+ *                                will link NULL to that CPU. If it is
+ *                                currently queued in the gsnedf queue it will
+ *                                be removed from the rt_domain. It is safe to
+ *                                call unlink(T) if T is not linked. T may not
+ *                                be NULL.
+ *
+ * requeue(T)			- Requeue will insert T into the appropriate
+ *                                queue. If the system is in real-time mode and
+ *                                T is released already, it will go into the
+ *                                ready queue. If the system is not in
+ *                                real-time mode, then T will go into the
+ *                                release queue. If T's release time is in the
+ *                                future, it will go into the release
+ *                                queue. That means that T's release time/job
+ *                                no/etc. has to be updated before requeue(T) is
+ *                                called. It is not safe to call requeue(T)
+ *                                when T is already queued. T may not be NULL.
+ *
+ * gsnedf_job_arrival(T)	- This is the catch all function when T enters
+ *                                the system after either a suspension or at a
+ *                                job release. It will queue T (which means it
+ *                                is not safe to call gsnedf_job_arrival(T) if
+ *                                T is already queued) and then check whether a
+ *                                preemption is necessary. If a preemption is
+ *                                necessary it will update the linkage
+ *                                accordingly and cause schedule() to be called
+ *                                (either with an IPI or need_resched). It is
+ *                                safe to call gsnedf_job_arrival(T) if T's
+ *                                next job has not been actually released yet
+ *                                (release time in the future). T will be put
+ *                                on the release queue in that case.
+ *
+ * job_completion(T)		- Take care of everything that needs to be done
+ *                                to prepare T for its next release and place
+ *                                it in the right queue with
+ *                                gsnedf_job_arrival().
+ *
+ *
+ * When we know that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
+ * the functions will automatically promote a pending task from the ready queue
+ * to a linked task. This is the job of the calling function (by means of
+ * __take_ready()).
+ */
+
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ */
+typedef struct  {
+	int 			cpu;
+	struct task_struct*	linked;		/* only RT tasks */
+	struct task_struct*	scheduled;	/* only RT tasks */
+	struct bheap_node*	hn;
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
+
+cpu_entry_t* gsnedf_cpus[NR_CPUS];
+
+/* the cpus queue themselves according to priority in here */
+static struct bheap_node gsnedf_heap_node[NR_CPUS];
+static struct bheap      gsnedf_cpu_heap;
+
+static rt_domain_t gsnedf;
+#define gsnedf_lock (gsnedf.ready_lock)
+
+
+/* Uncomment this if you want to see all scheduling decisions in the
+ * TRACE() log.
+#define WANT_ALL_SCHED_EVENTS
+ */
+
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+	cpu_entry_t *a, *b;
+	a = _a->value;
+	b = _b->value;
+	/* Note that a and b are inverted: we want the lowest-priority CPU at
+	 * the top of the heap.
+	 */
+	return edf_higher_prio(b->linked, a->linked);
+}
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ *                       order in the cpu queue. Caller must hold gsnedf lock.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+	if (likely(bheap_node_in_heap(entry->hn)))
+		bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+	bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+}
+
+/* caller must hold gsnedf lock */
+static cpu_entry_t* lowest_prio_cpu(void)
+{
+	struct bheap_node* hn;
+	hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
+	return hn->value;
+}
+
+
+/* link_task_to_cpu - Update the link of a CPU.
+ *                    Handles the case where the to-be-linked task is already
+ *                    scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+				      cpu_entry_t *entry)
+{
+	cpu_entry_t *sched;
+	struct task_struct* tmp;
+	int on_cpu;
+
+	BUG_ON(linked && !is_realtime(linked));
+
+	/* Currently linked task is set to be unlinked. */
+	if (entry->linked) {
+		entry->linked->rt_param.linked_on = NO_CPU;
+	}
+
+	/* Link new task to CPU. */
+	if (linked) {
+		/* handle task is already scheduled somewhere! */
+		on_cpu = linked->rt_param.scheduled_on;
+		if (on_cpu != NO_CPU) {
+			sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
+			/* this should only happen if not linked already */
+			BUG_ON(sched->linked == linked);
+
+			/* If we are already scheduled on the CPU to which we
+			 * wanted to link, we don't need to do the swap --
+			 * we just link ourselves to the CPU and depend on
+			 * the caller to get things right.
+			 */
+			if (entry != sched) {
+				TRACE_TASK(linked,
+					   "already scheduled on %d, updating link.\n",
+					   sched->cpu);
+				tmp = sched->linked;
+				linked->rt_param.linked_on = sched->cpu;
+				sched->linked = linked;
+				update_cpu_position(sched);
+				linked = tmp;
+			}
+		}
+		if (linked) /* might be NULL due to swap */
+			linked->rt_param.linked_on = entry->cpu;
+	}
+	entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+	if (linked)
+		TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+	else
+		TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+	update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ *          where it was linked before. Must hold gsnedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+	cpu_entry_t *entry;
+
+	if (t->rt_param.linked_on != NO_CPU) {
+		/* unlink */
+		entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
+		t->rt_param.linked_on = NO_CPU;
+		link_task_to_cpu(NULL, entry);
+	} else if (is_queued(t)) {
+		/* This is an interesting situation: t is scheduled,
+		 * but was just recently unlinked.  It cannot be
+		 * linked anywhere else (because then it would have
+		 * been relinked to this CPU), thus it must be in some
+		 * queue. We must remove it from the list in this
+		 * case.
+		 */
+		remove(&gsnedf, t);
+	}
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static void preempt(cpu_entry_t *entry)
+{
+	preempt_if_preemptable(entry->scheduled, entry->cpu);
+}
+
+/* requeue - Put an unlinked task into gsn-edf domain.
+ *           Caller must hold gsnedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+	BUG_ON(!task);
+	/* sanity check before insertion */
+	BUG_ON(is_queued(task));
+
+	if (is_early_releasing(task) || is_released(task, litmus_clock()))
+		__add_ready(&gsnedf, task);
+	else {
+		/* it has got to wait */
+		add_release(&gsnedf, task);
+	}
+}
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
+{
+	cpu_entry_t *affinity;
+
+	get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
+#ifdef CONFIG_RELEASE_MASTER
+			gsnedf.release_master
+#else
+			NO_CPU
+#endif
+			);
+
+	return(affinity);
+}
+#endif
+
+/* check for any necessary preemptions */
+static void check_for_preemptions(void)
+{
+	struct task_struct *task;
+	cpu_entry_t *last;
+
+
+#ifdef CONFIG_PREFER_LOCAL_LINKING
+	cpu_entry_t *local;
+
+	/* Before linking to other CPUs, check first whether the local CPU is
+	 * idle. */
+	local = &__get_cpu_var(gsnedf_cpu_entries);
+	task  = __peek_ready(&gsnedf);
+
+	if (task && !local->linked
+#ifdef CONFIG_RELEASE_MASTER
+	    && likely(local->cpu != gsnedf.release_master)
+#endif
+		) {
+		task = __take_ready(&gsnedf);
+		TRACE_TASK(task, "linking to local CPU %d to avoid IPI\n", local->cpu);
+		link_task_to_cpu(task, local);
+		preempt(local);
+	}
+#endif
+
+	for (last = lowest_prio_cpu();
+	     edf_preemption_needed(&gsnedf, last->linked);
+	     last = lowest_prio_cpu()) {
+		/* preemption necessary */
+		task = __take_ready(&gsnedf);
+		TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+		      task->pid, last->cpu);
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+		{
+			cpu_entry_t *affinity =
+					gsnedf_get_nearest_available_cpu(
+						&per_cpu(gsnedf_cpu_entries, task_cpu(task)));
+			if (affinity)
+				last = affinity;
+			else if (requeue_preempted_job(last->linked))
+				requeue(last->linked);
+		}
+#else
+		if (requeue_preempted_job(last->linked))
+			requeue(last->linked);
+#endif
+
+		link_task_to_cpu(task, last);
+		preempt(last);
+	}
+}
+
+/* gsnedf_job_arrival: task is either resumed or released */
+static noinline void gsnedf_job_arrival(struct task_struct* task)
+{
+	BUG_ON(!task);
+
+	requeue(task);
+	check_for_preemptions();
+}
+
+static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&gsnedf_lock, flags);
+
+	__merge_ready(rt, tasks);
+	check_for_preemptions();
+
+	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+/* caller holds gsnedf_lock */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+	BUG_ON(!t);
+
+	sched_trace_task_completion(t, forced);
+
+	TRACE_TASK(t, "job_completion().\n");
+
+	/* set flags */
+	tsk_rt(t)->completed = 0;
+	/* prepare for next period */
+	prepare_for_next_period(t);
+	if (is_early_releasing(t) || is_released(t, litmus_clock()))
+		sched_trace_task_release(t);
+	/* unlink */
+	unlink(t);
+	/* requeue
+	 * But don't requeue a blocking task. */
+	if (is_running(t))
+		gsnedf_job_arrival(t);
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *	- scheduled->timeslice == 0	// the job completed (forcefully)
+ *	- is_completed()		// the job completed (by syscall)
+ * 	- linked != scheduled		// we need to reschedule (for any reason)
+ * 	- is_np(scheduled)		// rescheduling must be delayed,
+ *					   sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* gsnedf_schedule(struct task_struct * prev)
+{
+	cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+	int out_of_time, sleep, preempt, np, exists, blocks;
+	struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+	/* Bail out early if we are the release master.
+	 * The release master never schedules any real-time tasks.
+	 */
+	if (unlikely(gsnedf.release_master == entry->cpu)) {
+		sched_state_task_picked();
+		return NULL;
+	}
+#endif
+
+	raw_spin_lock(&gsnedf_lock);
+
+	/* sanity checking */
+	BUG_ON(entry->scheduled && entry->scheduled != prev);
+	BUG_ON(entry->scheduled && !is_realtime(prev));
+	BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+	/* (0) Determine state */
+	exists      = entry->scheduled != NULL;
+	blocks      = exists && !is_running(entry->scheduled);
+	out_of_time = exists && budget_enforced(entry->scheduled)
+		&& budget_exhausted(entry->scheduled);
+	np 	    = exists && is_np(entry->scheduled);
+	sleep	    = exists && is_completed(entry->scheduled);
+	preempt     = entry->scheduled != entry->linked;
+
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
+#endif
+
+	if (exists)
+		TRACE_TASK(prev,
+			   "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+			   "state:%d sig:%d\n",
+			   blocks, out_of_time, np, sleep, preempt,
+			   prev->state, signal_pending(prev));
+	if (entry->linked && preempt)
+		TRACE_TASK(prev, "will be preempted by %s/%d\n",
+			   entry->linked->comm, entry->linked->pid);
+
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		unlink(entry->scheduled);
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * We need to make sure to update the link structure anyway in case
+	 * that we are still linked. Multiple calls to request_exit_np() don't
+	 * hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep)) {
+		unlink(entry->scheduled);
+		request_exit_np(entry->scheduled);
+	}
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this. Don't do a job completion if we block (can't have timers running
+	 * for blocked jobs).
+	 */
+	if (!np && (out_of_time || sleep) && !blocks)
+		job_completion(entry->scheduled, !sleep);
+
+	/* Link pending task if we became unlinked.
+	 */
+	if (!entry->linked)
+		link_task_to_cpu(__take_ready(&gsnedf), entry);
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * If linked is different from scheduled, then select linked as next.
+	 */
+	if ((!np || blocks) &&
+	    entry->linked != entry->scheduled) {
+		/* Schedule a linked job? */
+		if (entry->linked) {
+			entry->linked->rt_param.scheduled_on = entry->cpu;
+			next = entry->linked;
+			TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id());
+		}
+		if (entry->scheduled) {
+			/* not gonna be scheduled soon */
+			entry->scheduled->rt_param.scheduled_on = NO_CPU;
+			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+		}
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	sched_state_task_picked();
+
+	raw_spin_unlock(&gsnedf_lock);
+
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE("gsnedf_lock released, next=0x%p\n", next);
+
+	if (next)
+		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+	else if (exists && !next)
+		TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+
+
+	return next;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void gsnedf_finish_switch(struct task_struct *prev)
+{
+	cpu_entry_t* 	entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+	entry->scheduled = is_realtime(current) ? current : NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+
+
+/*	Prepare a task for running in RT mode
+ */
+static void gsnedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+	unsigned long 		flags;
+	cpu_entry_t* 		entry;
+
+	TRACE("gsn edf: task new %d\n", t->pid);
+
+	raw_spin_lock_irqsave(&gsnedf_lock, flags);
+
+	/* setup job params */
+	release_at(t, litmus_clock());
+
+	if (is_scheduled) {
+		entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
+		BUG_ON(entry->scheduled);
+
+#ifdef CONFIG_RELEASE_MASTER
+		if (entry->cpu != gsnedf.release_master) {
+#endif
+			entry->scheduled = t;
+			tsk_rt(t)->scheduled_on = task_cpu(t);
+#ifdef CONFIG_RELEASE_MASTER
+		} else {
+			/* do not schedule on release master */
+			preempt(entry); /* force resched */
+			tsk_rt(t)->scheduled_on = NO_CPU;
+		}
+#endif
+	} else {
+		t->rt_param.scheduled_on = NO_CPU;
+	}
+	t->rt_param.linked_on          = NO_CPU;
+
+	if (is_running(t))
+		gsnedf_job_arrival(t);
+	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long flags;
+	lt_t now;
+
+	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+
+	raw_spin_lock_irqsave(&gsnedf_lock, flags);
+	now = litmus_clock();
+	if (is_sporadic(task) && is_tardy(task, now)) {
+		/* new sporadic release */
+		release_at(task, now);
+		sched_trace_task_release(task);
+	}
+	gsnedf_job_arrival(task);
+	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_block(struct task_struct *t)
+{
+	unsigned long flags;
+
+	TRACE_TASK(t, "block at %llu\n", litmus_clock());
+
+	/* unlink if necessary */
+	raw_spin_lock_irqsave(&gsnedf_lock, flags);
+	unlink(t);
+	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+}
+
+
+static void gsnedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+
+	/* unlink if necessary */
+	raw_spin_lock_irqsave(&gsnedf_lock, flags);
+	unlink(t);
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
+		tsk_rt(t)->scheduled_on = NO_CPU;
+	}
+	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+
+
+static long gsnedf_admit_task(struct task_struct* tsk)
+{
+	return 0;
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+
+/* Let t inherit the priority of prio_inh. Called with IRQs off. */
+static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
+{
+	int linked_on;
+	int check_preempt = 0;
+
+	raw_spin_lock(&gsnedf_lock);
+
+	TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid);
+	tsk_rt(t)->inh_task = prio_inh;
+
+	linked_on  = tsk_rt(t)->linked_on;
+
+	/* If it is linked to a CPU, then we need to reorder the CPU heap. */
+	if (linked_on != NO_CPU) {
+		TRACE_TASK(t, "%s: linked  on %d\n",
+			   __FUNCTION__, linked_on);
+		/* Holder is linked; need to re-order CPUs.
+		 * We can't use bheap_decrease() here since
+		 * the cpu_heap is ordered in reverse direction, so
+		 * it is actually an increase. */
+		bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
+			    gsnedf_cpus[linked_on]->hn);
+		bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
+			    gsnedf_cpus[linked_on]->hn);
+	} else {
+		/* holder may be queued: first stop queue changes */
+		raw_spin_lock(&gsnedf.release_lock);
+		if (is_queued(t)) {
+			TRACE_TASK(t, "%s: is queued\n",
+				   __FUNCTION__);
+			/* We need to update the position of holder in some
+			 * heap. Note that this could be a release heap if
+			 * budget enforcement is used and this job overran. */
+			check_preempt =
+				!bheap_decrease(edf_ready_order,
+					       tsk_rt(t)->heap_node);
+		} else {
+			/* Nothing to do: if it is not queued and not linked
+			 * then it is either sleeping or currently being moved
+			 * by other code (e.g., a timer interrupt handler) that
+			 * will use the correct priority when enqueuing the
+			 * task. */
+			TRACE_TASK(t, "%s: is NOT queued => Done.\n",
+				   __FUNCTION__);
+		}
+		raw_spin_unlock(&gsnedf.release_lock);
+
+		/* If holder was enqueued in a release heap, then the following
+		 * preemption check is pointless, but we can't easily detect
+		 * that case. If you want to fix this, then consider that
+		 * simply adding a state flag requires O(n) time to update when
+		 * releasing n tasks, which conflicts with the goal to have
+		 * O(log n) merges. */
+		if (check_preempt) {
+			/* bheap_decrease() hit the top level of the heap: make
+			 * sure preemption checks get the right task, not the
+			 * potentially stale cache. */
+			bheap_uncache_min(edf_ready_order,
+					 &gsnedf.ready_queue);
+			check_for_preemptions();
+		}
+	}
+
+	raw_spin_unlock(&gsnedf_lock);
+}
+
+/* called with IRQs off */
+static void clear_priority_inheritance(struct task_struct* t)
+{
+	raw_spin_lock(&gsnedf_lock);
+
+	/* A job only stops inheriting a priority when it releases a
+	 * resource. Thus we can make the following assumption.*/
+	BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
+
+	TRACE_TASK(t, "priority restored\n");
+	tsk_rt(t)->inh_task = NULL;
+
+	/* Check if rescheduling is necessary. We can't use heap_decrease()
+	 * since the priority was effectively lowered. */
+	unlink(t);
+	gsnedf_job_arrival(t);
+
+	raw_spin_unlock(&gsnedf_lock);
+}
+
+
+/* ******************** FMLP support ********************** */
+
+/* struct for semaphore with priority inheritance */
+struct fmlp_semaphore {
+	struct litmus_lock litmus_lock;
+
+	/* current resource holder */
+	struct task_struct *owner;
+
+	/* highest-priority waiter */
+	struct task_struct *hp_waiter;
+
+	/* FIFO queue of waiting tasks */
+	wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{
+	return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+
+/* caller is responsible for locking */
+struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem,
+				   struct task_struct* skip)
+{
+	struct list_head	*pos;
+	struct task_struct 	*queued, *found = NULL;
+
+	list_for_each(pos, &sem->wait.task_list) {
+		queued  = (struct task_struct*) list_entry(pos, wait_queue_t,
+							   task_list)->private;
+
+		/* Compare task prios, find high prio task. */
+		if (queued != skip && edf_higher_prio(queued, found))
+			found = queued;
+	}
+	return found;
+}
+
+int gsnedf_fmlp_lock(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	wait_queue_t wait;
+	unsigned long flags;
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition --- not supported by FMLP */
+	if (tsk_rt(t)->num_locks_held)
+		return -EBUSY;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner) {
+		/* resource is not free => must suspend and wait */
+
+		init_waitqueue_entry(&wait, t);
+
+		/* FIXME: interruptible would be nice some day */
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+		/* check if we need to activate priority inheritance */
+		if (edf_higher_prio(t, sem->hp_waiter)) {
+			sem->hp_waiter = t;
+			if (edf_higher_prio(t, sem->owner))
+				set_priority_inheritance(sem->owner, sem->hp_waiter);
+		}
+
+		TS_LOCK_SUSPEND;
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		/* We depend on the FIFO order.  Thus, we don't need to recheck
+		 * when we wake up; we are guaranteed to have the lock since
+		 * there is only one wake up per release.
+		 */
+
+		schedule();
+
+		TS_LOCK_RESUME;
+
+		/* Since we hold the lock, no other task will change
+		 * ->owner. We can thus check it without acquiring the spin
+		 * lock. */
+		BUG_ON(sem->owner != t);
+	} else {
+		/* it's ours now */
+		sem->owner = t;
+
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int gsnedf_fmlp_unlock(struct litmus_lock* l)
+{
+	struct task_struct *t = current, *next;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	unsigned long flags;
+	int err = 0;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	/* only the current owner may release the semaphore */
+	if (sem->owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	tsk_rt(t)->num_locks_held--;
+
+	/* check if there are jobs waiting for this resource */
+	next = __waitqueue_remove_first(&sem->wait);
+	if (next) {
+		/* next becomes the resource holder */
+		sem->owner = next;
+		TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
+
+		/* determine new hp_waiter if necessary */
+		if (next == sem->hp_waiter) {
+			TRACE_TASK(next, "was highest-prio waiter\n");
+			/* next has the highest priority --- it doesn't need to
+			 * inherit.  However, we need to make sure that the
+			 * next-highest priority in the queue is reflected in
+			 * hp_waiter. */
+			sem->hp_waiter = find_hp_waiter(sem, next);
+			if (sem->hp_waiter)
+				TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n");
+			else
+				TRACE("no further waiters\n");
+		} else {
+			/* Well, if next is not the highest-priority waiter,
+			 * then it ought to inherit the highest-priority
+			 * waiter's priority. */
+			set_priority_inheritance(next, sem->hp_waiter);
+		}
+
+		/* wake up next */
+		wake_up_process(next);
+	} else
+		/* becomes available */
+		sem->owner = NULL;
+
+	/* we lose the benefit of priority inheritance (if any) */
+	if (tsk_rt(t)->inh_task)
+		clear_priority_inheritance(t);
+
+out:
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	return err;
+}
+
+int gsnedf_fmlp_close(struct litmus_lock* l)
+{
+	struct task_struct *t = current;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	unsigned long flags;
+
+	int owner;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	owner = sem->owner == t;
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	if (owner)
+		gsnedf_fmlp_unlock(l);
+
+	return 0;
+}
+
+void gsnedf_fmlp_free(struct litmus_lock* lock)
+{
+	kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
+	.close  = gsnedf_fmlp_close,
+	.lock   = gsnedf_fmlp_lock,
+	.unlock = gsnedf_fmlp_unlock,
+	.deallocate = gsnedf_fmlp_free,
+};
+
+static struct litmus_lock* gsnedf_new_fmlp(void)
+{
+	struct fmlp_semaphore* sem;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	sem->owner   = NULL;
+	sem->hp_waiter = NULL;
+	init_waitqueue_head(&sem->wait);
+	sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops;
+
+	return &sem->litmus_lock;
+}
+
+/* **** lock constructor **** */
+
+
+static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
+				 void* __user unused)
+{
+	int err = -ENXIO;
+
+	/* GSN-EDF currently only supports the FMLP for global resources. */
+	switch (type) {
+
+	case FMLP_SEM:
+		/* Flexible Multiprocessor Locking Protocol */
+		*lock = gsnedf_new_fmlp();
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	};
+
+	return err;
+}
+
+#endif
+
+static struct domain_proc_info gsnedf_domain_proc_info;
+static long gsnedf_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &gsnedf_domain_proc_info;
+	return 0;
+}
+
+/* Populate gsnedf_domain_proc_info: domain 0 holds all num_rt_cpus CPUs. */
+static void gsnedf_setup_domain_proc(void)
+{
+	int i, cpu;
+	int release_master =
+#ifdef CONFIG_RELEASE_MASTER
+			atomic_read(&release_master_cpu);
+#else
+		NO_CPU;
+#endif
+	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+	struct cd_mapping *map;
+
+	memset(&gsnedf_domain_proc_info, 0, sizeof(gsnedf_domain_proc_info));
+	init_domain_proc_info(&gsnedf_domain_proc_info, num_rt_cpus, 1);
+	gsnedf_domain_proc_info.num_cpus = num_rt_cpus;
+	gsnedf_domain_proc_info.num_domains = 1;
+
+	gsnedf_domain_proc_info.domain_to_cpus[0].id = 0;
+	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+		if (cpu == release_master)
+			continue;
+		map = &gsnedf_domain_proc_info.cpu_to_domains[i];
+		map->id = cpu;
+		cpumask_set_cpu(0, map->mask);
+		++i;
+		/* add cpu to the domain */
+		cpumask_set_cpu(cpu,
+			gsnedf_domain_proc_info.domain_to_cpus[0].mask);
+	}
+}
+
+static long gsnedf_activate_plugin(void)
+{
+	int cpu;
+	cpu_entry_t *entry;
+
+	bheap_init(&gsnedf_cpu_heap);
+#ifdef CONFIG_RELEASE_MASTER
+	gsnedf.release_master = atomic_read(&release_master_cpu);
+#endif
+
+	for_each_online_cpu(cpu) {
+		entry = &per_cpu(gsnedf_cpu_entries, cpu);
+		bheap_node_init(&entry->hn, entry);
+		entry->linked    = NULL;
+		entry->scheduled = NULL;
+#ifdef CONFIG_RELEASE_MASTER
+		if (cpu != gsnedf.release_master) {
+#endif
+			TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
+			update_cpu_position(entry);
+#ifdef CONFIG_RELEASE_MASTER
+		} else {
+			TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
+		}
+#endif
+	}
+
+	gsnedf_setup_domain_proc();
+
+	return 0;
+}
+
+static long gsnedf_deactivate_plugin(void)
+{
+	destroy_domain_proc_info(&gsnedf_domain_proc_info);
+	return 0;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "GSN-EDF",
+	.finish_switch		= gsnedf_finish_switch,
+	.task_new		= gsnedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= gsnedf_task_exit,
+	.schedule		= gsnedf_schedule,
+	.task_wake_up		= gsnedf_task_wake_up,
+	.task_block		= gsnedf_task_block,
+	.admit_task		= gsnedf_admit_task,
+	.activate_plugin	= gsnedf_activate_plugin,
+	.deactivate_plugin	= gsnedf_deactivate_plugin,
+	.get_domain_proc_info	= gsnedf_get_domain_proc_info,
+#ifdef CONFIG_LITMUS_LOCKING
+	.allocate_lock		= gsnedf_allocate_lock,
+#endif
+};
+
+
+static int __init init_gsn_edf(void)
+{
+	int cpu;
+	cpu_entry_t *entry;
+
+	bheap_init(&gsnedf_cpu_heap);
+	/* initialize CPU state */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
+		entry = &per_cpu(gsnedf_cpu_entries, cpu);
+		gsnedf_cpus[cpu] = entry;
+		entry->cpu 	 = cpu;
+		entry->hn        = &gsnedf_heap_node[cpu];
+		bheap_node_init(&entry->hn, entry);
+	}
+	edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
+	return register_sched_plugin(&gsn_edf_plugin);
+}
+
+
+module_init(init_gsn_edf);
-- 
1.8.1.2


From 7cf2307d2c200a960c9e54839ba2134730adda52 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:17:08 +0100
Subject: [PATCH 048/119] Add P-FP scheduler plugin

---
 litmus/Makefile    |    4 +-
 litmus/fp_common.c |   17 +-
 litmus/sched_pfp.c | 2013 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2028 insertions(+), 6 deletions(-)
 create mode 100644 litmus/sched_pfp.c

diff --git a/litmus/Makefile b/litmus/Makefile
index c01ce3e..2d2e0a5 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -20,7 +20,9 @@ obj-y     = sched_plugin.o litmus.o \
 	    ctrldev.o \
 	    uncachedev.o \
 	    sched_gsn_edf.o \
-	    sched_psn_edf.o
+	    sched_psn_edf.o \
+	    sched_pfp.o
+
 
 
 obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
index 964a472..ff0f30a 100644
--- a/litmus/fp_common.c
+++ b/litmus/fp_common.c
@@ -32,7 +32,6 @@ int fp_higher_prio(struct task_struct* first,
 		return 0;
 	}
 
-
 	/* check for NULL tasks */
 	if (!first || !second)
 		return first && !second;
@@ -50,6 +49,15 @@ int fp_higher_prio(struct task_struct* first,
 	if (unlikely(second->rt_param.inh_task))
 		second_task = second->rt_param.inh_task;
 
+	/* Comparisons to itself are only possible with
+	 * priority inheritance when svc_preempt interrupt just
+	 * before scheduling (and everything that could follow in the
+	 * ready queue). Always favour the original job, as that one will just
+	 * suspend itself to resolve this.
+	 */
+	if(first_task == second_task)
+		return first_task == first;
+
 	/* Check for priority boosting. Tie-break by start of boosting.
 	 */
 	if (unlikely(is_priority_boosted(first_task))) {
@@ -65,11 +73,10 @@ int fp_higher_prio(struct task_struct* first,
 		/* second_task is boosted, first is not*/
 		return 0;
 
-#endif
-
-	/* Comparisons to itself are not expected; priority inheritance
-	 * should also not cause this to happen. */
+#else
+	/* No locks, no priority inheritance, no comparisons to itself */
 	BUG_ON(first_task == second_task);
+#endif
 
 	if (get_priority(first_task) < get_priority(second_task))
 		return 1;
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
new file mode 100644
index 0000000..af7de76
--- /dev/null
+++ b/litmus/sched_pfp.c
@@ -0,0 +1,2013 @@
+/*
+ * litmus/sched_pfp.c
+ *
+ * Implementation of partitioned fixed-priority scheduling.
+ * Based on PSN-EDF.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/wait.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/fp_common.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+#include <litmus/budget.h>
+
+/* to set up domain/cpu mappings */
+#include <litmus/litmus_proc.h>
+#include <linux/uaccess.h>
+
+
+typedef struct {
+	rt_domain_t 		domain; /* holds the release queue and ready_lock */
+	struct fp_prio_queue	ready_queue; /* released tasks, indexed by priority_index() */
+	int          		cpu; /* CPU (partition) this domain serves */
+	struct task_struct* 	scheduled; /* only RT tasks; NULL if none is scheduled */
+/*
+ * scheduling lock slock (a macro alias for domain.ready_lock)
+ * protects the domain and serializes scheduling decisions
+ */
+#define slock domain.ready_lock
+
+} pfp_domain_t;
+
+DEFINE_PER_CPU(pfp_domain_t, pfp_domains); /* one P-FP domain per CPU */
+
+pfp_domain_t* pfp_doms[NR_CPUS]; /* NOTE(review): not populated in this part of the file */
+
+#define local_pfp		(&__get_cpu_var(pfp_domains)) /* domain of the executing CPU */
+#define remote_dom(cpu)		(&per_cpu(pfp_domains, cpu).domain)
+#define remote_pfp(cpu)	(&per_cpu(pfp_domains, cpu))
+#define task_dom(task)		remote_dom(get_partition(task))
+#define task_pfp(task)		remote_pfp(get_partition(task))
+
+/* per-CPU counter that fmlp_clock() reads and post-increments */
+#ifdef CONFIG_LITMUS_LOCKING
+DEFINE_PER_CPU(uint64_t,fmlp_timestamp);
+#endif
+
+/* request preemption of pfp->scheduled on pfp->cpu; we assume the lock is being held */
+static void preempt(pfp_domain_t *pfp)
+{
+	preempt_if_preemptable(pfp->scheduled, pfp->cpu);
+}
+
+static unsigned int priority_index(struct task_struct* t)
+{	/* ready-queue index of t: 0 if priority-boosted, else its (possibly inherited) priority */
+#ifdef CONFIG_LITMUS_LOCKING
+	if (unlikely(t->rt_param.inh_task))
+		/* use effective priority, i.e., that of the task we inherit from */
+		t = t->rt_param.inh_task;
+
+	if (is_priority_boosted(t)) {
+		/* zero is reserved for priority-boosted tasks */
+		return 0;
+	} else /* dangling else: governs the return below when locking is compiled in */
+#endif
+		return get_priority(t);
+}
+
+static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{	/* release callback registered by pfp_domain_init(): queue all tasks in the heap */
+	pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain);
+	unsigned long flags;
+	struct task_struct* t;
+	struct bheap_node* hn;
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+
+	while (!bheap_empty(tasks)) { /* move every released task to the ready queue */
+		hn = bheap_take(fp_ready_order, tasks);
+		t = bheap2task(hn);
+		TRACE_TASK(t, "released (part:%d prio:%d)\n",
+			   get_partition(t), get_priority(t));
+		fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+	}
+
+	/* do we need to preempt? (head of ready queue vs. currently scheduled task) */
+	if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) {
+		TRACE_CUR("preempted by new release\n");
+		preempt(pfp);
+	}
+
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void pfp_preempt_check(pfp_domain_t *pfp)
+{	/* preempt if the head of the ready queue beats the scheduled task */
+	if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
+		preempt(pfp);
+}
+
+static void pfp_domain_init(pfp_domain_t* pfp,
+			       int cpu)
+{	/* initialize the domain for the given CPU: nothing scheduled, empty ready queue */
+	fp_domain_init(&pfp->domain, NULL, pfp_release_jobs); /* releases go to pfp_release_jobs() */
+	pfp->cpu      		= cpu;
+	pfp->scheduled		= NULL;
+	fp_prio_queue_init(&pfp->ready_queue);
+}
+
+static void requeue(struct task_struct* t, pfp_domain_t *pfp)
+{	/* queue a runnable task: ready queue if already released, else release queue */
+	BUG_ON(!is_running(t));
+
+	tsk_rt(t)->completed = 0; /* clear the completion flag before queueing */
+	if (is_released(t, litmus_clock()))
+		fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+	else
+		add_release(&pfp->domain, t); /* it has got to wait */
+}
+
+static void job_completion(struct task_struct* t, int forced)
+{	/* end t's current job and prepare the next; forced is passed on to the trace record */
+	sched_trace_task_completion(t,forced);
+	TRACE_TASK(t, "job_completion().\n");
+
+	tsk_rt(t)->completed = 0;
+	prepare_for_next_period(t);
+	if (is_released(t, litmus_clock())) /* next job is already released: trace it now */
+		sched_trace_task_release(t);
+}
+
+static struct task_struct* pfp_schedule(struct task_struct * prev)
+{	/* decide what the local partition runs next; returns that RT task or NULL */
+	pfp_domain_t* 	pfp = local_pfp;
+	struct task_struct*	next;
+
+	int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
+
+	raw_spin_lock(&pfp->slock);
+
+	/* sanity checking
+	 * differently from gedf, when a task exits (dead)
+	 * pfp->scheduled may be NULL and prev _is_ realtime
+	 */
+	BUG_ON(pfp->scheduled && pfp->scheduled != prev);
+	BUG_ON(pfp->scheduled && !is_realtime(prev));
+
+	/* (0) Determine state */
+	exists      = pfp->scheduled != NULL;
+	blocks      = exists && !is_running(pfp->scheduled);
+	out_of_time = exists &&
+				  budget_enforced(pfp->scheduled) &&
+				  budget_exhausted(pfp->scheduled);
+	np 	    = exists && is_np(pfp->scheduled);
+	sleep	    = exists && is_completed(pfp->scheduled);
+	migrate     = exists && get_partition(pfp->scheduled) != pfp->cpu; /* now assigned elsewhere */
+	preempt     = !blocks && (migrate || fp_preemption_needed(&pfp->ready_queue, prev));
+
+	/* If we need to preempt do so.
+	 * The following checks set resched to 1 in case of special
+	 * circumstances.
+	 */
+	resched = preempt;
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		resched = 1;
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * Multiple calls to request_exit_np() don't hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep))
+		request_exit_np(pfp->scheduled);
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this.
+	 */
+	if (!np && (out_of_time || sleep) && !blocks && !migrate) {
+		job_completion(pfp->scheduled, !sleep); /* forced iff the job did not complete itself */
+		resched = 1;
+	}
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * Switch if we are in RT mode and have no task or if we need to
+	 * resched.
+	 */
+	next = NULL;
+	if ((!np || blocks) && (resched || !exists)) {
+		/* When preempting a task that does not block, then
+		 * re-insert it into either the ready queue or the
+		 * release queue (if it completed). requeue() picks
+		 * the appropriate queue.
+		 */
+		if (pfp->scheduled && !blocks  && !migrate)
+			requeue(pfp->scheduled, pfp);
+		next = fp_prio_take(&pfp->ready_queue);
+		if (next == prev) { /* debug output only: dump state when prev is picked again */
+			struct task_struct *t = fp_prio_peek(&pfp->ready_queue);
+			TRACE_TASK(next, "next==prev sleep=%d oot=%d np=%d preempt=%d migrate=%d "
+				   "boost=%d empty=%d prio-idx=%u prio=%u\n",
+				   sleep, out_of_time, np, preempt, migrate,
+				   is_priority_boosted(next),
+				   t == NULL,
+				   priority_index(next),
+				   get_priority(next));
+			if (t)
+				TRACE_TASK(t, "waiter boost=%d prio-idx=%u prio=%u\n",
+					   is_priority_boosted(t),
+					   priority_index(t),
+					   get_priority(t));
+		}
+		/* If preempt is set, we should not see the same task again. */
+		BUG_ON(preempt && next == prev);
+		/* Similarly, if preempt is set, then next may not be NULL,
+		 * unless it's a migration. */
+		BUG_ON(preempt && !migrate && next == NULL);
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	if (next) {
+		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+	} else {
+		TRACE("becoming idle at %llu\n", litmus_clock());
+	}
+
+	pfp->scheduled = next; /* record the decision while still holding slock */
+	sched_state_task_picked();
+	raw_spin_unlock(&pfp->slock);
+
+	return next;
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+/* prev is no longer scheduled --- if its partition is another CPU, queue it there */
+static void pfp_finish_switch(struct task_struct *prev)
+{
+	pfp_domain_t *to;
+
+	if (is_realtime(prev) &&
+	    is_running(prev) &&
+	    get_partition(prev) != smp_processor_id()) {
+		TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
+			   smp_processor_id(), get_partition(prev));
+
+		to = task_pfp(prev); /* domain of the target partition */
+
+		raw_spin_lock(&to->slock);
+
+		TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
+		requeue(prev, to);
+		if (fp_preemption_needed(&to->ready_queue, to->scheduled))
+			preempt(to); /* the migrated task may preempt the remote CPU */
+
+		raw_spin_unlock(&to->slock);
+
+	}
+}
+
+#endif
+
+/*	Prepare a task for running in RT mode: release its first job now and
+ *	either record it as scheduled or queue it in its partition's domain. */
+static void pfp_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+	pfp_domain_t* 	pfp = task_pfp(t);
+	unsigned long		flags;
+
+	TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
+		   t->rt_param.task_params.cpu);
+
+	/* setup job parameters */
+	release_at(t, litmus_clock());
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+	if (is_scheduled) {
+		/* there shouldn't be anything else running at the time */
+		BUG_ON(pfp->scheduled);
+		pfp->scheduled = t;
+	} else if (is_running(t)) {
+		requeue(t, pfp);
+		/* maybe we have to reschedule */
+		pfp_preempt_check(pfp);
+	}
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void pfp_task_wake_up(struct task_struct *task)
+{	/* a task resumes: possibly start a new sporadic job, then queue it */
+	unsigned long		flags;
+	pfp_domain_t*		pfp = task_pfp(task);
+	lt_t			now;
+
+	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+
+#ifdef CONFIG_LITMUS_LOCKING
+	/* Should only be queued when processing a fake-wake up due to a
+	 * migration-related state change. */
+	if (unlikely(is_queued(task))) {
+		TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
+		goto out_unlock;
+	}
+#else
+	BUG_ON(is_queued(task));
+#endif
+	now = litmus_clock();
+	if (is_sporadic(task) && is_tardy(task, now)
+#ifdef CONFIG_LITMUS_LOCKING
+	/* We need to take suspensions because of semaphores into
+	 * account! If a job resumes after being suspended due to acquiring
+	 * a semaphore, it should never be treated as a new job release.
+	 */
+	    && !is_priority_boosted(task)
+#endif
+		) {
+		/* new sporadic release */
+		release_at(task, now);
+		sched_trace_task_release(task);
+	}
+
+	/* Only add to ready queue if it is not the currently-scheduled
+	 * task. This could be the case if a task was woken up concurrently
+	 * on a remote CPU before the executing CPU got around to actually
+	 * de-scheduling the task, i.e., wake_up() raced with schedule()
+	 * and won. Also, don't requeue if it is still queued, which can
+	 * happen under the DPCP due to wake-ups racing with migrations.
+	 */
+	if (pfp->scheduled != task) {
+		requeue(task, pfp);
+		pfp_preempt_check(pfp);
+	}
+
+#ifdef CONFIG_LITMUS_LOCKING
+out_unlock:
+#endif
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+	TRACE_TASK(task, "wake up done\n");
+}
+
+static void pfp_task_block(struct task_struct *t)
+{	/* t suspends: only traces and sanity-checks, no queue is modified here */
+	/* only running tasks can block, thus t is in no queue */
+	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
+
+	BUG_ON(!is_realtime(t));
+
+	/* If this task blocked normally, it shouldn't be queued. The exception is
+	 * if this is a simulated block()/wakeup() pair from the pull-migration code path.
+	 * This should only happen if the DPCP is being used.
+	 */
+#ifdef CONFIG_LITMUS_LOCKING
+	if (unlikely(is_queued(t)))
+		TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
+#else
+	BUG_ON(is_queued(t));
+#endif
+}
+
+static void pfp_task_exit(struct task_struct * t)
+{	/* t leaves RT mode: forget it as the scheduled task and trigger a reschedule */
+	unsigned long flags;
+	pfp_domain_t* 	pfp = task_pfp(t);
+	rt_domain_t*		dom;
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+	if (is_queued(t)) {
+		BUG(); /* This currently doesn't work. */
+		/* dequeue (only reached if BUG() returns) */
+		dom  = task_dom(t);
+		remove(dom, t);
+	}
+	if (pfp->scheduled == t) {
+		pfp->scheduled = NULL;
+		preempt(pfp);
+	}
+	TRACE_TASK(t, "RIP, now reschedule\n");
+
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+#include <litmus/srp.h>
+
+static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
+{	/* remove t from the ready queue if it is queued; otherwise do nothing */
+	BUG_ON(pfp->scheduled == t && is_queued(t)); /* scheduled task must not be queued */
+	if (is_queued(t))
+		fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
+}
+
+static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
+			    struct task_struct* prio_inh)
+{	/* make t inherit from prio_inh (NULL = none); re-sorts t if it is queued */
+	int requeue;
+
+	if (!t || t->rt_param.inh_task == prio_inh) {
+		/* no update  required */
+		if (t)
+			TRACE_TASK(t, "no prio-inh update required\n");
+		return;
+	}
+
+	requeue = is_queued(t);
+	TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue);
+
+	if (requeue)
+		/* first remove (while the old priority index still applies) */
+		fp_dequeue(pfp, t);
+
+	t->rt_param.inh_task = prio_inh;
+
+	if (requeue)
+		/* add again to the right queue */
+		fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+}
+
+static int effective_agent_priority(int prio)
+{
+	/* make sure agents have higher priority: shift down by LITMUS_MAX_PRIORITY */
+	return prio - LITMUS_MAX_PRIORITY;
+}
+
+static lt_t prio_point(int eprio)
+{
+	/* make sure we have non-negative prio points: shift up by LITMUS_MAX_PRIORITY */
+	return eprio + LITMUS_MAX_PRIORITY;
+}
+
+static void boost_priority(struct task_struct* t, lt_t priority_point)
+{	/* mark t as priority-boosted; t must be the task scheduled on its partition */
+	unsigned long		flags;
+	pfp_domain_t* 	pfp = task_pfp(t);
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+
+
+	TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
+
+	tsk_rt(t)->priority_boosted = 1;
+	/* tie-break by protocol-specific priority point */
+	tsk_rt(t)->boost_start_time = priority_point;
+
+	/* Priority boosting currently only takes effect for already-scheduled
+	 * tasks. This is sufficient since priority boosting only kicks in as
+	 * part of lock acquisitions. */
+	BUG_ON(pfp->scheduled != t);
+
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void unboost_priority(struct task_struct* t)
+{	/* clear t's priority boost and preempt if a queued task now has higher priority */
+	unsigned long		flags;
+	pfp_domain_t* 	pfp = task_pfp(t);
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+
+	/* Assumption: this only happens when the job is scheduled.
+	 * Exception: If t transitioned to non-real-time mode, we no longer
+	 * care about it. */
+	BUG_ON(pfp->scheduled != t && is_realtime(t));
+
+	TRACE_TASK(t, "priority restored at %llu\n", litmus_clock());
+
+	tsk_rt(t)->priority_boosted = 0;
+	tsk_rt(t)->boost_start_time = 0;
+
+	/* check if this changes anything */
+	if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
+		preempt(pfp);
+
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+/* ******************** SRP support ************************ */
+
+static unsigned int pfp_get_srp_prio(struct task_struct* t)
+{	/* priority reported for SRP: the task's own fixed priority */
+	return get_priority(t);
+}
+
+/* ******************** FMLP support ********************** */
+
+struct fmlp_semaphore {
+	struct litmus_lock litmus_lock; /* embedded generic lock; see fmlp_from_lock() */
+
+	/* current resource holder (NULL while the semaphore is free) */
+	struct task_struct *owner;
+
+	/* FIFO queue of waiting tasks; its spinlock also guards owner */
+	wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{	/* map the embedded litmus_lock back to its enclosing fmlp_semaphore */
+	return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+
+static inline lt_t
+fmlp_clock(void)
+{	/* logical per-CPU clock: each call yields the old counter value, then increments */
+	return (lt_t) __get_cpu_var(fmlp_timestamp)++;
+}
+
+int pfp_fmlp_lock(struct litmus_lock* l)
+{	/* acquire the FMLP semaphore; returns 0, -EPERM (not RT) or -EBUSY (nested) */
+	struct task_struct* t = current;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	wait_queue_t wait;
+	unsigned long flags;
+	lt_t time_of_request;
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition --- not supported by FMLP */
+	if (tsk_rt(t)->num_locks_held ||
+	    tsk_rt(t)->num_local_locks_held)
+		return -EBUSY;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	/* tie-break by this point in time */
+	time_of_request = fmlp_clock();
+
+	/* Priority-boost ourself *before* we suspend so that
+	 * our priority is boosted when we resume. */
+	boost_priority(t, time_of_request);
+
+	if (sem->owner) {
+		/* resource is not free => must suspend and wait */
+
+		init_waitqueue_entry(&wait, t);
+
+		/* FIXME: interruptible would be nice some day */
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+		TS_LOCK_SUSPEND;
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		/* We depend on the FIFO order.  Thus, we don't need to recheck
+		 * when we wake up; we are guaranteed to have the lock since
+		 * there is only one wake up per release.
+		 */
+
+		schedule();
+
+		TS_LOCK_RESUME;
+
+		/* Since we hold the lock, no other task will change
+		 * ->owner. We can thus check it without acquiring the spin
+		 * lock. */
+		BUG_ON(sem->owner != t);
+	} else {
+		/* it's ours now */
+		sem->owner = t;
+
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int pfp_fmlp_unlock(struct litmus_lock* l)
+{	/* release the semaphore and hand it to the first waiter; -EINVAL if not the owner */
+	struct task_struct *t = current, *next = NULL;
+	struct fmlp_semaphore *sem = fmlp_from_lock(l);
+	unsigned long flags;
+	int err = 0;
+
+	preempt_disable();
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	tsk_rt(t)->num_locks_held--;
+
+	/* we lose the benefit of priority boosting */
+
+	unboost_priority(t);
+
+	/* check if there are jobs waiting for this resource */
+	next = __waitqueue_remove_first(&sem->wait);
+	sem->owner = next; /* direct hand-off; NULL if nobody is waiting */
+
+out:
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	/* Wake up next. The waiting job is already priority-boosted. */
+	if(next) {
+		wake_up_process(next);
+	}
+
+	preempt_enable();
+
+	return err;
+}
+
+int pfp_fmlp_close(struct litmus_lock* l)
+{
+	struct fmlp_semaphore *fmlp = fmlp_from_lock(l);
+	struct task_struct *caller = current;
+	unsigned long irq_flags;
+	int held_by_caller;
+
+	/* Sample ownership under the wait-queue lock; the lock is dropped
+	 * again before unlocking because pfp_fmlp_unlock() takes it itself. */
+	spin_lock_irqsave(&fmlp->wait.lock, irq_flags);
+	held_by_caller = (fmlp->owner == caller);
+	spin_unlock_irqrestore(&fmlp->wait.lock, irq_flags);
+
+	/* Closing a semaphore that the caller still holds releases it. */
+	if (held_by_caller)
+		pfp_fmlp_unlock(l);
+
+	return 0;
+}
+
+void pfp_fmlp_free(struct litmus_lock* lock)
+{	/* free the enclosing fmlp_semaphore (kmalloc'ed in pfp_new_fmlp()) */
+	kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops pfp_fmlp_lock_ops = { /* installed by pfp_new_fmlp() */
+	.close  = pfp_fmlp_close,
+	.lock   = pfp_fmlp_lock,
+	.unlock = pfp_fmlp_unlock,
+	.deallocate = pfp_fmlp_free,
+};
+
+static struct litmus_lock* pfp_new_fmlp(void)
+{	/* allocate an unowned FMLP semaphore; returns NULL if the allocation fails */
+	struct fmlp_semaphore* sem;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	sem->owner   = NULL;
+	init_waitqueue_head(&sem->wait);
+	sem->litmus_lock.ops = &pfp_fmlp_lock_ops;
+
+	return &sem->litmus_lock; /* callers only see the embedded generic lock */
+}
+
+/* ******************** MPCP support ********************** */
+
+struct mpcp_semaphore {
+	struct litmus_lock litmus_lock; /* embedded generic lock; see mpcp_from_lock() */
+
+	/* current resource holder (NULL while the semaphore is free) */
+	struct task_struct *owner;
+
+	/* priority queue of waiting tasks; its spinlock also guards owner */
+	wait_queue_head_t wait;
+
+	/* priority ceiling per cpu; lowered in pfp_mpcp_open() */
+	unsigned int prio_ceiling[NR_CPUS];
+
+	/* should jobs spin "virtually" for this resource? */
+	int vspin;
+};
+
+#define OMEGA_CEILING UINT_MAX /* initial value of every per-CPU ceiling (see pfp_new_mpcp()) */
+
+/* Since jobs spin "virtually" while waiting to acquire a lock,
+ * they first must acquire a local per-cpu resource.
+ */
+static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait); /* tasks waiting to "spin" */
+static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin); /* current "spinner", or NULL */
+
+/* become this CPU's "spinner", suspending until the slot is free; called with preemptions off <=> no local modifications */
+static void mpcp_vspin_enter(void)
+{
+	struct task_struct* t = current;
+
+	while (1) {
+		if (__get_cpu_var(mpcpvs_vspin) == NULL) {
+			/* good, we get to issue our request */
+			__get_cpu_var(mpcpvs_vspin) = t;
+			break;
+		} else {
+			/* some job is spinning => enqueue in request queue */
+			prio_wait_queue_t wait;
+			wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
+			unsigned long flags;
+
+			/* ordered by regular priority */
+			init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
+
+			spin_lock_irqsave(&vspin->lock, flags);
+
+			set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+			__add_wait_queue_prio_exclusive(vspin, &wait);
+
+			spin_unlock_irqrestore(&vspin->lock, flags);
+
+			TS_LOCK_SUSPEND;
+
+			preempt_enable_no_resched(); /* preemption is re-enabled only across schedule() */
+
+			schedule();
+
+			preempt_disable();
+
+			TS_LOCK_RESUME;
+			/* Recheck if we got it --- some higher-priority process might
+			 * have swooped in. */
+		}
+	}
+	/* ok, now it is ours */
+}
+
+/* give up this CPU's "spinner" slot and wake the first waiter; called with preemptions off */
+static void mpcp_vspin_exit(void)
+{
+	struct task_struct* t = current, *next;
+	unsigned long flags;
+	wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
+
+	BUG_ON(__get_cpu_var(mpcpvs_vspin) != t); /* only the current spinner may exit */
+
+	/* no spinning job */
+	__get_cpu_var(mpcpvs_vspin) = NULL;
+
+	/* see if anyone is waiting for us to stop "spinning" */
+	spin_lock_irqsave(&vspin->lock, flags);
+	next = __waitqueue_remove_first(vspin);
+
+	if (next)
+		wake_up_process(next);
+
+	spin_unlock_irqrestore(&vspin->lock, flags);
+}
+
+static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
+{	/* map the embedded litmus_lock back to its enclosing mpcp_semaphore */
+	return container_of(lock, struct mpcp_semaphore, litmus_lock);
+}
+
+int pfp_mpcp_lock(struct litmus_lock* l)
+{	/* acquire the MPCP semaphore; returns 0, -EPERM (not RT) or -EBUSY (nested) */
+	struct task_struct* t = current;
+	struct mpcp_semaphore *sem = mpcp_from_lock(l);
+	prio_wait_queue_t wait;
+	unsigned long flags;
+
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition */
+	if (tsk_rt(t)->num_locks_held ||
+	    tsk_rt(t)->num_local_locks_held)
+		return -EBUSY;
+
+	preempt_disable();
+
+	if (sem->vspin)
+		mpcp_vspin_enter();
+
+	/* Priority-boost ourself *before* we suspend so that
+	 * our priority is boosted when we resume. Use the priority
+	 * ceiling for the local partition. */
+	boost_priority(t, sem->prio_ceiling[get_partition(t)]);
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	preempt_enable_no_resched();
+
+	if (sem->owner) {
+		/* resource is not free => must suspend and wait */
+
+		/* ordered by regular priority */
+		init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
+
+		/* FIXME: interruptible would be nice some day */
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+		__add_wait_queue_prio_exclusive(&sem->wait, &wait);
+
+		TS_LOCK_SUSPEND;
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		/* pfp_mpcp_unlock() makes the first waiter of the priority-ordered
+		 * queue the owner before waking it. Thus, we don't need to recheck
+		 * when we wake up; there is only one wake up per release.
+		 */
+
+		schedule();
+
+		TS_LOCK_RESUME;
+
+		/* Since we hold the lock, no other task will change
+		 * ->owner. We can thus check it without acquiring the spin
+		 * lock. */
+		BUG_ON(sem->owner != t);
+	} else {
+		/* it's ours now */
+		sem->owner = t;
+
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int pfp_mpcp_unlock(struct litmus_lock* l)
+{	/* release the semaphore and hand it to the first waiter; -EINVAL if not the owner */
+	struct task_struct *t = current, *next = NULL;
+	struct mpcp_semaphore *sem = mpcp_from_lock(l);
+	unsigned long flags;
+	int err = 0;
+
+	preempt_disable();
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	tsk_rt(t)->num_locks_held--;
+
+	/* we lose the benefit of priority boosting */
+	unboost_priority(t);
+
+	/* check if there are jobs waiting for this resource */
+	next = __waitqueue_remove_first(&sem->wait);
+	sem->owner = next; /* direct hand-off; NULL if nobody is waiting */
+
+out:
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	/* Wake up next. The waiting job is already priority-boosted. */
+	if(next) {
+		wake_up_process(next);
+	}
+
+	if (sem->vspin && err == 0) { /* a successful unlock also ends the "virtual spin" */
+		mpcp_vspin_exit();
+	}
+
+	preempt_enable();
+
+	return err;
+}
+
+int pfp_mpcp_open(struct litmus_lock* l, void* config)
+{	/* register current as a user of the semaphore; returns 0 or -EPERM; config is unused */
+	struct task_struct *t = current;
+	int cpu, local_cpu;
+	struct mpcp_semaphore *sem = mpcp_from_lock(l);
+	unsigned long flags;
+
+	if (!is_realtime(t))
+		/* we need to know the real-time priority */
+		return -EPERM;
+
+	local_cpu = get_partition(t);
+	/* Lower the ceiling on every other CPU to t's priority if that is numerically smaller. */
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (cpu != local_cpu) { /* the ceiling of t's own partition is left untouched */
+			sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
+						     get_priority(t));
+			TRACE_CUR("priority ceiling for sem %p is now %u on cpu %d\n",
+				  sem, sem->prio_ceiling[cpu], cpu); /* ceiling is unsigned => %u */
+		}
+	}
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	return 0;
+}
+
+int pfp_mpcp_close(struct litmus_lock* l)
+{
+	struct mpcp_semaphore *mpcp = mpcp_from_lock(l);
+	unsigned long irq_state;
+	int still_held;
+
+	/* Check under the wait-queue lock whether the closing task is the
+	 * current holder. The lock must be released before calling
+	 * pfp_mpcp_unlock(), which acquires it again. */
+	spin_lock_irqsave(&mpcp->wait.lock, irq_state);
+	still_held = (mpcp->owner == current);
+	spin_unlock_irqrestore(&mpcp->wait.lock, irq_state);
+
+	/* A task that closes a semaphore it holds gives it up. */
+	if (still_held)
+		pfp_mpcp_unlock(l);
+
+	return 0;
+}
+
+void pfp_mpcp_free(struct litmus_lock* lock)
+{	/* free the enclosing mpcp_semaphore (kmalloc'ed in pfp_new_mpcp()) */
+	kfree(mpcp_from_lock(lock));
+}
+
+static struct litmus_lock_ops pfp_mpcp_lock_ops = { /* installed by pfp_new_mpcp() */
+	.close  = pfp_mpcp_close,
+	.lock   = pfp_mpcp_lock,
+	.open	= pfp_mpcp_open,
+	.unlock = pfp_mpcp_unlock,
+	.deallocate = pfp_mpcp_free,
+};
+
+static struct litmus_lock* pfp_new_mpcp(int vspin)
+{	/* allocate an unowned MPCP semaphore; returns NULL if the allocation fails */
+	struct mpcp_semaphore* sem;
+	int cpu;
+
+	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+	if (!sem)
+		return NULL;
+
+	sem->owner   = NULL;
+	init_waitqueue_head(&sem->wait);
+	sem->litmus_lock.ops = &pfp_mpcp_lock_ops;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) /* start at OMEGA_CEILING; pfp_mpcp_open() lowers it */
+		sem->prio_ceiling[cpu] = OMEGA_CEILING;
+
+	/* mark as virtual spinning */
+	sem->vspin = vspin;
+
+	return &sem->litmus_lock;
+}
+
+
+/* ******************** PCP support ********************** */
+
+
+struct pcp_semaphore {
+	struct litmus_lock litmus_lock; /* embedded generic lock; see pcp_from_lock() */
+
+	struct list_head ceiling; /* link in the owning CPU's system_ceiling list */
+
+	/* current resource holder (NULL while the semaphore is free) */
+	struct task_struct *owner;
+
+	/* priority ceiling --- can be negative due to DPCP support */
+	int prio_ceiling;
+
+	/* on which processor is this PCP semaphore allocated? */
+	int on_cpu;
+};
+
+static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
+{	/* map the embedded litmus_lock back to its enclosing pcp_semaphore */
+	return container_of(lock, struct pcp_semaphore, litmus_lock);
+}
+
+
+struct pcp_state {
+	struct list_head system_ceiling; /* held semaphores, sorted by pcp_add_ceiling() */
+
+	/* highest-priority waiting task (NULL if nobody is ceiling-blocked) */
+	struct task_struct* hp_waiter;
+
+	/* list of jobs waiting to get past the system ceiling */
+	wait_queue_head_t ceiling_blocked;
+};
+
+static void pcp_init_state(struct pcp_state* s)
+{	/* reset s: empty ceiling list, no waiter, empty wait queue */
+	INIT_LIST_HEAD(&s->system_ceiling);
+	s->hp_waiter = NULL;
+	init_waitqueue_head(&s->ceiling_blocked);
+}
+
+static DEFINE_PER_CPU(struct pcp_state, pcp_state); /* one PCP state per CPU */
+
+/* first semaphore in this CPU's ceiling list, or NULL if empty; assumes preemptions are off */
+static struct pcp_semaphore* pcp_get_ceiling(void)
+{
+	struct list_head* top = &__get_cpu_var(pcp_state).system_ceiling;
+	return list_first_entry_or_null(top, struct pcp_semaphore, ceiling);
+}
+
+/* insert sem into this CPU's ceiling list, sorted by ascending prio_ceiling; assumes preempt off */
+static void pcp_add_ceiling(struct pcp_semaphore* sem)
+{
+	struct list_head *pos;
+	struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling;
+	struct pcp_semaphore* held;
+
+	BUG_ON(sem->on_cpu != smp_processor_id());
+	BUG_ON(in_list(&sem->ceiling));
+
+	list_for_each(pos, in_use) {
+		held = list_entry(pos, struct pcp_semaphore, ceiling);
+		if (held->prio_ceiling >= sem->prio_ceiling) {
+			__list_add(&sem->ceiling, pos->prev, pos); /* insert in front of held */
+			return;
+		}
+	}
+
+	/* we hit the end of the list */
+
+	list_add_tail(&sem->ceiling, in_use);
+}
+
+/* assumes preempt off; non-zero iff task may pass the given ceiling */
+static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
+			      struct task_struct* task,
+			      int effective_prio)
+{
+	if (!ceiling || ceiling->owner == task)
+		return 1; /* no ceiling in effect, or the ceiling is task's own */
+	return ceiling->prio_ceiling > effective_prio;
+}
+
+/* recompute priority inheritance between ceiling owner and top waiter; assumes preempt off */
+static void pcp_priority_inheritance(void)
+{
+	unsigned long	flags;
+	pfp_domain_t* 	pfp = local_pfp;
+
+	struct pcp_semaphore* ceiling = pcp_get_ceiling();
+	struct task_struct *blocker, *blocked;
+
+	blocker = ceiling ?  ceiling->owner : NULL;
+	blocked = __get_cpu_var(pcp_state).hp_waiter;
+
+	raw_spin_lock_irqsave(&pfp->slock, flags);
+
+	/* Current is no longer inheriting anything by default.  This should be
+	 * the currently scheduled job, and hence not currently queued.
+	 * Special case: if current stopped being a real-time task, it will no longer
+	 * be registered as pfp->scheduled. */
+	BUG_ON(current != pfp->scheduled && is_realtime(current));
+
+	fp_set_prio_inh(pfp, current, NULL); /* first clear any existing inheritance */
+	fp_set_prio_inh(pfp, blocked, NULL);
+	fp_set_prio_inh(pfp, blocker, NULL);
+
+	/* Let blocking job inherit priority of blocked job, if required. */
+	if (blocker && blocked &&
+	    fp_higher_prio(blocked, blocker)) {
+		TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
+			   blocked->comm, blocked->pid,
+			   get_priority(blocker), get_priority(blocked));
+		fp_set_prio_inh(pfp, blocker, blocked);
+	}
+
+	/* Check if anything changed. If the blocked job is current, then it is
+	 * just blocking and hence is going to call the scheduler anyway. */
+	if (blocked != current &&
+	    fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
+		preempt(pfp);
+
+	raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+/* wait until current passes the system ceiling, then take sem; called with preemptions off */
+static void pcp_raise_ceiling(struct pcp_semaphore* sem,
+			      int effective_prio)
+{
+	struct task_struct* t = current;
+	struct pcp_semaphore* ceiling;
+	prio_wait_queue_t wait;
+	unsigned int waiting_higher_prio;
+
+	while(1) {
+		ceiling = pcp_get_ceiling();
+		if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
+			break;
+
+		TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
+			  sem, ceiling->owner->comm, ceiling->owner->pid);
+
+		/* we need to wait until the ceiling is lowered */
+
+		/* enqueue in priority order */
+		init_prio_waitqueue_entry(&wait, t, effective_prio);
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+		waiting_higher_prio = add_wait_queue_prio_exclusive(
+			&__get_cpu_var(pcp_state).ceiling_blocked, &wait);
+
+		if (waiting_higher_prio == 0) {
+			TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
+
+			/* we are the new highest-priority waiting job
+			 * => update inheritance */
+			__get_cpu_var(pcp_state).hp_waiter = t;
+			pcp_priority_inheritance();
+		}
+
+		TS_LOCK_SUSPEND;
+
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+
+		/* pcp_resume_unblocked() removed us from wait queue */
+
+		TS_LOCK_RESUME;
+	}
+
+	TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
+
+	/* We are good to go. The semaphore should be available. */
+	BUG_ON(sem->owner != NULL);
+
+	sem->owner = t;
+
+	pcp_add_ceiling(sem); /* sem now contributes to this CPU's system ceiling */
+}
+
+static void pcp_resume_unblocked(void)
+{	/* wake waiters, in queue order, that pass the current ceiling; update hp_waiter */
+	wait_queue_head_t *blocked =  &__get_cpu_var(pcp_state).ceiling_blocked;
+	unsigned long flags;
+	prio_wait_queue_t* q;
+	struct task_struct* t = NULL;
+
+	struct pcp_semaphore* ceiling = pcp_get_ceiling();
+
+	spin_lock_irqsave(&blocked->lock, flags);
+
+	while (waitqueue_active(blocked)) {
+		/* check first == highest-priority waiting job */
+		q = list_entry(blocked->task_list.next,
+			       prio_wait_queue_t, wq.task_list);
+		t = (struct task_struct*) q->wq.private;
+
+		/* can it proceed now? => let it go */
+		if (pcp_exceeds_ceiling(ceiling, t, q->priority)) {
+		    __remove_wait_queue(blocked, &q->wq);
+		    wake_up_process(t);
+		} else {
+			/* We are done. Update highest-priority waiter. */
+			__get_cpu_var(pcp_state).hp_waiter = t;
+			goto out;
+		}
+	}
+	/* If we get here, then there are no more waiting
+	 * jobs. */
+	__get_cpu_var(pcp_state).hp_waiter = NULL;
+out:
+	spin_unlock_irqrestore(&blocked->lock, flags);
+}
+
+/* Release sem, which current must hold on this CPU; assumes preempt off */
+static void pcp_lower_ceiling(struct pcp_semaphore* sem)
+{
+	BUG_ON(!in_list(&sem->ceiling));
+	BUG_ON(sem->owner != current);
+	BUG_ON(sem->on_cpu != smp_processor_id());
+
+	/* remove from ceiling list */
+	list_del(&sem->ceiling);
+
+	/* release */
+	sem->owner = NULL;
+
+	TRACE_CUR("PCP released sem %p\n", sem);
+
+	/* Wake up all ceiling-blocked jobs that now pass the ceiling. */
+	pcp_resume_unblocked();
+	/* hp_waiter may have changed above => update inheritance accordingly */
+	pcp_priority_inheritance();
+}
+
+static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
+				    int effective_prio)
+{
+	/* Ceiling updates are serialized by the ceiling_blocked waitqueue
+	 * lock of the CPU that the semaphore is assigned to (any lock would
+	 * do; this one is simply at hand). Updates are expected only while
+	 * the task set is being set up, i.e., before it starts executing,
+	 * but several processors may open the semaphore concurrently.
+	 */
+	struct pcp_state *state = &per_cpu(pcp_state, sem->on_cpu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&state->ceiling_blocked.lock, flags);
+
+	if (effective_prio < sem->prio_ceiling)
+		sem->prio_ceiling = effective_prio;
+
+	spin_unlock_irqrestore(&state->ceiling_blocked.lock, flags);
+}
+
+static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
+{
+	sem->on_cpu       = cpu;
+	sem->prio_ceiling = INT_MAX;	/* lowered by pcp_update_prio_ceiling() */
+	sem->owner        = NULL;
+	INIT_LIST_HEAD(&sem->ceiling);
+}
+
+int pfp_pcp_lock(struct litmus_lock* l)
+{
+	struct pcp_semaphore *sem = pcp_from_lock(l);
+	struct task_struct *self = current;
+	int prio;
+
+	/* Only real-time tasks assigned to the semaphore's CPU may lock it. */
+	if (!is_realtime(self))
+		return -EPERM;
+	if (get_partition(self) != sem->on_cpu)
+		return -EPERM;
+
+	/* prevent nested lock acquisition in global critical section */
+	if (tsk_rt(self)->num_locks_held)
+		return -EBUSY;
+
+	/* The regular PCP uses the regular task priorities, not agent
+	 * priorities. */
+	prio = get_priority(self);
+
+	preempt_disable();
+	pcp_raise_ceiling(sem, prio);
+	preempt_enable();
+
+	tsk_rt(self)->num_local_locks_held++;
+
+	return 0;
+}
+
+int pfp_pcp_unlock(struct litmus_lock* l)
+{
+	struct task_struct *t = current;
+	struct pcp_semaphore *sem = pcp_from_lock(l);
+	/* Release the PCP semaphore; -EINVAL if current is not its owner. */
+	int err = 0;
+
+	preempt_disable();
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* The current owner should be executing on the correct CPU.
+	 *
+	 * FIXME: if the owner transitioned out of RT mode or is exiting, then
+	 * it might have already been migrated away by the best-effort
+	 * scheduler and we just have to deal with it. This is currently not
+	 * supported. */
+	BUG_ON(sem->on_cpu != smp_processor_id());
+
+	tsk_rt(t)->num_local_locks_held--;
+
+	/* give it back */
+	pcp_lower_ceiling(sem);
+
+out:
+	preempt_enable();
+
+	return err;
+}
+
+int pfp_pcp_open(struct litmus_lock* l, void* __user config)
+{
+	struct pcp_semaphore *sem = pcp_from_lock(l);
+	struct task_struct *self = current;
+	int where;
+
+	/* we need to know the real-time priority */
+	if (!is_realtime(self))
+		return -EPERM;
+
+	/* Without a config argument, the caller's own partition is implied. */
+	if (config) {
+		if (get_user(where, (int*) config))
+			return -EFAULT;
+	} else {
+		where = get_partition(self);
+	}
+
+	/* make sure the resource location matches */
+	if (where != sem->on_cpu)
+		return -EINVAL;
+
+	/* The regular PCP uses regular task priorities, not agent
+	 * priorities. */
+	pcp_update_prio_ceiling(sem, get_priority(self));
+
+	return 0;
+}
+
+int pfp_pcp_close(struct litmus_lock* l)
+{
+	struct pcp_semaphore *sem = pcp_from_lock(l);
+	int held_by_us;
+
+	/* sem->owner is only inspected from the CPU that the semaphore is
+	 * assigned to; on any other CPU we assume we do not hold it. */
+	preempt_disable();
+	held_by_us = sem->on_cpu == smp_processor_id() &&
+		     sem->owner == current;
+	preempt_enable();
+
+	/* Closing while holding the lock implies an unlock. The result of
+	 * the unlock is deliberately ignored: close always succeeds. */
+	if (held_by_us)
+		pfp_pcp_unlock(l);
+
+	return 0;
+}
+
+void pfp_pcp_free(struct litmus_lock* lock)
+{
+	kfree(pcp_from_lock(lock)); /* frees the semaphore that pcp_from_lock() maps the lock to */
+}
+
+
+static struct litmus_lock_ops pfp_pcp_lock_ops = {
+	.open       = pfp_pcp_open,
+	.close      = pfp_pcp_close,
+	.lock       = pfp_pcp_lock,
+	.unlock     = pfp_pcp_unlock,
+	.deallocate = pfp_pcp_free,
+};
+
+
+static struct litmus_lock* pfp_new_pcp(int on_cpu)
+{
+	struct pcp_semaphore *pcp = kmalloc(sizeof(*pcp), GFP_KERNEL);
+
+	if (!pcp)
+		return NULL;
+
+	/* unowned, ceiling not yet lowered, assigned to on_cpu */
+	pcp_init_semaphore(pcp, on_cpu);
+	pcp->litmus_lock.ops = &pfp_pcp_lock_ops;
+
+	return &pcp->litmus_lock;
+}
+
+/* ******************** DPCP support ********************** */
+
+struct dpcp_semaphore {
+	struct litmus_lock litmus_lock;
+	struct pcp_semaphore  pcp;	/* PCP semaphore used on the resource's CPU (pcp.on_cpu) */
+	int owner_cpu;			/* partition the current holder came from; NO_CPU initially */
+};
+
+static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
+{
+	return container_of(lock, struct dpcp_semaphore, litmus_lock); /* lock is the embedded litmus_lock member */
+}
+
+/* Move current to target_cpu's partition; called with preemptions disabled */
+static void pfp_migrate_to(int target_cpu)
+{
+	struct task_struct* t = current;
+	pfp_domain_t *from;
+
+	if (get_partition(t) == target_cpu)
+		return;
+
+	/* make sure target_cpu makes sense */
+	BUG_ON(!cpu_online(target_cpu));
+
+	local_irq_disable();
+	/* the partition is switched under the lock of the domain we leave */
+	from = task_pfp(t);
+	raw_spin_lock(&from->slock);
+
+	/* Scheduled task should not be in any ready or release queue.  Check
+	 * this while holding the lock to avoid RT mode transitions.*/
+	BUG_ON(is_realtime(t) && is_queued(t));
+
+	/* switch partitions */
+	tsk_rt(t)->task_params.cpu = target_cpu;
+
+	raw_spin_unlock(&from->slock);
+
+	/* Don't trace scheduler costs as part of
+	 * locking overhead. Scheduling costs are accounted for
+	 * explicitly. */
+	TS_LOCK_SUSPEND;
+
+	local_irq_enable();
+	preempt_enable_no_resched();
+
+	/* deschedule to be migrated */
+	schedule();
+
+	/* we are now on the target processor */
+	preempt_disable();
+
+	/* start recording costs again */
+	TS_LOCK_RESUME;
+	/* tasks that are no longer real-time are exempt from the CPU check */
+	BUG_ON(smp_processor_id() != target_cpu && is_realtime(t));
+}
+
+int pfp_dpcp_lock(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct dpcp_semaphore *sem = dpcp_from_lock(l);
+	int eprio = effective_agent_priority(get_priority(t));
+	int from  = get_partition(t);
+	int to    = sem->pcp.on_cpu;
+	/* DPCP lock: migrate to the resource's CPU and acquire its PCP semaphore there */
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition */
+	if (tsk_rt(t)->num_locks_held ||
+	    tsk_rt(t)->num_local_locks_held)
+		return -EBUSY;
+
+	preempt_disable();
+
+	/* Priority-boost ourself *before* we suspend so that
+	 * our priority is boosted when we resume. */
+
+	boost_priority(t, get_priority(t));
+
+	pfp_migrate_to(to);
+
+	pcp_raise_ceiling(&sem->pcp, eprio);
+
+	/* yep, we got it => execute request; remember where to migrate back to */
+	sem->owner_cpu = from;
+
+	preempt_enable();
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int pfp_dpcp_unlock(struct litmus_lock* l)
+{
+	struct task_struct *t = current;
+	struct dpcp_semaphore *sem = dpcp_from_lock(l);
+	int err = 0;
+	int home;
+	/* Release the resource and migrate back home; -EINVAL if not the owner. */
+	preempt_disable();
+
+	if (sem->pcp.owner != t) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* The current owner should be executing on the correct CPU.
+	 *
+	 * FIXME: if the owner transitioned out of RT mode or is exiting, then
+	 * it might have already been migrated away by the best-effort
+	 * scheduler and we just have to deal with it. This is currently not
+	 * supported. */
+	BUG_ON(sem->pcp.on_cpu != smp_processor_id());
+
+	tsk_rt(t)->num_locks_held--;
+	/* save our home CPU first: the next owner will overwrite owner_cpu */
+	home = sem->owner_cpu;
+
+	/* give it back */
+	pcp_lower_ceiling(&sem->pcp);
+
+	/* we lose the benefit of priority boosting */
+	unboost_priority(t);
+
+	pfp_migrate_to(home);
+
+out:
+	preempt_enable();
+
+	return err;
+}
+
+int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
+{
+	struct dpcp_semaphore *dpcp = dpcp_from_lock(l);
+	struct task_struct *self = current;
+	int where;
+
+	/* we need to know the real-time priority */
+	if (!is_realtime(self))
+		return -EPERM;
+
+	if (get_user(where, (int*) config))
+		return -EFAULT;
+
+	/* make sure the resource location matches */
+	if (where != dpcp->pcp.on_cpu)
+		return -EINVAL;
+
+	/* the ceiling is maintained in terms of agent priorities */
+	pcp_update_prio_ceiling(&dpcp->pcp,
+				effective_agent_priority(get_priority(self)));
+
+	return 0;
+}
+
+int pfp_dpcp_close(struct litmus_lock* l)
+{
+	struct dpcp_semaphore *dpcp = dpcp_from_lock(l);
+	int held_by_us;
+
+	/* ownership is only checked on the CPU the resource is assigned to */
+	preempt_disable();
+	held_by_us = dpcp->pcp.on_cpu == smp_processor_id() &&
+		     dpcp->pcp.owner == current;
+	preempt_enable();
+
+	/* Closing a held lock releases it; the unlock result is ignored
+	 * and close reports success either way. */
+	if (held_by_us)
+		pfp_dpcp_unlock(l);
+
+	return 0;
+}
+
+void pfp_dpcp_free(struct litmus_lock* lock)
+{
+	kfree(dpcp_from_lock(lock)); /* frees the whole dpcp_semaphore that embeds lock */
+}
+
+static struct litmus_lock_ops pfp_dpcp_lock_ops = {
+	.open       = pfp_dpcp_open,
+	.close      = pfp_dpcp_close,
+	.lock       = pfp_dpcp_lock,
+	.unlock     = pfp_dpcp_unlock,
+	.deallocate = pfp_dpcp_free,
+};
+
+static struct litmus_lock* pfp_new_dpcp(int on_cpu)
+{
+	struct dpcp_semaphore *dpcp = kmalloc(sizeof(*dpcp), GFP_KERNEL);
+
+	if (!dpcp)
+		return NULL;
+
+	/* nobody holds the resource yet, so there is no home CPU either */
+	pcp_init_semaphore(&dpcp->pcp, on_cpu);
+	dpcp->owner_cpu = NO_CPU;
+	dpcp->litmus_lock.ops = &pfp_dpcp_lock_ops;
+
+	return &dpcp->litmus_lock;
+}
+
+
+/* ******************** DFLP support ********************** */
+
+struct dflp_semaphore {
+	struct litmus_lock litmus_lock;
+
+	/* current resource holder (NULL if free) and the partition it came from */
+	struct task_struct *owner;
+	int owner_cpu;
+
+	/* FIFO queue of waiting tasks; owner is updated under wait.lock */
+	wait_queue_head_t wait;
+
+	/* CPU the resource is assigned to; lockers migrate there */
+	int on_cpu;
+};
+
+static inline struct dflp_semaphore* dflp_from_lock(struct litmus_lock* lock)
+{
+	return container_of(lock, struct dflp_semaphore, litmus_lock); /* lock is the embedded litmus_lock member */
+}
+
+int pfp_dflp_lock(struct litmus_lock* l)
+{
+	struct task_struct* t = current;
+	struct dflp_semaphore *sem = dflp_from_lock(l);
+	int from  = get_partition(t);
+	int to    = sem->on_cpu;
+	unsigned long flags;
+	wait_queue_t wait;
+	lt_t time_of_request;
+	/* DFLP lock: migrate to the resource's CPU and wait there in FIFO order */
+	if (!is_realtime(t))
+		return -EPERM;
+
+	/* prevent nested lock acquisition */
+	if (tsk_rt(t)->num_locks_held ||
+	    tsk_rt(t)->num_local_locks_held)
+		return -EBUSY;
+
+	preempt_disable();
+
+	/* tie-break by this point in time */
+	time_of_request = litmus_clock();
+
+	/* Priority-boost ourself *before* we suspend so that
+	 * our priority is boosted when we resume. */
+	boost_priority(t, time_of_request);
+
+	pfp_migrate_to(to);
+
+	/* Now on the right CPU, preemptions still disabled. */
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner) {
+		/* resource is not free => must suspend and wait */
+
+		init_waitqueue_entry(&wait, t);
+
+		/* FIXME: interruptible would be nice some day */
+		set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+		TS_LOCK_SUSPEND;
+
+		/* release lock before sleeping */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		/* We depend on the FIFO order.  Thus, we don't need to recheck
+		 * when we wake up; we are guaranteed to have the lock since
+		 * there is only one wake up per release.
+		 */
+
+		preempt_enable_no_resched();
+
+		schedule();
+
+		preempt_disable();
+
+		TS_LOCK_RESUME;
+
+		/* The releasing task made us the owner before waking us up, and
+		 * nobody else changes ->owner while we hold the resource. We can
+		 * thus check it without acquiring the spin lock. */
+		BUG_ON(sem->owner != t);
+	} else {
+		/* it's ours now */
+		sem->owner = t;
+
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+	}
+	/* remember our home partition for the migration back in unlock */
+	sem->owner_cpu = from;
+
+	preempt_enable();
+
+	tsk_rt(t)->num_locks_held++;
+
+	return 0;
+}
+
+int pfp_dflp_unlock(struct litmus_lock* l)
+{
+	struct task_struct *t = current, *next;
+	struct dflp_semaphore *sem = dflp_from_lock(l);
+	int err = 0;
+	int home;
+	unsigned long flags;
+	/* Hand the resource to the next waiter (if any) and migrate back home. */
+	preempt_disable();
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+
+	if (sem->owner != t) {
+		err = -EINVAL;
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+		goto out;
+	}
+
+	/* check if there are jobs waiting for this resource */
+	next = __waitqueue_remove_first(&sem->wait);
+	if (next) {
+		/* next becomes the resource holder */
+		sem->owner = next;
+
+		/* Wake up next. The waiting job is already priority-boosted. */
+		wake_up_process(next);
+	} else
+		/* resource becomes available */
+		sem->owner = NULL;
+
+	tsk_rt(t)->num_locks_held--;
+
+	home = sem->owner_cpu;
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	/* we lose the benefit of priority boosting */
+	unboost_priority(t);
+
+	pfp_migrate_to(home);
+
+out:
+	preempt_enable();
+
+	return err;
+}
+
+int pfp_dflp_open(struct litmus_lock* l, void* __user config)
+{
+	int where;
+
+	/* the caller states on which CPU it expects the resource to be */
+	if (get_user(where, (int*) config))
+		return -EFAULT;
+
+	/* make sure the resource location matches */
+	if (dflp_from_lock(l)->on_cpu != where)
+		return -EINVAL;
+
+	return 0;
+}
+
+int pfp_dflp_close(struct litmus_lock* l)
+{
+	struct dflp_semaphore *dflp = dflp_from_lock(l);
+	int held_by_us;
+
+	/* ownership is only checked on the CPU the resource is assigned to */
+	preempt_disable();
+	held_by_us = dflp->on_cpu == smp_processor_id() &&
+		     dflp->owner == current;
+	preempt_enable();
+
+	/* Closing a held lock releases it; the unlock result is ignored
+	 * and close reports success either way. */
+	if (held_by_us)
+		pfp_dflp_unlock(l);
+
+	return 0;
+}
+
+void pfp_dflp_free(struct litmus_lock* lock)
+{
+	kfree(dflp_from_lock(lock)); /* frees the whole dflp_semaphore that embeds lock */
+}
+
+static struct litmus_lock_ops pfp_dflp_lock_ops = {
+	.open       = pfp_dflp_open,
+	.close      = pfp_dflp_close,
+	.lock       = pfp_dflp_lock,
+	.unlock     = pfp_dflp_unlock,
+	.deallocate = pfp_dflp_free,
+};
+
+static struct litmus_lock* pfp_new_dflp(int on_cpu)
+{
+	struct dflp_semaphore *dflp = kmalloc(sizeof(*dflp), GFP_KERNEL);
+
+	if (!dflp)
+		return NULL;
+
+	/* initially free: no holder, no home CPU, nobody waiting */
+	dflp->on_cpu    = on_cpu;
+	dflp->owner     = NULL;
+	dflp->owner_cpu = NO_CPU;
+	init_waitqueue_head(&dflp->wait);
+	dflp->litmus_lock.ops = &pfp_dflp_lock_ops;
+
+	return &dflp->litmus_lock;
+}
+
+
+/* **** lock constructor **** */
+
+
+static long pfp_allocate_lock(struct litmus_lock **lock, int type,
+				 void* __user config)
+{
+	int err = -ENXIO, cpu;
+	struct srp_semaphore* srp;
+
+	/* Allocate a lock of the given type into *lock. Returns 0, -ENOMEM,
+	 * -EFAULT/-EINVAL for a bad CPU argument, or -ENXIO for unknown types. */
+	switch (type) {
+	case FMLP_SEM:
+		/* FIFO Mutex Locking Protocol */
+		*lock = pfp_new_fmlp();
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case MPCP_SEM:
+		/* Multiprocesor Priority Ceiling Protocol */
+		*lock = pfp_new_mpcp(0);
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case MPCP_VS_SEM:
+		/* Multiprocesor Priority Ceiling Protocol with virtual spinning */
+		*lock = pfp_new_mpcp(1);
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case DPCP_SEM:
+		/* Distributed Priority Ceiling Protocol */
+		if (get_user(cpu, (int*) config))
+			return -EFAULT;
+		/* user-supplied: bounds-check before testing the online mask */
+		if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu))
+			return -EINVAL;
+
+		*lock = pfp_new_dpcp(cpu);
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case DFLP_SEM:
+		/* Distributed FIFO Locking Protocol */
+		if (get_user(cpu, (int*) config))
+			return -EFAULT;
+		/* user-supplied: bounds-check before testing the online mask */
+		if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu))
+			return -EINVAL;
+
+		*lock = pfp_new_dflp(cpu);
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+
+	case SRP_SEM:
+		/* Baker's Stack Resource Policy */
+		srp = allocate_srp_semaphore();
+		if (srp) {
+			*lock = &srp->litmus_lock;
+			err = 0;
+		} else
+			err = -ENOMEM;
+		break;
+
+	case PCP_SEM:
+		/* Priority Ceiling Protocol */
+		if (!config)
+			cpu = get_partition(current);
+		else if (get_user(cpu, (int*) config))
+			return -EFAULT;
+		/* bounds-check before testing the online mask */
+		if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu))
+			return -EINVAL;
+
+		*lock = pfp_new_pcp(cpu);
+		if (*lock)
+			err = 0;
+		else
+			err = -ENOMEM;
+		break;
+	}
+
+	return err;
+}
+
+#endif
+
+static long pfp_admit_task(struct task_struct* tsk)
+{
+	/* the task must already run on the partition it asks for */
+	if (task_cpu(tsk) != tsk->rt_param.task_params.cpu)
+		return -EINVAL;
+#ifdef CONFIG_RELEASE_MASTER
+	/* don't allow tasks on release master CPU */
+	if (task_cpu(tsk) == remote_dom(task_cpu(tsk))->release_master)
+		return -EINVAL;
+#endif
+	return litmus_is_valid_fixed_prio(get_priority(tsk)) ? 0 : -EINVAL;
+}
+
+static struct domain_proc_info pfp_domain_proc_info; /* filled in by pfp_setup_domain_proc() */
+static long pfp_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &pfp_domain_proc_info; /* hands out the shared static object, not a copy */
+	return 0;
+}
+
+static void pfp_setup_domain_proc(void)
+{
+	int i, cpu; /* i: next free slot, i.e., RT CPUs seen so far */
+	int release_master =
+#ifdef CONFIG_RELEASE_MASTER
+		atomic_read(&release_master_cpu);
+#else
+		NO_CPU;
+#endif
+	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+	struct cd_mapping *cpu_map, *domain_map;
+	/* Rebuild pfp_domain_proc_info from scratch; memset() takes (ptr, byte, len). */
+	memset(&pfp_domain_proc_info, 0, sizeof(pfp_domain_proc_info));
+	init_domain_proc_info(&pfp_domain_proc_info, num_rt_cpus, num_rt_cpus);
+	pfp_domain_proc_info.num_cpus = num_rt_cpus;
+	pfp_domain_proc_info.num_domains = num_rt_cpus;
+	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+		if (cpu == release_master)
+			continue;
+		cpu_map = &pfp_domain_proc_info.cpu_to_domains[i];
+		domain_map = &pfp_domain_proc_info.domain_to_cpus[i];
+		/* one domain per RT CPU: CPU 'cpu' <-> domain 'i' */
+		cpu_map->id = cpu;
+		domain_map->id = i; /* enumerate w/o counting the release master */
+		cpumask_set_cpu(i, cpu_map->mask);
+		cpumask_set_cpu(cpu, domain_map->mask);
+		++i;
+	}
+}
+
+static long pfp_activate_plugin(void)
+{
+#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
+	int cpu;
+#endif
+	/* Plugin activation: (re)initialize per-CPU state; always returns 0. */
+#ifdef CONFIG_RELEASE_MASTER
+	for_each_online_cpu(cpu) {
+		remote_dom(cpu)->release_master = atomic_read(&release_master_cpu);
+	}
+#endif
+
+#ifdef CONFIG_LITMUS_LOCKING
+	get_srp_prio = pfp_get_srp_prio;
+	/* reset the per-CPU state used by the locking protocols */
+	for_each_online_cpu(cpu) {
+		init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu));
+		per_cpu(mpcpvs_vspin, cpu) = NULL;
+
+		pcp_init_state(&per_cpu(pcp_state, cpu));
+		pfp_doms[cpu] = remote_pfp(cpu);
+		per_cpu(fmlp_timestamp,cpu) = 0;
+	}
+
+#endif
+	/* (re)build pfp_domain_proc_info for the current set of CPUs */
+	pfp_setup_domain_proc();
+
+	return 0;
+}
+
+static long pfp_deactivate_plugin(void)
+{
+	destroy_domain_proc_info(&pfp_domain_proc_info); /* tear down what pfp_setup_domain_proc() built */
+	return 0;
+}
+
+/*	P-FP plugin object, registered via register_sched_plugin()	*/
+static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "P-FP",
+	.activate_plugin	= pfp_activate_plugin,
+	.deactivate_plugin	= pfp_deactivate_plugin,
+	.get_domain_proc_info	= pfp_get_domain_proc_info,
+	.admit_task		= pfp_admit_task,
+	.task_new		= pfp_task_new,
+	.task_exit		= pfp_task_exit,
+	.task_wake_up		= pfp_task_wake_up,
+	.task_block		= pfp_task_block,
+	.schedule		= pfp_schedule,
+	.complete_job		= complete_job,
+#ifdef CONFIG_LITMUS_LOCKING
+	.finish_switch		= pfp_finish_switch,
+	.allocate_lock		= pfp_allocate_lock,
+#endif
+};
+
+
+static int __init init_pfp(void)
+{
+	int cpu = 0;
+
+	/* CPU hotplug is not supported: walking ids 0..num_online_cpus()-1
+	 * only covers the online CPUs as long as their ids are contiguous,
+	 * so this would have to change if hotplug were ever supported. */
+	while (cpu < num_online_cpus()) {
+		pfp_domain_init(remote_pfp(cpu), cpu);
+		cpu++;
+	}
+	return register_sched_plugin(&pfp_plugin);
+}
+
+module_init(init_pfp);
-- 
1.8.1.2


From dcd52da5373b0afb556b0d4fb006568dc44f2ba0 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:18:11 +0100
Subject: [PATCH 049/119] Add C-EDF scheduler plugin

---
 litmus/Kconfig      |  10 +
 litmus/Makefile     |   2 +-
 litmus/sched_cedf.c | 903 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 914 insertions(+), 1 deletion(-)
 create mode 100644 litmus/sched_cedf.c

diff --git a/litmus/Kconfig b/litmus/Kconfig
index fdf31f3..38d9e43 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -2,6 +2,16 @@ menu "LITMUS^RT"
 
 menu "Scheduling"
 
+config PLUGIN_CEDF
+        bool "Clustered-EDF"
+	depends on X86 && SYSFS
+        default y
+        help
+          Include the Clustered EDF (C-EDF) plugin in the kernel.
+          This is appropriate for large platforms with shared caches.
+          On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
+          makes little sense since there aren't any shared caches.
+
 config RELEASE_MASTER
         bool "Release-master Support"
 	depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
diff --git a/litmus/Makefile b/litmus/Makefile
index 2d2e0a5..8110a5a 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -23,7 +23,7 @@ obj-y     = sched_plugin.o litmus.o \
 	    sched_psn_edf.o \
 	    sched_pfp.o
 
-
+obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
 
 obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
 
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100644
index 0000000..b1454c0
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,903 @@
+/*
+ * litmus/sched_cedf.c
+ *
+ * Implementation of the C-EDF scheduling algorithm.
+ *
+ * This implementation is based on G-EDF:
+ * - CPUs are clustered around L2 or L3 caches.
+ * - The cluster topology is detected automatically (this is architecture
+ *   dependent and currently works only on x86 --- and only with modern
+ *   CPUs that export cpuid4 information).
+ * - The plugin does _not_ attempt to put tasks in the right cluster, i.e.,
+ *   the programmer needs to be aware of the topology to place tasks
+ *   in the desired cluster.
+ * - The default is a single global cluster (GLOBAL_CLUSTER, see
+ *   cluster_config below). Supported clusters are: L1 (private cache:
+ *   pedf), L2, L3, ALL (all online_cpus are placed in a single cluster).
+ *
+ *   For details on functions, take a look at sched_gsn_edf.c
+ *
+ * Currently, we do not support changes in the number of online cpus.
+ * If the num_online_cpus() dynamically changes, the plugin is broken.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/clustered.h>
+
+#include <litmus/bheap.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+#include <linux/uaccess.h>
+
+/* Reference configuration variable. Determines which cache level is used to
+ * group CPUs into clusters.  GLOBAL_CLUSTER, which is the default, means that
+ * all CPUs form a single cluster (just like GSN-EDF).
+ */
+static enum cache_level cluster_config = GLOBAL_CLUSTER;
+
+struct clusterdomain;
+
+/* cpu_entry_t - per-CPU record of the linked and scheduled state
+ *
+ * Each entry also contains a pointer to the cedf_domain_t cluster
+ * that owns the CPU (struct clusterdomain*).
+ */
+typedef struct  {
+	int 			cpu;		/* id of this CPU */
+	struct clusterdomain*	cluster;	/* owning cluster */
+	struct task_struct*	linked;		/* only RT tasks */
+	struct task_struct*	scheduled;	/* only RT tasks */
+	atomic_t		will_schedule;	/* prevent unneeded IPIs */
+	struct bheap_node*	hn;		/* this CPU's node in cluster->cpu_heap */
+} cpu_entry_t;
+
+/* one cpu_entry_t per CPU */
+DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
+
+#define set_will_schedule() \
+	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1)) /* flag of the executing CPU */
+#define clear_will_schedule() \
+	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0)) /* flag of the executing CPU */
+#define test_will_schedule(cpu) \
+	(atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule)) /* flag of any given CPU */
+
+/*
+ * In C-EDF there is one cedf domain _per_ cluster.
+ * The number of clusters is determined dynamically according to the
+ * total number of CPUs and the cluster size.
+ */
+typedef struct clusterdomain {
+	/* rt_domain for this cluster */
+	rt_domain_t	domain;
+	/* array of pointers to the entries of the cpus in this cluster */
+	cpu_entry_t*	*cpus;
+	/* cpumask of the cpus in this cluster */
+	cpumask_var_t	cpu_map;
+	/* the cpus queue themselves in cpu_heap, ordered by cpu_lower_prio() */
+	struct bheap_node *heap_node;
+	struct bheap      cpu_heap;
+	/* lock for this cluster: an alias for the domain's ready_lock */
+#define cluster_lock domain.ready_lock
+} cedf_domain_t;
+
+/* a cedf_domain per cluster; allocation is done at init/activation time */
+cedf_domain_t *cedf;
+
+#define remote_cluster(cpu)	((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
+#define task_cpu_cluster(task)	remote_cluster(get_partition(task))
+
+/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
+ * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
+ * information during the initialization of the plugin (e.g., topology)
+#define WANT_ALL_SCHED_EVENTS
+ */
+#define VERBOSE_INIT
+
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+	const cpu_entry_t *lhs = _a->value;
+	const cpu_entry_t *rhs = _b->value;
+
+	/* The arguments are deliberately swapped: the heap must keep the
+	 * lowest-priority CPU at its top, so the comparison asks whether
+	 * _b's linked task has the higher priority. */
+	return edf_higher_prio(rhs->linked, lhs->linked);
+}
+
+/* update_cpu_position - Re-sort a CPU entry within the CPU heap of its
+ *                       cluster so that the heap order stays correct.
+ *                       Caller must hold cedf lock. */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+	struct bheap *heap = &entry->cluster->cpu_heap;
+	struct bheap_node *node = entry->hn;
+
+	/* a node that is not in the heap yet is simply inserted */
+	if (likely(bheap_node_in_heap(node)))
+		bheap_delete(cpu_lower_prio, heap, node);
+
+	bheap_insert(cpu_lower_prio, heap, node);
+}
+
+/* Return the CPU at the top of the cluster's heap; caller must hold cedf lock */
+static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
+{
+	struct bheap_node *top = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
+
+	return top->value;
+}
+
+
+/* link_task_to_cpu - Update the link of a CPU; linked == NULL clears it.
+ *                    If the to-be-linked task is already scheduled on a
+ *                    different CPU, it is linked there instead and that
+ *                    CPU's previously linked task is linked to entry. */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+				      cpu_entry_t *entry)
+{
+	cpu_entry_t *sched;
+	struct task_struct* tmp;
+	int on_cpu;
+
+	BUG_ON(linked && !is_realtime(linked));
+
+	/* Currently linked task is set to be unlinked. */
+	if (entry->linked) {
+		entry->linked->rt_param.linked_on = NO_CPU;
+	}
+
+	/* Link new task to CPU. */
+	if (linked) {
+		/* handle task is already scheduled somewhere! */
+		on_cpu = linked->rt_param.scheduled_on;
+		if (on_cpu != NO_CPU) {
+			sched = &per_cpu(cedf_cpu_entries, on_cpu);
+			/* this should only happen if not linked already */
+			BUG_ON(sched->linked == linked);
+
+			/* If we are already scheduled on the CPU to which we
+			 * wanted to link, we don't need to do the swap --
+			 * we just link ourselves to the CPU and depend on
+			 * the caller to get things right.
+			 */
+			if (entry != sched) {
+				TRACE_TASK(linked,
+					   "already scheduled on %d, updating link.\n",
+					   sched->cpu);
+				tmp = sched->linked;
+				linked->rt_param.linked_on = sched->cpu;
+				sched->linked = linked;
+				update_cpu_position(sched);
+				linked = tmp;
+			}
+		}
+		if (linked) /* might be NULL due to swap */
+			linked->rt_param.linked_on = entry->cpu;
+	}
+	entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+	if (linked)
+		TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+	else
+		TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+	update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is no longer linked to the entry it was
+ *          linked to, or queued in its domain. Must hold cedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+	cpu_entry_t *entry;
+
+	if (t->rt_param.linked_on != NO_CPU) {
+		/* unlink */
+		entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
+		t->rt_param.linked_on = NO_CPU;
+		link_task_to_cpu(NULL, entry);
+	} else if (is_queued(t)) {
+		/* This is an interesting situation: t is scheduled,
+		 * but was just recently unlinked.  It cannot be
+		 * linked anywhere else (because then it would have
+		 * been relinked to this CPU), thus it must be in some
+		 * queue. We must remove it from the list in this
+		 * case.
+		 *
+		 * In the C-EDF case, it should be somewhere in the queue
+		 * of its own cluster's domain, which we can look up using
+		 * task_cpu_cluster().
+		 */
+		remove(&(task_cpu_cluster(t))->domain, t);
+	}
+}
+
+
+/* preempt - force a CPU to reschedule; preempt_if_preemptable() is handed
+ *           the task currently scheduled on that CPU and the CPU's id */
+static void preempt(cpu_entry_t *entry)
+{
+	preempt_if_preemptable(entry->scheduled, entry->cpu);
+}
+
+/* requeue - Put an unlinked task into its cluster's domain: the ready queue
+ *           if its job is (early-)released, the release queue otherwise.
+ *           Caller must hold cedf_lock. */
+static noinline void requeue(struct task_struct* task)
+{
+	cedf_domain_t *cluster;
+
+	/* sanity checks; must precede the lookup that dereferences task */
+	BUG_ON(!task);
+	BUG_ON(is_queued(task));
+	cluster = task_cpu_cluster(task);
+
+	if (is_early_releasing(task) || is_released(task, litmus_clock()))
+		__add_ready(&cluster->domain, task);
+	else
+		add_release(&cluster->domain, task); /* it has got to wait */
+}
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+static cpu_entry_t* cedf_get_nearest_available_cpu(
+				cedf_domain_t *cluster, cpu_entry_t *start)
+{
+	cpu_entry_t *affinity;
+	/* get_nearest_available_cpu() is expected to assign 'affinity' (possibly NULL) */
+	get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
+#ifdef CONFIG_RELEASE_MASTER
+		cluster->domain.release_master
+#else
+		NO_CPU
+#endif
+		);
+
+	/* make sure CPU is in our cluster; otherwise report no candidate */
+	if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map))
+		return(affinity);
+	else
+		return(NULL);
+}
+#endif
+
+
+/* check for any necessary preemptions in cluster and link tasks accordingly */
+static void check_for_preemptions(cedf_domain_t *cluster)
+{
+	struct task_struct *task;
+	cpu_entry_t *last;
+
+#ifdef CONFIG_PREFER_LOCAL_LINKING
+	cpu_entry_t *local;
+
+	/* Before linking to other CPUs, check first whether the local CPU is
+	 * idle. */
+	local = &__get_cpu_var(cedf_cpu_entries);
+	task  = __peek_ready(&cluster->domain);
+
+	if (task && !local->linked
+#ifdef CONFIG_RELEASE_MASTER
+	    && likely(local->cpu != cluster->domain.release_master)
+#endif
+		) {
+		task = __take_ready(&cluster->domain);
+		TRACE_TASK(task, "linking to local CPU %d to avoid IPI\n", local->cpu);
+		link_task_to_cpu(task, local);
+		preempt(local);
+	}
+#endif
+
+	/* repeat while edf_preemption_needed() holds for the lowest-priority CPU */
+	for(last = lowest_prio_cpu(cluster);
+	    edf_preemption_needed(&cluster->domain, last->linked);
+	    last = lowest_prio_cpu(cluster)) {
+		/* preemption necessary */
+		task = __take_ready(&cluster->domain);
+		TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+		      task->pid, last->cpu);
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+		{
+			cpu_entry_t *affinity =
+					cedf_get_nearest_available_cpu(cluster,
+						&per_cpu(cedf_cpu_entries, task_cpu(task)));
+			if(affinity)
+				last = affinity;
+			else if(requeue_preempted_job(last->linked))
+				requeue(last->linked);
+		}
+#else
+		if (requeue_preempted_job(last->linked))
+			requeue(last->linked);
+#endif
+		link_task_to_cpu(task, last);
+		preempt(last);
+	}
+}
+
+/* cedf_job_arrival: task is either resumed or released; queue it and preempt if needed */
+static noinline void cedf_job_arrival(struct task_struct* task)
+{
+	/* check before task is dereferenced by the cluster lookup below */
+	BUG_ON(!task);
+
+	requeue(task);
+	check_for_preemptions(task_cpu_cluster(task));
+}
+
+static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+	cedf_domain_t *owner = container_of(rt, cedf_domain_t, domain);
+	unsigned long irq_state;
+
+	raw_spin_lock_irqsave(&owner->cluster_lock, irq_state);
+	/* merge the released jobs into the ready queue, then check
+	 * whether any linked job has to be preempted as a result */
+	__merge_ready(&owner->domain, tasks);
+	check_for_preemptions(owner);
+	raw_spin_unlock_irqrestore(&owner->cluster_lock, irq_state);
+}
+
+/* Finish t's current job and set up its next one; caller holds cedf_lock */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+	BUG_ON(!t);
+
+	sched_trace_task_completion(t, forced);
+
+	TRACE_TASK(t, "job_completion().\n");
+
+	/* clear the completion flag */
+	tsk_rt(t)->completed = 0;
+	/* prepare for next period */
+	prepare_for_next_period(t);
+	if (is_early_releasing(t) || is_released(t, litmus_clock()))
+		sched_trace_task_release(t);
+	/* unlink */
+	unlink(t);
+	/* requeue
+	 * But don't requeue a blocking task. */
+	if (is_running(t))
+		cedf_job_arrival(t);
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include a scheduler_tick() determined that it
+ * was necessary, because sys_exit_np() was called, because some Linux
+ * subsystem determined so, or even (in the worst case) because there is a bug
+ * hidden somewhere. Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ *      - !is_running(scheduled)        // the job blocks
+ *	- scheduled->timeslice == 0	// the job completed (forcefully)
+ *	- is_completed()		// the job completed (by syscall)
+ * 	- linked != scheduled		// we need to reschedule (for any reason)
+ * 	- is_np(scheduled)		// rescheduling must be delayed,
+ *					   sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* cedf_schedule(struct task_struct * prev)
+{
+	cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
+	cedf_domain_t *cluster = entry->cluster;
+	int out_of_time, sleep, preempt, np, exists, blocks;
+	struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+	/* Bail out early if we are the release master.
+	 * The release master never schedules any real-time tasks.
+	 */
+	if (unlikely(cluster->domain.release_master == entry->cpu)) {
+		sched_state_task_picked();
+		return NULL;
+	}
+#endif
+
+	raw_spin_lock(&cluster->cluster_lock);
+	clear_will_schedule();
+
+	/* sanity checking */
+	BUG_ON(entry->scheduled && entry->scheduled != prev);
+	BUG_ON(entry->scheduled && !is_realtime(prev));
+	BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+	/* (0) Determine state */
+	exists      = entry->scheduled != NULL;
+	blocks      = exists && !is_running(entry->scheduled);
+	out_of_time = exists &&
+				  budget_enforced(entry->scheduled) &&
+				  budget_exhausted(entry->scheduled);
+	np 	    = exists && is_np(entry->scheduled);
+	sleep	    = exists && is_completed(entry->scheduled);
+	preempt     = entry->scheduled != entry->linked;
+
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE_TASK(prev, "invoked cedf_schedule.\n");
+#endif
+
+	if (exists)
+		TRACE_TASK(prev,
+			   "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+			   "state:%d sig:%d\n",
+			   blocks, out_of_time, np, sleep, preempt,
+			   prev->state, signal_pending(prev));
+	if (entry->linked && preempt)
+		TRACE_TASK(prev, "will be preempted by %s/%d\n",
+			   entry->linked->comm, entry->linked->pid);
+
+
+	/* If a task blocks we have no choice but to reschedule.
+	 */
+	if (blocks)
+		unlink(entry->scheduled);
+
+	/* Request a sys_exit_np() call if we would like to preempt but cannot.
+	 * We need to make sure to update the link structure anyway in case
+	 * that we are still linked. Multiple calls to request_exit_np() don't
+	 * hurt.
+	 */
+	if (np && (out_of_time || preempt || sleep)) {
+		unlink(entry->scheduled);
+		request_exit_np(entry->scheduled);
+	}
+
+	/* Any task that is preemptable and either exhausts its execution
+	 * budget or wants to sleep completes. We may have to reschedule after
+	 * this. Don't do a job completion if we block (can't have timers running
+	 * for blocked jobs).
+	 */
+	if (!np && (out_of_time || sleep) && !blocks)
+		job_completion(entry->scheduled, !sleep);
+
+	/* Link pending task if we became unlinked.
+	 */
+	if (!entry->linked)
+		link_task_to_cpu(__take_ready(&cluster->domain), entry);
+
+	/* The final scheduling decision. Do we need to switch for some reason?
+	 * If linked is different from scheduled, then select linked as next.
+	 */
+	if ((!np || blocks) &&
+	    entry->linked != entry->scheduled) {
+		/* Schedule a linked job? */
+		if (entry->linked) {
+			entry->linked->rt_param.scheduled_on = entry->cpu;
+			next = entry->linked;
+		}
+		if (entry->scheduled) {
+			/* not gonna be scheduled soon */
+			entry->scheduled->rt_param.scheduled_on = NO_CPU;
+			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+		}
+	} else
+		/* Only override Linux scheduler if we have a real-time task
+		 * scheduled that needs to continue.
+		 */
+		if (exists)
+			next = prev;
+
+	sched_state_task_picked();
+	raw_spin_unlock(&cluster->cluster_lock);
+
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE("cedf_lock released, next=0x%p\n", next);
+
+	if (next)
+		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+	else if (exists && !next)
+		TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+
+
+	return next;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void cedf_finish_switch(struct task_struct *prev)
+{
+	cpu_entry_t* 	entry = &__get_cpu_var(cedf_cpu_entries);
+
+	entry->scheduled = is_realtime(current) ? current : NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+	TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+
+
+/*	Prepare a task for running in RT mode
+ */
+static void cedf_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+	unsigned long 		flags;
+	cpu_entry_t* 		entry;
+	cedf_domain_t*		cluster;
+
+	TRACE("gsn edf: task new %d\n", t->pid);
+
+	/* the cluster doesn't change even if t is scheduled */
+	cluster = task_cpu_cluster(t);
+
+	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+
+	/* setup job params */
+	release_at(t, litmus_clock());
+
+	if (is_scheduled) {
+		entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
+		BUG_ON(entry->scheduled);
+
+#ifdef CONFIG_RELEASE_MASTER
+		if (entry->cpu != cluster->domain.release_master) {
+#endif
+			entry->scheduled = t;
+			tsk_rt(t)->scheduled_on = task_cpu(t);
+#ifdef CONFIG_RELEASE_MASTER
+		} else {
+			/* do not schedule on release master */
+			preempt(entry); /* force resched */
+			tsk_rt(t)->scheduled_on = NO_CPU;
+		}
+#endif
+	} else {
+		t->rt_param.scheduled_on = NO_CPU;
+	}
+	t->rt_param.linked_on          = NO_CPU;
+
+	if (is_running(t))
+		cedf_job_arrival(t);
+	raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
+}
+
+static void cedf_task_wake_up(struct task_struct *task)
+{
+	unsigned long flags;
+	lt_t now;
+	cedf_domain_t *cluster;
+
+	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+
+	cluster = task_cpu_cluster(task);
+
+	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+	now = litmus_clock();
+	if (is_sporadic(task) && is_tardy(task, now)) {
+		/* new sporadic release */
+		release_at(task, now);
+		sched_trace_task_release(task);
+	}
+	cedf_job_arrival(task);
+	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+}
+
+static void cedf_task_block(struct task_struct *t)
+{
+	unsigned long flags;
+	cedf_domain_t *cluster;
+
+	TRACE_TASK(t, "block at %llu\n", litmus_clock());
+
+	cluster = task_cpu_cluster(t);
+
+	/* unlink if necessary */
+	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+	unlink(t);
+	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+}
+
+
+static void cedf_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+	cedf_domain_t *cluster = task_cpu_cluster(t);
+
+	/* unlink if necessary */
+	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+	unlink(t);
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		cpu_entry_t *cpu;
+		cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
+		cpu->scheduled = NULL;
+		tsk_rt(t)->scheduled_on = NO_CPU;
+	}
+	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+
+	BUG_ON(!is_realtime(t));
+        TRACE_TASK(t, "RIP\n");
+}
+
+static long cedf_admit_task(struct task_struct* tsk)
+{
+	return (remote_cluster(task_cpu(tsk)) == task_cpu_cluster(tsk)) ?
+			0 : -EINVAL;
+}
+
+/* total number of clusters */
+static int num_clusters;
+/* we do not support clusters of different sizes */
+static unsigned int cluster_size;
+
+#ifdef VERBOSE_INIT
+static void print_cluster_topology(cpumask_var_t mask, int cpu)
+{
+	int chk;
+	char buf[255];
+
+	chk = cpulist_scnprintf(buf, 254, mask);
+	buf[chk] = '\0';
+	printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
+
+}
+#endif
+
+static int clusters_allocated = 0;
+
+static void cleanup_cedf(void)
+{
+	int i;
+
+	if (clusters_allocated) {
+		for (i = 0; i < num_clusters; i++) {
+			kfree(cedf[i].cpus);
+			kfree(cedf[i].heap_node);
+			free_cpumask_var(cedf[i].cpu_map);
+		}
+		kfree(cedf);
+		clusters_allocated = 0;	/* calling again must not free twice */
+	}
+}
+
+static struct domain_proc_info cedf_domain_proc_info;
+static long cedf_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &cedf_domain_proc_info;
+	return 0;
+}
+
+static void cedf_setup_domain_proc(void)
+{
+	int i, cpu, domain;
+#ifdef CONFIG_RELEASE_MASTER
+	int release_master = atomic_read(&release_master_cpu);
+	/* skip over the domain with the release master if cluster size is 1 */
+	int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
+			release_master : NO_CPU;
+#else
+	int release_master = NO_CPU;
+	int skip_domain = NO_CPU;
+#endif
+	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+	int num_rt_domains = num_clusters - (skip_domain != NO_CPU);
+	struct cd_mapping *map;
+
+	memset(&cedf_domain_proc_info, 0, sizeof(cedf_domain_proc_info));
+	init_domain_proc_info(&cedf_domain_proc_info, num_rt_cpus, num_rt_domains);
+	cedf_domain_proc_info.num_cpus = num_rt_cpus;
+	cedf_domain_proc_info.num_domains = num_rt_domains;
+
+	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+		if (cpu == release_master)
+			continue;
+		map = &cedf_domain_proc_info.cpu_to_domains[i];
+		/* pointer math to figure out the domain index */
+		domain = remote_cluster(cpu) - cedf;
+		map->id = cpu;
+		cpumask_set_cpu(domain, map->mask);
+		++i;
+	}
+
+	for (domain = 0, i = 0; domain < num_clusters; ++domain) {
+		if (domain == skip_domain)
+			continue;
+		map = &cedf_domain_proc_info.domain_to_cpus[i];
+		map->id = i;
+		cpumask_copy(map->mask, cedf[domain].cpu_map);
+		++i;
+	}
+}
+
+static long cedf_activate_plugin(void)
+{
+	int i, j, cpu, ccpu, cpu_count;
+	cpu_entry_t *entry;
+	cpumask_var_t mask;
+	long err = -EINVAL;
+
+	/* de-allocate old clusters, if any */
+	cleanup_cedf();
+	clusters_allocated = 0;	/* nothing to free until setup succeeds */
+
+	printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n",
+			cluster_config);
+
+	/* need to get cluster_size first */
+	if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	if (cluster_config == GLOBAL_CLUSTER) {
+		cluster_size = num_online_cpus();
+	} else if (get_shared_cpu_map(mask, 0, cluster_config)) {
+		/* a non-zero result is the max allowed index */
+		printk(KERN_INFO "C-EDF: Cluster configuration = %d "
+		       "is not supported on this hardware.\n",
+		       cluster_config);
+		/* bail out so that the user notices the failure */
+		goto out_free_mask;
+	} else {
+		cluster_size = cpumask_weight(mask);
+	}
+
+	if (!cluster_size || (num_online_cpus() % cluster_size) != 0) {
+		/* this can't be right, some cpus are left out */
+		printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
+				num_online_cpus(), cluster_size);
+		err = -1;
+		goto out_free_mask;
+	}
+
+	num_clusters = num_online_cpus() / cluster_size;
+	printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
+			num_clusters, cluster_size);
+
+	/* initialize clusters; zeroed so the error path can free them */
+	err = -ENOMEM;
+	cedf = kcalloc(num_clusters, sizeof(cedf_domain_t), GFP_ATOMIC);
+	if (!cedf)
+		goto out_free_mask;
+	for (i = 0; i < num_clusters; i++) {
+		cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
+				GFP_ATOMIC);
+		cedf[i].heap_node = kmalloc(cluster_size *
+				sizeof(struct bheap_node), GFP_ATOMIC);
+		if (!cedf[i].cpus || !cedf[i].heap_node ||
+		    !zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
+			goto out_free_clusters;
+		bheap_init(&(cedf[i].cpu_heap));
+		edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
+#ifdef CONFIG_RELEASE_MASTER
+		cedf[i].domain.release_master = atomic_read(&release_master_cpu);
+#endif
+	}
+
+	/* cycle through cluster and add cpus to them */
+	for (i = 0; i < num_clusters; i++) {
+		for_each_online_cpu(cpu) {
+			/* check if the cpu is already in a cluster */
+			for (j = 0; j < num_clusters; j++)
+				if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
+					break;
+			/* if it is in a cluster go to next cpu */
+			if (j < num_clusters &&
+					cpumask_test_cpu(cpu, cedf[j].cpu_map))
+				continue;
+			/* this cpu isn't in any cluster: get the shared cpus */
+			if (unlikely(cluster_config == GLOBAL_CLUSTER))
+				cpumask_copy(mask, cpu_online_mask);
+			else
+				get_shared_cpu_map(mask, cpu, cluster_config);
+			cpumask_copy(cedf[i].cpu_map, mask);
+#ifdef VERBOSE_INIT
+			print_cluster_topology(mask, cpu);
+#endif
+			/* add cpus to current cluster and init cpu_entry_t */
+			cpu_count = 0;
+			for_each_cpu(ccpu, cedf[i].cpu_map) {
+				entry = &per_cpu(cedf_cpu_entries, ccpu);
+				cedf[i].cpus[cpu_count] = entry;
+				atomic_set(&entry->will_schedule, 0);
+				entry->cpu = ccpu;
+				entry->cluster = &cedf[i];
+				entry->hn = &(cedf[i].heap_node[cpu_count]);
+				bheap_node_init(&entry->hn, entry);
+				cpu_count++;
+				entry->linked = NULL;
+				entry->scheduled = NULL;
+#ifdef CONFIG_RELEASE_MASTER
+				/* only add CPUs that should schedule jobs */
+				if (entry->cpu != entry->cluster->domain.release_master)
+#endif
+					update_cpu_position(entry);
+			}
+			/* done with this cluster */
+			break;
+		}
+	}
+
+	clusters_allocated = 1;
+	cedf_setup_domain_proc();
+	err = 0;
+	goto out_free_mask;
+
+out_free_clusters:
+	for (i = 0; i < num_clusters; i++) {
+		kfree(cedf[i].cpus);
+		kfree(cedf[i].heap_node);
+		free_cpumask_var(cedf[i].cpu_map);
+	}
+	kfree(cedf);
+out_free_mask:
+	free_cpumask_var(mask);
+	return err;
+}
+
+static long cedf_deactivate_plugin(void)
+{
+	destroy_domain_proc_info(&cedf_domain_proc_info);
+	return 0;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "C-EDF",
+	.finish_switch		= cedf_finish_switch,
+	.task_new		= cedf_task_new,
+	.complete_job		= complete_job,
+	.task_exit		= cedf_task_exit,
+	.schedule		= cedf_schedule,
+	.task_wake_up		= cedf_task_wake_up,
+	.task_block		= cedf_task_block,
+	.admit_task		= cedf_admit_task,
+	.activate_plugin	= cedf_activate_plugin,
+	.deactivate_plugin	= cedf_deactivate_plugin,
+	.get_domain_proc_info	= cedf_get_domain_proc_info,
+};
+
+static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
+
+static int __init init_cedf(void)
+{
+	int err, fs;
+
+	err = register_sched_plugin(&cedf_plugin);
+	if (!err) {
+		fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir);
+		if (!fs)
+			cluster_file = create_cluster_file(cedf_dir, &cluster_config);
+		else
+			printk(KERN_ERR "Could not allocate C-EDF procfs dir.\n");
+	}
+	return err;
+}
+
+static void clean_cedf(void)
+{
+	cleanup_cedf();
+	if (cluster_file)
+		remove_proc_entry("cluster", cedf_dir);
+	if (cedf_dir)
+		remove_plugin_proc_dir(&cedf_plugin);
+}
+
+module_init(init_cedf);
+module_exit(clean_cedf);
-- 
1.8.1.2


From 3e37b4b502634d6598bbc45d89fef854a2a13ae6 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 12 Feb 2013 19:21:11 +0100
Subject: [PATCH 050/119] Add PD^2 scheduler plugin

---
 litmus/Kconfig       |   13 +
 litmus/Makefile      |    1 +
 litmus/sched_pfair.c | 1165 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1179 insertions(+)
 create mode 100644 litmus/sched_pfair.c

diff --git a/litmus/Kconfig b/litmus/Kconfig
index 38d9e43..babb43d 100644
--- a/litmus/Kconfig
+++ b/litmus/Kconfig
@@ -12,6 +12,19 @@ config PLUGIN_CEDF
           On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
           makes little sense since there aren't any shared caches.
 
+config PLUGIN_PFAIR
+	bool "PFAIR"
+	default y
+	help
+	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+	  The PFAIR plugin requires high resolution timers (for staggered
+	  quanta) and also requires HZ_PERIODIC (i.e., periodic timer ticks
+	  even if a processor is idle, as quanta could be missed otherwise).
+	  Further, the PFAIR plugin uses the system tick and thus requires
+	  HZ=1000 to achieve reasonable granularity.
+
+	  If unsure, say Yes.
+
 config RELEASE_MASTER
         bool "Release-master Support"
 	depends on ARCH_HAS_SEND_PULL_TIMERS && SMP
diff --git a/litmus/Makefile b/litmus/Makefile
index 8110a5a..84b173a 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -24,6 +24,7 @@ obj-y     = sched_plugin.o litmus.o \
 	    sched_pfp.o
 
 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 
 obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
 
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 0000000..91f1e08
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1165 @@
+/*
+ * litmus/sched_pfair.c
+ *
+ * Implementation of the PD^2 pfair scheduling algorithm. This
+ * implementation realizes "early releasing," i.e., it is work-conserving.
+ *
+ */
+
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+#include <litmus/bheap.h>
+
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+
+#include <litmus/clustered.h>
+
+static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
+
+struct subtask {
+	/* measured in quanta relative to job release */
+	quanta_t release;
+        quanta_t deadline;
+	quanta_t overlap; /* called "b bit" by PD^2 */
+	quanta_t group_deadline;
+};
+
+struct pfair_param   {
+	quanta_t	quanta;       /* number of subtasks */
+	quanta_t	cur;          /* index of current subtask */
+
+	quanta_t	release;      /* in quanta */
+	quanta_t	period;       /* in quanta */
+
+	quanta_t	last_quantum; /* when scheduled last */
+	int		last_cpu;     /* where scheduled last */
+
+	struct pfair_cluster* cluster; /* where this task is scheduled */
+
+	struct subtask subtasks[0];   /* allocate together with pfair_param */
+};
+
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+
+struct pfair_state {
+	struct cluster_cpu topology;
+
+	struct hrtimer quantum_timer;
+
+	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
+				        * the time */
+	volatile quanta_t local_tick;  /* What tick is the local CPU currently
+				        * executing? Updated only by the local
+				        * CPU. In QEMU, this may lag behind the
+				        * current tick. In a real system, with
+				        * proper timers and aligned quanta,
+				        * that should only be the case for a
+				        * very short time after the time
+				        * advanced. With staggered quanta, it
+				        * will lag for the duration of the
+				        * offset.
+					*/
+
+	struct task_struct* linked;    /* the task that should be executing */
+	struct task_struct* local;     /* the local copy of linked          */
+	struct task_struct* scheduled; /* what is actually scheduled        */
+
+	lt_t offset;			/* stagger offset */
+	unsigned int missed_updates;
+	unsigned int missed_quanta;
+};
+
+struct pfair_cluster {
+	struct scheduling_cluster topology;
+
+	/* The "global" time in this cluster. */
+	quanta_t pfair_time; /* the "official" PFAIR clock */
+
+	/* The ready queue for this cluster. */
+	rt_domain_t pfair;
+
+	/* The set of jobs that should have their release enacted at the next
+	 * quantum boundary.
+	 */
+	struct bheap release_queue;
+	raw_spinlock_t release_lock;
+};
+
+#define FLAGS_NEED_REQUEUE 0x1
+
+static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
+{
+	return container_of(state->topology.cluster, struct pfair_cluster, topology);
+}
+
+static inline int cpu_id(struct pfair_state* state)
+{
+	return state->topology.id;
+}
+
+static inline struct pfair_state* from_cluster_list(struct list_head* pos)
+{
+	return list_entry(pos, struct pfair_state, topology.cluster_list);
+}
+
+static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
+{
+	return container_of(rt, struct pfair_cluster, pfair);
+}
+
+static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
+{
+	/* The ready_lock is used to serialize all scheduling events. */
+	return &cluster->pfair.ready_lock;
+}
+
+static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
+{
+	return cluster_lock(cpu_cluster(state));
+}
+
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state* *pstate; /* short cut */
+
+static struct pfair_cluster* pfair_clusters;
+static int num_pfair_clusters;
+
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ */
+
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
+#define PTRACE(f, args...) TRACE(f, ## args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+
+/* gcc will inline all of these accessor functions... */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
+}
+
+static quanta_t cur_deadline(struct task_struct* t)
+{
+	return cur_subtask(t)->deadline +  tsk_pfair(t)->release;
+}
+
+static quanta_t cur_release(struct task_struct* t)
+{
+	/* This is early releasing: only the release of the first subtask
+	 * counts. */
+	return tsk_pfair(t)->release;
+}
+
+static quanta_t cur_overlap(struct task_struct* t)
+{
+	return cur_subtask(t)->overlap;
+}
+
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+	quanta_t gdl = cur_subtask(t)->group_deadline;
+	if (gdl)
+		return gdl + tsk_pfair(t)->release;
+	else
+		return gdl;
+}
+
+
+static int pfair_higher_prio(struct task_struct* first,
+			     struct task_struct* second)
+{
+	return  /* first task must exist */
+		first && (
+		/* Does the second task exist and is it a real-time task?  If
+		 * not, the first task (which is a RT task) has higher
+		 * priority.
+		 */
+		!second || !is_realtime(second)  ||
+
+		/* Is the (subtask) deadline of the first task earlier?
+		 * Then it has higher priority.
+		 */
+		time_before(cur_deadline(first), cur_deadline(second)) ||
+
+		/* Do we have a deadline tie?
+		 * Then break by B-bit.
+		 */
+		(cur_deadline(first) == cur_deadline(second) &&
+		 (cur_overlap(first) > cur_overlap(second) ||
+
+		/* Do we have a B-bit tie?
+		 * Then break by group deadline.
+		 */
+		(cur_overlap(first) == cur_overlap(second) &&
+		 (time_after(cur_group_deadline(first),
+			     cur_group_deadline(second)) ||
+
+		/* Do we have a group deadline tie?
+		 * Then break by PID, which are unique.
+		 */
+		(cur_group_deadline(first) ==
+		 cur_group_deadline(second) &&
+		 first->pid < second->pid))))));
+}
+
+int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+	return pfair_higher_prio(bheap2task(a), bheap2task(b));
+}
+
+static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+	struct pfair_cluster* cluster = from_domain(rt);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&cluster->release_lock, flags);
+
+	bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
+
+	raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
+}
+
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+	tsk_pfair(t)->release    = at;
+	tsk_pfair(t)->cur        = 0;
+}
+
+/* pull released tasks from the release queue */
+static void poll_releases(struct pfair_cluster* cluster)
+{
+	raw_spin_lock(&cluster->release_lock);
+	__merge_ready(&cluster->pfair, &cluster->release_queue);
+	raw_spin_unlock(&cluster->release_lock);
+}
+
+static void check_preempt(struct task_struct* t)
+{
+	int cpu = NO_CPU;
+	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
+	    is_present(t)) {
+		/* the task can be scheduled and
+		 * is not scheduled where it ought to be scheduled
+		 */
+		cpu = tsk_rt(t)->linked_on != NO_CPU ?
+			tsk_rt(t)->linked_on         :
+			tsk_rt(t)->scheduled_on;
+		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+			   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+		/* preempt */
+		litmus_reschedule(cpu);
+	}
+}
+
+/* caller must hold pfair.ready_lock */
+static void drop_all_references(struct task_struct *t)
+{
+        int cpu;
+        struct pfair_state* s;
+	struct pfair_cluster* cluster;
+        if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
+                /* It must be in the ready queue; drop references isn't called
+		 * when the job is in a release queue. */
+		cluster = tsk_pfair(t)->cluster;
+                bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
+                            tsk_rt(t)->heap_node);
+        }
+        for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+                s = &per_cpu(pfair_state, cpu);
+                if (s->linked == t)
+                        s->linked = NULL;
+                if (s->local  == t)
+                        s->local  = NULL;
+                if (s->scheduled  == t)
+                        s->scheduled = NULL;
+        }
+	/* make sure we don't have a stale linked_on field */
+	tsk_rt(t)->linked_on = NO_CPU;
+}
+
+static void pfair_prepare_next_period(struct task_struct* t)
+{
+	struct pfair_param* p = tsk_pfair(t);
+
+	prepare_for_next_period(t);
+	tsk_rt(t)->completed = 0;
+	p->release = time2quanta(get_release(t), CEIL);
+}
+
+/* returns 1 if the task needs to go the release queue */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+	struct pfair_param* p = tsk_pfair(t);
+	int to_relq;
+	p->cur = (p->cur + 1) % p->quanta;
+	if (!p->cur) {
+		if (is_present(t)) {
+			/* The job overran; we start a new budget allocation. */
+			pfair_prepare_next_period(t);
+		} else {
+			/* remove task from system until it wakes */
+			drop_all_references(t);
+			tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
+			TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
+				   cpu, p->cur);
+			return 0;
+		}
+	}
+	to_relq = time_after(cur_release(t), time);
+	TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
+		   cpu, p->cur, to_relq, cur_release(t), time);
+	return to_relq;
+}
+
+static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+	struct task_struct* l;
+	struct pfair_param* p;
+	struct list_head* pos;
+	struct pfair_state* cpu;
+
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		l = cpu->linked;
+		cpu->missed_updates += cpu->linked != cpu->local;
+		if (l) {
+			p = tsk_pfair(l);
+			p->last_quantum = time;
+			p->last_cpu     =  cpu_id(cpu);
+			if (advance_subtask(time, l, cpu_id(cpu))) {
+				//cpu->linked = NULL;
+				PTRACE_TASK(l, "should go to release queue. "
+					    "scheduled_on=%d present=%d\n",
+					    tsk_rt(l)->scheduled_on,
+					    tsk_rt(l)->present);
+			}
+		}
+	}
+}
+
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+	int cpu;
+	if (tsk_rt(t)->scheduled_on != NO_CPU) {
+		/* always observe scheduled_on linkage */
+		default_cpu = tsk_rt(t)->scheduled_on;
+	} else if (tsk_pfair(t)->last_quantum == time - 1) {
+		/* back2back quanta */
+		/* Only observe last_quantum if no scheduled_on is in the way.
+		 * This should only kick in if a CPU missed quanta, and that
+		 * *should* only happen in QEMU.
+		 */
+		cpu = tsk_pfair(t)->last_cpu;
+		if (!pstate[cpu]->linked ||
+		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
+			default_cpu = cpu;
+		}
+	}
+	return default_cpu;
+}
+
+/* returns one if linking was redirected */
+static int pfair_link(quanta_t time, int cpu,
+		      struct task_struct* t)
+{
+	int target = target_cpu(time, t, cpu);
+	struct task_struct* prev  = pstate[cpu]->linked;
+	struct task_struct* other;
+	struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
+
+	if (target != cpu) {
+		BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
+		other = pstate[target]->linked;
+		pstate[target]->linked = t;
+		tsk_rt(t)->linked_on   = target;
+		if (!other)
+			/* linked ok, but reschedule this CPU */
+			return 1;
+		if (target < cpu) {
+			/* link other to cpu instead */
+			tsk_rt(other)->linked_on = cpu;
+			pstate[cpu]->linked      = other;
+			if (prev) {
+				/* prev got pushed back into the ready queue */
+				tsk_rt(prev)->linked_on = NO_CPU;
+				__add_ready(&cluster->pfair, prev);
+			}
+			/* we are done with this cpu */
+			return 0;
+		} else {
+			/* re-add other, its original CPU was not considered yet */
+			tsk_rt(other)->linked_on = NO_CPU;
+			__add_ready(&cluster->pfair, other);
+			/* reschedule this CPU */
+			return 1;
+		}
+	} else {
+		pstate[cpu]->linked  = t;
+		tsk_rt(t)->linked_on = cpu;
+		if (prev) {
+			/* prev got pushed back into the ready queue */
+			tsk_rt(prev)->linked_on = NO_CPU;
+			__add_ready(&cluster->pfair, prev);
+		}
+		/* we are done with this CPU */
+		return 0;
+	}
+}
+
+static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+	int retry;
+	struct list_head *pos;
+	struct pfair_state *cpu_state;
+
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu_state = from_cluster_list(pos);
+		retry = 1;
+#ifdef CONFIG_RELEASE_MASTER
+		/* skip release master */
+		if (cluster->pfair.release_master == cpu_id(cpu_state))
+			continue;
+#endif
+		while (retry) {
+			if (pfair_higher_prio(__peek_ready(&cluster->pfair),
+					      cpu_state->linked))
+				retry = pfair_link(time, cpu_id(cpu_state),
+						   __take_ready(&cluster->pfair));
+			else
+				retry = 0;
+		}
+	}
+}
+
+static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
+{
+	struct pfair_state *cpu;
+	struct list_head* pos;
+
+	/* called with interrupts disabled */
+	PTRACE("--- Q %lu at %llu PRE-SPIN\n",
+	       time, litmus_clock());
+	raw_spin_lock(cluster_lock(cluster));
+	PTRACE("<<< Q %lu at %llu\n",
+	       time, litmus_clock());
+
+	sched_trace_quantum_boundary();
+
+	advance_subtasks(cluster, time);
+	poll_releases(cluster);
+	schedule_subtasks(cluster, time);
+
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		if (cpu->linked)
+			PTRACE_TASK(cpu->linked,
+				    " linked on %d.\n", cpu_id(cpu));
+		else
+			PTRACE("(null) linked on %d.\n", cpu_id(cpu));
+	}
+	/* We are done. Advance time. */
+	mb();
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		if (cpu->local_tick != cpu->cur_tick) {
+			TRACE("BAD Quantum not acked on %d "
+			      "(l:%lu c:%lu p:%lu)\n",
+			      cpu_id(cpu),
+			      cpu->local_tick,
+			      cpu->cur_tick,
+			      cluster->pfair_time);
+			cpu->missed_quanta++;
+		}
+		cpu->cur_tick = time;
+	}
+	PTRACE(">>> Q %lu at %llu\n",
+	       time, litmus_clock());
+	raw_spin_unlock(cluster_lock(cluster));
+}
+
+static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
+{
+	quanta_t loc;
+
+	goto first; /* skip mb() on first iteration */
+	do {
+		cpu_relax();
+		mb();
+	first:	loc = state->cur_tick;
+		/* FIXME: what if loc > cur? */
+	} while (time_before(loc, q));
+	PTRACE("observed cur_tick:%lu >= q:%lu\n",
+	       loc, q);
+}
+
+static quanta_t current_quantum(struct pfair_state* state)
+{
+	lt_t t = litmus_clock() - state->offset;
+	return time2quanta(t, FLOOR);
+}
+
+static void catchup_quanta(quanta_t from, quanta_t target,
+			   struct pfair_state* state)
+{
+	quanta_t cur = from, time;
+	TRACE("+++< BAD catching up quanta from %lu to %lu\n",
+	      from, target);
+	while (time_before(cur, target)) {
+		wait_for_quantum(cur, state);
+		cur++;
+		time = cmpxchg(&cpu_cluster(state)->pfair_time,
+			       cur - 1,   /* expected */
+			       cur        /* next     */
+			);
+		if (time == cur - 1)
+			schedule_next_quantum(cpu_cluster(state), cur);
+	}
+	TRACE("+++> catching up done\n");
+}
+
+/* pfair_tick - this function is called for every local timer
+ *                         interrupt.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+	struct pfair_state* state = &__get_cpu_var(pfair_state);
+	quanta_t time, cur;
+	int retry = 10;
+
+	do {
+		cur  = current_quantum(state);
+		PTRACE("q %lu at %llu\n", cur, litmus_clock());
+
+		/* Attempt to advance time. First CPU to get here
+		 * will prepare the next quantum.
+		 */
+		time = cmpxchg(&cpu_cluster(state)->pfair_time,
+			       cur - 1,   /* expected */
+			       cur        /* next     */
+			);
+		if (time == cur - 1) {
+			/* exchange succeeded */
+			wait_for_quantum(cur - 1, state);
+			schedule_next_quantum(cpu_cluster(state), cur);
+			retry = 0;
+		} else if (time_before(time, cur - 1)) {
+			/* the whole system missed a tick !? */
+			catchup_quanta(time, cur, state);
+			retry--;
+		} else if (time_after(time, cur)) {
+			/* our timer lagging behind!? */
+			TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
+			retry--;
+		} else {
+			/* Some other CPU already started scheduling
+			 * this quantum. Let it do its job and then update.
+			 */
+			retry = 0;
+		}
+	} while (retry);
+
+	/* Spin locally until time advances. */
+	wait_for_quantum(cur, state);
+
+	/* copy assignment */
+	/* FIXME: what if we race with a future update? Corrupted state? */
+	state->local      = state->linked;
+	/* signal that we are done */
+	mb();
+	state->local_tick = state->cur_tick;
+
+	if (state->local != current
+	    && (is_realtime(current) || is_present(state->local)))
+		litmus_reschedule_local();
+}
+
+/* Custom scheduling tick: called on each quantum boundary. */
+static enum hrtimer_restart on_quantum_boundary(struct hrtimer *timer)
+{
+	TS_QUANTUM_BOUNDARY_START;
+
+	pfair_tick(current);
+	hrtimer_add_expires_ns(timer, LITMUS_QUANTUM_LENGTH_NS);
+
+	TS_QUANTUM_BOUNDARY_END;
+	return  HRTIMER_RESTART;
+}
+
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+	int other = tsk_rt(t)->scheduled_on;
+	/* only a task that is not running on a different CPU is eligible */
+	if (other == NO_CPU || other == cpu)
+		return is_present(t) && !is_completed(t);
+	TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+		   "scheduled already on %d.\n", cpu, other);
+	return 0;
+}
+
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+	struct pfair_state* state = &__get_cpu_var(pfair_state);
+	struct pfair_cluster* cluster = cpu_cluster(state);
+	int blocks, completion, out_of_time;
+	struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+	/* Bail out early if we are the release master.
+	 * The release master never schedules any real-time tasks.
+	 */
+	if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
+		sched_state_task_picked();
+		return NULL;
+	}
+#endif
+
+	raw_spin_lock(cpu_lock(state));
+
+	blocks      = is_realtime(prev) && !is_running(prev);
+	completion  = is_realtime(prev) && is_completed(prev);
+	out_of_time = is_realtime(prev) && time_after(cur_release(prev),
+						      state->local_tick);
+
+	if (is_realtime(prev))
+	    PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
+			blocks, completion, out_of_time);
+
+	if (completion) {
+		sched_trace_task_completion(prev, 0);
+		pfair_prepare_next_period(prev);
+		prepare_release(prev, cur_release(prev));
+	}
+
+	if (!blocks && (completion || out_of_time)) {
+		drop_all_references(prev);
+		sched_trace_task_release(prev);
+		add_release(&cluster->pfair, prev);
+	}
+
+	if (state->local && safe_to_schedule(state->local, cpu_id(state)))
+		next = state->local;
+
+	if (prev != next) {
+		tsk_rt(prev)->scheduled_on = NO_CPU;
+		if (next)
+			tsk_rt(next)->scheduled_on = cpu_id(state);
+	}
+	sched_state_task_picked();
+	raw_spin_unlock(cpu_lock(state));
+
+	if (next)
+		TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
+			   tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
+	else if (is_realtime(prev))
+		TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
+
+	if (unlikely(!hrtimer_active(&state->quantum_timer))) {
+		TRACE("activating quantum timer start=%llu\n",
+			hrtimer_get_expires(&state->quantum_timer));
+		hrtimer_start(&state->quantum_timer,
+			hrtimer_get_expires(&state->quantum_timer),
+			HRTIMER_MODE_ABS_PINNED);
+	}
+
+	return next;
+}
+
+static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
+{
+	unsigned long flags;
+	struct pfair_cluster* cluster;
+
+	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+
+	cluster = tsk_pfair(t)->cluster;
+
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+	prepare_release(t, cluster->pfair_time + 1);
+
+	t->rt_param.scheduled_on = NO_CPU;
+	t->rt_param.linked_on    = NO_CPU;
+
+	if (is_scheduled) {
+#ifdef CONFIG_RELEASE_MASTER
+		if (task_cpu(t) != cluster->pfair.release_master)
+#endif
+			t->rt_param.scheduled_on = task_cpu(t);
+	}
+
+	if (is_running(t)) {
+		tsk_rt(t)->present = 1;
+		__add_ready(&cluster->pfair, t);
+	} else {
+		tsk_rt(t)->present = 0;
+		tsk_rt(t)->flags |= FLAGS_NEED_REQUEUE;
+	}
+
+	check_preempt(t);
+
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+}
+
+static void pfair_task_wake_up(struct task_struct *t)
+{
+	unsigned long flags;
+	lt_t now;
+	struct pfair_cluster* cluster;
+
+	cluster = tsk_pfair(t)->cluster;
+
+	TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
+		   litmus_clock(), cur_release(t), cluster->pfair_time);
+
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+	/* If a task blocks and wakes before its next job release,
+	 * then it may resume if it is currently linked somewhere
+	 * (as if it never blocked at all). Otherwise, we have a
+	 * new sporadic job release.
+	 */
+	now = litmus_clock();
+	if (is_tardy(t, now)) {
+		TRACE_TASK(t, "sporadic release!\n");
+		release_at(t, now);
+		prepare_release(t, time2quanta(now, CEIL));
+		sched_trace_task_release(t);
+	}
+
+	/* only add to ready queue if the task isn't still linked somewhere */
+	if (tsk_rt(t)->flags & FLAGS_NEED_REQUEUE) {
+		tsk_rt(t)->flags &= ~FLAGS_NEED_REQUEUE;
+		TRACE_TASK(t, "requeueing required\n");
+		tsk_rt(t)->completed = 0;
+		__add_ready(&cluster->pfair, t);
+	}
+
+	check_preempt(t);
+
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+	TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
+}
+
+static void pfair_task_block(struct task_struct *t)
+{
+	BUG_ON(!is_realtime(t));
+	TRACE_TASK(t, "blocks at %llu, state:%d\n",
+		   litmus_clock(), t->state);
+}
+
+static void pfair_task_exit(struct task_struct * t)
+{
+	unsigned long flags;
+	struct pfair_cluster *cluster;
+
+	BUG_ON(!is_realtime(t));
+
+	cluster = tsk_pfair(t)->cluster;
+
+	/* Remove task from release or ready queue, and ensure
+	 * that it is not the scheduled task for ANY CPU. We
+	 * do this blanket check because occasionally when
+	 * tasks exit while blocked, the task_cpu of the task
+	 * might not be the same as the CPU that the PFAIR scheduler
+	 * has chosen for it.
+	 */
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+	TRACE_TASK(t, "RIP, state:%d\n", t->state);
+	drop_all_references(t);
+
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+
+	kfree(t->rt_param.pfair);
+	t->rt_param.pfair = NULL;
+}
+
+static void init_subtask(struct subtask* sub, unsigned long i,
+			 lt_t quanta, lt_t period)
+{
+	/* since i is zero-based, the formulas are shifted by one */
+	lt_t tmp;
+
+	/* release */
+	tmp = period * i;
+	do_div(tmp, quanta); /* floor */
+	sub->release = (quanta_t) tmp;
+
+	/* deadline */
+	tmp = period * (i + 1);
+	if (do_div(tmp, quanta)) /* ceil */
+		tmp++;
+	sub->deadline = (quanta_t) tmp;
+
+	/* next release */
+	tmp = period * (i + 1);
+	do_div(tmp, quanta); /* floor */
+	sub->overlap =  sub->deadline - (quanta_t) tmp;
+
+	/* Group deadline.
+	 * Based on the formula given in Uma's thesis.
+	 */
+	if (2 * quanta >= period) {
+		/* heavy */
+		tmp = (sub->deadline - (i + 1)) * period;
+		if (period > quanta &&
+		    do_div(tmp, (period - quanta))) /* ceil */
+			tmp++;
+		sub->group_deadline = (quanta_t) tmp;
+	} else
+		sub->group_deadline = 0;
+}
+
+static void dump_subtasks(struct task_struct* t)
+{
+	unsigned long i;
+	for (i = 0; i < t->rt_param.pfair->quanta; i++)
+		TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+			   i + 1,
+			   t->rt_param.pfair->subtasks[i].release,
+			   t->rt_param.pfair->subtasks[i].deadline,
+			   t->rt_param.pfair->subtasks[i].overlap,
+			   t->rt_param.pfair->subtasks[i].group_deadline);
+}
+
+static long pfair_admit_task(struct task_struct* t)
+{
+	lt_t quanta;
+	lt_t period;
+	s64  quantum_length = LITMUS_QUANTUM_LENGTH_NS;
+	struct pfair_param* param;
+	unsigned long i;
+
+	/* first check that the task is in the right cluster */
+	if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
+	    cpu_cluster(pstate[task_cpu(t)]))
+		return -EINVAL;
+
+	if (get_rt_period(t) != get_rt_relative_deadline(t)) {
+		printk(KERN_INFO "%s: Admission rejected. "
+			"Only implicit deadlines are currently supported.\n",
+			litmus->plugin_name);
+		return -EINVAL;
+	}
+
+	/* Pfair is a tick-based method, so the time
+	 * of interest is jiffies. Calculate tick-based
+	 * times for everything.
+	 * (Ceiling of exec cost, floor of period.)
+	 */
+
+	quanta = get_exec_cost(t);
+	period = get_rt_period(t);
+
+	quanta = time2quanta(get_exec_cost(t), CEIL);
+
+	if (do_div(period, quantum_length))
+		printk(KERN_WARNING
+		       "The period of %s/%d is not a multiple of %llu.\n",
+		       t->comm, t->pid, (unsigned long long) quantum_length);
+
+	if (quanta == period) {
+		/* special case: task has weight 1.0 */
+		printk(KERN_INFO
+		       "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
+		       t->comm, t->pid, quanta, period);
+		quanta = 1;
+		period = 1;
+	}
+
+	param = kmalloc(sizeof(*param) +
+			quanta * sizeof(struct subtask), GFP_ATOMIC);
+
+	if (!param)
+		return -ENOMEM;
+
+	param->quanta  = quanta;
+	param->cur     = 0;
+	param->release = 0;
+	param->period  = period;
+
+	param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
+
+	for (i = 0; i < quanta; i++)
+		init_subtask(param->subtasks + i, i, quanta, period);
+
+	if (t->rt_param.pfair)
+		/* get rid of stale allocation */
+		kfree(t->rt_param.pfair);
+
+	t->rt_param.pfair = param;
+
+	/* spew out some debug info */
+	dump_subtasks(t);
+
+	return 0;
+}
+
+static void pfair_init_cluster(struct pfair_cluster* cluster)
+{
+	rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
+	bheap_init(&cluster->release_queue);
+	raw_spin_lock_init(&cluster->release_lock);
+	INIT_LIST_HEAD(&cluster->topology.cpus);
+}
+
+static void cleanup_clusters(void)
+{
+	int i;
+
+	if (num_pfair_clusters)
+		kfree(pfair_clusters);
+	pfair_clusters = NULL;
+	num_pfair_clusters = 0;
+
+	/* avoid stale pointers */
+	for (i = 0; i < num_online_cpus(); i++) {
+		pstate[i]->topology.cluster = NULL;
+		printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
+		       pstate[i]->missed_updates, pstate[i]->missed_quanta);
+	}
+}
+
+static struct domain_proc_info pfair_domain_proc_info;
+static long pfair_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &pfair_domain_proc_info;
+	return 0;
+}
+
+static void pfair_setup_domain_proc(void)
+{
+	int i, cpu, domain;
+#ifdef CONFIG_RELEASE_MASTER
+	int release_master = atomic_read(&release_master_cpu);
+	/* skip over the domain with the release master if cluster size is 1 */
+	int cluster_size = num_online_cpus() / num_pfair_clusters;
+	int skip_domain = (1 == cluster_size && release_master != NO_CPU) ?
+			release_master : NO_CPU;
+#else
+	int release_master = NO_CPU;
+	int skip_domain = NO_CPU;
+#endif
+	int num_rt_cpus = num_online_cpus() - (release_master != NO_CPU);
+	int num_rt_domains = num_pfair_clusters - (skip_domain != NO_CPU);
+	struct cd_mapping *map;
+	/* zero the descriptor first; memset() takes (dest, value, size) */
+	memset(&pfair_domain_proc_info, 0, sizeof(pfair_domain_proc_info));
+	init_domain_proc_info(&pfair_domain_proc_info, num_rt_cpus, num_pfair_clusters);
+	pfair_domain_proc_info.num_cpus = num_rt_cpus;
+	pfair_domain_proc_info.num_domains = num_rt_domains;
+
+	for (cpu = 0, i = 0; cpu < num_online_cpus(); ++cpu) {
+		if (cpu == release_master)
+			continue;
+		map = &pfair_domain_proc_info.cpu_to_domains[i];
+		/* pointer math to figure out the domain index */
+		domain = cpu_cluster(&per_cpu(pfair_state, cpu)) - pfair_clusters;
+		map->id = cpu;
+		cpumask_set_cpu(domain, map->mask);
+		++i;
+	}
+
+	for (domain = 0, i = 0; domain < num_pfair_clusters; ++domain) {
+		struct pfair_cluster *cluster;
+		struct list_head *pos;
+
+		if (domain == skip_domain)
+			continue;
+
+		cluster = &pfair_clusters[domain];
+		map = &pfair_domain_proc_info.domain_to_cpus[i];
+		map->id = i;
+
+		list_for_each(pos, &cluster->topology.cpus) {
+			cpu = cpu_id(from_cluster_list(pos));
+			if (cpu != release_master)
+				cpumask_set_cpu(cpu, map->mask);
+		}
+		++i;
+	}
+}
+
+static long pfair_activate_plugin(void)
+{
+	int err, i;
+	struct pfair_state* state;
+	struct pfair_cluster* cluster;
+	quanta_t now, start;
+	int cluster_size;
+	struct cluster_cpu* cpus[NR_CPUS];
+	struct scheduling_cluster* clust[NR_CPUS];
+	lt_t quantum_timer_start;
+
+	cluster_size = get_cluster_size(pfair_cluster_level);
+
+	if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
+		return -EINVAL;
+
+	num_pfair_clusters = num_online_cpus() / cluster_size;
+
+	pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
+	if (!pfair_clusters) {
+		num_pfair_clusters = 0;
+		printk(KERN_ERR "Could not allocate Pfair clusters!\n");
+		return -ENOMEM;
+	}
+
+	state = &__get_cpu_var(pfair_state);
+	now   = current_quantum(state);
+	start = now + 50;
+	quantum_timer_start = quanta2time(start);
+	TRACE("Activating PFAIR at %llu (q=%lu), first tick at %llu (q=%lu)\n",
+		litmus_clock(),
+		now,
+		quantum_timer_start,
+		time2quanta(quantum_timer_start, CEIL));
+
+	for (i = 0; i < num_pfair_clusters; i++) {
+		cluster = &pfair_clusters[i];
+		pfair_init_cluster(cluster);
+		cluster->pfair_time = start;
+		clust[i] = &cluster->topology;
+#ifdef CONFIG_RELEASE_MASTER
+		cluster->pfair.release_master = atomic_read(&release_master_cpu);
+#endif
+	}
+
+	for_each_online_cpu(i) {
+		state = &per_cpu(pfair_state, i);
+		state->cur_tick   = start;
+		state->local_tick = start;
+		state->missed_quanta = 0;
+		state->missed_updates = 0;
+		state->offset     = cpu_stagger_offset(i);
+		hrtimer_set_expires(&state->quantum_timer,
+			ns_to_ktime(quantum_timer_start + state->offset));
+		printk(KERN_ERR "cpus[%d] set; offset=%llu; %d\n", i, state->offset, num_online_cpus());
+		cpus[i] = &state->topology;
+		/* force rescheduling to start quantum timer */
+		litmus_reschedule(i);
+
+		WARN_ONCE(!hrtimer_is_hres_active(&state->quantum_timer),
+			KERN_ERR "WARNING: no high resolution timers available!?\n");
+	}
+
+	err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
+				      cpus, num_online_cpus());
+
+	if (err < 0)
+		cleanup_clusters();
+	else
+		pfair_setup_domain_proc();
+
+	return err;
+}
+
+static long pfair_deactivate_plugin(void)
+{
+	int cpu;
+	struct pfair_state* state;
+
+	for_each_online_cpu(cpu) {
+		state = &per_cpu(pfair_state, cpu);
+		TRACE("stopping quantum timer on CPU%d\n", cpu);
+		hrtimer_cancel(&state->quantum_timer);
+	}
+	cleanup_clusters();
+	destroy_domain_proc_info(&pfair_domain_proc_info);
+	return 0;
+}
+
+/*	Plugin object	*/
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+	.plugin_name		= "PFAIR",
+	.task_new		= pfair_task_new,
+	.task_exit		= pfair_task_exit,
+	.schedule		= pfair_schedule,
+	.task_wake_up		= pfair_task_wake_up,
+	.task_block		= pfair_task_block,
+	.admit_task		= pfair_admit_task,
+	.complete_job		= complete_job,
+	.activate_plugin	= pfair_activate_plugin,
+	.deactivate_plugin	= pfair_deactivate_plugin,
+	.get_domain_proc_info	= pfair_get_domain_proc_info,
+};
+
+
+static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
+
+static int __init init_pfair(void)
+{
+	int cpu, err, fs;
+	struct pfair_state *state;
+
+	/* Initialize the shortcut array for the per-CPU Pfair state. This is
+	 * racy if CPUs are added or removed during or after initialization,
+	 * but we don't support CPU hotplug atm anyway.
+	 */
+	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+	if (!pstate)
+		return -ENOMEM;
+
+	/* initialize CPU state */
+	for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
+		state = &per_cpu(pfair_state, cpu);
+		hrtimer_init(&state->quantum_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		state->quantum_timer.function = on_quantum_boundary;
+		state->topology.id = cpu;
+		state->cur_tick   = 0;
+		state->local_tick = 0;
+		state->linked     = NULL;
+		state->local      = NULL;
+		state->scheduled  = NULL;
+		state->missed_quanta = 0;
+		state->offset     = cpu_stagger_offset(cpu);
+		pstate[cpu] = state;
+	}
+
+	pfair_clusters = NULL;
+	num_pfair_clusters = 0;
+
+	err = register_sched_plugin(&pfair_plugin);
+	if (!err) {
+		fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
+		if (!fs)
+			cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
+		else
+			printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
+	}
+
+	return err;
+}
+
+static void __exit clean_pfair(void)
+{
+	kfree(pstate);
+
+	if (cluster_file)
+		remove_proc_entry("cluster", pfair_dir);
+	if (pfair_dir)
+		remove_plugin_proc_dir(&pfair_plugin);
+}
+
+module_init(init_pfair);
+module_exit(clean_pfair);
-- 
1.8.1.2


From 5b564e918add09d778ae347e9fdd005a36f8e879 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 12 Jun 2014 14:05:51 +0200
Subject: [PATCH 051/119] PFAIR: set release time of new tasks

Without a proper release time, the job will be considered "lagging
behind" for quite a while, which breaks the period enforcement. This
bug manifested only in the absence of a synchronous release (which set
a proper release time).

This patch simply sets the beginning of the next quantum as the
release time of the first job of a newly added task.
---
 litmus/sched_pfair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index 91f1e08..54fa36c 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -696,6 +696,7 @@ static void pfair_task_new(struct task_struct * t, int on_rq, int is_scheduled)
 	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
 
 	prepare_release(t, cluster->pfair_time + 1);
+	release_at(t, quanta2time(cur_release(t)));
 
 	t->rt_param.scheduled_on = NO_CPU;
 	t->rt_param.linked_on    = NO_CPU;
-- 
1.8.1.2


From 5c2112a210e8654d96e3f4c0395f1a326f28666f Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Mon, 3 Nov 2014 21:52:24 -0500
Subject: [PATCH 052/119] ARM timer support

---
 include/litmus/clock.h         | 48 ++++++++++++++++++++++++++++++++++++++
 include/litmus/feather_trace.h | 12 ++++++++++
 litmus/litmus.c                | 53 +++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 include/litmus/clock.h

diff --git a/include/litmus/clock.h b/include/litmus/clock.h
new file mode 100644
index 0000000..f8de7a3
--- /dev/null
+++ b/include/litmus/clock.h
@@ -0,0 +1,48 @@
+#ifndef _LITMUS_CLOCK_H_
+#define _LITMUS_CLOCK_H_
+
+#if defined(CONFIG_EXYNOS_MCT)
+
+/*
+ * Only used if we are using the EXYNOS MCT clock.
+ */
+
+#include <linux/clocksource.h>
+extern struct clocksource mct_frc;
+
+static inline cycles_t mct_frc_read(void)
+{
+	cycle_t cycles = mct_frc.read(&mct_frc);
+	return cycles;
+}
+
+static inline s64 litmus_cycles_to_ns(cycles_t cycles)
+{
+	return clocksource_cyc2ns(cycles, mct_frc.mult, mct_frc.shift);
+}
+
+#define litmus_get_cycles mct_frc_read
+
+#elif defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)
+
+#include <asm/timex.h>
+
+static inline cycles_t v7_get_cycles (void)
+{
+	u32 value;
+        /* read CCNT register */
+        asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(value));
+	return value;
+}
+
+#define litmus_get_cycles v7_get_cycles
+
+#else
+#include <asm/timex.h>
+
+#define litmus_get_cycles get_cycles
+
+#endif
+
+#endif
+
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
index dbeca46..cc4396e 100644
--- a/include/litmus/feather_trace.h
+++ b/include/litmus/feather_trace.h
@@ -38,11 +38,23 @@ static inline void ft_atomic_dec(int *val)
 /* provide default implementation */
 #include <linux/timex.h> /* for get_cycles() */
 
+#if defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)	
+
+#include <litmus/clock.h> /* for litmus_get_cycles() */
+static inline unsigned long long ft_timestamp(void)
+{
+	return (unsigned long long)litmus_get_cycles();
+}
+
+#else
+
 static inline unsigned long long ft_timestamp(void)
 {
 	return get_cycles();
 }
 
+#endif
+
 #define feather_callback
 
 #define MAX_EVENTS 1024
diff --git a/litmus/litmus.c b/litmus/litmus.c
index a061343..14b1031 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -20,6 +20,9 @@
 #include <litmus/rt_domain.h>
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
+#include <litmus/clock.h>
+
+#include <asm/cacheflush.h>
 
 #ifdef CONFIG_SCHED_CPU_AFFINITY
 #include <litmus/affinity.h>
@@ -303,9 +306,11 @@ asmlinkage long sys_null_call(cycles_t __user *ts)
 	cycles_t now;
 
 	if (ts) {
-		now = get_cycles();
+		now = litmus_get_cycles();
 		ret = put_user(now, ts);
 	}
+	else
+		flush_cache_all();
 
 	return ret;
 }
@@ -599,6 +604,48 @@ static struct notifier_block shutdown_notifier = {
 	.notifier_call = litmus_shutdown_nb,
 };
 
+#if defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)
+static void __init litmus_enable_perfcounters_v7(void *_ignore)
+{
+	u32 enable_val = 0;
+
+	/* control register: enable bit clear, both reset bits (2 | 4) set, i.e. disable and reset */
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r" (0x00000006));
+
+	/* disable all events */
+	asm volatile("mcr p15, 0, %0, c9, c12, 2" : : "r" (0xffffffff));
+
+	/* write 1 to enable user-mode access to the performance counter */
+	asm volatile("mcr p15, 0, %0, c9, c14, 0" : : "r" (1));
+
+	/* disable counter overflow interrupts (just in case) */
+	asm volatile("mcr p15, 0, %0, c9, c14, 2" : : "r" (0x8000000f));
+
+	/* select event zero */
+	asm volatile("mcr p15, 0, %0, c9, c12, 5" : : "r" (0));
+
+	/* count cycles in the selected event zero */
+	asm volatile("mcr p15, 0, %0, c9, c13, 1" : : "r" (0x00000011));
+
+	enable_val |= 1;	/* bit 0 enables the counters */
+	enable_val |= 2;	/* bit 1 resets event counters to zero */
+	enable_val |= 4;	/* bit 2 resets cycle counter to zero */
+	//enable_val |= 8;	/* enable "by 64" divider for CCNT. */
+	
+	/* performance monitor control register: enable all counters */
+	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(enable_val));
+
+	/* NOTE(review): 0x80000001 sets bits 31 and 0 - presumably cycle counter and event counter zero; confirm */
+        asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x80000001));
+}
+
+static void __init litmus_enable_perfcounters(void)
+{
+	/* run on every CPU and wait: the callback must finish while __init text exists */
+	on_each_cpu(litmus_enable_perfcounters_v7, NULL, 1);
+}
+#endif
+
 static int __init _init_litmus(void)
 {
 	/*      Common initializers,
@@ -628,6 +675,10 @@ static int __init _init_litmus(void)
 
 	register_reboot_notifier(&shutdown_notifier);
 
+#if defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)	
+	litmus_enable_perfcounters();
+#endif
+	
 	return 0;
 }
 
-- 
1.8.1.2


From d0cc5b0897b74201fe1ca363ce1d980b5dbefff5 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Mon, 3 Nov 2014 21:53:47 -0500
Subject: [PATCH 053/119] Added cache /proc

---
 arch/arm/mach-imx/Makefile  |   4 +-
 arch/arm/mm/cache-l2x0.c    |   6 ++
 include/litmus/cache_proc.h |  10 ++
 litmus/cache_proc.c         | 245 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 263 insertions(+), 2 deletions(-)
 create mode 100644 include/litmus/cache_proc.h
 create mode 100644 litmus/cache_proc.c

diff --git a/arch/arm/mach-imx/Makefile b/arch/arm/mach-imx/Makefile
index 2536699..02d0208 100644
--- a/arch/arm/mach-imx/Makefile
+++ b/arch/arm/mach-imx/Makefile
@@ -107,13 +107,13 @@ obj-$(CONFIG_SOC_IMX6SX) += clk-imx6sx.o mach-imx6sx.o
 AFLAGS_suspend-imx6.o :=-Wa,-march=armv7-a
 obj-$(CONFIG_PM) += pm-imx6.o headsmp.o suspend-imx6.o
 
-ifeq ($(CONFIG_ARM_IMX6_CPUFREQ),y)
+#ifeq ($(CONFIG_ARM_IMX6_CPUFREQ),y)
 obj-y += busfreq-imx6.o
 obj-$(CONFIG_SOC_IMX6Q) += ddr3_freq_imx6.o busfreq_ddr3.o
 obj-$(CONFIG_SOC_IMX6SL) += lpddr2_freq_imx6.o busfreq_lpddr2.o imx6sl_wfi.o
 obj-$(CONFIG_SOC_IMX6SX) += ddr3_freq_imx6sx.o lpddr2_freq_imx6sx.o
 
-endif
+#endif
 
 
 # i.MX5 based machines
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index c465fac..5efe6b6e 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -27,6 +27,8 @@
 #include <asm/hardware/cache-l2x0.h>
 #include "cache-aurora-l2.h"
 
+#include <litmus/cache_proc.h>
+
 #define CACHE_LINE_SIZE		32
 
 static void __iomem *l2x0_base;
@@ -393,6 +395,8 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 		l2x0_unlock(cache_id);
 
 		/* l2x0 controller is disabled */
+		//aux |= (1 << 12);
+		//printk("AUX BIT = %08x\n", aux);
 		writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL);
 
 		l2x0_inv_all();
@@ -420,6 +424,8 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 	printk(KERN_INFO "%s cache controller enabled\n", type);
 	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
 			ways, cache_id, aux, l2x0_size);
+
+	litmus_setup_lockdown(l2x0_base, cache_id);
 }
 
 #ifdef CONFIG_OF
diff --git a/include/litmus/cache_proc.h b/include/litmus/cache_proc.h
new file mode 100644
index 0000000..a7a740e
--- /dev/null
+++ b/include/litmus/cache_proc.h
@@ -0,0 +1,10 @@
+#ifndef LITMUS_CACHE_PROC_H
+#define LITMUS_CACHE_PROC_H
+
+#ifdef __KERNEL__
+
+void litmus_setup_lockdown(void __iomem*, u32);
+
+#endif
+
+#endif
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
new file mode 100644
index 0000000..4425bfb
--- /dev/null
+++ b/litmus/cache_proc.c
@@ -0,0 +1,245 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+
+#include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
+
+#include <asm/hardware/cache-l2x0.h>
+#include <asm/cacheflush.h>
+
+#define UNLOCK_ALL	0x00000000 /* allocation in any way */
+#define LOCK_ALL        (~UNLOCK_ALL)
+#define MAX_NR_WAYS	16
+
+u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
+	0xFFFFFFFF, /* all ways are locked. usable = 0*/
+	0xFFFFFFFE, /* way ~0 unlocked. usable = 1 */
+	0xFFFFFFFC,
+	0xFFFFFFF8,
+	0xFFFFFFF0,
+	0xFFFFFFE0,
+	0xFFFFFFC0,
+	0xFFFFFF80,
+	0xFFFFFF00,
+	0xFFFFFE00,
+	0xFFFFFC00,
+	0xFFFFF800,
+	0xFFFFF000,
+	0xFFFFE000,
+	0xFFFFC000,
+	0xFFFF8000,
+	0xFFFF0000, /* way ~15 unlocked. usable = 16 */
+};
+
+static void __iomem *cache_base;
+static void __iomem *lockreg_d;
+static void __iomem *lockreg_i;
+
+static u32 cache_id;
+
+struct mutex actlr_mutex;
+struct mutex l2x0_prefetch_mutex;
+struct mutex lockdown_proc;
+
+static int min_usable_ways = 0;
+static int max_usable_ways = 16;
+static int zero = 0;
+static int one = 1;
+
+#define ld_d_reg(cpu) ({ int __cpu = cpu; \
+			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_D_BASE + \
+			__cpu * L2X0_LOCKDOWN_STRIDE; __v; })
+#define ld_i_reg(cpu) ({ int __cpu = cpu; \
+			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_I_BASE + \
+			__cpu * L2X0_LOCKDOWN_STRIDE; __v; })
+
+int l2_usable_ways;
+int lock_all;
+int nr_lockregs;
+
+static void print_lockdown_registers(void)
+{
+	int i;
+
+	for (i = 0; i < nr_lockregs; i++) {
+		printk("Lockdown Data CPU %2d: 0x%8x\n",
+				i, readl_relaxed(ld_d_reg(i)));
+		printk("Lockdown Inst CPU %2d: 0x%8x\n",
+				i, readl_relaxed(ld_i_reg(i)));
+	}
+}
+
+static void test_lockdown(void *ignore)
+{
+	int i;
+
+	printk("Start lockdown test on CPU %d.\n", smp_processor_id());
+
+	for (i = 0; i < nr_lockregs; i++) {
+		printk("CPU %2d data reg: 0x%8p\n", i, ld_d_reg(i));
+		printk("CPU %2d inst reg: 0x%8p\n", i, ld_i_reg(i));
+	}
+
+	printk("Lockdown initial state:\n");
+	print_lockdown_registers();
+	printk("---\n");
+
+	for (i = 0; i < nr_lockregs; i++) {
+		writel_relaxed(1, ld_d_reg(i));
+		writel_relaxed(2, ld_i_reg(i));
+	}
+	printk("Lockdown all data=1 instr=2:\n");
+	print_lockdown_registers();
+	printk("---\n");
+
+	for (i = 0; i < nr_lockregs; i++) {
+		writel_relaxed((1 << i), ld_d_reg(i));
+		writel_relaxed(((1 << 8) >> i), ld_i_reg(i));
+	}
+	printk("Lockdown varies:\n");
+	print_lockdown_registers();
+	printk("---\n");
+
+	for (i = 0; i < nr_lockregs; i++) {
+		writel_relaxed(UNLOCK_ALL, ld_d_reg(i));
+		writel_relaxed(UNLOCK_ALL, ld_i_reg(i));
+	}
+	printk("Lockdown all zero:\n");
+	print_lockdown_registers();
+
+	printk("End lockdown test.\n");
+}
+
+void litmus_setup_lockdown(void __iomem *base, u32 id)
+{
+    cache_base = base;
+	cache_id = id;
+	lockreg_d = cache_base + L2X0_LOCKDOWN_WAY_D_BASE;
+	lockreg_i = cache_base + L2X0_LOCKDOWN_WAY_I_BASE;
+    
+	if (L2X0_CACHE_ID_PART_L310 == (cache_id & L2X0_CACHE_ID_PART_MASK)) {
+		nr_lockregs = 8;
+	} else {
+		printk("Unknown cache ID!\n");
+		nr_lockregs = 1;
+	}
+	
+	mutex_init(&actlr_mutex);
+	mutex_init(&l2x0_prefetch_mutex);
+	mutex_init(&lockdown_proc);
+	
+	test_lockdown(NULL);
+}
+int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0, i;
+	
+	mutex_lock(&lockdown_proc);
+	
+	flush_cache_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	
+	if (write && lock_all == 1) {
+		for (i = 0; i < nr_lockregs;  i++) {
+			writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
+			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
+		}
+		print_lockdown_registers();
+	}
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
+int l2_usable_ways_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0, i = 0;
+	
+	mutex_lock(&lockdown_proc);
+	
+	flush_cache_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+		
+	TRACE_CUR("l2_usable_ways : %d\n", l2_usable_ways);
+	
+	if (write) {
+		/* the loop over nr_lockregs is disabled: i stays 0, so only register set 0 is written */
+			writel_relaxed(nr_unlocked_way[l2_usable_ways], ld_d_reg(i));
+			writel_relaxed(nr_unlocked_way[l2_usable_ways], ld_i_reg(i));
+		/* NOTE(review): lock_all_handler() writes all nr_lockregs sets - confirm this difference is intended */
+		print_lockdown_registers();
+	}
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
+static struct ctl_table cache_table[] =
+{
+	{
+		.procname	= "l2_usable_ways",
+		.mode		= 0644,	/* root-only write: the handler reprograms L2 lockdown registers */
+		.proc_handler	= l2_usable_ways_handler,
+		.data		= &l2_usable_ways,
+		.maxlen		= sizeof(l2_usable_ways),
+		.extra1		= &min_usable_ways,
+		.extra2		= &max_usable_ways,
+	},
+	{
+		.procname	= "lock_all",
+		.mode		= 0644,	/* root-only write: the handler locks every L2 way */
+		.proc_handler	= lock_all_handler,
+		.data		= &lock_all,
+		.maxlen		= sizeof(lock_all),
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{ }
+};
+
+static struct ctl_table litmus_dir_table[] = {
+	{
+		.procname	= "litmus",
+ 		.mode		= 0555,
+		.child		= cache_table,
+	},
+	{ }
+};
+
+static struct ctl_table_header *litmus_sysctls;
+
+static int __init litmus_sysctl_init(void)
+{
+	int ret = 0;
+
+	printk(KERN_INFO "Registering LITMUS^RT proc sysctl.\n");
+	litmus_sysctls = register_sysctl_table(litmus_dir_table);
+	if (!litmus_sysctls) {
+		printk(KERN_WARNING "Could not register LITMUS^RT sysctl.\n");
+		ret = -EFAULT;
+		goto out;
+	}
+
+	l2_usable_ways = 16;
+
+out:
+	return ret;
+}
+
+module_init(litmus_sysctl_init);
\ No newline at end of file
-- 
1.8.1.2


From 709a4f8279a10ad85f5688808d11ffabff9ef25c Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Mon, 3 Nov 2014 21:54:53 -0500
Subject: [PATCH 054/119] Add MC2 plugin

---
 include/litmus/mc2_common.h |  39 ++
 litmus/mc2_common.c         |  30 ++
 litmus/sched_mc2.c          | 842 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 911 insertions(+)
 create mode 100644 include/litmus/mc2_common.h
 create mode 100644 litmus/mc2_common.c
 create mode 100644 litmus/sched_mc2.c

diff --git a/include/litmus/mc2_common.h b/include/litmus/mc2_common.h
new file mode 100644
index 0000000..a1d571f
--- /dev/null
+++ b/include/litmus/mc2_common.h
@@ -0,0 +1,39 @@
+/*
+ * MC^2 common data structures
+ */
+ 
+#ifndef __UNC_MC2_COMMON_H__
+#define __UNC_MC2_COMMON_H__
+
+enum crit_level {
+	CRIT_LEVEL_A = 0,
+	CRIT_LEVEL_B = 1,
+	CRIT_LEVEL_C = 2,
+	NUM_CRIT_LEVELS = 3,
+};
+
+struct mc2_task {
+	enum crit_level crit;
+	pid_t pid;
+	lt_t hyperperiod;
+};
+
+#ifdef __KERNEL__
+
+#include <litmus/reservation.h>
+
+struct mc2_param{
+	struct mc2_task mc2_task;
+};
+
+struct mc2_task_client {
+	struct task_client tc;
+	struct mc2_param mc2;
+};
+
+long mc2_task_client_init(struct mc2_task_client *mtc, struct task_struct *tsk,
+							struct reservation *res);
+	
+#endif /* __KERNEL__ */
+
+#endif
\ No newline at end of file
diff --git a/litmus/mc2_common.c b/litmus/mc2_common.c
new file mode 100644
index 0000000..56ef6b5
--- /dev/null
+++ b/litmus/mc2_common.c
@@ -0,0 +1,30 @@
+/*
+ * litmus/mc2_common.c
+ *
+ * Common functions for MC2 plugin.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/mc2_common.h>
+
+/* Set up mtc->tc for (tsk, res); -EINVAL if mtc's crit level is invalid. */
+long mc2_task_client_init(
+	struct mc2_task_client *mtc,
+	struct task_struct *tsk,
+	struct reservation *res
+)
+{
+	task_client_init(&mtc->tc, tsk, res);
+	if ((mtc->mc2.mc2_task.crit < CRIT_LEVEL_A) ||
+		(mtc->mc2.mc2_task.crit > CRIT_LEVEL_C))
+		return -EINVAL;	/* mtc->tc has been initialized regardless */
+	TRACE_TASK(tsk, "mc2_task_client_init: crit_level = %d\n", mtc->mc2.mc2_task.crit);
+	return 0;
+}
\ No newline at end of file
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
new file mode 100644
index 0000000..ab35008
--- /dev/null
+++ b/litmus/sched_mc2.c
@@ -0,0 +1,842 @@
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+#include <litmus/debug_trace.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/budget.h>
+#include <litmus/litmus_proc.h>
+
+#include <litmus/mc2_common.h>
+#include <litmus/reservation.h>
+#include <litmus/polling_reservations.h>
+
+struct mc2_task_state {
+	struct mc2_task_client res_info;
+	int cpu;
+	bool has_departed;
+};
+
+struct mc2_cpu_state {
+	raw_spinlock_t lock;
+
+	struct sup_reservation_environment sup_env;
+	struct hrtimer timer;
+
+	int cpu;
+	struct task_struct* scheduled;
+
+#ifdef CONFIG_RELEASE_MASTER
+	int release_master;
+	/* used to delegate releases */
+	struct hrtimer_start_on_info info;
+#endif
+};
+
+static DEFINE_PER_CPU(struct mc2_cpu_state, mc2_cpu_state);
+
+#define cpu_state_for(cpu_id)	(&per_cpu(mc2_cpu_state, cpu_id))
+#define local_cpu_state()	(&__get_cpu_var(mc2_cpu_state))
+
+static struct mc2_task_state* get_mc2_state(struct task_struct *tsk)
+{
+	return (struct mc2_task_state*) tsk_rt(tsk)->plugin_state;
+}
+
+static void task_departs(struct task_struct *tsk, int job_complete)
+{
+	struct mc2_task_state* state = get_mc2_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	res    = state->res_info.tc.client.reservation;
+	client = &state->res_info.tc.client;
+
+	res->ops->client_departs(res, client, job_complete);
+	state->has_departed = true;
+}
+
+static void task_arrives(struct task_struct *tsk)
+{
+	struct mc2_task_state* state = get_mc2_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	res    = state->res_info.tc.client.reservation;
+	client = &state->res_info.tc.client;
+
+	state->has_departed = false;
+	res->ops->client_arrives(res, client);
+}
+
+/* Program the next scheduler update for state->cpu. Drops state->lock. */
+static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
+{
+	int local;
+	lt_t update, now;
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+	
+	/* Check whether we are running on the CPU that owns this state:
+	 * this function is also called from mc2_task_resume(), which may
+	 * be invoked on any CPU when a thread resumes.
+	 */
+	local = local_cpu_state() == state;
+	
+	/* Must drop state lock before calling into hrtimer_start(), which
+	 * may raise a softirq, which in turn may wake ksoftirqd. */
+	raw_spin_unlock(&state->lock);
+
+	if (update <= now) {
+		litmus_reschedule(state->cpu);
+	} else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Reprogram only if not already set correctly. */
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
+			TRACE("canceling timer...\n");
+			hrtimer_cancel(&state->timer);
+			TRACE("setting scheduler timer for %llu\n", update);
+			/* We cannot use hrtimer_start() here because the
+			 * wakeup flag must be set to zero. */
+			__hrtimer_start_range_ns(&state->timer,
+					ns_to_ktime(update),
+					0 /* timer coalescing slack */,
+					HRTIMER_MODE_ABS_PINNED,
+					0 /* wakeup */);
+		}
+	} else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Poke remote core only if timer needs to be set earlier than
+		 * it is currently set.
+		 */
+		TRACE("mc2_update_timer for remote CPU %d (update=%llu, "
+		      "active:%d, set:%llu)\n",
+			state->cpu,
+			update,
+			hrtimer_active(&state->timer),
+			ktime_to_ns(hrtimer_get_expires(&state->timer)));
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) > update) {
+			TRACE("poking CPU %d so that it can update its "
+			       "scheduling timer (active:%d, set:%llu)\n",
+			       state->cpu,
+			       hrtimer_active(&state->timer),
+			       ktime_to_ns(hrtimer_get_expires(&state->timer)));
+			litmus_reschedule(state->cpu);
+		}
+	}
+}
+
+static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
+{
+	unsigned long flags;
+	enum hrtimer_restart restart = HRTIMER_NORESTART;
+	struct mc2_cpu_state *state;
+	lt_t update, now;
+
+	state = container_of(timer, struct mc2_cpu_state, timer);
+
+	/* The scheduling timer should only fire on the local CPU, because
+	 * otherwise deadlocks via timer_cancel() are possible.
+	 * Note: this does not interfere with dedicated interrupt handling, as
+	 * even under dedicated interrupt handling scheduling timers for
+	 * budget enforcement must occur locally on each CPU.
+	 */
+	BUG_ON(state->cpu != raw_smp_processor_id());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
+		now, update, state->cpu);
+
+	if (update <= now) {
+		litmus_reschedule_local();
+	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
+		hrtimer_set_expires(timer, ns_to_ktime(update));
+		restart = HRTIMER_RESTART;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	return restart;
+}
+
+/* Finish current's job and sleep until its next release, if still ahead. */
+static long mc2_complete_job(void)
+{
+	ktime_t next_release;
+	long err;
+
+	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
+					get_deadline(current));
+	tsk_rt(current)->completed = 1;
+	prepare_for_next_period(current);
+	next_release = ns_to_ktime(get_release(current));
+	preempt_disable();
+	TRACE_CUR("next_release=%llu\n", get_release(current));
+	if (get_release(current) > litmus_clock()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		preempt_enable_no_resched();
+		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+	} else {
+		err = 0;
+		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
+		preempt_enable();
+	}
+
+	/* err is a long, so it must be printed with %ld */
+	TRACE_CUR("mc2_complete_job returns [%ld] at %llu\n", err, litmus_clock());
+	return err;
+}
+
+static struct task_struct* mc2_schedule(struct task_struct * prev)
+{
+	/* next == NULL means "schedule background work". */
+	struct mc2_cpu_state *state = local_cpu_state();
+
+	raw_spin_lock(&state->lock);
+
+	BUG_ON(state->scheduled && state->scheduled != prev);
+	BUG_ON(state->scheduled && !is_realtime(prev));
+
+	/* update time */
+	state->sup_env.will_schedule = true;
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	/* check if prev task complete */
+	if (is_realtime(prev)) {
+		TRACE_TASK(prev, "EXEC_TIME = %llu, EXEC_COST = %llu, REMAINED = %llu\n",
+			get_exec_time(prev), get_exec_cost(prev), get_exec_cost(prev)-get_exec_time(prev)); 
+	}
+	if (is_realtime(prev) && (get_exec_time(prev) >= get_exec_cost(prev))) {
+		TRACE_TASK(prev, "JOB COMPLETED! but is_completed = %d\n", is_completed(prev));
+//		mc2_complete_job(prev);
+	}
+
+	/* remove task from reservation if it blocks */
+	if (is_realtime(prev) && !is_running(prev))
+		task_departs(prev, is_completed(prev));
+
+	/* figure out what to schedule next */
+	state->scheduled = sup_dispatch(&state->sup_env);
+
+	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
+	sched_state_task_picked();
+
+	/* program scheduler timer */
+	state->sup_env.will_schedule = false;
+	/* NOTE: drops state->lock */
+	mc2_update_timer_and_unlock(state);
+	
+	if (prev != state->scheduled && is_realtime(prev))
+		TRACE_TASK(prev, "descheduled.\n");
+	if (state->scheduled)
+		TRACE_TASK(state->scheduled, "scheduled.\n");
+
+	return state->scheduled;
+}
+
+static void resume_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now;
+	if (is_sporadic(tsk)) {
+		/* If this sporadic task was gone for a "long" time and woke up past
+		 * its deadline, then give it a new budget by triggering a job
+		 * release. This is purely cosmetic and has no effect on the
+		 * P-RES scheduler. */
+
+		now = litmus_clock();
+		if (is_tardy(tsk, now))
+			release_at(tsk, now);
+	}
+}
+
+/* Called when the state of tsk changes back to TASK_RUNNING.
+ * We need to requeue the task.
+ */
+static void mc2_task_resume(struct task_struct  *tsk)
+{
+	unsigned long flags;
+	struct mc2_task_state* tinfo = get_mc2_state(tsk);
+	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	/* Requeue only if self-suspension was already processed. */
+	if (tinfo->has_departed)
+	{
+		/* Assumption: litmus_clock() is synchronized across cores,
+		 * since we might not actually be executing on tinfo->cpu
+		 * at the moment. */
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_arrives(tsk);
+		/* NOTE: drops state->lock */
+		mc2_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else {
+		TRACE_TASK(tsk, "resume event ignored, still scheduled\n");
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+	}
+
+	resume_legacy_task_model_updates(tsk);
+}
+
+/* Admit tsk: bind it to the reservation named by its mc2_task parameters. */
+static long mc2_admit_task(struct task_struct *tsk)
+{
+	long err = -ESRCH;
+	unsigned long flags;
+	struct reservation *res;
+	struct mc2_cpu_state *state;
+	struct mc2_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
+	struct mc2_task *mp = tsk_rt(tsk)->plugin_state;
+
+	if (!tinfo)
+		return -ENOMEM;
+
+	if (!mp) {
+		printk(KERN_ERR "mc2_admit_task: criticality level has not been set\n");
+		goto out;	/* tinfo must not leak on this path */
+	}
+
+	preempt_disable();
+
+	state = cpu_state_for(task_cpu(tsk));
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	/* mp->pid carries the id of the reservation the task wants to join */
+	res = sup_find_by_id(&state->sup_env, mp->pid);
+	if (res) {
+		TRACE_TASK(tsk, "FOUND RES\n");
+		tinfo->res_info.mc2.mc2_task.crit = mp->crit;
+		err = mc2_task_client_init(&tinfo->res_info, tsk, res);
+	}
+
+	/* Install tinfo only on success, so that plugin_state never points
+	 * to memory that is freed below; on failure mp stays in place. */
+	if (!err) {
+		kfree(mp);
+		tinfo->cpu = task_cpu(tsk);
+		tinfo->has_departed = true;
+		tsk_rt(tsk)->plugin_state = tinfo;
+		/* disable LITMUS^RT's per-thread budget enforcement */
+		tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+	preempt_enable();
+out:
+	if (err)
+		kfree(tinfo);
+
+	return err;
+}
+
+static void task_new_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now = litmus_clock();
+
+	/* the first job exists starting as of right now */
+	release_at(tsk, now);
+}
+
+/* task_new callback: tsk arrives at its reservation if queued or running. */
+static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
+			  int is_running)
+{
+	unsigned long flags;
+	struct mc2_task_state* tinfo = get_mc2_state(tsk);
+	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
+		   litmus_clock(), on_runqueue, is_running);
+	/* acquire the lock protecting the state and disable interrupts */
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (is_running) {
+		state->scheduled = tsk;
+		/* make sure this task should actually be running */
+		litmus_reschedule_local();
+	}
+
+	if (on_runqueue || is_running) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in mc2_task_resume()] */
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_arrives(tsk);
+		/* NOTE: drops state->lock */
+		mc2_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	task_new_legacy_task_model_updates(tsk);
+}
+
+/* task_exit callback: tsk departs (if still running); its state is freed. */
+static void mc2_task_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+	struct mc2_task_state* tinfo = get_mc2_state(tsk);
+	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	if (state->scheduled == tsk)
+		state->scheduled = NULL;
+
+	/* remove from queues */
+	if (is_running(tsk)) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in mc2_task_resume()] */
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_departs(tsk, 0);
+		/* NOTE: drops state->lock */
+		mc2_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	kfree(tsk_rt(tsk)->plugin_state);
+	tsk_rt(tsk)->plugin_state = NULL;
+}
+
+/* Attach a copy of the user-supplied MC^2 parameters to non-RT task pid. */
+asmlinkage long sys_set_mc2_task_param(pid_t pid, struct mc2_task __user * param)
+{
+	struct task_struct *target;
+	int retval = -EINVAL;
+	struct mc2_task *mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+
+	if (!mp)
+		return -ENOMEM;
+
+	printk("Setting up mc^2 task parameters for process %d.\n", pid);
+
+	if (pid < 0 || param == 0)
+		goto out;
+	if (copy_from_user(mp, param, sizeof(*mp))) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/* Task search and manipulation must be protected */
+	read_lock_irq(&tasklist_lock);
+	if (!(target = find_task_by_vpid(pid))) {
+		retval = -ESRCH;
+		goto out_unlock;
+	}
+	if (is_realtime(target)) {
+		/* The task is already a real-time task.
+		 * We cannot allow parameter changes at this point.
+		 */
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+	if (mp->crit < CRIT_LEVEL_A || mp->crit >= NUM_CRIT_LEVELS) {
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+			"because of invalid criticality level\n", pid);
+		goto out_unlock;
+	}
+	/* NOTE(review): a previously installed plugin_state is overwritten */
+	target->rt_param.plugin_state = mp;
+	retval = 0;
+      out_unlock:
+	read_unlock_irq(&tasklist_lock);
+      out:
+	if (retval)
+		kfree(mp);	/* ownership was not handed to the task */
+	return retval;
+}
+
+static long create_polling_reservation(
+	int res_type,
+	struct reservation_config *config)
+{
+	struct mc2_cpu_state *state;
+	struct reservation* res;
+	struct polling_reservation *pres;
+	unsigned long flags;
+	int use_edf  = config->priority == LITMUS_NO_PRIORITY;
+	int periodic =  res_type == PERIODIC_POLLING;
+	long err = -EINVAL;
+
+	if (config->polling_params.budget >
+	    config->polling_params.period) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "budget > period\n", config->id);
+		return -EINVAL;
+	}
+	if (config->polling_params.budget >
+	    config->polling_params.relative_deadline
+	    && config->polling_params.relative_deadline) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "budget > deadline\n", config->id);
+		return -EINVAL;
+	}
+	if (config->polling_params.offset >
+	    config->polling_params.period) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "offset > period\n", config->id);
+		return -EINVAL;
+	}
+
+	/* Allocate before we grab a spin lock.
+	 * Todo: would be nice to use a core-local allocation.
+	 */
+	pres = kzalloc(sizeof(*pres), GFP_KERNEL);
+	if (!pres)
+		return -ENOMEM;
+
+	state = cpu_state_for(config->cpu);
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	res = sup_find_by_id(&state->sup_env, config->id);
+	if (!res) {
+		polling_reservation_init(pres, use_edf, periodic,
+			config->polling_params.budget,
+			config->polling_params.period,
+			config->polling_params.relative_deadline,
+			config->polling_params.offset);
+		pres->res.id = config->id;
+		if (!use_edf)
+			pres->res.priority = config->priority;
+		sup_add_new_reservation(&state->sup_env, &pres->res);
+		err = config->id;
+	} else {
+		err = -EEXIST;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	if (err < 0)
+		kfree(pres);
+
+	return err;
+}
+
+#define MAX_INTERVALS 1024
+
+/* Create a table-driven reservation from config; returns its id or -errno. */
+static long create_table_driven_reservation(
+	struct reservation_config *config)
+{
+	struct mc2_cpu_state *state;
+	struct reservation* res;
+	struct table_driven_reservation *td_res = NULL;
+	struct lt_interval *slots = NULL;
+	size_t slots_size;
+	unsigned int i, num_slots;
+	unsigned long flags;
+	long err = -EINVAL;
+
+	if (!config->table_driven_params.num_intervals) {
+		printk(KERN_ERR "invalid table-driven reservation (%u): "
+		       "no intervals\n", config->id);
+		return -EINVAL;
+	}
+
+	if (config->table_driven_params.num_intervals > MAX_INTERVALS) {
+		printk(KERN_ERR "invalid table-driven reservation (%u): "
+		       "too many intervals (max: %d)\n", config->id, MAX_INTERVALS);
+		return -EINVAL;
+	}
+
+	num_slots = config->table_driven_params.num_intervals;
+	slots_size = sizeof(slots[0]) * num_slots;
+	slots = kzalloc(slots_size, GFP_KERNEL);
+	if (!slots)
+		return -ENOMEM;
+
+	td_res = kzalloc(sizeof(*td_res), GFP_KERNEL);
+	if (!td_res)
+		err = -ENOMEM;
+	else	/* copy_from_user() returns the number of bytes NOT copied */
+		err = copy_from_user(slots,
+			config->table_driven_params.intervals, slots_size) ? -EFAULT : 0;
+
+	for (i=0; i<num_slots;i++) {
+		TRACE("###### [%llu, %llu]\n", slots[i].start, slots[i].end);
+	}
+
+	if (!err) {
+		/* sanity checks */
+		for (i = 0; !err && i < num_slots; i++)
+			if (slots[i].end <= slots[i].start) {
+				printk(KERN_ERR
+				       "invalid table-driven reservation (%u): "
+				       "invalid interval %u => [%llu, %llu]\n",
+				       config->id, i,
+				       slots[i].start, slots[i].end);
+				err = -EINVAL;
+			}
+
+		for (i = 0; !err && i + 1 < num_slots; i++)
+			if (slots[i + 1].start <= slots[i].end) {
+				printk(KERN_ERR
+				       "invalid table-driven reservation (%u): "
+				       "overlapping intervals %u, %u\n",
+				       config->id, i, i + 1);
+				err = -EINVAL;
+			}
+
+		if (slots[num_slots - 1].end >
+			config->table_driven_params.major_cycle_length) {
+			printk(KERN_ERR
+				"invalid table-driven reservation (%u): last "
+				"interval ends past major cycle %llu > %llu\n",
+				config->id,
+				slots[num_slots - 1].end,
+				config->table_driven_params.major_cycle_length);
+			err = -EINVAL;
+		}
+	}
+
+	if (!err) {
+		state = cpu_state_for(config->cpu);
+		raw_spin_lock_irqsave(&state->lock, flags);
+
+		res = sup_find_by_id(&state->sup_env, config->id);
+		if (!res) {
+			table_driven_reservation_init(td_res,
+				config->table_driven_params.major_cycle_length,
+				slots, num_slots);
+			td_res->res.id = config->id;
+			td_res->res.priority = config->priority;
+			sup_add_new_reservation(&state->sup_env, &td_res->res);
+			err = config->id;
+		} else {
+			err = -EEXIST;
+		}
+
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+	}
+
+	if (err < 0) {
+		kfree(slots);
+		kfree(td_res);
+	}
+
+	TRACE("CREATE_TABLE_DRIVEN_RES = %ld\n", err);
+	return err;
+}
+
+/* Copy the user config and create a reservation of type res_type from it. */
+static long mc2_reservation_create(int res_type, void* __user _config)
+{
+	long ret = -EINVAL;
+	struct reservation_config config;
+
+	TRACE("Attempt to create reservation (%d)\n", res_type);
+	if (copy_from_user(&config, _config, sizeof(config)))
+		return -EFAULT;
+
+	/* config.cpu is user-controlled: bound it before it indexes anything */
+	if (config.cpu < 0 || config.cpu >= nr_cpu_ids ||
+	    !cpu_online(config.cpu)) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "CPU %d offline\n", config.id, config.cpu);
+		return -EINVAL;
+	}
+
+	switch (res_type) {
+		case PERIODIC_POLLING:
+		case SPORADIC_POLLING:
+			ret = create_polling_reservation(res_type, &config);
+			break;
+		case TABLE_DRIVEN:
+			ret = create_table_driven_reservation(&config);
+			break;
+		default:
+			return -EINVAL;
+	};
+
+	return ret;
+}
+
+/* Unlink reservation reservation_id from cpu's reservation lists.
+ * Returns 0 on success, -EINVAL if cpu is invalid or the id is unknown. */
+static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
+{
+	long ret = -EINVAL;
+	struct mc2_cpu_state *state;
+	struct reservation *res, *next;
+	struct sup_reservation_environment *sup_env;
+	unsigned long flags;
+
+	/* cpu comes straight from the syscall: validate it before it is
+	 * used to index per-CPU state. */
+	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu))
+		return -EINVAL;
+
+	state = cpu_state_for(cpu);
+	raw_spin_lock_irqsave(&state->lock, flags);
+	sup_env = &state->sup_env;
+	/* NOTE(review): the reservation is only unlinked here, never freed,
+	 * and attached clients are not checked --- confirm this is intended. */
+	list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
+		if (res->id == reservation_id) {
+			list_del(&res->list);
+			ret = 0;
+		}
+	}
+	if (ret) {
+		list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
+			if (res->id == reservation_id) {
+				list_del(&res->list);
+				ret = 0;
+			}
+		}
+	}
+	if (ret) {
+		list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
+			if (res->id == reservation_id) {
+				list_del(&res->list);
+				ret = 0;
+			}
+		}
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+	TRACE("RESERVATION_DESTROY ret = %ld\n", ret);
+	return ret;
+}
+
+static struct domain_proc_info mc2_domain_proc_info;
+
+static long mc2_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &mc2_domain_proc_info;
+	return 0;
+}
+
+/* Fill mc2_domain_proc_info with one domain per online CPU. */
+static void mc2_setup_domain_proc(void)
+{
+	int i, cpu;
+	int num_rt_cpus = num_online_cpus();
+	struct cd_mapping *cpu_map, *domain_map;
+
+	/* memset() takes (dest, value, length): clear the whole structure */
+	memset(&mc2_domain_proc_info, 0, sizeof(mc2_domain_proc_info));
+	init_domain_proc_info(&mc2_domain_proc_info, num_rt_cpus, num_rt_cpus);
+	mc2_domain_proc_info.num_cpus = num_rt_cpus;
+	mc2_domain_proc_info.num_domains = num_rt_cpus;
+
+	i = 0;
+	for_each_online_cpu(cpu) {
+		cpu_map = &mc2_domain_proc_info.cpu_to_domains[i];
+		domain_map = &mc2_domain_proc_info.domain_to_cpus[i];
+		cpu_map->id = cpu;
+		domain_map->id = i;
+		cpumask_set_cpu(i, cpu_map->mask);
+		cpumask_set_cpu(cpu, domain_map->mask);
+		++i;
+	}
+}
+
+static long mc2_activate_plugin(void)
+{
+	int cpu;
+	struct mc2_cpu_state *state;
+
+	for_each_online_cpu(cpu) {
+		TRACE("Initializing CPU%d...\n", cpu);
+
+		state = cpu_state_for(cpu);
+
+#ifdef CONFIG_RELEASE_MASTER
+		state->release_master = atomic_read(&release_master_cpu);
+		hrtimer_start_on_info_init(&state->info);
+#endif
+		
+		raw_spin_lock_init(&state->lock);
+		state->cpu = cpu;
+		state->scheduled = NULL;
+
+		sup_init(&state->sup_env);
+
+		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		state->timer.function = on_scheduling_timer;
+	}
+
+	mc2_setup_domain_proc();
+
+	return 0;
+}
+
+/* Tear down all per-CPU MC^2 state: stop the scheduling timers and free
+ * every reservation. Always returns 0. */
+static long mc2_deactivate_plugin(void)
+{
+	int cpu;
+	struct mc2_cpu_state *state;
+	struct reservation *res;
+
+	for_each_online_cpu(cpu) {
+		state = cpu_state_for(cpu);
+		/* Cancel before locking: hrtimer_cancel() waits for a running
+		 * on_scheduling_timer(), which itself acquires state->lock. */
+		hrtimer_cancel(&state->timer);
+		raw_spin_lock(&state->lock);
+		/* Delete all reservations --- assumes struct reservation
+		 * is prefix of containing struct. */
+		while (!list_empty(&state->sup_env.active_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.active_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+
+		while (!list_empty(&state->sup_env.inactive_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.inactive_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+
+		while (!list_empty(&state->sup_env.depleted_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.depleted_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+		raw_spin_unlock(&state->lock);
+	}
+
+	destroy_domain_proc_info(&mc2_domain_proc_info);
+	return 0;
+}
+
+static struct sched_plugin mc2_plugin = {
+	.plugin_name		= "MC2",
+	.schedule		= mc2_schedule,
+	.task_wake_up		= mc2_task_resume,
+	.admit_task		= mc2_admit_task,
+	.task_new		= mc2_task_new,
+	.task_exit		= mc2_task_exit,
+	.complete_job           = mc2_complete_job,
+	.get_domain_proc_info   = mc2_get_domain_proc_info,
+	.activate_plugin	= mc2_activate_plugin,
+	.deactivate_plugin      = mc2_deactivate_plugin,
+	.reservation_create     = mc2_reservation_create,
+	.reservation_destroy	= mc2_reservation_destroy,
+};
+
+static int __init init_mc2(void)
+{
+	return register_sched_plugin(&mc2_plugin);
+}
+
+module_init(init_mc2);
+
-- 
1.8.1.2


From abd10c08d222f23322ba91cc493ef1095bdb5f86 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Fri, 20 Jun 2014 11:29:09 +0200
Subject: [PATCH 055/119] default_wait_for_release_at() should invoke plugin
 callback

Instead of calling complete_job() directly, the default implementation
of wait_for_release_at() should invoke the plugin-provided
complete_job() method to support plugins that happen to override
complete_job(), but not wait_for_release_at().
---
 litmus/jobs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/litmus/jobs.c b/litmus/jobs.c
index 2d9f8aa..547222c 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -4,6 +4,7 @@
 #include <linux/sched.h>
 
 #include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
 #include <litmus/jobs.h>
 
 static inline void setup_release(struct task_struct *t, lt_t release)
@@ -58,7 +59,7 @@ long default_wait_for_release_at(lt_t release_time)
 	tsk_rt(t)->sporadic_release = 1;
 	local_irq_restore(flags);
 
-	return complete_job();
+	return litmus->complete_job();
 }
 
 
-- 
1.8.1.2


From 70f269792d87c26f03a93c6715ea351c2eee62a3 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sat, 14 Jun 2014 17:15:00 +0200
Subject: [PATCH 056/119] Add void* plugin_state pointer to task_struct

---
 include/litmus/rt_param.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index e26535b..060b5d7 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -249,7 +249,10 @@ struct rt_param {
 	volatile int		linked_on;
 
 	/* PFAIR/PD^2 state. Allocated on demand. */
-	struct pfair_param*	pfair;
+	union {
+		void *plugin_state;
+		struct pfair_param *pfair;
+	};
 
 	/* Fields saved before BE->RT transition.
 	 */
-- 
1.8.1.2


From 277dbeea9af1ca31add69636aef4b18892e54646 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 16 Jul 2014 16:52:13 +0200
Subject: [PATCH 057/119] Add reservation creation API to plugin interface &
 syscalls

---
 include/litmus/sched_plugin.h |  9 +++++++++
 litmus/litmus.c               | 10 ++++++++++
 litmus/sched_plugin.c         | 13 +++++++++++++
 3 files changed, 32 insertions(+)

diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
index 0ccccd6..cb663b8 100644
--- a/include/litmus/sched_plugin.h
+++ b/include/litmus/sched_plugin.h
@@ -77,6 +77,11 @@ typedef long (*wait_for_release_at_t)(lt_t release_time);
 /* Informs the plugin when a synchronous release takes place. */
 typedef void (*synchronous_release_at_t)(lt_t time_zero);
 
+/* Reservation creation/removal backends. Meaning of reservation_type and
+ * reservation_id are entirely plugin-specific. */
+typedef long (*reservation_create_t)(int reservation_type, void* __user config);
+typedef long (*reservation_destroy_t)(unsigned int reservation_id, int cpu);
+
 /************************ misc routines ***********************/
 
 
@@ -109,6 +114,10 @@ struct sched_plugin {
 	task_exit_t 		task_exit;
 	task_cleanup_t		task_cleanup;
 
+	/* Reservation support */
+	reservation_create_t	reservation_create;
+	reservation_destroy_t	reservation_destroy;
+
 #ifdef CONFIG_LITMUS_LOCKING
 	/*	locking protocols	*/
 	allocate_lock_t		allocate_lock;
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 14b1031..0b87e04 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -315,6 +315,16 @@ asmlinkage long sys_null_call(cycles_t __user *ts)
 	return ret;
 }
 
+asmlinkage long sys_reservation_create(int type, void __user *config)
+{
+	return litmus->reservation_create(type, config);
+}
+
+asmlinkage long sys_reservation_destroy(unsigned int reservation_id, int cpu)
+{
+	return litmus->reservation_destroy(reservation_id, cpu);
+}
+
 /* p is a real-time task. Re-init its state as a best-effort task. */
 static void reinit_litmus_state(struct task_struct* p, int restore)
 {
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
index edd91e9..b917793 100644
--- a/litmus/sched_plugin.c
+++ b/litmus/sched_plugin.c
@@ -132,6 +132,17 @@ static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
 
 #endif
 
+static long  litmus_dummy_reservation_create(
+	int reservation_type,
+	void* __user config)
+{
+	return -EINVAL;
+}
+
+static long litmus_dummy_reservation_destroy(unsigned int reservation_id, int cpu)
+{
+	return -EINVAL;
+}
 
 /* The default scheduler plugin. It doesn't do anything and lets Linux do its
  * job.
@@ -193,6 +204,8 @@ int register_sched_plugin(struct sched_plugin* plugin)
 #endif
 	CHECK(admit_task);
 	CHECK(synchronous_release_at);
+	CHECK(reservation_destroy);
+	CHECK(reservation_create);
 
 	if (!plugin->wait_for_release_at)
 		plugin->wait_for_release_at = default_wait_for_release_at;
-- 
1.8.1.2


From cd6cb2ecd3238a0a1f05408e0b8148c1ecc80f59 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 16 Jul 2014 17:29:07 +0200
Subject: [PATCH 058/119] Add reservation system calls to x86 syscall table

---
 arch/x86/syscalls/syscall_32.tbl | 2 ++
 arch/x86/syscalls/syscall_64.tbl | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index ffe39dd..290c879 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -370,3 +370,5 @@
 360	i386	wait_for_ts_release	sys_wait_for_ts_release
 361	i386	release_ts		sys_release_ts
 362	i386	null_call		sys_null_call
+363 i386    reservation_create sys_reservation_create
+364 i386    reservation_destroy sys_reservation_destroy
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index cde714e..d39de2a 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -333,6 +333,9 @@
 360	common	wait_for_ts_release	sys_wait_for_ts_release
 361	common	release_ts		sys_release_ts
 362	common	null_call		sys_null_call
+363 common  reservation_create  sys_reservation_create
+364 common  reservation_destroy sys_reservation_destroy
+
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
-- 
1.8.1.2


From 4b9d58f6f3441c8bbf37dfc24ae9dee04f64c9cb Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 16 Jul 2014 17:38:05 +0200
Subject: [PATCH 059/119] Add generic reservation syscall table definitions

---
 include/litmus/unistd_32.h | 4 +++-
 include/litmus/unistd_64.h | 7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
index 94264c2..5f6a274 100644
--- a/include/litmus/unistd_32.h
+++ b/include/litmus/unistd_32.h
@@ -17,5 +17,7 @@
 #define __NR_wait_for_ts_release __LSC(9)
 #define __NR_release_ts		__LSC(10)
 #define __NR_null_call		__LSC(11)
+#define __NR_reservation_create	__LSC(12)
+#define __NR_reservation_destroy __LSC(13)
 
-#define NR_litmus_syscalls 12
+#define NR_litmus_syscalls 14
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
index d5ced0d..3e6b1d3 100644
--- a/include/litmus/unistd_64.h
+++ b/include/litmus/unistd_64.h
@@ -29,5 +29,10 @@ __SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
 __SYSCALL(__NR_release_ts, sys_release_ts)
 #define __NR_null_call				__LSC(11)
 __SYSCALL(__NR_null_call, sys_null_call)
+#define __NR_reservation_create			__LSC(12)
+__SYSCALL(__NR_reservation_create, sys_reservation_create)
+#define __NR_reservation_destroy		__LSC(13)
+__SYSCALL(__NR_reservation_destroy, sys_reservation_destroy)
 
-#define NR_litmus_syscalls 12
+
+#define NR_litmus_syscalls 14
-- 
1.8.1.2


From 1d65b6286a0f6c13495eefbb41bd1cac3d420cc3 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 16 Jul 2014 17:38:37 +0200
Subject: [PATCH 060/119]  Add reservation system calls to ARM syscall table

---
 arch/arm/kernel/calls.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 2da776a..ad22fcc 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -401,6 +401,8 @@
         	CALL(sys_wait_for_ts_release)
 /* 390 */	CALL(sys_release_ts)
 		CALL(sys_null_call)
+	    CALL(sys_reservation_create)
+	    CALL(sys_reservation_destroy)
 
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
-- 
1.8.1.2


From a88daa29933e6c2b1b3b4d616450a35137e59723 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sat, 14 Jun 2014 12:42:30 +0200
Subject: [PATCH 061/119] Add basic generic reservation-based scheduling
 infrastructure

---
 include/litmus/polling_reservations.h |  37 +++
 include/litmus/reservation.h          | 189 +++++++++++++++
 litmus/Makefile                       |   2 +
 litmus/polling_reservations.c         | 436 ++++++++++++++++++++++++++++++++++
 litmus/reservation.c                  | 298 +++++++++++++++++++++++
 5 files changed, 962 insertions(+)
 create mode 100644 include/litmus/polling_reservations.h
 create mode 100644 include/litmus/reservation.h
 create mode 100644 litmus/polling_reservations.c
 create mode 100644 litmus/reservation.c

diff --git a/include/litmus/polling_reservations.h b/include/litmus/polling_reservations.h
new file mode 100644
index 0000000..9958a92
--- /dev/null
+++ b/include/litmus/polling_reservations.h
@@ -0,0 +1,37 @@
+#ifndef LITMUS_POLLING_RESERVATIONS_H
+#define LITMUS_POLLING_RESERVATIONS_H
+
+#include <litmus/reservation.h>
+
+struct polling_reservation {
+	/* extend basic reservation */
+	struct reservation res;
+
+	lt_t max_budget;
+	lt_t period;
+	lt_t deadline;
+	lt_t offset;
+};
+
+void polling_reservation_init(struct polling_reservation *pres, int use_edf_prio,
+	int use_periodic_polling, lt_t budget, lt_t period, lt_t deadline, lt_t offset);
+
+struct lt_interval {
+	lt_t start;
+	lt_t end;
+};
+
+struct table_driven_reservation {
+	/* extend basic reservation */
+	struct reservation res;
+
+	lt_t major_cycle;
+	unsigned int next_interval;
+	unsigned int num_intervals;
+	struct lt_interval *intervals;
+};
+
+void table_driven_reservation_init(struct table_driven_reservation *tdres,
+	lt_t major_cycle, struct lt_interval *intervals, unsigned int num_intervals);
+
+#endif
diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
new file mode 100644
index 0000000..d8d6ce3
--- /dev/null
+++ b/include/litmus/reservation.h
@@ -0,0 +1,189 @@
+#ifndef LITMUS_RESERVATION_H
+#define LITMUS_RESERVATION_H
+
+#include <linux/list.h>
+#include <linux/hrtimer.h>
+
+struct reservation_client;
+struct reservation_environment;
+struct reservation;
+
+typedef enum {
+	/* reservation has no clients, is not consuming budget */
+	RESERVATION_INACTIVE = 0,
+
+	/* reservation has clients, consumes budget when scheduled */
+	RESERVATION_ACTIVE,
+
+	/* reservation has no clients, but may be consuming budget */
+	RESERVATION_ACTIVE_IDLE,
+
+	/* Reservation has no budget and waits for
+	 * replenishment. May or may not have clients. */
+	RESERVATION_DEPLETED,
+} reservation_state_t;
+
+
+/* ************************************************************************** */
+
+/* Select which task to dispatch. If NULL is returned, it means there is nothing
+ * to schedule right now and background work can be scheduled. */
+typedef struct task_struct * (*dispatch_t)  (
+	struct reservation_client *client
+);
+
+/* Something that can be managed in a reservation and that can yield
+ * a process for dispatching. */
+struct reservation_client {
+	struct list_head list;
+	dispatch_t dispatch;
+};
+
+
+/* ************************************************************************** */
+
+/* Called by reservations to request state change. */
+typedef void (*reservation_change_state_t)  (
+	struct reservation_environment* env,
+	struct reservation *res,
+	reservation_state_t new_state
+);
+
+/* The framework within which reservations operate. */
+struct reservation_environment {
+	lt_t time_zero;
+	lt_t current_time;
+
+	/* services invoked by reservations */
+	reservation_change_state_t change_state;
+};
+
+
+/* ************************************************************************** */
+
+/* A new client is added or an existing client resumes. */
+typedef void (*client_arrives_t)  (
+	struct reservation *reservation,
+	struct reservation_client *client
+);
+
+/* A client suspends or terminates. */
+typedef void (*client_departs_t)  (
+	struct reservation *reservation,
+	struct reservation_client *client,
+	int did_signal_job_completion
+);
+
+/* A previously requested replenishment has occurred. */
+typedef void (*on_replenishment_timer_t)  (
+	struct reservation *reservation
+);
+
+/* Update the reservation's budget to reflect execution or idling. */
+typedef void (*drain_budget_t) (
+	struct reservation *reservation,
+	lt_t how_much
+);
+
+/* Select a ready task from one of the clients for scheduling. */
+typedef struct task_struct* (*dispatch_client_t)  (
+	struct reservation *reservation,
+	lt_t *time_slice /* May be used to force rescheduling after
+	                    some amount of time. 0 => no limit */
+);
+
+
+struct reservation_ops {
+	dispatch_client_t dispatch_client;
+
+	client_arrives_t client_arrives;
+	client_departs_t client_departs;
+
+	on_replenishment_timer_t replenish;
+	drain_budget_t drain_budget;
+};
+
+struct reservation {
+	/* used to queue in environment */
+	struct list_head list;
+
+	reservation_state_t state;
+	unsigned int id;
+
+	/* exact meaning defined by impl. */
+	lt_t priority;
+	lt_t cur_budget;
+	lt_t next_replenishment;
+
+	/* interaction with framework */
+	struct reservation_environment *env;
+	struct reservation_ops *ops;
+
+	struct list_head clients;
+};
+
+void reservation_init(struct reservation *res);
+
+/* Default implementations */
+
+/* dispatch the first client that yields a task; sets *for_at_most to zero */
+struct task_struct* default_dispatch_client(
+	struct reservation *res,
+	lt_t *for_at_most
+);
+
+/* "connector" reservation client to hook up tasks with reservations */
+struct task_client {
+	struct reservation_client client;
+	struct reservation* reservation;
+	struct task_struct *task;
+};
+
+void task_client_init(struct task_client *tc, struct task_struct *task,
+	struct reservation *reservation);
+
+#define SUP_RESCHEDULE_NOW (0)
+#define SUP_NO_SCHEDULER_UPDATE (ULLONG_MAX)
+
+/* A simple uniprocessor (SUP) flat (i.e., non-hierarchical) reservation
+ * environment.
+ */
+struct sup_reservation_environment {
+	struct reservation_environment env;
+
+	/* ordered by priority */
+	struct list_head active_reservations;
+
+	/* ordered by next_replenishment */
+	struct list_head depleted_reservations;
+
+	/* unordered */
+	struct list_head inactive_reservations;
+
+	/* - SUP_RESCHEDULE_NOW means call sup_dispatch() now
+	 * - SUP_NO_SCHEDULER_UPDATE means nothing to do
+	 * any other value means program a timer for the given time
+	 */
+	lt_t next_scheduler_update;
+	/* set to true if a call to sup_dispatch() is imminent */
+	bool will_schedule;
+};
+
+/* Contract:
+ *  - before calling into sup_ code, or any reservation methods,
+ *    update the time with sup_update_time(); and
+ *  - after calling into sup_ code, or any reservation methods,
+ *    check next_scheduler_update and program timer or trigger
+ *    scheduler invocation accordingly.
+ */
+
+void sup_init(struct sup_reservation_environment* sup_env);
+void sup_add_new_reservation(struct sup_reservation_environment* sup_env,
+	struct reservation* new_res);
+void sup_update_time(struct sup_reservation_environment* sup_env, lt_t now);
+struct task_struct* sup_dispatch(struct sup_reservation_environment* sup_env);
+
+struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
+	unsigned int id);
+
+#endif
diff --git a/litmus/Makefile b/litmus/Makefile
index 84b173a..e3439c8 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -32,3 +32,5 @@ obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
 obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
 obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
 obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
+
+obj-y += reservation.o polling_reservations.o
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
new file mode 100644
index 0000000..08034c3
--- /dev/null
+++ b/litmus/polling_reservations.c
@@ -0,0 +1,436 @@
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/reservation.h>
+#include <litmus/polling_reservations.h>
+
+
+static void periodic_polling_client_arrives(
+	struct reservation* res,
+	struct reservation_client *client
+)
+{
+	struct polling_reservation *pres =
+		container_of(res, struct polling_reservation, res);
+	lt_t instances, tmp;
+
+	list_add_tail(&client->list, &res->clients);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			/* Figure out next replenishment time. */
+			tmp = res->env->current_time - res->env->time_zero;
+			instances =  div64_u64(tmp, pres->period);
+			res->next_replenishment =
+				(instances + 1) * pres->period + pres->offset;
+
+			TRACE("pol-res: activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
+				tmp, instances, pres->period, res->next_replenishment,
+				res->env->current_time);
+
+			res->env->change_state(res->env, res,
+				RESERVATION_DEPLETED);
+			break;
+
+		case RESERVATION_ACTIVE:
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+	}
+}
+
+
+static void periodic_polling_client_departs(
+	struct reservation *res,
+	struct reservation_client *client,
+	int did_signal_job_completion
+)
+{
+	list_del(&client->list);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+		case RESERVATION_ACTIVE_IDLE:
+			BUG(); /* INACTIVE or IDLE <=> no client */
+			break;
+
+		case RESERVATION_ACTIVE:
+			if (list_empty(&res->clients)) {
+				res->env->change_state(res->env, res,
+					did_signal_job_completion ?
+						RESERVATION_DEPLETED :
+						RESERVATION_ACTIVE_IDLE);
+			} /* else: nothing to do, more clients ready */
+			break;
+
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+	}
+}
+
+static void periodic_polling_on_replenishment(
+	struct reservation *res
+)
+{
+	struct polling_reservation *pres =
+		container_of(res, struct polling_reservation, res);
+
+	/* replenish budget */
+	res->cur_budget = pres->max_budget;
+	res->next_replenishment += pres->period;
+
+	switch (res->state) {
+		case RESERVATION_DEPLETED:
+		case RESERVATION_INACTIVE:
+		case RESERVATION_ACTIVE_IDLE:
+			if (list_empty(&res->clients))
+				/* no clients => poll again later */
+				res->env->change_state(res->env, res,
+					RESERVATION_INACTIVE);
+			else
+				/* we have clients & budget => ACTIVE */
+				res->env->change_state(res->env, res,
+					RESERVATION_ACTIVE);
+			break;
+
+		case RESERVATION_ACTIVE:
+			/* Replenished while active => tardy? In any case,
+			 * go ahead and stay active. */
+			break;
+	}
+}
+
+static void periodic_polling_on_replenishment_edf(
+	struct reservation *res
+)
+{
+	struct polling_reservation *pres =
+		container_of(res, struct polling_reservation, res);
+
+	/* update current priority */
+	res->priority = res->next_replenishment + pres->deadline;
+
+	/* do common updates */
+	periodic_polling_on_replenishment(res);
+}
+
+static void common_drain_budget(
+		struct reservation *res,
+		lt_t how_much)
+{
+	if (how_much >= res->cur_budget)
+		res->cur_budget = 0;
+	else
+		res->cur_budget -= how_much;
+
+	switch (res->state) {
+		case RESERVATION_DEPLETED:
+		case RESERVATION_INACTIVE:
+			BUG();
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+		case RESERVATION_ACTIVE:
+			if (!res->cur_budget) {
+				res->env->change_state(res->env, res,
+					RESERVATION_DEPLETED);
+			} /* else: stay in current state */
+			break;
+	}
+}
+
+static struct reservation_ops periodic_polling_ops_fp = {
+	.dispatch_client = default_dispatch_client,
+	.client_arrives = periodic_polling_client_arrives,
+	.client_departs = periodic_polling_client_departs,
+	.replenish = periodic_polling_on_replenishment,
+	.drain_budget = common_drain_budget,
+};
+
+static struct reservation_ops periodic_polling_ops_edf = {
+	.dispatch_client = default_dispatch_client,
+	.client_arrives = periodic_polling_client_arrives,
+	.client_departs = periodic_polling_client_departs,
+	.replenish = periodic_polling_on_replenishment_edf,
+	.drain_budget = common_drain_budget,
+};
+
+
+
+
+static void sporadic_polling_client_arrives_fp(
+	struct reservation* res,
+	struct reservation_client *client
+)
+{
+	struct polling_reservation *pres =
+		container_of(res, struct polling_reservation, res);
+
+	list_add_tail(&client->list, &res->clients);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			/* Replenish now. */
+			res->cur_budget = pres->max_budget;
+			res->next_replenishment =
+				res->env->current_time + pres->period;
+
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+
+		case RESERVATION_ACTIVE:
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+	}
+}
+
+static void sporadic_polling_client_arrives_edf(
+	struct reservation* res,
+	struct reservation_client *client
+)
+{
+	struct polling_reservation *pres =
+		container_of(res, struct polling_reservation, res);
+
+	list_add_tail(&client->list, &res->clients);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			/* Replenish now. */
+			res->cur_budget = pres->max_budget;
+			res->next_replenishment =
+				res->env->current_time + pres->period;
+			res->priority =
+				res->env->current_time + pres->deadline;
+
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+
+		case RESERVATION_ACTIVE:
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+	}
+}
+
+static struct reservation_ops sporadic_polling_ops_fp = {
+	.dispatch_client = default_dispatch_client,
+	.client_arrives = sporadic_polling_client_arrives_fp,
+	.client_departs = periodic_polling_client_departs,
+	.replenish = periodic_polling_on_replenishment,
+	.drain_budget = common_drain_budget,
+};
+
+static struct reservation_ops sporadic_polling_ops_edf = {
+	.dispatch_client = default_dispatch_client,
+	.client_arrives = sporadic_polling_client_arrives_edf,
+	.client_departs = periodic_polling_client_departs,
+	.replenish = periodic_polling_on_replenishment_edf,
+	.drain_budget = common_drain_budget,
+};
+
+void polling_reservation_init(
+	struct polling_reservation *pres,
+	int use_edf_prio,
+	int use_periodic_polling,
+	lt_t budget, lt_t period, lt_t deadline, lt_t offset
+)
+{
+	if (!deadline)
+		deadline = period;
+	BUG_ON(budget > period);
+	BUG_ON(budget > deadline);
+	BUG_ON(offset >= period);
+
+	reservation_init(&pres->res);
+	pres->max_budget = budget;
+	pres->period = period;
+	pres->deadline = deadline;
+	pres->offset = offset;
+	if (use_periodic_polling) {
+		if (use_edf_prio)
+			pres->res.ops = &periodic_polling_ops_edf;
+		else
+			pres->res.ops = &periodic_polling_ops_fp;
+	} else {
+		if (use_edf_prio)
+			pres->res.ops = &sporadic_polling_ops_edf;
+		else
+			pres->res.ops = &sporadic_polling_ops_fp;
+	}
+}
+
+/* Start of the current major cycle, as an offset from env->time_zero. */
+static lt_t td_cur_major_cycle_start(struct table_driven_reservation *tdres)
+{
+	lt_t x, tmp;
+
+	tmp = tdres->res.env->current_time - tdres->res.env->time_zero;
+	x = div64_u64(tmp, tdres->major_cycle);
+	x *= tdres->major_cycle;
+	return x; /* NOTE(review): time_zero is not added back -- confirm */
+}
+
+/* Start of the next major cycle, as an offset from env->time_zero. */
+static lt_t td_next_major_cycle_start(struct table_driven_reservation *tdres)
+{
+	lt_t x, tmp;
+
+	tmp = tdres->res.env->current_time - tdres->res.env->time_zero;
+	x = div64_u64(tmp, tdres->major_cycle) + 1;
+	x *= tdres->major_cycle;
+	return x; /* NOTE(review): time_zero is not added back -- confirm */
+}
+
+static void td_client_arrives(
+	struct reservation* res,
+	struct reservation_client *client
+)
+{
+	struct table_driven_reservation *tdres =
+		container_of(res, struct table_driven_reservation, res);
+
+	list_add_tail(&client->list, &res->clients);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			/* Figure out first replenishment time. */
+			res->next_replenishment = td_next_major_cycle_start(tdres);
+			res->next_replenishment += tdres->intervals[0].start;
+			tdres->next_interval = 0;
+
+			res->env->change_state(res->env, res,
+				RESERVATION_DEPLETED);
+			break;
+
+		case RESERVATION_ACTIVE:
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+			res->env->change_state(res->env, res,
+				RESERVATION_ACTIVE);
+			break;
+	}
+}
+
+static void td_client_departs(
+	struct reservation *res,
+	struct reservation_client *client,
+	int did_signal_job_completion
+)
+{
+	list_del(&client->list);
+
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+		case RESERVATION_ACTIVE_IDLE:
+			BUG(); /* INACTIVE or IDLE <=> no client */
+			break;
+
+		case RESERVATION_ACTIVE:
+			if (list_empty(&res->clients)) {
+				res->env->change_state(res->env, res,
+						RESERVATION_ACTIVE_IDLE);
+			} /* else: nothing to do, more clients ready */
+			break;
+
+		case RESERVATION_DEPLETED:
+			/* do nothing */
+			break;
+	}
+}
+
+static lt_t td_interval_length(struct lt_interval *ival)
+{
+	return ival->end - ival->start;
+}
+
+static void td_replenish(
+	struct reservation *res
+)
+{
+	struct table_driven_reservation *tdres =
+		container_of(res, struct table_driven_reservation, res);
+
+	/* replenish budget */
+	res->cur_budget = td_interval_length(tdres->intervals + tdres->next_interval);
+
+	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
+	if (tdres->next_interval)
+		res->next_replenishment = td_cur_major_cycle_start(tdres);
+	else
+		/* wrap to next major cycle */
+		res->next_replenishment = td_next_major_cycle_start(tdres);
+	res->next_replenishment += tdres->intervals[tdres->next_interval].start;
+
+
+	switch (res->state) {
+		case RESERVATION_DEPLETED:
+		case RESERVATION_ACTIVE:
+		case RESERVATION_ACTIVE_IDLE:
+			if (list_empty(&res->clients))
+				res->env->change_state(res->env, res,
+					RESERVATION_ACTIVE_IDLE);
+			else
+				/* we have clients & budget => ACTIVE */
+				res->env->change_state(res->env, res,
+					RESERVATION_ACTIVE);
+			break;
+
+		case RESERVATION_INACTIVE:
+			BUG();
+			break;
+	}
+}
+
+static struct reservation_ops td_ops = {
+	.dispatch_client = default_dispatch_client,
+	.client_arrives = td_client_arrives,
+	.client_departs = td_client_departs,
+	.replenish = td_replenish,
+	.drain_budget = common_drain_budget,
+};
+
+void table_driven_reservation_init(
+	struct table_driven_reservation *tdres,
+	lt_t major_cycle,
+	struct lt_interval *intervals,
+	unsigned int num_intervals)
+{
+	unsigned int i;
+
+	/* sanity checking */
+	BUG_ON(!num_intervals);
+	for (i = 0; i < num_intervals; i++)
+		BUG_ON(intervals[i].end <= intervals[i].start);
+	for (i = 0; i + 1 < num_intervals; i++)
+		BUG_ON(intervals[i + 1].start <= intervals[i].end);
+	BUG_ON(intervals[num_intervals - 1].end > major_cycle);
+
+	reservation_init(&tdres->res);
+	tdres->major_cycle = major_cycle;
+	tdres->intervals = intervals;
+	tdres->num_intervals = num_intervals;
+	tdres->res.ops = &td_ops;
+}
diff --git a/litmus/reservation.c b/litmus/reservation.c
new file mode 100644
index 0000000..bc32b2e
--- /dev/null
+++ b/litmus/reservation.c
@@ -0,0 +1,298 @@
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/reservation.h>
+
+void reservation_init(struct reservation *res)
+{
+	memset(res, 0, sizeof(*res)); /* args are (dst, value, length) */
+	res->state = RESERVATION_INACTIVE; /* no clients yet */
+	INIT_LIST_HEAD(&res->clients);
+}
+
+struct task_struct* default_dispatch_client(
+	struct reservation *res,
+	lt_t *for_at_most)
+{
+	struct reservation_client *client, *next;
+	struct task_struct* tsk;
+
+	BUG_ON(res->state != RESERVATION_ACTIVE);
+	*for_at_most = 0;
+
+	list_for_each_entry_safe(client, next, &res->clients, list) {
+		tsk = client->dispatch(client);
+		if (likely(tsk)) {
+			return tsk;
+		}
+	}
+	return NULL;
+}
+
+static struct task_struct * task_client_dispatch(struct reservation_client *client)
+{
+	struct task_client *tc = container_of(client, struct task_client, client);
+	return tc->task;
+}
+
+void task_client_init(struct task_client *tc, struct task_struct *tsk,
+	struct reservation *res)
+{
+	memset(&tc->client, 0, sizeof(tc->client)); /* zero embedded client */
+	tc->client.dispatch = task_client_dispatch; /* yields tc->task */
+	tc->task = tsk;
+	tc->reservation = res;
+}
+
+static void sup_scheduler_update_at(
+	struct sup_reservation_environment* sup_env,
+	lt_t when)
+{
+	if (sup_env->next_scheduler_update > when)
+		sup_env->next_scheduler_update = when;
+}
+
+static void sup_scheduler_update_after(
+	struct sup_reservation_environment* sup_env,
+	lt_t timeout)
+{
+	sup_scheduler_update_at(sup_env, sup_env->env.current_time + timeout);
+}
+
+static int _sup_queue_depleted(
+	struct sup_reservation_environment* sup_env,
+	struct reservation *res)
+{
+	struct list_head *pos;
+	struct reservation *queued;
+	int passed_earlier = 0; /* saw an entry due no later than res */
+
+	list_for_each(pos, &sup_env->depleted_reservations) {
+		queued = list_entry(pos, struct reservation, list);
+		if (queued->next_replenishment > res->next_replenishment) {
+			list_add(&res->list, pos->prev); /* i.e., before pos */
+			return passed_earlier;
+		} else
+			passed_earlier = 1;
+	}
+
+	list_add_tail(&res->list, &sup_env->depleted_reservations);
+
+	return passed_earlier;
+}
+
+static void sup_queue_depleted(
+	struct sup_reservation_environment* sup_env,
+	struct reservation *res)
+{
+	int passed_earlier = _sup_queue_depleted(sup_env, res);
+
+	/* check for updated replenishment time */
+	if (!passed_earlier)
+		sup_scheduler_update_at(sup_env, res->next_replenishment);
+}
+
+static int _sup_queue_active(
+	struct sup_reservation_environment* sup_env,
+	struct reservation *res)
+{
+	struct list_head *pos;
+	struct reservation *queued;
+	int passed_active = 0; /* saw an ACTIVE entry queued ahead of res */
+
+	list_for_each(pos, &sup_env->active_reservations) {
+		queued = list_entry(pos, struct reservation, list);
+		if (queued->priority > res->priority) {
+			list_add(&res->list, pos->prev); /* i.e., before pos */
+			return passed_active;
+		} else if (queued->state == RESERVATION_ACTIVE)
+			passed_active = 1;
+	}
+
+	list_add_tail(&res->list, &sup_env->active_reservations);
+	return passed_active;
+}
+
+static void sup_queue_active(
+	struct sup_reservation_environment* sup_env,
+	struct reservation *res)
+{
+	int passed_active = _sup_queue_active(sup_env, res);
+
+	/* check for possible preemption */
+	if (res->state == RESERVATION_ACTIVE && !passed_active)
+		sup_env->next_scheduler_update = SUP_RESCHEDULE_NOW;
+}
+
+
+static void sup_queue_reservation(
+	struct sup_reservation_environment* sup_env,
+	struct reservation *res)
+{
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			list_add(&res->list, &sup_env->inactive_reservations);
+			break;
+
+		case RESERVATION_DEPLETED:
+			sup_queue_depleted(sup_env, res);
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+		case RESERVATION_ACTIVE:
+			sup_queue_active(sup_env, res);
+			break;
+	}
+}
+
+void sup_add_new_reservation(
+	struct sup_reservation_environment* sup_env,
+	struct reservation* new_res)
+{
+	new_res->env = &sup_env->env;
+	sup_queue_reservation(sup_env, new_res);
+}
+
+struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
+	unsigned int id)
+{
+	struct reservation *res;
+
+	list_for_each_entry(res, &sup_env->active_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+	list_for_each_entry(res, &sup_env->inactive_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+	list_for_each_entry(res, &sup_env->depleted_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+
+	return NULL;
+}
+
+static void sup_charge_budget(
+	struct sup_reservation_environment* sup_env,
+	lt_t delta)
+{
+	struct list_head *pos, *next;
+	struct reservation *res;
+
+	list_for_each_safe(pos, next, &sup_env->active_reservations) {
+		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
+		res = list_entry(pos, struct reservation, list);
+		if (res->state == RESERVATION_ACTIVE) {
+			res->ops->drain_budget(res, delta);
+			/* stop at the first ACTIVE reservation */
+			break;
+		} else {
+			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
+			res->ops->drain_budget(res, delta);
+		}
+	}
+}
+
+static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
+{
+	struct list_head *pos, *next;
+	struct reservation *res;
+
+	list_for_each_safe(pos, next, &sup_env->depleted_reservations) {
+		res = list_entry(pos, struct reservation, list);
+		if (res->next_replenishment <= sup_env->env.current_time) {
+			res->ops->replenish(res); /* may requeue res */
+		} else {
+			/* list is ordered by increasing replenishment times */
+			break;
+		}
+	}
+
+	/* request a scheduler update at the next replenishment instant */
+	res = list_first_entry_or_null(&sup_env->depleted_reservations,
+		struct reservation, list);
+	if (res)
+		sup_scheduler_update_at(sup_env, res->next_replenishment);
+}
+
+void sup_update_time(
+	struct sup_reservation_environment* sup_env,
+	lt_t now)
+{
+	lt_t delta;
+
+	/* If the time didn't advance, there is nothing to do.
+	 * This check makes it safe to call sup_update_time() potentially
+	 * multiple times (e.g., via different code paths). */
+	if (unlikely(now <= sup_env->env.current_time))
+		return;
+
+	delta = now - sup_env->env.current_time;
+	sup_env->env.current_time = now;
+
+	/* check if future updates are required */
+	if (sup_env->next_scheduler_update <= sup_env->env.current_time)
+		sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
+
+	/* deplete budgets by passage of time */
+	sup_charge_budget(sup_env, delta);
+
+	/* check if any budgets were replenished */
+	sup_replenish_budgets(sup_env);
+}
+
+struct task_struct* sup_dispatch(struct sup_reservation_environment* sup_env)
+{
+	struct reservation *res, *next;
+	struct task_struct *tsk = NULL;
+	lt_t time_slice;
+
+	list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
+		if (res->state == RESERVATION_ACTIVE) {
+			tsk = res->ops->dispatch_client(res, &time_slice);
+			if (likely(tsk)) {
+				if (time_slice)
+				    sup_scheduler_update_after(sup_env, time_slice);
+				sup_scheduler_update_after(sup_env, res->cur_budget);
+				return tsk;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void sup_res_change_state(
+	struct reservation_environment* env,
+	struct reservation *res,
+	reservation_state_t new_state)
+{
+	struct sup_reservation_environment* sup_env;
+
+	sup_env = container_of(env, struct sup_reservation_environment, env);
+
+	TRACE("reservation R%d state %d->%d at %llu\n",
+		res->id, res->state, new_state, env->current_time);
+
+	list_del(&res->list);
+	/* check if we need to reschedule because we lost an active reservation */
+	if (res->state == RESERVATION_ACTIVE && !sup_env->will_schedule)
+		sup_env->next_scheduler_update = SUP_RESCHEDULE_NOW;
+	res->state = new_state;
+	sup_queue_reservation(sup_env, res);
+}
+
+void sup_init(struct sup_reservation_environment* sup_env)
+{
+	memset(sup_env, 0, sizeof(*sup_env)); /* args: (dst, value, length) */
+	/* all three reservation queues start out empty */
+	INIT_LIST_HEAD(&sup_env->active_reservations);
+	INIT_LIST_HEAD(&sup_env->depleted_reservations);
+	INIT_LIST_HEAD(&sup_env->inactive_reservations);
+
+	sup_env->env.change_state = sup_res_change_state;
+	/* no timer or reschedule request is pending initially */
+	sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
+}
-- 
1.8.1.2


From 62c4870dbe84cafc37ff9e3b867352ab2a02703f Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 17 Jul 2014 13:54:11 +0200
Subject: [PATCH 062/119] Add reservation configuration types to rt_param.h

---
 include/litmus/polling_reservations.h |  5 -----
 include/litmus/rt_param.h             | 41 +++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/include/litmus/polling_reservations.h b/include/litmus/polling_reservations.h
index 9958a92..15910ed 100644
--- a/include/litmus/polling_reservations.h
+++ b/include/litmus/polling_reservations.h
@@ -16,11 +16,6 @@ struct polling_reservation {
 void polling_reservation_init(struct polling_reservation *pres, int use_edf_prio,
 	int use_periodic_polling, lt_t budget, lt_t period, lt_t deadline, lt_t offset);
 
-struct lt_interval {
-	lt_t start;
-	lt_t end;
-};
-
 struct table_driven_reservation {
 	/* extend basic reservation */
 	struct reservation res;
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index 060b5d7..b252cc1 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -62,6 +62,7 @@ typedef enum {
 #define LITMUS_MAX_PRIORITY     512
 #define LITMUS_HIGHEST_PRIORITY   1
 #define LITMUS_LOWEST_PRIORITY    (LITMUS_MAX_PRIORITY - 1)
+#define LITMUS_NO_PRIORITY	UINT_MAX
 
 /* Provide generic comparison macros for userspace,
  * in case that we change this later. */
@@ -71,6 +72,46 @@ typedef enum {
 	((p) >= LITMUS_HIGHEST_PRIORITY &&	\
 	 (p) <= LITMUS_LOWEST_PRIORITY)
 
+/* reservation support */
+
+typedef enum {
+	PERIODIC_POLLING,
+	SPORADIC_POLLING,
+	TABLE_DRIVEN,
+} reservation_type_t;
+
+struct lt_interval {
+	lt_t start;
+	lt_t end;
+};
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+struct reservation_config {
+	unsigned int id;
+	unsigned int priority;
+	int  cpu;
+
+	union {
+		struct {
+			lt_t period;
+			lt_t budget;
+			lt_t relative_deadline;
+			lt_t offset;
+		} polling_params;
+
+		struct {
+			lt_t major_cycle_length;
+			unsigned int num_intervals;
+			struct lt_interval __user *intervals;
+		} table_driven_params;
+	};
+};
+
+/* regular sporadic task support */
+
 struct rt_task {
 	lt_t 		exec_cost;
 	lt_t 		period;
-- 
1.8.1.2


From bb1ee06d3b70f0d546cbf829a9ffe3ff7e800e8a Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Sat, 14 Jun 2014 17:16:06 +0200
Subject: [PATCH 063/119] Add partitioned reservation-based scheduler plugin
 (P-RES)

A simple partitioned scheduler that provides a reservation environment
on each core, based on the generic reservations code.  Hierarchical
scheduling is not supported in this version.
---
 litmus/Makefile     |   2 +
 litmus/sched_pres.c | 631 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 633 insertions(+)
 create mode 100644 litmus/sched_pres.c

diff --git a/litmus/Makefile b/litmus/Makefile
index e3439c8..05021f5 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -34,3 +34,5 @@ obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
 obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
 
 obj-y += reservation.o polling_reservations.o
+
+obj-y += sched_pres.o
\ No newline at end of file
diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
new file mode 100644
index 0000000..6779ffd
--- /dev/null
+++ b/litmus/sched_pres.c
@@ -0,0 +1,631 @@
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+#include <litmus/debug_trace.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/budget.h>
+#include <litmus/litmus_proc.h>
+
+#include <litmus/reservation.h>
+#include <litmus/polling_reservations.h>
+
+struct pres_task_state {
+	struct task_client res_info;
+	int cpu;
+};
+
+struct pres_cpu_state {
+	raw_spinlock_t lock;
+
+	struct sup_reservation_environment sup_env;
+	struct hrtimer timer;
+
+	int cpu;
+	struct task_struct* scheduled;
+};
+
+static DEFINE_PER_CPU(struct pres_cpu_state, pres_cpu_state);
+
+#define cpu_state_for(cpu_id)	(&per_cpu(pres_cpu_state, cpu_id))
+#define local_cpu_state()	(&__get_cpu_var(pres_cpu_state))
+
+static struct pres_task_state* get_pres_state(struct task_struct *tsk)
+{
+	return (struct pres_task_state*) tsk_rt(tsk)->plugin_state;
+}
+
+static void task_departs(struct task_struct *tsk, int job_complete)
+{
+	struct pres_task_state* state = get_pres_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	res    = state->res_info.reservation;
+	client = &state->res_info.client;
+
+	res->ops->client_departs(res, client, job_complete);
+}
+
+static void task_arrives(struct task_struct *tsk)
+{
+	struct pres_task_state* state = get_pres_state(tsk);
+	struct reservation* res;
+	struct reservation_client *client;
+
+	res    = state->res_info.reservation;
+	client = &state->res_info.client;
+
+	res->ops->client_arrives(res, client);
+}
+
+static void pres_update_timer(struct pres_cpu_state *state)
+{
+	lt_t update, now;
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+	if (update <= now) {
+		litmus_reschedule(state->cpu);
+	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
+		/* reprogram only if not already set correctly */
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
+			TRACE("canceling timer...\n");
+			hrtimer_cancel(&state->timer);
+			TRACE("setting scheduler timer for %llu\n", update);
+			hrtimer_start(&state->timer, ns_to_ktime(update),
+				HRTIMER_MODE_ABS_PINNED);
+		}
+	}
+}
+
+static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
+{
+	unsigned long flags;
+	enum hrtimer_restart restart = HRTIMER_NORESTART;
+	struct pres_cpu_state *state = local_cpu_state();
+	lt_t update, now;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	update = state->sup_env.next_scheduler_update;
+	now = state->sup_env.env.current_time;
+
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu\n", now, update);
+
+	if (update <= now) {
+		litmus_reschedule_local();
+	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
+		hrtimer_set_expires(timer, ns_to_ktime(update));
+		restart = HRTIMER_RESTART;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	return restart;
+}
+
+static struct task_struct* pres_schedule(struct task_struct * prev)
+{
+	/* next == NULL means "schedule background work". */
+	struct pres_cpu_state *state = local_cpu_state();
+
+	raw_spin_lock(&state->lock);
+
+	BUG_ON(state->scheduled && state->scheduled != prev);
+	BUG_ON(state->scheduled && !is_realtime(prev));
+
+	/* update time */
+	state->sup_env.will_schedule = true;
+	sup_update_time(&state->sup_env, litmus_clock());
+
+	/* remove task from reservation if it blocks */
+	if (is_realtime(prev) && !is_running(prev))
+		task_departs(prev, is_completed(prev));
+
+	/* figure out what to schedule next */
+	state->scheduled = sup_dispatch(&state->sup_env);
+
+	/* program scheduler timer */
+	state->sup_env.will_schedule = false;
+	pres_update_timer(state);
+
+	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
+	sched_state_task_picked();
+
+	raw_spin_unlock(&state->lock);
+
+	if (prev != state->scheduled && is_realtime(prev))
+		TRACE_TASK(prev, "descheduled.\n");
+	if (state->scheduled)
+		TRACE_TASK(state->scheduled, "scheduled.\n");
+
+	return state->scheduled;
+}
+
+static void resume_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now;
+	if (is_sporadic(tsk)) {
+		/* If this sporadic task was gone for a "long" time and woke up past
+		 * its deadline, then give it a new budget by triggering a job
+		 * release. This is purely cosmetic and has no effect on the
+		 * P-RES scheduler. */
+
+		now = litmus_clock();
+		if (is_tardy(tsk, now))
+			release_at(tsk, now);
+	}
+}
+
+/* Called when the state of tsk changes back to TASK_RUNNING.
+ * We need to requeue the task.
+ */
+static void pres_task_resume(struct task_struct  *tsk)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "wake_up at %llu\n", litmus_clock());
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	/* Requeue if self-suspension was already processed. */
+	if (state->scheduled != tsk)
+	{
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_arrives(tsk);
+		pres_update_timer(state);
+	}
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	resume_legacy_task_model_updates(tsk);
+}
+
+/* syscall backend for job completions */
+static long pres_complete_job(void)
+{
+	ktime_t next_release;
+	long err;
+
+	TRACE_CUR("pres_complete_job at %llu\n", litmus_clock());
+
+	tsk_rt(current)->completed = 1;
+	prepare_for_next_period(current);
+	next_release = ns_to_ktime(get_release(current));
+	set_current_state(TASK_INTERRUPTIBLE);
+	err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+
+	TRACE_CUR("pres_complete_job returns at %llu\n", litmus_clock());
+	return err;
+}
+
+static long pres_admit_task(struct task_struct *tsk)
+{
+	long err = -ESRCH;
+	unsigned long flags;
+	struct reservation *res;
+	struct pres_cpu_state *state;
+	struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
+
+	if (!tinfo)
+		return -ENOMEM;
+
+	preempt_disable();
+
+	state = cpu_state_for(task_cpu(tsk));
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	res = sup_find_by_id(&state->sup_env, tsk_rt(tsk)->task_params.cpu);
+
+	/* found the appropriate reservation (or vCPU) */
+	if (res) {
+		task_client_init(&tinfo->res_info, tsk, res);
+		tinfo->cpu = task_cpu(tsk);
+		tsk_rt(tsk)->plugin_state = tinfo;
+		err = 0;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	preempt_enable();
+
+	if (err)
+		kfree(tinfo);
+
+	return err;
+}
+
+static void task_new_legacy_task_model_updates(struct task_struct *tsk)
+{
+	lt_t now = litmus_clock();
+
+	/* the first job exists starting as of right now */
+	release_at(tsk, now);
+}
+
+static void pres_task_new(struct task_struct *tsk, int on_runqueue,
+			  int is_running)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
+		   litmus_clock(), on_runqueue, is_running);
+
+	/* acquire the lock protecting the state and disable interrupts */
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (is_running) {
+		state->scheduled = tsk;
+		/* make sure this task should actually be running */
+		litmus_reschedule_local();
+	}
+
+	if (on_runqueue || is_running) {
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_arrives(tsk);
+		pres_update_timer(state);
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	task_new_legacy_task_model_updates(tsk);
+}
+
+static void pres_task_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+	struct pres_task_state* tinfo = get_pres_state(tsk);
+	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (state->scheduled == tsk)
+		state->scheduled = NULL;
+
+	/* remove from queues */
+	if (is_running(tsk)) {
+		sup_update_time(&state->sup_env, litmus_clock());
+		task_departs(tsk, 0);
+		pres_update_timer(state);
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	kfree(tsk_rt(tsk)->plugin_state);
+	tsk_rt(tsk)->plugin_state = NULL;
+}
+
+static long create_polling_reservation(
+	int res_type,
+	struct reservation_config *config)
+{
+	struct pres_cpu_state *state;
+	struct reservation* res;
+	struct polling_reservation *pres;
+	unsigned long flags;
+	int use_edf  = config->priority == LITMUS_NO_PRIORITY;
+	int periodic =  res_type == PERIODIC_POLLING;
+	long err = -EINVAL;
+
+	if (config->polling_params.budget >
+	    config->polling_params.period) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "budget > period\n", config->id);
+		return -EINVAL;
+	}
+	if (config->polling_params.budget >
+	    config->polling_params.relative_deadline
+	    && config->polling_params.relative_deadline) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "budget > deadline\n", config->id);
+		return -EINVAL;
+	}
+	if (config->polling_params.offset >
+	    config->polling_params.period) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "offset > period\n", config->id);
+		return -EINVAL;
+	}
+
+	/* Allocate before we grab a spin lock.
+	 * Todo: would be nice to use a core-local allocation.
+	 */
+	pres = kzalloc(sizeof(*pres), GFP_KERNEL);
+	if (!pres)
+		return -ENOMEM;
+
+	state = cpu_state_for(config->cpu);
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	res = sup_find_by_id(&state->sup_env, config->id);
+	if (!res) {
+		polling_reservation_init(pres, use_edf, periodic,
+			config->polling_params.budget,
+			config->polling_params.period,
+			config->polling_params.relative_deadline,
+			config->polling_params.offset);
+		pres->res.id = config->id;
+		if (!use_edf)
+			pres->res.priority = config->priority;
+		sup_add_new_reservation(&state->sup_env, &pres->res);
+		err = config->id;
+	} else {
+		err = -EEXIST;
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+	if (err < 0)
+		kfree(pres);
+
+	return err;
+}
+
+#define MAX_INTERVALS 1024
+
+static long create_table_driven_reservation(
+	struct reservation_config *config)
+{
+	struct pres_cpu_state *state;
+	struct reservation* res;
+	struct table_driven_reservation *td_res = NULL;
+	struct lt_interval *slots = NULL;
+	size_t slots_size;
+	unsigned int i, num_slots;
+	unsigned long flags;
+	long err = -EINVAL;
+
+	if (!config->table_driven_params.num_intervals) {
+		printk(KERN_ERR "invalid table-driven reservation (%u): "
+		       "no intervals\n", config->id);
+		return -EINVAL;
+	}
+
+	if (config->table_driven_params.num_intervals > MAX_INTERVALS) {
+		printk(KERN_ERR "invalid table-driven reservation (%u): "
+		       "too many intervals (max: %d)\n", config->id, MAX_INTERVALS);
+		return -EINVAL;
+	}
+
+	num_slots = config->table_driven_params.num_intervals;
+	slots_size = sizeof(slots[0]) * num_slots;
+	slots = kzalloc(slots_size, GFP_KERNEL);
+	if (!slots)
+		return -ENOMEM;
+
+	td_res = kzalloc(sizeof(*td_res), GFP_KERNEL);
+	if (!td_res)
+		err = -ENOMEM;
+	else /* copy_from_user() returns bytes NOT copied, not an errno */
+		err = copy_from_user(slots,
+			config->table_driven_params.intervals,
+			slots_size) ? -EFAULT : 0;
+
+	if (!err) {
+		/* sanity checks */
+		for (i = 0; !err && i < num_slots; i++)
+			if (slots[i].end <= slots[i].start) {
+				printk(KERN_ERR
+				       "invalid table-driven reservation (%u): "
+				       "invalid interval %u => [%llu, %llu]\n",
+				       config->id, i,
+				       slots[i].start, slots[i].end);
+				err = -EINVAL;
+			}
+
+		for (i = 0; !err && i + 1 < num_slots; i++)
+			if (slots[i + 1].start <= slots[i].end) {
+				printk(KERN_ERR
+				       "invalid table-driven reservation (%u): "
+				       "overlapping intervals %u, %u\n",
+				       config->id, i, i + 1);
+				err = -EINVAL;
+			}
+
+		if (slots[num_slots - 1].end >
+			config->table_driven_params.major_cycle_length) {
+			printk(KERN_ERR
+				"invalid table-driven reservation (%u): last "
+				"interval ends past major cycle %llu > %llu\n",
+				config->id,
+				slots[num_slots - 1].end,
+				config->table_driven_params.major_cycle_length);
+			err = -EINVAL;
+		}
+	}
+
+	if (!err) {
+		state = cpu_state_for(config->cpu);
+		raw_spin_lock_irqsave(&state->lock, flags);
+
+		res = sup_find_by_id(&state->sup_env, config->id);
+		if (!res) {
+			table_driven_reservation_init(td_res,
+				config->table_driven_params.major_cycle_length,
+				slots, num_slots);
+			td_res->res.id = config->id;
+			td_res->res.priority = config->priority;
+			sup_add_new_reservation(&state->sup_env, &td_res->res);
+			err = config->id;
+		} else {
+			err = -EEXIST;
+		}
+
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+	}
+
+	if (err < 0) {
+		kfree(slots);
+		kfree(td_res);
+	}
+
+	return err;
+}
+
+static long pres_reservation_create(int res_type, void* __user _config)
+{
+	long ret = -EINVAL;
+	struct reservation_config config;
+
+	TRACE("Attempt to create reservation (%d)\n", res_type);
+
+	if (copy_from_user(&config, _config, sizeof(config)))
+		return -EFAULT;
+
+	if (config.cpu < 0 || !cpu_online(config.cpu)) {
+		printk(KERN_ERR "invalid polling reservation (%u): "
+		       "CPU %d offline\n", config.id, config.cpu);
+		return -EINVAL;
+	}
+
+	switch (res_type) {
+		case PERIODIC_POLLING:
+		case SPORADIC_POLLING:
+			ret = create_polling_reservation(res_type, &config);
+			break;
+
+		case TABLE_DRIVEN:
+			ret = create_table_driven_reservation(&config);
+			break;
+
+		default:
+			return -EINVAL;
+	};
+
+	return ret;
+}
+
+static struct domain_proc_info pres_domain_proc_info;
+
+static long pres_get_domain_proc_info(struct domain_proc_info **ret)
+{
+	*ret = &pres_domain_proc_info;
+	return 0;
+}
+
+static void pres_setup_domain_proc(void)
+{
+	int i, cpu;
+	int num_rt_cpus = num_online_cpus();
+	struct cd_mapping *cpu_map, *domain_map;
+
+	/* memset() takes (dst, byte, len): zero the whole descriptor. */
+	memset(&pres_domain_proc_info, 0, sizeof(pres_domain_proc_info));
+	init_domain_proc_info(&pres_domain_proc_info, num_rt_cpus, num_rt_cpus);
+	pres_domain_proc_info.num_cpus = num_rt_cpus;
+	pres_domain_proc_info.num_domains = num_rt_cpus;
+
+	i = 0;
+	for_each_online_cpu(cpu) {
+		cpu_map = &pres_domain_proc_info.cpu_to_domains[i];
+		domain_map = &pres_domain_proc_info.domain_to_cpus[i];
+
+		cpu_map->id = cpu;
+		domain_map->id = i;
+		cpumask_set_cpu(i, cpu_map->mask);
+		cpumask_set_cpu(cpu, domain_map->mask);
+		++i;
+	}
+}
+
+static long pres_activate_plugin(void)
+{
+	int cpu;
+	struct pres_cpu_state *state;
+
+	for_each_online_cpu(cpu) {
+		TRACE("Initializing CPU%d...\n", cpu);
+
+		state = cpu_state_for(cpu);
+
+		raw_spin_lock_init(&state->lock);
+		state->cpu = cpu;
+		state->scheduled = NULL;
+
+		sup_init(&state->sup_env);
+
+		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		state->timer.function = on_scheduling_timer;
+	}
+
+	pres_setup_domain_proc();
+
+	return 0;
+}
+
+static long pres_deactivate_plugin(void)
+{
+	int cpu;
+	struct pres_cpu_state *state;
+	struct reservation *res;
+
+	for_each_online_cpu(cpu) {
+		state = cpu_state_for(cpu);
+		raw_spin_lock(&state->lock);
+
+		hrtimer_cancel(&state->timer);
+
+		/* Delete all reservations --- assumes struct reservation
+		 * is prefix of containing struct. */
+
+		while (!list_empty(&state->sup_env.active_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.active_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+
+		while (!list_empty(&state->sup_env.inactive_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.inactive_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+
+		while (!list_empty(&state->sup_env.depleted_reservations)) {
+			res = list_first_entry(
+				&state->sup_env.depleted_reservations,
+			        struct reservation, list);
+			list_del(&res->list);
+			kfree(res);
+		}
+
+		raw_spin_unlock(&state->lock);
+	}
+
+	destroy_domain_proc_info(&pres_domain_proc_info);
+	return 0;
+}
+
+static struct sched_plugin pres_plugin = {
+	.plugin_name		= "P-RES",
+	.schedule		= pres_schedule,
+	.task_wake_up		= pres_task_resume,
+	.admit_task		= pres_admit_task,
+	.task_new		= pres_task_new,
+	.task_exit		= pres_task_exit,
+	.complete_job           = pres_complete_job,
+	.get_domain_proc_info   = pres_get_domain_proc_info,
+	.activate_plugin	= pres_activate_plugin,
+	.deactivate_plugin      = pres_deactivate_plugin,
+	.reservation_create     = pres_reservation_create,
+};
+
+static int __init init_pres(void)
+{
+	return register_sched_plugin(&pres_plugin);
+}
+
+module_init(init_pres);
+
-- 
1.8.1.2


From 90add7e63ec95fcf81e311ffc1c036382ac28347 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Thu, 4 Sep 2014 15:30:12 -0400
Subject: [PATCH 064/119] Fix scheduler invocation after draining budget

---
 litmus/reservation.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/litmus/reservation.c b/litmus/reservation.c
index bc32b2e..cd51b90 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -180,18 +180,31 @@ static void sup_charge_budget(
 {
 	struct list_head *pos, *next;
 	struct reservation *res;
+	
+	int encountered_active = 0;
 
 	list_for_each_safe(pos, next, &sup_env->active_reservations) {
 		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
 			res->ops->drain_budget(res, delta);
-			/* stop at the first ACTIVE reservation */
-			break;
+			encountered_active = 1;
 		} else {
 			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
 			res->ops->drain_budget(res, delta);
 		}
+		if (res->state == RESERVATION_ACTIVE ||
+			res->state == RESERVATION_ACTIVE_IDLE)
+		{
+			/* make sure scheduler is invoked when this reservation expires
+			 * its remaining budget */
+			 TRACE("requesting scheduler update for reservation %u in %llu nanoseconds\n",
+				res->id, res->cur_budget);
+			 sup_scheduler_update_after(sup_env, res->cur_budget);
+		}
+		if (encountered_active)
+			/* stop at the first ACTIVE reservation */
+			break;
 	}
 }
 
@@ -226,6 +239,7 @@ void sup_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
+	TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= sup_env->env.current_time))
 		return;
 
-- 
1.8.1.2


From e55ab67a060ded6f2c47e5ede00a39176bacbab3 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Fri, 5 Sep 2014 01:39:41 +0200
Subject: [PATCH 065/119] Switch table-driven reservations to use table-driven
 budget

Instead of counting how much budget has been consumed, determine
budget based on actual time slots.
---
 include/litmus/polling_reservations.h |  1 +
 litmus/polling_reservations.c         | 54 ++++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/include/litmus/polling_reservations.h b/include/litmus/polling_reservations.h
index 15910ed..fa22181 100644
--- a/include/litmus/polling_reservations.h
+++ b/include/litmus/polling_reservations.h
@@ -24,6 +24,7 @@ struct table_driven_reservation {
 	unsigned int next_interval;
 	unsigned int num_intervals;
 	struct lt_interval *intervals;
+	struct lt_interval *cur_interval;
 };
 
 void table_driven_reservation_init(struct table_driven_reservation *tdres,
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 08034c3..e6c57f5 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -366,15 +366,29 @@ static lt_t td_interval_length(struct lt_interval *ival)
 	return ival->end - ival->start;
 }
 
+static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
+{
+	lt_t now = tdres->res.env->current_time;
+	lt_t end = td_cur_major_cycle_start(tdres) + tdres->cur_interval->end;
+	TRACE("td_remaining(%u): start=%llu now=%llu end=%llu\n",
+		tdres->res.id,
+		td_cur_major_cycle_start(tdres) + tdres->cur_interval->start,
+		now, end);
+	if (now >=  end)
+		return 0;
+	else
+		return end - now;
+}
+
 static void td_replenish(
-	struct reservation *res
-)
+	struct reservation *res)
 {
 	struct table_driven_reservation *tdres =
 		container_of(res, struct table_driven_reservation, res);
 
 	/* replenish budget */
-	res->cur_budget = td_interval_length(tdres->intervals + tdres->next_interval);
+	tdres->cur_interval = tdres->intervals + tdres->next_interval;
+	res->cur_budget = td_interval_length(tdres->cur_interval);
 
 	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
 	if (tdres->next_interval)
@@ -404,12 +418,43 @@ static void td_replenish(
 	}
 }
 
+static void td_drain_budget(
+		struct reservation *res,
+		lt_t how_much)
+{
+	struct table_driven_reservation *tdres =
+		container_of(res, struct table_driven_reservation, res);
+
+	/* Table-driven scheduling: instead of tracking the budget, we compute
+	 * how much time is left in this allocation interval. */
+
+	switch (res->state) {
+		case RESERVATION_DEPLETED:
+		case RESERVATION_INACTIVE:
+			BUG();
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+		case RESERVATION_ACTIVE:
+			res->cur_budget = td_time_remaining_until_end(tdres);
+			TRACE("td_drain_budget(%u): drained to budget=%llu\n",
+				res->id, res->cur_budget);
+			if (!res->cur_budget) {
+				res->env->change_state(res->env, res,
+					RESERVATION_DEPLETED);
+			} /* else: stay in current state */
+			break;
+	}
+}
+
+
+
 static struct reservation_ops td_ops = {
 	.dispatch_client = default_dispatch_client,
 	.client_arrives = td_client_arrives,
 	.client_departs = td_client_departs,
 	.replenish = td_replenish,
-	.drain_budget = common_drain_budget,
+	.drain_budget = td_drain_budget,
 };
 
 void table_driven_reservation_init(
@@ -431,6 +476,7 @@ void table_driven_reservation_init(
 	reservation_init(&tdres->res);
 	tdres->major_cycle = major_cycle;
 	tdres->intervals = intervals;
+	tdres->cur_interval = intervals;
 	tdres->num_intervals = num_intervals;
 	tdres->res.ops = &td_ops;
 }
-- 
1.8.1.2


From fcd5e594a26012c7457197fd111e58820bc34741 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 8 Sep 2014 18:19:43 +0200
Subject: [PATCH 066/119] P-RES: ensure scheduler timer fires on _local_ CPU
 only

Accidentally setting up the timer on the wrong CPU when a thread
resumes is problematic: it can (potentially) lead to deadlock and to
missed scheduling events.
---
 litmus/sched_pres.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 50 insertions(+), 5 deletions(-)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 6779ffd..60afde6 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -65,14 +65,22 @@ static void task_arrives(struct task_struct *tsk)
 
 static void pres_update_timer(struct pres_cpu_state *state)
 {
+	int local;
 	lt_t update, now;
 
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
+
+	/* Be sure we're actually running on the right core,
+	 * as pres_update_timer() is also called from pres_task_resume(),
+	 * which might be called on any CPU when a thread resumes.
+	 */
+	local = local_cpu_state() == state;
+
 	if (update <= now) {
 		litmus_reschedule(state->cpu);
-	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
-		/* reprogram only if not already set correctly */
+	} else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Reprogram only if not already set correctly. */
 		if (!hrtimer_active(&state->timer) ||
 		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
 			TRACE("canceling timer...\n");
@@ -81,6 +89,25 @@ static void pres_update_timer(struct pres_cpu_state *state)
 			hrtimer_start(&state->timer, ns_to_ktime(update),
 				HRTIMER_MODE_ABS_PINNED);
 		}
+	} else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
+		/* Poke remote core only if timer needs to be set earlier than
+		 * it is currently set.
+		 */
+		TRACE("pres_update_timer for remote CPU %d (update=%llu, "
+		      "active:%d, set:%llu)\n",
+			state->cpu,
+			update,
+			hrtimer_active(&state->timer),
+			ktime_to_ns(hrtimer_get_expires(&state->timer)));
+		if (!hrtimer_active(&state->timer) ||
+		    ktime_to_ns(hrtimer_get_expires(&state->timer)) > update) {
+			TRACE("poking CPU %d so that it can update its "
+			       "scheduling timer (active:%d, set:%llu)\n",
+			       state->cpu,
+			       hrtimer_active(&state->timer),
+			       ktime_to_ns(hrtimer_get_expires(&state->timer)));
+			litmus_reschedule(state->cpu);
+		}
 	}
 }
 
@@ -88,16 +115,27 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
 	enum hrtimer_restart restart = HRTIMER_NORESTART;
-	struct pres_cpu_state *state = local_cpu_state();
+	struct pres_cpu_state *state;
 	lt_t update, now;
 
+	state = container_of(timer, struct pres_cpu_state, timer);
+
+	/* The scheduling timer should only fire on the local CPU, because
+	 * otherwise deadlocks via timer_cancel() are possible.
+	 * Note: this does not interfere with dedicated interrupt handling, as
+	 * even under dedicated interrupt handling scheduling timers for
+	 * budget enforcement must occur locally on each CPU.
+	 */
+	BUG_ON(state->cpu != raw_smp_processor_id());
+
 	raw_spin_lock_irqsave(&state->lock, flags);
 	sup_update_time(&state->sup_env, litmus_clock());
 
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
 
-	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu\n", now, update);
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
+		now, update, state->cpu);
 
 	if (update <= now) {
 		litmus_reschedule_local();
@@ -173,12 +211,15 @@ static void pres_task_resume(struct task_struct  *tsk)
 	struct pres_task_state* tinfo = get_pres_state(tsk);
 	struct pres_cpu_state *state = cpu_state_for(tinfo->cpu);
 
-	TRACE_TASK(tsk, "wake_up at %llu\n", litmus_clock());
+	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
 
 	raw_spin_lock_irqsave(&state->lock, flags);
 	/* Requeue if self-suspension was already processed. */
 	if (state->scheduled != tsk)
 	{
+		/* Assumption: litmus_clock() is synchronized across cores,
+		 * since we might not actually be executing on tinfo->cpu
+		 * at the moment. */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
 		pres_update_timer(state);
@@ -270,6 +311,8 @@ static void pres_task_new(struct task_struct *tsk, int on_runqueue,
 	}
 
 	if (on_runqueue || is_running) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
 		pres_update_timer(state);
@@ -293,6 +336,8 @@ static void pres_task_exit(struct task_struct *tsk)
 
 	/* remove from queues */
 	if (is_running(tsk)) {
+		/* Assumption: litmus_clock() is synchronized across cores
+		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_departs(tsk, 0);
 		pres_update_timer(state);
-- 
1.8.1.2


From aca9db004d887d28621a0fc30818d7ebf77fc9bf Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 8 Sep 2014 18:24:18 +0200
Subject: [PATCH 067/119] Table-driven replenishments should depend on the
 current time

Make sure we don't accidentally bleed past the current reservation
scheduling slot (due to jitter) by determining the remaining budget
precisely when replenishing the reservation budget.
---
 litmus/polling_reservations.c | 45 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index e6c57f5..5c9b183 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -361,11 +361,6 @@ static void td_client_departs(
 	}
 }
 
-static lt_t td_interval_length(struct lt_interval *ival)
-{
-	return ival->end - ival->start;
-}
-
 static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
 {
 	lt_t now = tdres->res.env->current_time;
@@ -388,7 +383,9 @@ static void td_replenish(
 
 	/* replenish budget */
 	tdres->cur_interval = tdres->intervals + tdres->next_interval;
-	res->cur_budget = td_interval_length(tdres->cur_interval);
+	res->cur_budget = td_time_remaining_until_end(tdres);
+	TRACE("td_replenish(%u): %s budget=%llu\n", res->id,
+		res->cur_budget ? "" : "WARNING", res->cur_budget);
 
 	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
 	if (tdres->next_interval)
@@ -447,10 +444,44 @@ static void td_drain_budget(
 	}
 }
 
+static struct task_struct* td_dispatch_client(
+	struct reservation *res,
+	lt_t *for_at_most)
+{
+	struct task_struct *t;
+	struct table_driven_reservation *tdres =
+		container_of(res, struct table_driven_reservation, res);
 
+	/* usual logic for selecting a client */
+	t = default_dispatch_client(res, for_at_most);
+
+	TRACE_TASK(t, "td_dispatch_client(%u): selected, budget=%llu\n",
+		res->id, res->cur_budget);
+
+	/* check how much budget we have left in this time slot */
+	res->cur_budget = td_time_remaining_until_end(tdres);
+
+	TRACE_TASK(t, "td_dispatch_client(%u): updated to budget=%llu next=%d\n",
+		res->id, res->cur_budget, tdres->next_interval);
+
+	if (unlikely(!res->cur_budget)) {
+		/* Unlikely case: if we ran out of budget, the user configured
+		 * a broken scheduling table (overlapping table slots).
+		 * Not much we can do about this, but we can't dispatch a job
+		 * now without causing overload. So let's register this reservation
+		 * as depleted and wait for the next allocation. */
+		TRACE("td_dispatch_client(%u): budget unexpectedly depleted "
+			"(check scheduling table for unintended overlap)\n",
+			res->id);
+		res->env->change_state(res->env, res,
+			RESERVATION_DEPLETED);
+		return NULL;
+	} else
+		return t;
+}
 
 static struct reservation_ops td_ops = {
-	.dispatch_client = default_dispatch_client,
+	.dispatch_client = td_dispatch_client,
 	.client_arrives = td_client_arrives,
 	.client_departs = td_client_departs,
 	.replenish = td_replenish,
-- 
1.8.1.2


From 563999251e34d52bfbc47889cabd763714d020e1 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Thu, 11 Sep 2014 12:53:42 +0200
Subject: [PATCH 068/119] Move 'reservation' field from task_client to generic
 reservation_client

This makes it a lot easier to write generic code for thread arrival /
thread departure in plugins with multiple types of reservation
clients.
---
 include/litmus/reservation.h | 5 +++--
 litmus/reservation.c         | 4 ++--
 litmus/sched_pres.c          | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index d8d6ce3..9c23e27 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -33,9 +33,11 @@ typedef struct task_struct * (*dispatch_t)  (
 );
 
 /* Something that can be managed in a reservation and that can yield
- * a process for dispatching. */
+ * a process for dispatching. Contains a pointer to the reservation
+ * to which it "belongs". */
 struct reservation_client {
 	struct list_head list;
+	struct reservation* reservation;
 	dispatch_t dispatch;
 };
 
@@ -135,7 +137,6 @@ struct task_struct* default_dispatch_client(
 /* "connector" reservation client to hook up tasks with reservations */
 struct task_client {
 	struct reservation_client client;
-	struct reservation* reservation;
 	struct task_struct *task;
 };
 
diff --git a/litmus/reservation.c b/litmus/reservation.c
index cd51b90..447fc5b 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -40,8 +40,8 @@ void task_client_init(struct task_client *tc, struct task_struct *tsk,
 {
 	memset(&tc->client, sizeof(tc->client), 0);
 	tc->client.dispatch = task_client_dispatch;
+	tc->client.reservation = res;
 	tc->task = tsk;
-	tc->reservation = res;
 }
 
 static void sup_scheduler_update_at(
@@ -180,7 +180,7 @@ static void sup_charge_budget(
 {
 	struct list_head *pos, *next;
 	struct reservation *res;
-	
+
 	int encountered_active = 0;
 
 	list_for_each_safe(pos, next, &sup_env->active_reservations) {
diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 60afde6..2c777ec 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -45,7 +45,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.reservation;
+	res    = state->res_info.client.reservation;
 	client = &state->res_info.client;
 
 	res->ops->client_departs(res, client, job_complete);
@@ -57,7 +57,7 @@ static void task_arrives(struct task_struct *tsk)
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.reservation;
+	res    = state->res_info.client.reservation;
 	client = &state->res_info.client;
 
 	res->ops->client_arrives(res, client);
-- 
1.8.1.2


From 4841253863ef57e0b91d169b0080ce079d54fe6f Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Fri, 12 Sep 2014 13:31:08 +0200
Subject: [PATCH 069/119] P-RES: fix rare deadlock via hrtimer_start()

There's a rare condition under which the current call to hrtimer_start()
in pres_update_timer() may result in deadlock.

pres_update_timer() // holds runqueue lock and state->lock
-> hrtimer_start()
  -> raise_softirq_irqoff()
    -> wakeup_softirqd()
      ->  wake_up_process()
        -> acquires runqueue lock

To avoid this, we need to call __hrtimer_start_range_ns() with the
'wakeup' flag set to zero.

While at it, also drop state->lock before calling into the hrtimer
code, to avoid making the scheduler critical section longer than
necessary.
---
 litmus/sched_pres.c | 51 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 19 deletions(-)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 2c777ec..13a47a8 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -63,7 +63,8 @@ static void task_arrives(struct task_struct *tsk)
 	res->ops->client_arrives(res, client);
 }
 
-static void pres_update_timer(struct pres_cpu_state *state)
+/* NOTE: drops state->lock */
+static void pres_update_timer_and_unlock(struct pres_cpu_state *state)
 {
 	int local;
 	lt_t update, now;
@@ -77,6 +78,10 @@ static void pres_update_timer(struct pres_cpu_state *state)
 	 */
 	local = local_cpu_state() == state;
 
+	/* Must drop state lock before calling into hrtimer_start(), which
+	 * may raise a softirq, which in turn may wake ksoftirqd. */
+	raw_spin_unlock(&state->lock);
+
 	if (update <= now) {
 		litmus_reschedule(state->cpu);
 	} else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
@@ -86,8 +91,13 @@ static void pres_update_timer(struct pres_cpu_state *state)
 			TRACE("canceling timer...\n");
 			hrtimer_cancel(&state->timer);
 			TRACE("setting scheduler timer for %llu\n", update);
-			hrtimer_start(&state->timer, ns_to_ktime(update),
-				HRTIMER_MODE_ABS_PINNED);
+			/* We cannot use hrtimer_start() here because the
+			 * wakeup flag must be set to zero. */
+			__hrtimer_start_range_ns(&state->timer,
+					ns_to_ktime(update),
+					0 /* timer coalescing slack */,
+					HRTIMER_MODE_ABS_PINNED,
+					0 /* wakeup */);
 		}
 	} else if (unlikely(!local && update != SUP_NO_SCHEDULER_UPDATE)) {
 		/* Poke remote core only if timer needs to be set earlier than
@@ -170,14 +180,13 @@ static struct task_struct* pres_schedule(struct task_struct * prev)
 	/* figure out what to schedule next */
 	state->scheduled = sup_dispatch(&state->sup_env);
 
-	/* program scheduler timer */
-	state->sup_env.will_schedule = false;
-	pres_update_timer(state);
-
 	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
 	sched_state_task_picked();
 
-	raw_spin_unlock(&state->lock);
+	/* program scheduler timer */
+	state->sup_env.will_schedule = false;
+	/* NOTE: drops state->lock */
+	pres_update_timer_and_unlock(state);
 
 	if (prev != state->scheduled && is_realtime(prev))
 		TRACE_TASK(prev, "descheduled.\n");
@@ -222,9 +231,11 @@ static void pres_task_resume(struct task_struct  *tsk)
 		 * at the moment. */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
-		pres_update_timer(state);
-	}
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+		/* NOTE: drops state->lock */
+		pres_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
 
 	resume_legacy_task_model_updates(tsk);
 }
@@ -315,10 +326,11 @@ static void pres_task_new(struct task_struct *tsk, int on_runqueue,
 		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
-		pres_update_timer(state);
-	}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+		/* NOTE: drops state->lock */
+		pres_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
 
 	task_new_legacy_task_model_updates(tsk);
 }
@@ -340,10 +352,11 @@ static void pres_task_exit(struct task_struct *tsk)
 		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_departs(tsk, 0);
-		pres_update_timer(state);
-	}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+		/* NOTE: drops state->lock */
+		pres_update_timer_and_unlock(state);
+		local_irq_restore(flags);
+	} else
+		raw_spin_unlock_irqrestore(&state->lock, flags);
 
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
-- 
1.8.1.2


From a56ffe502e0f4edc7be9b59533455fdc3c9f86d3 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Mon, 15 Sep 2014 08:13:35 +0200
Subject: [PATCH 070/119] Reservations: keep track of consumed budget

This can be a useful stat for userspace.
---
 include/litmus/reservation.h  | 4 ++++
 litmus/polling_reservations.c | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index 9c23e27..4eecd3f 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -117,6 +117,10 @@ struct reservation {
 	lt_t cur_budget;
 	lt_t next_replenishment;
 
+	/* budget stats */
+	lt_t budget_consumed; /* how much budget consumed in this allocation cycle? */
+	lt_t budget_consumed_total;
+
 	/* interaction with framework */
 	struct reservation_environment *env;
 	struct reservation_ops *ops;
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 5c9b183..2c481b4 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -84,6 +84,7 @@ static void periodic_polling_on_replenishment(
 	/* replenish budget */
 	res->cur_budget = pres->max_budget;
 	res->next_replenishment += pres->period;
+	res->budget_consumed = 0;
 
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
@@ -129,6 +130,9 @@ static void common_drain_budget(
 	else
 		res->cur_budget -= how_much;
 
+	res->budget_consumed += how_much;
+	res->budget_consumed_total += how_much;
+
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
@@ -384,6 +388,7 @@ static void td_replenish(
 	/* replenish budget */
 	tdres->cur_interval = tdres->intervals + tdres->next_interval;
 	res->cur_budget = td_time_remaining_until_end(tdres);
+	res->budget_consumed = 0;
 	TRACE("td_replenish(%u): %s budget=%llu\n", res->id,
 		res->cur_budget ? "" : "WARNING", res->cur_budget);
 
@@ -422,6 +427,9 @@ static void td_drain_budget(
 	struct table_driven_reservation *tdres =
 		container_of(res, struct table_driven_reservation, res);
 
+	res->budget_consumed += how_much;
+	res->budget_consumed_total += how_much;
+
 	/* Table-driven scheduling: instead of tracking the budget, we compute
 	 * how much time is left in this allocation interval. */
 
-- 
1.8.1.2


From 0dc72270017d1362bbb4eb05aa07c1967cc9c30c Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Tue, 16 Sep 2014 12:03:10 +0200
Subject: [PATCH 071/119] Reservations: priority should be a lt_t

Rationale: the internal priority point representation is of type lt_t
(64 bits), so to enable userspace to specify priorities below (=after)
EDF priority points, we need to allow userspace to specify values
larger than 2^32.
---
 include/litmus/rt_param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index b252cc1..e626bbb 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -91,7 +91,7 @@ struct lt_interval {
 
 struct reservation_config {
 	unsigned int id;
-	unsigned int priority;
+	lt_t priority;
 	int  cpu;
 
 	union {
-- 
1.8.1.2


From f552154ad716c7601d07e4e90c8491766ef74fa7 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 09:44:03 +0200
Subject: [PATCH 072/119] P-RES: allocation in pres_admit_task() must be atomic

The kernel code path calling into pres_admit_task() is holding some
lock unrelated to LITMUS^RT, so the allocation must not sleep. As a
result, we need to pass GFP_ATOMIC instead of GFP_KERNEL to kzalloc().
---
 litmus/sched_pres.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 13a47a8..49648ee 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -264,7 +264,7 @@ static long pres_admit_task(struct task_struct *tsk)
 	unsigned long flags;
 	struct reservation *res;
 	struct pres_cpu_state *state;
-	struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
+	struct pres_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
 
 	if (!tinfo)
 		return -ENOMEM;
-- 
1.8.1.2


From b1c6f8b1f57417ea05d83261e8a20623ca11b6d5 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 09:34:49 +0200
Subject: [PATCH 073/119] P-RES: disable LITMUS^RT's standard budget
 enforcement

The P-RES plugin is currently not compatible with the per-thread
budget enforcement logic, which can trigger assertion failures. For
now, let's simply disable per-thread timeslice enforcement. (P-RES's
reservations are a much better mechanism anyway.)
---
 litmus/sched_pres.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 49648ee..6126852 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -282,6 +282,9 @@ static long pres_admit_task(struct task_struct *tsk)
 		tinfo->cpu = task_cpu(tsk);
 		tsk_rt(tsk)->plugin_state = tinfo;
 		err = 0;
+
+		/* disable LITMUS^RT's per-thread budget enforcement */
+		tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
 	}
 
 	raw_spin_unlock_irqrestore(&state->lock, flags);
-- 
1.8.1.2


From 33ad22dfbddcff613fd530f3721cd3e941f4614c Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 09:33:32 +0200
Subject: [PATCH 074/119] P-RES: keep track in per-task state of whether it
 suspended

Checking state->scheduled is not accurate when bandwidth inheritance
is applied.
---
 litmus/sched_pres.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 6126852..34b096d 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -17,6 +17,7 @@
 struct pres_task_state {
 	struct task_client res_info;
 	int cpu;
+	bool has_departed;
 };
 
 struct pres_cpu_state {
@@ -49,6 +50,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 	client = &state->res_info.client;
 
 	res->ops->client_departs(res, client, job_complete);
+	state->has_departed = true;
 }
 
 static void task_arrives(struct task_struct *tsk)
@@ -60,6 +62,7 @@ static void task_arrives(struct task_struct *tsk)
 	res    = state->res_info.client.reservation;
 	client = &state->res_info.client;
 
+	state->has_departed = false;
 	res->ops->client_arrives(res, client);
 }
 
@@ -223,8 +226,8 @@ static void pres_task_resume(struct task_struct  *tsk)
 	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
 
 	raw_spin_lock_irqsave(&state->lock, flags);
-	/* Requeue if self-suspension was already processed. */
-	if (state->scheduled != tsk)
+	/* Requeue only if self-suspension was already processed. */
+	if (tinfo->has_departed)
 	{
 		/* Assumption: litmus_clock() is synchronized across cores,
 		 * since we might not actually be executing on tinfo->cpu
@@ -234,8 +237,10 @@ static void pres_task_resume(struct task_struct  *tsk)
 		/* NOTE: drops state->lock */
 		pres_update_timer_and_unlock(state);
 		local_irq_restore(flags);
-	} else
+	} else {
+		TRACE_TASK(tsk, "resume event ignored, still scheduled\n");
 		raw_spin_unlock_irqrestore(&state->lock, flags);
+	}
 
 	resume_legacy_task_model_updates(tsk);
 }
@@ -280,6 +285,7 @@ static long pres_admit_task(struct task_struct *tsk)
 	if (res) {
 		task_client_init(&tinfo->res_info, tsk, res);
 		tinfo->cpu = task_cpu(tsk);
+		tinfo->has_departed = true;
 		tsk_rt(tsk)->plugin_state = tinfo;
 		err = 0;
 
-- 
1.8.1.2


From f460d28c594341d8e8f78cfe92e6e0d42b2f5616 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 17:25:50 +0200
Subject: [PATCH 075/119] Reservations: fix time-tracking of table-driven
 reservations

Keep track of the current slot and major cycle explicitly to avoid
ambiguity when the budget charging is delayed into the next major
cycle due to a late interrupt or other sources of delay.
---
 include/litmus/polling_reservations.h |  5 +++-
 litmus/polling_reservations.c         | 53 ++++++++++++++++++++++++++---------
 litmus/reservation.c                  |  2 ++
 3 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/include/litmus/polling_reservations.h b/include/litmus/polling_reservations.h
index fa22181..66c9b1e 100644
--- a/include/litmus/polling_reservations.h
+++ b/include/litmus/polling_reservations.h
@@ -24,7 +24,10 @@ struct table_driven_reservation {
 	unsigned int next_interval;
 	unsigned int num_intervals;
 	struct lt_interval *intervals;
-	struct lt_interval *cur_interval;
+
+	/* info about current scheduling slot */
+	struct lt_interval cur_interval;
+	lt_t major_cycle_start;
 };
 
 void table_driven_reservation_init(struct table_driven_reservation *tdres,
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 2c481b4..86a3206 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -318,7 +318,8 @@ static void td_client_arrives(
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 			/* Figure out first replenishment time. */
-			res->next_replenishment = td_next_major_cycle_start(tdres);
+			tdres->major_cycle_start = td_next_major_cycle_start(tdres);
+			res->next_replenishment  = tdres->major_cycle_start;
 			res->next_replenishment += tdres->intervals[0].start;
 			tdres->next_interval = 0;
 
@@ -368,11 +369,12 @@ static void td_client_departs(
 static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
 {
 	lt_t now = tdres->res.env->current_time;
-	lt_t end = td_cur_major_cycle_start(tdres) + tdres->cur_interval->end;
-	TRACE("td_remaining(%u): start=%llu now=%llu end=%llu\n",
+	lt_t end = tdres->cur_interval.end;
+	TRACE("td_remaining(%u): start=%llu now=%llu end=%llu state=%d\n",
 		tdres->res.id,
-		td_cur_major_cycle_start(tdres) + tdres->cur_interval->start,
-		now, end);
+		tdres->cur_interval.start,
+		now, end,
+		tdres->res.state);
 	if (now >=  end)
 		return 0;
 	else
@@ -385,20 +387,36 @@ static void td_replenish(
 	struct table_driven_reservation *tdres =
 		container_of(res, struct table_driven_reservation, res);
 
-	/* replenish budget */
-	tdres->cur_interval = tdres->intervals + tdres->next_interval;
+	TRACE("td_replenish(%u): expected_replenishment=%llu\n", res->id,
+		res->next_replenishment);
+
+	/* figure out current interval */
+	tdres->cur_interval.start = tdres->major_cycle_start +
+		tdres->intervals[tdres->next_interval].start;
+	tdres->cur_interval.end =  tdres->major_cycle_start +
+		tdres->intervals[tdres->next_interval].end;
+	TRACE("major_cycle_start=%llu => [%llu, %llu]\n",
+		tdres->major_cycle_start,
+		tdres->cur_interval.start,
+		tdres->cur_interval.end);
+
+	/* reset budget */
 	res->cur_budget = td_time_remaining_until_end(tdres);
 	res->budget_consumed = 0;
 	TRACE("td_replenish(%u): %s budget=%llu\n", res->id,
 		res->cur_budget ? "" : "WARNING", res->cur_budget);
 
+	/* prepare next slot */
 	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
-	if (tdres->next_interval)
-		res->next_replenishment = td_cur_major_cycle_start(tdres);
-	else
+	if (!tdres->next_interval)
 		/* wrap to next major cycle */
-		res->next_replenishment = td_next_major_cycle_start(tdres);
+		tdres->major_cycle_start += tdres->major_cycle;
+
+	/* determine next time this reservation becomes eligible to execute */
+	res->next_replenishment  = tdres->major_cycle_start;
 	res->next_replenishment += tdres->intervals[tdres->next_interval].start;
+	TRACE("td_replenish(%u): next_replenishment=%llu\n", res->id,
+		res->next_replenishment);
 
 
 	switch (res->state) {
@@ -433,6 +451,9 @@ static void td_drain_budget(
 	/* Table-driven scheduling: instead of tracking the budget, we compute
 	 * how much time is left in this allocation interval. */
 
+	/* sanity check: we should never try to drain from future slots */
+	BUG_ON(tdres->cur_interval.start > res->env->current_time);
+
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
@@ -447,7 +468,12 @@ static void td_drain_budget(
 			if (!res->cur_budget) {
 				res->env->change_state(res->env, res,
 					RESERVATION_DEPLETED);
-			} /* else: stay in current state */
+			} else {
+				/* sanity check budget calculation */
+				BUG_ON(res->env->current_time >= tdres->cur_interval.end);
+				BUG_ON(res->env->current_time < tdres->cur_interval.start);
+			}
+
 			break;
 	}
 }
@@ -515,7 +541,8 @@ void table_driven_reservation_init(
 	reservation_init(&tdres->res);
 	tdres->major_cycle = major_cycle;
 	tdres->intervals = intervals;
-	tdres->cur_interval = intervals;
+	tdres->cur_interval.start = 0;
+	tdres->cur_interval.end   = 0;
 	tdres->num_intervals = num_intervals;
 	tdres->res.ops = &td_ops;
 }
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 447fc5b..f796898 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -206,6 +206,7 @@ static void sup_charge_budget(
 			/* stop at the first ACTIVE reservation */
 			break;
 	}
+	TRACE("finished charging budgets\n");
 }
 
 static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
@@ -222,6 +223,7 @@ static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
 			break;
 		}
 	}
+	TRACE("finished replenishing budgets\n");
 
 	/* request a scheduler update at the next replenishment instant */
 	res = list_first_entry_or_null(&sup_env->depleted_reservations,
-- 
1.8.1.2


From 301fe33935a9ce53d6f2fc15084c7bcbae163d7a Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 17:27:20 +0200
Subject: [PATCH 076/119] Reservations: request scheduler update for new active
 reservations

Don't forget to ask for a scheduler update when a reservation is
replenished but enters state ACTIVE_IDLE and there's nothing else
going on that triggers the scheduler by chance.
---
 litmus/reservation.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/litmus/reservation.c b/litmus/reservation.c
index f796898..0bc551e 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -122,9 +122,14 @@ static void sup_queue_active(
 	/* check for possible preemption */
 	if (res->state == RESERVATION_ACTIVE && !passed_active)
 		sup_env->next_scheduler_update = SUP_RESCHEDULE_NOW;
+	else {
+		/* Active means this reservation is draining budget => make sure
+		 * the scheduler is called to notice when the reservation budget has been
+		 * drained completely. */
+		sup_scheduler_update_after(sup_env, res->cur_budget);
+	}
 }
 
-
 static void sup_queue_reservation(
 	struct sup_reservation_environment* sup_env,
 	struct reservation *res)
-- 
1.8.1.2


From 6d16993db5e56e01d1b19f149ef805ab7aff8e12 Mon Sep 17 00:00:00 2001
From: Bjoern Brandenburg <bbb@mpi-sws.org>
Date: Wed, 17 Sep 2014 13:16:08 +0200
Subject: [PATCH 077/119] P-RES: improved handling of tardy jobs

Don't set a release timer for jobs that are tardy and already
released.
---
 litmus/sched_pres.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index 34b096d..e2d4810 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -251,13 +251,23 @@ static long pres_complete_job(void)
 	ktime_t next_release;
 	long err;
 
-	TRACE_CUR("pres_complete_job at %llu\n", litmus_clock());
+	TRACE_CUR("pres_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
+		get_deadline(current));
 
 	tsk_rt(current)->completed = 1;
 	prepare_for_next_period(current);
 	next_release = ns_to_ktime(get_release(current));
-	set_current_state(TASK_INTERRUPTIBLE);
-	err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+	preempt_disable();
+	TRACE_CUR("next_release=%llu\n", get_release(current));
+	if (get_release(current) > litmus_clock()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		preempt_enable_no_resched();
+		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+	} else {
+		err = 0;
+		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
+		preempt_enable();
+	}
 
 	TRACE_CUR("pres_complete_job returns at %llu\n", litmus_clock());
 	return err;
-- 
1.8.1.2


From 3805cbc73a63f74166c65446395785390d7ad44b Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Tue, 4 Nov 2014 05:01:02 -0500
Subject: [PATCH 078/119] Add set_mc2_task_param system call

---
 arch/arm/include/asm/unistd.h    | 4 ++--
 arch/arm/kernel/calls.S          | 5 +++--
 arch/x86/syscalls/syscall_32.tbl | 5 +++--
 arch/x86/syscalls/syscall_64.tbl | 6 +++---
 include/litmus/unistd_32.h       | 3 ++-
 include/litmus/unistd_64.h       | 5 +++--
 6 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 8b26b32..0db825d 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -14,8 +14,8 @@
 #define __ASM_ARM_UNISTD_H
 
 #include <uapi/asm/unistd.h>
-
-#define __NR_syscalls  (380 + NR_litmus_syscalls)
+/** __NR_syscalls padding */
+#define __NR_syscalls  (380 + NR_litmus_syscalls + 1)
 
 #define __ARM_NR_cmpxchg		(__ARM_NR_BASE+0x00fff0)
 
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index ad22fcc..34c5ee7 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -401,8 +401,9 @@
         	CALL(sys_wait_for_ts_release)
 /* 390 */	CALL(sys_release_ts)
 		CALL(sys_null_call)
-	    CALL(sys_reservation_create)
-	    CALL(sys_reservation_destroy)
+		CALL(sys_reservation_create)
+		CALL(sys_reservation_destroy)
+		CALL(sys_set_mc2_task_param)
 
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 290c879..20f6cdc 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -370,5 +370,6 @@
 360	i386	wait_for_ts_release	sys_wait_for_ts_release
 361	i386	release_ts		sys_release_ts
 362	i386	null_call		sys_null_call
-363 i386    reservation_create sys_reservation_create
-364 i386    reservation_destroy sys_reservation_destroy
+363	i386	reservation_create	sys_reservation_create
+364	i386	reservation_destroy	sys_reservation_destroy
+365	i386	set_mc2_task_param	sys_set_mc2_task_param
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index d39de2a..f3d142c 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -333,9 +333,9 @@
 360	common	wait_for_ts_release	sys_wait_for_ts_release
 361	common	release_ts		sys_release_ts
 362	common	null_call		sys_null_call
-363 common  reservation_create  sys_reservation_create
-364 common  reservation_destroy sys_reservation_destroy
-
+363	common	reservation_create	sys_reservation_create
+364	common	reservation_destroy	sys_reservation_destroy
+365	common	set_mc2_task_param	sys_set_mc2_task_param
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
index 5f6a274..202f439 100644
--- a/include/litmus/unistd_32.h
+++ b/include/litmus/unistd_32.h
@@ -19,5 +19,6 @@
 #define __NR_null_call		__LSC(11)
 #define __NR_reservation_create	__LSC(12)
 #define __NR_reservation_destroy __LSC(13)
+#define __NR_set_mc2_task_param	__LSC(14)
 
-#define NR_litmus_syscalls 14
+#define NR_litmus_syscalls 15
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
index 3e6b1d3..ba2c91c 100644
--- a/include/litmus/unistd_64.h
+++ b/include/litmus/unistd_64.h
@@ -33,6 +33,7 @@ __SYSCALL(__NR_null_call, sys_null_call)
 __SYSCALL(__NR_reservation_create, sys_reservation_create)
 #define __NR_reservation_destroy		__LSC(13)
 __SYSCALL(__NR_reservation_destroy, sys_reservation_destroy)
+#define __NR_set_mc2_task_param			__LSC(14)
+__SYSCALL(__NR_set_mc2_task_param, sys_set_mc2_task_param)
 
-
-#define NR_litmus_syscalls 14
+#define NR_litmus_syscalls 15
-- 
1.8.1.2


From cef9a08794f4ecdd0d1ea80ef4035f2bc9e234ce Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Tue, 4 Nov 2014 05:01:31 -0500
Subject: [PATCH 079/119] Add MC2 plugin

---
 litmus/Makefile      | 4 +++-
 litmus/litmus.c      | 2 +-
 litmus/reservation.c | 6 +++---
 litmus/sched_mc2.c   | 4 ++++
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/litmus/Makefile b/litmus/Makefile
index 05021f5..997524f 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -19,6 +19,7 @@ obj-y     = sched_plugin.o litmus.o \
 	    binheap.o \
 	    ctrldev.o \
 	    uncachedev.o \
+	    cache_proc.o \
 	    sched_gsn_edf.o \
 	    sched_psn_edf.o \
 	    sched_pfp.o
@@ -35,4 +36,5 @@ obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
 
 obj-y += reservation.o polling_reservations.o
 
-obj-y += sched_pres.o
\ No newline at end of file
+obj-y += sched_pres.o
+obj-y += mc2_common.o sched_mc2.o
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 0b87e04..8a2446f 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -646,7 +646,7 @@ static void __init litmus_enable_perfcounters_v7(void *_ignore)
 	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(enable_val));
 
 	/* enables counters (cycle counter and event 1) */
-        asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x80000001));
+    asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x80000001));
 }
 
 static void __init litmus_enable_perfcounters(void)
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 0bc551e..0e43479 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -211,7 +211,7 @@ static void sup_charge_budget(
 			/* stop at the first ACTIVE reservation */
 			break;
 	}
-	TRACE("finished charging budgets\n");
+	//TRACE("finished charging budgets\n");
 }
 
 static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
@@ -228,7 +228,7 @@ static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
 			break;
 		}
 	}
-	TRACE("finished replenishing budgets\n");
+	//TRACE("finished replenishing budgets\n");
 
 	/* request a scheduler update at the next replenishment instant */
 	res = list_first_entry_or_null(&sup_env->depleted_reservations,
@@ -246,7 +246,7 @@ void sup_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
+	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= sup_env->env.current_time))
 		return;
 
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index ab35008..1e5b28b 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -381,6 +381,8 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	task_new_legacy_task_model_updates(tsk);
 }
 
+static long mc2_reservation_destroy(unsigned int reservation_id, int cpu);
+
 static void mc2_task_exit(struct task_struct *tsk)
 {
 	unsigned long flags;
@@ -404,6 +406,8 @@ static void mc2_task_exit(struct task_struct *tsk)
 	} else
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 
+	mc2_reservation_destroy(tsk->pid, tinfo->cpu);
+	
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
 }
-- 
1.8.1.2


From d77654f3287edf9fa6aeda97825e9a972bdc8821 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 19 Nov 2014 16:01:27 -0500
Subject: [PATCH 080/119] set_page_color syscall

---
 arch/arm/include/asm/unistd.h    |   2 +-
 arch/arm/kernel/calls.S          |   3 +-
 arch/arm/mm/cache-l2x0.c         |   4 ++
 arch/x86/syscalls/syscall_32.tbl |   1 +
 arch/x86/syscalls/syscall_64.tbl |   1 +
 include/litmus/unistd_32.h       |   3 +-
 include/litmus/unistd_64.h       |   4 +-
 litmus/litmus.c                  | 110 +++++++++++++++++++++++++++++++++++++++
 8 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 0db825d..f31a912 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -15,7 +15,7 @@
 
 #include <uapi/asm/unistd.h>
 /** __NR_syscalls padding */
-#define __NR_syscalls  (380 + NR_litmus_syscalls + 1)
+#define __NR_syscalls  (380 + NR_litmus_syscalls)
 
 #define __ARM_NR_cmpxchg		(__ARM_NR_BASE+0x00fff0)
 
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 34c5ee7..2fbce68 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -404,7 +404,8 @@
 		CALL(sys_reservation_create)
 		CALL(sys_reservation_destroy)
 		CALL(sys_set_mc2_task_param)
-
+/* 395 */	CALL(sys_set_page_color)
+		
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index 5efe6b6e..cff808e 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -326,6 +326,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 {
 	u32 aux;
 	u32 cache_id;
+	u32 cache_type;
 	u32 way_size = 0;
 	int ways;
 	int way_size_shift = L2X0_WAY_SIZE_SHIFT;
@@ -337,6 +338,8 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 	else
 		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
 	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
+	
+	cache_type = readl_relaxed(l2x0_base + L2X0_CACHE_TYPE);
 
 	aux &= aux_mask;
 	aux |= aux_val;
@@ -424,6 +427,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 	printk(KERN_INFO "%s cache controller enabled\n", type);
 	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
 			ways, cache_id, aux, l2x0_size);
+	printk(KERN_INFO "l2x0: CACHE_TYPE 0x%08x\n", cache_type);
 
 	litmus_setup_lockdown(l2x0_base, cache_id);
 }
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index 20f6cdc..401f313 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -373,3 +373,4 @@
 363	i386	reservation_create	sys_reservation_create
 364	i386	reservation_destroy	sys_reservation_destroy
 365	i386	set_mc2_task_param	sys_set_mc2_task_param
+366	i386	set_page_color		sys_set_page_color
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index f3d142c..77710a9 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -336,6 +336,7 @@
 363	common	reservation_create	sys_reservation_create
 364	common	reservation_destroy	sys_reservation_destroy
 365	common	set_mc2_task_param	sys_set_mc2_task_param
+366	common	set_page_color		sys_set_page_color
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
index 202f439..cfffbdd 100644
--- a/include/litmus/unistd_32.h
+++ b/include/litmus/unistd_32.h
@@ -20,5 +20,6 @@
 #define __NR_reservation_create	__LSC(12)
 #define __NR_reservation_destroy __LSC(13)
 #define __NR_set_mc2_task_param	__LSC(14)
+#define __NR_set_page_color		__LSC(15)
 
-#define NR_litmus_syscalls 15
+#define NR_litmus_syscalls 16
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
index ba2c91c..2fb49dc 100644
--- a/include/litmus/unistd_64.h
+++ b/include/litmus/unistd_64.h
@@ -35,5 +35,7 @@ __SYSCALL(__NR_reservation_create, sys_reservation_create)
 __SYSCALL(__NR_reservation_destroy, sys_reservation_destroy)
 #define __NR_set_mc2_task_param			__LSC(14)
 __SYSCALL(__NR_set_mc2_task_param, sys_set_mc2_task_param)
+#define __NR_set_page_color				__LSC(15)
+__SYSCALL(__NR_set_page_color, sys_set_page_color)
 
-#define NR_litmus_syscalls 15
+#define NR_litmus_syscalls 16
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 8a2446f..cdffbc6 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -13,6 +13,7 @@
 #include <linux/stop_machine.h>
 #include <linux/sched/rt.h>
 #include <linux/rwsem.h>
+#include <linux/list.h>
 
 #include <litmus/litmus.h>
 #include <litmus/bheap.h>
@@ -325,6 +326,112 @@ asmlinkage long sys_reservation_destroy(unsigned int reservation_id, int cpu)
 	return litmus->reservation_destroy(reservation_id, cpu);
 }
 
+struct task_page {
+	unsigned long vm_start;
+	unsigned long vm_end;
+	struct page* page;
+	struct list_head list;
+};
+
+LIST_HEAD(task_page_list);
+
+static unsigned long color_mask;
+
+static inline unsigned long page_color(struct page *page)
+{
+	return ((page_to_phys(page) & color_mask) >> PAGE_SHIFT);
+}
+
+static struct page *walk_page_table(unsigned long addr)
+{
+    pgd_t *pgd;
+    pte_t *ptep, pte;
+    pud_t *pud;
+    pmd_t *pmd;
+
+    struct page *page = NULL;
+    struct mm_struct *mm = current->mm;
+
+    pgd = pgd_offset(mm, addr);
+    if (pgd_none(*pgd) || pgd_bad(*pgd))
+        goto out;
+    
+    pud = pud_offset(pgd, addr);
+    if (pud_none(*pud) || pud_bad(*pud))
+        goto out;
+    
+    pmd = pmd_offset(pud, addr);
+    if (pmd_none(*pmd) || pmd_bad(*pmd))
+        goto out;
+    
+    ptep = pte_offset_map(pmd, addr);
+    if (!ptep)
+        goto out;
+    pte = *ptep;
+
+    page = pte_page(pte);
+    if (page)
+        printk(KERN_INFO "page frame struct is @ %p\n", page);
+	
+	pte_unmap(ptep);
+
+ out:
+    return page;
+}
+
+asmlinkage long sys_set_page_color(int cpu)
+{
+	long ret = 0;
+	struct task_page *task_page_itr = NULL;
+	struct task_page *task_page_itr_next = NULL;
+	struct vm_area_struct *vma_itr = NULL;
+	struct task_page *entry = NULL;
+	
+	down_read(&current->mm->mmap_sem);
+	printk(KERN_INFO "SYSCALL set_page_color\n");
+	vma_itr = current->mm->mmap;
+	while (vma_itr != NULL) {
+		unsigned int num_pages = 0, i;
+		struct page *new_page = NULL;
+		entry = kmalloc(sizeof(struct task_page), GFP_ATOMIC);
+		if (entry == NULL) {
+			return -ENOSPC;
+		}
+		entry->vm_start = vma_itr->vm_start;
+		entry->vm_end = vma_itr->vm_end;
+		num_pages = (entry->vm_end - entry->vm_start) / PAGE_SIZE;
+		// print vma flags
+		printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
+		printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
+		
+		for (i = 0; i < num_pages; i++) {
+alloc:
+			new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma_itr, vma_itr->vm_start);
+			if (!new_page)
+				return -ENOSPC;
+			if ( (page_color(new_page)>>2) != cpu) {
+				
+				
+		}
+		//entry->page = walk_page_table(entry->vm_start);
+		
+		INIT_LIST_HEAD(&entry->list);
+		list_add(&entry->list, &task_page_list);
+		
+		vma_itr = vma_itr->vm_next;
+	}
+	
+	up_read(&current->mm->mmap_sem);
+	
+	list_for_each_entry_safe(task_page_itr, task_page_itr_next, &task_page_list, list) {
+		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", task_page_itr->vm_start, task_page_itr->vm_end, (task_page_itr->vm_end - task_page_itr->vm_start)/PAGE_SIZE);
+		list_del(&task_page_itr->list);
+		kfree(task_page_itr);		
+	}
+	
+	return ret;
+}
+
 /* p is a real-time task. Re-init its state as a best-effort task. */
 static void reinit_litmus_state(struct task_struct* p, int restore)
 {
@@ -662,6 +769,8 @@ static int __init _init_litmus(void)
 	 *      mode change lock is used to enforce single mode change
 	 *      operation.
 	 */
+	unsigned int line_size_log = 5; // 2^5 = 32 byte
+	unsigned int cache_info_sets = 2048; // 64KB (way_size) / 32B (line_size) = 2048
 	printk("Starting LITMUS^RT kernel\n");
 
 	register_sched_plugin(&linux_sched_plugin);
@@ -689,6 +798,7 @@ static int __init _init_litmus(void)
 	litmus_enable_perfcounters();
 #endif
 	
+	color_mask = ((cache_info_sets << line_size_log) - 1) ^ (PAGE_SIZE - 1);
 	return 0;
 }
 
-- 
1.8.1.2


From 07d5680c4c476a4b68bd3cff134d99ca996b2481 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Fri, 21 Nov 2014 15:37:46 -0500
Subject: [PATCH 081/119] static linking coloring

---
 litmus/litmus.c | 94 ++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 73 insertions(+), 21 deletions(-)

diff --git a/litmus/litmus.c b/litmus/litmus.c
index cdffbc6..88cc3e0 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -14,13 +14,16 @@
 #include <linux/sched/rt.h>
 #include <linux/rwsem.h>
 #include <linux/list.h>
+#include <linux/migrate.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
 
 #include <litmus/litmus.h>
 #include <litmus/bheap.h>
 #include <litmus/trace.h>
 #include <litmus/rt_domain.h>
-#include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
+#include <litmus/litmus_proc.h>
 #include <litmus/clock.h>
 
 #include <asm/cacheflush.h>
@@ -342,6 +345,7 @@ static inline unsigned long page_color(struct page *page)
 	return ((page_to_phys(page) & color_mask) >> PAGE_SHIFT);
 }
 
+/*
 static struct page *walk_page_table(unsigned long addr)
 {
     pgd_t *pgd;
@@ -353,15 +357,18 @@ static struct page *walk_page_table(unsigned long addr)
     struct mm_struct *mm = current->mm;
 
     pgd = pgd_offset(mm, addr);
-    if (pgd_none(*pgd) || pgd_bad(*pgd))
+    //if (pgd_none(*pgd) || pgd_bad(*pgd))
+	if (pgd_none_or_clear_bad(pgd))
         goto out;
     
     pud = pud_offset(pgd, addr);
-    if (pud_none(*pud) || pud_bad(*pud))
+    //if (pud_none(*pud) || pud_bad(*pud))
+	if (pud_none_or_clear_bad(pud))
         goto out;
     
     pmd = pmd_offset(pud, addr);
-    if (pmd_none(*pmd) || pmd_bad(*pmd))
+    //if (pmd_none(*pmd) || pmd_bad(*pmd))
+	if (pmd_none_or_clear_bad(pmd))
         goto out;
     
     ptep = pte_offset_map(pmd, addr);
@@ -370,65 +377,110 @@ static struct page *walk_page_table(unsigned long addr)
     pte = *ptep;
 
     page = pte_page(pte);
-    if (page)
-        printk(KERN_INFO "page frame struct is @ %p\n", page);
+    if (pfn_valid(__page_to_pfn(page))) {
+        ;//printk(KERN_INFO "page frame struct is @ %p\n", page);
+		//printk(KERN_INFO "pfn is %lu\n", __page_to_pfn(page));
+	}
 	
 	pte_unmap(ptep);
 
  out:
     return page;
 }
+*/
+
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+static struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
+{
+	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+}
 
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
-	struct task_page *task_page_itr = NULL;
-	struct task_page *task_page_itr_next = NULL;
+	//struct task_page *task_page_itr = NULL;
+	//struct task_page *task_page_itr_next = NULL;
 	struct vm_area_struct *vma_itr = NULL;
-	struct task_page *entry = NULL;
+	//struct task_page *entry = NULL;
+	int nr_pages = 0;
+	LIST_HEAD(pagelist);
 	
 	down_read(&current->mm->mmap_sem);
 	printk(KERN_INFO "SYSCALL set_page_color\n");
 	vma_itr = current->mm->mmap;
 	while (vma_itr != NULL) {
 		unsigned int num_pages = 0, i;
-		struct page *new_page = NULL;
+		struct page *new_page = NULL, *old_page = NULL;
+		/*
 		entry = kmalloc(sizeof(struct task_page), GFP_ATOMIC);
 		if (entry == NULL) {
 			return -ENOSPC;
 		}
 		entry->vm_start = vma_itr->vm_start;
 		entry->vm_end = vma_itr->vm_end;
-		num_pages = (entry->vm_end - entry->vm_start) / PAGE_SIZE;
+		*/
+		num_pages = (vma_itr->vm_end - vma_itr->vm_start) / PAGE_SIZE;
 		// print vma flags
-		printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
-		printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
+		//printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
+		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
 		
 		for (i = 0; i < num_pages; i++) {
-alloc:
+/*
 			new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma_itr, vma_itr->vm_start);
 			if (!new_page)
 				return -ENOSPC;
-			if ( (page_color(new_page)>>2) != cpu) {
-				
+			printk(KERN_INFO "PAGE_COLOR: %lu\n", page_color(new_page));
+*/
+			//old_page = walk_page_table(vma_itr->vm_start + PAGE_SIZE*i);
+			old_page = follow_page(vma_itr, vma_itr->vm_start + PAGE_SIZE*i, FOLL_GET|FOLL_SPLIT);
+			
+			if (IS_ERR(old_page))
+				continue;
+			if (!old_page)
+				continue;
+			if (PageReserved(old_page))
+				goto put_and_next;
 				
+			ret = isolate_lru_page(old_page);
+			//if (pfn_valid(__page_to_pfn(old_page)))
+			if (!ret) {
+				//printk(KERN_INFO "page_mapcount = %d\n", page_mapcount(old_page));
+				printk(KERN_INFO "addr: %lu, pfn: %lu mapcount: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page));
+				list_add_tail(&old_page->lru, &pagelist);
+				inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
+				nr_pages++;
+			}
+put_and_next:				
+				put_page(old_page);
 		}
-		//entry->page = walk_page_table(entry->vm_start);
 		
-		INIT_LIST_HEAD(&entry->list);
-		list_add(&entry->list, &task_page_list);
+		//INIT_LIST_HEAD(&entry->list);
+		//list_add(&entry->list, &task_page_list);
 		
 		vma_itr = vma_itr->vm_next;
 	}
+
+	ret = 0;
+	if (!list_empty(&pagelist)) {
+		ret = migrate_pages(&pagelist, new_alloc_page, 0, MIGRATE_ASYNC, MR_SYSCALL);
+		if (ret) {
+			printk(KERN_INFO "%ld pages not migrated.\n", ret);
+			putback_lru_pages(&pagelist);
+		}
+	}
 	
 	up_read(&current->mm->mmap_sem);
-	
+
+/*	
 	list_for_each_entry_safe(task_page_itr, task_page_itr_next, &task_page_list, list) {
 		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", task_page_itr->vm_start, task_page_itr->vm_end, (task_page_itr->vm_end - task_page_itr->vm_start)/PAGE_SIZE);
 		list_del(&task_page_itr->list);
 		kfree(task_page_itr);		
 	}
-	
+*/	
+	printk(KERN_INFO "nr_pages = %d\n", nr_pages);
 	return ret;
 }
 
-- 
1.8.1.2


From 6583dcfbda43e420921e3adf7f2e46dc719e8d26 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 14 Jan 2015 02:31:12 -0500
Subject: [PATCH 082/119] MC2 levels A, B

---
 include/litmus/mc2_common.h   |  14 +-
 litmus/litmus.c               |  63 ++++++--
 litmus/mc2_common.c           |  65 ++++++--
 litmus/polling_reservations.c |   9 +-
 litmus/sched_mc2.c            | 357 ++++++++++++++++++++----------------------
 litmus/sched_pres.c           |  10 ++
 6 files changed, 288 insertions(+), 230 deletions(-)

diff --git a/include/litmus/mc2_common.h b/include/litmus/mc2_common.h
index a1d571f..bdc3a6d 100644
--- a/include/litmus/mc2_common.h
+++ b/include/litmus/mc2_common.h
@@ -14,24 +14,14 @@ enum crit_level {
 
 struct mc2_task {
 	enum crit_level crit;
-	pid_t pid;
-	lt_t hyperperiod;
+	unsigned int res_id;
 };
 
 #ifdef __KERNEL__
 
 #include <litmus/reservation.h>
 
-struct mc2_param{
-	struct mc2_task mc2_task;
-};
-
-struct mc2_task_client {
-	struct task_client tc;
-	struct mc2_param mc2;
-};
-
-long mc2_task_client_init(struct mc2_task_client *mtc, struct task_struct *tsk,
+long mc2_task_client_init(struct task_client *tc, struct mc2_task *mc2_param, struct task_struct *tsk,
 							struct reservation *res);
 	
 #endif /* __KERNEL__ */
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 88cc3e0..6034ff8 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -402,10 +402,12 @@ asmlinkage long sys_set_page_color(int cpu)
 	long ret = 0;
 	//struct task_page *task_page_itr = NULL;
 	//struct task_page *task_page_itr_next = NULL;
+	struct page *page_itr = NULL;
 	struct vm_area_struct *vma_itr = NULL;
 	//struct task_page *entry = NULL;
-	int nr_pages = 0;
+	int nr_pages = 0, nr_shared_pages = 0;
 	LIST_HEAD(pagelist);
+	LIST_HEAD(shared_pagelist);
 	
 	down_read(&current->mm->mmap_sem);
 	printk(KERN_INFO "SYSCALL set_page_color\n");
@@ -423,8 +425,8 @@ asmlinkage long sys_set_page_color(int cpu)
 		*/
 		num_pages = (vma_itr->vm_end - vma_itr->vm_start) / PAGE_SIZE;
 		// print vma flags
-		//printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
-		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
+		printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
+		printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
 		
 		for (i = 0; i < num_pages; i++) {
 /*
@@ -440,20 +442,32 @@ asmlinkage long sys_set_page_color(int cpu)
 				continue;
 			if (!old_page)
 				continue;
-			if (PageReserved(old_page))
-				goto put_and_next;
-				
-			ret = isolate_lru_page(old_page);
-			//if (pfn_valid(__page_to_pfn(old_page)))
-			if (!ret) {
-				//printk(KERN_INFO "page_mapcount = %d\n", page_mapcount(old_page));
-				printk(KERN_INFO "addr: %lu, pfn: %lu mapcount: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page));
-				list_add_tail(&old_page->lru, &pagelist);
-				inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
-				nr_pages++;
+			if (PageReserved(old_page)) {
+				put_page(old_page);
+				continue;
+			}
+			
+			printk(KERN_INFO "addr: %lu, pfn: %lu, _mapcount: %d, _count: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page));
+			
+			if (page_mapcount(old_page) == 1) {
+				ret = isolate_lru_page(old_page);
+				//if (pfn_valid(__page_to_pfn(old_page)))
+				if (!ret) {
+					list_add_tail(&old_page->lru, &pagelist);
+					inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
+					nr_pages++;
+				}
+				put_page(old_page);
 			}
-put_and_next:				
+			else {
+				ret = isolate_lru_page(old_page);
+				if (!ret) {
+					list_add_tail(&old_page->lru, &shared_pagelist);
+					inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
+					nr_shared_pages++;
+				}					
 				put_page(old_page);
+			}
 		}
 		
 		//INIT_LIST_HEAD(&entry->list);
@@ -462,6 +476,10 @@ put_and_next:
 		vma_itr = vma_itr->vm_next;
 	}
 
+	//list_for_each_entry(page_itr, &pagelist, lru) {
+//		printk(KERN_INFO "B _mapcount = %d, _count = %d\n", page_mapcount(page_itr), page_count(page_itr));
+//	}
+	
 	ret = 0;
 	if (!list_empty(&pagelist)) {
 		ret = migrate_pages(&pagelist, new_alloc_page, 0, MIGRATE_ASYNC, MR_SYSCALL);
@@ -471,8 +489,23 @@ put_and_next:
 		}
 	}
 	
+	/* copy shared pages HERE */
+/*	
+	ret = 0;
+	if (!list_empty(&shared_pagelist)) {
+		ret = migrate_shared_pages(&shared_pagelist, new_alloc_page, 0, MIGRATE_ASYNC, MR_SYSCALL);
+		if (ret) {
+			printk(KERN_INFO "%ld shared pages not migrated.\n", ret);
+			putback_lru_pages(&shared_pagelist);
+		}
+	}
+*/
 	up_read(&current->mm->mmap_sem);
 
+	/* shared pages are only reported, not migrated: un-isolate them again */
+	list_for_each_entry(page_itr, &shared_pagelist, lru)
+		printk(KERN_INFO "S Anon=%d, pfn = %lu, _mapcount = %d, _count = %d\n", PageAnon(page_itr), __page_to_pfn(page_itr), page_mapcount(page_itr), page_count(page_itr));
+	putback_lru_pages(&shared_pagelist);
 /*	
 	list_for_each_entry_safe(task_page_itr, task_page_itr_next, &task_page_list, list) {
 		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", task_page_itr->vm_start, task_page_itr->vm_end, (task_page_itr->vm_end - task_page_itr->vm_start)/PAGE_SIZE);
diff --git a/litmus/mc2_common.c b/litmus/mc2_common.c
index 56ef6b5..d0a42c6 100644
--- a/litmus/mc2_common.c
+++ b/litmus/mc2_common.c
@@ -7,6 +7,8 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/list.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
 
 #include <litmus/litmus.h>
 #include <litmus/sched_plugin.h>
@@ -14,17 +16,62 @@
 
 #include <litmus/mc2_common.h>
 
-long mc2_task_client_init(
-	struct mc2_task_client *mtc,
-	struct task_struct *tsk,
-	struct reservation *res
-)
+long mc2_task_client_init(struct task_client *tc, struct mc2_task *mc2_param, struct task_struct *tsk, struct reservation *res)
 {
-	task_client_init(&mtc->tc, tsk, res);
-	if ((mtc->mc2.mc2_task.crit < CRIT_LEVEL_A) ||
-		(mtc->mc2.mc2_task.crit > CRIT_LEVEL_C))
+	task_client_init(tc, tsk, res);
+	if ((mc2_param->crit < CRIT_LEVEL_A) ||
+		(mc2_param->crit > CRIT_LEVEL_C))
 		return -EINVAL;
 	
-	TRACE_TASK(tsk, "mc2_task_client_init: crit_level = %d\n", mtc->mc2.mc2_task.crit);
+	TRACE_TASK(tsk, "mc2_task_client_init: crit_level = %d\n", mc2_param->crit);
+	
 	return 0;
+}
+
+/* Syscall: stash MC^2 parameters (copied from user space) in the target
+ * task's rt_param.plugin_state. Returns 0 on success, -ENOMEM, -EINVAL,
+ * -EFAULT, -ESRCH, or -EBUSY (target is already real-time) on failure. */
+asmlinkage long sys_set_mc2_task_param(pid_t pid, struct mc2_task __user * param)
+{
+	struct task_struct *target;
+	int retval = -EINVAL;
+	struct mc2_task *mp = kzalloc(sizeof(*mp), GFP_KERNEL);
+
+	if (!mp)
+		return -ENOMEM;
+
+	printk("Setting up mc^2 task parameters for process %d.\n", pid);
+
+	if (pid < 0 || param == 0)
+		goto out;
+	if (copy_from_user(mp, param, sizeof(*mp))) {
+		retval = -EFAULT;
+		goto out;
+	}
+
+	/* Task search and manipulation must be protected */
+	read_lock_irq(&tasklist_lock);
+	if (!(target = find_task_by_vpid(pid))) {
+		retval = -ESRCH;
+		goto out_unlock;
+	}
+	if (is_realtime(target)) {
+		/* Already a real-time task: parameters can no longer change. */
+		retval = -EBUSY;
+		goto out_unlock;
+	}
+	if (mp->crit < CRIT_LEVEL_A || mp->crit >= NUM_CRIT_LEVELS) {
+		printk(KERN_INFO "litmus: real-time task %d rejected "
+			"because of invalid criticality level\n", pid);
+		goto out_unlock;
+	}
+	target->rt_param.plugin_state = mp;
+	retval = 0;
+out_unlock:
+	read_unlock_irq(&tasklist_lock);
+out:
+	/* mp is owned by the task only on success; free it on every error path */
+	if (retval)
+		kfree(mp);
+	return retval;
+}
\ No newline at end of file
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 86a3206..4c07ee7 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -452,7 +452,10 @@ static void td_drain_budget(
 	 * how much time is left in this allocation interval. */
 
 	/* sanity check: we should never try to drain from future slots */
-	BUG_ON(tdres->cur_interval.start > res->env->current_time);
+	TRACE("TD_DRAIN STATE(%d) [%llu,%llu]  %llu ?\n", res->state, tdres->cur_interval.start, tdres->cur_interval.end, res->env->current_time);
+	//BUG_ON(tdres->cur_interval.start > res->env->current_time);
+	if (tdres->cur_interval.start > res->env->current_time)
+		TRACE("TD_DRAIN BUG!!!!!!!!!!\n");
 
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
@@ -470,8 +473,8 @@ static void td_drain_budget(
 					RESERVATION_DEPLETED);
 			} else {
 				/* sanity check budget calculation */
-				BUG_ON(res->env->current_time >= tdres->cur_interval.end);
-				BUG_ON(res->env->current_time < tdres->cur_interval.start);
+				//BUG_ON(res->env->current_time >= tdres->cur_interval.end);
+				//BUG_ON(res->env->current_time < tdres->cur_interval.start);
 			}
 
 			break;
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 1e5b28b..b9f0523 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -16,9 +16,10 @@
 #include <litmus/polling_reservations.h>
 
 struct mc2_task_state {
-	struct mc2_task_client res_info;
+	struct task_client res_info;
 	int cpu;
 	bool has_departed;
+	struct mc2_task mc2_param;
 };
 
 struct mc2_cpu_state {
@@ -29,12 +30,7 @@ struct mc2_cpu_state {
 
 	int cpu;
 	struct task_struct* scheduled;
-
-#ifdef CONFIG_RELEASE_MASTER
-	int release_master;
-	/* used to delegate releases */
-	struct hrtimer_start_on_info info;
-#endif
+	enum crit_level run_level;
 };
 
 static DEFINE_PER_CPU(struct mc2_cpu_state, mc2_cpu_state);
@@ -53,8 +49,8 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.tc.client.reservation;
-	client = &state->res_info.tc.client;
+	res    = state->res_info.client.reservation;
+	client = &state->res_info.client;
 
 	res->ops->client_departs(res, client, job_complete);
 	state->has_departed = true;
@@ -66,8 +62,8 @@ static void task_arrives(struct task_struct *tsk)
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.tc.client.reservation;
-	client = &state->res_info.tc.client;
+	res    = state->res_info.client.reservation;
+	client = &state->res_info.client;
 
 	state->has_departed = false;
 	res->ops->client_arrives(res, client);
@@ -81,13 +77,13 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
-	
+
 	/* Be sure we're actually running on the right core,
-	 * as mc2_update_timer() is also called from mc2_task_resume(),
+	 * as pres_update_timer() is also called from pres_task_resume(),
 	 * which might be called on any CPU when a thread resumes.
 	 */
 	local = local_cpu_state() == state;
-	
+
 	/* Must drop state lock before calling into hrtimer_start(), which
 	 * may raise a softirq, which in turn may wake ksoftirqd. */
 	raw_spin_unlock(&state->lock);
@@ -169,36 +165,10 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	return restart;
 }
 
-static long mc2_complete_job(void)
-{
-	ktime_t next_release;
-	long err;
-
-	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
-					get_deadline(current));
-
-	tsk_rt(current)->completed = 1;
-	prepare_for_next_period(current);
-	next_release = ns_to_ktime(get_release(current));
-	preempt_disable();
-	TRACE_CUR("next_release=%llu\n", get_release(current));
-	if (get_release(current) > litmus_clock()) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		preempt_enable_no_resched();
-		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
-	} else {
-		err = 0;
-		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
-		preempt_enable();
-	}
-
-	TRACE_CUR("mc2_complete_job returns [%d] at %llu\n", err, litmus_clock());
-	return err;
-}
-
 static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
+	struct mc2_task_state *tinfo;
 	struct mc2_cpu_state *state = local_cpu_state();
 
 	raw_spin_lock(&state->lock);
@@ -210,16 +180,6 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	state->sup_env.will_schedule = true;
 	sup_update_time(&state->sup_env, litmus_clock());
 
-	/* check if prev task complete */
-	if (is_realtime(prev)) {
-		TRACE_TASK(prev, "EXEC_TIME = %llu, EXEC_COST = %llu, REMAINED = %llu\n",
-			get_exec_time(prev), get_exec_cost(prev), get_exec_cost(prev)-get_exec_time(prev)); 
-	}
-	if (is_realtime(prev) && (get_exec_time(prev) >= get_exec_cost(prev))) {
-		TRACE_TASK(prev, "JOB COMPLETED! but is_completed = %d\n", is_completed(prev));
-//		mc2_complete_job(prev);
-	}
-
 	/* remove task from reservation if it blocks */
 	if (is_realtime(prev) && !is_running(prev))
 		task_departs(prev, is_completed(prev));
@@ -234,12 +194,17 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	state->sup_env.will_schedule = false;
 	/* NOTE: drops state->lock */
 	mc2_update_timer_and_unlock(state);
-	
-	if (prev != state->scheduled && is_realtime(prev))
+
+	if (prev != state->scheduled && is_realtime(prev)) {
 		TRACE_TASK(prev, "descheduled.\n");
-	if (state->scheduled)
+		state->run_level = NUM_CRIT_LEVELS;
+	}
+	if (state->scheduled) {
 		TRACE_TASK(state->scheduled, "scheduled.\n");
-
+		//tinfo = get_mc2_state(state->scheduled);
+		//state->run_level = tinfo->mc2_param.crit;
+	}
+	
 	return state->scheduled;
 }
 
@@ -279,6 +244,7 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
 		/* NOTE: drops state->lock */
+		TRACE("mc2_resume()\n");
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else {
@@ -289,6 +255,60 @@ static void mc2_task_resume(struct task_struct  *tsk)
 	resume_legacy_task_model_updates(tsk);
 }
 
+/* syscall backend for job completions */
+static long mc2_complete_job(void)
+{
+	ktime_t next_release;
+	long err;
+	struct mc2_cpu_state *state = local_cpu_state();
+	struct reservation_environment *env = &(state->sup_env.env);
+	struct mc2_task_state *tinfo = get_mc2_state(current);
+	
+	
+	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
+		get_deadline(current));
+
+	tsk_rt(current)->completed = 1;
+	
+	if (tsk_rt(current)->sporadic_release) {
+		env->time_zero = tsk_rt(current)->sporadic_release_time;
+	
+		if (tinfo->mc2_param.crit == CRIT_LEVEL_A) {
+			struct reservation *res;
+			struct table_driven_reservation *tdres;
+			
+			sup_update_time(&state->sup_env, litmus_clock());
+			res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+			tdres = container_of(res, struct table_driven_reservation, res);
+			tdres->next_interval = 0;
+			tdres->major_cycle_start = tsk_rt(current)->sporadic_release_time;
+			res->next_replenishment = tsk_rt(current)->sporadic_release_time;
+			res->next_replenishment += tdres->intervals[0].start;
+			res->env->change_state(res->env, res, RESERVATION_DEPLETED);
+						
+			TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+		}
+		
+	}
+		
+	prepare_for_next_period(current);
+	next_release = ns_to_ktime(get_release(current));
+	preempt_disable();
+	TRACE_CUR("next_release=%llu\n", get_release(current));
+	if (get_release(current) > litmus_clock()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		preempt_enable_no_resched();
+		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+	} else {
+		err = 0;
+		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
+		preempt_enable();
+	}
+
+	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
+	return err;
+}
+
 static long mc2_admit_task(struct task_struct *tsk)
 {
 	long err = -ESRCH;
@@ -297,10 +317,10 @@ static long mc2_admit_task(struct task_struct *tsk)
 	struct mc2_cpu_state *state;
 	struct mc2_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
 	struct mc2_task *mp = tsk_rt(tsk)->plugin_state;
-	
+
 	if (!tinfo)
 		return -ENOMEM;
-	
+
 	if (!mp) {
 		printk(KERN_ERR "mc2_admit_task: criticality level has not been set\n");
 		return err;
@@ -311,21 +331,22 @@ static long mc2_admit_task(struct task_struct *tsk)
 	state = cpu_state_for(task_cpu(tsk));
 	raw_spin_lock_irqsave(&state->lock, flags);
 
-	res = sup_find_by_id(&state->sup_env, mp->pid);
+	res = sup_find_by_id(&state->sup_env, mp->res_id);
 
 	/* found the appropriate reservation (or vCPU) */
 	if (res) {
-		TRACE_TASK(tsk, "FOUND RES\n");
-		tinfo->res_info.mc2.mc2_task.crit = mp->crit;
+		TRACE_TASK(tsk, "FOUND RES ID\n");
+		tinfo->mc2_param.crit = mp->crit;
+		tinfo->mc2_param.res_id = mp->res_id;
 		
 		kfree(tsk_rt(tsk)->plugin_state);
 		tsk_rt(tsk)->plugin_state = NULL;
-	
-		err = mc2_task_client_init(&tinfo->res_info, tsk, res);
+		
+		err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
 		tinfo->cpu = task_cpu(tsk);
 		tinfo->has_departed = true;
 		tsk_rt(tsk)->plugin_state = tinfo;
-		
+
 		/* disable LITMUS^RT's per-thread budget enforcement */
 		tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
 	}
@@ -340,21 +361,14 @@ static long mc2_admit_task(struct task_struct *tsk)
 	return err;
 }
 
-static void task_new_legacy_task_model_updates(struct task_struct *tsk)
-{
-	lt_t now = litmus_clock();
-
-	/* the first job exists starting as of right now */
-	release_at(tsk, now);
-}
-
 static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 			  int is_running)
 {
 	unsigned long flags;
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
 	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
-
+	struct reservation *res;
+	
 	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
 		   litmus_clock(), on_runqueue, is_running);
 
@@ -373,15 +387,69 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_arrives(tsk);
 		/* NOTE: drops state->lock */
+		TRACE("mc2_new()\n");
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 
-	task_new_legacy_task_model_updates(tsk);
+	res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+	release_at(tsk, res ? res->next_replenishment : litmus_clock()); /* res may be NULL: release now */
+	if (res)
+		TRACE_TASK(tsk, "next_replenishment = %llu\n", res->next_replenishment);
+	else
+		TRACE_TASK(tsk, "next_replenishment = NULL\n");
 }
 
-static long mc2_reservation_destroy(unsigned int reservation_id, int cpu);
+/* Unlink every reservation with @id from @list; caller holds state->lock.
+ * Returns 1 if at least one entry was removed, 0 otherwise.
+ * NOTE(review): the unlinked reservation is not freed here (the original
+ * had kfree() commented out) -- confirm who owns and releases it. */
+static int mc2_unlink_reservation(struct list_head *list, unsigned int id)
+{
+	struct reservation *res, *next;
+	int found = 0;
+
+	list_for_each_entry_safe(res, next, list, list) {
+		if (res->id == id) {
+			list_del(&res->list);
+			found = 1;
+		}
+	}
+	return found;
+}
+
+/* Plugin backend for reservation destruction: unlink @reservation_id from
+ * @cpu's depleted, inactive or active reservation list (searched in that
+ * order; later lists are skipped once a match was removed).
+ * Returns 0 if it was found and unlinked, -EINVAL otherwise. */
+static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
+{
+	long ret = -EINVAL;
+	unsigned long flags;
+	struct mc2_cpu_state *state;
+	struct sup_reservation_environment *sup_env;
+
+	/* cpu is caller-supplied: never index per-CPU state with a bogus id */
+	if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu))
+		goto out;
+
+	state = cpu_state_for(cpu);
+	sup_env = &state->sup_env;
+
+	/* admit/new/exit take state->lock with IRQs saved; do the same so an
+	 * interrupt handler that needs the lock cannot deadlock against us. */
+	raw_spin_lock_irqsave(&state->lock, flags);
+	if (mc2_unlink_reservation(&sup_env->depleted_reservations, reservation_id) ||
+	    mc2_unlink_reservation(&sup_env->inactive_reservations, reservation_id) ||
+	    mc2_unlink_reservation(&sup_env->active_reservations, reservation_id))
+		ret = 0;
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+
+out:
+	TRACE("RESERVATION_DESTROY ret = %ld\n", ret);
+	return ret;
+}
 
 static void mc2_task_exit(struct task_struct *tsk)
 {
@@ -401,65 +469,31 @@ static void mc2_task_exit(struct task_struct *tsk)
 		sup_update_time(&state->sup_env, litmus_clock());
 		task_departs(tsk, 0);
 		/* NOTE: drops state->lock */
+		TRACE("mc2_exit()\n");
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
 		raw_spin_unlock_irqrestore(&state->lock, flags);
-
-	mc2_reservation_destroy(tsk->pid, tinfo->cpu);
-	
+/*
+	if (tinfo->mc2_param.crit == CRIT_LEVEL_A) {
+		struct table_driven_reservation *td_res;
+		struct reservation *res;
+		res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+		td_res = container_of(res, struct table_driven_reservation, res);
+		kfree(td_res->intervals);
+		//kfree(td_res);
+	} else if (tinfo->mc2_param.crit == CRIT_LEVEL_B) {
+		struct polling_reservation *pres;
+		struct reservation *res;
+		res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+		pres = container_of(res, struct polling_reservation, res);
+		kfree(pres);
+	}
+*/
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
 }
 
-asmlinkage long sys_set_mc2_task_param(pid_t pid, struct mc2_task __user * param)
-{
-	struct task_struct *target;
-	int retval = -EINVAL;
-	struct mc2_task *mp = kzalloc(sizeof(*mp), GFP_KERNEL);
-	
-	if (!mp)
-		return -ENOMEM;
-
-	printk("Setting up mc^2 task parameters for process %d.\n", pid);
-
-	if (pid < 0 || param == 0) {
-		goto out;
-	}
-	if (copy_from_user(mp, param, sizeof(*mp))) {
-		retval = -EFAULT;
-		goto out;
-	}
-
-	/* Task search and manipulation must be protected */
-	read_lock_irq(&tasklist_lock);
-	if (!(target = find_task_by_vpid(pid))) {
-		retval = -ESRCH;
-		goto out_unlock;
-	}
-
-	if (is_realtime(target)) {
-		/* The task is already a real-time task.
-		 * We cannot not allow parameter changes at this point.
-		 */
-		retval = -EBUSY;
-		goto out_unlock;
-	}
-	if (mp->crit < CRIT_LEVEL_A || mp->crit >= NUM_CRIT_LEVELS) {
-		printk(KERN_INFO "litmus: real-time task %d rejected "
-			"because of invalid criticality level\n", pid);
-		goto out_unlock;
-	}
-	
-	target->rt_param.plugin_state = mp;
-
-	retval = 0;
-      out_unlock:
-	read_unlock_irq(&tasklist_lock);
-      out:
-	return retval;
-}
-
 static long create_polling_reservation(
 	int res_type,
 	struct reservation_config *config)
@@ -566,10 +600,6 @@ static long create_table_driven_reservation(
 		err = copy_from_user(slots,
 			config->table_driven_params.intervals, slots_size);
 
-	for (i=0; i<num_slots;i++) {
-		TRACE("###### [%llu, %llu]\n", slots[i].start, slots[i].end);
-	}
-	
 	if (!err) {
 		/* sanity checks */
 		for (i = 0; !err && i < num_slots; i++)
@@ -628,7 +658,6 @@ static long create_table_driven_reservation(
 		kfree(td_res);
 	}
 
-	TRACE("CREATE_TABLE_DRIVEN_RES = %d\n", err);
 	return err;
 }
 
@@ -665,54 +694,6 @@ static long mc2_reservation_create(int res_type, void* __user _config)
 	return ret;
 }
 
-static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
-{
-	long ret = -EINVAL;
-	struct mc2_cpu_state *state;
-	struct reservation *res, *next;
-	struct sup_reservation_environment *sup_env;
-	unsigned long flags;
-	int found = 0;
-	
-	state = cpu_state_for(cpu);
-	raw_spin_lock_irqsave(&state->lock, flags);
-	
-	//res = sup_find_by_id(&state->sup_env, reservation_id);
-	sup_env = &state->sup_env;
-	//if (!res) {
-	list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
-		if (res->id == reservation_id) {
-			list_del(&res->list);
-			found = 1;
-			ret = 0;
-		}
-	}
-	if (!found) {
-		list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
-			if (res->id == reservation_id) {
-				list_del(&res->list);
-				found = 1;
-				ret = 0;
-			}
-		}
-	}
-	if (!found) {
-		list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
-			if (res->id == reservation_id) {
-				list_del(&res->list);
-				found = 1;
-				ret = 0;
-			}
-		}
-	}
-	//}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
-	
-	TRACE("RESERVATION_DESTROY ret = %d\n", ret);
-	return ret;
-}
-
 static struct domain_proc_info mc2_domain_proc_info;
 
 static long mc2_get_domain_proc_info(struct domain_proc_info **ret)
@@ -756,11 +737,6 @@ static long mc2_activate_plugin(void)
 
 		state = cpu_state_for(cpu);
 
-#ifdef CONFIG_RELEASE_MASTER
-		state->release_master = atomic_read(&release_master_cpu);
-		hrtimer_start_on_info_init(&state->info);
-#endif
-		
 		raw_spin_lock_init(&state->lock);
 		state->cpu = cpu;
 		state->scheduled = NULL;
@@ -823,15 +799,15 @@ static long mc2_deactivate_plugin(void)
 }
 
 static struct sched_plugin mc2_plugin = {
-	.plugin_name		= "MC2",
-	.schedule		= mc2_schedule,
-	.task_wake_up		= mc2_task_resume,
-	.admit_task		= mc2_admit_task,
-	.task_new		= mc2_task_new,
-	.task_exit		= mc2_task_exit,
+	.plugin_name			= "MC2",
+	.schedule				= mc2_schedule,
+	.task_wake_up			= mc2_task_resume,
+	.admit_task				= mc2_admit_task,
+	.task_new				= mc2_task_new,
+	.task_exit				= mc2_task_exit,
 	.complete_job           = mc2_complete_job,
 	.get_domain_proc_info   = mc2_get_domain_proc_info,
-	.activate_plugin	= mc2_activate_plugin,
+	.activate_plugin		= mc2_activate_plugin,
 	.deactivate_plugin      = mc2_deactivate_plugin,
 	.reservation_create     = mc2_reservation_create,
 	.reservation_destroy	= mc2_reservation_destroy,
@@ -843,4 +819,3 @@ static int __init init_mc2(void)
 }
 
 module_init(init_mc2);
-
diff --git a/litmus/sched_pres.c b/litmus/sched_pres.c
index e2d4810..6c636cc 100644
--- a/litmus/sched_pres.c
+++ b/litmus/sched_pres.c
@@ -14,6 +14,8 @@
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
+//static int testval = 0;
+
 struct pres_task_state {
 	struct task_client res_info;
 	int cpu;
@@ -172,6 +174,11 @@ static struct task_struct* pres_schedule(struct task_struct * prev)
 	BUG_ON(state->scheduled && state->scheduled != prev);
 	BUG_ON(state->scheduled && !is_realtime(prev));
 
+//if (testval == 1) {
+//	testval = 0;
+//	printk(KERN_INFO "TESTVAL = 1 at %llu\n", litmus_clock());
+//}
+	
 	/* update time */
 	state->sup_env.will_schedule = true;
 	sup_update_time(&state->sup_env, litmus_clock());
@@ -250,6 +257,8 @@ static long pres_complete_job(void)
 {
 	ktime_t next_release;
 	long err;
+//testval = 1;
+//printk(KERN_INFO "pres_complete_job at %llu (deadline: %llu)\n", litmus_clock(), get_deadline(current));
 
 	TRACE_CUR("pres_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
 		get_deadline(current));
@@ -270,6 +279,7 @@ static long pres_complete_job(void)
 	}
 
 	TRACE_CUR("pres_complete_job returns at %llu\n", litmus_clock());
+//printk(KERN_INFO "pres_complete_job returns at %llu\n", litmus_clock());	
 	return err;
 }
 
-- 
1.8.1.2


From 5be3aecdd4b8b0beb981cc0f7fc84b0d0ded2c47 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sat, 24 Jan 2015 04:55:04 -0500
Subject: [PATCH 083/119] slack stealing

---
 include/litmus/mc2_common.h   |   2 +
 include/litmus/reservation.h  |  44 ++++++++++
 include/litmus/rt_param.h     |   4 +
 litmus/jobs.c                 |   1 +
 litmus/polling_reservations.c |  28 +++++--
 litmus/reservation.c          |  16 ++--
 litmus/sched_mc2.c            | 185 +++++++++++++++++++++++++++++++++++-------
 7 files changed, 237 insertions(+), 43 deletions(-)

diff --git a/include/litmus/mc2_common.h b/include/litmus/mc2_common.h
index bdc3a6d..e3c0af2 100644
--- a/include/litmus/mc2_common.h
+++ b/include/litmus/mc2_common.h
@@ -21,6 +21,8 @@ struct mc2_task {
 
 #include <litmus/reservation.h>
 
+#define tsk_mc2_data(t)		(tsk_rt(t)->mc2_data)
+
 long mc2_task_client_init(struct task_client *tc, struct mc2_task *mc2_param, struct task_struct *tsk,
 							struct reservation *res);
 	
diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index 4eecd3f..5ccb200 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -126,6 +126,9 @@ struct reservation {
 	struct reservation_ops *ops;
 
 	struct list_head clients;
+	
+	/* for global env. */
+	int scheduled_on;
 };
 
 void reservation_init(struct reservation *res);
@@ -185,10 +188,51 @@ struct sup_reservation_environment {
 void sup_init(struct sup_reservation_environment* sup_env);
 void sup_add_new_reservation(struct sup_reservation_environment* sup_env,
 	struct reservation* new_res);
+void sup_scheduler_update_after(struct sup_reservation_environment* sup_env,
+	lt_t timeout);
 void sup_update_time(struct sup_reservation_environment* sup_env, lt_t now);
 struct task_struct* sup_dispatch(struct sup_reservation_environment* sup_env);
 
 struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
 	unsigned int id);
+	
+/* A global multiprocessor reservation environment. */
 
+struct next_timer_event {
+	lt_t next_update;
+	int timer_armed_on;
+	unsigned int id;
+	struct list_head list;
+};
+
+struct gmp_reservation_environment {
+	raw_spinlock_t lock;
+	struct reservation_environment env;
+
+	/* ordered by priority */
+	struct list_head active_reservations;
+
+	/* ordered by next_replenishment */
+	struct list_head depleted_reservations;
+
+	/* unordered */
+	struct list_head inactive_reservations;
+
+	/* timer event ordered by next_update */
+	struct list_head next_events;
+	/* (schedule_now == true) means call gmp_dispatch() now */
+	bool schedule_now;
+	/* set to true if a call to gmp_dispatch() is imminent */
+	bool will_schedule;
+};
+/*
+void gmp_init(struct gmp_reservation_environment* gmp_env);
+void gmp_add_new_reservation(struct gmp_reservation_environment* gmp_env,
+	struct reservation* new_res);
+void gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
+struct task_struct* gmp_dispatch(struct gmp_reservation_environment* gmp_env);
+
+struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
+	unsigned int id);
+*/
 #endif
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index e626bbb..284b89e 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -206,6 +206,7 @@ struct rt_job {
 };
 
 struct pfair_param;
+struct mc2_task;
 
 /*	RT task parameters for scheduling extensions
  *	These parameters are inherited during clone and therefore must
@@ -322,6 +323,9 @@ struct rt_param {
 
 	/* Pointer to the page shared between userspace and kernel. */
 	struct control_page * ctrl_page;
+	
+	/* Mixed-criticality specific data */
+	struct mc2_task* mc2_data;
 };
 
 #endif
diff --git a/litmus/jobs.c b/litmus/jobs.c
index 547222c..e523e29 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -45,6 +45,7 @@ void release_at(struct task_struct *t, lt_t start)
 {
 	BUG_ON(!t);
 	setup_release(t, start);
+	TRACE("RELEASE!!\n");
 	tsk_rt(t)->completed = 0;
 }
 
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 4c07ee7..941a371 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -19,11 +19,19 @@ static void periodic_polling_client_arrives(
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 			/* Figure out next replenishment time. */
-			tmp = res->env->current_time - res->env->time_zero;
-			instances =  div64_u64(tmp, pres->period);
-			res->next_replenishment =
-				(instances + 1) * pres->period + pres->offset;
-
+			if (res->env->time_zero == 0) {
+				tmp = res->env->current_time - res->env->time_zero;
+				instances =  div64_u64(tmp, pres->period);
+				res->next_replenishment =
+					(instances + 1) * pres->period + pres->offset;
+			}
+			else {
+				tmp = res->env->current_time - res->env->time_zero;
+				instances =  div64_u64(tmp, pres->period);
+				res->next_replenishment = res->env->time_zero + instances * pres->period;
+			}
+				
+			TRACE("ENV_TIME_ZERO %llu\n", res->env->time_zero);
 			TRACE("pol-res: activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
 				tmp, instances, pres->period, res->next_replenishment,
 				res->env->current_time);
@@ -62,9 +70,10 @@ static void periodic_polling_client_departs(
 		case RESERVATION_ACTIVE:
 			if (list_empty(&res->clients)) {
 				res->env->change_state(res->env, res,
-					did_signal_job_completion ?
-						RESERVATION_DEPLETED :
 						RESERVATION_ACTIVE_IDLE);
+//					did_signal_job_completion ?
+//						RESERVATION_DEPLETED :
+//						RESERVATION_ACTIVE_IDLE);
 			} /* else: nothing to do, more clients ready */
 			break;
 
@@ -86,6 +95,7 @@ static void periodic_polling_on_replenishment(
 	res->next_replenishment += pres->period;
 	res->budget_consumed = 0;
 
+	TRACE("polling_replenish(%u): next_replenishment=%llu\n", res->id, res->next_replenishment);
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
@@ -270,6 +280,7 @@ void polling_reservation_init(
 	pres->period = period;
 	pres->deadline = deadline;
 	pres->offset = offset;
+	TRACE_TASK(current, "polling_reservation_init: periodic %d, use_edf %d\n", use_periodic_polling, use_edf_prio);
 	if (use_periodic_polling) {
 		if (use_edf_prio)
 			pres->res.ops = &periodic_polling_ops_edf;
@@ -460,7 +471,8 @@ static void td_drain_budget(
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
-			BUG();
+			//BUG();
+			TRACE("TD_DRAIN!!!!!!!!! RES_STATE = %d\n", res->state);
 			break;
 
 		case RESERVATION_ACTIVE_IDLE:
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 0e43479..2dc3dc2 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -48,11 +48,12 @@ static void sup_scheduler_update_at(
 	struct sup_reservation_environment* sup_env,
 	lt_t when)
 {
+	TRACE("SCHEDULER_UPDATE_AT update: %llu > when %llu\n", sup_env->next_scheduler_update, when);
 	if (sup_env->next_scheduler_update > when)
 		sup_env->next_scheduler_update = when;
 }
 
-static void sup_scheduler_update_after(
+void sup_scheduler_update_after(
 	struct sup_reservation_environment* sup_env,
 	lt_t timeout)
 {
@@ -192,10 +193,13 @@ static void sup_charge_budget(
 		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
-			res->ops->drain_budget(res, delta);
+			TRACE("sup_charge_budget ACTIVE R%u drain %llu\n", res->id, delta);
+			if (encountered_active == 0)
+				res->ops->drain_budget(res, delta);
 			encountered_active = 1;
 		} else {
 			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
+			TRACE("sup_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
 			res->ops->drain_budget(res, delta);
 		}
 		if (res->state == RESERVATION_ACTIVE ||
@@ -207,9 +211,9 @@ static void sup_charge_budget(
 				res->id, res->cur_budget);
 			 sup_scheduler_update_after(sup_env, res->cur_budget);
 		}
-		if (encountered_active)
+		//if (encountered_active == 2)
 			/* stop at the first ACTIVE reservation */
-			break;
+		//	break;
 	}
 	//TRACE("finished charging budgets\n");
 }
@@ -246,7 +250,7 @@ void sup_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
+	TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= sup_env->env.current_time))
 		return;
 
@@ -258,9 +262,11 @@ void sup_update_time(
 		sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
 
 	/* deplete budgets by passage of time */
+	TRACE("CHARGE###\n");
 	sup_charge_budget(sup_env, delta);
 
 	/* check if any budgets where replenished */
+	TRACE("REPLENISH###\n");
 	sup_replenish_budgets(sup_env);
 }
 
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index b9f0523..6b29d52 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -22,15 +22,22 @@ struct mc2_task_state {
 	struct mc2_task mc2_param;
 };
 
+struct crit_entry {
+	enum crit_level level;
+	struct task_struct *running;
+	struct hrtimer ghost_timer;
+};
+
 struct mc2_cpu_state {
 	raw_spinlock_t lock;
 
 	struct sup_reservation_environment sup_env;
 	struct hrtimer timer;
+	struct hrtimer g_timer;
 
 	int cpu;
 	struct task_struct* scheduled;
-	enum crit_level run_level;
+	struct crit_entry crit_entries[NUM_CRIT_LEVELS];
 };
 
 static DEFINE_PER_CPU(struct mc2_cpu_state, mc2_cpu_state);
@@ -42,30 +49,53 @@ static struct mc2_task_state* get_mc2_state(struct task_struct *tsk)
 {
 	return (struct mc2_task_state*) tsk_rt(tsk)->plugin_state;
 }
+static enum crit_level get_task_crit_level(struct task_struct *tsk)
+{
+	struct mc2_task_state *tinfo = get_mc2_state(tsk);
+	if (!tinfo)
+		return NUM_CRIT_LEVELS;
+	else
+		return tinfo->mc2_param.crit;
+}
 
 static void task_departs(struct task_struct *tsk, int job_complete)
 {
-	struct mc2_task_state* state = get_mc2_state(tsk);
+	struct mc2_task_state* tinfo = get_mc2_state(tsk);
+	struct mc2_cpu_state* state = local_cpu_state();
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.client.reservation;
-	client = &state->res_info.client;
+	res    = tinfo->res_info.client.reservation;
+	client = &tinfo->res_info.client;
 
 	res->ops->client_departs(res, client, job_complete);
-	state->has_departed = true;
+	tinfo->has_departed = true;
+	TRACE_TASK(tsk, "CLIENT DEPART with budget %llu\n", res->cur_budget);
+	if (job_complete && res->cur_budget) {
+		struct crit_entry* ce;
+		enum crit_level lv = tinfo->mc2_param.crit;
+		//lt_t now = litmus_clock();
+		
+		ce = &state->crit_entries[lv];
+		ce->running = tsk;
+		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
+		
+		BUG_ON(hrtimer_active(&ce->ghost_timer));
+		//TRACE("setting GHOST timer %llu\n", ns_to_ktime(now + res->cur_budget));
+		//__hrtimer_start_range_ns(&ce->ghost_timer, ns_to_ktime(now + res->cur_budget), 0, HRTIMER_MODE_ABS_PINNED, 0);
+	}		
 }
 
 static void task_arrives(struct task_struct *tsk)
 {
-	struct mc2_task_state* state = get_mc2_state(tsk);
+	struct mc2_task_state* tinfo = get_mc2_state(tsk);
 	struct reservation* res;
 	struct reservation_client *client;
 
-	res    = state->res_info.client.reservation;
-	client = &state->res_info.client;
+	res    = tinfo->res_info.client.reservation;
+	client = &tinfo->res_info.client;
 
-	state->has_departed = false;
+	tinfo->has_departed = false;
 	res->ops->client_arrives(res, client);
 }
 
@@ -94,7 +124,7 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 		/* Reprogram only if not already set correctly. */
 		if (!hrtimer_active(&state->timer) ||
 		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
-			TRACE("canceling timer...\n");
+			TRACE("canceling timer...at %llu\n", ktime_to_ns(hrtimer_get_expires(&state->timer)));
 			hrtimer_cancel(&state->timer);
 			TRACE("setting scheduler timer for %llu\n", update);
 			/* We cannot use hrtimer_start() here because the
@@ -127,6 +157,49 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	}
 }
 
+static void mc2_update_ghost_state(struct mc2_cpu_state *state)
+{
+	int lv = 0;
+	struct crit_entry* ce;
+	struct reservation *res;
+	struct mc2_task_state *tinfo;
+	
+	for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
+		ce = &state->crit_entries[lv];
+		if (ce->running != NULL) {
+			tinfo = get_mc2_state(ce->running);
+			if (lv != CRIT_LEVEL_C)
+				res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+			else
+				continue;
+			TRACE("LV %d running id %d budget %llu\n", lv, tinfo->mc2_param.res_id, res->cur_budget);
+			if (!res->cur_budget) {
+				TRACE("GHOST FINISH id %d at %llu\n", tinfo->mc2_param.res_id, litmus_clock());
+				ce->running = NULL;
+			}
+		}
+	}
+}			
+
+static enum hrtimer_restart on_ghost_timer(struct hrtimer *timer)
+{
+	struct crit_entry *ce;
+	struct mc2_cpu_state *state;
+	
+	ce = container_of(timer, struct crit_entry, ghost_timer);
+	state = container_of(ce, struct mc2_cpu_state, crit_entries[ce->level]);
+	
+	TRACE("GHOST_TIMER FIRED at %llu\n", litmus_clock());
+	
+	raw_spin_lock(&state->lock);
+	sup_update_time(&state->sup_env, litmus_clock());
+	mc2_update_ghost_state(state);
+	
+	raw_spin_unlock(&state->lock);
+	
+	return HRTIMER_NORESTART;
+}
+	
 static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
@@ -144,9 +217,11 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	 */
 	BUG_ON(state->cpu != raw_smp_processor_id());
 
+	TRACE("TIMER FIRED at %llu\n", litmus_clock());
 	raw_spin_lock_irqsave(&state->lock, flags);
 	sup_update_time(&state->sup_env, litmus_clock());
-
+	mc2_update_ghost_state(state);
+	
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
 
@@ -165,6 +240,36 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	return restart;
 }
 
+struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, struct mc2_cpu_state* state)
+{
+	struct reservation *res, *next;
+	struct task_struct *tsk = NULL;
+	struct crit_entry *ce;
+	enum crit_level lv;
+	lt_t time_slice;
+
+	list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
+		if (res->state == RESERVATION_ACTIVE) {
+			tsk = res->ops->dispatch_client(res, &time_slice);
+			if (likely(tsk)) {
+				lv = get_task_crit_level(tsk);
+				if (lv == NUM_CRIT_LEVELS) {
+					sup_scheduler_update_after(sup_env, res->cur_budget);
+					return tsk;
+				} else {
+					ce = &state->crit_entries[lv];
+					if (likely(!ce->running)) {
+						sup_scheduler_update_after(sup_env, res->cur_budget);
+						return tsk;
+					}
+				}
+			}
+		}
+	}
+
+	return NULL;
+}
+
 static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
@@ -178,14 +283,17 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 
 	/* update time */
 	state->sup_env.will_schedule = true;
+	TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time ####\n");
 	sup_update_time(&state->sup_env, litmus_clock());
-
+	TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time !!!!\n");
+	mc2_update_ghost_state(state);
+	
 	/* remove task from reservation if it blocks */
 	if (is_realtime(prev) && !is_running(prev))
 		task_departs(prev, is_completed(prev));
 
 	/* figure out what to schedule next */
-	state->scheduled = sup_dispatch(&state->sup_env);
+	state->scheduled = mc2_dispatch(&state->sup_env, state);
 
 	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
 	sched_state_task_picked();
@@ -197,7 +305,6 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 
 	if (prev != state->scheduled && is_realtime(prev)) {
 		TRACE_TASK(prev, "descheduled.\n");
-		state->run_level = NUM_CRIT_LEVELS;
 	}
 	if (state->scheduled) {
 		TRACE_TASK(state->scheduled, "scheduled.\n");
@@ -242,9 +349,10 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		 * since we might not actually be executing on tinfo->cpu
 		 * at the moment. */
 		sup_update_time(&state->sup_env, litmus_clock());
+		mc2_update_ghost_state(state);
 		task_arrives(tsk);
 		/* NOTE: drops state->lock */
-		TRACE("mc2_resume()\n");
+		TRACE_TASK(tsk, "mc2_resume()\n");
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else {
@@ -263,34 +371,36 @@ static long mc2_complete_job(void)
 	struct mc2_cpu_state *state = local_cpu_state();
 	struct reservation_environment *env = &(state->sup_env.env);
 	struct mc2_task_state *tinfo = get_mc2_state(current);
+	struct reservation *res;
 	
+	res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+	if (!res)
+		; // find in global env
 	
-	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
-		get_deadline(current));
+	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu) (cur->budget: %llu)\n", litmus_clock(),
+		get_deadline(current), res->cur_budget);
 
 	tsk_rt(current)->completed = 1;
 	
 	if (tsk_rt(current)->sporadic_release) {
 		env->time_zero = tsk_rt(current)->sporadic_release_time;
-	
+		res->next_replenishment = tsk_rt(current)->sporadic_release_time;
+		res->cur_budget = 0;
+		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
+		
 		if (tinfo->mc2_param.crit == CRIT_LEVEL_A) {
-			struct reservation *res;
 			struct table_driven_reservation *tdres;
 			
-			sup_update_time(&state->sup_env, litmus_clock());
-			res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
+			//sup_update_time(&state->sup_env, litmus_clock());
+			//res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
 			tdres = container_of(res, struct table_driven_reservation, res);
 			tdres->next_interval = 0;
 			tdres->major_cycle_start = tsk_rt(current)->sporadic_release_time;
-			res->next_replenishment = tsk_rt(current)->sporadic_release_time;
-			res->next_replenishment += tdres->intervals[0].start;
-			res->env->change_state(res->env, res, RESERVATION_DEPLETED);
-						
-			TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+			res->next_replenishment += tdres->intervals[0].start;			
 		}
-		
+		TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
 	}
-		
+	
 	prepare_for_next_period(current);
 	next_release = ns_to_ktime(get_release(current));
 	preempt_disable();
@@ -385,6 +495,7 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
+		mc2_update_ghost_state(state);
 		task_arrives(tsk);
 		/* NOTE: drops state->lock */
 		TRACE("mc2_new()\n");
@@ -456,18 +567,26 @@ static void mc2_task_exit(struct task_struct *tsk)
 	unsigned long flags;
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
 	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+	enum crit_level lv = tinfo->mc2_param.crit;
+	struct crit_entry* ce;	
 
 	raw_spin_lock_irqsave(&state->lock, flags);
 
 	if (state->scheduled == tsk)
 		state->scheduled = NULL;
 
+	ce = &state->crit_entries[lv];
+	if (ce->running == tsk)
+		ce->running = NULL;
+	
 	/* remove from queues */
 	if (is_running(tsk)) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
+		mc2_update_ghost_state(state);
 		task_departs(tsk, 0);
+		
 		/* NOTE: drops state->lock */
 		TRACE("mc2_exit()\n");
 		mc2_update_timer_and_unlock(state);
@@ -729,7 +848,7 @@ static void mc2_setup_domain_proc(void)
 
 static long mc2_activate_plugin(void)
 {
-	int cpu;
+	int cpu, lv;
 	struct mc2_cpu_state *state;
 
 	for_each_online_cpu(cpu) {
@@ -740,7 +859,13 @@ static long mc2_activate_plugin(void)
 		raw_spin_lock_init(&state->lock);
 		state->cpu = cpu;
 		state->scheduled = NULL;
-
+		for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
+			struct crit_entry *ce = &state->crit_entries[lv];
+			ce->level = lv;
+			ce->running = NULL;
+			hrtimer_init(&ce->ghost_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+			ce->ghost_timer.function = on_ghost_timer;
+		}
 		sup_init(&state->sup_env);
 
 		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-- 
1.8.1.2


From 34fe51ed2dc210e87bfa5d85ab98c5125495f002 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sat, 24 Jan 2015 05:22:15 -0500
Subject: [PATCH 084/119] Removed ghost_timer

---
 litmus/sched_mc2.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 6b29d52..499f770 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -25,7 +25,7 @@ struct mc2_task_state {
 struct crit_entry {
 	enum crit_level level;
 	struct task_struct *running;
-	struct hrtimer ghost_timer;
+	//struct hrtimer ghost_timer;
 };
 
 struct mc2_cpu_state {
@@ -80,7 +80,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		ce->running = tsk;
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
-		BUG_ON(hrtimer_active(&ce->ghost_timer));
+		//BUG_ON(hrtimer_active(&ce->ghost_timer));
 		//TRACE("setting GHOST timer %llu\n", ns_to_ktime(now + res->cur_budget));
 		//__hrtimer_start_range_ns(&ce->ghost_timer, ns_to_ktime(now + res->cur_budget), 0, HRTIMER_MODE_ABS_PINNED, 0);
 	}		
@@ -181,6 +181,7 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 	}
 }			
 
+/*
 static enum hrtimer_restart on_ghost_timer(struct hrtimer *timer)
 {
 	struct crit_entry *ce;
@@ -199,7 +200,8 @@ static enum hrtimer_restart on_ghost_timer(struct hrtimer *timer)
 	
 	return HRTIMER_NORESTART;
 }
-	
+*/
+
 static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
@@ -863,8 +865,8 @@ static long mc2_activate_plugin(void)
 			struct crit_entry *ce = &state->crit_entries[lv];
 			ce->level = lv;
 			ce->running = NULL;
-			hrtimer_init(&ce->ghost_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-			ce->ghost_timer.function = on_ghost_timer;
+			//hrtimer_init(&ce->ghost_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+			//ce->ghost_timer.function = on_ghost_timer;
 		}
 		sup_init(&state->sup_env);
 
-- 
1.8.1.2


From ca538aafd7cebfd09a47af0a628647620a6bba35 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sat, 24 Jan 2015 13:24:12 -0500
Subject: [PATCH 085/119] Reservation destroy

---
 include/litmus/reservation.h  |  8 +++++---
 litmus/polling_reservations.c |  1 +
 litmus/reservation.c          |  6 ++++--
 litmus/sched_mc2.c            | 47 ++++++++++++++++++++++++++++++++++++-------
 4 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index 5ccb200..0e656ad 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -129,6 +129,8 @@ struct reservation {
 	
 	/* for global env. */
 	int scheduled_on;
+	/* for blocked by ghost */
+	int blocked_by_ghost;
 };
 
 void reservation_init(struct reservation *res);
@@ -225,14 +227,14 @@ struct gmp_reservation_environment {
 	/* set to true if a call to gmp_dispatch() is imminent */
 	bool will_schedule;
 };
-/*
+
 void gmp_init(struct gmp_reservation_environment* gmp_env);
 void gmp_add_new_reservation(struct gmp_reservation_environment* gmp_env,
 	struct reservation* new_res);
 void gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
 struct task_struct* gmp_dispatch(struct gmp_reservation_environment* gmp_env);
-
+struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
 struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
 	unsigned int id);
-*/
+
 #endif
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index 941a371..ec5cadd 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -46,6 +46,7 @@ static void periodic_polling_client_arrives(
 			break;
 
 		case RESERVATION_ACTIVE_IDLE:
+			res->blocked_by_ghost = 0;
 			res->env->change_state(res->env, res,
 				RESERVATION_ACTIVE);
 			break;
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 2dc3dc2..16b3a48 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -194,9 +194,11 @@ static void sup_charge_budget(
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
 			TRACE("sup_charge_budget ACTIVE R%u drain %llu\n", res->id, delta);
-			if (encountered_active == 0)
+			if (encountered_active == 0 && res->blocked_by_ghost == 0) {
+				TRACE("DRAIN !!\n");
 				res->ops->drain_budget(res, delta);
-			encountered_active = 1;
+				encountered_active = 1;
+			}			
 		} else {
 			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
 			TRACE("sup_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 499f770..0c26019 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -86,17 +86,25 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 	}		
 }
 
-static void task_arrives(struct task_struct *tsk)
+static void task_arrives(struct mc2_cpu_state *state, struct task_struct *tsk)
 {
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
 	struct reservation* res;
 	struct reservation_client *client;
+	enum crit_level lv = get_task_crit_level(tsk);
 
 	res    = tinfo->res_info.client.reservation;
 	client = &tinfo->res_info.client;
 
 	tinfo->has_departed = false;
 	res->ops->client_arrives(res, client);
+	
+	if (lv != NUM_CRIT_LEVELS) {
+		struct crit_entry *ce;
+		ce = &state->crit_entries[lv];
+		if (ce->running == tsk)
+			ce->running = NULL;
+	}
 }
 
 /* NOTE: drops state->lock */
@@ -174,8 +182,13 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 				continue;
 			TRACE("LV %d running id %d budget %llu\n", lv, tinfo->mc2_param.res_id, res->cur_budget);
 			if (!res->cur_budget) {
+				struct sup_reservation_environment* sup_env = &state->sup_env;
+				
 				TRACE("GHOST FINISH id %d at %llu\n", tinfo->mc2_param.res_id, litmus_clock());
 				ce->running = NULL;
+				res = list_first_entry_or_null(&sup_env->active_reservations, struct reservation, list);
+				if (res)
+					litmus_reschedule_local();
 			}
 		}
 	}
@@ -262,7 +275,10 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 					ce = &state->crit_entries[lv];
 					if (likely(!ce->running)) {
 						sup_scheduler_update_after(sup_env, res->cur_budget);
+						res->blocked_by_ghost = 0;
 						return tsk;
+					} else {
+						res->blocked_by_ghost = 1;
 					}
 				}
 			}
@@ -275,7 +291,6 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
-	struct mc2_task_state *tinfo;
 	struct mc2_cpu_state *state = local_cpu_state();
 
 	raw_spin_lock(&state->lock);
@@ -352,7 +367,7 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		 * at the moment. */
 		sup_update_time(&state->sup_env, litmus_clock());
 		mc2_update_ghost_state(state);
-		task_arrives(tsk);
+		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE_TASK(tsk, "mc2_resume()\n");
 		mc2_update_timer_and_unlock(state);
@@ -498,7 +513,7 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		 * [see comment in pres_task_resume()] */
 		sup_update_time(&state->sup_env, litmus_clock());
 		mc2_update_ghost_state(state);
-		task_arrives(tsk);
+		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE("mc2_new()\n");
 		mc2_update_timer_and_unlock(state);
@@ -521,6 +536,7 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	struct reservation *res = NULL, *next;
 	struct sup_reservation_environment *sup_env;
 	int found = 0;
+	enum crit_level lv = get_task_crit_level(current);
 		
 	state = cpu_state_for(cpu);
 	raw_spin_lock(&state->lock);
@@ -530,8 +546,13 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	//if (!res) {
 	list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
 		if (res->id == reservation_id) {
+			if (lv == CRIT_LEVEL_A) {
+				struct table_driven_reservation *tdres;
+				tdres = container_of(res, struct table_driven_reservation, res);
+				kfree(tdres->intervals);
+			}
 			list_del(&res->list);
-			//kfree(res);
+			kfree(res);
 			found = 1;
 			ret = 0;
 		}
@@ -539,8 +560,13 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	if (!found) {
 		list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
 			if (res->id == reservation_id) {
+				if (lv == CRIT_LEVEL_A) {
+					struct table_driven_reservation *tdres;
+					tdres = container_of(res, struct table_driven_reservation, res);
+					kfree(tdres->intervals);
+				}
 				list_del(&res->list);
-				//kfree(res);
+				kfree(res);
 				found = 1;
 				ret = 0;
 			}
@@ -549,8 +575,13 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	if (!found) {
 		list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
 			if (res->id == reservation_id) {
+				if (lv == CRIT_LEVEL_A) {
+					struct table_driven_reservation *tdres;
+					tdres = container_of(res, struct table_driven_reservation, res);
+					kfree(tdres->intervals);
+				}
 				list_del(&res->list);
-				//kfree(res);
+				kfree(res);
 				found = 1;
 				ret = 0;
 			}
@@ -665,6 +696,7 @@ static long create_polling_reservation(
 			config->polling_params.relative_deadline,
 			config->polling_params.offset);
 		pres->res.id = config->id;
+		pres->res.blocked_by_ghost = 0;
 		if (!use_edf)
 			pres->res.priority = config->priority;
 		sup_add_new_reservation(&state->sup_env, &pres->res);
@@ -765,6 +797,7 @@ static long create_table_driven_reservation(
 				slots, num_slots);
 			td_res->res.id = config->id;
 			td_res->res.priority = config->priority;
+			td_res->res.blocked_by_ghost = 0;
 			sup_add_new_reservation(&state->sup_env, &td_res->res);
 			err = config->id;
 		} else {
-- 
1.8.1.2


From 5ba38eb6290a0c1767932c03b15edb0627ffd6b2 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 28 Jan 2015 09:26:59 -0500
Subject: [PATCH 086/119] LV c

---
 include/litmus/reservation.h  |  22 +-
 litmus/mc2_common.c           |   3 +-
 litmus/polling_reservations.c |   7 +-
 litmus/reservation.c          | 356 ++++++++++++++++++++-
 litmus/sched_mc2.c            | 729 +++++++++++++++++++++++++++++++++++-------
 5 files changed, 985 insertions(+), 132 deletions(-)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index 0e656ad..fc7e319 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -129,8 +129,10 @@ struct reservation {
 	
 	/* for global env. */
 	int scheduled_on;
-	/* for blocked by ghost */
+	/* for blocked by ghost. Do not charge budget when ACTIVE */
 	int blocked_by_ghost;
+	/* ghost_job. If it is clear, do not charge budget when ACTIVE_IDLE */
+	int is_ghost;
 };
 
 void reservation_init(struct reservation *res);
@@ -199,11 +201,19 @@ struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
 	unsigned int id);
 	
 /* A global multiprocessor reservation environment. */
+/*
+typedef enum {
+	EVENT_REPLENISH = 0,
+	EVENT_DRAIN,
+	EVENT_OTHERS,
+} event_type_t;
+*/
 
 struct next_timer_event {
 	lt_t next_update;
 	int timer_armed_on;
-	unsigned int id;
+	//unsigned int id;
+	//event_type_t type;
 	struct list_head list;
 };
 
@@ -222,6 +232,7 @@ struct gmp_reservation_environment {
 
 	/* timer event ordered by next_update */
 	struct list_head next_events;
+	
 	/* (schedule_now == true) means call gmp_dispatch() now */
 	bool schedule_now;
 	/* set to true if a call to gmp_dispatch() is imminent */
@@ -231,9 +242,12 @@ struct gmp_reservation_environment {
 void gmp_init(struct gmp_reservation_environment* gmp_env);
 void gmp_add_new_reservation(struct gmp_reservation_environment* gmp_env,
 	struct reservation* new_res);
-void gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
+void gmp_scheduler_update_after(struct gmp_reservation_environment* gmp_env,
+	lt_t timeout);
+bool gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
 struct task_struct* gmp_dispatch(struct gmp_reservation_environment* gmp_env);
-struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
+//struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
+struct next_timer_event* gmp_find_event_by_time(struct gmp_reservation_environment* gmp_env, lt_t when);
 struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
 	unsigned int id);
 
diff --git a/litmus/mc2_common.c b/litmus/mc2_common.c
index d0a42c6..a8ea5d9 100644
--- a/litmus/mc2_common.c
+++ b/litmus/mc2_common.c
@@ -67,7 +67,8 @@ asmlinkage long sys_set_mc2_task_param(pid_t pid, struct mc2_task __user * param
 		goto out_unlock;
 	}
 	
-	target->rt_param.plugin_state = mp;
+	//target->rt_param.plugin_state = mp;
+	target->rt_param.mc2_data = mp;
 
 	retval = 0;
 out_unlock:
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index ec5cadd..d2c54c4 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -32,8 +32,8 @@ static void periodic_polling_client_arrives(
 			}
 				
 			TRACE("ENV_TIME_ZERO %llu\n", res->env->time_zero);
-			TRACE("pol-res: activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
-				tmp, instances, pres->period, res->next_replenishment,
+			TRACE("pol-res: R%d activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
+				res->id, tmp, instances, pres->period, res->next_replenishment,
 				res->env->current_time);
 
 			res->env->change_state(res->env, res,
@@ -147,7 +147,8 @@ static void common_drain_budget(
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
-			BUG();
+			//BUG();
+			TRACE("!!!!!!!!!!!!!!!STATE ERROR R%d STATE(%d)\n", res->id, res->state);
 			break;
 
 		case RESERVATION_ACTIVE_IDLE:
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 16b3a48..e30892c 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -1,4 +1,5 @@
 #include <linux/sched.h>
+#include <linux/slab.h>
 
 #include <litmus/litmus.h>
 #include <litmus/reservation.h>
@@ -48,7 +49,7 @@ static void sup_scheduler_update_at(
 	struct sup_reservation_environment* sup_env,
 	lt_t when)
 {
-	TRACE("SCHEDULER_UPDATE_AT update: %llu > when %llu\n", sup_env->next_scheduler_update, when);
+	//TRACE("SCHEDULER_UPDATE_AT update: %llu > when %llu\n", sup_env->next_scheduler_update, when);
 	if (sup_env->next_scheduler_update > when)
 		sup_env->next_scheduler_update = when;
 }
@@ -252,7 +253,7 @@ void sup_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
+	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= sup_env->env.current_time))
 		return;
 
@@ -264,11 +265,11 @@ void sup_update_time(
 		sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
 
 	/* deplete budgets by passage of time */
-	TRACE("CHARGE###\n");
+	//TRACE("CHARGE###\n");
 	sup_charge_budget(sup_env, delta);
 
 	/* check if any budgets where replenished */
-	TRACE("REPLENISH###\n");
+	//TRACE("REPLENISH###\n");
 	sup_replenish_budgets(sup_env);
 }
 
@@ -325,3 +326,350 @@ void sup_init(struct sup_reservation_environment* sup_env)
 
 	sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
 }
+
+struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
+	unsigned int id)
+{
+	struct reservation *res;
+
+	list_for_each_entry(res, &gmp_env->active_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+	list_for_each_entry(res, &gmp_env->inactive_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+	list_for_each_entry(res, &gmp_env->depleted_reservations, list) {
+		if (res->id == id)
+			return res;
+	}
+
+	return NULL;
+}
+
+/*
+struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env,
+	unsigned int id)
+{
+	struct next_timer_event *event;
+
+	list_for_each_entry(event, &gmp_env->next_events, list) {
+		if (event->id == id)
+			return event;
+	}
+
+	return NULL;
+}
+*/
+
+struct next_timer_event* gmp_find_event_by_time(struct gmp_reservation_environment* gmp_env,
+	lt_t when)
+{
+	struct next_timer_event *event;
+
+	list_for_each_entry(event, &gmp_env->next_events, list) {
+		if (event->next_update == when)
+			return event;
+	}
+
+	return NULL;
+}
+
+/*
+static void gmp_scheduler_update_at(
+	struct gmp_reservation_environment* gmp_env, unsigned int id,
+	event_type_t type, lt_t when)
+{
+	struct next_timer_event *nevent, *queued;
+	struct list_head *pos;
+	int found = 0;
+	
+	nevent = gmp_find_event_by_id(gmp_env, id);
+	
+	if (!nevent) {
+		nevent = kzalloc(sizeof(*nevent), GFP_KERNEL);
+		nevent->next_update = when;
+		nevent->id = id;
+		nevent->timer_armed_on = NO_CPU;
+		nevent->type = type;
+		
+		list_for_each(pos, &gmp_env->next_events) {
+			queued = list_entry(pos, struct next_timer_event, list);
+			if (queued->next_update > nevent->next_update) {
+				list_add(&nevent->list, pos->prev);
+				found = 1;
+				TRACE("NEXT_EVENT ADDED after %llu\n", queued->next_update);
+				break;
+			}
+		}
+		
+		if (!found) {
+			list_add_tail(&nevent->list, &gmp_env->next_events);
+			TRACE("NEXT_EVENT ADDED at [0]\n");
+		}
+	} else {
+		TRACE("EVENT FOUND at %llu T(%d), NEW EVENT %llu T(%d)\n", nevent->next_update, nevent->type, when, type);
+	}
+}
+*/
+#define TIMER_RESOLUTION 100000L
+
+/* Queue a timer event at time 'when' unless one already exists for it. */
+static void gmp_scheduler_update_at(
+	struct gmp_reservation_environment* gmp_env,
+	lt_t when)
+{
+	struct next_timer_event *nevent, *queued;
+	struct list_head *pos;
+
+	/* an event for exactly this instant is already queued */
+	if (gmp_find_event_by_time(gmp_env, when))
+		return;
+
+	/* Can be reached from the hrtimer callback with a raw spinlock held
+	 * and interrupts off, so the allocation must not sleep. */
+	nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
+	if (!nevent) {
+		TRACE("NEXT_EVENT at %llu NOT ADDED: out of memory\n", when);
+		return;
+	}
+	nevent->next_update = when;
+	nevent->timer_armed_on = NO_CPU;
+
+	/* keep next_events sorted by increasing next_update */
+	list_for_each(pos, &gmp_env->next_events) {
+		queued = list_entry(pos, struct next_timer_event, list);
+		if (queued->next_update > nevent->next_update) {
+			list_add(&nevent->list, pos->prev);
+			TRACE("NEXT_EVENT at %llu ADDED before %llu\n", nevent->next_update, queued->next_update);
+			return;
+		}
+	}
+
+	/* nothing later is queued: the new event goes to the tail */
+	list_add_tail(&nevent->list, &gmp_env->next_events);
+	TRACE("NEXT_EVENT at %llu ADDED at TAIL\n", nevent->next_update);
+}
+
+void gmp_scheduler_update_after(
+	struct gmp_reservation_environment* gmp_env, lt_t timeout)
+{
+	gmp_scheduler_update_at(gmp_env, gmp_env->env.current_time + timeout);
+}
+
+/* Insert res into depleted_reservations, ordered by next_replenishment. */
+static void gmp_queue_depleted(
+	struct gmp_reservation_environment* gmp_env,
+	struct reservation *res)
+{
+	struct list_head *pos;
+	struct reservation *queued;
+
+	list_for_each(pos, &gmp_env->depleted_reservations) {
+		queued = list_entry(pos, struct reservation, list);
+		if (queued->next_replenishment > res->next_replenishment)
+			break;
+	}
+
+	/* pos is the first entry that replenishes later, or the list head if
+	 * there is none; link res in front of it exactly once */
+	list_add_tail(&res->list, pos);
+
+	/* make sure a timer event exists for this replenishment instant */
+	gmp_scheduler_update_at(gmp_env, res->next_replenishment);
+}
+
+static void gmp_queue_active(
+	struct gmp_reservation_environment* gmp_env,
+	struct reservation *res)
+{
+	struct list_head *pos;
+	struct reservation *queued;
+	int check_preempt = 1, found = 0;
+
+	list_for_each(pos, &gmp_env->active_reservations) {
+		queued = list_entry(pos, struct reservation, list);
+		if (queued->priority > res->priority) {
+			list_add(&res->list, pos->prev);
+			found = 1;
+			break;
+		} else if (queued->scheduled_on == NO_CPU)
+			check_preempt = 0;
+	}
+
+	if (!found)
+		list_add_tail(&res->list, &gmp_env->active_reservations);
+	/* check for possible preemption. NOTE(review): schedule_now is set
+	 * only if an entry ahead of res had no CPU -- confirm the polarity. */
+	if (res->state == RESERVATION_ACTIVE && !check_preempt)
+		gmp_env->schedule_now = true;
+	
+	gmp_scheduler_update_after(gmp_env, res->cur_budget);
+}
+
+static void gmp_queue_reservation(
+	struct gmp_reservation_environment* gmp_env,
+	struct reservation *res)
+{
+	switch (res->state) {
+		case RESERVATION_INACTIVE:
+			list_add(&res->list, &gmp_env->inactive_reservations);
+			break;
+
+		case RESERVATION_DEPLETED:
+			gmp_queue_depleted(gmp_env, res);
+			break;
+
+		case RESERVATION_ACTIVE_IDLE:
+		case RESERVATION_ACTIVE:
+			gmp_queue_active(gmp_env, res);
+			break;
+	}
+}
+
+void gmp_add_new_reservation(
+	struct gmp_reservation_environment* gmp_env,
+	struct reservation* new_res)
+{
+	new_res->env = &gmp_env->env;
+	gmp_queue_reservation(gmp_env, new_res);
+}
+
+static void gmp_charge_budget(
+	struct gmp_reservation_environment* gmp_env,
+	lt_t delta)
+{
+	struct list_head *pos, *next;
+	struct reservation *res;
+
+	list_for_each_safe(pos, next, &gmp_env->active_reservations) {
+		int drained = 0;
+		/* ACTIVE_IDLE is always charged; ACTIVE only if scheduled and not ghost-blocked */
+		res = list_entry(pos, struct reservation, list);
+		if (res->state == RESERVATION_ACTIVE) {
+			TRACE("gmp_charge_budget ACTIVE R%u drain %llu\n", res->id, delta);
+			if (res->scheduled_on != NO_CPU && res->blocked_by_ghost == 0) {
+				TRACE("DRAIN !!\n");
+				drained = 1;
+				res->ops->drain_budget(res, delta);
+			}			
+		} else {
+			//BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
+			if (res->state != RESERVATION_ACTIVE_IDLE)
+				TRACE("BUG!!!!!!!!!!!! gmp_charge_budget()\n");
+			TRACE("gmp_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
+			//if (res->is_ghost == 1) {
+				TRACE("DRAIN !!\n");
+				drained = 1;
+				res->ops->drain_budget(res, delta);
+			//}
+		}
+		if ((res->state == RESERVATION_ACTIVE ||
+			res->state == RESERVATION_ACTIVE_IDLE) && (drained == 1))
+		{
+			/* make sure scheduler is invoked when this reservation expires
+			 * its remaining budget */
+			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n",
+				res->id, res->cur_budget);
+			 gmp_scheduler_update_after(gmp_env, res->cur_budget);
+		}
+		//if (encountered_active == 2)
+			/* stop at the first ACTIVE reservation */
+		//	break;
+	}
+	//TRACE("finished charging budgets\n");
+}
+
+static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
+{
+	struct list_head *pos, *next;
+	struct reservation *res;
+
+	list_for_each_safe(pos, next, &gmp_env->depleted_reservations) {
+		res = list_entry(pos, struct reservation, list);
+		if (res->next_replenishment <= gmp_env->env.current_time) {
+			res->ops->replenish(res);
+		} else {
+			/* list is ordered by increasing depletion times */
+			break;
+		}
+	}
+	//TRACE("finished replenishing budgets\n");
+
+	/* request a scheduler update at the next replenishment instant */
+	res = list_first_entry_or_null(&gmp_env->depleted_reservations,
+		struct reservation, list);
+	if (res)
+		gmp_scheduler_update_at(gmp_env, res->next_replenishment);
+}
+
+/* return schedule_now */
+bool gmp_update_time(
+	struct gmp_reservation_environment* gmp_env,
+	lt_t now)
+{
+	lt_t delta;
+
+	if (!gmp_env) {
+		TRACE("BUG****************************************\n");
+		return false;
+	}
+	/* If the time didn't advance, there is nothing to do.
+	 * This check makes it safe to call gmp_update_time() potentially
+	 * multiple times (e.g., via different code paths). */
+	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
+	if (unlikely(now <= gmp_env->env.current_time))
+		return gmp_env->schedule_now;
+
+	delta = now - gmp_env->env.current_time;
+	gmp_env->env.current_time = now;
+
+
+	/* deplete budgets by passage of time */
+	//TRACE("CHARGE###\n");
+	gmp_charge_budget(gmp_env, delta);
+
+	/* check if any budgets were replenished */
+	//TRACE("REPLENISH###\n");
+	gmp_replenish_budgets(gmp_env);
+	
+	return gmp_env->schedule_now;
+}
+
+static void gmp_res_change_state(
+	struct reservation_environment* env,
+	struct reservation *res,
+	reservation_state_t new_state)
+{
+	struct gmp_reservation_environment* gmp_env;
+
+	gmp_env = container_of(env, struct gmp_reservation_environment, env);
+
+	TRACE("GMP reservation R%d state %d->%d at %llu\n",
+		res->id, res->state, new_state, env->current_time);
+
+	list_del(&res->list);
+	/* check if we need to reschedule because we lost an active reservation */
+	if (res->state == RESERVATION_ACTIVE && !gmp_env->will_schedule)
+		gmp_env->schedule_now = true;
+	res->state = new_state;
+	gmp_queue_reservation(gmp_env, res);
+}
+
+void gmp_init(struct gmp_reservation_environment* gmp_env)
+{
+	/* Start from an all-zero environment; memset() takes (dest, value, size). */
+	memset(gmp_env, 0, sizeof(*gmp_env));
+
+	INIT_LIST_HEAD(&gmp_env->active_reservations);
+	INIT_LIST_HEAD(&gmp_env->depleted_reservations);
+	INIT_LIST_HEAD(&gmp_env->inactive_reservations);
+	INIT_LIST_HEAD(&gmp_env->next_events);
+
+	gmp_env->env.change_state = gmp_res_change_state;
+	/* no call to gmp_dispatch() is requested or imminent yet */
+	gmp_env->schedule_now = false;
+	gmp_env->will_schedule = false;
+	raw_spin_lock_init(&gmp_env->lock);
+}
\ No newline at end of file
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 0c26019..6dee1ec 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -15,6 +15,23 @@
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
+struct gmp_reservation_environment _global_env;
+
+struct cpu_entry {
+	struct task_struct *scheduled;
+	lt_t deadline;
+	int cpu;
+	enum crit_level lv;
+	bool will_schedule;
+};
+
+struct cpu_priority {
+	raw_spinlock_t lock;
+	struct cpu_entry cpu_entries[NR_CPUS];
+};
+
+struct cpu_priority _lowest_prio_cpu;
+	
 struct mc2_task_state {
 	struct task_client res_info;
 	int cpu;
@@ -51,11 +68,39 @@ static struct mc2_task_state* get_mc2_state(struct task_struct *tsk)
 }
 static enum crit_level get_task_crit_level(struct task_struct *tsk)
 {
-	struct mc2_task_state *tinfo = get_mc2_state(tsk);
-	if (!tinfo)
+	//struct mc2_task_state *tinfo = get_mc2_state(tsk);
+	struct mc2_task *mp;
+	
+	if (!tsk || !is_realtime(tsk))
+		return NUM_CRIT_LEVELS;
+	
+	mp = tsk_rt(tsk)->mc2_data;
+	
+	if (!mp)
 		return NUM_CRIT_LEVELS;
 	else
-		return tinfo->mc2_param.crit;
+		return mp->crit;
+}
+
+static struct reservation* res_find_by_id(struct mc2_cpu_state *state, unsigned int id)
+{
+	struct reservation *res;
+
+	res = sup_find_by_id(&state->sup_env, id);
+	if (!res)
+		res = gmp_find_by_id(&_global_env, id);
+	
+	return res;
+}
+
+static void mc2_update_time(enum crit_level lv, struct mc2_cpu_state *state, lt_t time)
+{
+	if (lv < CRIT_LEVEL_C)
+		sup_update_time(&state->sup_env, time);
+	else if (lv == CRIT_LEVEL_C)
+		gmp_update_time(&_global_env, time);
+	else
+		TRACE("update_time(): Criticality level error!!!!\n");
 }
 
 static void task_departs(struct task_struct *tsk, int job_complete)
@@ -78,6 +123,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
+		res->is_ghost = 1;
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
 		//BUG_ON(hrtimer_active(&ce->ghost_timer));
@@ -107,11 +153,44 @@ static void task_arrives(struct mc2_cpu_state *state, struct task_struct *tsk)
 	}
 }
 
+/* return: NO_CPU - all CPUs are running tasks with higher priority than Level C */
+static int get_lowest_prio_cpu(void)
+{
+	struct cpu_entry *ce;
+	int cpu, ret = NO_CPU;
+	lt_t latest_deadline = 0;
+	
+	raw_spin_lock(&_lowest_prio_cpu.lock);
+	for_each_online_cpu(cpu) {
+		ce = &_lowest_prio_cpu.cpu_entries[cpu];
+		if (!ce->will_schedule) {
+			if (!ce->scheduled) {
+				raw_spin_unlock(&_lowest_prio_cpu.lock);
+				return ce->cpu;
+			} else if (ce->lv == CRIT_LEVEL_C && ce->deadline > latest_deadline) {
+				latest_deadline = ce->deadline;
+				ret = ce->cpu;
+			}
+		}
+	}		
+	
+	raw_spin_unlock(&_lowest_prio_cpu.lock);
+	
+	return ret;
+}
+
 /* NOTE: drops state->lock */
 static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 {
 	int local;
 	lt_t update, now;
+	enum crit_level lv = get_task_crit_level(state->scheduled);
+	struct next_timer_event *event, *next;
+	int found_event = 0;
+	
+	//TRACE_TASK(state->scheduled, "update_timer!\n");
+	if (lv != NUM_CRIT_LEVELS)
+		TRACE_TASK(state->scheduled, "UPDATE_TIMER LV = %d\n", lv);
 
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
@@ -163,6 +242,37 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 			litmus_reschedule(state->cpu);
 		}
 	}
+	
+	raw_spin_lock(&_global_env.lock);
+	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
+		if (event->timer_armed_on == NO_CPU) {
+			found_event = 1;
+			if (event->next_update < litmus_clock()) {
+				int cpu = get_lowest_prio_cpu();
+				TRACE("GLOBAL EVENT PASSED!! poking CPU %d to reschedule\n", cpu);
+				list_del(&event->list);
+				kfree(event);
+				if (cpu != NO_CPU) {
+					raw_spin_lock(&_lowest_prio_cpu.lock);
+					_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+					raw_spin_unlock(&_lowest_prio_cpu.lock);
+					litmus_reschedule(cpu);
+				}
+			} else if (!hrtimer_active(&state->g_timer)) {
+				int ret;
+				TRACE("setting global scheduler timer for %llu\n", event->next_update);
+				ret = __hrtimer_start_range_ns(&state->g_timer,
+						ns_to_ktime(event->next_update),
+						0 /* timer coalescing slack */,
+						HRTIMER_MODE_ABS_PINNED,
+						0 /* wakeup */);
+				if (!ret) {
+					event->timer_armed_on = state->cpu;
+				}
+			}				
+		}
+	}	
+	raw_spin_unlock(&_global_env.lock);
 }
 
 static void mc2_update_ghost_state(struct mc2_cpu_state *state)
@@ -176,16 +286,20 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 		ce = &state->crit_entries[lv];
 		if (ce->running != NULL) {
 			tinfo = get_mc2_state(ce->running);
+			/*
 			if (lv != CRIT_LEVEL_C)
 				res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
 			else
 				continue;
+			*/
+			res = res_find_by_id(state, tinfo->mc2_param.res_id);
 			TRACE("LV %d running id %d budget %llu\n", lv, tinfo->mc2_param.res_id, res->cur_budget);
 			if (!res->cur_budget) {
 				struct sup_reservation_environment* sup_env = &state->sup_env;
 				
 				TRACE("GHOST FINISH id %d at %llu\n", tinfo->mc2_param.res_id, litmus_clock());
 				ce->running = NULL;
+				res->is_ghost = 0;
 				res = list_first_entry_or_null(&sup_env->active_reservations, struct reservation, list);
 				if (res)
 					litmus_reschedule_local();
@@ -215,6 +329,95 @@ static enum hrtimer_restart on_ghost_timer(struct hrtimer *timer)
 }
 */
 
+/* Mirror state->scheduled into this CPU's entry of _lowest_prio_cpu. */
+static void update_cpu_prio(struct mc2_cpu_state *state)
+{
+	struct cpu_entry *ce = &_lowest_prio_cpu.cpu_entries[state->cpu];
+	enum crit_level lv = get_task_crit_level(state->scheduled);
+
+	if (!state->scheduled) {	/* cpu is idle */
+		ce->scheduled = NULL;
+		ce->deadline = ULLONG_MAX;
+		ce->lv = NUM_CRIT_LEVELS;
+	} else if (lv == CRIT_LEVEL_C) {
+		ce->scheduled = state->scheduled;
+		ce->deadline = get_deadline(state->scheduled);
+		ce->lv = lv;
+	} else if (lv < CRIT_LEVEL_C) {
+		ce->scheduled = state->scheduled;
+		ce->deadline = 0;
+		ce->lv = lv;
+	}
+}
+
+static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
+{
+	unsigned long flags;
+	enum hrtimer_restart restart = HRTIMER_NORESTART;
+	struct mc2_cpu_state *state;
+	struct next_timer_event *event, *next;
+	bool schedule_now;
+	lt_t update, now;
+	int found_event = 0;
+
+	state = container_of(timer, struct mc2_cpu_state, g_timer);
+
+	/* The scheduling timer should only fire on the local CPU, because
+	 * otherwise deadlocks via timer_cancel() are possible.
+	 * Note: this does not interfere with dedicated interrupt handling, as
+	 * even under dedicated interrupt handling scheduling timers for
+	 * budget enforcement must occur locally on each CPU.
+	 */
+	//BUG_ON(state->cpu != raw_smp_processor_id());
+	if (state->cpu != raw_smp_processor_id())
+		TRACE("BUG!!!!!!!!!!!!! TIMER FIRED ON THE OTHER CPU\n");
+
+	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	
+	update = litmus_clock();
+	TRACE("GLOBAL TIMER FIRED at %llu\n", update);
+	
+	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
+		if (event->next_update < update) {
+			found_event = 1;
+			list_del(&event->list);
+			TRACE("EVENT at %llu IS DELETED\n", event->next_update);
+			kfree(event);
+		}
+	}			
+	
+	if (!found_event) {
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+		return restart;
+	}
+	
+	schedule_now = gmp_update_time(&_global_env, update);
+	
+	raw_spin_lock(&state->lock);
+	mc2_update_ghost_state(state);
+	raw_spin_unlock(&state->lock);
+	
+	now = _global_env.env.current_time;
+	
+	TRACE_CUR("on_global_scheduling_timer at %llu, upd:%llu (for cpu=%d) SCHEDULE_NOW = %d\n",
+		now, update, state->cpu, schedule_now);
+
+	if (schedule_now) {
+		int cpu = get_lowest_prio_cpu();
+		if (cpu != NO_CPU) {
+			raw_spin_lock(&_lowest_prio_cpu.lock);
+			_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+			raw_spin_unlock(&_lowest_prio_cpu.lock);
+			TRACE("LOWEST CPU = P%d\n", cpu);
+			litmus_reschedule(cpu);
+		}
+	} 
+
+	raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+
+	return restart;
+}
+
 static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
@@ -276,6 +479,7 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 					if (likely(!ce->running)) {
 						sup_scheduler_update_after(sup_env, res->cur_budget);
 						res->blocked_by_ghost = 0;
+						res->is_ghost = 0;
 						return tsk;
 					} else {
 						res->blocked_by_ghost = 1;
@@ -284,7 +488,34 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 			}
 		}
 	}
-
+	// no level A or B tasks
+	
+	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
+		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
+			tsk = res->ops->dispatch_client(res, &time_slice);
+			if (likely(tsk)) {
+				lv = get_task_crit_level(tsk);
+				if (lv == NUM_CRIT_LEVELS) {
+					gmp_scheduler_update_after(&_global_env, res->cur_budget);
+					//raw_spin_unlock(&_global_env.lock);
+					return tsk;
+				} else {
+					ce = &state->crit_entries[lv];
+					if (likely(!ce->running)) {
+						gmp_scheduler_update_after(&_global_env, res->cur_budget);
+						res->blocked_by_ghost = 0;
+						res->is_ghost = 0;
+						res->scheduled_on = state->cpu;
+						//raw_spin_unlock(&_global_env.lock);
+						return tsk;
+					} else {
+						res->blocked_by_ghost = 1;
+					}
+				}
+			}
+		}
+	}
+	
 	return NULL;
 }
 
@@ -292,17 +523,30 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
 	struct mc2_cpu_state *state = local_cpu_state();
-
+	
+	raw_spin_lock(&_lowest_prio_cpu.lock);
+	if (_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule == true)
+		_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule = false;
+	raw_spin_unlock(&_lowest_prio_cpu.lock);
+	
 	raw_spin_lock(&state->lock);
 
-	BUG_ON(state->scheduled && state->scheduled != prev);
-	BUG_ON(state->scheduled && !is_realtime(prev));
+	//BUG_ON(state->scheduled && state->scheduled != prev);
+	//BUG_ON(state->scheduled && !is_realtime(prev));
+	if (state->scheduled && state->scheduled != prev)
+		TRACE("BUG1!!!!!!!!\n");
+	if (state->scheduled && !is_realtime(prev))
+		TRACE("BUG2!!!!!!!!\n");
 
 	/* update time */
 	state->sup_env.will_schedule = true;
-	TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time ####\n");
+	//TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time ####\n");
 	sup_update_time(&state->sup_env, litmus_clock());
-	TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time !!!!\n");
+	
+	raw_spin_lock(&_global_env.lock);
+	gmp_update_time(&_global_env, litmus_clock());
+	
+	//TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time !!!!\n");
 	mc2_update_ghost_state(state);
 	
 	/* remove task from reservation if it blocks */
@@ -311,16 +555,29 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 
 	/* figure out what to schedule next */
 	state->scheduled = mc2_dispatch(&state->sup_env, state);
-
+	if (state->scheduled && is_realtime(state->scheduled))
+		TRACE_TASK(state->scheduled, "mc2_dispatch picked me!\n");
+	
+	raw_spin_lock(&_lowest_prio_cpu.lock);
+	update_cpu_prio(state);
+	raw_spin_unlock(&_lowest_prio_cpu.lock);
+	
 	/* Notify LITMUS^RT core that we've arrived at a scheduling decision. */
 	sched_state_task_picked();
 
 	/* program scheduler timer */
 	state->sup_env.will_schedule = false;
+	
+	raw_spin_unlock(&_global_env.lock);
+	
 	/* NOTE: drops state->lock */
 	mc2_update_timer_and_unlock(state);
 
 	if (prev != state->scheduled && is_realtime(prev)) {
+		struct mc2_task_state* tinfo = get_mc2_state(prev);
+		struct reservation* res = tinfo->res_info.client.reservation;
+		TRACE_TASK(prev, "PREV JOB scheduled_on = P%d\n", res->scheduled_on);
+		res->scheduled_on = NO_CPU;
 		TRACE_TASK(prev, "descheduled.\n");
 	}
 	if (state->scheduled) {
@@ -354,10 +611,15 @@ static void mc2_task_resume(struct task_struct  *tsk)
 {
 	unsigned long flags;
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
-	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+	struct mc2_cpu_state *state;
 
 	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
 
+	if (tinfo->cpu != -1)
+		state = cpu_state_for(tinfo->cpu);
+	else
+		state = local_cpu_state();
+
 	raw_spin_lock_irqsave(&state->lock, flags);
 	/* Requeue only if self-suspension was already processed. */
 	if (tinfo->has_departed)
@@ -365,7 +627,16 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		/* Assumption: litmus_clock() is synchronized across cores,
 		 * since we might not actually be executing on tinfo->cpu
 		 * at the moment. */
-		sup_update_time(&state->sup_env, litmus_clock());
+		if (tinfo->cpu != -1) {
+			sup_update_time(&state->sup_env, litmus_clock());
+		} else {
+			raw_spin_lock(&_global_env.lock);
+			TRACE("RESUME UPDATE ####\n");
+			gmp_update_time(&_global_env, litmus_clock());
+			TRACE("RESUME UPDATE $$$$\n");
+			raw_spin_unlock(&_global_env.lock);
+		}
+			
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
@@ -385,37 +656,55 @@ static long mc2_complete_job(void)
 {
 	ktime_t next_release;
 	long err;
-	struct mc2_cpu_state *state = local_cpu_state();
-	struct reservation_environment *env = &(state->sup_env.env);
-	struct mc2_task_state *tinfo = get_mc2_state(current);
-	struct reservation *res;
-	
-	res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
-	if (!res)
-		; // find in global env
-	
-	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu) (cur->budget: %llu)\n", litmus_clock(),
-		get_deadline(current), res->cur_budget);
+
+	TRACE_CUR("mc2_complete_job at %llu (deadline: %llu)\n", litmus_clock(),
+		get_deadline(current));
 
 	tsk_rt(current)->completed = 1;
 	
 	if (tsk_rt(current)->sporadic_release) {
-		env->time_zero = tsk_rt(current)->sporadic_release_time;
+		struct mc2_cpu_state *state;
+		struct reservation_environment *env;
+		struct mc2_task_state *tinfo;
+		struct reservation *res;
+		unsigned long flags;
+
+		local_irq_save(flags);
+	
+		state = local_cpu_state();
+		env = &(state->sup_env.env);
+		tinfo = get_mc2_state(current);
+		
+		res = res_find_by_id(state, tsk_rt(current)->mc2_data->res_id);
+		
+		if (get_task_crit_level(current) < CRIT_LEVEL_C) {
+			raw_spin_lock(&state->lock);
+			env->time_zero = tsk_rt(current)->sporadic_release_time;
+		} else {
+			raw_spin_lock(&_global_env.lock);
+			_global_env.env.time_zero = tsk_rt(current)->sporadic_release_time;
+		}
+		
 		res->next_replenishment = tsk_rt(current)->sporadic_release_time;
-		res->cur_budget = 0;
-		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
 		
-		if (tinfo->mc2_param.crit == CRIT_LEVEL_A) {
+		if (get_task_crit_level(current) == CRIT_LEVEL_A) {
 			struct table_driven_reservation *tdres;
-			
-			//sup_update_time(&state->sup_env, litmus_clock());
-			//res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
 			tdres = container_of(res, struct table_driven_reservation, res);
 			tdres->next_interval = 0;
 			tdres->major_cycle_start = tsk_rt(current)->sporadic_release_time;
 			res->next_replenishment += tdres->intervals[0].start;			
 		}
-		TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+		res->cur_budget = 0;
+		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
+		
+		//TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+		if (get_task_crit_level(current) < CRIT_LEVEL_C) {
+			raw_spin_unlock(&state->lock);
+		} else {
+			raw_spin_unlock(&_global_env.lock);
+		}
+		
+		local_irq_restore(flags);
 	}
 	
 	prepare_for_next_period(current);
@@ -443,8 +732,9 @@ static long mc2_admit_task(struct task_struct *tsk)
 	struct reservation *res;
 	struct mc2_cpu_state *state;
 	struct mc2_task_state *tinfo = kzalloc(sizeof(*tinfo), GFP_ATOMIC);
-	struct mc2_task *mp = tsk_rt(tsk)->plugin_state;
-
+	struct mc2_task *mp = tsk_rt(tsk)->mc2_data;
+	enum crit_level lv;
+	
 	if (!tinfo)
 		return -ENOMEM;
 
@@ -453,33 +743,61 @@ static long mc2_admit_task(struct task_struct *tsk)
 		return err;
 	}
 	
+	lv = mp->crit;
 	preempt_disable();
 
-	state = cpu_state_for(task_cpu(tsk));
-	raw_spin_lock_irqsave(&state->lock, flags);
+	if (lv < CRIT_LEVEL_C) {
+		state = cpu_state_for(task_cpu(tsk));
+		raw_spin_lock_irqsave(&state->lock, flags);
 
-	res = sup_find_by_id(&state->sup_env, mp->res_id);
+		res = sup_find_by_id(&state->sup_env, mp->res_id);
 
-	/* found the appropriate reservation (or vCPU) */
-	if (res) {
-		TRACE_TASK(tsk, "FOUND RES ID\n");
-		tinfo->mc2_param.crit = mp->crit;
-		tinfo->mc2_param.res_id = mp->res_id;
-		
-		kfree(tsk_rt(tsk)->plugin_state);
-		tsk_rt(tsk)->plugin_state = NULL;
+		/* found the appropriate reservation (or vCPU) */
+		if (res) {
+			TRACE_TASK(tsk, "SUP FOUND RES ID\n");
+			tinfo->mc2_param.crit = mp->crit;
+			tinfo->mc2_param.res_id = mp->res_id;
+			
+			//kfree(tsk_rt(tsk)->plugin_state);
+			//tsk_rt(tsk)->plugin_state = NULL;
+			
+			err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
+			tinfo->cpu = task_cpu(tsk);
+			tinfo->has_departed = true;
+			tsk_rt(tsk)->plugin_state = tinfo;
+
+			/* disable LITMUS^RT's per-thread budget enforcement */
+			tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
+		}
+
+		raw_spin_unlock_irqrestore(&state->lock, flags);
+	} else if (lv == CRIT_LEVEL_C) {
+		raw_spin_lock_irqsave(&_global_env.lock, flags);
 		
-		err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
-		tinfo->cpu = task_cpu(tsk);
-		tinfo->has_departed = true;
-		tsk_rt(tsk)->plugin_state = tinfo;
+		res = gmp_find_by_id(&_global_env, mp->res_id);
 
-		/* disable LITMUS^RT's per-thread budget enforcement */
-		tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
-	}
+		/* found the appropriate reservation (or vCPU) */
+		if (res) {
+			TRACE_TASK(tsk, "GMP FOUND RES ID\n");
+			tinfo->mc2_param.crit = mp->crit;
+			tinfo->mc2_param.res_id = mp->res_id;
+			
+			//kfree(tsk_rt(tsk)->plugin_state);
+			//tsk_rt(tsk)->plugin_state = NULL;
+			
+			err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
+			tinfo->cpu = -1;
+			tinfo->has_departed = true;
+			tsk_rt(tsk)->plugin_state = tinfo;
 
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+			/* disable LITMUS^RT's per-thread budget enforcement */
+			tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
+		}
 
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+		
+	}
+	
 	preempt_enable();
 
 	if (err)
@@ -493,12 +811,18 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 {
 	unsigned long flags;
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
-	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+	struct mc2_cpu_state *state; // = cpu_state_for(tinfo->cpu);
 	struct reservation *res;
-	
+	enum crit_level lv = get_task_crit_level(tsk);
+
 	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
 		   litmus_clock(), on_runqueue, is_running);
 
+	if (tinfo->cpu == -1)
+		state = local_cpu_state();
+	else 
+		state = cpu_state_for(tinfo->cpu);
+	
 	/* acquire the lock protecting the state and disable interrupts */
 	raw_spin_lock_irqsave(&state->lock, flags);
 
@@ -511,7 +835,9 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	if (on_runqueue || is_running) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
-		sup_update_time(&state->sup_env, litmus_clock());
+		raw_spin_lock(&_global_env.lock);
+		mc2_update_time(lv, state, litmus_clock());
+		raw_spin_unlock(&_global_env.lock);
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
@@ -521,12 +847,14 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	} else
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 
-	res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
-	release_at(tsk, res->next_replenishment);
-	if (res)
-		TRACE_TASK(tsk, "next_replenishment = %llu\n", res->next_replenishment);
+	res = res_find_by_id(state, tinfo->mc2_param.res_id);
+	
+	if (res) {
+		TRACE_TASK(tsk, "mc2_task_new() next_replenishment = %llu\n", res->next_replenishment);
+		release_at(tsk, res->next_replenishment);
+	}
 	else
-		TRACE_TASK(tsk, "next_replenishment = NULL\n");
+		TRACE_TASK(tsk, "mc2_task_new() next_replenishment = NULL\n");
 }
 
 static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
@@ -537,43 +865,71 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	struct sup_reservation_environment *sup_env;
 	int found = 0;
 	enum crit_level lv = get_task_crit_level(current);
-		
-	state = cpu_state_for(cpu);
-	raw_spin_lock(&state->lock);
 	
-//	res = sup_find_by_id(&state->sup_env, reservation_id);
-	sup_env = &state->sup_env;
-	//if (!res) {
-	list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
-		if (res->id == reservation_id) {
-			if (lv == CRIT_LEVEL_A) {
-				struct table_driven_reservation *tdres;
-				tdres = container_of(res, struct table_driven_reservation, res);
-				kfree(tdres->intervals);
-			}
-			list_del(&res->list);
-			kfree(res);
-			found = 1;
-			ret = 0;
-		}
-	}
-	if (!found) {
-		list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
+	if (cpu == -1) {
+		raw_spin_lock(&_global_env.lock);
+	
+		list_for_each_entry_safe(res, next, &_global_env.depleted_reservations, list) {
 			if (res->id == reservation_id) {
-				if (lv == CRIT_LEVEL_A) {
-					struct table_driven_reservation *tdres;
-					tdres = container_of(res, struct table_driven_reservation, res);
-					kfree(tdres->intervals);
-				}
+				TRACE("DESTROY RES FOUND!!!\n");
 				list_del(&res->list);
 				kfree(res);
 				found = 1;
 				ret = 0;
 			}
 		}
-	}
-	if (!found) {
-		list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
+		if (!found) {
+			list_for_each_entry_safe(res, next, &_global_env.inactive_reservations, list) {
+				if (res->id == reservation_id) {
+					TRACE("DESTROY RES FOUND!!!\n");
+					list_del(&res->list);
+					kfree(res);
+					found = 1;
+					ret = 0;
+				}
+			}
+		}
+		if (!found) {
+			list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
+				if (res->id == reservation_id) {
+					TRACE("DESTROY RES FOUND!!!\n");
+					list_del(&res->list);
+					kfree(res);
+					found = 1;
+					ret = 0;
+				}
+			}
+		}
+
+/*		
+list_for_each_entry(res, &_global_env.depleted_reservations, list) {
+	TRACE("DEPLETED LIST R%d\n", res->id);
+}
+list_for_each_entry(res, &_global_env.inactive_reservations, list) {
+	TRACE("INACTIVE LIST R%d\n", res->id);
+}
+list_for_each_entry(res, &_global_env.active_reservations, list) {
+	TRACE("ACTIVE LIST R%d\n", res->id);
+}
+*/
+		if (list_empty(&_global_env.active_reservations)) 
+			INIT_LIST_HEAD(&_global_env.active_reservations);
+		if (list_empty(&_global_env.depleted_reservations)) 
+			INIT_LIST_HEAD(&_global_env.depleted_reservations);
+		if (list_empty(&_global_env.inactive_reservations)) 
+			INIT_LIST_HEAD(&_global_env.inactive_reservations);
+		if (list_empty(&_global_env.next_events)) 
+			INIT_LIST_HEAD(&_global_env.next_events);
+		
+		raw_spin_unlock(&_global_env.lock);
+	} else {
+		state = cpu_state_for(cpu);
+		raw_spin_lock(&state->lock);
+		
+	//	res = sup_find_by_id(&state->sup_env, reservation_id);
+		sup_env = &state->sup_env;
+		//if (!res) {
+		list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
 			if (res->id == reservation_id) {
 				if (lv == CRIT_LEVEL_A) {
 					struct table_driven_reservation *tdres;
@@ -586,10 +942,40 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 				ret = 0;
 			}
 		}
-	}
-	//}
+		if (!found) {
+			list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
+				if (res->id == reservation_id) {
+					if (lv == CRIT_LEVEL_A) {
+						struct table_driven_reservation *tdres;
+						tdres = container_of(res, struct table_driven_reservation, res);
+						kfree(tdres->intervals);
+					}
+					list_del(&res->list);
+					kfree(res);
+					found = 1;
+					ret = 0;
+				}
+			}
+		}
+		if (!found) {
+			list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
+				if (res->id == reservation_id) {
+					if (lv == CRIT_LEVEL_A) {
+						struct table_driven_reservation *tdres;
+						tdres = container_of(res, struct table_driven_reservation, res);
+						kfree(tdres->intervals);
+					}
+					list_del(&res->list);
+					kfree(res);
+					found = 1;
+					ret = 0;
+				}
+			}
+		}
+		//}
 
-	raw_spin_unlock(&state->lock);
+		raw_spin_unlock(&state->lock);
+	}
 	
 	TRACE("RESERVATION_DESTROY ret = %d\n", ret);
 	return ret;
@@ -599,10 +985,15 @@ static void mc2_task_exit(struct task_struct *tsk)
 {
 	unsigned long flags;
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
-	struct mc2_cpu_state *state = cpu_state_for(tinfo->cpu);
+	struct mc2_cpu_state *state;
 	enum crit_level lv = tinfo->mc2_param.crit;
 	struct crit_entry* ce;	
 
+	if (tinfo->cpu != -1)
+		state = cpu_state_for(tinfo->cpu);
+	else
+		state = local_cpu_state();
+	
 	raw_spin_lock_irqsave(&state->lock, flags);
 
 	if (state->scheduled == tsk)
@@ -616,7 +1007,11 @@ static void mc2_task_exit(struct task_struct *tsk)
 	if (is_running(tsk)) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
-		sup_update_time(&state->sup_env, litmus_clock());
+		//if (lv < CRIT_LEVEL_C)
+		//	sup_update_time(&state->sup_env, litmus_clock());
+		raw_spin_lock(&_global_env.lock);
+		mc2_update_time(lv, state, litmus_clock());
+		raw_spin_unlock(&_global_env.lock);
 		mc2_update_ghost_state(state);
 		task_departs(tsk, 0);
 		
@@ -644,6 +1039,8 @@ static void mc2_task_exit(struct task_struct *tsk)
 */
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
+	kfree(tsk_rt(tsk)->mc2_data);
+	tsk_rt(tsk)->mc2_data = NULL;
 }
 
 static long create_polling_reservation(
@@ -685,28 +1082,54 @@ static long create_polling_reservation(
 	if (!pres)
 		return -ENOMEM;
 
-	state = cpu_state_for(config->cpu);
-	raw_spin_lock_irqsave(&state->lock, flags);
+	if (config->cpu != -1) {
+		state = cpu_state_for(config->cpu);
+		raw_spin_lock_irqsave(&state->lock, flags);
+
+		res = sup_find_by_id(&state->sup_env, config->id);
+		if (!res) {
+			polling_reservation_init(pres, use_edf, periodic,
+				config->polling_params.budget,
+				config->polling_params.period,
+				config->polling_params.relative_deadline,
+				config->polling_params.offset);
+			pres->res.id = config->id;
+			pres->res.blocked_by_ghost = 0;
+			pres->res.is_ghost = 0;
+			if (!use_edf)
+				pres->res.priority = config->priority;
+			sup_add_new_reservation(&state->sup_env, &pres->res);
+			err = config->id;
+		} else {
+			err = -EEXIST;
+		}
 
-	res = sup_find_by_id(&state->sup_env, config->id);
-	if (!res) {
-		polling_reservation_init(pres, use_edf, periodic,
-			config->polling_params.budget,
-			config->polling_params.period,
-			config->polling_params.relative_deadline,
-			config->polling_params.offset);
-		pres->res.id = config->id;
-		pres->res.blocked_by_ghost = 0;
-		if (!use_edf)
-			pres->res.priority = config->priority;
-		sup_add_new_reservation(&state->sup_env, &pres->res);
-		err = config->id;
+		raw_spin_unlock_irqrestore(&state->lock, flags);
 	} else {
-		err = -EEXIST;
+		raw_spin_lock_irqsave(&_global_env.lock, flags);
+		
+		res = gmp_find_by_id(&_global_env, config->id);
+		if (!res) {
+			polling_reservation_init(pres, use_edf, periodic,
+				config->polling_params.budget,
+				config->polling_params.period,
+				config->polling_params.relative_deadline,
+				config->polling_params.offset);
+			pres->res.id = config->id;
+			pres->res.blocked_by_ghost = 0;
+			pres->res.scheduled_on = NO_CPU;
+			pres->res.is_ghost = 0;
+			if (!use_edf)
+				pres->res.priority = config->priority;
+			gmp_add_new_reservation(&_global_env, &pres->res);
+			TRACE("GMP_ADD_NEW_RESERVATION R%d\n", pres->res.id);
+			err = config->id;
+		} else {
+			err = -EEXIST;
+		}
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);		
 	}
-
-	raw_spin_unlock_irqrestore(&state->lock, flags);
-
+	
 	if (err < 0)
 		kfree(pres);
 
@@ -825,10 +1248,12 @@ static long mc2_reservation_create(int res_type, void* __user _config)
 	if (copy_from_user(&config, _config, sizeof(config)))
 		return -EFAULT;
 
-	if (config.cpu < 0 || !cpu_online(config.cpu)) {
-		printk(KERN_ERR "invalid polling reservation (%u): "
-		       "CPU %d offline\n", config.id, config.cpu);
-		return -EINVAL;
+	if (config.cpu != -1) {
+		if (config.cpu < 0 || !cpu_online(config.cpu)) {
+			printk(KERN_ERR "invalid polling reservation (%u): "
+				   "CPU %d offline\n", config.id, config.cpu);
+			return -EINVAL;
+		}
 	}
 
 	switch (res_type) {
@@ -885,19 +1310,30 @@ static long mc2_activate_plugin(void)
 {
 	int cpu, lv;
 	struct mc2_cpu_state *state;
+	struct cpu_entry *ce;
 
+	gmp_init(&_global_env);
+	raw_spin_lock_init(&_lowest_prio_cpu.lock);
+	
 	for_each_online_cpu(cpu) {
 		TRACE("Initializing CPU%d...\n", cpu);
 
 		state = cpu_state_for(cpu);
+		ce = &_lowest_prio_cpu.cpu_entries[cpu];
+		
+		ce->cpu = cpu;
+		ce->scheduled = NULL;
+		ce->deadline = ULLONG_MAX;
+		ce->lv = NUM_CRIT_LEVELS;
+		ce->will_schedule = false;
 
 		raw_spin_lock_init(&state->lock);
 		state->cpu = cpu;
 		state->scheduled = NULL;
 		for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
-			struct crit_entry *ce = &state->crit_entries[lv];
-			ce->level = lv;
-			ce->running = NULL;
+			struct crit_entry *cr_entry = &state->crit_entries[lv];
+			cr_entry->level = lv;
+			cr_entry->running = NULL;
 			//hrtimer_init(&ce->ghost_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 			//ce->ghost_timer.function = on_ghost_timer;
 		}
@@ -905,6 +1341,9 @@ static long mc2_activate_plugin(void)
 
 		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 		state->timer.function = on_scheduling_timer;
+		
+		hrtimer_init(&state->g_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+		state->g_timer.function = on_global_scheduling_timer;
 	}
 
 	mc2_setup_domain_proc();
@@ -912,17 +1351,26 @@ static long mc2_activate_plugin(void)
 	return 0;
 }
 
+static void mc2_finish_switch(struct task_struct *prev)
+{
+	struct mc2_cpu_state *state = local_cpu_state();
+	
+	state->scheduled = is_realtime(current) ? current : NULL;
+}
+
 static long mc2_deactivate_plugin(void)
 {
 	int cpu;
 	struct mc2_cpu_state *state;
 	struct reservation *res;
+	struct next_timer_event *event;
 
 	for_each_online_cpu(cpu) {
 		state = cpu_state_for(cpu);
 		raw_spin_lock(&state->lock);
 
 		hrtimer_cancel(&state->timer);
+		hrtimer_cancel(&state->g_timer);
 
 		/* Delete all reservations --- assumes struct reservation
 		 * is prefix of containing struct. */
@@ -954,6 +1402,46 @@ static long mc2_deactivate_plugin(void)
 		raw_spin_unlock(&state->lock);
 	}
 
+	raw_spin_lock(&_global_env.lock);
+
+	while (!list_empty(&_global_env.active_reservations)) {
+		TRACE("RES FOUND!!!\n");
+		res = list_first_entry(
+			&_global_env.active_reservations,
+				struct reservation, list);
+		list_del(&res->list);
+		kfree(res);
+	}
+
+	while (!list_empty(&_global_env.inactive_reservations)) {
+		TRACE("RES FOUND!!!\n");
+		res = list_first_entry(
+			&_global_env.inactive_reservations,
+				struct reservation, list);
+		list_del(&res->list);
+		kfree(res);
+	}
+
+	while (!list_empty(&_global_env.depleted_reservations)) {
+		TRACE("RES FOUND!!!\n");
+		res = list_first_entry(
+			&_global_env.depleted_reservations,
+				struct reservation, list);
+		list_del(&res->list);
+		kfree(res);
+	}
+	
+	while (!list_empty(&_global_env.next_events)) {
+		TRACE("EVENT FOUND!!!\n");
+		event = list_first_entry(
+			&_global_env.next_events,
+				struct next_timer_event, list);
+		list_del(&event->list);
+		kfree(event);
+	}
+	
+	raw_spin_unlock(&_global_env.lock);
+	
 	destroy_domain_proc_info(&mc2_domain_proc_info);
 	return 0;
 }
@@ -961,6 +1449,7 @@ static long mc2_deactivate_plugin(void)
 static struct sched_plugin mc2_plugin = {
 	.plugin_name			= "MC2",
 	.schedule				= mc2_schedule,
+	.finish_switch			= mc2_finish_switch,
 	.task_wake_up			= mc2_task_resume,
 	.admit_task				= mc2_admit_task,
 	.task_new				= mc2_task_new,
-- 
1.8.1.2


From f9b8ce9e2c06fe8ecd3141837da910675af238c3 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Tue, 20 Jan 2015 11:08:04 -0500
Subject: [PATCH 087/119] add bank_proc.c for cache and bank coloring

Conflicts:
	litmus/litmus.c
---
 litmus/Makefile     |   1 +
 litmus/bank_proc.c  | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 litmus/cache_proc.c |   2 +-
 litmus/litmus.c     |   8 +-
 4 files changed, 262 insertions(+), 3 deletions(-)
 create mode 100644 litmus/bank_proc.c

diff --git a/litmus/Makefile b/litmus/Makefile
index 997524f..713a14f 100644
--- a/litmus/Makefile
+++ b/litmus/Makefile
@@ -20,6 +20,7 @@ obj-y     = sched_plugin.o litmus.o \
 	    ctrldev.o \
 	    uncachedev.o \
 	    cache_proc.o \
+	    bank_proc.o \
 	    sched_gsn_edf.o \
 	    sched_psn_edf.o \
 	    sched_pfp.o
diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
new file mode 100644
index 0000000..2c69657
--- /dev/null
+++ b/litmus/bank_proc.c
@@ -0,0 +1,254 @@
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/mutex.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+
+#include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
+
+#define LITMUS_LOCKDEP_NAME_MAX_LEN 50
+
+// This is Address Decoding for imx6-sabredsd board
+#define CACHE_MASK 0x0000f000      
+#define BANK_MASK  0x00007000      
+#define OFFSET_SHIFT 12
+
+#define PAGES_PER_COLOR 1024
+
+unsigned long number_banks;
+unsigned long number_cachecolors;
+
+struct color_group {
+	spinlock_t lock;
+	char _lock_name[LITMUS_LOCKDEP_NAME_MAX_LEN];
+	struct list_head list;
+	atomic_t nr_pages;
+};
+
+static struct alloced_pages {
+	spinlock_t lock;
+	struct list_head list;
+} alloced_pages;
+
+struct alloced_page {
+	struct page *page;
+	struct vm_area_struct *vma;
+	struct list_head list;
+};
+
+static struct color_group *color_groups;
+static struct lock_class_key color_lock_keys[16];
+
+//static struct color_group *color_groups;
+
+/* Decoding page color, 0~15 */ 
+static inline unsigned long page_color(struct page *page)
+{
+	return ((page_to_phys(page)& CACHE_MASK) >> PAGE_SHIFT);
+}
+
+/* Decoding page bank number, 0~7 */ 
+static inline unsigned long page_bank(struct page *page)
+{
+	return ((page_to_phys(page)& BANK_MASK) >> PAGE_SHIFT);
+}
+
+static unsigned long smallest_nr_pages(void)
+{
+	unsigned long i, min_pages = -1;
+	struct color_group *cgroup;
+	for (i = 0; i < number_cachecolors; ++i) {
+		cgroup = &color_groups[i];
+		if (atomic_read(&cgroup->nr_pages) < min_pages)
+			min_pages = atomic_read(&cgroup->nr_pages);
+	}
+	return min_pages;
+}
+/*
+ * The page's count should be one, and it should not be on any LRU list.
+ */
+void add_page_to_color_list(struct page *page)
+{
+	const unsigned long color = page_color(page);
+	struct color_group *cgroup = &color_groups[color];
+	BUG_ON(in_list(&page->lru) || PageLRU(page));
+	BUG_ON(page_count(page) > 1);
+	spin_lock(&cgroup->lock);
+	list_add_tail(&page->lru, &cgroup->list);
+	atomic_inc(&cgroup->nr_pages);
+//	SetPageLRU(page);
+	spin_unlock(&cgroup->lock);
+}
+
+/*
+ * Refill every per-color list up to PAGES_PER_COLOR pages.
+ * Returns 0 on success, -ENOMEM if the page allocator runs dry.
+ */
+static int do_add_pages(void)
+{
+	struct page *page, *page_tmp;
+	LIST_HEAD(free_later);
+	unsigned long color;
+	int ret = 0;
+
+	printk("LITMUS do add pages\n");
+
+	while (smallest_nr_pages() < PAGES_PER_COLOR) {
+		page = alloc_page(GFP_HIGHUSER_MOVABLE);
+		if (unlikely(!page)) {
+			printk(KERN_WARNING "Could not allocate pages.\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+		color = page_color(page);
+		/* Pages of an already-full color are parked until the loop ends. */
+		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR)
+			add_page_to_color_list(page);
+		else
+			list_add_tail(&page->lru, &free_later);
+	}
+out:
+	/* Reached on failure too, so parked pages are never leaked. */
+	list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return ret;
+}
+
+
+/*
+ * provide pages for replacement 
+ * node = 0 for Level A, B tasks in Cpu 0
+ * node = 1 for Level A, B tasks in Cpu 1
+ * node = 2 for Level A, B tasks in Cpu 2
+ * node = 3 for Level A, B tasks in Cpu 3
+ * node = 4 for Level C tasks 
+ */
+ #if 1 
+struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
+{
+	printk("allocate new page node = %d\n", node);	
+//	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	struct color_group *cgroup;
+	struct page *rPage = NULL;
+	unsigned int color;
+	get_random_bytes(&color, sizeof(unsigned int));
+	
+	/*
+	if(node ==0){
+		color = (color%2)*8+node;
+	}else if(node == 1){
+		color = (color%2)*8+node;
+	}else if(node == 2){
+		color = (color%2)*8+;
+	}else if(node == 3){
+		color = color%2 + 6;
+	}else if(node == 4){
+		color = color%8 + 8;
+	}else{
+		goto out;
+	}
+	*/
+	
+	switch(node ){
+		case 0:
+		case 1: 
+		case 2: 
+		case 3:
+			color = (color%2) * 8 + node;
+			break;
+		case 4:
+			color = (color%8)+4;
+			if(color >=8)	
+				color+=4;
+			break;
+		default:
+			goto out;
+	}
+
+
+	printk("allocate new page color = %d\n", color);
+		
+	cgroup = &color_groups[color];
+	spin_lock(&cgroup->lock);
+	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
+		TRACE_CUR("No free %lu colored pages.\n", color);
+		printk(KERN_WARNING "no free %lu colored pages.\n", color);
+		goto out_unlock;
+	}
+	rPage = list_first_entry(&cgroup->list, struct page, lru);
+	BUG_ON(page_count(rPage) > 1);
+	get_page(rPage);
+	list_del(&rPage->lru);
+	atomic_dec(&cgroup->nr_pages);
+//	ClearPageLRU(rPage);
+out_unlock:
+	spin_unlock(&cgroup->lock);
+out:
+	do_add_pages();
+	return rPage;
+	//return  alloc_page(GFP_HIGHUSER_MOVABLE);
+}
+#endif
+
+static int __init init_variables(void)
+{
+	/* Counts follow from the masks: highest index in the mask, plus one. */
+	number_banks = 1+(BANK_MASK >> PAGE_SHIFT);
+	number_cachecolors = 1+(CACHE_MASK >> PAGE_SHIFT);
+	return 0;
+}
+
+static int __init init_color_groups(void)
+{
+	struct color_group *cgroup;
+	unsigned long i;
+	int err = 0;
+
+	color_groups = kmalloc(number_cachecolors *
+			sizeof(struct color_group), GFP_KERNEL);
+	if (!color_groups) {
+		printk(KERN_WARNING "Could not allocate color groups.\n");
+		err = -ENOMEM;
+	}else{
+
+		for (i = 0; i < number_cachecolors; ++i) {
+			cgroup = &color_groups[i];
+			atomic_set(&cgroup->nr_pages, 0);
+			INIT_LIST_HEAD(&cgroup->list);
+			spin_lock_init(&cgroup->lock);
+//			LOCKDEP_DYNAMIC_ALLOC(&cgroup->lock, &color_lock_keys[i],
+//					cgroup->_lock_name, "color%lu", i);
+		}
+	}
+	return err;
+}
+
+/*
+ * Set up the page-coloring pools at boot; returns 0 or a negative errno.
+ */
+static int __init litmus_color_init(void)
+{
+	int err;
+
+	INIT_LIST_HEAD(&alloced_pages.list);
+	spin_lock_init(&alloced_pages.lock);
+	init_variables();
+	printk("Cache number = %lu , Cache mask = 0x%x\n", number_cachecolors, CACHE_MASK);
+	printk("Bank number = %lu , Bank mask = 0x%x\n", number_banks, BANK_MASK);
+	err = init_color_groups();
+	if (!err)	/* do_add_pages() walks color_groups, so it must exist */
+		err = do_add_pages();
+	printk(KERN_INFO "Registering LITMUS^RT color and bank proc.\n");
+	return err;
+}
+
+module_init(litmus_color_init);
+
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 4425bfb..cc818b9 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -242,4 +242,4 @@ out:
 	return ret;
 }
 
-module_init(litmus_sysctl_init);
\ No newline at end of file
+module_init(litmus_sysctl_init);
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 6034ff8..dcb9ed5 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -392,11 +392,14 @@ static struct page *walk_page_table(unsigned long addr)
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
 
+extern struct page *new_alloc_page(struct page *page, unsigned long node, int **x);
+
+#if 0
 static struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 }
-
+#endif
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
@@ -442,6 +445,7 @@ asmlinkage long sys_set_page_color(int cpu)
 				continue;
 			if (!old_page)
 				continue;
+
 			if (PageReserved(old_page)) {
 				put_page(old_page);
 				continue;
@@ -482,7 +486,7 @@ asmlinkage long sys_set_page_color(int cpu)
 	
 	ret = 0;
 	if (!list_empty(&pagelist)) {
-		ret = migrate_pages(&pagelist, new_alloc_page, 0, MIGRATE_ASYNC, MR_SYSCALL);
+		ret = migrate_pages(&pagelist, new_alloc_page, 4, MIGRATE_ASYNC, MR_SYSCALL);
 		if (ret) {
 			printk(KERN_INFO "%ld pages not migrated.\n", ret);
 			putback_lru_pages(&pagelist);
-- 
1.8.1.2


From d7f8145b8a0525dccd2990fd0739012d04f3d978 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Tue, 27 Jan 2015 12:23:44 -0500
Subject: [PATCH 088/119] Provide interface to let litmus ask new pages by bank
 number

---
 litmus/bank_proc.c | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 2c69657..ec04626 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -131,7 +131,6 @@ out:
  * node = 3 for Level A, B tasks in Cpu 3
  * node = 4 for Level C tasks 
  */
- #if 1 
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 	printk("allocate new page node = %d\n", node);	
@@ -194,9 +193,44 @@ out_unlock:
 out:
 	do_add_pages();
 	return rPage;
-	//return  alloc_page(GFP_HIGHUSER_MOVABLE);
 }
-#endif
+
+struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int **x)
+{
+	/*
+	 * Take one free page from bank @banknr (0..7). A bank maps to colors
+	 * banknr and banknr + 8; one of the two is picked at random. Returns
+	 * NULL if @banknr is out of range or that color group has no free page.
+	 */
+	struct color_group *cgroup;
+	struct page *rPage = NULL;
+	unsigned int color;
+
+	printk("allocate new page bank = %lu\n", banknr);
+	get_random_bytes(&color, sizeof(unsigned int));
+	/* banknr is unsigned, so only the upper bound needs checking. */
+	if (banknr > 7)
+		goto out;
+	color = (color % 2) * 8 + banknr;
+
+	cgroup = &color_groups[color];
+	spin_lock(&cgroup->lock);
+	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
+		TRACE_CUR("No free %u colored pages.\n", color);
+		printk(KERN_WARNING "no free %u colored pages.\n", color);
+		goto out_unlock;
+	}
+	rPage = list_first_entry(&cgroup->list, struct page, lru);
+	BUG_ON(page_count(rPage) > 1);
+	get_page(rPage);
+	list_del(&rPage->lru);
+	atomic_dec(&cgroup->nr_pages);
+out_unlock:
+	spin_unlock(&cgroup->lock);
+out:
+	do_add_pages();
+	return rPage;
+}
 
 static int __init init_variables(void)
 {
-- 
1.8.1.2


From a83b6b631b081f9dfeb8134c9aee6aeb866f7231 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 25 Feb 2015 10:42:28 -0500
Subject: [PATCH 089/119] proto type

---
 include/litmus/reservation.h |  18 +-
 litmus/bank_proc.c           |  29 +++-
 litmus/cache_proc.c          |  40 +++++
 litmus/jobs.c                |   1 -
 litmus/litmus.c              |  48 +++--
 litmus/reservation.c         |  54 +++---
 litmus/sched_mc2.c           | 405 ++++++++++++++++++++++++++++---------------
 7 files changed, 396 insertions(+), 199 deletions(-)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index fc7e319..0b9c08d 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -201,19 +201,19 @@ struct reservation* sup_find_by_id(struct sup_reservation_environment* sup_env,
 	unsigned int id);
 	
 /* A global multiprocessor reservation environment. */
-/*
+
 typedef enum {
 	EVENT_REPLENISH = 0,
 	EVENT_DRAIN,
 	EVENT_OTHERS,
 } event_type_t;
-*/
+
 
 struct next_timer_event {
 	lt_t next_update;
 	int timer_armed_on;
-	//unsigned int id;
-	//event_type_t type;
+	unsigned int id;
+	event_type_t type;
 	struct list_head list;
 };
 
@@ -234,7 +234,7 @@ struct gmp_reservation_environment {
 	struct list_head next_events;
 	
 	/* (schedule_now == true) means call gmp_dispatch() now */
-	bool schedule_now;
+	int schedule_now;
 	/* set to true if a call to gmp_dispatch() is imminent */
 	bool will_schedule;
 };
@@ -242,11 +242,11 @@ struct gmp_reservation_environment {
 void gmp_init(struct gmp_reservation_environment* gmp_env);
 void gmp_add_new_reservation(struct gmp_reservation_environment* gmp_env,
 	struct reservation* new_res);
-void gmp_scheduler_update_after(struct gmp_reservation_environment* gmp_env,
-	lt_t timeout);
-bool gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
+void gmp_add_event_after(struct gmp_reservation_environment* gmp_env,
+	lt_t timeout, unsigned int id, event_type_t type);
+int gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
 struct task_struct* gmp_dispatch(struct gmp_reservation_environment* gmp_env);
-//struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
+struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
 struct next_timer_event* gmp_find_event_by_time(struct gmp_reservation_environment* gmp_env, lt_t when);
 struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
 	unsigned int id);
diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index ec04626..07d5728 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -88,7 +88,7 @@ void add_page_to_color_list(struct page *page)
 
 static int do_add_pages(void)
 {
-	printk("LITMUS do add pages\n");
+	//printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
@@ -122,6 +122,7 @@ out:
 	return ret;
 }
 
+extern int l2_usable_sets;
 
 /*
  * provide pages for replacement 
@@ -131,9 +132,10 @@ out:
  * node = 3 for Level A, B tasks in Cpu 3
  * node = 4 for Level C tasks 
  */
+#if 1
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
-	printk("allocate new page node = %d\n", node);	
+	//printk("allocate new page node = %d\n", node);	
 //	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
@@ -155,7 +157,20 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 		goto out;
 	}
 	*/
-	
+	switch(node ){
+		case 0:
+			color = (color % l2_usable_sets);
+			break;
+		case 1: 
+		case 2: 
+		case 3:
+		case 4:
+			color = (color% (16-l2_usable_sets)) + l2_usable_sets;
+			break;
+		default:
+			goto out;
+	}
+	/*
 	switch(node ){
 		case 0:
 		case 1: 
@@ -171,14 +186,15 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 		default:
 			goto out;
 	}
+	*/
 
-
-	printk("allocate new page color = %d\n", color);
+	//printk("allocate new page color = %d\n", color);
+	//TRACE("allocate new page color = %d\n", color);
 		
 	cgroup = &color_groups[color];
 	spin_lock(&cgroup->lock);
 	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
-		TRACE_CUR("No free %lu colored pages.\n", color);
+		//TRACE_CUR("No free %lu colored pages.\n", color);
 		printk(KERN_WARNING "no free %lu colored pages.\n", color);
 		goto out_unlock;
 	}
@@ -194,6 +210,7 @@ out:
 	do_add_pages();
 	return rPage;
 }
+#endif
 
 struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int **x)
 {
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index cc818b9..7b48d5c 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -49,6 +49,9 @@ struct mutex lockdown_proc;
 
 static int min_usable_ways = 0;
 static int max_usable_ways = 16;
+static int min_usable_sets = 1;
+static int max_usable_sets = 15;
+
 static int zero = 0;
 static int one = 1;
 
@@ -60,6 +63,7 @@ static int one = 1;
 			__cpu * L2X0_LOCKDOWN_STRIDE; __v; })
 
 int l2_usable_ways;
+int l2_usable_sets;
 int lock_all;
 int nr_lockregs;
 
@@ -176,6 +180,7 @@ int l2_usable_ways_handler(struct ctl_table *table, int write, void __user *buff
 		goto out;
 		
 	TRACE_CUR("l2_usable_ways : %d\n", l2_usable_ways);
+	printk("l2_usable_ways : %d\n", l2_usable_ways);
 	
 	if (write) {
 		//for (i = 0; i < nr_lockregs;  i++) {
@@ -190,6 +195,31 @@ out:
 	return ret;
 }
 
+int l2_usable_sets_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	/* sysctl handler for "l2_usable_sets"; serialized by lockdown_proc. */
+	mutex_lock(&lockdown_proc);
+	/* NOTE(review): runs on reads as well as writes -- confirm intended. */
+	flush_cache_all();
+	/* Read or write the int at table->data, bounded by extra1/extra2. */
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	/* Report the (possibly updated) value via TRACE_CUR and printk. */
+	TRACE_CUR("l2_usable_sets : %d\n", l2_usable_sets);
+	printk("l2_usable_sets : %d\n", l2_usable_sets);
+	/* Placeholder: a write triggers no further action yet. */
+	if (write) {
+		;
+	}
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
 static struct ctl_table cache_table[] =
 {
 	{
@@ -202,6 +232,15 @@ static struct ctl_table cache_table[] =
 		.extra2		= &max_usable_ways,
 	},
 	{
+		.procname	= "l2_usable_sets",
+		.mode		= 0666,
+		.proc_handler	= l2_usable_sets_handler,
+		.data		= &l2_usable_sets,
+		.maxlen		= sizeof(l2_usable_sets),
+		.extra1		= &min_usable_sets,
+		.extra2		= &max_usable_sets,
+	},
+	{
 		.procname	= "lock_all",
 		.mode		= 0666,
 		.proc_handler	= lock_all_handler,
@@ -237,6 +276,7 @@ static int __init litmus_sysctl_init(void)
 	}
 
 	l2_usable_ways = 16;
+	l2_usable_sets = 5;
 
 out:
 	return ret;
diff --git a/litmus/jobs.c b/litmus/jobs.c
index e523e29..547222c 100644
--- a/litmus/jobs.c
+++ b/litmus/jobs.c
@@ -45,7 +45,6 @@ void release_at(struct task_struct *t, lt_t start)
 {
 	BUG_ON(!t);
 	setup_release(t, start);
-	TRACE("RELEASE!!\n");
 	tsk_rt(t)->completed = 0;
 }
 
diff --git a/litmus/litmus.c b/litmus/litmus.c
index dcb9ed5..4ff840d 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -392,14 +392,16 @@ static struct page *walk_page_table(unsigned long addr)
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
 
-extern struct page *new_alloc_page(struct page *page, unsigned long node, int **x);
-
 #if 0
 static struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
-	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
+	return alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
 }
+#else
+extern struct page *new_alloc_page(struct page *page, unsigned long node, int **x);
+
 #endif
+
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
@@ -408,16 +410,18 @@ asmlinkage long sys_set_page_color(int cpu)
 	struct page *page_itr = NULL;
 	struct vm_area_struct *vma_itr = NULL;
 	//struct task_page *entry = NULL;
-	int nr_pages = 0, nr_shared_pages = 0;
+	int nr_pages = 0, nr_shared_pages = 0, nr_failed = 0;
+	unsigned long node;
+	
 	LIST_HEAD(pagelist);
 	LIST_HEAD(shared_pagelist);
 	
 	down_read(&current->mm->mmap_sem);
-	printk(KERN_INFO "SYSCALL set_page_color\n");
+	TRACE_TASK(current, "SYSCALL set_page_color\n");
 	vma_itr = current->mm->mmap;
 	while (vma_itr != NULL) {
 		unsigned int num_pages = 0, i;
-		struct page *new_page = NULL, *old_page = NULL;
+		struct page *old_page = NULL;
 		/*
 		entry = kmalloc(sizeof(struct task_page), GFP_ATOMIC);
 		if (entry == NULL) {
@@ -428,8 +432,8 @@ asmlinkage long sys_set_page_color(int cpu)
 		*/
 		num_pages = (vma_itr->vm_end - vma_itr->vm_start) / PAGE_SIZE;
 		// print vma flags
-		printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
-		printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
+		//printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
+		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
 		
 		for (i = 0; i < num_pages; i++) {
 /*
@@ -447,19 +451,22 @@ asmlinkage long sys_set_page_color(int cpu)
 				continue;
 
 			if (PageReserved(old_page)) {
+				TRACE("Reserved Page!\n");
 				put_page(old_page);
 				continue;
 			}
 			
-			printk(KERN_INFO "addr: %lu, pfn: %lu, _mapcount: %d, _count: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page));
+			TRACE_TASK(current, "addr: %lu, pfn: %lu, _mapcount: %d, _count: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page));
 			
-			if (page_mapcount(old_page) == 1) {
+			if (page_mapcount(old_page) != 0) {
 				ret = isolate_lru_page(old_page);
-				//if (pfn_valid(__page_to_pfn(old_page)))
 				if (!ret) {
 					list_add_tail(&old_page->lru, &pagelist);
 					inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
 					nr_pages++;
+				} else {
+					TRACE_TASK(current, "isolate_lru_page failed\n");
+					nr_failed++;
 				}
 				put_page(old_page);
 			}
@@ -468,8 +475,9 @@ asmlinkage long sys_set_page_color(int cpu)
 				if (!ret) {
 					list_add_tail(&old_page->lru, &shared_pagelist);
 					inc_zone_page_state(old_page, NR_ISOLATED_ANON + !PageSwapBacked(old_page));
-					nr_shared_pages++;
-				}					
+					
+				}	
+				nr_shared_pages++;
 				put_page(old_page);
 			}
 		}
@@ -485,10 +493,15 @@ asmlinkage long sys_set_page_color(int cpu)
 //	}
 	
 	ret = 0;
+	if (cpu == -1)
+		node = 4;
+	else
+		node = cpu;
+	
 	if (!list_empty(&pagelist)) {
-		ret = migrate_pages(&pagelist, new_alloc_page, 4, MIGRATE_ASYNC, MR_SYSCALL);
+		ret = migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
+		TRACE_TASK(current, "%ld pages not migrated.\n", ret);
 		if (ret) {
-			printk(KERN_INFO "%ld pages not migrated.\n", ret);
 			putback_lru_pages(&pagelist);
 		}
 	}
@@ -507,7 +520,7 @@ asmlinkage long sys_set_page_color(int cpu)
 	up_read(&current->mm->mmap_sem);
 
 	list_for_each_entry(page_itr, &shared_pagelist, lru) {
-		printk(KERN_INFO "S Anon=%d, pfn = %lu, _mapcount = %d, _count = %d\n", PageAnon(page_itr), __page_to_pfn(page_itr), page_mapcount(page_itr), page_count(page_itr));
+		TRACE("S Anon=%d, pfn = %lu, _mapcount = %d, _count = %d\n", PageAnon(page_itr), __page_to_pfn(page_itr), page_mapcount(page_itr), page_count(page_itr));
 	}
 	
 /*	
@@ -517,7 +530,7 @@ asmlinkage long sys_set_page_color(int cpu)
 		kfree(task_page_itr);		
 	}
 */	
-	printk(KERN_INFO "nr_pages = %d\n", nr_pages);
+	TRACE_TASK(current, "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
 	return ret;
 }
 
@@ -888,6 +901,7 @@ static int __init _init_litmus(void)
 #endif
 	
 	color_mask = ((cache_info_sets << line_size_log) - 1) ^ (PAGE_SIZE - 1);
+	printk("Page color mask %08x\n", color_mask);
 	return 0;
 }
 
diff --git a/litmus/reservation.c b/litmus/reservation.c
index e30892c..b0b13a9 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -348,7 +348,7 @@ struct reservation* gmp_find_by_id(struct gmp_reservation_environment* gmp_env,
 	return NULL;
 }
 
-/*
+
 struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env,
 	unsigned int id)
 {
@@ -361,7 +361,7 @@ struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment
 
 	return NULL;
 }
-*/
+
 
 struct next_timer_event* gmp_find_event_by_time(struct gmp_reservation_environment* gmp_env,
 	lt_t when)
@@ -415,9 +415,9 @@ static void gmp_scheduler_update_at(
 */
 #define TIMER_RESOLUTION 100000L
 
-static void gmp_scheduler_update_at(
+static void gmp_add_event(
 	struct gmp_reservation_environment* gmp_env,
-	lt_t when)
+	lt_t when, unsigned int id, event_type_t type)
 {
 	struct next_timer_event *nevent, *queued;
 	struct list_head *pos;
@@ -426,11 +426,13 @@ static void gmp_scheduler_update_at(
 	//when = div64_u64(when, TIMER_RESOLUTION);
 	//when *= TIMER_RESOLUTION;
 	
-	nevent = gmp_find_event_by_time(gmp_env, when);
+	nevent = gmp_find_event_by_id(gmp_env, id);
 	
 	if (!nevent) {
-		nevent = kzalloc(sizeof(*nevent), GFP_KERNEL);
+		nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
 		nevent->next_update = when;
+		nevent->id = id;
+		nevent->type = type;
 		nevent->timer_armed_on = NO_CPU;
 		
 		list_for_each(pos, &gmp_env->next_events) {
@@ -448,14 +450,14 @@ static void gmp_scheduler_update_at(
 			TRACE("NEXT_EVENT ADDED at %llu ADDED at HEAD\n", nevent->next_update);
 		}
 	} else {
-		; //TRACE("EVENT FOUND at %llu, NEW EVENT %llu\n", nevent->next_update, when);
+		TRACE("EVENT FOUND type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->type, nevent->next_update, type, when);
 	}
 }
 
-void gmp_scheduler_update_after(
-	struct gmp_reservation_environment* gmp_env, lt_t timeout)
+void gmp_add_event_after(
+	struct gmp_reservation_environment* gmp_env, lt_t timeout, unsigned int id, event_type_t type)
 {
-	gmp_scheduler_update_at(gmp_env, gmp_env->env.current_time + timeout);
+	gmp_add_event(gmp_env, gmp_env->env.current_time + timeout, id, type);
 }
 
 static void gmp_queue_depleted(
@@ -468,7 +470,7 @@ static void gmp_queue_depleted(
 
 	list_for_each(pos, &gmp_env->depleted_reservations) {
 		queued = list_entry(pos, struct reservation, list);
-		if (queued->next_replenishment > res->next_replenishment) {
+		if (queued && queued->next_replenishment > res->next_replenishment) {
 			list_add(&res->list, pos->prev);
 			found = 1;
 		}
@@ -477,7 +479,7 @@ static void gmp_queue_depleted(
 	if (!found)
 		list_add_tail(&res->list, &gmp_env->depleted_reservations);
 
-	gmp_scheduler_update_at(gmp_env, res->next_replenishment);
+	gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
 }
 
 static void gmp_queue_active(
@@ -502,16 +504,20 @@ static void gmp_queue_active(
 		list_add_tail(&res->list, &gmp_env->active_reservations);
 
 	/* check for possible preemption */
-	if (res->state == RESERVATION_ACTIVE && !check_preempt)
-		gmp_env->schedule_now = true;
+	if (res->state == RESERVATION_ACTIVE && check_preempt)
+		gmp_env->schedule_now++;
 	
-	gmp_scheduler_update_after(gmp_env, res->cur_budget);
+	gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
 }
 
 static void gmp_queue_reservation(
 	struct gmp_reservation_environment* gmp_env,
 	struct reservation *res)
 {
+	if (res == NULL) {
+		BUG();
+		return;
+	}
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 			list_add(&res->list, &gmp_env->inactive_reservations);
@@ -572,7 +578,7 @@ static void gmp_charge_budget(
 			 * its remaining budget */
 			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n",
 				res->id, res->cur_budget);
-			 gmp_scheduler_update_after(gmp_env, res->cur_budget);
+			 gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
 		}
 		//if (encountered_active == 2)
 			/* stop at the first ACTIVE reservation */
@@ -601,26 +607,22 @@ static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 	res = list_first_entry_or_null(&gmp_env->depleted_reservations,
 		struct reservation, list);
 	if (res)
-		gmp_scheduler_update_at(gmp_env, res->next_replenishment);
+		gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
 }
 
 /* return schedule_now */
-bool gmp_update_time(
+int gmp_update_time(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t now)
 {
 	lt_t delta;
 
-	if (!gmp_env) {
-		TRACE("BUG****************************************\n");
-		return false;
-	}
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
 	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= gmp_env->env.current_time))
-		return gmp_env->schedule_now;
+		return min(gmp_env->schedule_now, NR_CPUS);
 
 	delta = now - gmp_env->env.current_time;
 	gmp_env->env.current_time = now;
@@ -634,7 +636,7 @@ bool gmp_update_time(
 	//TRACE("REPLENISH###\n");
 	gmp_replenish_budgets(gmp_env);
 	
-	return gmp_env->schedule_now;
+	return min(gmp_env->schedule_now, NR_CPUS);
 }
 
 static void gmp_res_change_state(
@@ -652,7 +654,7 @@ static void gmp_res_change_state(
 	list_del(&res->list);
 	/* check if we need to reschedule because we lost an active reservation */
 	if (res->state == RESERVATION_ACTIVE && !gmp_env->will_schedule)
-		gmp_env->schedule_now = true;
+		gmp_env->schedule_now++;
 	res->state = new_state;
 	gmp_queue_reservation(gmp_env, res);
 }
@@ -668,7 +670,7 @@ void gmp_init(struct gmp_reservation_environment* gmp_env)
 
 	gmp_env->env.change_state = gmp_res_change_state;
 
-	gmp_env->schedule_now = false;
+	gmp_env->schedule_now = 0;
 	gmp_env->will_schedule = false;
 	
 	raw_spin_lock_init(&gmp_env->lock);
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 6dee1ec..79fecd4 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -1,3 +1,12 @@
+/*
+ * litmus/sched_mc2.c
+ *
+ * Implementation of the Mixed-Criticality on MultiCore scheduler
+ *
+ * This plugin implements a scheduling algorithm proposed in
+ * "Mixed-Criticality Real-Time Scheduling for Multicore System" paper.
+ */ 
+ 
 #include <linux/percpu.h>
 #include <linux/slab.h>
 #include <asm/uaccess.h>
@@ -10,21 +19,29 @@
 #include <litmus/jobs.h>
 #include <litmus/budget.h>
 #include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
 
 #include <litmus/mc2_common.h>
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
+/* _global_env - reservation container for level-C tasks*/
 struct gmp_reservation_environment _global_env;
 
+/* cpu_entry - keep track of a running task on a cpu
+ * This state is used to decide the lowest priority cpu
+ */
 struct cpu_entry {
 	struct task_struct *scheduled;
 	lt_t deadline;
 	int cpu;
 	enum crit_level lv;
+	/* if will_schedule is true, this cpu is already selected and
+	   will call mc2_schedule() soon. */
 	bool will_schedule;
 };
 
+/* cpu_priority - a global state for choosing the lowest priority CPU */
 struct cpu_priority {
 	raw_spinlock_t lock;
 	struct cpu_entry cpu_entries[NR_CPUS];
@@ -32,19 +49,26 @@ struct cpu_priority {
 
 struct cpu_priority _lowest_prio_cpu;
 	
+/* mc2_task_state - a task state structure */
 struct mc2_task_state {
 	struct task_client res_info;
+	/* if cpu == -1, this task is a global task (level C) */
 	int cpu;
 	bool has_departed;
 	struct mc2_task mc2_param;
 };
 
+/* crit_entry - maintain the logically running job (ghost job) */
 struct crit_entry {
 	enum crit_level level;
 	struct task_struct *running;
 	//struct hrtimer ghost_timer;
 };
 
+/* mc2_cpu_state - maintain the scheduled state and ghost jobs
+ * timer : timer for partitioned tasks (level A and B)
+ * g_timer : timer for global tasks (level C)
+ */
 struct mc2_cpu_state {
 	raw_spinlock_t lock;
 
@@ -62,13 +86,22 @@ static DEFINE_PER_CPU(struct mc2_cpu_state, mc2_cpu_state);
 #define cpu_state_for(cpu_id)	(&per_cpu(mc2_cpu_state, cpu_id))
 #define local_cpu_state()	(&__get_cpu_var(mc2_cpu_state))
 
+/* get_mc2_state - get the task's state */
 static struct mc2_task_state* get_mc2_state(struct task_struct *tsk)
 {
-	return (struct mc2_task_state*) tsk_rt(tsk)->plugin_state;
+	struct mc2_task_state* tinfo;
+	/* View the task's plugin_state as this plugin's per-task state. */
+	tinfo = (struct mc2_task_state*)tsk_rt(tsk)->plugin_state;
+	/* NOTE(review): both branches yield tinfo's value (NULL if unset). */
+	if (tinfo)
+		return tinfo;
+	else
+		return NULL;
 }
+
+/* get_task_crit_level - return the criticality level of a task */
 static enum crit_level get_task_crit_level(struct task_struct *tsk)
 {
-	//struct mc2_task_state *tinfo = get_mc2_state(tsk);
 	struct mc2_task *mp;
 	
 	if (!tsk || !is_realtime(tsk))
@@ -82,7 +115,9 @@ static enum crit_level get_task_crit_level(struct task_struct *tsk)
 		return mp->crit;
 }
 
-static struct reservation* res_find_by_id(struct mc2_cpu_state *state, unsigned int id)
+/* res_find_by_id - find reservation by id */
+static struct reservation* res_find_by_id(struct mc2_cpu_state *state,
+                                          unsigned int id)
 {
 	struct reservation *res;
 
@@ -93,7 +128,12 @@ static struct reservation* res_find_by_id(struct mc2_cpu_state *state, unsigned
 	return res;
 }
 
-static void mc2_update_time(enum crit_level lv, struct mc2_cpu_state *state, lt_t time)
+/* mc2_update_time - update time for a given criticality level. 
+ *                   caller must hold a proper lock
+ *                   (cpu_state lock or global lock)
+ */
+static void mc2_update_time(enum crit_level lv, 
+                            struct mc2_cpu_state *state, lt_t time)
 {
 	if (lv < CRIT_LEVEL_C)
 		sup_update_time(&state->sup_env, time);
@@ -103,6 +143,12 @@ static void mc2_update_time(enum crit_level lv, struct mc2_cpu_state *state, lt_
 		TRACE("update_time(): Criticality level error!!!!\n");
 }
 
+/* task_departs - remove a task from its reservation
+ *                If the job completed with budget remaining, convert it
+ *                to a ghost job and update crit_entries[]
+ *
+ * @job_complete	indicates whether the job has completed
+ */
 static void task_departs(struct task_struct *tsk, int job_complete)
 {
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
@@ -110,28 +156,30 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 	struct reservation* res;
 	struct reservation_client *client;
 
+	BUG_ON(!is_realtime(tsk));
+	
 	res    = tinfo->res_info.client.reservation;
 	client = &tinfo->res_info.client;
 
 	res->ops->client_departs(res, client, job_complete);
 	tinfo->has_departed = true;
 	TRACE_TASK(tsk, "CLIENT DEPART with budget %llu\n", res->cur_budget);
+	
 	if (job_complete && res->cur_budget) {
 		struct crit_entry* ce;
 		enum crit_level lv = tinfo->mc2_param.crit;
-		//lt_t now = litmus_clock();
 		
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
 		res->is_ghost = 1;
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
-		//BUG_ON(hrtimer_active(&ce->ghost_timer));
-		//TRACE("setting GHOST timer %llu\n", ns_to_ktime(now + res->cur_budget));
-		//__hrtimer_start_range_ns(&ce->ghost_timer, ns_to_ktime(now + res->cur_budget), 0, HRTIMER_MODE_ABS_PINNED, 0);
 	}		
 }
 
+/* task_arrives - put a task into its reservation
+ *                If the job was a ghost job, remove it from crit_entries[]
+ */
 static void task_arrives(struct mc2_cpu_state *state, struct task_struct *tsk)
 {
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
@@ -145,15 +193,22 @@ static void task_arrives(struct mc2_cpu_state *state, struct task_struct *tsk)
 	tinfo->has_departed = false;
 	res->ops->client_arrives(res, client);
 	
+	sched_trace_task_release(tsk);
+	
 	if (lv != NUM_CRIT_LEVELS) {
 		struct crit_entry *ce;
 		ce = &state->crit_entries[lv];
+		/* if the current task is a ghost job, remove it */
 		if (ce->running == tsk)
 			ce->running = NULL;
 	}
 }
 
-/* return: NO_CPU - all CPUs are running tasks with higher priority than Level C */
+/* get_lowest_prio_cpu - return the lowest priority cpu
+ *                       This will be used for scheduling level-C tasks.
+ *                       If all CPUs are running tasks that have
+ *                       higher priority than level C, return NO_CPU.
+ */
 static int get_lowest_prio_cpu(void)
 {
 	struct cpu_entry *ce;
@@ -163,11 +218,15 @@ static int get_lowest_prio_cpu(void)
 	raw_spin_lock(&_lowest_prio_cpu.lock);
 	for_each_online_cpu(cpu) {
 		ce = &_lowest_prio_cpu.cpu_entries[cpu];
+		/* If a CPU will call schedule() in the near future, we don't
+		   return that CPU. */
 		if (!ce->will_schedule) {
 			if (!ce->scheduled) {
+				/* Idle cpu, return this. */
 				raw_spin_unlock(&_lowest_prio_cpu.lock);
 				return ce->cpu;
-			} else if (ce->lv == CRIT_LEVEL_C && ce->deadline > latest_deadline) {
+			} else if (ce->lv == CRIT_LEVEL_C && 
+			           ce->deadline > latest_deadline) {
 				latest_deadline = ce->deadline;
 				ret = ce->cpu;
 			}
@@ -180,6 +239,14 @@ static int get_lowest_prio_cpu(void)
 }
 
 /* NOTE: drops state->lock */
+/* mc2_update_timer_and_unlock - set a timer and g_timer and unlock 
+ *                               Whenever res_env.current_time is updated,
+ *                               we check next_scheduler_update and set 
+ *                               a timer.
+ *                               If there exist a global event which is 
+ *                               not armed on any CPU and g_timer is not
+ *                               active, set a g_timer for that event.
+ */
 static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 {
 	int local;
@@ -211,7 +278,8 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 		/* Reprogram only if not already set correctly. */
 		if (!hrtimer_active(&state->timer) ||
 		    ktime_to_ns(hrtimer_get_expires(&state->timer)) != update) {
-			TRACE("canceling timer...at %llu\n", ktime_to_ns(hrtimer_get_expires(&state->timer)));
+			TRACE("canceling timer...at %llu\n", 
+			      ktime_to_ns(hrtimer_get_expires(&state->timer)));
 			hrtimer_cancel(&state->timer);
 			TRACE("setting scheduler timer for %llu\n", update);
 			/* We cannot use hrtimer_start() here because the
@@ -246,7 +314,8 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	raw_spin_lock(&_global_env.lock);
 	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
 		if (event->timer_armed_on == NO_CPU) {
-			found_event = 1;
+			/* If the event time is already passed, we call schedule() on
+			   the lowest priority cpu */
 			if (event->next_update < litmus_clock()) {
 				int cpu = get_lowest_prio_cpu();
 				TRACE("GLOBAL EVENT PASSED!! poking CPU %d to reschedule\n", cpu);
@@ -260,7 +329,12 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 				}
 			} else if (!hrtimer_active(&state->g_timer)) {
 				int ret;
-				TRACE("setting global scheduler timer for %llu\n", event->next_update);
+			
+				raw_spin_unlock(&_global_env.lock);
+				found_event = 1;
+			
+				TRACE("setting global scheduler timer for %llu\n", 
+				       event->next_update);
 				ret = __hrtimer_start_range_ns(&state->g_timer,
 						ns_to_ktime(event->next_update),
 						0 /* timer coalescing slack */,
@@ -268,74 +342,76 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 						0 /* wakeup */);
 				if (!ret) {
 					event->timer_armed_on = state->cpu;
+					break;
 				}
 			}				
 		}
-	}	
-	raw_spin_unlock(&_global_env.lock);
+	}
+	if (found_event == 0)
+		raw_spin_unlock(&_global_env.lock);
 }
 
+/* mc2_update_ghost_state - Update crit_entries[] to track ghost jobs
+ *                          If the budget of a ghost is exhausted,
+ *                          clear is_ghost and reschedule
+ */
 static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 {
 	int lv = 0;
 	struct crit_entry* ce;
 	struct reservation *res;
 	struct mc2_task_state *tinfo;
+
+	BUG_ON(!state);
 	
 	for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
 		ce = &state->crit_entries[lv];
 		if (ce->running != NULL) {
 			tinfo = get_mc2_state(ce->running);
-			/*
-			if (lv != CRIT_LEVEL_C)
-				res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
-			else
+			if (!tinfo)
 				continue;
-			*/
+			
 			res = res_find_by_id(state, tinfo->mc2_param.res_id);
-			TRACE("LV %d running id %d budget %llu\n", lv, tinfo->mc2_param.res_id, res->cur_budget);
+			TRACE("LV %d running id %d budget %llu\n", 
+			       lv, tinfo->mc2_param.res_id, res->cur_budget);
+			/* If the budget is exhausted, clear is_ghost and reschedule */
 			if (!res->cur_budget) {
 				struct sup_reservation_environment* sup_env = &state->sup_env;
 				
-				TRACE("GHOST FINISH id %d at %llu\n", tinfo->mc2_param.res_id, litmus_clock());
+				TRACE("GHOST FINISH id %d at %llu\n", 
+				      tinfo->mc2_param.res_id, litmus_clock());
 				ce->running = NULL;
 				res->is_ghost = 0;
-				res = list_first_entry_or_null(&sup_env->active_reservations, struct reservation, list);
-				if (res)
-					litmus_reschedule_local();
+				
+				if (lv < CRIT_LEVEL_C) {
+					res = list_first_entry_or_null(
+					      &sup_env->active_reservations, 
+						  struct reservation, list);
+					if (res)
+						litmus_reschedule_local();
+				} else {
+					res = list_first_entry_or_null(
+					      &_global_env.active_reservations,
+						  struct reservation, list);
+					if (res)
+						litmus_reschedule(state->cpu);
+				}
 			}
 		}
 	}
 }			
 
-/*
-static enum hrtimer_restart on_ghost_timer(struct hrtimer *timer)
-{
-	struct crit_entry *ce;
-	struct mc2_cpu_state *state;
-	
-	ce = container_of(timer, struct crit_entry, ghost_timer);
-	state = container_of(ce, struct mc2_cpu_state, crit_entries[ce->level]);
-	
-	TRACE("GHOST_TIMER FIRED at %llu\n", litmus_clock());
-	
-	raw_spin_lock(&state->lock);
-	sup_update_time(&state->sup_env, litmus_clock());
-	mc2_update_ghost_state(state);
-	
-	raw_spin_unlock(&state->lock);
-	
-	return HRTIMER_NORESTART;
-}
-*/
-
+/* update_cpu_prio - Update cpu's priority
+ *                   When a cpu picks a new task, call this function
+ *                   to update cpu priorities.
+ */
 static void update_cpu_prio(struct mc2_cpu_state *state)
 {
 	struct cpu_entry *ce = &_lowest_prio_cpu.cpu_entries[state->cpu];
 	enum crit_level lv = get_task_crit_level(state->scheduled);
 	
 	if (!state->scheduled) {
-		// cpu is idle.
+		/* cpu is idle. */
 		ce->scheduled = NULL;
 		ce->deadline = ULLONG_MAX;
 		ce->lv = NUM_CRIT_LEVELS;
@@ -344,24 +420,31 @@ static void update_cpu_prio(struct mc2_cpu_state *state)
 		ce->deadline = get_deadline(state->scheduled);
 		ce->lv = lv;
 	} else if (lv < CRIT_LEVEL_C) {
+		/* If cpu is running level A or B tasks, it is not eligible
+		   to run level-C tasks */
 		ce->scheduled = state->scheduled;
 		ce->deadline = 0;
 		ce->lv = lv;
 	}
 };
 
+/* on_global_scheduling_timer - Process the budget accounting (replenish
+ *                              and charge)
+ */								
 static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
 	enum hrtimer_restart restart = HRTIMER_NORESTART;
 	struct mc2_cpu_state *state;
 	struct next_timer_event *event, *next;
-	bool schedule_now;
+	int schedule_now;
 	lt_t update, now;
 	int found_event = 0;
 
 	state = container_of(timer, struct mc2_cpu_state, g_timer);
 
+	raw_spin_lock_irqsave(&state->lock, flags);
+	
 	/* The scheduling timer should only fire on the local CPU, because
 	 * otherwise deadlocks via timer_cancel() are possible.
 	 * Note: this does not interfere with dedicated interrupt handling, as
@@ -372,11 +455,13 @@ static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
 	if (state->cpu != raw_smp_processor_id())
 		TRACE("BUG!!!!!!!!!!!!! TIMER FIRED ON THE OTHER CPU\n");
 
-	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&_global_env.lock);
 	
 	update = litmus_clock();
 	TRACE("GLOBAL TIMER FIRED at %llu\n", update);
 	
+	/* The event can be processed by the other cpus. So, if there is no 
+	   events to process, we do nothing */
 	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
 		if (event->next_update < update) {
 			found_event = 1;
@@ -387,22 +472,21 @@ static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
 	}			
 	
 	if (!found_event) {
-		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-		return restart;
+		goto unlock;
 	}
 	
+	/* gmp_update_timer returns how many tasks become ACTIVE */
 	schedule_now = gmp_update_time(&_global_env, update);
 	
-	raw_spin_lock(&state->lock);
 	mc2_update_ghost_state(state);
-	raw_spin_unlock(&state->lock);
 	
 	now = _global_env.env.current_time;
 	
 	TRACE_CUR("on_global_scheduling_timer at %llu, upd:%llu (for cpu=%d) SCHEDULE_NOW = %d\n",
 		now, update, state->cpu, schedule_now);
 
-	if (schedule_now) {
+	/* Find the lowest cpu, and call reschedule */
+	while (schedule_now--) {
 		int cpu = get_lowest_prio_cpu();
 		if (cpu != NO_CPU) {
 			raw_spin_lock(&_lowest_prio_cpu.lock);
@@ -413,11 +497,15 @@ static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
 		}
 	} 
 
-	raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-
+unlock:
+	raw_spin_unlock(&_global_env.lock);
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+	
 	return restart;
 }
 
+/* on_scheduling_timer - timer event for partitioned tasks
+ */                       
 static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 {
 	unsigned long flags;
@@ -438,8 +526,9 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	TRACE("TIMER FIRED at %llu\n", litmus_clock());
 	raw_spin_lock_irqsave(&state->lock, flags);
 	sup_update_time(&state->sup_env, litmus_clock());
+	raw_spin_lock(&_global_env.lock);
 	mc2_update_ghost_state(state);
-	
+	raw_spin_unlock(&_global_env.lock);
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
 
@@ -458,6 +547,8 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	return restart;
 }
 
+/* mc2_dispatch - Select the next task to schedule.
+ */
 struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, struct mc2_cpu_state* state)
 {
 	struct reservation *res, *next;
@@ -477,36 +568,38 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 				} else {
 					ce = &state->crit_entries[lv];
 					if (likely(!ce->running)) {
+						/* If we found the next task, clear all flags */
 						sup_scheduler_update_after(sup_env, res->cur_budget);
 						res->blocked_by_ghost = 0;
 						res->is_ghost = 0;
 						return tsk;
 					} else {
+						/* We cannot schedule the same criticality task
+						   because the ghost job exists. Set blocked_by_ghost
+						   flag not to charge budget */
 						res->blocked_by_ghost = 1;
 					}
 				}
 			}
 		}
 	}
-	// no level A or B tasks
 	
+	/* no eligible level A or B tasks exists */
 	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
 		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
 			tsk = res->ops->dispatch_client(res, &time_slice);
 			if (likely(tsk)) {
 				lv = get_task_crit_level(tsk);
 				if (lv == NUM_CRIT_LEVELS) {
-					gmp_scheduler_update_after(&_global_env, res->cur_budget);
-					//raw_spin_unlock(&_global_env.lock);
+					gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 					return tsk;
 				} else {
 					ce = &state->crit_entries[lv];
 					if (likely(!ce->running)) {
-						gmp_scheduler_update_after(&_global_env, res->cur_budget);
+						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 						res->blocked_by_ghost = 0;
 						res->is_ghost = 0;
 						res->scheduled_on = state->cpu;
-						//raw_spin_unlock(&_global_env.lock);
 						return tsk;
 					} else {
 						res->blocked_by_ghost = 1;
@@ -519,18 +612,43 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	return NULL;
 }
 
+/* not used now */
+static void pre_schedule(struct task_struct *prev)
+{
+	enum crit_level lv;
+	if (!is_realtime(prev) || !prev)
+		return;
+	
+	lv = get_task_crit_level(prev);
+}
+
+/* not used now */
+static void post_schedule(struct task_struct *next)
+{
+	enum crit_level lv;
+	if (!is_realtime(next) || !next)
+		return;
+	
+	lv = get_task_crit_level(next);
+}
+
+/* mc2_schedule - main scheduler function. pick the next task to run
+ */
 static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
 	struct mc2_cpu_state *state = local_cpu_state();
 	
+	pre_schedule(prev);
+	
 	raw_spin_lock(&_lowest_prio_cpu.lock);
 	if (_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule == true)
 		_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule = false;
 	raw_spin_unlock(&_lowest_prio_cpu.lock);
 	
 	raw_spin_lock(&state->lock);
-
+	raw_spin_lock(&_global_env.lock);
+	
 	//BUG_ON(state->scheduled && state->scheduled != prev);
 	//BUG_ON(state->scheduled && !is_realtime(prev));
 	if (state->scheduled && state->scheduled != prev)
@@ -540,19 +658,16 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 
 	/* update time */
 	state->sup_env.will_schedule = true;
-	//TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time ####\n");
+
 	sup_update_time(&state->sup_env, litmus_clock());
-	
-	raw_spin_lock(&_global_env.lock);
 	gmp_update_time(&_global_env, litmus_clock());
 	
-	//TRACE_TASK(prev, "MC2_SCHEDULE sup_update_time !!!!\n");
 	mc2_update_ghost_state(state);
 	
 	/* remove task from reservation if it blocks */
 	if (is_realtime(prev) && !is_running(prev))
 		task_departs(prev, is_completed(prev));
-
+	
 	/* figure out what to schedule next */
 	state->scheduled = mc2_dispatch(&state->sup_env, state);
 	if (state->scheduled && is_realtime(state->scheduled))
@@ -582,10 +697,10 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	}
 	if (state->scheduled) {
 		TRACE_TASK(state->scheduled, "scheduled.\n");
-		//tinfo = get_mc2_state(state->scheduled);
-		//state->run_level = tinfo->mc2_param.crit;
 	}
 	
+	post_schedule(state->scheduled);
+	
 	return state->scheduled;
 }
 
@@ -599,13 +714,15 @@ static void resume_legacy_task_model_updates(struct task_struct *tsk)
 		 * P-RES scheduler. */
 
 		now = litmus_clock();
-		if (is_tardy(tsk, now))
+		if (is_tardy(tsk, now)) {
 			release_at(tsk, now);
+			sched_trace_task_release(tsk);
+		}
 	}
 }
 
-/* Called when the state of tsk changes back to TASK_RUNNING.
- * We need to requeue the task.
+/* mc2_task_resume - Called when the state of tsk changes back to 
+ *                   TASK_RUNNING. We need to requeue the task.
  */
 static void mc2_task_resume(struct task_struct  *tsk)
 {
@@ -624,23 +741,25 @@ static void mc2_task_resume(struct task_struct  *tsk)
 	/* Requeue only if self-suspension was already processed. */
 	if (tinfo->has_departed)
 	{
+		raw_spin_lock(&_global_env.lock);
 		/* Assumption: litmus_clock() is synchronized across cores,
 		 * since we might not actually be executing on tinfo->cpu
 		 * at the moment. */
 		if (tinfo->cpu != -1) {
 			sup_update_time(&state->sup_env, litmus_clock());
 		} else {
-			raw_spin_lock(&_global_env.lock);
+			//raw_spin_lock(&_global_env.lock);
 			TRACE("RESUME UPDATE ####\n");
 			gmp_update_time(&_global_env, litmus_clock());
 			TRACE("RESUME UPDATE $$$$\n");
-			raw_spin_unlock(&_global_env.lock);
+			//raw_spin_unlock(&_global_env.lock);
 		}
 			
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE_TASK(tsk, "mc2_resume()\n");
+		raw_spin_unlock(&_global_env.lock);
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else {
@@ -651,7 +770,8 @@ static void mc2_task_resume(struct task_struct  *tsk)
 	resume_legacy_task_model_updates(tsk);
 }
 
-/* syscall backend for job completions */
+/* mc2_complete_job - syscall backend for job completions
+ */
 static long mc2_complete_job(void)
 {
 	ktime_t next_release;
@@ -662,6 +782,8 @@ static long mc2_complete_job(void)
 
 	tsk_rt(current)->completed = 1;
 	
+	/* If this the first job instance, we need to reset replenish
+	   time to the next release time */
 	if (tsk_rt(current)->sporadic_release) {
 		struct mc2_cpu_state *state;
 		struct reservation_environment *env;
@@ -670,21 +792,27 @@ static long mc2_complete_job(void)
 		unsigned long flags;
 
 		local_irq_save(flags);
-	
-		state = local_cpu_state();
-		env = &(state->sup_env.env);
+		
 		tinfo = get_mc2_state(current);
 		
-		res = res_find_by_id(state, tsk_rt(current)->mc2_data->res_id);
+		if (get_task_crit_level(current) < CRIT_LEVEL_C)
+			state = cpu_state_for(tinfo->cpu);
+		else
+			state = local_cpu_state();
+		
+		raw_spin_lock(&state->lock);
+		env = &(state->sup_env.env);
+		
+		res = res_find_by_id(state, tinfo->mc2_param.res_id);
 		
 		if (get_task_crit_level(current) < CRIT_LEVEL_C) {
-			raw_spin_lock(&state->lock);
 			env->time_zero = tsk_rt(current)->sporadic_release_time;
 		} else {
 			raw_spin_lock(&_global_env.lock);
 			_global_env.env.time_zero = tsk_rt(current)->sporadic_release_time;
 		}
 		
+		/* set next_replenishtime to synchronous release time */
 		res->next_replenishment = tsk_rt(current)->sporadic_release_time;
 		
 		if (get_task_crit_level(current) == CRIT_LEVEL_A) {
@@ -697,34 +825,44 @@ static long mc2_complete_job(void)
 		res->cur_budget = 0;
 		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
 		
-		//TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
-		if (get_task_crit_level(current) < CRIT_LEVEL_C) {
-			raw_spin_unlock(&state->lock);
-		} else {
+		TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+		if (get_task_crit_level(current) == CRIT_LEVEL_C) {
 			raw_spin_unlock(&_global_env.lock);
 		}
+		raw_spin_unlock(&state->lock);
 		
 		local_irq_restore(flags);
 	}
+	sched_trace_task_completion(current, 0);
 	
+	/* update the next release time and deadline */
 	prepare_for_next_period(current);
+	
 	next_release = ns_to_ktime(get_release(current));
 	preempt_disable();
 	TRACE_CUR("next_release=%llu\n", get_release(current));
 	if (get_release(current) > litmus_clock()) {
+		/* sleep until next_release */
 		set_current_state(TASK_INTERRUPTIBLE);
 		preempt_enable_no_resched();
 		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
+		if (get_task_crit_level(current) == CRIT_LEVEL_A)
+			sched_trace_task_release(current);
 	} else {
+		/* release the next job immediately */
 		err = 0;
 		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
 		preempt_enable();
+		if (get_task_crit_level(current) == CRIT_LEVEL_A)
+			sched_trace_task_release(current);
 	}
 
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
 	return err;
 }
 
+/* mc2_admit_task - Setup mc2 task parameters
+ */
 static long mc2_admit_task(struct task_struct *tsk)
 {
 	long err = -ESRCH;
@@ -752,15 +890,13 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		res = sup_find_by_id(&state->sup_env, mp->res_id);
 
-		/* found the appropriate reservation (or vCPU) */
+		/* found the appropriate reservation */
 		if (res) {
 			TRACE_TASK(tsk, "SUP FOUND RES ID\n");
 			tinfo->mc2_param.crit = mp->crit;
 			tinfo->mc2_param.res_id = mp->res_id;
-			
-			//kfree(tsk_rt(tsk)->plugin_state);
-			//tsk_rt(tsk)->plugin_state = NULL;
-			
+		
+			/* initial values */
 			err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
 			tinfo->cpu = task_cpu(tsk);
 			tinfo->has_departed = true;
@@ -772,7 +908,10 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 	} else if (lv == CRIT_LEVEL_C) {
-		raw_spin_lock_irqsave(&_global_env.lock, flags);
+		local_irq_save(flags);
+		state = local_cpu_state();
+		raw_spin_lock(&state->lock);
+		raw_spin_lock(&_global_env.lock);
 		
 		res = gmp_find_by_id(&_global_env, mp->res_id);
 
@@ -782,9 +921,7 @@ static long mc2_admit_task(struct task_struct *tsk)
 			tinfo->mc2_param.crit = mp->crit;
 			tinfo->mc2_param.res_id = mp->res_id;
 			
-			//kfree(tsk_rt(tsk)->plugin_state);
-			//tsk_rt(tsk)->plugin_state = NULL;
-			
+			/* initial values */
 			err = mc2_task_client_init(&tinfo->res_info, &tinfo->mc2_param, tsk, res);
 			tinfo->cpu = -1;
 			tinfo->has_departed = true;
@@ -794,8 +931,9 @@ static long mc2_admit_task(struct task_struct *tsk)
 			tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
 		}
 
-		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-		
+		raw_spin_unlock(&_global_env.lock);
+		raw_spin_unlock(&state->lock);
+		local_irq_restore(flags);
 	}
 	
 	preempt_enable();
@@ -806,6 +944,9 @@ static long mc2_admit_task(struct task_struct *tsk)
 	return err;
 }
 
+/* mc2_task_new - A new real-time job is arrived. Release the next job
+ *                at the next reservation replenish time
+ */
 static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 			  int is_running)
 {
@@ -837,11 +978,12 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		 * [see comment in pres_task_resume()] */
 		raw_spin_lock(&_global_env.lock);
 		mc2_update_time(lv, state, litmus_clock());
-		raw_spin_unlock(&_global_env.lock);
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE("mc2_new()\n");
+		raw_spin_unlock(&_global_env.lock);
+		
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
@@ -857,6 +999,8 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		TRACE_TASK(tsk, "mc2_task_new() next_replenishment = NULL\n");
 }
 
+/* mc2_reservation_destroy - reservation_destroy system call backend
+ */
 static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 {
 	long ret = -EINVAL;
@@ -865,8 +1009,13 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	struct sup_reservation_environment *sup_env;
 	int found = 0;
 	enum crit_level lv = get_task_crit_level(current);
+	unsigned long flags;
 	
 	if (cpu == -1) {
+		/* if the reservation is global reservation */
+		local_irq_save(flags);
+		state = local_cpu_state();
+		raw_spin_lock(&state->lock);
 		raw_spin_lock(&_global_env.lock);
 	
 		list_for_each_entry_safe(res, next, &_global_env.depleted_reservations, list) {
@@ -901,34 +1050,16 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 			}
 		}
 
-/*		
-list_for_each_entry(res, &_global_env.depleted_reservations, list) {
-	TRACE("DEPLETED LIST R%d\n", res->id);
-}
-list_for_each_entry(res, &_global_env.inactive_reservations, list) {
-	TRACE("INACTIVE LIST R%d\n", res->id);
-}
-list_for_each_entry(res, &_global_env.active_reservations, list) {
-	TRACE("ACTIVE LIST R%d\n", res->id);
-}
-*/
-		if (list_empty(&_global_env.active_reservations)) 
-			INIT_LIST_HEAD(&_global_env.active_reservations);
-		if (list_empty(&_global_env.depleted_reservations)) 
-			INIT_LIST_HEAD(&_global_env.depleted_reservations);
-		if (list_empty(&_global_env.inactive_reservations)) 
-			INIT_LIST_HEAD(&_global_env.inactive_reservations);
-		if (list_empty(&_global_env.next_events)) 
-			INIT_LIST_HEAD(&_global_env.next_events);
-		
 		raw_spin_unlock(&_global_env.lock);
+		raw_spin_unlock(&state->lock);
+		local_irq_restore(flags);
 	} else {
+		/* if the reservation is partitioned reservation */
 		state = cpu_state_for(cpu);
-		raw_spin_lock(&state->lock);
+		raw_spin_lock_irqsave(&state->lock, flags);
 		
 	//	res = sup_find_by_id(&state->sup_env, reservation_id);
 		sup_env = &state->sup_env;
-		//if (!res) {
 		list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
 			if (res->id == reservation_id) {
 				if (lv == CRIT_LEVEL_A) {
@@ -972,15 +1103,16 @@ list_for_each_entry(res, &_global_env.active_reservations, list) {
 				}
 			}
 		}
-		//}
 
-		raw_spin_unlock(&state->lock);
+		raw_spin_unlock_irqrestore(&state->lock, flags);
 	}
 	
 	TRACE("RESERVATION_DESTROY ret = %d\n", ret);
 	return ret;
 }
 
+/* mc2_task_exit - Task became a normal task (not real-time task)
+ */
 static void mc2_task_exit(struct task_struct *tsk)
 {
 	unsigned long flags;
@@ -1007,42 +1139,30 @@ static void mc2_task_exit(struct task_struct *tsk)
 	if (is_running(tsk)) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
-		//if (lv < CRIT_LEVEL_C)
-		//	sup_update_time(&state->sup_env, litmus_clock());
+		
 		raw_spin_lock(&_global_env.lock);
+		/* update both global and partitioned */
 		mc2_update_time(lv, state, litmus_clock());
-		raw_spin_unlock(&_global_env.lock);
+		
 		mc2_update_ghost_state(state);
 		task_departs(tsk, 0);
 		
 		/* NOTE: drops state->lock */
 		TRACE("mc2_exit()\n");
+		raw_spin_unlock(&_global_env.lock);
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
 		raw_spin_unlock_irqrestore(&state->lock, flags);
-/*
-	if (tinfo->mc2_param.crit == CRIT_LEVEL_A) {
-		struct table_driven_reservation *td_res;
-		struct reservation *res;
-		res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
-		td_res = container_of(res, struct table_driven_reservation, res);
-		kfree(td_res->intervals);
-		//kfree(td_res);
-	} else if (tinfo->mc2_param.crit == CRIT_LEVEL_B) {
-		struct polling_reservation *pres;
-		struct reservation *res;
-		res = sup_find_by_id(&state->sup_env, tinfo->mc2_param.res_id);
-		pres = container_of(res, struct polling_reservation, res);
-		kfree(pres);
-	}
-*/
+
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
 	kfree(tsk_rt(tsk)->mc2_data);
 	tsk_rt(tsk)->mc2_data = NULL;
 }
 
+/* create_polling_reservation - create a new polling reservation
+ */
 static long create_polling_reservation(
 	int res_type,
 	struct reservation_config *config)
@@ -1055,6 +1175,7 @@ static long create_polling_reservation(
 	int periodic =  res_type == PERIODIC_POLLING;
 	long err = -EINVAL;
 
+	/* sanity checks */
 	if (config->polling_params.budget >
 	    config->polling_params.period) {
 		printk(KERN_ERR "invalid polling reservation (%u): "
@@ -1138,6 +1259,8 @@ static long create_polling_reservation(
 
 #define MAX_INTERVALS 1024
 
+/* create_table_driven_reservation - create a table_driven reservation
+ */
 static long create_table_driven_reservation(
 	struct reservation_config *config)
 {
@@ -1238,6 +1361,8 @@ static long create_table_driven_reservation(
 	return err;
 }
 
+/* mc2_reservation_create - reservation_create system call backend
+ */
 static long mc2_reservation_create(int res_type, void* __user _config)
 {
 	long ret = -EINVAL;
-- 
1.8.1.2


From f34d9982907644ade66b8689460cf0f414e88ce7 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Thu, 26 Feb 2015 10:10:13 -0500
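Subject: [PATCH 090/119] seems working

Reference fec_pm_ops from the FEC platform driver only when CONFIG_PM is
set, and comment out the BUG() that td_client_departs() hit for
reservations in the INACTIVE or ACTIVE_IDLE state.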
---
 drivers/net/ethernet/freescale/fec_main.c | 2 ++
 litmus/polling_reservations.c             | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 7b95428..a05b372 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3390,7 +3390,9 @@ static struct platform_driver fec_driver = {
 	.driver	= {
 		.name	= DRIVER_NAME,
 		.owner	= THIS_MODULE,
+#ifdef CONFIG_PM
 		.pm	= &fec_pm_ops,
+#endif
 		.of_match_table = fec_dt_ids,
 	},
 	.id_table = fec_devtype,
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index d2c54c4..a3125eb 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -363,7 +363,7 @@ static void td_client_departs(
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 		case RESERVATION_ACTIVE_IDLE:
-			BUG(); /* INACTIVE or IDLE <=> no client */
+			//BUG(); /* INACTIVE or IDLE <=> no client */
 			break;
 
 		case RESERVATION_ACTIVE:
-- 
1.8.1.2


From 0a62a98d4cbd2f1cb0ecee6669f708a3e83afcb3 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sun, 1 Mar 2015 17:58:29 -0500
Subject: [PATCH 091/119] MC2 scheduling infrastructure

---
 include/litmus/reservation.h |   2 +
 litmus/reservation.c         |  78 +++++++++++++----
 litmus/sched_mc2.c           | 196 ++++++++++++++++++++++++++++++-------------
 3 files changed, 204 insertions(+), 72 deletions(-)

diff --git a/include/litmus/reservation.h b/include/litmus/reservation.h
index 0b9c08d..7e022b3 100644
--- a/include/litmus/reservation.h
+++ b/include/litmus/reservation.h
@@ -129,6 +129,7 @@ struct reservation {
 	
 	/* for global env. */
 	int scheduled_on;
+	int event_added;
 	/* for blocked by ghost. Do not charge budget when ACTIVE */
 	int blocked_by_ghost;
 	/* ghost_job. If it is clear, do not charge budget when ACTIVE_IDLE */
@@ -244,6 +245,7 @@ void gmp_add_new_reservation(struct gmp_reservation_environment* gmp_env,
 	struct reservation* new_res);
 void gmp_add_event_after(struct gmp_reservation_environment* gmp_env,
 	lt_t timeout, unsigned int id, event_type_t type);
+void gmp_print_events(struct gmp_reservation_environment* gmp_env, lt_t now);
 int gmp_update_time(struct gmp_reservation_environment* gmp_env, lt_t now);
 struct task_struct* gmp_dispatch(struct gmp_reservation_environment* gmp_env);
 struct next_timer_event* gmp_find_event_by_id(struct gmp_reservation_environment* gmp_env, unsigned int id);
diff --git a/litmus/reservation.c b/litmus/reservation.c
index b0b13a9..3ec18a2 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -428,7 +428,7 @@ static void gmp_add_event(
 	
 	nevent = gmp_find_event_by_id(gmp_env, id);
 	
-	if (!nevent) {
+	if (!nevent || nevent->type != type) {
 		nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
 		nevent->next_update = when;
 		nevent->id = id;
@@ -440,18 +440,22 @@ static void gmp_add_event(
 			if (queued->next_update > nevent->next_update) {
 				list_add(&nevent->list, pos->prev);
 				found = 1;
-				TRACE("NEXT_EVENT at %llu ADDED before %llu\n", nevent->next_update, queued->next_update);
+				TRACE("NEXT_EVENT id=%d type=%d update=%llu ADDED at before %llu\n", nevent->id, nevent->type, nevent->next_update, queued->next_update);
 				break;
 			}
 		}
 		
 		if (!found) {
 			list_add_tail(&nevent->list, &gmp_env->next_events);
-			TRACE("NEXT_EVENT ADDED at %llu ADDED at HEAD\n", nevent->next_update);
+			TRACE("NEXT_EVENT id=%d type=%d update=%llu ADDED at TAIL\n", nevent->id, nevent->type, nevent->next_update);
 		}
 	} else {
-		TRACE("EVENT FOUND type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->type, nevent->next_update, type, when);
+		TRACE("EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
 	}
+	
+	TRACE("======START PRINTING EVENT LIST======\n");
+	gmp_print_events(gmp_env, litmus_clock());
+	TRACE("======FINISH PRINTING EVENT LIST======\n");
 }
 
 void gmp_add_event_after(
@@ -478,8 +482,9 @@ static void gmp_queue_depleted(
 
 	if (!found)
 		list_add_tail(&res->list, &gmp_env->depleted_reservations);
-
+	TRACE("R%d queued to depleted_list\n", res->id);
 	gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
+	res->event_added = 1;
 }
 
 static void gmp_queue_active(
@@ -508,6 +513,7 @@ static void gmp_queue_active(
 		gmp_env->schedule_now++;
 	
 	gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
+	res->event_added = 1;
 }
 
 static void gmp_queue_reservation(
@@ -554,12 +560,14 @@ static void gmp_charge_budget(
 		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
-			TRACE("gmp_charge_budget ACTIVE R%u drain %llu\n", res->id, delta);
+			TRACE("gmp_charge_budget ACTIVE R%u scheduled_on=%d drain %llu\n", res->id, res->scheduled_on, delta);
 			if (res->scheduled_on != NO_CPU && res->blocked_by_ghost == 0) {
 				TRACE("DRAIN !!\n");
 				drained = 1;
 				res->ops->drain_budget(res, delta);
-			}			
+			} else {
+				TRACE("NO DRAIN (not scheduled)!!\n");
+			}
 		} else {
 			//BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
 			if (res->state != RESERVATION_ACTIVE_IDLE)
@@ -579,6 +587,7 @@ static void gmp_charge_budget(
 			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n",
 				res->id, res->cur_budget);
 			 gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
+			 res->event_added = 1;
 		}
 		//if (encountered_active == 2)
 			/* stop at the first ACTIVE reservation */
@@ -601,33 +610,49 @@ static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 			break;
 		}
 	}
-	//TRACE("finished replenishing budgets\n");
+	TRACE("finished replenishing budgets\n");
 
 	/* request a scheduler update at the next replenishment instant */
+	list_for_each_safe(pos, next, &gmp_env->depleted_reservations) {
+		res = list_entry(pos, struct reservation, list);
+		if (res->event_added == 0) {
+			gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
+			res->event_added = 1;
+		}
+	}	
+	
+/*
 	res = list_first_entry_or_null(&gmp_env->depleted_reservations,
 		struct reservation, list);
-	if (res)
+	if (res && res->event_added == 0) {
 		gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
+		res->event_added = 1;
+	}
+*/
 }
 
+#define EPSILON	50
+
 /* return schedule_now */
 int gmp_update_time(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t now)
 {
-	lt_t delta;
+	struct next_timer_event *event, *next;
+	lt_t delta, ret;
 
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
-	if (unlikely(now <= gmp_env->env.current_time))
-		return min(gmp_env->schedule_now, NR_CPUS);
+	TRACE("(gmp_update_time) now: %llu, current_time: %llu\n", now, gmp_env->env.current_time);
+	if (unlikely(now <= gmp_env->env.current_time + EPSILON))
+		return 0;
 
 	delta = now - gmp_env->env.current_time;
 	gmp_env->env.current_time = now;
 
 
+	//gmp_print_events(gmp_env, now);
 	/* deplete budgets by passage of time */
 	//TRACE("CHARGE###\n");
 	gmp_charge_budget(gmp_env, delta);
@@ -636,7 +661,30 @@ int gmp_update_time(
 	//TRACE("REPLENISH###\n");
 	gmp_replenish_budgets(gmp_env);
 	
-	return min(gmp_env->schedule_now, NR_CPUS);
+	list_for_each_entry_safe(event, next, &gmp_env->next_events, list) {
+		if (event->next_update < now) {
+			list_del(&event->list);
+			TRACE("EVENT at %llu IS DELETED\n", event->next_update);
+			kfree(event);
+		}
+	}		
+	
+	//gmp_print_events(gmp_env, litmus_clock());
+	
+	ret = min(gmp_env->schedule_now, NR_CPUS);
+	gmp_env->schedule_now = 0;
+	
+	return ret;
+}
+
+void gmp_print_events(struct gmp_reservation_environment* gmp_env, lt_t now)
+{
+	struct next_timer_event *event, *next;
+
+	TRACE("GLOBAL EVENTS now=%llu\n", now);
+	list_for_each_entry_safe(event, next, &gmp_env->next_events, list) {
+		TRACE("at %llu type=%d id=%d armed_on=%d\n", event->next_update, event->type, event->id, event->timer_armed_on);
+	}		
 }
 
 static void gmp_res_change_state(
@@ -653,7 +701,7 @@ static void gmp_res_change_state(
 
 	list_del(&res->list);
 	/* check if we need to reschedule because we lost an active reservation */
-	if (res->state == RESERVATION_ACTIVE && !gmp_env->will_schedule)
+	if (res->state == RESERVATION_ACTIVE)
 		gmp_env->schedule_now++;
 	res->state = new_state;
 	gmp_queue_reservation(gmp_env, res);
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 79fecd4..b3390dc 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -74,7 +74,7 @@ struct mc2_cpu_state {
 
 	struct sup_reservation_environment sup_env;
 	struct hrtimer timer;
-	struct hrtimer g_timer;
+	//struct hrtimer g_timer;
 
 	int cpu;
 	struct task_struct* scheduled;
@@ -209,21 +209,33 @@ static void task_arrives(struct mc2_cpu_state *state, struct task_struct *tsk)
  *                       If all CPUs are running tasks which has
  *                       higher priority than level C, return NO_CPU.
  */
-static int get_lowest_prio_cpu(void)
+static int get_lowest_prio_cpu(lt_t priority)
 {
 	struct cpu_entry *ce;
 	int cpu, ret = NO_CPU;
 	lt_t latest_deadline = 0;
 	
 	raw_spin_lock(&_lowest_prio_cpu.lock);
+	ce = &_lowest_prio_cpu.cpu_entries[local_cpu_state()->cpu];
+	if (!ce->will_schedule && !ce->scheduled) {
+		raw_spin_unlock(&_lowest_prio_cpu.lock);
+		TRACE("CPU %d (local) is the lowest!\n", ce->cpu);
+		return ce->cpu;
+	}	
+
 	for_each_online_cpu(cpu) {
 		ce = &_lowest_prio_cpu.cpu_entries[cpu];
 		/* If a CPU will call schedule() in the near future, we don't
 		   return that CPU. */
+		TRACE("CPU %d will_schedule=%d, scheduled=(%s/%d:%d)\n", cpu, ce->will_schedule,
+	      ce->scheduled ? (ce->scheduled)->comm : "null",
+	      ce->scheduled ? (ce->scheduled)->pid : 0,
+	      ce->scheduled ? (ce->scheduled)->rt_param.job_params.job_no : 0);
 		if (!ce->will_schedule) {
 			if (!ce->scheduled) {
 				/* Idle cpu, return this. */
 				raw_spin_unlock(&_lowest_prio_cpu.lock);
+				TRACE("CPU %d is the lowest!\n", ce->cpu);
 				return ce->cpu;
 			} else if (ce->lv == CRIT_LEVEL_C && 
 			           ce->deadline > latest_deadline) {
@@ -234,7 +246,12 @@ static int get_lowest_prio_cpu(void)
 	}		
 	
 	raw_spin_unlock(&_lowest_prio_cpu.lock);
+
+	if (priority >= latest_deadline)
+		ret = NO_CPU;
 	
+	TRACE("CPU %d is the lowest!\n", ret);
+
 	return ret;
 }
 
@@ -253,7 +270,6 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	lt_t update, now;
 	enum crit_level lv = get_task_crit_level(state->scheduled);
 	struct next_timer_event *event, *next;
-	int found_event = 0;
 	
 	//TRACE_TASK(state->scheduled, "update_timer!\n");
 	if (lv != NUM_CRIT_LEVELS)
@@ -268,10 +284,35 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	 */
 	local = local_cpu_state() == state;
 
+	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
+		/* If the event time is already passed, we call schedule() on
+		   the lowest priority cpu */
+		if (event->next_update < litmus_clock()) {
+			if (event->timer_armed_on == NO_CPU) {
+				struct reservation *res = gmp_find_by_id(&_global_env, event->id);
+				int cpu = get_lowest_prio_cpu(res?res->priority:0);
+				TRACE("GLOBAL EVENT PASSED!! poking CPU %d to reschedule\n", cpu);
+				list_del(&event->list);
+				kfree(event);
+				if (cpu != NO_CPU) {
+					raw_spin_lock(&_lowest_prio_cpu.lock);
+					_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+					raw_spin_unlock(&_lowest_prio_cpu.lock);
+					litmus_reschedule(cpu);
+				}
+			}
+		} else if (event->next_update < update && event->timer_armed_on == NO_CPU) {
+			event->timer_armed_on = state->cpu;
+			update = event->next_update;
+			break;
+		}
+	}
+	
 	/* Must drop state lock before calling into hrtimer_start(), which
 	 * may raise a softirq, which in turn may wake ksoftirqd. */
 	raw_spin_unlock(&state->lock);
-
+	raw_spin_unlock(&_global_env.lock);
+	
 	if (update <= now) {
 		litmus_reschedule(state->cpu);
 	} else if (likely(local && update != SUP_NO_SCHEDULER_UPDATE)) {
@@ -310,7 +351,8 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 			litmus_reschedule(state->cpu);
 		}
 	}
-	
+
+#if 0	
 	raw_spin_lock(&_global_env.lock);
 	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
 		if (event->timer_armed_on == NO_CPU) {
@@ -349,6 +391,7 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	}
 	if (found_event == 0)
 		raw_spin_unlock(&_global_env.lock);
+#endif	
 }
 
 /* mc2_update_ghost_state - Update crit_entries[] to track ghost jobs
@@ -396,6 +439,9 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 					if (res)
 						litmus_reschedule(state->cpu);
 				}
+			} else {
+				TRACE("GHOST NOT FINISH id %d budget %llu\n", res->id, res->cur_budget);
+				gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 			}
 		}
 	}
@@ -428,6 +474,7 @@ static void update_cpu_prio(struct mc2_cpu_state *state)
 	}
 };
 
+#if 0
 /* on_global_scheduling_timer - Process the budget accounting (replenish
  *                              and charge)
  */								
@@ -503,6 +550,7 @@ unlock:
 	
 	return restart;
 }
+#endif
 
 /* on_scheduling_timer - timer event for partitioned tasks
  */                       
@@ -512,6 +560,7 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	enum hrtimer_restart restart = HRTIMER_NORESTART;
 	struct mc2_cpu_state *state;
 	lt_t update, now;
+	int global_schedule_now;
 
 	state = container_of(timer, struct mc2_cpu_state, timer);
 
@@ -524,16 +573,19 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	BUG_ON(state->cpu != raw_smp_processor_id());
 
 	TRACE("TIMER FIRED at %llu\n", litmus_clock());
-	raw_spin_lock_irqsave(&state->lock, flags);
+	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&state->lock);
+	
 	sup_update_time(&state->sup_env, litmus_clock());
-	raw_spin_lock(&_global_env.lock);
+	global_schedule_now = gmp_update_time(&_global_env, litmus_clock());
+	
 	mc2_update_ghost_state(state);
-	raw_spin_unlock(&_global_env.lock);
+	
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
 
-	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d)\n",
-		now, update, state->cpu);
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n",
+		now, update, state->cpu, global_schedule_now);
 
 	if (update <= now) {
 		litmus_reschedule_local();
@@ -542,7 +594,20 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 		restart = HRTIMER_RESTART;
 	}
 
-	raw_spin_unlock_irqrestore(&state->lock, flags);
+	/* Find the lowest cpu, and call reschedule */
+	while (global_schedule_now--) {
+		int cpu = get_lowest_prio_cpu(0);
+		if (cpu != NO_CPU) {
+			raw_spin_lock(&_lowest_prio_cpu.lock);
+			_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+			raw_spin_unlock(&_lowest_prio_cpu.lock);
+			TRACE("LOWEST CPU = P%d\n", cpu);
+			litmus_reschedule(cpu);
+		}
+	} 
+	
+	raw_spin_unlock(&state->lock);
+	raw_spin_unlock_irqrestore(&_global_env.lock, flags);
 
 	return restart;
 }
@@ -555,7 +620,7 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	struct task_struct *tsk = NULL;
 	struct crit_entry *ce;
 	enum crit_level lv;
-	lt_t time_slice;
+	lt_t time_slice, cur_priority;
 
 	list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
 		if (res->state == RESERVATION_ACTIVE) {
@@ -578,6 +643,7 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 						   because the ghost job exists. Set blocked_by_ghost
 						   flag not to charge budget */
 						res->blocked_by_ghost = 1;
+						TRACE_TASK(ce->running, " is GHOST\n");
 					}
 				}
 			}
@@ -585,24 +651,32 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	}
 	
 	/* no eligible level A or B tasks exists */
+	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
+	
+	TRACE("****** ACTIVE LIST ******\n");
+	TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
 	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
+		TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
 		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
 			tsk = res->ops->dispatch_client(res, &time_slice);
 			if (likely(tsk)) {
 				lv = get_task_crit_level(tsk);
 				if (lv == NUM_CRIT_LEVELS) {
 					gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+					res->event_added = 1;
 					return tsk;
 				} else {
 					ce = &state->crit_entries[lv];
 					if (likely(!ce->running)) {
 						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+						res->event_added = 1;
 						res->blocked_by_ghost = 0;
 						res->is_ghost = 0;
 						res->scheduled_on = state->cpu;
 						return tsk;
 					} else {
 						res->blocked_by_ghost = 1;
+						TRACE_TASK(ce->running, " is GHOST\n");
 					}
 				}
 			}
@@ -641,13 +715,8 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	
 	pre_schedule(prev);
 	
-	raw_spin_lock(&_lowest_prio_cpu.lock);
-	if (_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule == true)
-		_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule = false;
-	raw_spin_unlock(&_lowest_prio_cpu.lock);
-	
-	raw_spin_lock(&state->lock);
 	raw_spin_lock(&_global_env.lock);
+	raw_spin_lock(&state->lock);
 	
 	//BUG_ON(state->scheduled && state->scheduled != prev);
 	//BUG_ON(state->scheduled && !is_realtime(prev));
@@ -668,12 +737,14 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	if (is_realtime(prev) && !is_running(prev))
 		task_departs(prev, is_completed(prev));
 	
+	raw_spin_lock(&_lowest_prio_cpu.lock);
+	_lowest_prio_cpu.cpu_entries[state->cpu].will_schedule = false;
+	
 	/* figure out what to schedule next */
 	state->scheduled = mc2_dispatch(&state->sup_env, state);
 	if (state->scheduled && is_realtime(state->scheduled))
 		TRACE_TASK(state->scheduled, "mc2_dispatch picked me!\n");
 	
-	raw_spin_lock(&_lowest_prio_cpu.lock);
 	update_cpu_prio(state);
 	raw_spin_unlock(&_lowest_prio_cpu.lock);
 	
@@ -682,18 +753,29 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 
 	/* program scheduler timer */
 	state->sup_env.will_schedule = false;
-	
-	raw_spin_unlock(&_global_env.lock);
-	
+		
 	/* NOTE: drops state->lock */
 	mc2_update_timer_and_unlock(state);
 
+
+	
 	if (prev != state->scheduled && is_realtime(prev)) {
 		struct mc2_task_state* tinfo = get_mc2_state(prev);
 		struct reservation* res = tinfo->res_info.client.reservation;
 		TRACE_TASK(prev, "PREV JOB scheduled_on = P%d\n", res->scheduled_on);
 		res->scheduled_on = NO_CPU;
 		TRACE_TASK(prev, "descheduled.\n");
+		/* if prev is preempted and a global task, find the lowest cpu and reschedule */
+		if (tinfo->has_departed == false && get_task_crit_level(prev) == CRIT_LEVEL_C) {
+			int cpu = get_lowest_prio_cpu(res?res->priority:0);
+			TRACE("LEVEL-C TASK PREEMPTED!! poking CPU %d to reschedule\n", cpu);
+			if (cpu != NO_CPU) {
+				raw_spin_lock(&_lowest_prio_cpu.lock);
+				_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+				raw_spin_unlock(&_lowest_prio_cpu.lock);
+				litmus_reschedule(cpu);
+			}
+		}
 	}
 	if (state->scheduled) {
 		TRACE_TASK(state->scheduled, "scheduled.\n");
@@ -737,34 +819,31 @@ static void mc2_task_resume(struct task_struct  *tsk)
 	else
 		state = local_cpu_state();
 
-	raw_spin_lock_irqsave(&state->lock, flags);
+	raw_spin_lock_irqsave(&_global_env.lock, flags);
 	/* Requeue only if self-suspension was already processed. */
 	if (tinfo->has_departed)
 	{
-		raw_spin_lock(&_global_env.lock);
+		raw_spin_lock(&state->lock);
 		/* Assumption: litmus_clock() is synchronized across cores,
 		 * since we might not actually be executing on tinfo->cpu
 		 * at the moment. */
 		if (tinfo->cpu != -1) {
 			sup_update_time(&state->sup_env, litmus_clock());
 		} else {
-			//raw_spin_lock(&_global_env.lock);
 			TRACE("RESUME UPDATE ####\n");
 			gmp_update_time(&_global_env, litmus_clock());
 			TRACE("RESUME UPDATE $$$$\n");
-			//raw_spin_unlock(&_global_env.lock);
 		}
 			
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE_TASK(tsk, "mc2_resume()\n");
-		raw_spin_unlock(&_global_env.lock);
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else {
 		TRACE_TASK(tsk, "resume event ignored, still scheduled\n");
-		raw_spin_unlock_irqrestore(&state->lock, flags);
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
 	}
 
 	resume_legacy_task_model_updates(tsk);
@@ -800,7 +879,9 @@ static long mc2_complete_job(void)
 		else
 			state = local_cpu_state();
 		
+		raw_spin_lock(&_global_env.lock);
 		raw_spin_lock(&state->lock);
+
 		env = &(state->sup_env.env);
 		
 		res = res_find_by_id(state, tinfo->mc2_param.res_id);
@@ -808,7 +889,6 @@ static long mc2_complete_job(void)
 		if (get_task_crit_level(current) < CRIT_LEVEL_C) {
 			env->time_zero = tsk_rt(current)->sporadic_release_time;
 		} else {
-			raw_spin_lock(&_global_env.lock);
 			_global_env.env.time_zero = tsk_rt(current)->sporadic_release_time;
 		}
 		
@@ -826,10 +906,9 @@ static long mc2_complete_job(void)
 		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
 		
 		TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
-		if (get_task_crit_level(current) == CRIT_LEVEL_C) {
-			raw_spin_unlock(&_global_env.lock);
-		}
+		
 		raw_spin_unlock(&state->lock);
+		raw_spin_unlock(&_global_env.lock);
 		
 		local_irq_restore(flags);
 	}
@@ -908,10 +987,10 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 	} else if (lv == CRIT_LEVEL_C) {
-		local_irq_save(flags);
+		raw_spin_lock_irqsave(&_global_env.lock, flags);
 		state = local_cpu_state();
+		
 		raw_spin_lock(&state->lock);
-		raw_spin_lock(&_global_env.lock);
 		
 		res = gmp_find_by_id(&_global_env, mp->res_id);
 
@@ -931,9 +1010,8 @@ static long mc2_admit_task(struct task_struct *tsk)
 			tsk_rt(tsk)->task_params.budget_policy = NO_ENFORCEMENT;
 		}
 
-		raw_spin_unlock(&_global_env.lock);
 		raw_spin_unlock(&state->lock);
-		local_irq_restore(flags);
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
 	}
 	
 	preempt_enable();
@@ -965,8 +1043,9 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		state = cpu_state_for(tinfo->cpu);
 	
 	/* acquire the lock protecting the state and disable interrupts */
-	raw_spin_lock_irqsave(&state->lock, flags);
-
+	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&state->lock);
+	
 	if (is_running) {
 		state->scheduled = tsk;
 		/* make sure this task should actually be running */
@@ -976,18 +1055,16 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	if (on_runqueue || is_running) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
-		raw_spin_lock(&_global_env.lock);
 		mc2_update_time(lv, state, litmus_clock());
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE("mc2_new()\n");
-		raw_spin_unlock(&_global_env.lock);
 		
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
-		raw_spin_unlock_irqrestore(&state->lock, flags);
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
 
 	res = res_find_by_id(state, tinfo->mc2_param.res_id);
 	
@@ -1015,9 +1092,9 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		/* if the reservation is global reservation */
 		local_irq_save(flags);
 		state = local_cpu_state();
-		raw_spin_lock(&state->lock);
 		raw_spin_lock(&_global_env.lock);
-	
+		raw_spin_lock(&state->lock);
+		
 		list_for_each_entry_safe(res, next, &_global_env.depleted_reservations, list) {
 			if (res->id == reservation_id) {
 				TRACE("DESTROY RES FOUND!!!\n");
@@ -1050,8 +1127,8 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 			}
 		}
 
-		raw_spin_unlock(&_global_env.lock);
 		raw_spin_unlock(&state->lock);
+		raw_spin_unlock(&_global_env.lock);
 		local_irq_restore(flags);
 	} else {
 		/* if the reservation is partitioned reservation */
@@ -1125,9 +1202,10 @@ static void mc2_task_exit(struct task_struct *tsk)
 		state = cpu_state_for(tinfo->cpu);
 	else
 		state = local_cpu_state();
+		
+	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&state->lock);
 	
-	raw_spin_lock_irqsave(&state->lock, flags);
-
 	if (state->scheduled == tsk)
 		state->scheduled = NULL;
 
@@ -1140,20 +1218,18 @@ static void mc2_task_exit(struct task_struct *tsk)
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
 		
-		raw_spin_lock(&_global_env.lock);
 		/* update both global and partitioned */
 		mc2_update_time(lv, state, litmus_clock());
-		
 		mc2_update_ghost_state(state);
 		task_departs(tsk, 0);
 		
 		/* NOTE: drops state->lock */
 		TRACE("mc2_exit()\n");
-		raw_spin_unlock(&_global_env.lock);
+
 		mc2_update_timer_and_unlock(state);
 		local_irq_restore(flags);
 	} else
-		raw_spin_unlock_irqrestore(&state->lock, flags);
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
 
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
@@ -1204,8 +1280,10 @@ static long create_polling_reservation(
 		return -ENOMEM;
 
 	if (config->cpu != -1) {
+		
+		raw_spin_lock_irqsave(&_global_env.lock, flags);
 		state = cpu_state_for(config->cpu);
-		raw_spin_lock_irqsave(&state->lock, flags);
+		raw_spin_lock(&state->lock);
 
 		res = sup_find_by_id(&state->sup_env, config->id);
 		if (!res) {
@@ -1225,7 +1303,9 @@ static long create_polling_reservation(
 			err = -EEXIST;
 		}
 
-		raw_spin_unlock_irqrestore(&state->lock, flags);
+		raw_spin_unlock(&state->lock);
+		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+
 	} else {
 		raw_spin_lock_irqsave(&_global_env.lock, flags);
 		
@@ -1467,8 +1547,8 @@ static long mc2_activate_plugin(void)
 		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 		state->timer.function = on_scheduling_timer;
 		
-		hrtimer_init(&state->g_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-		state->g_timer.function = on_global_scheduling_timer;
+//		hrtimer_init(&state->g_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+//		state->g_timer.function = on_global_scheduling_timer;
 	}
 
 	mc2_setup_domain_proc();
@@ -1481,6 +1561,7 @@ static void mc2_finish_switch(struct task_struct *prev)
 	struct mc2_cpu_state *state = local_cpu_state();
 	
 	state->scheduled = is_realtime(current) ? current : NULL;
+	TRACE_TASK(prev, "FINISH CXS! complete=%d\n", tsk_rt(prev)->completed);
 }
 
 static long mc2_deactivate_plugin(void)
@@ -1490,12 +1571,14 @@ static long mc2_deactivate_plugin(void)
 	struct reservation *res;
 	struct next_timer_event *event;
 
+	raw_spin_lock(&_global_env.lock);
+
 	for_each_online_cpu(cpu) {
 		state = cpu_state_for(cpu);
 		raw_spin_lock(&state->lock);
 
 		hrtimer_cancel(&state->timer);
-		hrtimer_cancel(&state->g_timer);
+//		hrtimer_cancel(&state->g_timer);
 
 		/* Delete all reservations --- assumes struct reservation
 		 * is prefix of containing struct. */
@@ -1527,8 +1610,7 @@ static long mc2_deactivate_plugin(void)
 		raw_spin_unlock(&state->lock);
 	}
 
-	raw_spin_lock(&_global_env.lock);
-
+	
 	while (!list_empty(&_global_env.active_reservations)) {
 		TRACE("RES FOUND!!!\n");
 		res = list_first_entry(
-- 
1.8.1.2


From d9f5d5edbda26349cf6bf4e7d371d6e91660fe0f Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Mon, 2 Mar 2015 15:57:54 -0500
Subject: [PATCH 092/119] Working version

---
 kernel/sched/litmus.c         |   2 +-
 litmus/polling_reservations.c |  28 ++--
 litmus/reservation.c          |  98 ++++--------
 litmus/sched_mc2.c            | 337 ++++++++++++++++++------------------------
 4 files changed, 187 insertions(+), 278 deletions(-)

diff --git a/kernel/sched/litmus.c b/kernel/sched/litmus.c
index b84361f..29cd69d 100644
--- a/kernel/sched/litmus.c
+++ b/kernel/sched/litmus.c
@@ -73,7 +73,7 @@ litmus_schedule(struct rq *rq, struct task_struct *prev)
 			if (next->rt_param.stack_in_use == NO_CPU)
 				TRACE_TASK(next,"descheduled. Proceeding.\n");
 
-			if (lt_before(_maybe_deadlock + 1000000000L,
+			if (lt_before(_maybe_deadlock + 5000000000L,
 				      litmus_clock())) {
 				/* We've been spinning for 1s.
 				 * Something can't be right!
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index a3125eb..df1aeb0 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -383,11 +383,7 @@ static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
 {
 	lt_t now = tdres->res.env->current_time;
 	lt_t end = tdres->cur_interval.end;
-	TRACE("td_remaining(%u): start=%llu now=%llu end=%llu state=%d\n",
-		tdres->res.id,
-		tdres->cur_interval.start,
-		now, end,
-		tdres->res.state);
+	//TRACE("td_remaining(%u): start=%llu now=%llu end=%llu state=%d\n", tdres->res.id,	tdres->cur_interval.start, now, end, tdres->res.state);
 	if (now >=  end)
 		return 0;
 	else
@@ -400,24 +396,22 @@ static void td_replenish(
 	struct table_driven_reservation *tdres =
 		container_of(res, struct table_driven_reservation, res);
 
-	TRACE("td_replenish(%u): expected_replenishment=%llu\n", res->id,
-		res->next_replenishment);
+	//TRACE("td_replenish(%u): expected_replenishment=%llu\n", res->id, res->next_replenishment);
 
 	/* figure out current interval */
 	tdres->cur_interval.start = tdres->major_cycle_start +
 		tdres->intervals[tdres->next_interval].start;
 	tdres->cur_interval.end =  tdres->major_cycle_start +
 		tdres->intervals[tdres->next_interval].end;
-	TRACE("major_cycle_start=%llu => [%llu, %llu]\n",
+/*	TRACE("major_cycle_start=%llu => [%llu, %llu]\n",
 		tdres->major_cycle_start,
 		tdres->cur_interval.start,
 		tdres->cur_interval.end);
-
+*/
 	/* reset budget */
 	res->cur_budget = td_time_remaining_until_end(tdres);
 	res->budget_consumed = 0;
-	TRACE("td_replenish(%u): %s budget=%llu\n", res->id,
-		res->cur_budget ? "" : "WARNING", res->cur_budget);
+	//TRACE("td_replenish(%u): %s budget=%llu\n", res->id, res->cur_budget ? "" : "WARNING", res->cur_budget);
 
 	/* prepare next slot */
 	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
@@ -428,8 +422,7 @@ static void td_replenish(
 	/* determine next time this reservation becomes eligible to execute */
 	res->next_replenishment  = tdres->major_cycle_start;
 	res->next_replenishment += tdres->intervals[tdres->next_interval].start;
-	TRACE("td_replenish(%u): next_replenishment=%llu\n", res->id,
-		res->next_replenishment);
+	//TRACE("td_replenish(%u): next_replenishment=%llu\n", res->id, res->next_replenishment);
 
 
 	switch (res->state) {
@@ -465,7 +458,7 @@ static void td_drain_budget(
 	 * how much time is left in this allocation interval. */
 
 	/* sanity check: we should never try to drain from future slots */
-	TRACE("TD_DRAIN STATE(%d) [%llu,%llu]  %llu ?\n", res->state, tdres->cur_interval.start, tdres->cur_interval.end, res->env->current_time);
+	//TRACE("TD_DRAIN STATE(%d) [%llu,%llu]  %llu ?\n", res->state, tdres->cur_interval.start, tdres->cur_interval.end, res->env->current_time);
 	//BUG_ON(tdres->cur_interval.start > res->env->current_time);
 	if (tdres->cur_interval.start > res->env->current_time)
 		TRACE("TD_DRAIN BUG!!!!!!!!!!\n");
@@ -480,8 +473,7 @@ static void td_drain_budget(
 		case RESERVATION_ACTIVE_IDLE:
 		case RESERVATION_ACTIVE:
 			res->cur_budget = td_time_remaining_until_end(tdres);
-			TRACE("td_drain_budget(%u): drained to budget=%llu\n",
-				res->id, res->cur_budget);
+			//TRACE("td_drain_budget(%u): drained to budget=%llu\n", res->id, res->cur_budget);
 			if (!res->cur_budget) {
 				res->env->change_state(res->env, res,
 					RESERVATION_DEPLETED);
@@ -489,6 +481,10 @@ static void td_drain_budget(
 				/* sanity check budget calculation */
 				//BUG_ON(res->env->current_time >= tdres->cur_interval.end);
 				//BUG_ON(res->env->current_time < tdres->cur_interval.start);
+				if (res->env->current_time >= tdres->cur_interval.end)
+					printk(KERN_ALERT "TD_DRAIN_BUDGET WARNING1\n");
+				if (res->env->current_time < tdres->cur_interval.start)
+					printk(KERN_ALERT "TD_DRAIN_BUDGET WARNING2\n");
 			}
 
 			break;
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 3ec18a2..86d2f6e 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -4,6 +4,9 @@
 #include <litmus/litmus.h>
 #include <litmus/reservation.h>
 
+#define TRACE(fmt, args...) do {} while (false)
+#define TRACE_TASK(fmt, args...) do {} while (false)
+
 void reservation_init(struct reservation *res)
 {
 	memset(res, sizeof(*res), 0);
@@ -376,43 +379,6 @@ struct next_timer_event* gmp_find_event_by_time(struct gmp_reservation_environme
 	return NULL;
 }
 
-/*
-static void gmp_scheduler_update_at(
-	struct gmp_reservation_environment* gmp_env, unsigned int id,
-	event_type_t type, lt_t when)
-{
-	struct next_timer_event *nevent, *queued;
-	struct list_head *pos;
-	int found = 0;
-	
-	nevent = gmp_find_event_by_id(gmp_env, id);
-	
-	if (!nevent) {
-		nevent = kzalloc(sizeof(*nevent), GFP_KERNEL);
-		nevent->next_update = when;
-		nevent->id = id;
-		nevent->timer_armed_on = NO_CPU;
-		nevent->type = type;
-		
-		list_for_each(pos, &gmp_env->next_events) {
-			queued = list_entry(pos, struct next_timer_event, list);
-			if (queued->next_update > nevent->next_update) {
-				list_add(&nevent->list, pos->prev);
-				found = 1;
-				TRACE("NEXT_EVENT ADDED after %llu\n", queued->next_update);
-				break;
-			}
-		}
-		
-		if (!found) {
-			list_add_tail(&nevent->list, &gmp_env->next_events);
-			TRACE("NEXT_EVENT ADDED at [0]\n");
-		}
-	} else {
-		TRACE("EVENT FOUND at %llu T(%d), NEW EVENT %llu T(%d)\n", nevent->next_update, nevent->type, when, type);
-	}
-}
-*/
 #define TIMER_RESOLUTION 100000L
 
 static void gmp_add_event(
@@ -425,11 +391,12 @@ static void gmp_add_event(
 
 	//when = div64_u64(when, TIMER_RESOLUTION);
 	//when *= TIMER_RESOLUTION;
-	
+//printk(KERN_ALERT "GMP_ADD id=%d type=%d when=%llu\n", id, type, when);
 	nevent = gmp_find_event_by_id(gmp_env, id);
 	
 	if (!nevent || nevent->type != type) {
 		nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
+		BUG_ON(!nevent);
 		nevent->next_update = when;
 		nevent->id = id;
 		nevent->type = type;
@@ -450,17 +417,19 @@ static void gmp_add_event(
 			TRACE("NEXT_EVENT id=%d type=%d update=%llu ADDED at TAIL\n", nevent->id, nevent->type, nevent->next_update);
 		}
 	} else {
-		TRACE("EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
+		//TRACE("EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
+; //printk(KERN_ALERT "EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
 	}
 	
-	TRACE("======START PRINTING EVENT LIST======\n");
-	gmp_print_events(gmp_env, litmus_clock());
-	TRACE("======FINISH PRINTING EVENT LIST======\n");
+	//TRACE("======START PRINTING EVENT LIST======\n");
+	//gmp_print_events(gmp_env, litmus_clock());
+	//TRACE("======FINISH PRINTING EVENT LIST======\n");
 }
 
 void gmp_add_event_after(
 	struct gmp_reservation_environment* gmp_env, lt_t timeout, unsigned int id, event_type_t type)
 {
+	//printk(KERN_ALERT "ADD_EVENT_AFTER id = %d\n", id);
 	gmp_add_event(gmp_env, gmp_env->env.current_time + timeout, id, type);
 }
 
@@ -472,19 +441,24 @@ static void gmp_queue_depleted(
 	struct reservation *queued;
 	int found = 0;
 
+//printk(KERN_ALERT "R%d request to enqueue depleted_list\n", res->id);
+	
 	list_for_each(pos, &gmp_env->depleted_reservations) {
 		queued = list_entry(pos, struct reservation, list);
-		if (queued && queued->next_replenishment > res->next_replenishment) {
+		if (queued && (queued->next_replenishment > res->next_replenishment)) {
+//printk(KERN_ALERT "QUEUED R%d %llu\n", queued->id, queued->next_replenishment);
 			list_add(&res->list, pos->prev);
 			found = 1;
+			break;
 		}
 	}
 
 	if (!found)
 		list_add_tail(&res->list, &gmp_env->depleted_reservations);
+
 	TRACE("R%d queued to depleted_list\n", res->id);
+//printk(KERN_ALERT "R%d queued to depleted_list\n", res->id);
 	gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
-	res->event_added = 1;
 }
 
 static void gmp_queue_active(
@@ -520,10 +494,8 @@ static void gmp_queue_reservation(
 	struct gmp_reservation_environment* gmp_env,
 	struct reservation *res)
 {
-	if (res == NULL) {
-		BUG();
-		return;
-	}
+
+//printk(KERN_ALERT "DEBUG: Passed %s %d %p R%d STATE %d\n",__FUNCTION__,__LINE__, gmp_env, res->id, res->state);
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 			list_add(&res->list, &gmp_env->inactive_reservations);
@@ -584,8 +556,7 @@ static void gmp_charge_budget(
 		{
 			/* make sure scheduler is invoked when this reservation expires
 			 * its remaining budget */
-			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n",
-				res->id, res->cur_budget);
+			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n", res->id, res->cur_budget);
 			 gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
 			 res->event_added = 1;
 		}
@@ -593,7 +564,7 @@ static void gmp_charge_budget(
 			/* stop at the first ACTIVE reservation */
 		//	break;
 	}
-	//TRACE("finished charging budgets\n");
+	TRACE("finished charging budgets\n");
 }
 
 static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
@@ -611,24 +582,6 @@ static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 		}
 	}
 	TRACE("finished replenishing budgets\n");
-
-	/* request a scheduler update at the next replenishment instant */
-	list_for_each_safe(pos, next, &gmp_env->depleted_reservations) {
-		res = list_entry(pos, struct reservation, list);
-		if (res->event_added == 0) {
-			gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
-			res->event_added = 1;
-		}
-	}	
-	
-/*
-	res = list_first_entry_or_null(&gmp_env->depleted_reservations,
-		struct reservation, list);
-	if (res && res->event_added == 0) {
-		gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
-		res->event_added = 1;
-	}
-*/
 }
 
 #define EPSILON	50
@@ -644,7 +597,7 @@ int gmp_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	TRACE("(gmp_update_time) now: %llu, current_time: %llu\n", now, gmp_env->env.current_time);
+	//TRACE("(gmp_update_time) now: %llu, current_time: %llu\n", now, gmp_env->env.current_time);
 	if (unlikely(now <= gmp_env->env.current_time + EPSILON))
 		return 0;
 
@@ -660,12 +613,15 @@ int gmp_update_time(
 	/* check if any budgets where replenished */
 	//TRACE("REPLENISH###\n");
 	gmp_replenish_budgets(gmp_env);
+
 	
 	list_for_each_entry_safe(event, next, &gmp_env->next_events, list) {
 		if (event->next_update < now) {
 			list_del(&event->list);
 			TRACE("EVENT at %llu IS DELETED\n", event->next_update);
 			kfree(event);
+		} else {
+			break;
 		}
 	}		
 	
@@ -673,7 +629,7 @@ int gmp_update_time(
 	
 	ret = min(gmp_env->schedule_now, NR_CPUS);
 	gmp_env->schedule_now = 0;
-	
+
 	return ret;
 }
 
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index b3390dc..f7758f2 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -62,7 +62,6 @@ struct mc2_task_state {
 struct crit_entry {
 	enum crit_level level;
 	struct task_struct *running;
-	//struct hrtimer ghost_timer;
 };
 
 /* mc2_cpu_state - maintain the scheduled state and ghost jobs
@@ -74,7 +73,6 @@ struct mc2_cpu_state {
 
 	struct sup_reservation_environment sup_env;
 	struct hrtimer timer;
-	//struct hrtimer g_timer;
 
 	int cpu;
 	struct task_struct* scheduled;
@@ -221,7 +219,9 @@ static int get_lowest_prio_cpu(lt_t priority)
 		raw_spin_unlock(&_lowest_prio_cpu.lock);
 		TRACE("CPU %d (local) is the lowest!\n", ce->cpu);
 		return ce->cpu;
-	}	
+	} else {
+		TRACE("Local CPU will_schedule=%d, scheduled=(%s/%d)\n", ce->will_schedule, ce->scheduled ? (ce->scheduled)->comm : "null", ce->scheduled ? (ce->scheduled)->pid : 0);
+	}
 
 	for_each_online_cpu(cpu) {
 		ce = &_lowest_prio_cpu.cpu_entries[cpu];
@@ -287,6 +287,10 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
 		/* If the event time is already passed, we call schedule() on
 		   the lowest priority cpu */
+		if (event->next_update >= update) {
+			break;
+		}
+		
 		if (event->next_update < litmus_clock()) {
 			if (event->timer_armed_on == NO_CPU) {
 				struct reservation *res = gmp_find_by_id(&_global_env, event->id);
@@ -351,70 +355,33 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 			litmus_reschedule(state->cpu);
 		}
 	}
-
-#if 0	
-	raw_spin_lock(&_global_env.lock);
-	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
-		if (event->timer_armed_on == NO_CPU) {
-			/* If the event time is already passed, we call schedule() on
-			   the lowest priority cpu */
-			if (event->next_update < litmus_clock()) {
-				int cpu = get_lowest_prio_cpu();
-				TRACE("GLOBAL EVENT PASSED!! poking CPU %d to reschedule\n", cpu);
-				list_del(&event->list);
-				kfree(event);
-				if (cpu != NO_CPU) {
-					raw_spin_lock(&_lowest_prio_cpu.lock);
-					_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
-					raw_spin_unlock(&_lowest_prio_cpu.lock);
-					litmus_reschedule(cpu);
-				}
-			} else if (!hrtimer_active(&state->g_timer)) {
-				int ret;
-			
-				raw_spin_unlock(&_global_env.lock);
-				found_event = 1;
-			
-				TRACE("setting global scheduler timer for %llu\n", 
-				       event->next_update);
-				ret = __hrtimer_start_range_ns(&state->g_timer,
-						ns_to_ktime(event->next_update),
-						0 /* timer coalescing slack */,
-						HRTIMER_MODE_ABS_PINNED,
-						0 /* wakeup */);
-				if (!ret) {
-					event->timer_armed_on = state->cpu;
-					break;
-				}
-			}				
-		}
-	}
-	if (found_event == 0)
-		raw_spin_unlock(&_global_env.lock);
-#endif	
 }
 
 /* mc2_update_ghost_state - Update crit_entries[] to track ghost jobs
  *                          If the budget of a ghost is exhausted,
  *                          clear is_ghost and reschedule
  */
-static void mc2_update_ghost_state(struct mc2_cpu_state *state)
+static lt_t mc2_update_ghost_state(struct mc2_cpu_state *state)
 {
 	int lv = 0;
 	struct crit_entry* ce;
 	struct reservation *res;
 	struct mc2_task_state *tinfo;
-
+	lt_t ret = ULLONG_MAX;
+	
 	BUG_ON(!state);
 	
 	for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
 		ce = &state->crit_entries[lv];
 		if (ce->running != NULL) {
+//printk(KERN_ALERT "P%d ce->running : %s/%d\n", state->cpu,  ce->running ? (ce->running)->comm : "null", ce->running ? (ce->running)->pid : 0);
 			tinfo = get_mc2_state(ce->running);
 			if (!tinfo)
 				continue;
 			
 			res = res_find_by_id(state, tinfo->mc2_param.res_id);
+			BUG_ON(!res);
+//printk(KERN_ALERT "R%d found!\n", res->id);			
 			TRACE("LV %d running id %d budget %llu\n", 
 			       lv, tinfo->mc2_param.res_id, res->cur_budget);
 			/* If the budget is exhausted, clear is_ghost and reschedule */
@@ -432,7 +399,7 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 						  struct reservation, list);
 					if (res)
 						litmus_reschedule_local();
-				} else {
+				} else if (lv == CRIT_LEVEL_C) {
 					res = list_first_entry_or_null(
 					      &_global_env.active_reservations,
 						  struct reservation, list);
@@ -440,11 +407,16 @@ static void mc2_update_ghost_state(struct mc2_cpu_state *state)
 						litmus_reschedule(state->cpu);
 				}
 			} else {
-				TRACE("GHOST NOT FINISH id %d budget %llu\n", res->id, res->cur_budget);
-				gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+				//TRACE("GHOST NOT FINISH id %d budget %llu\n", res->id, res->cur_budget);
+				//gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+				if (ret > res->cur_budget) {
+					ret = res->cur_budget;
+				}
 			}
 		}
 	}
+	
+	return ret;
 }			
 
 /* update_cpu_prio - Update cpu's priority
@@ -474,84 +446,6 @@ static void update_cpu_prio(struct mc2_cpu_state *state)
 	}
 };
 
-#if 0
-/* on_global_scheduling_timer - Process the budget accounting (replenish
- *                              and charge)
- */								
-static enum hrtimer_restart on_global_scheduling_timer(struct hrtimer *timer)
-{
-	unsigned long flags;
-	enum hrtimer_restart restart = HRTIMER_NORESTART;
-	struct mc2_cpu_state *state;
-	struct next_timer_event *event, *next;
-	int schedule_now;
-	lt_t update, now;
-	int found_event = 0;
-
-	state = container_of(timer, struct mc2_cpu_state, g_timer);
-
-	raw_spin_lock_irqsave(&state->lock, flags);
-	
-	/* The scheduling timer should only fire on the local CPU, because
-	 * otherwise deadlocks via timer_cancel() are possible.
-	 * Note: this does not interfere with dedicated interrupt handling, as
-	 * even under dedicated interrupt handling scheduling timers for
-	 * budget enforcement must occur locally on each CPU.
-	 */
-	//BUG_ON(state->cpu != raw_smp_processor_id());
-	if (state->cpu != raw_smp_processor_id())
-		TRACE("BUG!!!!!!!!!!!!! TIMER FIRED ON THE OTHER CPU\n");
-
-	raw_spin_lock(&_global_env.lock);
-	
-	update = litmus_clock();
-	TRACE("GLOBAL TIMER FIRED at %llu\n", update);
-	
-	/* The event can be processed by the other cpus. So, if there is no 
-	   events to process, we do nothing */
-	list_for_each_entry_safe(event, next, &_global_env.next_events, list) {
-		if (event->next_update < update) {
-			found_event = 1;
-			list_del(&event->list);
-			TRACE("EVENT at %llu IS DELETED\n", event->next_update);
-			kfree(event);
-		}
-	}			
-	
-	if (!found_event) {
-		goto unlock;
-	}
-	
-	/* gmp_update_timer returns how many tasks become ACTIVE */
-	schedule_now = gmp_update_time(&_global_env, update);
-	
-	mc2_update_ghost_state(state);
-	
-	now = _global_env.env.current_time;
-	
-	TRACE_CUR("on_global_scheduling_timer at %llu, upd:%llu (for cpu=%d) SCHEDULE_NOW = %d\n",
-		now, update, state->cpu, schedule_now);
-
-	/* Find the lowest cpu, and call reschedule */
-	while (schedule_now--) {
-		int cpu = get_lowest_prio_cpu();
-		if (cpu != NO_CPU) {
-			raw_spin_lock(&_lowest_prio_cpu.lock);
-			_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
-			raw_spin_unlock(&_lowest_prio_cpu.lock);
-			TRACE("LOWEST CPU = P%d\n", cpu);
-			litmus_reschedule(cpu);
-		}
-	} 
-
-unlock:
-	raw_spin_unlock(&_global_env.lock);
-	raw_spin_unlock_irqrestore(&state->lock, flags);
-	
-	return restart;
-}
-#endif
-
 /* on_scheduling_timer - timer event for partitioned tasks
  */                       
 static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
@@ -561,7 +455,8 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	struct mc2_cpu_state *state;
 	lt_t update, now;
 	int global_schedule_now;
-
+	lt_t remain_budget;
+	
 	state = container_of(timer, struct mc2_cpu_state, timer);
 
 	/* The scheduling timer should only fire on the local CPU, because
@@ -575,18 +470,22 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	TRACE("TIMER FIRED at %llu\n", litmus_clock());
 	raw_spin_lock_irqsave(&_global_env.lock, flags);
 	raw_spin_lock(&state->lock);
-	
-	sup_update_time(&state->sup_env, litmus_clock());
-	global_schedule_now = gmp_update_time(&_global_env, litmus_clock());
-	
-	mc2_update_ghost_state(state);
+//printk(KERN_ALERT "P%d on_scheduling_timer() hold lock %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);			
+	now = litmus_clock();
+	sup_update_time(&state->sup_env, now);
+	global_schedule_now = gmp_update_time(&_global_env, now);
+//printk(KERN_ALERT "P%d update_time in timer() %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);			
+	remain_budget = mc2_update_ghost_state(state);
 	
 	update = state->sup_env.next_scheduler_update;
 	now = state->sup_env.env.current_time;
 
-	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n",
-		now, update, state->cpu, global_schedule_now);
-
+	if (remain_budget != ULLONG_MAX && update > now + remain_budget) {
+		update = now + remain_budget;
+	}
+	
+	//TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
+//printk(KERN_ALERT "on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
 	if (update <= now) {
 		litmus_reschedule_local();
 	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
@@ -594,6 +493,8 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 		restart = HRTIMER_RESTART;
 	}
 
+	BUG_ON(global_schedule_now < 0 || global_schedule_now > 4);
+	
 	/* Find the lowest cpu, and call reschedule */
 	while (global_schedule_now--) {
 		int cpu = get_lowest_prio_cpu(0);
@@ -601,14 +502,14 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 			raw_spin_lock(&_lowest_prio_cpu.lock);
 			_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
 			raw_spin_unlock(&_lowest_prio_cpu.lock);
-			TRACE("LOWEST CPU = P%d\n", cpu);
+			//TRACE("LOWEST CPU = P%d\n", cpu);
 			litmus_reschedule(cpu);
 		}
 	} 
 	
 	raw_spin_unlock(&state->lock);
 	raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-
+//printk(KERN_ALERT "P%d on_scheduling_timer() release lock %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);	
 	return restart;
 }
 
@@ -651,6 +552,13 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	}
 	
 	/* no eligible level A or B tasks exists */
+	/* check the ghost job */
+	ce = &state->crit_entries[CRIT_LEVEL_C];
+	if (ce->running) {
+		TRACE_TASK(ce->running," is GHOST\n");
+		return NULL;
+	}
+	
 	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
 	
 	TRACE("****** ACTIVE LIST ******\n");
@@ -663,21 +571,24 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 				lv = get_task_crit_level(tsk);
 				if (lv == NUM_CRIT_LEVELS) {
 					gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
-					res->event_added = 1;
+					//res->event_added = 1;
 					return tsk;
-				} else {
-					ce = &state->crit_entries[lv];
-					if (likely(!ce->running)) {
+				} else if (lv == CRIT_LEVEL_C) {
+					//ce = &state->crit_entries[lv];
+					//if (likely(!ce->running)) {
 						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 						res->event_added = 1;
 						res->blocked_by_ghost = 0;
 						res->is_ghost = 0;
 						res->scheduled_on = state->cpu;
 						return tsk;
-					} else {
-						res->blocked_by_ghost = 1;
-						TRACE_TASK(ce->running, " is GHOST\n");
-					}
+					//} else {
+					//	res->blocked_by_ghost = 1;
+					//	TRACE_TASK(ce->running, " is GHOST\n");
+					//	return NULL;
+					//}
+				} else {
+					BUG();
 				}
 			}
 		}
@@ -711,8 +622,9 @@ static void post_schedule(struct task_struct *next)
 static struct task_struct* mc2_schedule(struct task_struct * prev)
 {
 	/* next == NULL means "schedule background work". */
+	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
-	
+
 	pre_schedule(prev);
 	
 	raw_spin_lock(&_global_env.lock);
@@ -721,17 +633,18 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	//BUG_ON(state->scheduled && state->scheduled != prev);
 	//BUG_ON(state->scheduled && !is_realtime(prev));
 	if (state->scheduled && state->scheduled != prev)
-		TRACE("BUG1!!!!!!!!\n");
+		printk(KERN_ALERT "BUG1!!!!!!!! %s %s\n", state->scheduled ? (state->scheduled)->comm : "null", prev ? (prev)->comm : "null");
 	if (state->scheduled && !is_realtime(prev))
-		TRACE("BUG2!!!!!!!!\n");
+		printk(KERN_ALERT "BUG2!!!!!!!! \n");
 
 	/* update time */
 	state->sup_env.will_schedule = true;
 
-	sup_update_time(&state->sup_env, litmus_clock());
-	gmp_update_time(&_global_env, litmus_clock());
-	
-	mc2_update_ghost_state(state);
+	now = litmus_clock();
+	sup_update_time(&state->sup_env, now);
+	gmp_update_time(&_global_env, now);
+		
+	mc2_update_ghost_state(state);	
 	
 	/* remove task from reservation if it blocks */
 	if (is_realtime(prev) && !is_running(prev))
@@ -767,14 +680,17 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 		TRACE_TASK(prev, "descheduled.\n");
 		/* if prev is preempted and a global task, find the lowest cpu and reschedule */
 		if (tinfo->has_departed == false && get_task_crit_level(prev) == CRIT_LEVEL_C) {
-			int cpu = get_lowest_prio_cpu(res?res->priority:0);
-			TRACE("LEVEL-C TASK PREEMPTED!! poking CPU %d to reschedule\n", cpu);
+			int cpu;
+			raw_spin_lock(&_global_env.lock);
+			cpu = get_lowest_prio_cpu(res?res->priority:0);
+			//TRACE("LEVEL-C TASK PREEMPTED!! poking CPU %d to reschedule\n", cpu);
 			if (cpu != NO_CPU) {
 				raw_spin_lock(&_lowest_prio_cpu.lock);
 				_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
 				raw_spin_unlock(&_lowest_prio_cpu.lock);
 				litmus_reschedule(cpu);
 			}
+			raw_spin_unlock(&_global_env.lock);
 		}
 	}
 	if (state->scheduled) {
@@ -814,12 +730,14 @@ static void mc2_task_resume(struct task_struct  *tsk)
 
 	TRACE_TASK(tsk, "thread wakes up at %llu\n", litmus_clock());
 
+	local_irq_save(flags);
 	if (tinfo->cpu != -1)
 		state = cpu_state_for(tinfo->cpu);
 	else
 		state = local_cpu_state();
 
-	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&_global_env.lock);
+//printk(KERN_ALERT "P%d resume() hold lock\n", state->cpu);	
 	/* Requeue only if self-suspension was already processed. */
 	if (tinfo->has_departed)
 	{
@@ -830,22 +748,25 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		if (tinfo->cpu != -1) {
 			sup_update_time(&state->sup_env, litmus_clock());
 		} else {
-			TRACE("RESUME UPDATE ####\n");
+			//TRACE("RESUME UPDATE ####\n");
 			gmp_update_time(&_global_env, litmus_clock());
-			TRACE("RESUME UPDATE $$$$\n");
+			//TRACE("RESUME UPDATE $$$$\n");
 		}
 			
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
 		TRACE_TASK(tsk, "mc2_resume()\n");
-		mc2_update_timer_and_unlock(state);
-		local_irq_restore(flags);
+		mc2_update_timer_and_unlock(state);	
+//printk(KERN_ALERT "P%d resume() dropped lock\n", state->cpu);			
 	} else {
 		TRACE_TASK(tsk, "resume event ignored, still scheduled\n");
-		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+		raw_spin_unlock(&_global_env.lock);
+//printk(KERN_ALERT "P%d resume() release lock\n", state->cpu);			
 	}
 
+	local_irq_restore(flags);
+	
 	resume_legacy_task_model_updates(tsk);
 }
 
@@ -870,6 +791,7 @@ static long mc2_complete_job(void)
 		struct reservation *res;
 		unsigned long flags;
 
+		preempt_disable();
 		local_irq_save(flags);
 		
 		tinfo = get_mc2_state(current);
@@ -881,7 +803,7 @@ static long mc2_complete_job(void)
 		
 		raw_spin_lock(&_global_env.lock);
 		raw_spin_lock(&state->lock);
-
+//printk(KERN_ALERT "P%d complete() hold lock\n", state->cpu);
 		env = &(state->sup_env.env);
 		
 		res = res_find_by_id(state, tinfo->mc2_param.res_id);
@@ -905,12 +827,13 @@ static long mc2_complete_job(void)
 		res->cur_budget = 0;
 		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
 		
-		TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
+		//TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
 		
 		raw_spin_unlock(&state->lock);
 		raw_spin_unlock(&_global_env.lock);
-		
+//printk(KERN_ALERT "P%d complete() release lock\n", state->cpu);				
 		local_irq_restore(flags);
+		preempt_enable();
 	}
 	sched_trace_task_completion(current, 0);
 	
@@ -937,6 +860,7 @@ static long mc2_complete_job(void)
 	}
 
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
+
 	return err;
 }
 
@@ -988,6 +912,7 @@ static long mc2_admit_task(struct task_struct *tsk)
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 	} else if (lv == CRIT_LEVEL_C) {
 		raw_spin_lock_irqsave(&_global_env.lock, flags);
+//printk(KERN_ALERT "admit() hold lock\n");		
 		state = local_cpu_state();
 		
 		raw_spin_lock(&state->lock);
@@ -1012,6 +937,7 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		raw_spin_unlock(&state->lock);
 		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+//printk(KERN_ALERT "admit() release lock\n");		
 	}
 	
 	preempt_enable();
@@ -1033,25 +959,30 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	struct mc2_cpu_state *state; // = cpu_state_for(tinfo->cpu);
 	struct reservation *res;
 	enum crit_level lv = get_task_crit_level(tsk);
+	lt_t release = 0;
 
 	TRACE_TASK(tsk, "new RT task %llu (on_rq:%d, running:%d)\n",
 		   litmus_clock(), on_runqueue, is_running);
 
+	local_irq_save(flags);
 	if (tinfo->cpu == -1)
 		state = local_cpu_state();
 	else 
 		state = cpu_state_for(tinfo->cpu);
 	
 	/* acquire the lock protecting the state and disable interrupts */
-	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
-	
+//printk(KERN_ALERT "new() hold lock R%d\n", tinfo->mc2_param.res_id);	
 	if (is_running) {
 		state->scheduled = tsk;
 		/* make sure this task should actually be running */
 		litmus_reschedule_local();
 	}
-
+	
+	res = res_find_by_id(state, tinfo->mc2_param.res_id);
+	release = res->next_replenishment;
+	
 	if (on_runqueue || is_running) {
 		/* Assumption: litmus_clock() is synchronized across cores
 		 * [see comment in pres_task_resume()] */
@@ -1062,18 +993,20 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		TRACE("mc2_new()\n");
 		
 		mc2_update_timer_and_unlock(state);
-		local_irq_restore(flags);
-	} else
-		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-
-	res = res_find_by_id(state, tinfo->mc2_param.res_id);
+//printk(KERN_ALERT "new() dropped lock R%d\n",tinfo->mc2_param.res_id);		
+	} else {
+		raw_spin_unlock(&state->lock);
+		raw_spin_unlock(&_global_env.lock);
+//printk(KERN_ALERT "new() release lock R%d\n",tinfo->mc2_param.res_id);		
+	}
+	local_irq_restore(flags);
 	
-	if (res) {
-		TRACE_TASK(tsk, "mc2_task_new() next_replenishment = %llu\n", res->next_replenishment);
-		release_at(tsk, res->next_replenishment);
+	if (!release) {
+		TRACE_TASK(tsk, "mc2_task_new() next_release = %llu\n", release);
+		release_at(tsk, release);
 	}
 	else
-		TRACE_TASK(tsk, "mc2_task_new() next_replenishment = NULL\n");
+		TRACE_TASK(tsk, "mc2_task_new() next_release = NULL\n");
 }
 
 /* mc2_reservation_destroy - reservation_destroy system call backend
@@ -1196,14 +1129,16 @@ static void mc2_task_exit(struct task_struct *tsk)
 	struct mc2_task_state* tinfo = get_mc2_state(tsk);
 	struct mc2_cpu_state *state;
 	enum crit_level lv = tinfo->mc2_param.crit;
-	struct crit_entry* ce;	
+	struct crit_entry* ce;
+	int cpu;
 
+	local_irq_save(flags);
 	if (tinfo->cpu != -1)
 		state = cpu_state_for(tinfo->cpu);
 	else
 		state = local_cpu_state();
 		
-	raw_spin_lock_irqsave(&_global_env.lock, flags);
+	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
 	
 	if (state->scheduled == tsk)
@@ -1226,11 +1161,30 @@ static void mc2_task_exit(struct task_struct *tsk)
 		/* NOTE: drops state->lock */
 		TRACE("mc2_exit()\n");
 
-		mc2_update_timer_and_unlock(state);
-		local_irq_restore(flags);
-	} else
-		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
+		mc2_update_timer_and_unlock(state);	
+	} else {
+		raw_spin_unlock(&state->lock);
+		raw_spin_unlock(&_global_env.lock);
+	}
 
+	for_each_online_cpu(cpu) {
+		state = cpu_state_for(cpu);
+		if (state == local_cpu_state())
+			continue;
+		raw_spin_lock(&state->lock);
+		
+		if (state->scheduled == tsk)
+			state->scheduled = NULL;
+		
+		ce = &state->crit_entries[lv];
+		if (ce->running == tsk)
+			ce->running = NULL;
+		
+		raw_spin_unlock(&state->lock);
+	}
+	
+	local_irq_restore(flags);
+	
 	kfree(tsk_rt(tsk)->plugin_state);
 	tsk_rt(tsk)->plugin_state = NULL;
 	kfree(tsk_rt(tsk)->mc2_data);
@@ -1539,16 +1493,11 @@ static long mc2_activate_plugin(void)
 			struct crit_entry *cr_entry = &state->crit_entries[lv];
 			cr_entry->level = lv;
 			cr_entry->running = NULL;
-			//hrtimer_init(&ce->ghost_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-			//ce->ghost_timer.function = on_ghost_timer;
 		}
 		sup_init(&state->sup_env);
 
 		hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 		state->timer.function = on_scheduling_timer;
-		
-//		hrtimer_init(&state->g_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
-//		state->g_timer.function = on_global_scheduling_timer;
 	}
 
 	mc2_setup_domain_proc();
@@ -1561,7 +1510,7 @@ static void mc2_finish_switch(struct task_struct *prev)
 	struct mc2_cpu_state *state = local_cpu_state();
 	
 	state->scheduled = is_realtime(current) ? current : NULL;
-	TRACE_TASK(prev, "FINISH CXS! complete=%d\n", tsk_rt(prev)->completed);
+	TRACE("FINISH CXS! from %s/%d to %s/%d\n", prev ? (prev)->comm : "null", prev ? (prev)->pid : 0, current ? (current)->comm : "null", current ? (current)->pid : 0);
 }
 
 static long mc2_deactivate_plugin(void)
@@ -1570,7 +1519,8 @@ static long mc2_deactivate_plugin(void)
 	struct mc2_cpu_state *state;
 	struct reservation *res;
 	struct next_timer_event *event;
-
+	struct cpu_entry *ce;
+	
 	raw_spin_lock(&_global_env.lock);
 
 	for_each_online_cpu(cpu) {
@@ -1578,7 +1528,14 @@ static long mc2_deactivate_plugin(void)
 		raw_spin_lock(&state->lock);
 
 		hrtimer_cancel(&state->timer);
-//		hrtimer_cancel(&state->g_timer);
+
+		ce = &_lowest_prio_cpu.cpu_entries[cpu];
+		
+		ce->cpu = cpu;
+		ce->scheduled = NULL;
+		ce->deadline = ULLONG_MAX;
+		ce->lv = NUM_CRIT_LEVELS;
+		ce->will_schedule = false;
 
 		/* Delete all reservations --- assumes struct reservation
 		 * is prefix of containing struct. */
-- 
1.8.1.2


From 6b091698a8c1575d96e6c4e3dd36252cfa7aabd1 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Mon, 2 Mar 2015 16:11:18 -0500
Subject: [PATCH 093/119] Merge chengyangfu branch to wip-mc2-new

---
 litmus/bank_proc.c | 210 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 123 insertions(+), 87 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 07d5728..295c450 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -1,3 +1,9 @@
+/*
+ * bank_proc.c -- Implementation of page coloring for cache and bank partitioning.
+ *                The file keeps a pool of colored pages. Users can request pages with
+ *                a specific color or bank number.
+ *                Part of the code is adapted from Jonathan Herman's code.
+ */
 #include <linux/init.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -14,16 +20,23 @@
 
 #define LITMUS_LOCKDEP_NAME_MAX_LEN 50
 
-// This is Address Decoding for imx6-sabredsd board
+// This Address Decoding is used in imx6-sabredsd platform
 #define CACHE_MASK 0x0000f000      
 #define BANK_MASK  0x00007000      
 #define OFFSET_SHIFT 12
 
 #define PAGES_PER_COLOR 1024
 
+unsigned long used_cachecolor;
+unsigned long curr_cachecolor;
+
+
 unsigned long number_banks;
 unsigned long number_cachecolors;
 
+/*
+ * Every page list should contain a lock, a list, and a number recording how many pages it stores.
+ */ 
 struct color_group {
 	spinlock_t lock;
 	char _lock_name[LITMUS_LOCKDEP_NAME_MAX_LEN];
@@ -31,6 +44,10 @@ struct color_group {
 	atomic_t nr_pages;
 };
 
+/*
+ * This is old code which is not used in current version
+ */ 
+/*
 static struct alloced_pages {
 	spinlock_t lock;
 	struct list_head list;
@@ -41,6 +58,7 @@ struct alloced_page {
 	struct vm_area_struct *vma;
 	struct list_head list;
 };
+*/
 
 static struct color_group *color_groups;
 static struct lock_class_key color_lock_keys[16];
@@ -59,6 +77,9 @@ static inline unsigned long page_bank(struct page *page)
 	return ((page_to_phys(page)& BANK_MASK) >> PAGE_SHIFT);
 }
 
+/*
+ * Determine the smallest number of pages held by any of the page lists.
+ */
 static unsigned long smallest_nr_pages(void)
 {
 	unsigned long i, min_pages = -1;
@@ -70,8 +91,9 @@ static unsigned long smallest_nr_pages(void)
 	}
 	return min_pages;
 }
+
 /*
- * Page's count should be one, it sould not be on any LRU list.
+ * Add a page to current pool.
  */
 void add_page_to_color_list(struct page *page)
 {
@@ -82,22 +104,26 @@ void add_page_to_color_list(struct page *page)
 	spin_lock(&cgroup->lock);
 	list_add_tail(&page->lru, &cgroup->list);
 	atomic_inc(&cgroup->nr_pages);
-//	SetPageLRU(page);
 	spin_unlock(&cgroup->lock);
 }
 
+/*
+ * Replenish the page pool. 
+ * If the newly allocated page is what we want, it will be pushed to the correct page list;
+ * otherwise, it will be freed. 
+ */
 static int do_add_pages(void)
 {
-	//printk("LITMUS do add pages\n");
+	printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
 	unsigned long color;
 	int ret = 0;
 
+	// until all the page lists contain enough pages 
 	while (smallest_nr_pages() < PAGES_PER_COLOR) {
 	
-		//page = alloc_page(GFP_HIGHUSER | __GFP_MOVABLE);
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
 		
 		if (unlikely(!page)) {
@@ -107,70 +133,79 @@ static int do_add_pages(void)
 		}
 		color = page_color(page);
 		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR) {
-	//		SetPageReserved(page);
 			add_page_to_color_list(page);
-		} else
+		} else{
+			// Pages here will be freed later 
 			list_add_tail(&page->lru, &free_later);
+		}
 	}
+	// Free the unwanted pages
 	list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
 		list_del(&page->lru);
 		__free_page(page);
 	}
-	/* setup the color queue stuff */
-//	ret = setup_flusher_array();
 out:
 	return ret;
 }
 
-extern int l2_usable_sets;
+/*
+ * Provide pages for replacement according to cache color.
+ * This should be the only implementation here
+ * This function should not be accessed by others directly. 
+ * 
+ */ 
+static struct  page *new_alloc_page_color( unsigned long color)
+{
+	struct color_group *cgroup;
+	struct page *rPage = NULL;
+
+	printk("allocate new page color = %lu\n", color);
+	/* color is unsigned, so only the upper bound can be violated. */
+	if (color > 15) {
+		TRACE_CUR("Wrong color %lu\n", color);
+		printk(KERN_WARNING "Wrong color %lu\n", color);
+		/* cgroup is unset and no lock is held: must not unlock. */
+		goto out;
+	}
+	cgroup = &color_groups[color];
+	spin_lock(&cgroup->lock);
+	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
+		TRACE_CUR("No free %lu colored pages.\n", color);
+		printk(KERN_WARNING "no free %lu colored pages.\n", color);
+		goto out_unlock;
+	}
+	rPage = list_first_entry(&cgroup->list, struct page, lru);
+	BUG_ON(page_count(rPage) > 1);
+	get_page(rPage);
+	list_del(&rPage->lru);
+	atomic_dec(&cgroup->nr_pages);
+out_unlock:
+	spin_unlock(&cgroup->lock);
+out:
+	/* Runs on every path, including the failure paths above. */
+	do_add_pages();
+	return rPage;
+}
+
 
 /*
- * provide pages for replacement 
+ * Provide pages for replacement according to the node number:
  * node = 0 for Level A, B tasks in Cpu 0
  * node = 1 for Level A, B tasks in Cpu 1
  * node = 2 for Level A, B tasks in Cpu 2
  * node = 3 for Level A, B tasks in Cpu 3
  * node = 4 for Level C tasks 
  */
-#if 1
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
-	//printk("allocate new page node = %d\n", node);	
+	printk("allocate new page node = %d\n", node);	
 //	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
 	get_random_bytes(&color, sizeof(unsigned int));
 	
-	/*
-	if(node ==0){
-		color = (color%2)*8+node;
-	}else if(node == 1){
-		color = (color%2)*8+node;
-	}else if(node == 2){
-		color = (color%2)*8+;
-	}else if(node == 3){
-		color = color%2 + 6;
-	}else if(node == 4){
-		color = color%8 + 8;
-	}else{
-		goto out;
-	}
-	*/
-	switch(node ){
-		case 0:
-			color = (color % l2_usable_sets);
-			break;
-		case 1: 
-		case 2: 
-		case 3:
-		case 4:
-			color = (color% (16-l2_usable_sets)) + l2_usable_sets;
-			break;
-		default:
-			goto out;
-	}
-	/*
+	// Decode the node to decide what color pages we should provide
 	switch(node ){
 		case 0:
 		case 1: 
@@ -184,34 +219,22 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 				color+=4;
 			break;
 		default:
-			goto out;
+			TRACE_CUR("Wrong color %lu\n", color);	
+			printk(KERN_WARNING "Wrong color %lu\n", color);
+			return rPage;
 	}
-	*/
 
-	//printk("allocate new page color = %d\n", color);
-	//TRACE("allocate new page color = %d\n", color);
+
+	printk("allocate new page color = %d\n", color);
 		
-	cgroup = &color_groups[color];
-	spin_lock(&cgroup->lock);
-	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
-		//TRACE_CUR("No free %lu colored pages.\n", color);
-		printk(KERN_WARNING "no free %lu colored pages.\n", color);
-		goto out_unlock;
-	}
-	rPage = list_first_entry(&cgroup->list, struct page, lru);
-	BUG_ON(page_count(rPage) > 1);
-	get_page(rPage);
-	list_del(&rPage->lru);
-	atomic_dec(&cgroup->nr_pages);
-//	ClearPageLRU(rPage);
-out_unlock:
-	spin_unlock(&cgroup->lock);
-out:
-	do_add_pages();
-	return rPage;
+	rPage =  new_alloc_page_color(color);
+	return rPage; 
 }
-#endif
 
+/*
+ * Provide pages for replacement according to bank number. 
+ * This is used in cache way partition 
+ */
 struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int **x)
 {
 	printk("allocate new page bank = %d\n", banknr);	
@@ -225,30 +248,43 @@ struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int
 	}else{
 		goto out;
 	}
+	
+	rPage =  new_alloc_page_color(color);
 		
-	cgroup = &color_groups[color];
-	spin_lock(&cgroup->lock);
-	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
-		TRACE_CUR("No free %lu colored pages.\n", color);
-		printk(KERN_WARNING "no free %lu colored pages.\n", color);
-		goto out_unlock;
-	}
-	rPage = list_first_entry(&cgroup->list, struct page, lru);
-	BUG_ON(page_count(rPage) > 1);
-	get_page(rPage);
-	list_del(&rPage->lru);
-	atomic_dec(&cgroup->nr_pages);
-//	ClearPageLRU(rPage);
-out_unlock:
-	spin_unlock(&cgroup->lock);
 out:
-	do_add_pages();
 	return rPage;
+}
 
 
+void set_number_of_colors(unsigned long colornr)
+{
+	curr_cachecolor = 0;		/* color new_alloc_page_predefined() requests */
+	used_cachecolor = colornr;	/* divisor of the modulo in new_alloc_page_predefined() */
+}
+
 
+/*
+ * Provide pages for replacement 
+ * This is used to generate experiments 
+ */
+struct page *new_alloc_page_predefined(struct page *page,  int **x)
+{
+	/* Serve the color the shared cursor currently points at. */
+	unsigned int color = curr_cachecolor;
+	struct page *rPage = NULL;
+
+	printk("allocate new page color = %u\n", color);
+	rPage = new_alloc_page_color(color);
+	/* Advance the shared cursor, not the local copy; avoid modulo by zero. */
+	if (used_cachecolor)
+		curr_cachecolor = (color + 1) % used_cachecolor;
+	return rPage;
 }
 
+
+/*
+ * Initialize the numbers of banks and cache colors 
+ */ 
 static int __init init_variables(void)
 {
 	number_banks = 1+(BANK_MASK >> PAGE_SHIFT); 
@@ -256,7 +292,9 @@ static int __init init_variables(void)
 }
 
 
-
+/*
+ * Initialize the page pool 
+ */
 static int __init init_color_groups(void)
 {
 	struct color_group *cgroup;
@@ -275,22 +313,20 @@ static int __init init_color_groups(void)
 			atomic_set(&cgroup->nr_pages, 0);
 			INIT_LIST_HEAD(&cgroup->list);
 			spin_lock_init(&cgroup->lock);
-//			LOCKDEP_DYNAMIC_ALLOC(&cgroup->lock, &color_lock_keys[i],
-//					cgroup->_lock_name, "color%lu", i);
 		}
 	}
 	return err;
 }
 
 /*
- * Initialzie the this proc 
+ * Initialize this proc
  */
 static int __init litmus_color_init(void)
 {
 	int err=0;
 	
-	INIT_LIST_HEAD(&alloced_pages.list);
-	spin_lock_init(&alloced_pages.lock);
+	//INIT_LIST_HEAD(&alloced_pages.list);
+	//spin_lock_init(&alloced_pages.lock);
 	init_variables();
 	printk("Cache number = %d , Cache mask = 0x%lx\n", number_cachecolors, CACHE_MASK); 
 	printk("Bank number = %d , Bank mask = 0x%lx\n", number_banks, BANK_MASK); 
-- 
1.8.1.2


From a27319a7ffe7f72828faec29c6748453297488a8 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 4 Mar 2015 09:58:24 -0500
Subject: [PATCH 094/119] fixed timer

---
 litmus/reservation.c | 26 ++++++++++++++++++--------
 litmus/sched_mc2.c   |  7 ++++---
 2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/litmus/reservation.c b/litmus/reservation.c
index 86d2f6e..25e838c 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -4,8 +4,8 @@
 #include <litmus/litmus.h>
 #include <litmus/reservation.h>
 
-#define TRACE(fmt, args...) do {} while (false)
-#define TRACE_TASK(fmt, args...) do {} while (false)
+//#define TRACE(fmt, args...) do {} while (false)
+//#define TRACE_TASK(fmt, args...) do {} while (false)
 
 void reservation_init(struct reservation *res)
 {
@@ -387,15 +387,25 @@ static void gmp_add_event(
 {
 	struct next_timer_event *nevent, *queued;
 	struct list_head *pos;
-	int found = 0;
+	int found = 0, update = 0;
 
 	//when = div64_u64(when, TIMER_RESOLUTION);
 	//when *= TIMER_RESOLUTION;
 //printk(KERN_ALERT "GMP_ADD id=%d type=%d when=%llu\n", id, type, when);
 	nevent = gmp_find_event_by_id(gmp_env, id);
 	
-	if (!nevent || nevent->type != type) {
-		nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
+	if (nevent)
+		TRACE("EVENT R%d update prev = %llu, new = %llu\n", nevent->id, nevent->next_update, when);
+	
+	if (nevent && nevent->next_update > when) {
+		list_del(&nevent->list);
+		update = 1;
+		
+	}
+	
+	if (!nevent || nevent->type != type || update == 1) {
+		if (update == 0)
+			nevent = kzalloc(sizeof(*nevent), GFP_ATOMIC);
 		BUG_ON(!nevent);
 		nevent->next_update = when;
 		nevent->id = id;
@@ -421,9 +431,9 @@ static void gmp_add_event(
 ; //printk(KERN_ALERT "EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
 	}
 	
-	//TRACE("======START PRINTING EVENT LIST======\n");
-	//gmp_print_events(gmp_env, litmus_clock());
-	//TRACE("======FINISH PRINTING EVENT LIST======\n");
+	TRACE("======START PRINTING EVENT LIST======\n");
+	gmp_print_events(gmp_env, litmus_clock());
+	TRACE("======FINISH PRINTING EVENT LIST======\n");
 }
 
 void gmp_add_event_after(
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index f7758f2..0d378c1 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -170,6 +170,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
 		res->is_ghost = 1;
+		gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
 	}		
@@ -305,7 +306,7 @@ static void mc2_update_timer_and_unlock(struct mc2_cpu_state *state)
 					litmus_reschedule(cpu);
 				}
 			}
-		} else if (event->next_update < update && event->timer_armed_on == NO_CPU) {
+		} else if (event->next_update < update && (event->timer_armed_on == NO_CPU || event->timer_armed_on == state->cpu)) {
 			event->timer_armed_on = state->cpu;
 			update = event->next_update;
 			break;
@@ -484,7 +485,7 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 		update = now + remain_budget;
 	}
 	
-	//TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
 //printk(KERN_ALERT "on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
 	if (update <= now) {
 		litmus_reschedule_local();
@@ -502,7 +503,7 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 			raw_spin_lock(&_lowest_prio_cpu.lock);
 			_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
 			raw_spin_unlock(&_lowest_prio_cpu.lock);
-			//TRACE("LOWEST CPU = P%d\n", cpu);
+			TRACE("LOWEST CPU = P%d\n", cpu);
 			litmus_reschedule(cpu);
 		}
 	} 
-- 
1.8.1.2


From 52cf970884a42f24c583ee9baeab536a6622991e Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Mon, 9 Mar 2015 13:33:55 -0400
Subject: [PATCH 095/119] add new functions for bank_proc.c

---
 litmus/bank_proc.c  | 55 ++++++++++++++++++++++++++++++++++-------------------
 litmus/cache_proc.c |  9 +++++++--
 2 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 295c450..844e090 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -18,6 +18,7 @@
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
 
+
 #define LITMUS_LOCKDEP_NAME_MAX_LEN 50
 
 // This Address Decoding is used in imx6-sabredsd platform
@@ -27,13 +28,14 @@
 
 #define PAGES_PER_COLOR 1024
 
-unsigned long used_cachecolor;
 unsigned long curr_cachecolor;
+int used_cachecolor;
 
 
 unsigned long number_banks;
 unsigned long number_cachecolors;
 
+
 /*
  * Every page list should contain a lock, a list, and a number recording how many pages it store
  */ 
@@ -183,12 +185,30 @@ static struct  page *new_alloc_page_color( unsigned long color)
 out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
-	do_add_pages();
+	if( smallest_nr_pages() == 0)
+		do_add_pages();
 	return rPage;
 }
 
 
 /*
+ * Provide pages for replacement 
+ * This is used to generate experiments 
+ */
+struct page *new_alloc_page_predefined(struct page *page,  int **x)
+{
+	unsigned int color = curr_cachecolor; 
+	
+	printk("allocate new page color = %d\n", color);	
+	struct color_group *cgroup;
+	struct page *rPage = NULL;
+	
+	rPage =  new_alloc_page_color(color);
+	curr_cachecolor = (color + 1)% used_cachecolor;
+out:
+	return rPage;
+}
+/*
  * provide pages for replacement according to  
  * node = 0 for Level A, B tasks in Cpu 0
  * node = 1 for Level A, B tasks in Cpu 1
@@ -196,6 +216,7 @@ out:
  * node = 3 for Level A, B tasks in Cpu 3
  * node = 4 for Level C tasks 
  */
+#if 1
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 	printk("allocate new page node = %d\n", node);	
@@ -230,6 +251,12 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 	rPage =  new_alloc_page_color(color);
 	return rPage; 
 }
+#else
+struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
+{
+	return new_alloc_page_predefined(page,  x);
+}
+#endif
 
 /*
  * Provide pages for replacement according to bank number. 
@@ -256,30 +283,14 @@ out:
 }
 
 
-void set_number_of_colors(unsigned long colornr)
+void set_number_of_colors(int colornr)
 {
+	printk(KERN_WARNING "Set of colors = %d.\n", colornr);
 	used_cachecolor = colornr ; 
 	curr_cachecolor = 0;
 }
 
 
-/*
- * Provide pages for replacement 
- * This is used to generate experiments 
- */
-struct page *new_alloc_page_predefined(struct page *page,  int **x)
-{
-	unsigned int color = curr_cachecolor; 
-	
-	printk("allocate new page color = %d\n", color);	
-	struct color_group *cgroup;
-	struct page *rPage = NULL;
-	
-	rPage =  new_alloc_page_color(color);
-	color = (color + 1)% used_cachecolor;
-out:
-	return rPage;
-}
 
 
 /*
@@ -289,6 +300,10 @@ static int __init init_variables(void)
 {
 	number_banks = 1+(BANK_MASK >> PAGE_SHIFT); 
 	number_cachecolors = 1+(CACHE_MASK >> PAGE_SHIFT);
+	used_cachecolor = 16;
+	curr_cachecolor = 0;
+
+	
 }
 
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 7b48d5c..e244616 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -13,6 +13,7 @@
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/cacheflush.h>
 
+
 #define UNLOCK_ALL	0x00000000 /* allocation in any way */
 #define LOCK_ALL        (~UNLOCK_ALL)
 #define MAX_NR_WAYS	16
@@ -50,7 +51,7 @@ struct mutex lockdown_proc;
 static int min_usable_ways = 0;
 static int max_usable_ways = 16;
 static int min_usable_sets = 1;
-static int max_usable_sets = 15;
+static int max_usable_sets = 16;
 
 static int zero = 0;
 static int one = 1;
@@ -67,6 +68,9 @@ int l2_usable_sets;
 int lock_all;
 int nr_lockregs;
 
+extern void set_number_of_colors(int colornr);
+
+
 static void print_lockdown_registers(void)
 {
 	int i;
@@ -212,7 +216,7 @@ int l2_usable_sets_handler(struct ctl_table *table, int write, void __user *buff
 	printk("l2_usable_sets : %d\n", l2_usable_sets);
 	
 	if (write) {
-		;
+		set_number_of_colors(l2_usable_sets);
 	}
 
 out:
@@ -277,6 +281,7 @@ static int __init litmus_sysctl_init(void)
 
 	l2_usable_ways = 16;
 	l2_usable_sets = 5;
+	set_number_of_colors(l2_usable_sets);
 
 out:
 	return ret;
-- 
1.8.1.2


From 987ed4eca956b1d445b796d8c494a1ab2826422e Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Mon, 9 Mar 2015 13:38:17 -0400
Subject: [PATCH 096/119] comment out all printk calls in bank_proc.c

---
 litmus/bank_proc.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 844e090..53c20db 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -116,7 +116,7 @@ void add_page_to_color_list(struct page *page)
  */
 static int do_add_pages(void)
 {
-	printk("LITMUS do add pages\n");
+//	printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
@@ -129,7 +129,7 @@ static int do_add_pages(void)
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
 		
 		if (unlikely(!page)) {
-			printk(KERN_WARNING "Could not allocate pages.\n");
+	//		printk(KERN_WARNING "Could not allocate pages.\n");
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -158,13 +158,13 @@ out:
  */ 
 static struct  page *new_alloc_page_color( unsigned long color)
 {
-	printk("allocate new page color = %d\n", color);	
+//	printk("allocate new page color = %d\n", color);	
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 		
 	if( (color <0) || (color)>15) {
 		TRACE_CUR("Wrong color %lu\n", color);	
-		printk(KERN_WARNING "Wrong color %lu\n", color);
+//		printk(KERN_WARNING "Wrong color %lu\n", color);
 		goto out_unlock;
 	}
 
@@ -173,7 +173,7 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	spin_lock(&cgroup->lock);
 	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
 		TRACE_CUR("No free %lu colored pages.\n", color);
-		printk(KERN_WARNING "no free %lu colored pages.\n", color);
+//		printk(KERN_WARNING "no free %lu colored pages.\n", color);
 		goto out_unlock;
 	}
 	rPage = list_first_entry(&cgroup->list, struct page, lru);
@@ -199,7 +199,7 @@ struct page *new_alloc_page_predefined(struct page *page,  int **x)
 {
 	unsigned int color = curr_cachecolor; 
 	
-	printk("allocate new page color = %d\n", color);	
+//	printk("allocate new page color = %d\n", color);	
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	
@@ -219,7 +219,7 @@ out:
 #if 1
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
-	printk("allocate new page node = %d\n", node);	
+//	printk("allocate new page node = %d\n", node);	
 //	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
@@ -241,12 +241,12 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 			break;
 		default:
 			TRACE_CUR("Wrong color %lu\n", color);	
-			printk(KERN_WARNING "Wrong color %lu\n", color);
+//			printk(KERN_WARNING "Wrong color %lu\n", color);
 			return rPage;
 	}
 
 
-	printk("allocate new page color = %d\n", color);
+//	printk("allocate new page color = %d\n", color);
 		
 	rPage =  new_alloc_page_color(color);
 	return rPage; 
@@ -264,7 +264,7 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
  */
 struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int **x)
 {
-	printk("allocate new page bank = %d\n", banknr);	
+//	printk("allocate new page bank = %d\n", banknr);	
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
@@ -285,7 +285,7 @@ out:
 
 void set_number_of_colors(int colornr)
 {
-	printk(KERN_WARNING "Set of colors = %d.\n", colornr);
+//	printk(KERN_WARNING "Set of colors = %d.\n", colornr);
 	used_cachecolor = colornr ; 
 	curr_cachecolor = 0;
 }
@@ -319,7 +319,7 @@ static int __init init_color_groups(void)
 	color_groups = kmalloc(number_cachecolors *
 			sizeof(struct color_group), GFP_KERNEL);
 	if (!color_groups) {
-		printk(KERN_WARNING "Could not allocate color groups.\n");
+//		printk(KERN_WARNING "Could not allocate color groups.\n");
 		err = -ENOMEM;
 	}else{
 
@@ -343,12 +343,12 @@ static int __init litmus_color_init(void)
 	//INIT_LIST_HEAD(&alloced_pages.list);
 	//spin_lock_init(&alloced_pages.lock);
 	init_variables();
-	printk("Cache number = %d , Cache mask = 0x%lx\n", number_cachecolors, CACHE_MASK); 
-	printk("Bank number = %d , Bank mask = 0x%lx\n", number_banks, BANK_MASK); 
+//	printk("Cache number = %d , Cache mask = 0x%lx\n", number_cachecolors, CACHE_MASK); 
+//	printk("Bank number = %d , Bank mask = 0x%lx\n", number_banks, BANK_MASK); 
 	init_color_groups();			
 	do_add_pages();
 
-	printk(KERN_INFO "Registering LITMUS^RT color and bank proc.\n");
+//	printk(KERN_INFO "Registering LITMUS^RT color and bank proc.\n");
 	return err;
 }
 
-- 
1.8.1.2


From fa47a8c03809058a0823cfbeeff5a574eae344f7 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Mon, 9 Mar 2015 13:41:28 -0400
Subject: [PATCH 097/119] Add way_partition proc.

---
 include/litmus/rt_param.h |   1 +
 litmus/cache_proc.c       |  54 ++++++++++++++++++++
 litmus/ctrldev.c          |   4 +-
 litmus/litmus.c           |  73 +++++++++++++--------------
 litmus/reservation.c      |   9 ++--
 litmus/sched_mc2.c        | 124 ++++++++++++++++++++++++++++------------------
 6 files changed, 175 insertions(+), 90 deletions(-)

diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index 284b89e..a3dde39 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -326,6 +326,7 @@ struct rt_param {
 	
 	/* Mixed-criticality specific data */
 	struct mc2_task* mc2_data;
+	unsigned long addr_ctrl_page;
 };
 
 #endif
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 7b48d5c..13c595c 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -9,6 +9,8 @@
 
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
+#include <litmus/cache_proc.h>
+#include <litmus/mc2_common.h>
 
 #include <asm/hardware/cache-l2x0.h>
 #include <asm/cacheflush.h>
@@ -37,6 +39,14 @@ u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
 	0xFFFF0000, /* way ~15 unlocked. usable = 16 */
 };
 
+u32 way_partitions[5] = {
+	0xFFFFFFFC, /* cpu0 A and B */
+	0xFFFFFFF3, /* cpu1 A and B */
+	0xFFFFFFCF, /* cpu2 A and B */
+	0xFFFFFF3F, /* cpu3 A and B */
+	0xFFFF00FF, /* lv C */
+};
+
 static void __iomem *cache_base;
 static void __iomem *lockreg_d;
 static void __iomem *lockreg_i;
@@ -66,6 +76,7 @@ int l2_usable_ways;
 int l2_usable_sets;
 int lock_all;
 int nr_lockregs;
+int use_way_partition;
 
 static void print_lockdown_registers(void)
 {
@@ -195,6 +206,19 @@ out:
 	return ret;
 }
 
+void do_way_partition(enum crit_level lv, int cpu)
+{
+	if (use_way_partition == 1) {
+		if (lv < CRIT_LEVEL_C) {
+			writel_relaxed(way_partitions[cpu], ld_d_reg(cpu));
+			writel_relaxed(way_partitions[cpu], ld_i_reg(cpu));
+		} else {
+			writel_relaxed(way_partitions[4], ld_d_reg(cpu));
+			writel_relaxed(way_partitions[4], ld_i_reg(cpu));
+		}
+	}
+}
+		
 int l2_usable_sets_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
@@ -220,6 +244,26 @@ out:
 	return ret;
 }
 
+int use_way_partition_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	
+	mutex_lock(&lockdown_proc);
+	
+	flush_cache_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+		
+	printk(KERN_INFO "use_way_partition : %d\n", use_way_partition);
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
 static struct ctl_table cache_table[] =
 {
 	{
@@ -249,6 +293,15 @@ static struct ctl_table cache_table[] =
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "use_way_partition",
+		.mode		= 0666,
+		.proc_handler	= use_way_partition_handler,
+		.data		= &use_way_partition,
+		.maxlen		= sizeof(use_way_partition),
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
@@ -277,6 +330,7 @@ static int __init litmus_sysctl_init(void)
 
 	l2_usable_ways = 16;
 	l2_usable_sets = 5;
+	use_way_partition = 0;
 
 out:
 	return ret;
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
index 877f278..208a212 100644
--- a/litmus/ctrldev.c
+++ b/litmus/ctrldev.c
@@ -43,7 +43,7 @@ static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
 
 	if (err)
 		TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
-
+	tsk_rt(t)->addr_ctrl_page = vma->vm_start;
 	return err;
 }
 
@@ -104,7 +104,7 @@ static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
 	 * don't care if it was touched or not. PAGE_SHARED means RW access, but
 	 * not execute, and avoids copy-on-write behavior.
 	 * See protection_map in mmap.c.  */
-	vma->vm_page_prot = PAGE_SHARED;
+	//vma->vm_page_prot = PAGE_SHARED;
 
 	err = alloc_ctrl_page(current);
 	if (!err)
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 4ff840d..344c68c 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -17,6 +17,7 @@
 #include <linux/migrate.h>
 #include <linux/mm.h>
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
 
 #include <litmus/litmus.h>
 #include <litmus/bheap.h>
@@ -346,45 +347,33 @@ static inline unsigned long page_color(struct page *page)
 }
 
 /*
-static struct page *walk_page_table(unsigned long addr)
+static struct page *page_by_address(const struct mm_struct *const mm,
+                             const unsigned long address)
 {
     pgd_t *pgd;
-    pte_t *ptep, pte;
     pud_t *pud;
     pmd_t *pmd;
-
+    pte_t *pte;
     struct page *page = NULL;
-    struct mm_struct *mm = current->mm;
-
-    pgd = pgd_offset(mm, addr);
-    //if (pgd_none(*pgd) || pgd_bad(*pgd))
-	if (pgd_none_or_clear_bad(pgd))
-        goto out;
-    
-    pud = pud_offset(pgd, addr);
-    //if (pud_none(*pud) || pud_bad(*pud))
-	if (pud_none_or_clear_bad(pud))
-        goto out;
-    
-    pmd = pmd_offset(pud, addr);
-    //if (pmd_none(*pmd) || pmd_bad(*pmd))
-	if (pmd_none_or_clear_bad(pmd))
-        goto out;
-    
-    ptep = pte_offset_map(pmd, addr);
-    if (!ptep)
-        goto out;
-    pte = *ptep;
-
-    page = pte_page(pte);
-    if (pfn_valid(__page_to_pfn(page))) {
-        ;//printk(KERN_INFO "page frame struct is @ %p\n", page);
-		//printk(KERN_INFO "pfn is %lu\n", __page_to_pfn(page));
-	}
-	
-	pte_unmap(ptep);
 
- out:
+    pgd = pgd_offset(mm, address);
+    if (!pgd_present(*pgd))
+        goto do_return;
+
+    pud = pud_offset(pgd, address);
+    if (!pud_present(*pud))
+        goto do_return;
+
+    pmd = pmd_offset(pud, address);
+    if (!pmd_present(*pmd))
+        goto do_return;
+
+    pte = pte_offset_kernel(pmd, address);
+    if (!pte_present(*pte))
+        goto do_return;
+
+    page = pte_page(*pte);
+do_return:
     return page;
 }
 */
@@ -392,7 +381,7 @@ static struct page *walk_page_table(unsigned long addr)
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
 
-#if 0
+#if 1
 static struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
@@ -456,7 +445,7 @@ asmlinkage long sys_set_page_color(int cpu)
 				continue;
 			}
 			
-			TRACE_TASK(current, "addr: %lu, pfn: %lu, _mapcount: %d, _count: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page));
+			TRACE_TASK(current, "addr: %08x, pfn: %x, _mapcount: %d, _count: %d\n", vma_itr->vm_start + PAGE_SIZE*i, __page_to_pfn(old_page), page_mapcount(old_page), page_count(old_page));
 			
 			if (page_mapcount(old_page) != 0) {
 				ret = isolate_lru_page(old_page);
@@ -466,6 +455,7 @@ asmlinkage long sys_set_page_color(int cpu)
 					nr_pages++;
 				} else {
 					TRACE_TASK(current, "isolate_lru_page failed\n");
+					TRACE_TASK(current, "page_lru = %d PageLRU = %d\n", page_lru(old_page), PageLRU(old_page));
 					nr_failed++;
 				}
 				put_page(old_page);
@@ -506,6 +496,17 @@ asmlinkage long sys_set_page_color(int cpu)
 		}
 	}
 	
+	/* handle sigpage and litmus ctrl_page */
+	vma_itr = current->mm->mmap;
+	while (vma_itr != NULL) {
+		if (vma_itr->vm_start == tsk_rt(current)->addr_ctrl_page) {
+			TRACE("litmus ctrl_page = %08x\n", vma_itr->vm_start);
+			vma_itr->vm_page_prot = PAGE_SHARED;
+			break;
+		}
+		vma_itr = vma_itr->vm_next;
+	}
+	
 	/* copy shared pages HERE */
 /*	
 	ret = 0;
@@ -901,7 +902,7 @@ static int __init _init_litmus(void)
 #endif
 	
 	color_mask = ((cache_info_sets << line_size_log) - 1) ^ (PAGE_SIZE - 1);
-	printk("Page color mask %08x\n", color_mask);
+	printk("Page color mask %lx\n", color_mask);
 	return 0;
 }
 
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 25e838c..8457b4b 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -555,7 +555,7 @@ static void gmp_charge_budget(
 			if (res->state != RESERVATION_ACTIVE_IDLE)
 				TRACE("BUG!!!!!!!!!!!! gmp_charge_budget()\n");
 			TRACE("gmp_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
-			//if (res->is_ghost == 1) {
+			//if (res->is_ghost != NO_CPU) {
 				TRACE("DRAIN !!\n");
 				drained = 1;
 				res->ops->drain_budget(res, delta);
@@ -574,7 +574,7 @@ static void gmp_charge_budget(
 			/* stop at the first ACTIVE reservation */
 		//	break;
 	}
-	TRACE("finished charging budgets\n");
+	//TRACE("finished charging budgets\n");
 }
 
 static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
@@ -586,12 +586,15 @@ static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 		res = list_entry(pos, struct reservation, list);
 		if (res->next_replenishment <= gmp_env->env.current_time) {
 			res->ops->replenish(res);
+			if (res->is_ghost != NO_CPU) {
+				TRACE("R%d replenished! scheduled_on=%d\n", res->id, res->scheduled_on);
+			}
 		} else {
 			/* list is ordered by increasing depletion times */
 			break;
 		}
 	}
-	TRACE("finished replenishing budgets\n");
+	//TRACE("finished replenishing budgets\n");
 }
 
 #define EPSILON	50
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 0d378c1..3aaa88c 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -20,11 +20,15 @@
 #include <litmus/budget.h>
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
+#include <litmus/cache_proc.h>
 
 #include <litmus/mc2_common.h>
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
+extern int use_way_partition;
+extern void do_way_partition(enum crit_level lv, int cpu);
+
 /* _global_env - reservation container for level-C tasks*/
 struct gmp_reservation_environment _global_env;
 
@@ -126,21 +130,6 @@ static struct reservation* res_find_by_id(struct mc2_cpu_state *state,
 	return res;
 }
 
-/* mc2_update_time - update time for a given criticality level. 
- *                   caller must hold a proper lock
- *                   (cpu_state lock or global lock)
- */
-static void mc2_update_time(enum crit_level lv, 
-                            struct mc2_cpu_state *state, lt_t time)
-{
-	if (lv < CRIT_LEVEL_C)
-		sup_update_time(&state->sup_env, time);
-	else if (lv == CRIT_LEVEL_C)
-		gmp_update_time(&_global_env, time);
-	else
-		TRACE("update_time(): Criticality level error!!!!\n");
-}
-
 /* task_depart - remove a task from its reservation
  *               If the job has remaining budget, convert it to a ghost job
  *               and update crit_entries[]
@@ -169,7 +158,7 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
-		res->is_ghost = 1;
+		res->is_ghost = state->cpu;
 		gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
@@ -256,6 +245,36 @@ static int get_lowest_prio_cpu(lt_t priority)
 	return ret;
 }
 
+/* mc2_update_time - update time for a given criticality level. 
+ *                   caller must hold a proper lock
+ *                   (cpu_state lock or global lock)
+ */
+static void mc2_update_time(enum crit_level lv, 
+                            struct mc2_cpu_state *state, lt_t time)
+{
+	int global_schedule_now;
+	
+	if (lv < CRIT_LEVEL_C)
+		sup_update_time(&state->sup_env, time);
+	else if (lv == CRIT_LEVEL_C) {
+		global_schedule_now = gmp_update_time(&_global_env, time);
+		while (global_schedule_now--) {
+			int cpu = get_lowest_prio_cpu(0);
+			if (cpu != NO_CPU) {
+				raw_spin_lock(&_lowest_prio_cpu.lock);
+				_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
+				raw_spin_unlock(&_lowest_prio_cpu.lock);
+				TRACE("LOWEST CPU = P%d\n", cpu);
+				litmus_reschedule(cpu);
+			}
+		} 
+	}
+	else
+		TRACE("update_time(): Criticality level error!!!!\n");
+	
+	
+}
+
 /* NOTE: drops state->lock */
 /* mc2_update_timer_and_unlock - set a timer and g_timer and unlock 
  *                               Whenever res_env.current_time is updated,
@@ -392,7 +411,7 @@ static lt_t mc2_update_ghost_state(struct mc2_cpu_state *state)
 				TRACE("GHOST FINISH id %d at %llu\n", 
 				      tinfo->mc2_param.res_id, litmus_clock());
 				ce->running = NULL;
-				res->is_ghost = 0;
+				res->is_ghost = NO_CPU;
 				
 				if (lv < CRIT_LEVEL_C) {
 					res = list_first_entry_or_null(
@@ -485,7 +504,7 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 		update = now + remain_budget;
 	}
 	
-	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
+	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d remain_budget:%llu\n", now, update, state->cpu, global_schedule_now, remain_budget);
 //printk(KERN_ALERT "on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
 	if (update <= now) {
 		litmus_reschedule_local();
@@ -534,19 +553,21 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 					return tsk;
 				} else {
 					ce = &state->crit_entries[lv];
+					sup_scheduler_update_after(sup_env, res->cur_budget);
+					res->blocked_by_ghost = 0;
+					res->is_ghost = NO_CPU;
+					return tsk;
+/*
 					if (likely(!ce->running)) {
-						/* If we found the next task, clear all flags */
 						sup_scheduler_update_after(sup_env, res->cur_budget);
 						res->blocked_by_ghost = 0;
-						res->is_ghost = 0;
+						res->is_ghost = NO_CPU;
 						return tsk;
 					} else {
-						/* We cannot schedule the same criticality task
-						   because the ghost job exists. Set blocked_by_ghost
-						   flag not to charge budget */
 						res->blocked_by_ghost = 1;
 						TRACE_TASK(ce->running, " is GHOST\n");
 					}
+*/
 				}
 			}
 		}
@@ -562,10 +583,10 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	
 	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
 	
-	TRACE("****** ACTIVE LIST ******\n");
-	TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
+	//TRACE("****** ACTIVE LIST ******\n");
+	//TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
 	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
-		TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
+		//TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
 		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
 			tsk = res->ops->dispatch_client(res, &time_slice);
 			if (likely(tsk)) {
@@ -580,7 +601,7 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 						res->event_added = 1;
 						res->blocked_by_ghost = 0;
-						res->is_ghost = 0;
+						res->is_ghost = NO_CPU;
 						res->scheduled_on = state->cpu;
 						return tsk;
 					//} else {
@@ -599,23 +620,23 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 }
 
 /* not used now */
-static void pre_schedule(struct task_struct *prev)
+static void pre_schedule(struct task_struct *prev, int cpu)
 {
-	enum crit_level lv;
-	if (!is_realtime(prev) || !prev)
+	if (!prev || !is_realtime(prev))
 		return;
 	
-	lv = get_task_crit_level(prev);
+	do_way_partition(CRIT_LEVEL_C, cpu);
 }
 
 /* not used now */
-static void post_schedule(struct task_struct *next)
+static void post_schedule(struct task_struct *next, int cpu)
 {
-	enum crit_level lv;
-	if (!is_realtime(next) || !next)
+	enum crit_level lev;
+	if (!next || !is_realtime(next))
 		return;
 	
-	lv = get_task_crit_level(next);
+	lev = get_task_crit_level(next);
+	do_way_partition(lev, cpu);
 }
 
 /* mc2_schedule - main scheduler function. pick the next task to run
@@ -626,7 +647,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
 
-	pre_schedule(prev);
+	pre_schedule(prev, state->cpu);
 	
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
@@ -698,7 +719,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 		TRACE_TASK(state->scheduled, "scheduled.\n");
 	}
 	
-	post_schedule(state->scheduled);
+	post_schedule(state->scheduled, state->cpu);
 	
 	return state->scheduled;
 }
@@ -817,7 +838,7 @@ static long mc2_complete_job(void)
 		
 		/* set next_replenishtime to synchronous release time */
 		res->next_replenishment = tsk_rt(current)->sporadic_release_time;
-		
+/*		
 		if (get_task_crit_level(current) == CRIT_LEVEL_A) {
 			struct table_driven_reservation *tdres;
 			tdres = container_of(res, struct table_driven_reservation, res);
@@ -825,6 +846,7 @@ static long mc2_complete_job(void)
 			tdres->major_cycle_start = tsk_rt(current)->sporadic_release_time;
 			res->next_replenishment += tdres->intervals[0].start;			
 		}
+*/		
 		res->cur_budget = 0;
 		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
 		
@@ -849,15 +871,15 @@ static long mc2_complete_job(void)
 		set_current_state(TASK_INTERRUPTIBLE);
 		preempt_enable_no_resched();
 		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
-		if (get_task_crit_level(current) == CRIT_LEVEL_A)
-			sched_trace_task_release(current);
+//		if (get_task_crit_level(current) == CRIT_LEVEL_A)
+//			sched_trace_task_release(current);
 	} else {
 		/* release the next job immediately */
 		err = 0;
 		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
 		preempt_enable();
-		if (get_task_crit_level(current) == CRIT_LEVEL_A)
-			sched_trace_task_release(current);
+//		if (get_task_crit_level(current) == CRIT_LEVEL_A)
+//			sched_trace_task_release(current);
 	}
 
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
@@ -1073,11 +1095,13 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		sup_env = &state->sup_env;
 		list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
 			if (res->id == reservation_id) {
-				if (lv == CRIT_LEVEL_A) {
+/*
+			if (lv == CRIT_LEVEL_A) {
 					struct table_driven_reservation *tdres;
 					tdres = container_of(res, struct table_driven_reservation, res);
 					kfree(tdres->intervals);
-				}
+			}
+*/
 				list_del(&res->list);
 				kfree(res);
 				found = 1;
@@ -1087,11 +1111,12 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
 				if (res->id == reservation_id) {
-					if (lv == CRIT_LEVEL_A) {
+/*					if (lv == CRIT_LEVEL_A) {
 						struct table_driven_reservation *tdres;
 						tdres = container_of(res, struct table_driven_reservation, res);
 						kfree(tdres->intervals);
 					}
+*/
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1102,11 +1127,12 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
 				if (res->id == reservation_id) {
-					if (lv == CRIT_LEVEL_A) {
+/*					if (lv == CRIT_LEVEL_A) {
 						struct table_driven_reservation *tdres;
 						tdres = container_of(res, struct table_driven_reservation, res);
 						kfree(tdres->intervals);
 					}
+*/
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1249,7 +1275,7 @@ static long create_polling_reservation(
 				config->polling_params.offset);
 			pres->res.id = config->id;
 			pres->res.blocked_by_ghost = 0;
-			pres->res.is_ghost = 0;
+			pres->res.is_ghost = NO_CPU;
 			if (!use_edf)
 				pres->res.priority = config->priority;
 			sup_add_new_reservation(&state->sup_env, &pres->res);
@@ -1274,7 +1300,7 @@ static long create_polling_reservation(
 			pres->res.id = config->id;
 			pres->res.blocked_by_ghost = 0;
 			pres->res.scheduled_on = NO_CPU;
-			pres->res.is_ghost = 0;
+			pres->res.is_ghost = NO_CPU;
 			if (!use_edf)
 				pres->res.priority = config->priority;
 			gmp_add_new_reservation(&_global_env, &pres->res);
@@ -1511,7 +1537,7 @@ static void mc2_finish_switch(struct task_struct *prev)
 	struct mc2_cpu_state *state = local_cpu_state();
 	
 	state->scheduled = is_realtime(current) ? current : NULL;
-	TRACE("FINISH CXS! from %s/%d to %s/%d\n", prev ? (prev)->comm : "null", prev ? (prev)->pid : 0, current ? (current)->comm : "null", current ? (current)->pid : 0);
+	//TRACE("FINISH CXS! from %s/%d to %s/%d\n", prev ? (prev)->comm : "null", prev ? (prev)->pid : 0, current ? (current)->comm : "null", current ? (current)->pid : 0);
 }
 
 static long mc2_deactivate_plugin(void)
-- 
1.8.1.2


From e5c2080e0d7cb2201d021edd7d89f3c2e783744e Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Tue, 10 Mar 2015 11:42:17 -0400
Subject: [PATCH 098/119] add use_set_partition

---
 arch/arm/kernel/irq.c       |  4 +++
 include/litmus/cache_proc.h |  2 ++
 litmus/cache_proc.c         | 68 ++++++++++++++++++++++++++++++++++++++++++++-
 litmus/litmus.c             | 20 +++++++++++--
 litmus/sched_mc2.c          |  4 +--
 5 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 9723d17..ce01835 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -42,6 +42,8 @@
 #include <asm/mach/irq.h>
 #include <asm/mach/time.h>
 
+#include <litmus/cache_proc.h>
+
 unsigned long irq_err_count;
 
 int arch_show_interrupts(struct seq_file *p, int prec)
@@ -66,6 +68,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
+	enter_irq_mode();
 	irq_enter();
 
 	/*
@@ -81,6 +84,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 	}
 
 	irq_exit();
+	exit_irq_mode();
 	set_irq_regs(old_regs);
 }
 
diff --git a/include/litmus/cache_proc.h b/include/litmus/cache_proc.h
index a7a740e..5a66c34 100644
--- a/include/litmus/cache_proc.h
+++ b/include/litmus/cache_proc.h
@@ -4,6 +4,8 @@
 #ifdef __KERNEL__
 
 void litmus_setup_lockdown(void __iomem*, u32);
+void enter_irq_mode(void);
+void exit_irq_mode(void);
 
 #endif
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index f315391..bb9d341 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -48,6 +48,22 @@ u32 way_partitions[5] = {
 	0xFFFF00FF, /* lv C */
 };
 
+u32 prev_lockdown_d_reg[5] = {
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0xFFFF00FF, /* share with level-C */
+};
+
+u32 prev_lockdown_i_reg[5] = {
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0xFFFF00FF, /* share with level-C */
+};
+
 static void __iomem *cache_base;
 static void __iomem *lockreg_d;
 static void __iomem *lockreg_i;
@@ -78,6 +94,7 @@ int l2_usable_sets;
 int lock_all;
 int nr_lockregs;
 int use_way_partition;
+int use_set_partition;
 
 extern void set_number_of_colors(int colornr);
 
@@ -222,7 +239,26 @@ void do_way_partition(enum crit_level lv, int cpu)
 		}
 	}
 }
-		
+
+void enter_irq_mode(void)
+{
+	int cpu = smp_processor_id();
+	
+	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
+	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
+	
+	writel_relaxed(prev_lockdown_i_reg[4], ld_i_reg(cpu));
+	writel_relaxed(prev_lockdown_d_reg[4], ld_d_reg(cpu));
+}
+
+void exit_irq_mode(void)
+{
+	int cpu = smp_processor_id();
+	
+	writel_relaxed(prev_lockdown_i_reg[cpu], ld_i_reg(cpu));
+	writel_relaxed(prev_lockdown_d_reg[cpu], ld_d_reg(cpu));	
+}
+
 int l2_usable_sets_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
@@ -268,6 +304,26 @@ out:
 	return ret;
 }
 
+int use_set_partition_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	
+	mutex_lock(&lockdown_proc);
+	
+	flush_cache_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+		
+	printk(KERN_INFO "use_set_partition : %d\n", use_set_partition);
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
 static struct ctl_table cache_table[] =
 {
 	{
@@ -306,6 +362,15 @@ static struct ctl_table cache_table[] =
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "use_set_partition",
+		.mode		= 0666,
+		.proc_handler	= use_set_partition_handler,
+		.data		= &use_set_partition,
+		.maxlen		= sizeof(use_set_partition),
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{ }
 };
 
@@ -335,6 +400,7 @@ static int __init litmus_sysctl_init(void)
 	l2_usable_ways = 16;
 	l2_usable_sets = 5;
 	use_way_partition = 0;
+	use_set_partition = 0;
 	set_number_of_colors(l2_usable_sets);
 
 out:
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 344c68c..77c609b 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -314,8 +314,17 @@ asmlinkage long sys_null_call(cycles_t __user *ts)
 		now = litmus_get_cycles();
 		ret = put_user(now, ts);
 	}
-	else
-		flush_cache_all();
+	else {
+		//flush_cache_all();
+		int *dummy;
+		int size = 20, i, t = 0;
+		dummy = kmalloc(PAGE_SIZE*size, GFP_ATOMIC);
+		for (i = 0; i<PAGE_SIZE*size/sizeof(int); i++) {
+			dummy[i] = t++;
+		}
+		
+		kfree(dummy);
+	}
 
 	return ret;
 }
@@ -391,6 +400,8 @@ extern struct page *new_alloc_page(struct page *page, unsigned long node, int **
 
 #endif
 
+extern int use_set_partition;
+
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
@@ -401,10 +412,13 @@ asmlinkage long sys_set_page_color(int cpu)
 	//struct task_page *entry = NULL;
 	int nr_pages = 0, nr_shared_pages = 0, nr_failed = 0;
 	unsigned long node;
-	
+		
 	LIST_HEAD(pagelist);
 	LIST_HEAD(shared_pagelist);
 	
+	if (use_set_partition == 0)
+		return 0;
+	
 	down_read(&current->mm->mmap_sem);
 	TRACE_TASK(current, "SYSCALL set_page_color\n");
 	vma_itr = current->mm->mmap;
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 3aaa88c..aa6452a 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -655,9 +655,9 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	//BUG_ON(state->scheduled && state->scheduled != prev);
 	//BUG_ON(state->scheduled && !is_realtime(prev));
 	if (state->scheduled && state->scheduled != prev)
-		printk(KERN_ALERT "BUG1!!!!!!!! %s %s\n", state->scheduled ? (state->scheduled)->comm : "null", prev ? (prev)->comm : "null");
+		; //printk(KERN_ALERT "BUG1!!!!!!!! %s %s\n", state->scheduled ? (state->scheduled)->comm : "null", prev ? (prev)->comm : "null");
 	if (state->scheduled && !is_realtime(prev))
-		printk(KERN_ALERT "BUG2!!!!!!!! \n");
+		; //printk(KERN_ALERT "BUG2!!!!!!!! \n");
 
 	/* update time */
 	state->sup_env.will_schedule = true;
-- 
1.8.1.2


From a2db9de00d3e16763c85d8694194f66c53eca8af Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Fri, 20 Mar 2015 22:11:29 -0400
Subject: [PATCH 099/119] Fixed set partition bug

---
 litmus/bank_proc.c        | 10 +++++++++-
 litmus/cache_proc.c       | 18 +++++++++++++++++-
 litmus/sched_mc2.c        |  6 ++++--
 litmus/sched_task_trace.c |  2 +-
 4 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 53c20db..05c7fc3 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -229,6 +229,14 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 	// Decode the node to decide what color pages we should provide
 	switch(node ){
 		case 0:
+		case 1:
+		case 2:
+		case 3:
+			color = (color%4) * 4 + node;
+		case 4:
+			color = (color%16);
+/*
+		case 0:
 		case 1: 
 		case 2: 
 		case 3:
@@ -241,8 +249,8 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 			break;
 		default:
 			TRACE_CUR("Wrong color %lu\n", color);	
-//			printk(KERN_WARNING "Wrong color %lu\n", color);
 			return rPage;
+*/
 	}
 
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index bb9d341..68f68bf 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -48,6 +48,11 @@ u32 way_partitions[5] = {
 	0xFFFF00FF, /* lv C */
 };
 
+u32 set_partitions[2] = {
+	0xFFFFFF00, /* cpuX A and B */
+	0xFFFF00FF, /* lv C */
+};
+
 u32 prev_lockdown_d_reg[5] = {
 	0x00000000,
 	0x00000000,
@@ -190,8 +195,8 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 			writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
 			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
 		}
-		print_lockdown_registers();
 	}
+	print_lockdown_registers();
 
 out:
 	mutex_unlock(&lockdown_proc);
@@ -229,6 +234,9 @@ out:
 
 void do_way_partition(enum crit_level lv, int cpu)
 {
+	if (use_set_partition == 1 && use_way_partition == 1)
+		printk(KERN_ALERT "BOTH SET, WAY ARE SET!!!!\n");
+	
 	if (use_way_partition == 1) {
 		if (lv < CRIT_LEVEL_C) {
 			writel_relaxed(way_partitions[cpu], ld_d_reg(cpu));
@@ -237,6 +245,14 @@ void do_way_partition(enum crit_level lv, int cpu)
 			writel_relaxed(way_partitions[4], ld_d_reg(cpu));
 			writel_relaxed(way_partitions[4], ld_i_reg(cpu));
 		}
+	} else if (use_set_partition == 1) {
+		if (lv < CRIT_LEVEL_C) {
+			writel_relaxed(set_partitions[0], ld_d_reg(cpu));
+			writel_relaxed(set_partitions[0], ld_i_reg(cpu));
+		} else {
+			writel_relaxed(set_partitions[1], ld_d_reg(cpu));
+			writel_relaxed(set_partitions[1], ld_i_reg(cpu));
+		}
 	}
 }
 
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index aa6452a..1c7cdfe 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -575,12 +575,13 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	
 	/* no eligible level A or B tasks exists */
 	/* check the ghost job */
+	/*
 	ce = &state->crit_entries[CRIT_LEVEL_C];
 	if (ce->running) {
 		TRACE_TASK(ce->running," is GHOST\n");
 		return NULL;
 	}
-	
+	*/
 	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
 	
 	//TRACE("****** ACTIVE LIST ******\n");
@@ -858,6 +859,7 @@ static long mc2_complete_job(void)
 		local_irq_restore(flags);
 		preempt_enable();
 	}
+	
 	sched_trace_task_completion(current, 0);
 	
 	/* update the next release time and deadline */
@@ -879,7 +881,7 @@ static long mc2_complete_job(void)
 		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
 		preempt_enable();
 //		if (get_task_crit_level(current) == CRIT_LEVEL_A)
-//			sched_trace_task_release(current);
+		sched_trace_task_release(current);
 	}
 
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index 933e7e4..6224f8c 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -15,7 +15,7 @@
 #include <litmus/feather_trace.h>
 #include <litmus/ftdev.h>
 
-#define NO_EVENTS		(1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
+#define NO_EVENTS		(1 << (CONFIG_SCHED_TASK_TRACE_SHIFT+3))
 
 #define now() litmus_clock()
 
-- 
1.8.1.2


From 623fe6255439add90f416df69b92134fbd01f342 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Fri, 20 Mar 2015 22:23:14 -0400
Subject: [PATCH 100/119] Change function do_way_partition to do_partition

---
 litmus/cache_proc.c | 2 +-
 litmus/sched_mc2.c  | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 68f68bf..59f166e 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -232,7 +232,7 @@ out:
 	return ret;
 }
 
-void do_way_partition(enum crit_level lv, int cpu)
+void do_partition(enum crit_level lv, int cpu)
 {
 	if (use_set_partition == 1 && use_way_partition == 1)
 		printk(KERN_ALERT "BOTH SET, WAY ARE SET!!!!\n");
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 1c7cdfe..9c6d762 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -26,8 +26,7 @@
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
-extern int use_way_partition;
-extern void do_way_partition(enum crit_level lv, int cpu);
+extern void do_partition(enum crit_level lv, int cpu);
 
 /* _global_env - reservation container for level-C tasks*/
 struct gmp_reservation_environment _global_env;
@@ -626,7 +625,7 @@ static void pre_schedule(struct task_struct *prev, int cpu)
 	if (!prev || !is_realtime(prev))
 		return;
 	
-	do_way_partition(CRIT_LEVEL_C, cpu);
+	do_partition(CRIT_LEVEL_C, cpu);
 }
 
 /* not used now */
@@ -637,7 +636,7 @@ static void post_schedule(struct task_struct *next, int cpu)
 		return;
 	
 	lev = get_task_crit_level(next);
-	do_way_partition(lev, cpu);
+	do_partition(lev, cpu);
 }
 
 /* mc2_schedule - main scheduler function. pick the next task to run
-- 
1.8.1.2


From bf0b4079ab52d1eba4c99dfe404548fefea4b94d Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Sun, 22 Mar 2015 15:05:52 -0400
Subject: [PATCH 101/119] Use interleaving-off bank decoding; add sysctl
 variables to adjust cache-set and bank partitions

---
 litmus/bank_proc.c | 548 +++++++++++++++++++++++++++++++++++++++--------------
 litmus/litmus.c    |   5 +-
 2 files changed, 407 insertions(+), 146 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 05c7fc3..7cf07ee 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -18,22 +18,58 @@
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
 
-
 #define LITMUS_LOCKDEP_NAME_MAX_LEN 50
 
 // This Address Decoding is used in imx6-sabredsd platform
-#define CACHE_MASK 0x0000f000      
-#define BANK_MASK  0x00007000      
-#define OFFSET_SHIFT 12
+#define BANK_MASK  0x38000000     
+#define BANK_SHIFT  27
+#define CACHE_MASK  0x0000f000      
+#define CACHE_SHIFT 12
+
+#define PAGES_PER_COLOR 256
+unsigned int NUM_PAGE_LIST;  //8*16
+
+unsigned int number_banks;
+unsigned int number_cachecolors;
+
+unsigned int set_partition_max = 0x0000ffff;
+unsigned int set_partition_min = 0;
+unsigned int bank_partition_max = 0x000000ff;
+unsigned int bank_partition_min = 0;
+
+unsigned int set_partition[9] = {
+        0x00000003,  /* Core 0, and Level A*/
+        0x00000003,  /* Core 0, and Level B*/
+        0x0000000C,  /* Core 1, and Level A*/
+        0x0000000C,  /* Core 1, and Level B*/
+        0x00000030,  /* Core 2, and Level A*/
+        0x00000030,  /* Core 2, and Level B*/
+        0x000000C0,  /* Core 3, and Level A*/
+        0x000000C0,  /* Core 3, and Level B*/
+        0x0000ff00,  /* Level C */
+};
 
-#define PAGES_PER_COLOR 1024
+unsigned int bank_partition[9] = {
+        0x00000010,  /* Core 0, and Level A*/
+        0x00000010,  /* Core 0, and Level B*/
+        0x00000020,  /* Core 1, and Level A*/
+        0x00000020,  /* Core 1, and Level B*/
+        0x00000040,  /* Core 2, and Level A*/
+        0x00000040,  /* Core 2, and Level B*/
+        0x00000080,  /* Core 3, and Level A*/
+        0x00000080,  /* Core 3, and Level B*/
+        0x0000000f,  /* Level C */
+};
 
-unsigned long curr_cachecolor;
-int used_cachecolor;
+unsigned int set_index[9] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0
+};
 
+unsigned int bank_index[9] = {
+    0, 0, 0, 0, 0, 0, 0, 0, 0
+};
 
-unsigned long number_banks;
-unsigned long number_cachecolors;
+struct mutex void_lockdown_proc;
 
 
 /*
@@ -46,39 +82,80 @@ struct color_group {
 	atomic_t nr_pages;
 };
 
+
+static struct color_group *color_groups;
+
 /*
- * This is old code which is not used in current version
- */ 
-/*
-static struct alloced_pages {
-	spinlock_t lock;
-	struct list_head list;
-} alloced_pages;
+ * Naive function to count the number of 1's
+ */
+unsigned int counting_one_set(unsigned int v)
+{
+//    unsigned int v; // count the number of bits set in v
+    unsigned int c; // c accumulates the total bits set in v
+
+    for (c = 0; v; v >>= 1)
+    {
+        c += v & 1;
+    }
+    return c;
+}
 
-struct alloced_page {
-	struct page *page;
-	struct vm_area_struct *vma;
-	struct list_head list;
-};
-*/
+unsigned int two_exp(unsigned int e)
+{
+    unsigned int v = 1;
+    for (e; e>0; e-- )
+    {
+        v=v*2;
+    }
+    return v;
+}
+
+unsigned int num_by_bitmask_index(bitmask, index)
+{
+    unsigned int pos = 0;
+
+    while(true)
+    {
+        if(index ==0 && (bitmask & 1)==1)
+        {
+            break;
+        }
+        if(index !=0 && (bitmask & 1)==1){
+            index--;
+        }
+        pos++;
+        bitmask = bitmask >>1;
+
+    }
+    return pos;
+}
 
-static struct color_group *color_groups;
-static struct lock_class_key color_lock_keys[16];
 
-//static struct color_group *color_groups;
 
 /* Decoding page color, 0~15 */ 
-static inline unsigned long page_color(struct page *page)
+static inline unsigned int page_color(struct page *page)
 {
-	return ((page_to_phys(page)& CACHE_MASK) >> PAGE_SHIFT);
+	return ((page_to_phys(page)& CACHE_MASK) >> CACHE_SHIFT);
 }
 
 /* Decoding page bank number, 0~7 */ 
-static inline unsigned long page_bank(struct page *page)
+static inline unsigned int page_bank(struct page *page)
 {
-	return ((page_to_phys(page)& BANK_MASK) >> PAGE_SHIFT);
+	return ((page_to_phys(page)& BANK_MASK) >> BANK_SHIFT);
 }
 
+static inline unsigned int page_list_index(struct page *page)
+{
+    unsigned int idx;  
+    idx = (page_color(page) + page_bank(page)*(number_cachecolors));
+//    printk("address = %lx, ", page_to_phys(page));
+//    printk("color(%d), bank(%d), indx = %d\n", page_color(page), page_bank(page), idx);
+
+    return idx; 
+}
+
+
+
 /*
  * It is used to determine the smallest number of page lists. 
  */
@@ -86,7 +163,7 @@ static unsigned long smallest_nr_pages(void)
 {
 	unsigned long i, min_pages = -1;
 	struct color_group *cgroup;
-	for (i = 0; i < number_cachecolors; ++i) {
+	for (i = 0; i < NUM_PAGE_LIST; ++i) {
 		cgroup = &color_groups[i];
 		if (atomic_read(&cgroup->nr_pages) < min_pages)
 			min_pages = atomic_read(&cgroup->nr_pages);
@@ -94,12 +171,22 @@ static unsigned long smallest_nr_pages(void)
 	return min_pages;
 }
 
+static void show_nr_pages(void)
+{
+	unsigned long i;
+	struct color_group *cgroup;
+	for (i = 0; i < NUM_PAGE_LIST; ++i) {
+		cgroup = &color_groups[i];
+		printk("i =%d, nr_pages = %d\n", i, atomic_read(&cgroup->nr_pages));
+	}
+}
+
 /*
  * Add a page to current pool.
  */
 void add_page_to_color_list(struct page *page)
 {
-	const unsigned long color = page_color(page);
+	const unsigned long color = page_list_index(page);
 	struct color_group *cgroup = &color_groups[color];
 	BUG_ON(in_list(&page->lru) || PageLRU(page));
 	BUG_ON(page_count(page) > 1);
@@ -116,38 +203,47 @@ void add_page_to_color_list(struct page *page)
  */
 static int do_add_pages(void)
 {
-//	printk("LITMUS do add pages\n");
+	printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
 	unsigned long color;
 	int ret = 0;
+	int i = 0;
 
 	// until all the page lists contain enough pages 
+	//for (i =0; i<5; i++) {
 	while (smallest_nr_pages() < PAGES_PER_COLOR) {
 	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
-		
 		if (unlikely(!page)) {
-	//		printk(KERN_WARNING "Could not allocate pages.\n");
+			printk(KERN_WARNING "Could not allocate pages.\n");
 			ret = -ENOMEM;
 			goto out;
 		}
-		color = page_color(page);
+		color = page_list_index(page);
+                //show_nr_pages();
+		//printk("before : nr_pages = %d\n", atomic_read(&color_groups[color].nr_pages));
 		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR) {
 			add_page_to_color_list(page);
 		} else{
 			// Pages here will be freed later 
 			list_add_tail(&page->lru, &free_later);
 		}
-	}
+               //show_nr_pages();
+		//printk("after : nr_pages = %d\n", atomic_read(&color_groups[color].nr_pages));
+
+        }
+        show_nr_pages();
+#if 1
 	// Free the unwanted pages
 	list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
 		list_del(&page->lru);
 		__free_page(page);
 	}
+#endif
 out:
-	return ret;
+        return ret;
 }
 
 /*
@@ -162,7 +258,7 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 		
-	if( (color <0) || (color)>15) {
+	if( (color <0) || (color)>(number_cachecolors*number_banks -1)) {
 		TRACE_CUR("Wrong color %lu\n", color);	
 //		printk(KERN_WARNING "Wrong color %lu\n", color);
 		goto out_unlock;
@@ -192,31 +288,17 @@ out:
 
 
 /*
- * Provide pages for replacement 
- * This is used to generate experiments 
- */
-struct page *new_alloc_page_predefined(struct page *page,  int **x)
-{
-	unsigned int color = curr_cachecolor; 
-	
-//	printk("allocate new page color = %d\n", color);	
-	struct color_group *cgroup;
-	struct page *rPage = NULL;
-	
-	rPage =  new_alloc_page_color(color);
-	curr_cachecolor = (color + 1)% used_cachecolor;
-out:
-	return rPage;
-}
-/*
  * provide pages for replacement according to  
- * node = 0 for Level A, B tasks in Cpu 0
- * node = 1 for Level A, B tasks in Cpu 1
- * node = 2 for Level A, B tasks in Cpu 2
- * node = 3 for Level A, B tasks in Cpu 3
- * node = 4 for Level C tasks 
+ * node = 0 for Level A tasks in Cpu 0
+ * node = 1 for Level B tasks in Cpu 0
+ * node = 2 for Level A tasks in Cpu 1
+ * node = 3 for Level B tasks in Cpu 1
+ * node = 4 for Level A tasks in Cpu 2
+ * node = 5 for Level B tasks in Cpu 2
+ * node = 6 for Level A tasks in Cpu 3
+ * node = 7 for Level B tasks in Cpu 3
+ * node = 8 for Level C tasks 
  */
-#if 1
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 //	printk("allocate new page node = %d\n", node);	
@@ -224,79 +306,20 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
-	get_random_bytes(&color, sizeof(unsigned int));
 	
-	// Decode the node to decide what color pages we should provide
-	switch(node ){
-		case 0:
-		case 1:
-		case 2:
-		case 3:
-			color = (color%4) * 4 + node;
-		case 4:
-			color = (color%16);
-/*
-		case 0:
-		case 1: 
-		case 2: 
-		case 3:
-			color = (color%2) * 8 + node;
-			break;
-		case 4:
-			color = (color%8)+4;
-			if(color >=8)	
-				color+=4;
-			break;
-		default:
-			TRACE_CUR("Wrong color %lu\n", color);	
-			return rPage;
-*/
-	}
 
+        unsigned int idx = 0;
+        idx += num_by_bitmask_index(set_partition[node], set_index[node]);
+        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
+	printk("node  = %d, idx = %d\n", node, idx);
 
-//	printk("allocate new page color = %d\n", color);
-		
-	rPage =  new_alloc_page_color(color);
+	rPage =  new_alloc_page_color(idx);
+        
+            
+        set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
+        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
 	return rPage; 
 }
-#else
-struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
-{
-	return new_alloc_page_predefined(page,  x);
-}
-#endif
-
-/*
- * Provide pages for replacement according to bank number. 
- * This is used in cache way partition 
- */
-struct page *new_alloc_page_banknr(struct page *page, unsigned long banknr, int **x)
-{
-//	printk("allocate new page bank = %d\n", banknr);	
-	struct color_group *cgroup;
-	struct page *rPage = NULL;
-	unsigned int color;
-	get_random_bytes(&color, sizeof(unsigned int));
-	
-	if((banknr<= 7) && (banknr>=0)){
-		color = (color%2) * 8 + banknr;
-	}else{
-		goto out;
-	}
-	
-	rPage =  new_alloc_page_color(color);
-		
-out:
-	return rPage;
-}
-
-
-void set_number_of_colors(int colornr)
-{
-//	printk(KERN_WARNING "Set of colors = %d.\n", colornr);
-	used_cachecolor = colornr ; 
-	curr_cachecolor = 0;
-}
 
 
@@ -306,12 +329,14 @@ void set_number_of_colors(int colornr)
  */ 
 static int __init init_variables(void)
 {
-	number_banks = 1+(BANK_MASK >> PAGE_SHIFT); 
-	number_cachecolors = 1+(CACHE_MASK >> PAGE_SHIFT);
-	used_cachecolor = 16;
-	curr_cachecolor = 0;
-
-	
+	number_banks = counting_one_set(BANK_MASK); 
+	number_banks = two_exp(number_banks); 
+
+	number_cachecolors = counting_one_set(CACHE_MASK);
+	number_cachecolors = two_exp(number_cachecolors);
+	NUM_PAGE_LIST = number_banks * number_cachecolors; 
+        printk(KERN_WARNING "number of banks = %d, number of cachecolors=%d\n", number_banks, number_cachecolors);
+	mutex_init(&void_lockdown_proc);
 }
 
 
@@ -324,39 +349,274 @@ static int __init init_color_groups(void)
 	unsigned long i;
 	int err = 0;
 
-	color_groups = kmalloc(number_cachecolors *
-			sizeof(struct color_group), GFP_KERNEL);
+        printk("NUM_PAGE_LIST = %d\n", NUM_PAGE_LIST);
+        color_groups = kmalloc(NUM_PAGE_LIST *sizeof(struct color_group), GFP_KERNEL);
+
 	if (!color_groups) {
-//		printk(KERN_WARNING "Could not allocate color groups.\n");
+		printk(KERN_WARNING "Could not allocate color groups.\n");
 		err = -ENOMEM;
 	}else{
 
-		for (i = 0; i < number_cachecolors; ++i) {
+		for (i = 0; i < NUM_PAGE_LIST; ++i) {
 			cgroup = &color_groups[i];
 			atomic_set(&cgroup->nr_pages, 0);
 			INIT_LIST_HEAD(&cgroup->list);
 			spin_lock_init(&cgroup->lock);
 		}
 	}
-	return err;
+        return err;
+}
+
+int set_partition_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0, i = 0;
+	mutex_lock(&void_lockdown_proc);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	if (write) {
+            printk("New set Partition : \n");
+	    for(i =0;i <9;i++)
+            {
+                set_index[i] = 0;
+                printk("set[%d] = %x \n", i, set_partition[i]);
+            }
+	}
+out:
+	mutex_unlock(&void_lockdown_proc);
+	return ret;
 }
 
+int bank_partition_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0, i = 0;
+	mutex_lock(&void_lockdown_proc);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	if (write) {
+	    for(i =0;i <9;i++)
+            {
+                bank_index[i] = 0;
+            }
+	}
+out:
+	mutex_unlock(&void_lockdown_proc);
+	return ret;
+}
+
+
+static struct ctl_table cache_table[] =
+{
+        
+	{
+		.procname	= "C0_LA_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[0],
+		.maxlen		= sizeof(set_partition[0]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},	
+	{
+		.procname	= "C0_LB_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[1],
+		.maxlen		= sizeof(set_partition[1]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},	
+	{
+		.procname	= "C1_LA_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[2],
+		.maxlen		= sizeof(set_partition[2]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},
+	{
+		.procname	= "C1_LB_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[3],
+		.maxlen		= sizeof(set_partition[3]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},
+	{
+		.procname	= "C2_LA_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[4],
+		.maxlen		= sizeof(set_partition[4]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},
+	{
+		.procname	= "C2_LB_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[5],
+		.maxlen		= sizeof(set_partition[5]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},
+	{
+		.procname	= "C3_LA_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[6],
+		.maxlen		= sizeof(set_partition[6]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},
+	{
+		.procname	= "C3_LB_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[7],
+		.maxlen		= sizeof(set_partition[7]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},	
+	{
+		.procname	= "Call_LC_set",
+		.mode		= 0666,
+		.proc_handler	= set_partition_handler,
+		.data		= &set_partition[8],
+		.maxlen		= sizeof(set_partition[8]),
+		.extra1		= &set_partition_min,
+		.extra2		= &set_partition_max,
+	},	
+	{
+		.procname	= "C0_LA_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[0],
+		.maxlen		= sizeof(set_partition[0]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},
+	{
+		.procname	= "C0_LB_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[1],
+		.maxlen		= sizeof(set_partition[1]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},		
+	{
+		.procname	= "C1_LA_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[2],
+		.maxlen		= sizeof(set_partition[2]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},
+	{
+		.procname	= "C1_LB_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[3],
+		.maxlen		= sizeof(set_partition[3]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},
+	{
+		.procname	= "C2_LA_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[4],
+		.maxlen		= sizeof(set_partition[4]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},	
+	{
+		.procname	= "C2_LB_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[5],
+		.maxlen		= sizeof(set_partition[5]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},		
+	{
+		.procname	= "C3_LA_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[6],
+		.maxlen		= sizeof(set_partition[6]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},	
+	{
+		.procname	= "C3_LB_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[7],
+		.maxlen		= sizeof(set_partition[7]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},	
+	{
+		.procname	= "Call_LC_bank",
+		.mode		= 0666,
+		.proc_handler	= bank_partition_handler,
+		.data		= &bank_partition[8],
+		.maxlen		= sizeof(set_partition[8]),
+		.extra1		= &bank_partition_min,
+		.extra2		= &bank_partition_max,
+	},	
+
+
+	{ }
+};
+
+static struct ctl_table litmus_dir_table[] = {
+	{
+		.procname	= "litmus",
+ 		.mode		= 0555,
+		.child		= cache_table,
+	},
+	{ }
+};
+
+
+static struct ctl_table_header *litmus_sysctls;
+
+
 /*
  * Initialzie this proc 
  */
 static int __init litmus_color_init(void)
 {
 	int err=0;
-	
+        printk("Init bankproc.c\n");
+
 	//INIT_LIST_HEAD(&alloced_pages.list);
 	//spin_lock_init(&alloced_pages.lock);
 	init_variables();
-//	printk("Cache number = %d , Cache mask = 0x%lx\n", number_cachecolors, CACHE_MASK); 
-//	printk("Bank number = %d , Bank mask = 0x%lx\n", number_banks, BANK_MASK); 
+
+	printk(KERN_INFO "Registering LITMUS^RT proc color sysctl.\n");
+
+	litmus_sysctls = register_sysctl_table(litmus_dir_table);
+	if (!litmus_sysctls) {
+		printk(KERN_WARNING "Could not register LITMUS^RT color sysctl.\n");
+		err = -EFAULT;
+		goto out;
+	}
+
 	init_color_groups();			
 	do_add_pages();
 
-//	printk(KERN_INFO "Registering LITMUS^RT color and bank proc.\n");
+	printk(KERN_INFO "Registering LITMUS^RT color and bank proc.\n");
+out:
 	return err;
 }
 
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 77c609b..0352079 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -390,7 +390,7 @@ do_return:
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
 
-#if 1
+#if 0
 static struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
 	return alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
@@ -501,7 +501,8 @@ asmlinkage long sys_set_page_color(int cpu)
 		node = 4;
 	else
 		node = cpu;
-	
+
+        //node= 0;
 	if (!list_empty(&pagelist)) {
 		ret = migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
 		TRACE_TASK(current, "%ld pages not migrated.\n", ret);
-- 
1.8.1.2


From 56a820730e5ba600fa2654db635cf21b6cde5f21 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sun, 22 Mar 2015 17:15:52 -0400
Subject: [PATCH 102/119] Added way-partition /proc fs

---
 litmus/cache_proc.c       | 258 ++++++++++++++++++++++------------------------
 litmus/litmus.c           |   5 -
 litmus/reservation.c      |  19 +++-
 litmus/sched_mc2.c        |  12 ++-
 litmus/sched_task_trace.c |   2 +-
 5 files changed, 152 insertions(+), 144 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 59f166e..a2d560a 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -40,12 +40,16 @@ u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
 	0xFFFF0000, /* way ~15 unlocked. usable = 16 */
 };
 
-u32 way_partitions[5] = {
-	0xFFFFFFFC, /* cpu0 A and B */
-	0xFFFFFFF3, /* cpu1 A and B */
-	0xFFFFFFCF, /* cpu2 A and B */
-	0xFFFFFF3F, /* cpu3 A and B */
-	0xFFFF00FF, /* lv C */
+u32 way_partitions[9] = {
+	0x00000003, /* cpu0 A */
+	0x00000003, /* cpu0 B */
+	0x0000000C, /* cpu1 A */
+	0x0000000C, /* cpu1 B */
+	0x00000030, /* cpu2 A */
+	0x00000030, /* cpu2 B */
+	0x000000C0, /* cpu3 A */
+	0x000000C0, /* cpu3 B */
+	0x0000FF00, /* lv C */
 };
 
 u32 set_partitions[2] = {
@@ -78,11 +82,8 @@ static u32 cache_id;
 struct mutex actlr_mutex;
 struct mutex l2x0_prefetch_mutex;
 struct mutex lockdown_proc;
-
-static int min_usable_ways = 0;
-static int max_usable_ways = 16;
-static int min_usable_sets = 1;
-static int max_usable_sets = 16;
+static unsigned int way_partition_min;
+static unsigned int way_partition_max;
 
 static int zero = 0;
 static int one = 1;
@@ -94,15 +95,8 @@ static int one = 1;
 			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_I_BASE + \
 			__cpu * L2X0_LOCKDOWN_STRIDE; __v; })
 
-int l2_usable_ways;
-int l2_usable_sets;
 int lock_all;
 int nr_lockregs;
-int use_way_partition;
-int use_set_partition;
-
-extern void set_number_of_colors(int colornr);
-
 
 static void print_lockdown_registers(void)
 {
@@ -177,7 +171,8 @@ void litmus_setup_lockdown(void __iomem *base, u32 id)
 	
 	test_lockdown(NULL);
 }
-int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
+
+int way_partition_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	int ret = 0, i;
@@ -190,10 +185,10 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 	if (ret)
 		goto out;
 	
-	if (write && lock_all == 1) {
-		for (i = 0; i < nr_lockregs;  i++) {
-			writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
-			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
+	if (write) {
+		printk("Way-partition settings:\n");
+		for (i = 0; i < 9; i++) {
+			printk("0x%08X\n", ~way_partitions[i]);
 		}
 	}
 	print_lockdown_registers();
@@ -203,10 +198,10 @@ out:
 	return ret;
 }
 
-int l2_usable_ways_handler(struct ctl_table *table, int write, void __user *buffer,
+int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
-	int ret = 0, i = 0;
+	int ret = 0, i;
 	
 	mutex_lock(&lockdown_proc);
 	
@@ -215,17 +210,14 @@ int l2_usable_ways_handler(struct ctl_table *table, int write, void __user *buff
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
 		goto out;
-		
-	TRACE_CUR("l2_usable_ways : %d\n", l2_usable_ways);
-	printk("l2_usable_ways : %d\n", l2_usable_ways);
 	
-	if (write) {
-		//for (i = 0; i < nr_lockregs;  i++) {
-			writel_relaxed(nr_unlocked_way[l2_usable_ways], ld_d_reg(i));
-			writel_relaxed(nr_unlocked_way[l2_usable_ways], ld_i_reg(i));
-		//}
-		print_lockdown_registers();
+	if (write && lock_all == 1) {
+		for (i = 0; i < nr_lockregs;  i++) {
+			writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
+			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
+		}
 	}
+	print_lockdown_registers();
 
 out:
 	mutex_unlock(&lockdown_proc);
@@ -234,6 +226,28 @@ out:
 
 void do_partition(enum crit_level lv, int cpu)
 {
+	u32 regs;
+	switch(lv) {
+		case CRIT_LEVEL_A:
+			regs = ~way_partitions[cpu*2];
+			writel_relaxed(regs, ld_d_reg(cpu));
+			writel_relaxed(regs, ld_i_reg(cpu));
+			break;
+		case CRIT_LEVEL_B:
+			regs = ~way_partitions[cpu*2+1];
+			writel_relaxed(regs, ld_d_reg(cpu));
+			writel_relaxed(regs, ld_i_reg(cpu));
+			break;
+		case CRIT_LEVEL_C:
+		case NUM_CRIT_LEVELS:
+			regs = ~way_partitions[8];
+			writel_relaxed(regs, ld_d_reg(cpu));
+			writel_relaxed(regs, ld_i_reg(cpu));
+			break;
+		default:
+			BUG();
+	}
+/*
 	if (use_set_partition == 1 && use_way_partition == 1)
 		printk(KERN_ALERT "BOTH SET, WAY ARE SET!!!!\n");
 	
@@ -254,6 +268,7 @@ void do_partition(enum crit_level lv, int cpu)
 			writel_relaxed(set_partitions[1], ld_i_reg(cpu));
 		}
 	}
+*/
 }
 
 void enter_irq_mode(void)
@@ -275,115 +290,95 @@ void exit_irq_mode(void)
 	writel_relaxed(prev_lockdown_d_reg[cpu], ld_d_reg(cpu));	
 }
 
-int l2_usable_sets_handler(struct ctl_table *table, int write, void __user *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-	
-	mutex_lock(&lockdown_proc);
-	
-	flush_cache_all();
-	
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (ret)
-		goto out;
-		
-	TRACE_CUR("l2_usable_sets : %d\n", l2_usable_sets);
-	printk("l2_usable_sets : %d\n", l2_usable_sets);
-	
-	if (write) {
-		set_number_of_colors(l2_usable_sets);
-	}
-
-out:
-	mutex_unlock(&lockdown_proc);
-	return ret;
-}
-
-int use_way_partition_handler(struct ctl_table *table, int write, void __user *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-	
-	mutex_lock(&lockdown_proc);
-	
-	flush_cache_all();
-	
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (ret)
-		goto out;
-		
-	printk(KERN_INFO "use_way_partition : %d\n", use_way_partition);
-
-out:
-	mutex_unlock(&lockdown_proc);
-	return ret;
-}
-
-int use_set_partition_handler(struct ctl_table *table, int write, void __user *buffer,
-		size_t *lenp, loff_t *ppos)
-{
-	int ret = 0;
-	
-	mutex_lock(&lockdown_proc);
-	
-	flush_cache_all();
-	
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-	if (ret)
-		goto out;
-		
-	printk(KERN_INFO "use_set_partition : %d\n", use_set_partition);
-
-out:
-	mutex_unlock(&lockdown_proc);
-	return ret;
-}
-
 static struct ctl_table cache_table[] =
 {
 	{
-		.procname	= "l2_usable_ways",
+		.procname	= "C0_LA_way",
 		.mode		= 0666,
-		.proc_handler	= l2_usable_ways_handler,
-		.data		= &l2_usable_ways,
-		.maxlen		= sizeof(l2_usable_ways),
-		.extra1		= &min_usable_ways,
-		.extra2		= &max_usable_ways,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[0],
+		.maxlen		= sizeof(way_partitions[0]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
+	},	
+	{
+		.procname	= "C0_LB_way",
+		.mode		= 0666,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[1],
+		.maxlen		= sizeof(way_partitions[1]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
+	},	
+	{
+		.procname	= "C1_LA_way",
+		.mode		= 0666,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[2],
+		.maxlen		= sizeof(way_partitions[2]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
 	},
 	{
-		.procname	= "l2_usable_sets",
+		.procname	= "C1_LB_way",
 		.mode		= 0666,
-		.proc_handler	= l2_usable_sets_handler,
-		.data		= &l2_usable_sets,
-		.maxlen		= sizeof(l2_usable_sets),
-		.extra1		= &min_usable_sets,
-		.extra2		= &max_usable_sets,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[3],
+		.maxlen		= sizeof(way_partitions[3]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
 	},
 	{
-		.procname	= "lock_all",
+		.procname	= "C2_LA_way",
 		.mode		= 0666,
-		.proc_handler	= lock_all_handler,
-		.data		= &lock_all,
-		.maxlen		= sizeof(lock_all),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[4],
+		.maxlen		= sizeof(way_partitions[4]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
 	},
 	{
-		.procname	= "use_way_partition",
+		.procname	= "C2_LB_way",
 		.mode		= 0666,
-		.proc_handler	= use_way_partition_handler,
-		.data		= &use_way_partition,
-		.maxlen		= sizeof(use_way_partition),
-		.extra1		= &zero,
-		.extra2		= &one,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[5],
+		.maxlen		= sizeof(way_partitions[5]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
 	},
 	{
-		.procname	= "use_set_partition",
+		.procname	= "C3_LA_way",
 		.mode		= 0666,
-		.proc_handler	= use_set_partition_handler,
-		.data		= &use_set_partition,
-		.maxlen		= sizeof(use_set_partition),
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[6],
+		.maxlen		= sizeof(way_partitions[6]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
+	},
+	{
+		.procname	= "C3_LB_way",
+		.mode		= 0666,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[7],
+		.maxlen		= sizeof(way_partitions[7]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
+	},	
+	{
+		.procname	= "Call_LC_way",
+		.mode		= 0666,
+		.proc_handler	= way_partition_handler,
+		.data		= &way_partitions[8],
+		.maxlen		= sizeof(way_partitions[8]),
+		.extra1		= &way_partition_min,
+		.extra2		= &way_partition_max,
+	},		
+	{
+		.procname	= "lock_all",
+		.mode		= 0666,
+		.proc_handler	= lock_all_handler,
+		.data		= &lock_all,
+		.maxlen		= sizeof(lock_all),
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
@@ -413,12 +408,9 @@ static int __init litmus_sysctl_init(void)
 		goto out;
 	}
 
-	l2_usable_ways = 16;
-	l2_usable_sets = 5;
-	use_way_partition = 0;
-	use_set_partition = 0;
-	set_number_of_colors(l2_usable_sets);
-
+	way_partition_min = 0x00000000;
+	way_partition_max = 0x0000FFFF;
+	
 out:
 	return ret;
 }
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 0352079..04c5017 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -400,8 +400,6 @@ extern struct page *new_alloc_page(struct page *page, unsigned long node, int **
 
 #endif
 
-extern int use_set_partition;
-
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
@@ -416,9 +414,6 @@ asmlinkage long sys_set_page_color(int cpu)
 	LIST_HEAD(pagelist);
 	LIST_HEAD(shared_pagelist);
 	
-	if (use_set_partition == 0)
-		return 0;
-	
 	down_read(&current->mm->mmap_sem);
 	TRACE_TASK(current, "SYSCALL set_page_color\n");
 	vma_itr = current->mm->mmap;
diff --git a/litmus/reservation.c b/litmus/reservation.c
index 8457b4b..af5a934 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -7,6 +7,8 @@
 //#define TRACE(fmt, args...) do {} while (false)
 //#define TRACE_TASK(fmt, args...) do {} while (false)
 
+#define BUDGET_ENFORCEMENT_AT_C 1
+	
 void reservation_init(struct reservation *res)
 {
 	memset(res, sizeof(*res), 0);
@@ -495,9 +497,11 @@ static void gmp_queue_active(
 	/* check for possible preemption */
 	if (res->state == RESERVATION_ACTIVE && check_preempt)
 		gmp_env->schedule_now++;
-	
+
+//#if BUDGET_ENFORCEMENT_AT_C	
 	gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
-	res->event_added = 1;
+//#endif
+	res->event_added = 1;	
 }
 
 static void gmp_queue_reservation(
@@ -530,6 +534,7 @@ void gmp_add_new_reservation(
 	gmp_queue_reservation(gmp_env, new_res);
 }
 
+//#if BUDGET_ENFORCEMENT_AT_C
 static void gmp_charge_budget(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t delta)
@@ -576,6 +581,16 @@ static void gmp_charge_budget(
 	}
 	//TRACE("finished charging budgets\n");
 }
+//#else
+/*
+static void gmp_charge_budget(
+	struct gmp_reservation_environment* gmp_env,
+	lt_t delta)
+{
+	return;
+}
+*/
+//#endif
 
 static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 {
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 9c6d762..885218e 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -26,6 +26,8 @@
 #include <litmus/reservation.h>
 #include <litmus/polling_reservations.h>
 
+#define BUDGET_ENFORCEMENT_AT_C 1
+
 extern void do_partition(enum crit_level lv, int cpu);
 
 /* _global_env - reservation container for level-C tasks*/
@@ -158,7 +160,9 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
 		res->is_ghost = state->cpu;
+//#if BUDGET_ENFORCEMENT_AT_C		
 		gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+//#endif
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
 	}		
@@ -583,10 +587,10 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	*/
 	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
 	
-	//TRACE("****** ACTIVE LIST ******\n");
-	//TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
+	TRACE("****** ACTIVE LIST ******\n");
+	TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
 	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
-		//TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
+		TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
 		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
 			tsk = res->ops->dispatch_client(res, &time_slice);
 			if (likely(tsk)) {
@@ -598,7 +602,9 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 				} else if (lv == CRIT_LEVEL_C) {
 					//ce = &state->crit_entries[lv];
 					//if (likely(!ce->running)) {
+//#if BUDGET_ENFORCEMENT_AT_C						
 						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
+//#endif
 						res->event_added = 1;
 						res->blocked_by_ghost = 0;
 						res->is_ghost = NO_CPU;
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
index 6224f8c..3a6756d 100644
--- a/litmus/sched_task_trace.c
+++ b/litmus/sched_task_trace.c
@@ -15,7 +15,7 @@
 #include <litmus/feather_trace.h>
 #include <litmus/ftdev.h>
 
-#define NO_EVENTS		(1 << (CONFIG_SCHED_TASK_TRACE_SHIFT+3))
+#define NO_EVENTS		(1 << (CONFIG_SCHED_TASK_TRACE_SHIFT+6))
 
 #define now() litmus_clock()
 
-- 
1.8.1.2


From 0041e83c7994510cebe9f335eb30b6049d8b4c1f Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Mon, 23 Mar 2015 23:31:09 -0400
Subject: [PATCH 103/119] solve the bug in bank_proc.c

---
 litmus/bank_proc.c | 95 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 79 insertions(+), 16 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 7cf07ee..3cf9cda 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -26,7 +26,7 @@
 #define CACHE_MASK  0x0000f000      
 #define CACHE_SHIFT 12
 
-#define PAGES_PER_COLOR 256
+#define PAGES_PER_COLOR 512
 unsigned int NUM_PAGE_LIST;  //8*16
 
 unsigned int number_banks;
@@ -37,6 +37,8 @@ unsigned int set_partition_min = 0;
 unsigned int bank_partition_max = 0x000000ff;
 unsigned int bank_partition_min = 0;
 
+int show_page_pool = 0;
+
 unsigned int set_partition[9] = {
         0x00000003,  /* Core 0, and Level A*/
         0x00000003,  /* Core 0, and Level B*/
@@ -58,7 +60,7 @@ unsigned int bank_partition[9] = {
         0x00000040,  /* Core 2, and Level B*/
         0x00000080,  /* Core 3, and Level A*/
         0x00000080,  /* Core 3, and Level B*/
-        0x0000000f,  /* Level C */
+        0x0000000c,  /* Level C */
 };
 
 unsigned int set_index[9] = {
@@ -161,9 +163,11 @@ static inline unsigned int page_list_index(struct page *page)
  */
 static unsigned long smallest_nr_pages(void)
 {
-	unsigned long i, min_pages = -1;
+	unsigned long i, min_pages;
 	struct color_group *cgroup;
-	for (i = 0; i < NUM_PAGE_LIST; ++i) {
+	cgroup = &color_groups[16*2];
+	min_pages =atomic_read(&cgroup->nr_pages); 
+	for (i = 16*2; i < NUM_PAGE_LIST; ++i) {
 		cgroup = &color_groups[i];
 		if (atomic_read(&cgroup->nr_pages) < min_pages)
 			min_pages = atomic_read(&cgroup->nr_pages);
@@ -175,9 +179,13 @@ static void show_nr_pages(void)
 {
 	unsigned long i;
 	struct color_group *cgroup;
+	printk("show nr pages***************************************\n");
 	for (i = 0; i < NUM_PAGE_LIST; ++i) {
 		cgroup = &color_groups[i];
-		printk("i =%d, nr_pages = %d\n", i, atomic_read(&cgroup->nr_pages));
+		printk("(%03d) =  %03d, ", i, atomic_read(&cgroup->nr_pages));
+		if((i % 8) ==7){
+		    printk("\n");
+                }
 	}
 }
 
@@ -193,6 +201,7 @@ void add_page_to_color_list(struct page *page)
 	spin_lock(&cgroup->lock);
 	list_add_tail(&page->lru, &cgroup->list);
 	atomic_inc(&cgroup->nr_pages);
+	SetPageLRU(page);
 	spin_unlock(&cgroup->lock);
 }
 
@@ -210,30 +219,65 @@ static int do_add_pages(void)
 	unsigned long color;
 	int ret = 0;
 	int i = 0;
+        int free_counter = 0;
+        unsigned long counter[128]= {0}; 
+        
+        printk("Before refill : \n");
+        show_nr_pages();
 
 	// until all the page lists contain enough pages 
 	//for (i =0; i<5; i++) {
-	while (smallest_nr_pages() < PAGES_PER_COLOR) {
-	
+	for (i=0; i< 1024*100;i++) {
+//	while (smallest_nr_pages() < PAGES_PER_COLOR) {
+       //         printk("smallest = %d\n", smallest_nr_pages());	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
+	    //    page = alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
+	
 		if (unlikely(!page)) {
 			printk(KERN_WARNING "Could not allocate pages.\n");
 			ret = -ENOMEM;
 			goto out;
 		}
 		color = page_list_index(page);
+		counter[color]++;
+	//	printk("page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
                 //show_nr_pages();
-		//printk("before : nr_pages = %d\n", atomic_read(&color_groups[color].nr_pages));
-		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR) {
+		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
 			add_page_to_color_list(page);
+	//		printk("add page(%d) = color %x, bank %x\n", color, page_color(page), page_bank(page));
 		} else{
 			// Pages here will be freed later 
 			list_add_tail(&page->lru, &free_later);
+			free_counter++;
+		        //list_del(&page->lru);
+		//        __free_page(page);
+	//		printk("useless page(%d) = color %x, bank %x\n", color,  page_color(page), page_bank(page));
 		}
                //show_nr_pages();
-		//printk("after : nr_pages = %d\n", atomic_read(&color_groups[color].nr_pages));
+                /*
+                if(free_counter >= PAGES_PER_COLOR)
+                {
+                    printk("free unwanted page list eariler");
+                    free_counter = 0;
+	            list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
+		        list_del(&page->lru);
+		        __free_page(page);
+	            }
+
+                    show_nr_pages();
+                }
+                */
+        }
+        printk("page counter = \n");
+        for (i=0; i<128; i++)
+        {
+            printk("(%03d) = %4d, ", i , counter[i]);
+            if(i%8 == 7){
+                printk("\n");
+            }
 
         }
+        printk("After refill : \n");
         show_nr_pages();
 #if 1
 	// Free the unwanted pages
@@ -274,10 +318,10 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	}
 	rPage = list_first_entry(&cgroup->list, struct page, lru);
 	BUG_ON(page_count(rPage) > 1);
-	get_page(rPage);
+//	get_page(rPage);
 	list_del(&rPage->lru);
 	atomic_dec(&cgroup->nr_pages);
-//	ClearPageLRU(rPage);
+	ClearPageLRU(rPage);
 out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
@@ -311,7 +355,7 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
         unsigned int idx = 0;
         idx += num_by_bitmask_index(set_partition[node], set_index[node]);
         idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
-	printk("node  = %d, idx = %d\n", node, idx);
+	//printk("node  = %d, idx = %d\n", node, idx);
 
 	rPage =  new_alloc_page_color(idx);
         
@@ -407,7 +451,21 @@ out:
 	return ret;
 }
 
-
+int show_page_pool_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0, i = 0;
+	mutex_lock(&void_lockdown_proc);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	if (write) {
+            show_nr_pages();
+	}
+out:
+	mutex_unlock(&void_lockdown_proc);
+	return ret;
+}
 static struct ctl_table cache_table[] =
 {
         
@@ -573,8 +631,13 @@ static struct ctl_table cache_table[] =
 		.extra1		= &bank_partition_min,
 		.extra2		= &bank_partition_max,
 	},	
-
-
+	{
+		.procname	= "show_page_pool",
+		.mode		= 0666,
+		.proc_handler	= show_page_pool_handler,
+		.data		= &show_page_pool,
+		.maxlen		= sizeof(show_page_pool),
+	},	
 	{ }
 };
 
-- 
1.8.1.2


From 6941230b519e96be0ea464206e795046c0938d01 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Tue, 24 Mar 2015 21:28:44 -0400
Subject: [PATCH 104/119] change loop condition in refill page process in
 bank_proc.

---
 litmus/bank_proc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 3cf9cda..2688d79 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -242,7 +242,8 @@ static int do_add_pages(void)
 		counter[color]++;
 	//	printk("page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
                 //show_nr_pages();
-		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
+		//if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
+		if ( PAGES_PER_COLOR && color>=32) {
 			add_page_to_color_list(page);
 	//		printk("add page(%d) = color %x, bank %x\n", color, page_color(page), page_bank(page));
 		} else{
-- 
1.8.1.2


From f441fa3f923ccb825d9888c8a75c033a67316d91 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Wed, 25 Mar 2015 10:59:55 -0400
Subject: [PATCH 105/119] add reclaim_page function in bank_proc.c to recycle
 all the pages, and only run do_add_pages during the kernel boot stage

---
 litmus/bank_proc.c | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 2688d79..9771529 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -38,6 +38,7 @@ unsigned int bank_partition_max = 0x000000ff;
 unsigned int bank_partition_min = 0;
 
 int show_page_pool = 0;
+spinlock_t reclaim_lock;
 
 unsigned int set_partition[9] = {
         0x00000003,  /* Core 0, and Level A*/
@@ -227,7 +228,7 @@ static int do_add_pages(void)
 
 	// until all the page lists contain enough pages 
 	//for (i =0; i<5; i++) {
-	for (i=0; i< 1024*100;i++) {
+	for (i=0; i< 1024*200;i++) {
 //	while (smallest_nr_pages() < PAGES_PER_COLOR) {
        //         printk("smallest = %d\n", smallest_nr_pages());	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
@@ -243,7 +244,7 @@ static int do_add_pages(void)
 	//	printk("page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
                 //show_nr_pages();
 		//if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
-		if ( PAGES_PER_COLOR && color>=32) {
+		if ( PAGES_PER_COLOR && color>=16*2) {
 			add_page_to_color_list(page);
 	//		printk("add page(%d) = color %x, bank %x\n", color, page_color(page), page_bank(page));
 		} else{
@@ -319,7 +320,7 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	}
 	rPage = list_first_entry(&cgroup->list, struct page, lru);
 	BUG_ON(page_count(rPage) > 1);
-//	get_page(rPage);
+	get_page(rPage);
 	list_del(&rPage->lru);
 	atomic_dec(&cgroup->nr_pages);
 	ClearPageLRU(rPage);
@@ -327,7 +328,11 @@ out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
 	if( smallest_nr_pages() == 0)
-		do_add_pages();
+        {
+//		do_add_pages();
+            printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
+        
+        }
 	return rPage;
 }
 
@@ -367,6 +372,20 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 }
 
 
+/*
+ * Reclaim pages.
+ */
+void reclaim_page(struct page *page)
+{
+	const unsigned long color = page_list_index(page);
+	unsigned long nr_reclaimed = 0;
+	spin_lock(&reclaim_lock);
+    	put_page(page);
+	add_page_to_color_list(page);
+
+	spin_unlock(&reclaim_lock);
+	printk("Reclaimed page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
+}
 
 
 /*
@@ -382,6 +401,8 @@ static int __init init_variables(void)
 	NUM_PAGE_LIST = number_banks * number_cachecolors; 
         printk(KERN_WARNING "number of banks = %d, number of cachecolors=%d\n", number_banks, number_cachecolors);
 	mutex_init(&void_lockdown_proc);
+	spin_lock_init(&reclaim_lock);
+
 }
 
 
@@ -663,8 +684,6 @@ static int __init litmus_color_init(void)
 	int err=0;
         printk("Init bankproc.c\n");
 
-	//INIT_LIST_HEAD(&alloced_pages.list);
-	//spin_lock_init(&alloced_pages.lock);
 	init_variables();
 
 	printk(KERN_INFO "Registering LITMUS^RT proc color sysctl.\n");
-- 
1.8.1.2


From 09471d13bd498bdc9d6f0874c0e00eba574f5558 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 25 Mar 2015 11:03:29 -0400
Subject: [PATCH 106/119] litmus_migrate_pages

---
 include/linux/balloon_compaction.h |  14 +++++
 include/linux/migrate.h            |   2 +
 litmus/litmus.c                    |   2 +-
 mm/migrate.c                       | 113 +++++++++++++++++++++++++++++++++++++
 4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 089743a..1dbef0b 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -93,6 +93,20 @@ static inline void balloon_page_free(struct page *page)
 	__free_page(page);
 }
 
+static inline void litmus_balloon_page_free(struct page *page)
+{
+	/*
+	 * Balloon pages always get an extra refcount before being isolated
+	 * and before being dequeued to help on sorting out fortuite colisions
+	 * between a thread attempting to isolate and another thread attempting
+	 * to release the very same balloon page.
+	 *
+	 * Before we handle the page back to Buddy, lets drop its extra refcnt.
+	 */
+	put_page(page);
+	__free_page(page);
+}
+
 #ifdef CONFIG_BALLOON_COMPACTION
 extern bool balloon_page_isolate(struct page *page);
 extern void balloon_page_putback(struct page *page);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..a2a7e25e 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -41,6 +41,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, enum migrate_mode mode, int reason);
+extern int litmus_migrate_pages(struct list_head *l, new_page_t x,
+		unsigned long private, enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
 		unsigned long private, enum migrate_mode mode);
 
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 04c5017..c8ed597 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -499,7 +499,7 @@ asmlinkage long sys_set_page_color(int cpu)
 
         //node= 0;
 	if (!list_empty(&pagelist)) {
-		ret = migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
+		ret = litmus_migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
 		TRACE_TASK(current, "%ld pages not migrated.\n", ret);
 		if (ret) {
 			putback_lru_pages(&pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
index a88c12f..eab459a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -918,6 +918,65 @@ out:
 	return rc;
 }
 
+static int litmus_unmap_and_move(new_page_t get_new_page, unsigned long private,
+			struct page *page, int force, enum migrate_mode mode)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *newpage = get_new_page(page, private, &result);
+
+	if (!newpage)
+		return -ENOMEM;
+
+	if (page_count(page) == 1) {
+		/* page was freed from under us. So we are done. */
+		goto out;
+	}
+
+	if (unlikely(PageTransHuge(page)))
+		if (unlikely(split_huge_page(page)))
+			goto out;
+
+	rc = __unmap_and_move(page, newpage, force, mode);
+
+	if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
+		/*
+		 * A ballooned page has been migrated already.
+		 * Now, it's the time to wrap-up counters,
+		 * handle the page back to Buddy and return.
+		 */
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				    page_is_file_cache(page));
+		litmus_balloon_page_free(page);
+		return MIGRATEPAGE_SUCCESS;
+	}
+out:
+	if (rc != -EAGAIN) {
+		/*
+		 * A page that has been migrated has all references
+		 * removed and will be freed. A page that has not been
+		 * migrated will have kepts its references and be
+		 * restored.
+		 */
+		list_del(&page->lru);
+		dec_zone_page_state(page, NR_ISOLATED_ANON +
+				page_is_file_cache(page));
+		putback_lru_page(page);
+	}
+	/*
+	 * Move the new page to the LRU. If migration was not successful
+	 * then this will free the page.
+	 */
+	putback_lru_page(newpage);
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(newpage);
+	}
+	return rc;
+}
+
 /*
  * Counterpart of unmap_and_move_page() for hugepage migration.
  *
@@ -1058,6 +1117,60 @@ out:
 	return rc;
 }
 
+int litmus_migrate_pages(struct list_head *from, new_page_t get_new_page,
+		unsigned long private, enum migrate_mode mode, int reason)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int nr_succeeded = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+	for(pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = litmus_unmap_and_move(get_new_page, private,
+						page, pass > 2, mode);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case MIGRATEPAGE_SUCCESS:
+				nr_succeeded++;
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = nr_failed + retry;
+out:
+	if (nr_succeeded)
+		count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+	if (nr_failed)
+		count_vm_events(PGMIGRATE_FAIL, nr_failed);
+	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return rc;
+}
+
 int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
 		      unsigned long private, enum migrate_mode mode)
 {
-- 
1.8.1.2


From 0d90fd30739e41acb6b060b7e145fbdf6a946686 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Wed, 25 Mar 2015 23:28:27 -0400
Subject: [PATCH 107/119] modified refill mechanism

---
 litmus/bank_proc.c | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 9771529..7be55b7 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -38,6 +38,7 @@ unsigned int bank_partition_max = 0x000000ff;
 unsigned int bank_partition_min = 0;
 
 int show_page_pool = 0;
+int refill_page_pool = 0;
 spinlock_t reclaim_lock;
 
 unsigned int set_partition[9] = {
@@ -228,7 +229,7 @@ static int do_add_pages(void)
 
 	// until all the page lists contain enough pages 
 	//for (i =0; i<5; i++) {
-	for (i=0; i< 1024*200;i++) {
+	for (i=0; i< 1024*100;i++) {
 //	while (smallest_nr_pages() < PAGES_PER_COLOR) {
        //         printk("smallest = %d\n", smallest_nr_pages());	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
@@ -243,8 +244,8 @@ static int do_add_pages(void)
 		counter[color]++;
 	//	printk("page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
                 //show_nr_pages();
-		//if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
-		if ( PAGES_PER_COLOR && color>=16*2) {
+		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
+	//	if ( PAGES_PER_COLOR && color>=16*2) {
 			add_page_to_color_list(page);
 	//		printk("add page(%d) = color %x, bank %x\n", color, page_color(page), page_bank(page));
 		} else{
@@ -329,8 +330,8 @@ out_unlock:
 out:
 	if( smallest_nr_pages() == 0)
         {
-//		do_add_pages();
-            printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
+		do_add_pages();
+       //     printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
         
         }
 	return rPage;
@@ -488,6 +489,23 @@ out:
 	mutex_unlock(&void_lockdown_proc);
 	return ret;
 }
+
+int refill_page_pool_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;	/* sysctl handler: every successfully parsed write calls do_add_pages() */
+	mutex_lock(&void_lockdown_proc);
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;	/* parse/copy error: report it without refilling */
+	if (write) {	/* the value that was written is not inspected here */
+		do_add_pages();	/* NOTE(review): the int result is discarded - confirm this is intended */
+	}
+out:
+	mutex_unlock(&void_lockdown_proc);
+	return ret;
+}
+
 static struct ctl_table cache_table[] =
 {
         
@@ -659,6 +677,12 @@ static struct ctl_table cache_table[] =
 		.proc_handler	= show_page_pool_handler,
 		.data		= &show_page_pool,
 		.maxlen		= sizeof(show_page_pool),
+	},		{
+		.procname	= "refill_page_pool",
+		.mode		= 0666,
+		.proc_handler	= refill_page_pool_handler,
+		.data		= &refill_page_pool,
+		.maxlen		= sizeof(refill_page_pool),
 	},	
 	{ }
 };
-- 
1.8.1.2


From bf0775c5b3366443a580874ebf82349b7d185f6f Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 25 Mar 2015 23:30:58 -0400
Subject: [PATCH 108/119] add cache conf. proc file system

---
 arch/arm/kernel/irq.c |   4 +-
 litmus/cache_proc.c   | 173 +++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 173 insertions(+), 4 deletions(-)

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index ce01835..145f290 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -68,7 +68,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	enter_irq_mode();
+	//enter_irq_mode();
 	irq_enter();
 
 	/*
@@ -84,7 +84,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 	}
 
 	irq_exit();
-	exit_irq_mode();
+	//exit_irq_mode();
 	set_irq_regs(old_regs);
 }
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index a2d560a..c331dd6 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -88,6 +88,11 @@ static unsigned int way_partition_max;
 static int zero = 0;
 static int one = 1;
 
+static int l1_prefetch_proc;
+static int l2_prefetch_hint_proc;
+static int l2_double_linefill_proc;
+static int l2_data_prefetch_proc;
+
 #define ld_d_reg(cpu) ({ int __cpu = cpu; \
 			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_D_BASE + \
 			__cpu * L2X0_LOCKDOWN_STRIDE; __v; })
@@ -185,12 +190,12 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 	if (ret)
 		goto out;
 	
-	if (write) {
+	//if (write) {
 		printk("Way-partition settings:\n");
 		for (i = 0; i < 9; i++) {
 			printk("0x%08X\n", ~way_partitions[i]);
 		}
-	}
+	//}
 	print_lockdown_registers();
 
 out:
@@ -217,6 +222,12 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
 		}
 	}
+	if (write && lock_all == 0) {
+		for (i = 0; i < nr_lockregs;  i++) {
+			writel_relaxed(nr_unlocked_way[16], ld_d_reg(i));
+			writel_relaxed(nr_unlocked_way[16], ld_i_reg(i));
+		}
+	}
 	print_lockdown_registers();
 
 out:
@@ -227,6 +238,9 @@ out:
 void do_partition(enum crit_level lv, int cpu)
 {
 	u32 regs;
+	
+	if (lock_all)
+		return;
 	switch(lv) {
 		case CRIT_LEVEL_A:
 			regs = ~way_partitions[cpu*2];
@@ -290,6 +304,133 @@ void exit_irq_mode(void)
 	writel_relaxed(prev_lockdown_d_reg[cpu], ld_d_reg(cpu));	
 }
 
+/* Operate on the Cortex-A9's ACTLR register */
+#define ACTLR_L2_PREFETCH_HINT	(1 << 1)
+#define ACTLR_L1_PREFETCH	(1 << 2)
+
+/*
+ * Read-modify-write the ACTLR (CP15 c1, c0, 1), read it back and log all three values.
+ * @mode	- If 1 (0), set (clear) the bit given in @mask in the ACTLR.
+ * @mask	- A mask in which one bit is set to operate on the ACTLR.
+ */
+static void actlr_change(int mode, int mask)
+{
+	u32 orig_value, new_value, reread_value;
+	/* Any @mode other than 0 or 1 only logs a warning; the register is not touched. */
+	if (0 != mode && 1 != mode) {
+		printk(KERN_WARNING "Called %s with mode != 0 and mode != 1.\n",
+				__FUNCTION__);
+		return;
+	}
+
+	/* read the current ACTLR value */
+	asm volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (orig_value));
+
+	if (0 == mode)
+		new_value = orig_value & ~(mask);
+	else
+		new_value = orig_value | mask;
+	/* Write the new value, then read it back so the log shows what the register now holds. */
+	asm volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (new_value));
+	asm volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (reread_value));
+
+	printk("ACTLR: orig: 0x%8x  wanted: 0x%8x  new: 0x%8x\n",
+			orig_value, new_value, reread_value);
+}
+
+int litmus_l1_prefetch_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, mode;
+	/* sysctl handler: on a successful write, set or clear ACTLR_L1_PREFETCH via actlr_change(). */
+	mutex_lock(&actlr_mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	/* Reads, and writes that failed to parse, leave the hardware untouched. */
+	if (!ret && write) {
+		mode = *((int*)table->data);	/* the integer behind this table entry */
+		actlr_change(mode, ACTLR_L1_PREFETCH);
+	}
+	mutex_unlock(&actlr_mutex);
+
+	return ret;	/* result of proc_dointvec() */
+}
+
+int litmus_l2_prefetch_hint_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, mode;
+	/* sysctl handler: on a successful write, set or clear ACTLR_L2_PREFETCH_HINT via actlr_change(). */
+	mutex_lock(&actlr_mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write) {	/* reads and failed writes skip the register update */
+		mode = *((int*)table->data);	/* the integer behind this table entry */
+		actlr_change(mode, ACTLR_L2_PREFETCH_HINT);
+	}
+	mutex_unlock(&actlr_mutex);
+
+	return ret;	/* result of proc_dointvec() */
+}
+
+
+/* Operate on the PL-310's Prefetch Control Register, L2X0_PREFETCH_CTRL */
+#define L2X0_PREFETCH_DOUBLE_LINEFILL	(1 << 30)
+#define L2X0_PREFETCH_DATA_PREFETCH	(1 << 28)
+static void l2x0_prefetch_change(int mode, int mask)
+{
+	u32 orig_value, new_value, reread_value;
+	/* @mode 1 sets and 0 clears the @mask bits in L2X0_PREFETCH_CTRL; anything else only warns. */
+	if (0 != mode && 1 != mode) {
+		printk(KERN_WARNING "Called %s with mode != 0 and mode != 1.\n",
+				__FUNCTION__);
+		return;
+	}
+
+	orig_value = readl_relaxed(cache_base + L2X0_PREFETCH_CTRL);
+
+	if (0 == mode)
+		new_value = orig_value & ~(mask);
+	else
+		new_value = orig_value | mask;
+	/* Write the new value, then read it back so the log shows what the register now holds. */
+	writel_relaxed(new_value, cache_base + L2X0_PREFETCH_CTRL);
+	reread_value = readl_relaxed(cache_base + L2X0_PREFETCH_CTRL);
+
+	printk("l2x0 prefetch: orig: 0x%8x  wanted: 0x%8x  new: 0x%8x\n",
+			orig_value, new_value, reread_value);
+}
+
+int litmus_l2_double_linefill_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, mode;
+	/* sysctl handler: on a successful write, set or clear L2X0_PREFETCH_DOUBLE_LINEFILL. */
+	mutex_lock(&l2x0_prefetch_mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write) {	/* reads and failed writes skip the register update */
+		mode = *((int*)table->data);	/* the integer behind this table entry */
+		l2x0_prefetch_change(mode, L2X0_PREFETCH_DOUBLE_LINEFILL);
+	}
+	mutex_unlock(&l2x0_prefetch_mutex);
+
+	return ret;	/* result of proc_dointvec() */
+}
+
+int litmus_l2_data_prefetch_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret, mode;
+	/* sysctl handler: on a successful write, set or clear L2X0_PREFETCH_DATA_PREFETCH. */
+	mutex_lock(&l2x0_prefetch_mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write) {
+		mode = *((int*)table->data);
+		l2x0_prefetch_change(mode, L2X0_PREFETCH_DATA_PREFETCH);
+	}
+	mutex_unlock(&l2x0_prefetch_mutex);
+
+	return ret;
+}
+
 static struct ctl_table cache_table[] =
 {
 	{
@@ -382,6 +523,34 @@ static struct ctl_table cache_table[] =
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "l1_prefetch",
+		.mode		= 0644,
+		.proc_handler	= litmus_l1_prefetch_proc_handler,
+		.data		= &l1_prefetch_proc,
+		.maxlen		= sizeof(l1_prefetch_proc),
+	},
+	{
+		.procname	= "l2_prefetch_hint",
+		.mode		= 0644,
+		.proc_handler	= litmus_l2_prefetch_hint_proc_handler,
+		.data		= &l2_prefetch_hint_proc,
+		.maxlen		= sizeof(l2_prefetch_hint_proc),
+	},
+	{
+		.procname	= "l2_double_linefill",
+		.mode		= 0644,
+		.proc_handler	= litmus_l2_double_linefill_proc_handler,
+		.data		= &l2_double_linefill_proc,
+		.maxlen		= sizeof(l2_double_linefill_proc),
+	},
+	{
+		.procname	= "l2_data_prefetch",
+		.mode		= 0644,
+		.proc_handler	= litmus_l2_data_prefetch_proc_handler,
+		.data		= &l2_data_prefetch_proc,
+		.maxlen		= sizeof(l2_data_prefetch_proc),
+	},
 	{ }
 };
 
-- 
1.8.1.2


From 701f70e21800aabf5d5d6042fd105adf531843a4 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Thu, 26 Mar 2015 20:01:33 -0400
Subject: [PATCH 109/119] modify the bank_proc.c

---
 litmus/bank_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 7be55b7..888b6a6 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -328,7 +328,7 @@ static struct  page *new_alloc_page_color( unsigned long color)
 out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
-	if( smallest_nr_pages() == 0)
+	while( smallest_nr_pages() == 0)
         {
 		do_add_pages();
        //     printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
-- 
1.8.1.2


From 1bbe2730b0bd7755253ab8e5471699d3f2297d22 Mon Sep 17 00:00:00 2001
From: ChengYang Fu <chengyangfu@gmail.com>
Date: Fri, 27 Mar 2015 14:27:42 -0400
Subject: [PATCH 110/119] make bank_proc be more robust

---
 litmus/bank_proc.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 888b6a6..e1025b5 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -97,9 +97,10 @@ unsigned int counting_one_set(unsigned int v)
 //    unsigned int v; // count the number of bits set in v
     unsigned int c; // c accumulates the total bits set in v
 
-    for (c = 0; v; v >>= 1)
+    for (c = 0; v; v = v>>1)
     {
-        c += v & 1;
+        if(v&1)
+            c++;
     }
     return c;
 }
@@ -214,7 +215,7 @@ void add_page_to_color_list(struct page *page)
  */
 static int do_add_pages(void)
 {
-	printk("LITMUS do add pages\n");
+//	printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
@@ -236,7 +237,7 @@ static int do_add_pages(void)
 	    //    page = alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
 	
 		if (unlikely(!page)) {
-			printk(KERN_WARNING "Could not allocate pages.\n");
+	//		printk(KERN_WARNING "Could not allocate pages.\n");
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -271,7 +272,7 @@ static int do_add_pages(void)
                 }
                 */
         }
-        printk("page counter = \n");
+        /*printk("page counter = \n");
         for (i=0; i<128; i++)
         {
             printk("(%03d) = %4d, ", i , counter[i]);
@@ -280,6 +281,7 @@ static int do_add_pages(void)
             }
 
         }
+        */
         printk("After refill : \n");
         show_nr_pages();
 #if 1
@@ -357,18 +359,24 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
-	
+	unsigned int randvalue;
+	get_random_bytes(&randvalue, sizeof(unsigned int));
+        	
 
         unsigned int idx = 0;
-        idx += num_by_bitmask_index(set_partition[node], set_index[node]);
-        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
-	//printk("node  = %d, idx = %d\n", node, idx);
+//        printk("set = %lx, counting %d\n", set_partition[node],  counting_one_set(set_partition[node]));
+  //      printk("bank = %lx, counting %d\n", bank_partition[node],  counting_one_set(bank_partition[node]));
+        
+
+        idx += num_by_bitmask_index(set_partition[node], randvalue % counting_one_set(set_partition[node]));
+        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node],randvalue % counting_one_set(bank_partition[node]) );
+//	printk("node  = %d, idx = %d\n", node, idx);
 
 	rPage =  new_alloc_page_color(idx);
         
             
-        set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
-        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
+  //      set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
+//        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
 	return rPage; 
 }
 
-- 
1.8.1.2


From 2e42005ed354304c219b8379a83d403122073a50 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Sat, 28 Mar 2015 08:46:09 -0400
Subject: [PATCH 111/119] Added OS isolation interface

---
 arch/arm/kernel/irq.c |  4 ++--
 litmus/cache_proc.c   | 65 ++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index 145f290..ce01835 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -68,7 +68,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
 
-	//enter_irq_mode();
+	enter_irq_mode();
 	irq_enter();
 
 	/*
@@ -84,7 +84,7 @@ void handle_IRQ(unsigned int irq, struct pt_regs *regs)
 	}
 
 	irq_exit();
-	//exit_irq_mode();
+	exit_irq_mode();
 	set_irq_regs(old_regs);
 }
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index c331dd6..c7f39b5 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -58,18 +58,18 @@ u32 set_partitions[2] = {
 };
 
 u32 prev_lockdown_d_reg[5] = {
-	0x00000000,
-	0x00000000,
-	0x00000000,
-	0x00000000,
+	0xFFFFFF00,
+	0xFFFFFF00,
+	0xFFFFFF00,
+	0xFFFFFF00,
 	0xFFFF00FF, /* share with level-C */
 };
 
 u32 prev_lockdown_i_reg[5] = {
-	0x00000000,
-	0x00000000,
-	0x00000000,
-	0x00000000,
+	0xFFFFFF00,
+	0xFFFFFF00,
+	0xFFFFFF00,
+	0xFFFFFF00,
 	0xFFFF00FF, /* share with level-C */
 };
 
@@ -92,6 +92,7 @@ static int l1_prefetch_proc;
 static int l2_prefetch_hint_proc;
 static int l2_double_linefill_proc;
 static int l2_data_prefetch_proc;
+static int os_isolation;
 
 #define ld_d_reg(cpu) ({ int __cpu = cpu; \
 			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_D_BASE + \
@@ -228,6 +229,7 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 			writel_relaxed(nr_unlocked_way[16], ld_i_reg(i));
 		}
 	}
+	printk("LOCK_ALL HANDLER\n");
 	print_lockdown_registers();
 
 out:
@@ -285,21 +287,49 @@ void do_partition(enum crit_level lv, int cpu)
 */
 }
 
-void enter_irq_mode(void)
+int os_isolation_proc_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;	/* sysctl handler for the os_isolation flag */
+	
+	mutex_lock(&lockdown_proc);
+	
+	flush_cache_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;	/* parse/copy error: skip the log line below */
+	
+	/* Logged after every successful call, reads as well as writes. */
+	printk("OS_ISOLATION HANDLER = %d\n", os_isolation);
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;	/* result of proc_dointvec_minmax() */
+}
+
+void inline enter_irq_mode(void)
 {
 	int cpu = smp_processor_id();
+
 	
-	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
-	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
+	//prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
+	//prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
+	
+	if (os_isolation == 0)
+		return;	
 	
 	writel_relaxed(prev_lockdown_i_reg[4], ld_i_reg(cpu));
 	writel_relaxed(prev_lockdown_d_reg[4], ld_d_reg(cpu));
 }
 
-void exit_irq_mode(void)
+void inline exit_irq_mode(void)
 {
 	int cpu = smp_processor_id();
-	
+
+	if (os_isolation == 0)
+		return;
+
 	writel_relaxed(prev_lockdown_i_reg[cpu], ld_i_reg(cpu));
 	writel_relaxed(prev_lockdown_d_reg[cpu], ld_d_reg(cpu));	
 }
@@ -551,6 +581,13 @@ static struct ctl_table cache_table[] =
 		.data		= &l2_data_prefetch_proc,
 		.maxlen		= sizeof(l2_data_prefetch_proc),
 	},
+	{
+		.procname	= "os_isolation",
+		.mode		= 0644,
+		.proc_handler	= os_isolation_proc_handler,
+		.data		= &os_isolation,
+		.maxlen		= sizeof(os_isolation),
+	},
 	{ }
 };
 
@@ -579,7 +616,7 @@ static int __init litmus_sysctl_init(void)
 
 	way_partition_min = 0x00000000;
 	way_partition_max = 0x0000FFFF;
-	
+	os_isolation = 0;
 out:
 	return ret;
 }
-- 
1.8.1.2


From ee66dccb3330c6fdf90040d038fc0e6c5f9f286f Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 8 Apr 2015 20:47:40 -0400
Subject: [PATCH 112/119] Added cache flush func. and fixed bug

---
 arch/arm/mm/cache-l2x0.c    |   6 +-
 include/litmus/cache_proc.h |   3 +
 litmus/bank_proc.c          |  51 ++--
 litmus/cache_proc.c         | 562 +++++++++++++++++++++++++++++++++++++++++---
 litmus/litmus.c             | 104 ++++----
 litmus/sched_mc2.c          |  20 +-
 6 files changed, 624 insertions(+), 122 deletions(-)

diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index cff808e..b57810a 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -148,7 +148,7 @@ static void __l2x0_flush_all(void)
 	debug_writel(0x00);
 }
 
-static void l2x0_flush_all(void)
+void l2x0_flush_all(void)
 {
 	unsigned long flags;
 
@@ -338,7 +338,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 	else
 		cache_id = readl_relaxed(l2x0_base + L2X0_CACHE_ID);
 	aux = readl_relaxed(l2x0_base + L2X0_AUX_CTRL);
-	
+	printk("AUX READ VALUE = %08x\n", aux);
 	cache_type = readl_relaxed(l2x0_base + L2X0_CACHE_TYPE);
 
 	aux &= aux_mask;
@@ -398,7 +398,7 @@ void __init l2x0_init(void __iomem *base, u32 aux_val, u32 aux_mask)
 		l2x0_unlock(cache_id);
 
 		/* l2x0 controller is disabled */
-		//aux |= (1 << 12);
+		//aux |= (1 << 12); // exclusive
 		//printk("AUX BIT = %08x\n", aux);
 		writel_relaxed(aux, l2x0_base + L2X0_AUX_CTRL);
 
diff --git a/include/litmus/cache_proc.h b/include/litmus/cache_proc.h
index 5a66c34..24128d7 100644
--- a/include/litmus/cache_proc.h
+++ b/include/litmus/cache_proc.h
@@ -6,6 +6,9 @@
 void litmus_setup_lockdown(void __iomem*, u32);
 void enter_irq_mode(void);
 void exit_irq_mode(void);
+void flush_cache(void);
+
+extern struct page *new_alloc_page_color(unsigned long color);
 
 #endif
 
diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index e1025b5..655eb27 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -97,10 +97,9 @@ unsigned int counting_one_set(unsigned int v)
 //    unsigned int v; // count the number of bits set in v
     unsigned int c; // c accumulates the total bits set in v
 
-    for (c = 0; v; v = v>>1)
+    for (c = 0; v; v >>= 1)
     {
-        if(v&1)
-            c++;
+        c += v & 1;
     }
     return c;
 }
@@ -215,7 +214,7 @@ void add_page_to_color_list(struct page *page)
  */
 static int do_add_pages(void)
 {
-//	printk("LITMUS do add pages\n");
+	//printk("LITMUS do add pages\n");
 	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
@@ -225,19 +224,19 @@ static int do_add_pages(void)
         int free_counter = 0;
         unsigned long counter[128]= {0}; 
         
-        printk("Before refill : \n");
-        show_nr_pages();
+        //printk("Before refill : \n");
+        //show_nr_pages();
 
 	// until all the page lists contain enough pages 
 	//for (i =0; i<5; i++) {
 	for (i=0; i< 1024*100;i++) {
-//	while (smallest_nr_pages() < PAGES_PER_COLOR) {
+	//while (smallest_nr_pages() < PAGES_PER_COLOR) {
        //         printk("smallest = %d\n", smallest_nr_pages());	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
 	    //    page = alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
 	
 		if (unlikely(!page)) {
-	//		printk(KERN_WARNING "Could not allocate pages.\n");
+			printk(KERN_WARNING "Could not allocate pages.\n");
 			ret = -ENOMEM;
 			goto out;
 		}
@@ -272,7 +271,7 @@ static int do_add_pages(void)
                 }
                 */
         }
-        /*printk("page counter = \n");
+/*        printk("page counter = \n");
         for (i=0; i<128; i++)
         {
             printk("(%03d) = %4d, ", i , counter[i]);
@@ -281,9 +280,9 @@ static int do_add_pages(void)
             }
 
         }
-        */
-        printk("After refill : \n");
-        show_nr_pages();
+*/	
+        //printk("After refill : \n");
+        //show_nr_pages();
 #if 1
 	// Free the unwanted pages
 	list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
@@ -310,7 +309,7 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	if( (color <0) || (color)>(number_cachecolors*number_banks -1)) {
 		TRACE_CUR("Wrong color %lu\n", color);	
 //		printk(KERN_WARNING "Wrong color %lu\n", color);
-		goto out_unlock;
+		goto out;
 	}
 
 		
@@ -323,14 +322,14 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	}
 	rPage = list_first_entry(&cgroup->list, struct page, lru);
 	BUG_ON(page_count(rPage) > 1);
-	get_page(rPage);
+	//get_page(rPage);
 	list_del(&rPage->lru);
 	atomic_dec(&cgroup->nr_pages);
 	ClearPageLRU(rPage);
 out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
-	while( smallest_nr_pages() == 0)
+	if( smallest_nr_pages() == 0)
         {
 		do_add_pages();
        //     printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
@@ -339,6 +338,10 @@ out:
 	return rPage;
 }
 
+struct page* get_colored_page(unsigned long color)
+{
+	return new_alloc_page_color(color);	/* non-static wrapper: forwards @color and returns the result unchanged */
+}
 
 /*
  * provide pages for replacement according to  
@@ -359,24 +362,18 @@ struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
-	unsigned int randvalue;
-	get_random_bytes(&randvalue, sizeof(unsigned int));
-        	
+	
 
         unsigned int idx = 0;
-//        printk("set = %lx, counting %d\n", set_partition[node],  counting_one_set(set_partition[node]));
-  //      printk("bank = %lx, counting %d\n", bank_partition[node],  counting_one_set(bank_partition[node]));
-        
-
-        idx += num_by_bitmask_index(set_partition[node], randvalue % counting_one_set(set_partition[node]));
-        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node],randvalue % counting_one_set(bank_partition[node]) );
-//	printk("node  = %d, idx = %d\n", node, idx);
+        idx += num_by_bitmask_index(set_partition[node], set_index[node]);
+        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
+	//printk("node  = %d, idx = %d\n", node, idx);
 
 	rPage =  new_alloc_page_color(idx);
         
             
-  //      set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
-//        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
+        set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
+        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
 	return rPage; 
 }
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index c7f39b5..0e123fac 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -6,6 +6,7 @@
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/mutex.h>
+#include <linux/time.h>
 
 #include <litmus/litmus_proc.h>
 #include <litmus/sched_trace.h>
@@ -19,6 +20,34 @@
 #define UNLOCK_ALL	0x00000000 /* allocation in any way */
 #define LOCK_ALL        (~UNLOCK_ALL)
 #define MAX_NR_WAYS	16
+#define MAX_NR_COLORS	16
+
+void mem_lock(u32 lock_val, int cpu);
+
+/*
+ * unlocked_way[i] : mask with only bit i clear, i.e. allocation can occur in way i only
+ *
+ * 0 = allocation can occur in the corresponding way
+ * 1 = allocation cannot occur in the corresponding way
+ */
+u32 unlocked_way[MAX_NR_WAYS]  = {
+	0xFFFFFFFE, /* way 0 unlocked */
+	0xFFFFFFFD, /* way 1 unlocked */
+	0xFFFFFFFB, /* way 2 unlocked */
+	0xFFFFFFF7, /* way 3 unlocked */
+	0xFFFFFFEF, /* way 4 unlocked */
+	0xFFFFFFDF, /* way 5 unlocked */
+	0xFFFFFFBF, /* way 6 unlocked */
+	0xFFFFFF7F, /* way 7 unlocked */
+	0xFFFFFEFF, /* way 8 unlocked */
+	0xFFFFFDFF, /* way 9 unlocked */
+	0xFFFFFBFF, /* way 10 unlocked */
+	0xFFFFF7FF, /* way 11 unlocked */
+	0xFFFFEFFF, /* way 12 unlocked */
+	0xFFFFDFFF, /* way 13 unlocked */
+	0xFFFFBFFF, /* way 14 unlocked */
+	0xFFFF7FFF, /* way 15 unlocked */
+};
 
 u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
 	0xFFFFFFFF, /* all ways are locked. usable = 0*/
@@ -40,16 +69,23 @@ u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
 	0xFFFF0000, /* way ~15 unlocked. usable = 16 */
 };
 
+u32 way_partition[4] = {	/* one mask per CPU; each clears a different group of four low bits */
+	0xfffffff0, /* cpu0: bits 0-3 clear */
+	0xffffff0f, /* cpu1: bits 4-7 clear */
+	0xfffff0ff, /* cpu2: bits 8-11 clear */
+	0xffff0fff, /* cpu3: bits 12-15 clear */
+};
+
 u32 way_partitions[9] = {
-	0x00000003, /* cpu0 A */
-	0x00000003, /* cpu0 B */
-	0x0000000C, /* cpu1 A */
-	0x0000000C, /* cpu1 B */
-	0x00000030, /* cpu2 A */
-	0x00000030, /* cpu2 B */
-	0x000000C0, /* cpu3 A */
-	0x000000C0, /* cpu3 B */
-	0x0000FF00, /* lv C */
+	0xffff0003, /* cpu0 A */
+	0xffff0003, /* cpu0 B */
+	0xffff000C, /* cpu1 A */
+	0xffff000C, /* cpu1 B */
+	0xffff0030, /* cpu2 A */
+	0xffff0030, /* cpu2 B */
+	0xffff00C0, /* cpu3 A */
+	0xffff00C0, /* cpu3 B */
+	0xffffff00, /* lv C */
 };
 
 u32 set_partitions[2] = {
@@ -82,8 +118,8 @@ static u32 cache_id;
 struct mutex actlr_mutex;
 struct mutex l2x0_prefetch_mutex;
 struct mutex lockdown_proc;
-static unsigned int way_partition_min;
-static unsigned int way_partition_max;
+static u32 way_partition_min;
+static u32 way_partition_max;
 
 static int zero = 0;
 static int one = 1;
@@ -93,6 +129,7 @@ static int l2_prefetch_hint_proc;
 static int l2_double_linefill_proc;
 static int l2_data_prefetch_proc;
 static int os_isolation;
+static int use_part;
 
 #define ld_d_reg(cpu) ({ int __cpu = cpu; \
 			void __iomem *__v = cache_base + L2X0_LOCKDOWN_WAY_D_BASE + \
@@ -103,15 +140,45 @@ static int os_isolation;
 
 int lock_all;
 int nr_lockregs;
+static raw_spinlock_t cache_lock;
+static raw_spinlock_t prefetch_lock;
+static void ***flusher_pages = NULL;
+
+extern void l2x0_flush_all(void);
+
+static inline void cache_wait_way(void __iomem *reg, unsigned long mask)
+{
+	/* wait for cache operation by line or way to complete: spin until no @mask bit is set in @reg */
+	while (readl_relaxed(reg) & mask)
+		cpu_relax();
+}
+
+#ifdef CONFIG_CACHE_PL310
+static inline void cache_wait(void __iomem *reg, unsigned long mask)
+{
+	/* cache operations by line are atomic on PL310, so this is deliberately a no-op */
+}
+#else	/* !CONFIG_CACHE_PL310: fall back to polling the register */
+#define cache_wait	cache_wait_way
+#endif	/* CONFIG_CACHE_PL310 */
+
+static inline void cache_sync(void)
+{
+	void __iomem *base = cache_base;
+	/* Write 0 to L2X0_CACHE_SYNC, then cache_wait() on bit 0 of that same register. */
+	writel_relaxed(0, base + L2X0_CACHE_SYNC);
+	cache_wait(base + L2X0_CACHE_SYNC, 1);
+}
 
 static void print_lockdown_registers(void)
 {
 	int i;
 
-	for (i = 0; i < nr_lockregs; i++) {
-		printk("Lockdown Data CPU %2d: 0x%8x\n",
+	//for (i = 0; i < nr_lockregs; i++) {
+	for (i = 0; i < 4; i++) {
+		printk("Lockdown Data CPU %2d: 0x%04x\n",
 				i, readl_relaxed(ld_d_reg(i)));
-		printk("Lockdown Inst CPU %2d: 0x%8x\n",
+		printk("Lockdown Inst CPU %2d: 0x%04x\n",
 				i, readl_relaxed(ld_i_reg(i)));
 	}
 }
@@ -159,7 +226,7 @@ static void test_lockdown(void *ignore)
 
 void litmus_setup_lockdown(void __iomem *base, u32 id)
 {
-    cache_base = base;
+	cache_base = base;
 	cache_id = id;
 	lockreg_d = cache_base + L2X0_LOCKDOWN_WAY_D_BASE;
 	lockreg_i = cache_base + L2X0_LOCKDOWN_WAY_I_BASE;
@@ -174,6 +241,8 @@ void litmus_setup_lockdown(void __iomem *base, u32 id)
 	mutex_init(&actlr_mutex);
 	mutex_init(&l2x0_prefetch_mutex);
 	mutex_init(&lockdown_proc);
+	raw_spin_lock_init(&cache_lock);
+	raw_spin_lock_init(&prefetch_lock);
 	
 	test_lockdown(NULL);
 }
@@ -185,18 +254,26 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 	
 	mutex_lock(&lockdown_proc);
 	
-	flush_cache_all();
+	//flush_cache_all();
+	//cache_sync();
+	l2x0_flush_all();
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
 		goto out;
 	
-	//if (write) {
+	if (write) {
 		printk("Way-partition settings:\n");
 		for (i = 0; i < 9; i++) {
-			printk("0x%08X\n", ~way_partitions[i]);
+			printk("0x%08X\n", way_partitions[i]);
+		}
+		for (i = 0; i < 4; i++) {
+			writel_relaxed(~way_partitions[i*2], cache_base + L2X0_LOCKDOWN_WAY_D_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
+			writel_relaxed(~way_partitions[i*2], cache_base + L2X0_LOCKDOWN_WAY_I_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
 		}
-	//}
+	}
 	print_lockdown_registers();
 
 out:
@@ -211,23 +288,49 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 	
 	mutex_lock(&lockdown_proc);
 	
-	flush_cache_all();
+	//flush_cache_all();
+	//outer_flush_all();
+	//cache_sync();
+	//l2x0_flush_all();
+	flush_cache();
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
 		goto out;
 	
 	if (write && lock_all == 1) {
+		for (i = 0; i < nr_lockregs; i++) {
+			writel_relaxed(0xFFFF, cache_base + L2X0_LOCKDOWN_WAY_D_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
+			writel_relaxed(0xFFFF, cache_base + L2X0_LOCKDOWN_WAY_I_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
+		}
+/*		
 		for (i = 0; i < nr_lockregs;  i++) {
-			writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
-			writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
+			barrier();
+			mem_lock(LOCK_ALL, i);
+			barrier();
+			//writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
+			//writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
 		}
+*/		
 	}
 	if (write && lock_all == 0) {
+		for (i = 0; i < nr_lockregs; i++) {
+			writel_relaxed(0x0, cache_base + L2X0_LOCKDOWN_WAY_D_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
+			writel_relaxed(0x0, cache_base + L2X0_LOCKDOWN_WAY_I_BASE +
+				       i * L2X0_LOCKDOWN_STRIDE);
+		}
+/*
 		for (i = 0; i < nr_lockregs;  i++) {
-			writel_relaxed(nr_unlocked_way[16], ld_d_reg(i));
-			writel_relaxed(nr_unlocked_way[16], ld_i_reg(i));
+			barrier();
+			mem_lock(UNLOCK_ALL, i);
+			barrier();
+			//writel_relaxed(nr_unlocked_way[16], ld_d_reg(i));
+			//writel_relaxed(nr_unlocked_way[16], ld_i_reg(i));
 		}
+*/
 	}
 	printk("LOCK_ALL HANDLER\n");
 	print_lockdown_registers();
@@ -237,32 +340,60 @@ out:
 	return ret;
 }
 
+void mem_lock(u32 lock_val, int cpu)
+{
+	unsigned long flags;
+	/* Store @lock_val to ld_d_reg(@cpu) and then ld_i_reg(@cpu) while holding cache_lock (irqsave). */
+	raw_spin_lock_irqsave(&cache_lock, flags);
+
+	__asm__ __volatile__ (
+"	str	%[lockval], [%[dcachereg]]\n"
+"	str	%[lockval], [%[icachereg]]\n"
+	: 
+	: [dcachereg] "r" (ld_d_reg(cpu)),
+	  [icachereg] "r" (ld_i_reg(cpu)),
+	  [lockval] "r" (lock_val)
+	: "cc");	/* no outputs; three register inputs; only "cc" is listed as clobbered */
+
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
 void do_partition(enum crit_level lv, int cpu)
 {
 	u32 regs;
+	//unsigned long flags;
 	
-	if (lock_all)
+	if (lock_all || !use_part)
 		return;
 	switch(lv) {
 		case CRIT_LEVEL_A:
 			regs = ~way_partitions[cpu*2];
+			regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_B:
 			regs = ~way_partitions[cpu*2+1];
+			regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_C:
 		case NUM_CRIT_LEVELS:
 			regs = ~way_partitions[8];
+			regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		default:
 			BUG();
+
 	}
+	//cache_sync();
+//	barrier();
+//	mem_lock(regs, cpu);
+//	barrier();		
+	//print_lockdown_registers();
 /*
 	if (use_set_partition == 1 && use_way_partition == 1)
 		printk(KERN_ALERT "BOTH SET, WAY ARE SET!!!!\n");
@@ -287,6 +418,29 @@ void do_partition(enum crit_level lv, int cpu)
 */
 }
 
+int use_part_proc_handler(struct ctl_table *table, int write, void __user *buffer,
+		size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+	
+	mutex_lock(&lockdown_proc);
+	
+	//flush_cache_all();
+	//cache_sync();
+	//l2x0_flush_all();
+	
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret)
+		goto out;
+	
+
+	printk("USE_PART HANDLER = %d\n", use_part);
+
+out:
+	mutex_unlock(&lockdown_proc);
+	return ret;
+}
+
 int os_isolation_proc_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
@@ -294,7 +448,10 @@ int os_isolation_proc_handler(struct ctl_table *table, int write, void __user *b
 	
 	mutex_lock(&lockdown_proc);
 	
-	flush_cache_all();
+	//flush_cache_all();
+	//cache_sync();
+	//l2x0_flush_all();
+	//flush_cache();
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
@@ -312,9 +469,9 @@ void inline enter_irq_mode(void)
 {
 	int cpu = smp_processor_id();
 
-	
-	//prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
-	//prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
+	return;
+	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
+	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
 	
 	if (os_isolation == 0)
 		return;	
@@ -326,7 +483,7 @@ void inline enter_irq_mode(void)
 void inline exit_irq_mode(void)
 {
 	int cpu = smp_processor_id();
-
+	return;
 	if (os_isolation == 0)
 		return;
 
@@ -404,6 +561,7 @@ int litmus_l2_prefetch_hint_proc_handler(struct ctl_table *table, int write,
 
 /* Operate on the PL-310's Prefetch Control Register, L2X0_PREFETCH_CTRL */
 #define L2X0_PREFETCH_DOUBLE_LINEFILL	(1 << 30)
+#define L2X0_PREFETCH_INST_PREFETCH	(1 << 29)
 #define L2X0_PREFETCH_DATA_PREFETCH	(1 << 28)
 static void l2x0_prefetch_change(int mode, int mask)
 {
@@ -454,13 +612,19 @@ int litmus_l2_data_prefetch_proc_handler(struct ctl_table *table, int write,
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write) {
 		mode = *((int*)table->data);
-		l2x0_prefetch_change(mode, L2X0_PREFETCH_DATA_PREFETCH);
+		l2x0_prefetch_change(mode, L2X0_PREFETCH_DATA_PREFETCH|L2X0_PREFETCH_INST_PREFETCH);
 	}
 	mutex_unlock(&l2x0_prefetch_mutex);
 
 	return ret;
 }
 
+int do_perf_test_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+
+int setup_flusher_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+		
 static struct ctl_table cache_table[] =
 {
 	{
@@ -588,6 +752,23 @@ static struct ctl_table cache_table[] =
 		.data		= &os_isolation,
 		.maxlen		= sizeof(os_isolation),
 	},
+	{
+		.procname	= "use_part",
+		.mode		= 0644,
+		.proc_handler	= use_part_proc_handler,
+		.data		= &use_part,
+		.maxlen		= sizeof(use_part),
+	},
+	{
+		.procname	= "do_perf_test",
+		.mode		= 0644,
+		.proc_handler	= do_perf_test_proc_handler,
+	},
+	{
+		.procname	= "setup_flusher",
+		.mode		= 0644,
+		.proc_handler	= setup_flusher_proc_handler,
+	},
 	{ }
 };
 
@@ -600,6 +781,321 @@ static struct ctl_table litmus_dir_table[] = {
 	{ }
 };
 
+u32 color_read_in_mem(u32 lock_val, u32 unlock_val, void *start, void *end)
+{
+	u32 v = 0;
+
+	__asm__ __volatile__ (
+"	.align 5\n"
+"	str	%[lockval], [%[cachereg]]\n"
+"1:	ldr	%[val], [%[addr]], #32		@ 32 bytes = 1 cache line\n"
+"	cmp	%[end], %[addr]			@ subtracts addr from end\n"
+"	bgt	1b\n				@ read more, if necessary\n"
+	: [addr] "+r" (start),
+	  [val] "+r" (v)
+	: [end] "r" (end),
+#ifdef CONFIG_CACHE_PL310
+	  [cachereg] "r" (ld_d_reg(raw_smp_processor_id())),
+#else
+	  [cachereg] "r" (lockreg_d),
+#endif
+	  [lockval] "r" (lock_val)
+	: "cc");
+
+	return v;
+}
+
+
+/*
+ * Prefetch by reading the first word of each cache line in a page.
+ *
+ * Writes the data lockdown register (ld_d_reg() of this CPU, or lockreg_d).
+ * @lock_val: value written to that register before the reads
+ * @unlock_val: value written to that register after the reads
+ * @start: start address to be prefetched
+ * @end: end address to prefetch (exclusive)
+ *
+ * Assumes: start < end
+ */
+u32 color_read_in_mem_lock(u32 lock_val, u32 unlock_val, void *start, void *end)
+{
+#ifndef CONFIG_CACHE_PL310
+	unsigned long flags;
+#endif
+	u32 v = 0;
+
+#ifndef CONFIG_CACHE_PL310
+	raw_spin_lock_irqsave(&prefetch_lock, flags);
+#endif
+
+	__asm__ __volatile__ (
+"	.align 5\n"
+"	str	%[lockval], [%[cachereg]]\n"
+"1:	ldr	%[val], [%[addr]], #32		@ 32 bytes = 1 cache line\n"
+"	cmp	%[end], %[addr]			@ subtracts addr from end\n"
+"	bgt	1b\n				@ read more, if necessary\n"
+"	str	%[unlockval], [%[cachereg]]\n"
+	: [addr] "+r" (start),
+	  [val] "+r" (v)
+	: [end] "r" (end),
+#ifdef CONFIG_CACHE_PL310
+	  [cachereg] "r" (ld_d_reg(raw_smp_processor_id())),
+#else
+	  [cachereg] "r" (lockreg_d),
+#endif
+	  [lockval] "r" (lock_val),
+	  [unlockval] "r" (unlock_val)
+	: "cc");
+
+#ifndef CONFIG_CACHE_PL310
+	raw_spin_unlock_irqrestore(&prefetch_lock, flags);
+#endif
+
+	return v;
+}
+
+static long update_timeval(struct timespec lhs, struct timespec rhs)
+{
+	long val;
+	struct timespec ts;
+
+	ts = timespec_sub(rhs, lhs);
+	val = ts.tv_sec*NSEC_PER_SEC + ts.tv_nsec;
+
+	return val;
+}
+
+extern void v7_flush_kern_dcache_area(void *, size_t);
+
+/*
+ * Ensure that this page is not in the L1 or L2 cache.
+ * Since the L1 cache is VIPT and the L2 cache is PIPT, we can use either the
+ * kernel or user vaddr.
+ */
+void color_flush_page(void *vaddr)
+{
+	v7_flush_kern_dcache_area(vaddr, PAGE_SIZE);
+}
+
+extern struct page* get_colored_page(unsigned long color);
+
+int setup_flusher_array(void)
+{
+	int color, way, ret = 0;
+	struct page *page;
+
+	if (flusher_pages != NULL)
+		goto out;
+
+	flusher_pages = (void***) kmalloc(MAX_NR_WAYS
+			* sizeof(*flusher_pages), GFP_KERNEL);
+	if (!flusher_pages) {
+		printk(KERN_WARNING "No memory for flusher array!\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	for (way = 0; way < MAX_NR_WAYS; way++) {
+		void **flusher_color_arr;
+		flusher_color_arr = (void**) kmalloc(sizeof(**flusher_pages)
+				* MAX_NR_COLORS, GFP_KERNEL);
+		if (!flusher_color_arr) {
+			printk(KERN_WARNING "No memory for flusher array!\n");
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		flusher_pages[way] = flusher_color_arr;
+
+		for (color = 0; color < MAX_NR_COLORS; color++) {
+			int node;
+			switch (color) {
+				case 0:
+					node = 32;
+					break;
+				case 1:
+					node = 33;
+					break;
+				case 2:
+					node = 50;
+					break;
+				case 3:
+					node = 51;
+					break;
+				case 4:
+					node = 68;
+					break;
+				case 5:
+					node = 69;
+					break;
+				case 6:
+					node = 86;
+					break;
+				case 7:
+					node = 87;
+					break;
+				case 8:
+					node = 88;
+					break;
+				case 9:
+					node = 105;
+					break;
+				case 10:
+					node = 106;
+					break;
+				case 11:
+					node = 107;
+					break;
+				case 12:
+					node = 108;
+					break;					
+				case 13:
+					node = 125;
+					break;
+				case 14:
+					node = 126;
+					break;
+				case 15:
+					node = 127;
+					break;
+			}	
+			page = get_colored_page(node);
+			if (!page) {
+				printk(KERN_WARNING "no more colored pages\n");
+				ret = -EINVAL;
+				goto out_free;
+			}
+			flusher_pages[way][color] = page_address(page);
+			if (!flusher_pages[way][color]) {
+				printk(KERN_WARNING "bad page address\n");
+				ret = -EINVAL;
+				goto out_free;
+			}
+		}
+	}
+out:
+	return ret;
+out_free:
+	for (way = 0; way < MAX_NR_WAYS; way++) {
+		for (color = 0; color < MAX_NR_COLORS; color++) {
+			/* not bothering to try and give back colored pages */
+		}
+		kfree(flusher_pages[way]);
+	}
+	kfree(flusher_pages);
+	flusher_pages = NULL;
+	return ret;
+}
+
+void flush_cache(void)
+{
+/*	int *dummy;
+	
+	flush_cache_all();
+	int size = 128, i, t = 0;
+	
+	dummy = kmalloc(PAGE_SIZE*size, GFP_KERNEL);
+	for (i = 0; i<PAGE_SIZE*size/sizeof(int); i++) {
+		dummy[i] = t++;
+	}
+	
+	kfree(dummy);
+*/	
+	int way, color;
+	for (way=0;way<MAX_NR_WAYS;way++) {
+		for (color=0;color<MAX_NR_COLORS;color++) {
+			void *vaddr = flusher_pages[way][color];
+			u32 lvalue  = unlocked_way[way];
+			color_read_in_mem_lock(lvalue, LOCK_ALL,
+					       vaddr, vaddr + PAGE_SIZE);
+		}
+
+	}	
+}
+
+#define TRIALS 1000
+
+static int perf_test(void) {
+	struct timespec before, after;
+	struct page *page;
+	void *vaddr;
+	u32 *data;
+	long time;
+	int i;
+
+	page = alloc_page(__GFP_MOVABLE);
+	if (!page) {
+		printk(KERN_WARNING "No memory\n");
+		return -ENOMEM;
+	}
+
+	vaddr = page_address(page);
+	if (!vaddr)
+		printk(KERN_WARNING "%s: vaddr is null\n", __FUNCTION__);
+	data = (u32*) vaddr;
+
+	getnstimeofday(&before);
+	barrier();
+	for (i = 0; i < TRIALS; i++) {
+		color_flush_page(vaddr);
+	}
+	barrier();
+	getnstimeofday(&after);
+	time = update_timeval(before, after);
+	printk("Average for flushes without re-reading: %ld\n", time / TRIALS);
+
+	color_read_in_mem(unlocked_way[0], UNLOCK_ALL, vaddr, vaddr + PAGE_SIZE);
+	barrier();
+	getnstimeofday(&before);
+	barrier();
+	for (i = 0; i < TRIALS; i++) {
+		color_read_in_mem(unlocked_way[0], UNLOCK_ALL, vaddr, vaddr + PAGE_SIZE);
+	}
+	barrier();
+	getnstimeofday(&after);
+	time = update_timeval(before, after);
+	printk("Average for read in (no flush): %ld\n", time / TRIALS);
+
+	getnstimeofday(&before);
+	barrier();
+	for (i = 0; i < TRIALS; i++) {
+		color_read_in_mem(unlocked_way[0], UNLOCK_ALL, vaddr, vaddr + PAGE_SIZE);
+		color_flush_page(vaddr);
+	}
+	barrier();
+	getnstimeofday(&after);
+	time = update_timeval(before, after);
+	printk("Average for read in and then flush: %ld\n", time / TRIALS);
+
+	free_page((unsigned long)vaddr);
+	return 0;
+}
+
+int do_perf_test_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = 0;
+
+	if (write) {
+		ret = perf_test();
+	}
+
+	return ret;
+}
+
+int setup_flusher_proc_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = -EINVAL;
+
+	if (write && flusher_pages == NULL) {
+		ret = setup_flusher_array();
+	}
+	
+	printk(KERN_INFO "setup flusher return: %d\n", ret);
+	return ret;
+}
+
 static struct ctl_table_header *litmus_sysctls;
 
 static int __init litmus_sysctl_init(void)
@@ -614,9 +1110,13 @@ static int __init litmus_sysctl_init(void)
 		goto out;
 	}
 
+	//setup_flusher_array();
+	printk(KERN_INFO "Setup flush_array.\n");
 	way_partition_min = 0x00000000;
 	way_partition_max = 0x0000FFFF;
 	os_isolation = 0;
+	use_part = 0;
+	
 out:
 	return ret;
 }
diff --git a/litmus/litmus.c b/litmus/litmus.c
index c8ed597..70342e7 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -26,6 +26,7 @@
 #include <litmus/sched_trace.h>
 #include <litmus/litmus_proc.h>
 #include <litmus/clock.h>
+#include <litmus/cache_proc.h>
 
 #include <asm/cacheflush.h>
 
@@ -46,6 +47,8 @@ atomic_t rt_task_count 		= ATOMIC_INIT(0);
 atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
 #endif
 
+extern void l2x0_flush_all(void);
+
 static struct kmem_cache * bheap_node_cache;
 extern struct kmem_cache * release_heap_cache;
 
@@ -339,15 +342,6 @@ asmlinkage long sys_reservation_destroy(unsigned int reservation_id, int cpu)
 	return litmus->reservation_destroy(reservation_id, cpu);
 }
 
-struct task_page {
-	unsigned long vm_start;
-	unsigned long vm_end;
-	struct page* page;
-	struct list_head list;
-};
-
-LIST_HEAD(task_page_list);
-
 static unsigned long color_mask;
 
 static inline unsigned long page_color(struct page *page)
@@ -403,11 +397,8 @@ extern struct page *new_alloc_page(struct page *page, unsigned long node, int **
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
-	//struct task_page *task_page_itr = NULL;
-	//struct task_page *task_page_itr_next = NULL;
 	struct page *page_itr = NULL;
 	struct vm_area_struct *vma_itr = NULL;
-	//struct task_page *entry = NULL;
 	int nr_pages = 0, nr_shared_pages = 0, nr_failed = 0;
 	unsigned long node;
 		
@@ -420,27 +411,13 @@ asmlinkage long sys_set_page_color(int cpu)
 	while (vma_itr != NULL) {
 		unsigned int num_pages = 0, i;
 		struct page *old_page = NULL;
-		/*
-		entry = kmalloc(sizeof(struct task_page), GFP_ATOMIC);
-		if (entry == NULL) {
-			return -ENOSPC;
-		}
-		entry->vm_start = vma_itr->vm_start;
-		entry->vm_end = vma_itr->vm_end;
-		*/
+		
 		num_pages = (vma_itr->vm_end - vma_itr->vm_start) / PAGE_SIZE;
 		// print vma flags
 		//printk(KERN_INFO "flags: 0x%lx\n", vma_itr->vm_flags);
 		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", vma_itr->vm_start, vma_itr->vm_end, (vma_itr->vm_end - vma_itr->vm_start)/PAGE_SIZE);
 		
 		for (i = 0; i < num_pages; i++) {
-/*
-			new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma_itr, vma_itr->vm_start);
-			if (!new_page)
-				return -ENOSPC;
-			printk(KERN_INFO "PAGE_COLOR: %lu\n", page_color(new_page));
-*/
-			//old_page = walk_page_table(vma_itr->vm_start + PAGE_SIZE*i);
 			old_page = follow_page(vma_itr, vma_itr->vm_start + PAGE_SIZE*i, FOLL_GET|FOLL_SPLIT);
 			
 			if (IS_ERR(old_page))
@@ -481,9 +458,6 @@ asmlinkage long sys_set_page_color(int cpu)
 			}
 		}
 		
-		//INIT_LIST_HEAD(&entry->list);
-		//list_add(&entry->list, &task_page_list);
-		
 		vma_itr = vma_itr->vm_next;
 	}
 
@@ -493,13 +467,13 @@ asmlinkage long sys_set_page_color(int cpu)
 	
 	ret = 0;
 	if (cpu == -1)
-		node = 4;
+		node = 8;
 	else
 		node = cpu;
 
         //node= 0;
 	if (!list_empty(&pagelist)) {
-		ret = litmus_migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
+		ret = migrate_pages(&pagelist, new_alloc_page, node, MIGRATE_ASYNC, MR_SYSCALL);
 		TRACE_TASK(current, "%ld pages not migrated.\n", ret);
 		if (ret) {
 			putback_lru_pages(&pagelist);
@@ -516,32 +490,17 @@ asmlinkage long sys_set_page_color(int cpu)
 		}
 		vma_itr = vma_itr->vm_next;
 	}
-	
-	/* copy shared pages HERE */
-/*	
-	ret = 0;
-	if (!list_empty(&shared_pagelist)) {
-		ret = migrate_shared_pages(&shared_pagelist, new_alloc_page, 0, MIGRATE_ASYNC, MR_SYSCALL);
-		if (ret) {
-			printk(KERN_INFO "%ld shared pages not migrated.\n", ret);
-			putback_lru_pages(&shared_pagelist);
-		}
-	}
-*/
+
 	up_read(&current->mm->mmap_sem);
 
 	list_for_each_entry(page_itr, &shared_pagelist, lru) {
 		TRACE("S Anon=%d, pfn = %lu, _mapcount = %d, _count = %d\n", PageAnon(page_itr), __page_to_pfn(page_itr), page_mapcount(page_itr), page_count(page_itr));
 	}
 	
-/*	
-	list_for_each_entry_safe(task_page_itr, task_page_itr_next, &task_page_list, list) {
-		//printk(KERN_INFO "start - end: 0x%lx - 0x%lx (%lu)\n", task_page_itr->vm_start, task_page_itr->vm_end, (task_page_itr->vm_end - task_page_itr->vm_start)/PAGE_SIZE);
-		list_del(&task_page_itr->list);
-		kfree(task_page_itr);		
-	}
-*/	
 	TRACE_TASK(current, "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
+	printk(KERN_INFO "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
+	flush_cache();
+	
 	return ret;
 }
 
@@ -834,6 +793,44 @@ static struct notifier_block shutdown_notifier = {
 	.notifier_call = litmus_shutdown_nb,
 };
 
+static void litmus_nsacr_register(void)
+{
+	u32 val, new_value, reread;
+
+	asm volatile("mrc p15, 0, %0, c1, c1, 2" : "=r" (val));
+	
+	new_value = val | 0x00048c00;
+	
+	asm volatile("mcr p15, 0, %0, c1, c1, 2" : : "r" (new_value));
+	asm volatile("mrc p15, 0, %0, c1, c1, 2" : "=r" (reread));
+	printk("NSACR REGISTER = orig: 0x%08x new: 0x%08x reread: 0x%08x\n", val, new_value, reread);
+	
+	
+	return;
+}
+
+static void litmus_pmu_register(void)
+{
+	u32 val, new_value, reread;
+
+	asm volatile("mrc p15, 0, %0, c1, c1, 1" : "=r" (val));
+	
+	new_value = val | 0x00000003;
+	
+	asm volatile("mcr p15, 0, %0, c1, c1, 1" : : "r" (new_value));
+	asm volatile("mrc p15, 0, %0, c1, c1, 1" : "=r" (reread));
+	printk("SDER REGISTER = orig: 0x%08x new: 0x%08x reread: 0x%08x\n", val, new_value, reread);
+	
+	
+	asm volatile("mrc p15, 5, %0, c15, c7, 2" : "=r" (val));
+	
+	//asm volatile("mcr p15, 0, %0, c1, c1, 1" : : "r" (new_value));
+	//asm volatile("mrc p15, 0, %0, c1, c1, 1" : "=r" (reread));
+	printk("TLB ATTR REGISTER = orig: 0x%08x\n", val);
+	
+	return;
+}
+
 #if defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)
 static void __init litmus_enable_perfcounters_v7(void *_ignore)
 {
@@ -866,7 +863,7 @@ static void __init litmus_enable_perfcounters_v7(void *_ignore)
 	asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(enable_val));
 
 	/* enables counters (cycle counter and event 1) */
-    asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x80000001));
+	asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x80000001));
 }
 
 static void __init litmus_enable_perfcounters(void)
@@ -910,7 +907,8 @@ static int __init _init_litmus(void)
 #if defined(CONFIG_CPU_V7) && !defined(CONFIG_HW_PERF_EVENTS)	
 	litmus_enable_perfcounters();
 #endif
-	
+	//litmus_nsacr_register();
+	//litmus_pmu_register();
 	color_mask = ((cache_info_sets << line_size_log) - 1) ^ (PAGE_SIZE - 1);
 	printk("Page color mask %lx\n", color_mask);
 	return 0;
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 885218e..4800bc4 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -29,6 +29,7 @@
 #define BUDGET_ENFORCEMENT_AT_C 1
 
 extern void do_partition(enum crit_level lv, int cpu);
+extern void l2x0_flush_all(void);
 
 /* _global_env - reservation container for level-C tasks*/
 struct gmp_reservation_environment _global_env;
@@ -626,7 +627,7 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 }
 
 /* not used now */
-static void pre_schedule(struct task_struct *prev, int cpu)
+static inline void pre_schedule(struct task_struct *prev, int cpu)
 {
 	if (!prev || !is_realtime(prev))
 		return;
@@ -635,13 +636,15 @@ static void pre_schedule(struct task_struct *prev, int cpu)
 }
 
 /* not used now */
-static void post_schedule(struct task_struct *next, int cpu)
+static inline void post_schedule(struct task_struct *next, int cpu)
 {
 	enum crit_level lev;
-	if (!next || !is_realtime(next))
+	if (!next) // || !is_realtime(next))
 		return;
-	
-	lev = get_task_crit_level(next);
+	if (!is_realtime(next))
+		lev = NUM_CRIT_LEVELS;
+	else
+		lev = get_task_crit_level(next);
 	do_partition(lev, cpu);
 }
 
@@ -653,7 +656,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
 
-	pre_schedule(prev, state->cpu);
+	//pre_schedule(prev, state->cpu);
 	
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
@@ -873,6 +876,7 @@ static long mc2_complete_job(void)
 	next_release = ns_to_ktime(get_release(current));
 	preempt_disable();
 	TRACE_CUR("next_release=%llu\n", get_release(current));
+	flush_cache();
 	if (get_release(current) > litmus_clock()) {
 		/* sleep until next_release */
 		set_current_state(TASK_INTERRUPTIBLE);
@@ -888,7 +892,7 @@ static long mc2_complete_job(void)
 //		if (get_task_crit_level(current) == CRIT_LEVEL_A)
 		sched_trace_task_release(current);
 	}
-
+	//l2x0_flush_all();
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
 
 	return err;
@@ -1048,7 +1052,7 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	struct reservation *res = NULL, *next;
 	struct sup_reservation_environment *sup_env;
 	int found = 0;
-	enum crit_level lv = get_task_crit_level(current);
+	//enum crit_level lv = get_task_crit_level(current);
 	unsigned long flags;
 	
 	if (cpu == -1) {
-- 
1.8.1.2


From a2927ecc8f7f3d02d8178eaef3e426fa597d530e Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 8 Apr 2015 22:47:25 -0400
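Subject: [PATCH 113/119] cache flush bug: unlock lockdown regs after flush

After the flush loop in flush_cache(), write UNLOCK_ALL to every data and
instruction lockdown register. Also make the setup_flusher sysctl handler
return 0 and log a message when flusher_pages is already allocated.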

---
 litmus/cache_proc.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 0e123fac..e91862f 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -1001,7 +1001,7 @@ void flush_cache(void)
 	
 	kfree(dummy);
 */	
-	int way, color;
+	int way, color, i;
 	for (way=0;way<MAX_NR_WAYS;way++) {
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
@@ -1010,6 +1010,11 @@ void flush_cache(void)
 					       vaddr, vaddr + PAGE_SIZE);
 		}
 
+	}
+
+	for (i = 0; i < nr_lockregs; i++) {
+		writel_relaxed(UNLOCK_ALL, ld_d_reg(i));
+		writel_relaxed(UNLOCK_ALL, ld_i_reg(i));
 	}	
 }
 
@@ -1090,9 +1095,14 @@ int setup_flusher_proc_handler(struct ctl_table *table, int write,
 
 	if (write && flusher_pages == NULL) {
 		ret = setup_flusher_array();
+		printk(KERN_INFO "setup flusher return: %d\n", ret);
+	
+	}
+	else if (flusher_pages) {
+		printk(KERN_INFO "flusher_pages is already set!\n");
+		ret = 0;
 	}
 	
-	printk(KERN_INFO "setup flusher return: %d\n", ret);
 	return ret;
 }
 
-- 
1.8.1.2


From c3079b56cfd3b62c08e02684bee671d2361ad9c9 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Wed, 8 Apr 2015 23:42:58 -0400
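Subject: [PATCH 114/119] fixed flush bug: restore lockdown regs after flush

flush_cache() now saves every lockdown register before flushing and
restores the saved values afterwards instead of writing UNLOCK_ALL.
Also comment out the BUG_ON in sup_charge_budget() and replace
BUG_ON(!res) in mc2_update_ghost_state() with a printk and early return.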

---
 litmus/cache_proc.c  | 32 ++++++++++++++++++++++++++++++--
 litmus/reservation.c |  2 +-
 litmus/sched_mc2.c   | 10 +++++++---
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index e91862f..4f7fc00 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -109,6 +109,28 @@ u32 prev_lockdown_i_reg[5] = {
 	0xFFFF00FF, /* share with level-C */
 };
 
+u32 prev_lbm_i_reg[8] = {
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+};
+
+u32 prev_lbm_d_reg[8] = {
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+	0x00000000,
+};
+
 static void __iomem *cache_base;
 static void __iomem *lockreg_d;
 static void __iomem *lockreg_i;
@@ -1002,6 +1024,12 @@ void flush_cache(void)
 	kfree(dummy);
 */	
 	int way, color, i;
+	
+	for (i = 0; i < nr_lockregs; i++) {
+		prev_lbm_i_reg[i] = readl_relaxed(ld_i_reg(i));
+		prev_lbm_d_reg[i] = readl_relaxed(ld_d_reg(i));
+	}
+	
 	for (way=0;way<MAX_NR_WAYS;way++) {
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
@@ -1013,8 +1041,8 @@ void flush_cache(void)
 	}
 
 	for (i = 0; i < nr_lockregs; i++) {
-		writel_relaxed(UNLOCK_ALL, ld_d_reg(i));
-		writel_relaxed(UNLOCK_ALL, ld_i_reg(i));
+		writel_relaxed(prev_lbm_i_reg[i], ld_i_reg(i));
+		writel_relaxed(prev_lbm_d_reg[i], ld_d_reg(i));
 	}	
 }
 
diff --git a/litmus/reservation.c b/litmus/reservation.c
index af5a934..efd16da 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -206,7 +206,7 @@ static void sup_charge_budget(
 				encountered_active = 1;
 			}			
 		} else {
-			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
+			//BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
 			TRACE("sup_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
 			res->ops->drain_budget(res, delta);
 		}
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 4800bc4..1e39362 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -123,7 +123,7 @@ static enum crit_level get_task_crit_level(struct task_struct *tsk)
 static struct reservation* res_find_by_id(struct mc2_cpu_state *state,
                                           unsigned int id)
 {
-	struct reservation *res;
+	struct reservation *res = NULL;
 
 	res = sup_find_by_id(&state->sup_env, id);
 	if (!res)
@@ -404,8 +404,12 @@ static lt_t mc2_update_ghost_state(struct mc2_cpu_state *state)
 				continue;
 			
 			res = res_find_by_id(state, tinfo->mc2_param.res_id);
-			BUG_ON(!res);
-//printk(KERN_ALERT "R%d found!\n", res->id);			
+			//BUG_ON(!res);
+			if (!res) {
+				printk(KERN_ALERT "mc2_update_ghost_state(): R%d not found!\n", tinfo->mc2_param.res_id);			
+				return 0;
+			}
+			
 			TRACE("LV %d running id %d budget %llu\n", 
 			       lv, tinfo->mc2_param.res_id, res->cur_budget);
 			/* If the budget is exhausted, clear is_ghost and reschedule */
-- 
1.8.1.2


From e5d0df8359d1a297b4ffb59ebae18df63d7dab4f Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Thu, 9 Apr 2015 23:27:52 -0400
Subject: [PATCH 115/119] fix

---
 litmus/cache_proc.c | 120 ++++++++++++++++++++++------------------------------
 litmus/litmus.c     |  14 ++++--
 litmus/sched_mc2.c  |  14 +++---
 3 files changed, 66 insertions(+), 82 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 4f7fc00..68b451d 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -192,24 +192,24 @@ static inline void cache_sync(void)
 	cache_wait(base + L2X0_CACHE_SYNC, 1);
 }
 
-static void print_lockdown_registers(void)
+static void print_lockdown_registers(int cpu)
 {
 	int i;
-
 	//for (i = 0; i < nr_lockregs; i++) {
 	for (i = 0; i < 4; i++) {
-		printk("Lockdown Data CPU %2d: 0x%04x\n",
+		printk("P%d Lockdown Data CPU %2d: 0x%04x\n", cpu,
 				i, readl_relaxed(ld_d_reg(i)));
-		printk("Lockdown Inst CPU %2d: 0x%04x\n",
+		printk("P%d Lockdown Inst CPU %2d: 0x%04x\n", cpu,
 				i, readl_relaxed(ld_i_reg(i)));
 	}
 }
 
 static void test_lockdown(void *ignore)
 {
-	int i;
+	int i, cpu;
 
-	printk("Start lockdown test on CPU %d.\n", smp_processor_id());
+	cpu = smp_processor_id();
+	printk("Start lockdown test on CPU %d.\n", cpu);
 
 	for (i = 0; i < nr_lockregs; i++) {
 		printk("CPU %2d data reg: 0x%8p\n", i, ld_d_reg(i));
@@ -217,7 +217,7 @@ static void test_lockdown(void *ignore)
 	}
 
 	printk("Lockdown initial state:\n");
-	print_lockdown_registers();
+	print_lockdown_registers(cpu);
 	printk("---\n");
 
 	for (i = 0; i < nr_lockregs; i++) {
@@ -225,7 +225,7 @@ static void test_lockdown(void *ignore)
 		writel_relaxed(2, ld_i_reg(i));
 	}
 	printk("Lockdown all data=1 instr=2:\n");
-	print_lockdown_registers();
+	print_lockdown_registers(cpu);
 	printk("---\n");
 
 	for (i = 0; i < nr_lockregs; i++) {
@@ -233,7 +233,7 @@ static void test_lockdown(void *ignore)
 		writel_relaxed(((1 << 8) >> i), ld_i_reg(i));
 	}
 	printk("Lockdown varies:\n");
-	print_lockdown_registers();
+	print_lockdown_registers(cpu);
 	printk("---\n");
 
 	for (i = 0; i < nr_lockregs; i++) {
@@ -241,7 +241,7 @@ static void test_lockdown(void *ignore)
 		writel_relaxed(UNLOCK_ALL, ld_i_reg(i));
 	}
 	printk("Lockdown all zero:\n");
-	print_lockdown_registers();
+	print_lockdown_registers(cpu);
 
 	printk("End lockdown test.\n");
 }
@@ -273,12 +273,14 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 		size_t *lenp, loff_t *ppos)
 {
 	int ret = 0, i;
+	unsigned long flags;
 	
 	mutex_lock(&lockdown_proc);
 	
 	//flush_cache_all();
 	//cache_sync();
-	l2x0_flush_all();
+	//l2x0_flush_all();
+	flush_cache();
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
@@ -296,8 +298,10 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 				       i * L2X0_LOCKDOWN_STRIDE);
 		}
 	}
-	print_lockdown_registers();
-
+	
+	local_irq_save(flags);
+	print_lockdown_registers(smp_processor_id());
+	local_irq_restore(flags);
 out:
 	mutex_unlock(&lockdown_proc);
 	return ret;
@@ -307,6 +311,7 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 		size_t *lenp, loff_t *ppos)
 {
 	int ret = 0, i;
+	unsigned long flags;
 	
 	mutex_lock(&lockdown_proc);
 	
@@ -355,8 +360,9 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 */
 	}
 	printk("LOCK_ALL HANDLER\n");
-	print_lockdown_registers();
-
+	local_irq_save(flags);
+	print_lockdown_registers(smp_processor_id());
+	local_irq_restore(flags);
 out:
 	mutex_unlock(&lockdown_proc);
 	return ret;
@@ -383,27 +389,28 @@ void mem_lock(u32 lock_val, int cpu)
 void do_partition(enum crit_level lv, int cpu)
 {
 	u32 regs;
-	//unsigned long flags;
+	unsigned long flags;
 	
 	if (lock_all || !use_part)
 		return;
+	raw_spin_lock_irqsave(&cache_lock, flags);
 	switch(lv) {
 		case CRIT_LEVEL_A:
 			regs = ~way_partitions[cpu*2];
-			regs |= 0xffff0000;
+			//regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_B:
 			regs = ~way_partitions[cpu*2+1];
-			regs |= 0xffff0000;
+			//regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_C:
 		case NUM_CRIT_LEVELS:
 			regs = ~way_partitions[8];
-			regs |= 0xffff0000;
+			//regs |= 0xffff0000;
 			writel_relaxed(regs, ld_d_reg(cpu));
 			writel_relaxed(regs, ld_i_reg(cpu));
 			break;
@@ -411,33 +418,16 @@ void do_partition(enum crit_level lv, int cpu)
 			BUG();
 
 	}
+/*	
+	printk(KERN_INFO "P%d lockdown on P%d\n", smp_processor_id(), cpu);
+	printk(KERN_INFO "CRIT_LEVEL %d\n", lv);
+	print_lockdown_registers(smp_processor_id());
+*/	
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
 	//cache_sync();
 //	barrier();
 //	mem_lock(regs, cpu);
-//	barrier();		
-	//print_lockdown_registers();
-/*
-	if (use_set_partition == 1 && use_way_partition == 1)
-		printk(KERN_ALERT "BOTH SET, WAY ARE SET!!!!\n");
-	
-	if (use_way_partition == 1) {
-		if (lv < CRIT_LEVEL_C) {
-			writel_relaxed(way_partitions[cpu], ld_d_reg(cpu));
-			writel_relaxed(way_partitions[cpu], ld_i_reg(cpu));
-		} else {
-			writel_relaxed(way_partitions[4], ld_d_reg(cpu));
-			writel_relaxed(way_partitions[4], ld_i_reg(cpu));
-		}
-	} else if (use_set_partition == 1) {
-		if (lv < CRIT_LEVEL_C) {
-			writel_relaxed(set_partitions[0], ld_d_reg(cpu));
-			writel_relaxed(set_partitions[0], ld_i_reg(cpu));
-		} else {
-			writel_relaxed(set_partitions[1], ld_d_reg(cpu));
-			writel_relaxed(set_partitions[1], ld_i_reg(cpu));
-		}
-	}
-*/
+//	barrier();
 }
 
 int use_part_proc_handler(struct ctl_table *table, int write, void __user *buffer,
@@ -491,21 +481,21 @@ void inline enter_irq_mode(void)
 {
 	int cpu = smp_processor_id();
 
-	return;
+	//return;
 	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
 	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
 	
 	if (os_isolation == 0)
 		return;	
 	
-	writel_relaxed(prev_lockdown_i_reg[4], ld_i_reg(cpu));
-	writel_relaxed(prev_lockdown_d_reg[4], ld_d_reg(cpu));
+	writel_relaxed(way_partitions[8], ld_i_reg(cpu));
+	writel_relaxed(way_partitions[8], ld_d_reg(cpu));
 }
 
 void inline exit_irq_mode(void)
 {
 	int cpu = smp_processor_id();
-	return;
+	//return;
 	if (os_isolation == 0)
 		return;
 
@@ -1011,26 +1001,19 @@ out_free:
 
 void flush_cache(void)
 {
-/*	int *dummy;
-	
-	flush_cache_all();
-	int size = 128, i, t = 0;
-	
-	dummy = kmalloc(PAGE_SIZE*size, GFP_KERNEL);
-	for (i = 0; i<PAGE_SIZE*size/sizeof(int); i++) {
-		dummy[i] = t++;
-	}
-	
-	kfree(dummy);
-*/	
-	int way, color, i;
+	int way, color, cpu;
+	unsigned long flags;
 	
-	for (i = 0; i < nr_lockregs; i++) {
-		prev_lbm_i_reg[i] = readl_relaxed(ld_i_reg(i));
-		prev_lbm_d_reg[i] = readl_relaxed(ld_d_reg(i));
-	}
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	cpu = raw_smp_processor_id();
 	
+	prev_lbm_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
+	prev_lbm_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
+	//printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
 	for (way=0;way<MAX_NR_WAYS;way++) {
+		if ( (0x00000001 << way) & (prev_lbm_d_reg[cpu]) )
+			continue;
+		//printk("P%d flushes way #%d\n", cpu, way);
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
 			u32 lvalue  = unlocked_way[way];
@@ -1040,10 +1023,9 @@ void flush_cache(void)
 
 	}
 
-	for (i = 0; i < nr_lockregs; i++) {
-		writel_relaxed(prev_lbm_i_reg[i], ld_i_reg(i));
-		writel_relaxed(prev_lbm_d_reg[i], ld_d_reg(i));
-	}	
+	writel_relaxed(prev_lbm_i_reg[cpu], ld_i_reg(cpu));
+	writel_relaxed(prev_lbm_d_reg[cpu], ld_d_reg(cpu));
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
 #define TRIALS 1000
@@ -1148,8 +1130,6 @@ static int __init litmus_sysctl_init(void)
 		goto out;
 	}
 
-	//setup_flusher_array();
-	printk(KERN_INFO "Setup flush_array.\n");
 	way_partition_min = 0x00000000;
 	way_partition_max = 0x0000FFFF;
 	os_isolation = 0;
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 70342e7..5692905 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -27,6 +27,7 @@
 #include <litmus/litmus_proc.h>
 #include <litmus/clock.h>
 #include <litmus/cache_proc.h>
+#include <litmus/mc2_common.h>
 
 #include <asm/cacheflush.h>
 
@@ -394,6 +395,8 @@ extern struct page *new_alloc_page(struct page *page, unsigned long node, int **
 
 #endif
 
+//static raw_spinlock_t migrate_lock;
+
 asmlinkage long sys_set_page_color(int cpu)
 {
 	long ret = 0;
@@ -401,10 +404,12 @@ asmlinkage long sys_set_page_color(int cpu)
 	struct vm_area_struct *vma_itr = NULL;
 	int nr_pages = 0, nr_shared_pages = 0, nr_failed = 0;
 	unsigned long node;
+	enum crit_level lv;
 		
 	LIST_HEAD(pagelist);
 	LIST_HEAD(shared_pagelist);
 	
+	
 	down_read(&current->mm->mmap_sem);
 	TRACE_TASK(current, "SYSCALL set_page_color\n");
 	vma_itr = current->mm->mmap;
@@ -466,10 +471,11 @@ asmlinkage long sys_set_page_color(int cpu)
 //	}
 	
 	ret = 0;
+	lv = tsk_rt(current)->mc2_data->crit;
 	if (cpu == -1)
 		node = 8;
 	else
-		node = cpu;
+		node = cpu*2 + lv;
 
         //node= 0;
 	if (!list_empty(&pagelist)) {
@@ -492,13 +498,13 @@ asmlinkage long sys_set_page_color(int cpu)
 	}
 
 	up_read(&current->mm->mmap_sem);
-
+	
 	list_for_each_entry(page_itr, &shared_pagelist, lru) {
 		TRACE("S Anon=%d, pfn = %lu, _mapcount = %d, _count = %d\n", PageAnon(page_itr), __page_to_pfn(page_itr), page_mapcount(page_itr), page_count(page_itr));
 	}
 	
 	TRACE_TASK(current, "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
-	printk(KERN_INFO "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
+	printk(KERN_INFO "node = %ld, nr_pages = %d, nr_failed = %d\n", node, nr_pages, nr_failed);
 	flush_cache();
 	
 	return ret;
@@ -911,6 +917,8 @@ static int __init _init_litmus(void)
 	//litmus_pmu_register();
 	color_mask = ((cache_info_sets << line_size_log) - 1) ^ (PAGE_SIZE - 1);
 	printk("Page color mask %lx\n", color_mask);
+	
+	//raw_spin_lock_init(&migrate_lock);
 	return 0;
 }
 
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 1e39362..e59030f 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -630,7 +630,6 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 	return NULL;
 }
 
-/* not used now */
 static inline void pre_schedule(struct task_struct *prev, int cpu)
 {
 	if (!prev || !is_realtime(prev))
@@ -639,16 +638,15 @@ static inline void pre_schedule(struct task_struct *prev, int cpu)
 	do_partition(CRIT_LEVEL_C, cpu);
 }
 
-/* not used now */
 static inline void post_schedule(struct task_struct *next, int cpu)
 {
 	enum crit_level lev;
-	if (!next) // || !is_realtime(next))
+	if ((!next) || !is_realtime(next))
 		return;
-	if (!is_realtime(next))
+/*	if (!is_realtime(next))
 		lev = NUM_CRIT_LEVELS;
-	else
-		lev = get_task_crit_level(next);
+	else */
+	lev = get_task_crit_level(next);
 	do_partition(lev, cpu);
 }
 
@@ -660,7 +658,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
 
-	//pre_schedule(prev, state->cpu);
+	pre_schedule(prev, state->cpu);
 	
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
@@ -705,8 +703,6 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	/* NOTE: drops state->lock */
 	mc2_update_timer_and_unlock(state);
 
-
-	
 	if (prev != state->scheduled && is_realtime(prev)) {
 		struct mc2_task_state* tinfo = get_mc2_state(prev);
 		struct reservation* res = tinfo->res_info.client.reservation;
-- 
1.8.1.2


From 8ea8941a1aa8fff86a51fe9d5c7f0b6e80e5c23d Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Thu, 9 Apr 2015 23:48:27 -0400
Subject: [PATCH 116/119] cache_proc: whitespace-only change in do_partition()

---
 litmus/cache_proc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 68b451d..7e2e355 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -418,6 +418,7 @@ void do_partition(enum crit_level lv, int cpu)
 			BUG();
 
 	}
+	
 /*	
 	printk(KERN_INFO "P%d lockdown on P%d\n", smp_processor_id(), cpu);
 	printk(KERN_INFO "CRIT_LEVEL %d\n", lv);
-- 
1.8.1.2


From 9d6f8815cc53c932a4cf4d0457374b2c5fc100e3 Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Fri, 10 Apr 2015 10:13:36 -0400
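Subject: [PATCH 117/119] cache_proc: add 'all' argument to flush_cache()

flush_cache() now takes an 'all' flag: when it is zero, ways whose bit
is set in the calling CPU's data lockdown register are skipped; when it
is non-zero, every way is flushed.

Also:
 - restrict the nr_unlocked_way[] and prev_lockdown_{d,i}_reg[] tables
   to the low 16 bits;
 - in do_partition(), mask the register value to 0x0000ffff, program it
   through cache_lockdown() (renamed from mem_lock()), and call
   flush_cache(0) afterwards;
 - in enter_irq_mode(), save the lockdown registers only when
   os_isolation is enabled;
 - remove set_partitions[];
 - comment out the pre_schedule() call in mc2_schedule() and the
   flush_cache() call in mc2_complete_job().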

---
 include/litmus/cache_proc.h |   2 +-
 litmus/cache_proc.c         | 112 ++++++++++++++++++++++----------------------
 litmus/litmus.c             |   3 +-
 litmus/sched_mc2.c          |   5 +-
 4 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/include/litmus/cache_proc.h b/include/litmus/cache_proc.h
index 24128d7..cf5fb04 100644
--- a/include/litmus/cache_proc.h
+++ b/include/litmus/cache_proc.h
@@ -6,7 +6,7 @@
 void litmus_setup_lockdown(void __iomem*, u32);
 void enter_irq_mode(void);
 void exit_irq_mode(void);
-void flush_cache(void);
+void flush_cache(int all);
 
 extern struct page *new_alloc_page_color(unsigned long color);
 
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 7e2e355..3c7724d 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -50,23 +50,23 @@ u32 unlocked_way[MAX_NR_WAYS]  = {
 };
 
 u32 nr_unlocked_way[MAX_NR_WAYS+1]  = {
-	0xFFFFFFFF, /* all ways are locked. usable = 0*/
-	0xFFFFFFFE, /* way ~0 unlocked. usable = 1 */
-	0xFFFFFFFC,
-	0xFFFFFFF8,
-	0xFFFFFFF0,
-	0xFFFFFFE0,
-	0xFFFFFFC0,
-	0xFFFFFF80,
-	0xFFFFFF00,
-	0xFFFFFE00,
-	0xFFFFFC00,
-	0xFFFFF800,
-	0xFFFFF000,
-	0xFFFFE000,
-	0xFFFFC000,
-	0xFFFF8000,
-	0xFFFF0000, /* way ~15 unlocked. usable = 16 */
+	0x0000FFFF, /* all ways are locked. usable = 0*/
+	0x0000FFFE, /* way ~0 unlocked. usable = 1 */
+	0x0000FFFC,
+	0x0000FFF8,
+	0x0000FFF0,
+	0x0000FFE0,
+	0x0000FFC0,
+	0x0000FF80,
+	0x0000FF00,
+	0x0000FE00,
+	0x0000FC00,
+	0x0000F800,
+	0x0000F000,
+	0x0000E000,
+	0x0000C000,
+	0x00008000,
+	0x00000000, /* way ~15 unlocked. usable = 16 */
 };
 
 u32 way_partition[4] = {
@@ -88,25 +88,20 @@ u32 way_partitions[9] = {
 	0xffffff00, /* lv C */
 };
 
-u32 set_partitions[2] = {
-	0xFFFFFF00, /* cpuX A and B */
-	0xFFFF00FF, /* lv C */
-};
-
 u32 prev_lockdown_d_reg[5] = {
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFF00FF, /* share with level-C */
+	0x0000FF00,
+	0x0000FF00,
+	0x0000FF00,
+	0x0000FF00,
+	0x000000FF, /* share with level-C */
 };
 
 u32 prev_lockdown_i_reg[5] = {
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFFFF00,
-	0xFFFF00FF, /* share with level-C */
+	0x0000FF00,
+	0x0000FF00,
+	0x0000FF00,
+	0x0000FF00,
+	0x000000FF, /* share with level-C */
 };
 
 u32 prev_lbm_i_reg[8] = {
@@ -280,7 +275,7 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 	//flush_cache_all();
 	//cache_sync();
 	//l2x0_flush_all();
-	flush_cache();
+	flush_cache(1);
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
@@ -319,7 +314,7 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 	//outer_flush_all();
 	//cache_sync();
 	//l2x0_flush_all();
-	flush_cache();
+	flush_cache(1);
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
@@ -368,11 +363,11 @@ out:
 	return ret;
 }
 
-void mem_lock(u32 lock_val, int cpu)
+void cache_lockdown(u32 lock_val, int cpu)
 {
-	unsigned long flags;
+	//unsigned long flags;
 
-	raw_spin_lock_irqsave(&cache_lock, flags);
+	//raw_spin_lock_irqsave(&cache_lock, flags);
 
 	__asm__ __volatile__ (
 "	str	%[lockval], [%[dcachereg]]\n"
@@ -383,7 +378,7 @@ void mem_lock(u32 lock_val, int cpu)
 	  [lockval] "r" (lock_val)
 	: "cc");
 
-	raw_spin_unlock_irqrestore(&cache_lock, flags);
+	//raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
 void do_partition(enum crit_level lv, int cpu)
@@ -397,34 +392,39 @@ void do_partition(enum crit_level lv, int cpu)
 	switch(lv) {
 		case CRIT_LEVEL_A:
 			regs = ~way_partitions[cpu*2];
-			//regs |= 0xffff0000;
-			writel_relaxed(regs, ld_d_reg(cpu));
-			writel_relaxed(regs, ld_i_reg(cpu));
+			regs &= 0x0000ffff;
+			//writel_relaxed(regs, ld_d_reg(cpu));
+			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_B:
 			regs = ~way_partitions[cpu*2+1];
-			//regs |= 0xffff0000;
-			writel_relaxed(regs, ld_d_reg(cpu));
-			writel_relaxed(regs, ld_i_reg(cpu));
+			regs &= 0x0000ffff;
+			//writel_relaxed(regs, ld_d_reg(cpu));
+			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_C:
 		case NUM_CRIT_LEVELS:
 			regs = ~way_partitions[8];
-			//regs |= 0xffff0000;
-			writel_relaxed(regs, ld_d_reg(cpu));
-			writel_relaxed(regs, ld_i_reg(cpu));
+			regs &= 0x0000ffff;
+			//writel_relaxed(regs, ld_d_reg(cpu));
+			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		default:
 			BUG();
 
 	}
-	
+	barrier();
+	cache_lockdown(regs, cpu);
+	barrier();
 /*	
 	printk(KERN_INFO "P%d lockdown on P%d\n", smp_processor_id(), cpu);
 	printk(KERN_INFO "CRIT_LEVEL %d\n", lv);
 	print_lockdown_registers(smp_processor_id());
 */	
 	raw_spin_unlock_irqrestore(&cache_lock, flags);
+	
+	flush_cache(0);
+	
 	//cache_sync();
 //	barrier();
 //	mem_lock(regs, cpu);
@@ -483,11 +483,11 @@ void inline enter_irq_mode(void)
 	int cpu = smp_processor_id();
 
 	//return;
-	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
-	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
-	
 	if (os_isolation == 0)
 		return;	
+
+	prev_lockdown_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
+	prev_lockdown_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
 	
 	writel_relaxed(way_partitions[8], ld_i_reg(cpu));
 	writel_relaxed(way_partitions[8], ld_d_reg(cpu));
@@ -499,7 +499,6 @@ void inline exit_irq_mode(void)
 	//return;
 	if (os_isolation == 0)
 		return;
-
 	writel_relaxed(prev_lockdown_i_reg[cpu], ld_i_reg(cpu));
 	writel_relaxed(prev_lockdown_d_reg[cpu], ld_d_reg(cpu));	
 }
@@ -1000,7 +999,7 @@ out_free:
 	return ret;
 }
 
-void flush_cache(void)
+void flush_cache(int all)
 {
 	int way, color, cpu;
 	unsigned long flags;
@@ -1010,11 +1009,12 @@ void flush_cache(void)
 	
 	prev_lbm_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
 	prev_lbm_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
-	//printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
+	printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
 	for (way=0;way<MAX_NR_WAYS;way++) {
-		if ( (0x00000001 << way) & (prev_lbm_d_reg[cpu]) )
+		if (( (0x00000001 << way) & (prev_lbm_d_reg[cpu]) ) &&
+			!all)
 			continue;
-		//printk("P%d flushes way #%d\n", cpu, way);
+		printk("P%d flushes way #%d\n", cpu, way);
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
 			u32 lvalue  = unlocked_way[way];
diff --git a/litmus/litmus.c b/litmus/litmus.c
index 5692905..d720390 100644
--- a/litmus/litmus.c
+++ b/litmus/litmus.c
@@ -319,7 +319,6 @@ asmlinkage long sys_null_call(cycles_t __user *ts)
 		ret = put_user(now, ts);
 	}
 	else {
-		//flush_cache_all();
 		int *dummy;
 		int size = 20, i, t = 0;
 		dummy = kmalloc(PAGE_SIZE*size, GFP_ATOMIC);
@@ -505,7 +504,7 @@ asmlinkage long sys_set_page_color(int cpu)
 	
 	TRACE_TASK(current, "nr_pages = %d nr_failed = %d\n", nr_pages, nr_failed);
 	printk(KERN_INFO "node = %ld, nr_pages = %d, nr_failed = %d\n", node, nr_pages, nr_failed);
-	flush_cache();
+	flush_cache(1);
 	
 	return ret;
 }
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index e59030f..4536556 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -29,7 +29,6 @@
 #define BUDGET_ENFORCEMENT_AT_C 1
 
 extern void do_partition(enum crit_level lv, int cpu);
-extern void l2x0_flush_all(void);
 
 /* _global_env - reservation container for level-C tasks*/
 struct gmp_reservation_environment _global_env;
@@ -658,7 +657,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
 
-	pre_schedule(prev, state->cpu);
+	//pre_schedule(prev, state->cpu);
 	
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
@@ -876,7 +875,7 @@ static long mc2_complete_job(void)
 	next_release = ns_to_ktime(get_release(current));
 	preempt_disable();
 	TRACE_CUR("next_release=%llu\n", get_release(current));
-	flush_cache();
+	//flush_cache();
 	if (get_release(current) > litmus_clock()) {
 		/* sleep until next_release */
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
1.8.1.2


From e7fb9c6907dbf764dadf9b05038dc7c80c2aa95b Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Fri, 17 Apr 2015 12:50:42 -0400
Subject: [PATCH 118/119] cache_proc: comment out debug printk() in flush_cache()

---
 litmus/cache_proc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 3c7724d..703b290 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -1009,12 +1009,12 @@ void flush_cache(int all)
 	
 	prev_lbm_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
 	prev_lbm_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
-	printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
+	//printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
 	for (way=0;way<MAX_NR_WAYS;way++) {
 		if (( (0x00000001 << way) & (prev_lbm_d_reg[cpu]) ) &&
 			!all)
 			continue;
-		printk("P%d flushes way #%d\n", cpu, way);
+		//printk("P%d flushes way #%d\n", cpu, way);
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
 			u32 lvalue  = unlocked_way[way];
-- 
1.8.1.2


From a302acc51e029fcf78e339a9360ba6231caae98d Mon Sep 17 00:00:00 2001
From: Namhoon Kim <namhoonk@cs.unc.edu>
Date: Thu, 4 Jun 2015 17:13:25 -0400
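Subject: [PATCH 119/119] Submit version: remove debug output and dead code

Strip commented-out code and debugging TRACE()/printk() calls, and
normalize indentation. In polling_reservations.c, re-enable the
BUG()/BUG_ON() sanity checks that had been replaced by TRACE()/printk()
warnings.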

---
 litmus/bank_proc.c            |  99 ++++++-------------------------
 litmus/cache_proc.c           |  67 ++-------------------
 litmus/polling_reservations.c |  55 +++---------------
 litmus/reservation.c          |  95 +++++-------------------------
 litmus/sched_mc2.c            | 131 +++++-------------------------------------
 litmus/uncachedev.c           |   4 +-
 6 files changed, 59 insertions(+), 392 deletions(-)

diff --git a/litmus/bank_proc.c b/litmus/bank_proc.c
index 655eb27..932340d 100644
--- a/litmus/bank_proc.c
+++ b/litmus/bank_proc.c
@@ -94,7 +94,6 @@ static struct color_group *color_groups;
  */
 unsigned int counting_one_set(unsigned int v)
 {
-//    unsigned int v; // count the number of bits set in v
     unsigned int c; // c accumulates the total bits set in v
 
     for (c = 0; v; v >>= 1)
@@ -152,8 +151,6 @@ static inline unsigned int page_list_index(struct page *page)
 {
     unsigned int idx;  
     idx = (page_color(page) + page_bank(page)*(number_cachecolors));
-//    printk("address = %lx, ", page_to_phys(page));
-//    printk("color(%d), bank(%d), indx = %d\n", page_color(page), page_bank(page), idx);
 
     return idx; 
 }
@@ -214,26 +211,16 @@ void add_page_to_color_list(struct page *page)
  */
 static int do_add_pages(void)
 {
-	//printk("LITMUS do add pages\n");
-	
 	struct page *page, *page_tmp;
 	LIST_HEAD(free_later);
 	unsigned long color;
 	int ret = 0;
 	int i = 0;
-        int free_counter = 0;
-        unsigned long counter[128]= {0}; 
-        
-        //printk("Before refill : \n");
-        //show_nr_pages();
+	int free_counter = 0;
+	unsigned long counter[128]= {0}; 
 
-	// until all the page lists contain enough pages 
-	//for (i =0; i<5; i++) {
 	for (i=0; i< 1024*100;i++) {
-	//while (smallest_nr_pages() < PAGES_PER_COLOR) {
-       //         printk("smallest = %d\n", smallest_nr_pages());	
 		page = alloc_page(GFP_HIGHUSER_MOVABLE);
-	    //    page = alloc_pages_exact_node(0, GFP_HIGHUSER_MOVABLE, 0);
 	
 		if (unlikely(!page)) {
 			printk(KERN_WARNING "Could not allocate pages.\n");
@@ -242,47 +229,16 @@ static int do_add_pages(void)
 		}
 		color = page_list_index(page);
 		counter[color]++;
-	//	printk("page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
-                //show_nr_pages();
+
 		if (atomic_read(&color_groups[color].nr_pages) < PAGES_PER_COLOR && color>=32) {
-	//	if ( PAGES_PER_COLOR && color>=16*2) {
 			add_page_to_color_list(page);
-	//		printk("add page(%d) = color %x, bank %x\n", color, page_color(page), page_bank(page));
-		} else{
+		} else {
 			// Pages here will be freed later 
 			list_add_tail(&page->lru, &free_later);
 			free_counter++;
-		        //list_del(&page->lru);
-		//        __free_page(page);
-	//		printk("useless page(%d) = color %x, bank %x\n", color,  page_color(page), page_bank(page));
 		}
-               //show_nr_pages();
-                /*
-                if(free_counter >= PAGES_PER_COLOR)
-                {
-                    printk("free unwanted page list eariler");
-                    free_counter = 0;
-	            list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
-		        list_del(&page->lru);
-		        __free_page(page);
-	            }
-
-                    show_nr_pages();
-                }
-                */
-        }
-/*        printk("page counter = \n");
-        for (i=0; i<128; i++)
-        {
-            printk("(%03d) = %4d, ", i , counter[i]);
-            if(i%8 == 7){
-                printk("\n");
-            }
+	}
 
-        }
-*/	
-        //printk("After refill : \n");
-        //show_nr_pages();
 #if 1
 	// Free the unwanted pages
 	list_for_each_entry_safe(page, page_tmp, &free_later, lru) {
@@ -302,13 +258,11 @@ out:
  */ 
 static struct  page *new_alloc_page_color( unsigned long color)
 {
-//	printk("allocate new page color = %d\n", color);	
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 		
 	if( (color <0) || (color)>(number_cachecolors*number_banks -1)) {
 		TRACE_CUR("Wrong color %lu\n", color);	
-//		printk(KERN_WARNING "Wrong color %lu\n", color);
 		goto out;
 	}
 
@@ -317,24 +271,19 @@ static struct  page *new_alloc_page_color( unsigned long color)
 	spin_lock(&cgroup->lock);
 	if (unlikely(!atomic_read(&cgroup->nr_pages))) {
 		TRACE_CUR("No free %lu colored pages.\n", color);
-//		printk(KERN_WARNING "no free %lu colored pages.\n", color);
 		goto out_unlock;
 	}
 	rPage = list_first_entry(&cgroup->list, struct page, lru);
 	BUG_ON(page_count(rPage) > 1);
-	//get_page(rPage);
 	list_del(&rPage->lru);
 	atomic_dec(&cgroup->nr_pages);
 	ClearPageLRU(rPage);
 out_unlock:
 	spin_unlock(&cgroup->lock);
 out:
-	if( smallest_nr_pages() == 0)
-        {
+	if(smallest_nr_pages() == 0) {
 		do_add_pages();
-       //     printk("ERROR(bank_proc.c) = We don't have enough pages in bank_proc.c\n");        
-        
-        }
+	}
 	return rPage;
 }
 
@@ -357,23 +306,17 @@ struct page* get_colored_page(unsigned long color)
  */
 struct page *new_alloc_page(struct page *page, unsigned long node, int **x)
 {
-//	printk("allocate new page node = %d\n", node);	
-//	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 	struct color_group *cgroup;
 	struct page *rPage = NULL;
 	unsigned int color;
+	unsigned int idx = 0;
+	idx += num_by_bitmask_index(set_partition[node], set_index[node]);
+	idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
 	
-
-        unsigned int idx = 0;
-        idx += num_by_bitmask_index(set_partition[node], set_index[node]);
-        idx += number_cachecolors* num_by_bitmask_index(bank_partition[node], bank_index[node]);
-	//printk("node  = %d, idx = %d\n", node, idx);
-
 	rPage =  new_alloc_page_color(idx);
-        
-            
-        set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
-        bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
+	set_index[node] = (set_index[node]+1) % counting_one_set(set_partition[node]);
+	bank_index[node] = (bank_index[node]+1) % counting_one_set(bank_partition[node]);
+
 	return rPage; 
 }
 
@@ -386,11 +329,10 @@ void reclaim_page(struct page *page)
 	const unsigned long color = page_list_index(page);
 	unsigned long nr_reclaimed = 0;
 	spin_lock(&reclaim_lock);
-    	put_page(page);
+	put_page(page);
 	add_page_to_color_list(page);
 
 	spin_unlock(&reclaim_lock);
-	printk("Reclaimed page(%d) = color %x, bank %x, [color] =%d \n", color, page_color(page), page_bank(page), atomic_read(&color_groups[color].nr_pages));
 }
 
 
@@ -405,10 +347,9 @@ static int __init init_variables(void)
 	number_cachecolors = counting_one_set(CACHE_MASK);
 	number_cachecolors = two_exp(number_cachecolors);
 	NUM_PAGE_LIST = number_banks * number_cachecolors; 
-        printk(KERN_WARNING "number of banks = %d, number of cachecolors=%d\n", number_banks, number_cachecolors);
+
 	mutex_init(&void_lockdown_proc);
 	spin_lock_init(&reclaim_lock);
-
 }
 
 
@@ -421,14 +362,13 @@ static int __init init_color_groups(void)
 	unsigned long i;
 	int err = 0;
 
-        printk("NUM_PAGE_LIST = %d\n", NUM_PAGE_LIST);
-        color_groups = kmalloc(NUM_PAGE_LIST *sizeof(struct color_group), GFP_KERNEL);
+	printk("NUM_PAGE_LIST = %d\n", NUM_PAGE_LIST);
+	color_groups = kmalloc(NUM_PAGE_LIST *sizeof(struct color_group), GFP_KERNEL);
 
 	if (!color_groups) {
 		printk(KERN_WARNING "Could not allocate color groups.\n");
 		err = -ENOMEM;
-	}else{
-
+	} else {
 		for (i = 0; i < NUM_PAGE_LIST; ++i) {
 			cgroup = &color_groups[i];
 			atomic_set(&cgroup->nr_pages, 0);
@@ -436,7 +376,7 @@ static int __init init_color_groups(void)
 			spin_lock_init(&cgroup->lock);
 		}
 	}
-        return err;
+	return err;
 }
 
 int set_partition_handler(struct ctl_table *table, int write, void __user *buffer,
@@ -733,4 +673,3 @@ out:
 }
 
 module_init(litmus_color_init);
-
diff --git a/litmus/cache_proc.c b/litmus/cache_proc.c
index 703b290..e343e73 100644
--- a/litmus/cache_proc.c
+++ b/litmus/cache_proc.c
@@ -190,7 +190,6 @@ static inline void cache_sync(void)
 static void print_lockdown_registers(int cpu)
 {
 	int i;
-	//for (i = 0; i < nr_lockregs; i++) {
 	for (i = 0; i < 4; i++) {
 		printk("P%d Lockdown Data CPU %2d: 0x%04x\n", cpu,
 				i, readl_relaxed(ld_d_reg(i)));
@@ -272,9 +271,6 @@ int way_partition_handler(struct ctl_table *table, int write, void __user *buffe
 	
 	mutex_lock(&lockdown_proc);
 	
-	//flush_cache_all();
-	//cache_sync();
-	//l2x0_flush_all();
 	flush_cache(1);
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -310,10 +306,6 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 	
 	mutex_lock(&lockdown_proc);
 	
-	//flush_cache_all();
-	//outer_flush_all();
-	//cache_sync();
-	//l2x0_flush_all();
 	flush_cache(1);
 	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -327,15 +319,7 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 			writel_relaxed(0xFFFF, cache_base + L2X0_LOCKDOWN_WAY_I_BASE +
 				       i * L2X0_LOCKDOWN_STRIDE);
 		}
-/*		
-		for (i = 0; i < nr_lockregs;  i++) {
-			barrier();
-			mem_lock(LOCK_ALL, i);
-			barrier();
-			//writel_relaxed(nr_unlocked_way[0], ld_d_reg(i));
-			//writel_relaxed(nr_unlocked_way[0], ld_i_reg(i));
-		}
-*/		
+
 	}
 	if (write && lock_all == 0) {
 		for (i = 0; i < nr_lockregs; i++) {
@@ -344,15 +328,6 @@ int lock_all_handler(struct ctl_table *table, int write, void __user *buffer,
 			writel_relaxed(0x0, cache_base + L2X0_LOCKDOWN_WAY_I_BASE +
 				       i * L2X0_LOCKDOWN_STRIDE);
 		}
-/*
-		for (i = 0; i < nr_lockregs;  i++) {
-			barrier();
-			mem_lock(UNLOCK_ALL, i);
-			barrier();
-			//writel_relaxed(nr_unlocked_way[16], ld_d_reg(i));
-			//writel_relaxed(nr_unlocked_way[16], ld_i_reg(i));
-		}
-*/
 	}
 	printk("LOCK_ALL HANDLER\n");
 	local_irq_save(flags);
@@ -365,10 +340,6 @@ out:
 
 void cache_lockdown(u32 lock_val, int cpu)
 {
-	//unsigned long flags;
-
-	//raw_spin_lock_irqsave(&cache_lock, flags);
-
 	__asm__ __volatile__ (
 "	str	%[lockval], [%[dcachereg]]\n"
 "	str	%[lockval], [%[icachereg]]\n"
@@ -378,7 +349,6 @@ void cache_lockdown(u32 lock_val, int cpu)
 	  [lockval] "r" (lock_val)
 	: "cc");
 
-	//raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
 void do_partition(enum crit_level lv, int cpu)
@@ -393,42 +363,26 @@ void do_partition(enum crit_level lv, int cpu)
 		case CRIT_LEVEL_A:
 			regs = ~way_partitions[cpu*2];
 			regs &= 0x0000ffff;
-			//writel_relaxed(regs, ld_d_reg(cpu));
-			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_B:
 			regs = ~way_partitions[cpu*2+1];
 			regs &= 0x0000ffff;
-			//writel_relaxed(regs, ld_d_reg(cpu));
-			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		case CRIT_LEVEL_C:
 		case NUM_CRIT_LEVELS:
 			regs = ~way_partitions[8];
 			regs &= 0x0000ffff;
-			//writel_relaxed(regs, ld_d_reg(cpu));
-			//writel_relaxed(regs, ld_i_reg(cpu));
 			break;
 		default:
 			BUG();
-
 	}
 	barrier();
 	cache_lockdown(regs, cpu);
 	barrier();
-/*	
-	printk(KERN_INFO "P%d lockdown on P%d\n", smp_processor_id(), cpu);
-	printk(KERN_INFO "CRIT_LEVEL %d\n", lv);
-	print_lockdown_registers(smp_processor_id());
-*/	
+
 	raw_spin_unlock_irqrestore(&cache_lock, flags);
 	
 	flush_cache(0);
-	
-	//cache_sync();
-//	barrier();
-//	mem_lock(regs, cpu);
-//	barrier();
 }
 
 int use_part_proc_handler(struct ctl_table *table, int write, void __user *buffer,
@@ -438,15 +392,10 @@ int use_part_proc_handler(struct ctl_table *table, int write, void __user *buffe
 	
 	mutex_lock(&lockdown_proc);
 	
-	//flush_cache_all();
-	//cache_sync();
-	//l2x0_flush_all();
-	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
 		goto out;
 	
-
 	printk("USE_PART HANDLER = %d\n", use_part);
 
 out:
@@ -461,15 +410,9 @@ int os_isolation_proc_handler(struct ctl_table *table, int write, void __user *b
 	
 	mutex_lock(&lockdown_proc);
 	
-	//flush_cache_all();
-	//cache_sync();
-	//l2x0_flush_all();
-	//flush_cache();
-	
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (ret)
 		goto out;
-	
 
 	printk("OS_ISOLATION HANDLER = %d\n", os_isolation);
 
@@ -482,7 +425,6 @@ void inline enter_irq_mode(void)
 {
 	int cpu = smp_processor_id();
 
-	//return;
 	if (os_isolation == 0)
 		return;	
 
@@ -496,7 +438,7 @@ void inline enter_irq_mode(void)
 void inline exit_irq_mode(void)
 {
 	int cpu = smp_processor_id();
-	//return;
+
 	if (os_isolation == 0)
 		return;
 	writel_relaxed(prev_lockdown_i_reg[cpu], ld_i_reg(cpu));
@@ -921,6 +863,7 @@ int setup_flusher_array(void)
 
 		for (color = 0; color < MAX_NR_COLORS; color++) {
 			int node;
+			/* manually assigned node numbers */
 			switch (color) {
 				case 0:
 					node = 32;
@@ -1009,12 +952,10 @@ void flush_cache(int all)
 	
 	prev_lbm_i_reg[cpu] = readl_relaxed(ld_i_reg(cpu));
 	prev_lbm_d_reg[cpu] = readl_relaxed(ld_d_reg(cpu));
-	//printk("P%d reg value = 0x%04x\n", cpu, prev_lbm_d_reg[cpu]);
 	for (way=0;way<MAX_NR_WAYS;way++) {
 		if (( (0x00000001 << way) & (prev_lbm_d_reg[cpu]) ) &&
 			!all)
 			continue;
-		//printk("P%d flushes way #%d\n", cpu, way);
 		for (color=0;color<MAX_NR_COLORS;color++) {
 			void *vaddr = flusher_pages[way][color];
 			u32 lvalue  = unlocked_way[way];
diff --git a/litmus/polling_reservations.c b/litmus/polling_reservations.c
index df1aeb0..fa73070 100644
--- a/litmus/polling_reservations.c
+++ b/litmus/polling_reservations.c
@@ -30,11 +30,6 @@ static void periodic_polling_client_arrives(
 				instances =  div64_u64(tmp, pres->period);
 				res->next_replenishment = res->env->time_zero + instances * pres->period;
 			}
-				
-			TRACE("ENV_TIME_ZERO %llu\n", res->env->time_zero);
-			TRACE("pol-res: R%d activate tmp=%llu instances=%llu period=%llu nextrp=%llu cur=%llu\n",
-				res->id, tmp, instances, pres->period, res->next_replenishment,
-				res->env->current_time);
 
 			res->env->change_state(res->env, res,
 				RESERVATION_DEPLETED);
@@ -72,9 +67,6 @@ static void periodic_polling_client_departs(
 			if (list_empty(&res->clients)) {
 				res->env->change_state(res->env, res,
 						RESERVATION_ACTIVE_IDLE);
-//					did_signal_job_completion ?
-//						RESERVATION_DEPLETED :
-//						RESERVATION_ACTIVE_IDLE);
 			} /* else: nothing to do, more clients ready */
 			break;
 
@@ -96,7 +88,6 @@ static void periodic_polling_on_replenishment(
 	res->next_replenishment += pres->period;
 	res->budget_consumed = 0;
 
-	TRACE("polling_replenish(%u): next_replenishment=%llu\n", res->id, res->next_replenishment);
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
@@ -147,8 +138,7 @@ static void common_drain_budget(
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
-			//BUG();
-			TRACE("!!!!!!!!!!!!!!!STATE ERROR R%d STATE(%d)\n", res->id, res->state);
+			BUG();
 			break;
 
 		case RESERVATION_ACTIVE_IDLE:
@@ -282,7 +272,7 @@ void polling_reservation_init(
 	pres->period = period;
 	pres->deadline = deadline;
 	pres->offset = offset;
-	TRACE_TASK(current, "polling_reservation_init: periodic %d, use_edf %d\n", use_periodic_polling, use_edf_prio);
+
 	if (use_periodic_polling) {
 		if (use_edf_prio)
 			pres->res.ops = &periodic_polling_ops_edf;
@@ -363,7 +353,7 @@ static void td_client_departs(
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 		case RESERVATION_ACTIVE_IDLE:
-			//BUG(); /* INACTIVE or IDLE <=> no client */
+			/* INACTIVE or IDLE <=> no client */
 			break;
 
 		case RESERVATION_ACTIVE:
@@ -383,7 +373,6 @@ static lt_t td_time_remaining_until_end(struct table_driven_reservation *tdres)
 {
 	lt_t now = tdres->res.env->current_time;
 	lt_t end = tdres->cur_interval.end;
-	//TRACE("td_remaining(%u): start=%llu now=%llu end=%llu state=%d\n", tdres->res.id,	tdres->cur_interval.start, now, end, tdres->res.state);
 	if (now >=  end)
 		return 0;
 	else
@@ -396,22 +385,15 @@ static void td_replenish(
 	struct table_driven_reservation *tdres =
 		container_of(res, struct table_driven_reservation, res);
 
-	//TRACE("td_replenish(%u): expected_replenishment=%llu\n", res->id, res->next_replenishment);
-
 	/* figure out current interval */
 	tdres->cur_interval.start = tdres->major_cycle_start +
 		tdres->intervals[tdres->next_interval].start;
 	tdres->cur_interval.end =  tdres->major_cycle_start +
 		tdres->intervals[tdres->next_interval].end;
-/*	TRACE("major_cycle_start=%llu => [%llu, %llu]\n",
-		tdres->major_cycle_start,
-		tdres->cur_interval.start,
-		tdres->cur_interval.end);
-*/
+
 	/* reset budget */
 	res->cur_budget = td_time_remaining_until_end(tdres);
 	res->budget_consumed = 0;
-	//TRACE("td_replenish(%u): %s budget=%llu\n", res->id, res->cur_budget ? "" : "WARNING", res->cur_budget);
 
 	/* prepare next slot */
 	tdres->next_interval = (tdres->next_interval + 1) % tdres->num_intervals;
@@ -422,8 +404,6 @@ static void td_replenish(
 	/* determine next time this reservation becomes eligible to execute */
 	res->next_replenishment  = tdres->major_cycle_start;
 	res->next_replenishment += tdres->intervals[tdres->next_interval].start;
-	//TRACE("td_replenish(%u): next_replenishment=%llu\n", res->id, res->next_replenishment);
-
 
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
@@ -457,34 +437,22 @@ static void td_drain_budget(
 	/* Table-driven scheduling: instead of tracking the budget, we compute
 	 * how much time is left in this allocation interval. */
 
-	/* sanity check: we should never try to drain from future slots */
-	//TRACE("TD_DRAIN STATE(%d) [%llu,%llu]  %llu ?\n", res->state, tdres->cur_interval.start, tdres->cur_interval.end, res->env->current_time);
-	//BUG_ON(tdres->cur_interval.start > res->env->current_time);
-	if (tdres->cur_interval.start > res->env->current_time)
-		TRACE("TD_DRAIN BUG!!!!!!!!!!\n");
-
 	switch (res->state) {
 		case RESERVATION_DEPLETED:
 		case RESERVATION_INACTIVE:
-			//BUG();
-			TRACE("TD_DRAIN!!!!!!!!! RES_STATE = %d\n", res->state);
+			BUG();
 			break;
 
 		case RESERVATION_ACTIVE_IDLE:
 		case RESERVATION_ACTIVE:
 			res->cur_budget = td_time_remaining_until_end(tdres);
-			//TRACE("td_drain_budget(%u): drained to budget=%llu\n", res->id, res->cur_budget);
 			if (!res->cur_budget) {
 				res->env->change_state(res->env, res,
 					RESERVATION_DEPLETED);
 			} else {
 				/* sanity check budget calculation */
-				//BUG_ON(res->env->current_time >= tdres->cur_interval.end);
-				//BUG_ON(res->env->current_time < tdres->cur_interval.start);
-				if (res->env->current_time >= tdres->cur_interval.end)
-					printk(KERN_ALERT "TD_DRAIN_BUDGET WARNING1\n");
-				if (res->env->current_time < tdres->cur_interval.start)
-					printk(KERN_ALERT "TD_DRAIN_BUDGET WARNING2\n");
+				BUG_ON(res->env->current_time >= tdres->cur_interval.end);
+				BUG_ON(res->env->current_time < tdres->cur_interval.start);
 			}
 
 			break;
@@ -502,24 +470,15 @@ static struct task_struct* td_dispatch_client(
 	/* usual logic for selecting a client */
 	t = default_dispatch_client(res, for_at_most);
 
-	TRACE_TASK(t, "td_dispatch_client(%u): selected, budget=%llu\n",
-		res->id, res->cur_budget);
-
 	/* check how much budget we have left in this time slot */
 	res->cur_budget = td_time_remaining_until_end(tdres);
 
-	TRACE_TASK(t, "td_dispatch_client(%u): updated to budget=%llu next=%d\n",
-		res->id, res->cur_budget, tdres->next_interval);
-
 	if (unlikely(!res->cur_budget)) {
 		/* Unlikely case: if we ran out of budget, the user configured
 		 * a broken scheduling table (overlapping table slots).
 		 * Not much we can do about this, but we can't dispatch a job
 		 * now without causing overload. So let's register this reservation
 		 * as depleted and wait for the next allocation. */
-		TRACE("td_dispatch_client(%u): budget unexpectedly depleted "
-			"(check scheduling table for unintended overlap)\n",
-			res->id);
 		res->env->change_state(res->env, res,
 			RESERVATION_DEPLETED);
 		return NULL;
diff --git a/litmus/reservation.c b/litmus/reservation.c
index efd16da..2afb8ee 100644
--- a/litmus/reservation.c
+++ b/litmus/reservation.c
@@ -54,7 +54,6 @@ static void sup_scheduler_update_at(
 	struct sup_reservation_environment* sup_env,
 	lt_t when)
 {
-	//TRACE("SCHEDULER_UPDATE_AT update: %llu > when %llu\n", sup_env->next_scheduler_update, when);
 	if (sup_env->next_scheduler_update > when)
 		sup_env->next_scheduler_update = when;
 }
@@ -199,15 +198,11 @@ static void sup_charge_budget(
 		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
-			TRACE("sup_charge_budget ACTIVE R%u drain %llu\n", res->id, delta);
 			if (encountered_active == 0 && res->blocked_by_ghost == 0) {
-				TRACE("DRAIN !!\n");
 				res->ops->drain_budget(res, delta);
 				encountered_active = 1;
 			}			
 		} else {
-			//BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
-			TRACE("sup_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
 			res->ops->drain_budget(res, delta);
 		}
 		if (res->state == RESERVATION_ACTIVE ||
@@ -215,15 +210,9 @@ static void sup_charge_budget(
 		{
 			/* make sure scheduler is invoked when this reservation expires
 			 * its remaining budget */
-			 TRACE("requesting scheduler update for reservation %u in %llu nanoseconds\n",
-				res->id, res->cur_budget);
 			 sup_scheduler_update_after(sup_env, res->cur_budget);
 		}
-		//if (encountered_active == 2)
-			/* stop at the first ACTIVE reservation */
-		//	break;
 	}
-	//TRACE("finished charging budgets\n");
 }
 
 static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
@@ -240,7 +229,6 @@ static void sup_replenish_budgets(struct sup_reservation_environment* sup_env)
 			break;
 		}
 	}
-	//TRACE("finished replenishing budgets\n");
 
 	/* request a scheduler update at the next replenishment instant */
 	res = list_first_entry_or_null(&sup_env->depleted_reservations,
@@ -258,7 +246,6 @@ void sup_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	//TRACE("(sup_update_time) now: %llu, current_time: %llu\n", now, sup_env->env.current_time);
 	if (unlikely(now <= sup_env->env.current_time))
 		return;
 
@@ -270,11 +257,9 @@ void sup_update_time(
 		sup_env->next_scheduler_update = SUP_NO_SCHEDULER_UPDATE;
 
 	/* deplete budgets by passage of time */
-	//TRACE("CHARGE###\n");
 	sup_charge_budget(sup_env, delta);
 
 	/* check if any budgets where replenished */
-	//TRACE("REPLENISH###\n");
 	sup_replenish_budgets(sup_env);
 }
 
@@ -308,9 +293,6 @@ static void sup_res_change_state(
 
 	sup_env = container_of(env, struct sup_reservation_environment, env);
 
-	TRACE("reservation R%d state %d->%d at %llu\n",
-		res->id, res->state, new_state, env->current_time);
-
 	list_del(&res->list);
 	/* check if we need to reschedule because we lost an active reservation */
 	if (res->state == RESERVATION_ACTIVE && !sup_env->will_schedule)
@@ -391,18 +373,11 @@ static void gmp_add_event(
 	struct list_head *pos;
 	int found = 0, update = 0;
 
-	//when = div64_u64(when, TIMER_RESOLUTION);
-	//when *= TIMER_RESOLUTION;
-//printk(KERN_ALERT "GMP_ADD id=%d type=%d when=%llu\n", id, type, when);
 	nevent = gmp_find_event_by_id(gmp_env, id);
 	
-	if (nevent)
-		TRACE("EVENT R%d update prev = %llu, new = %llu\n", nevent->id, nevent->next_update, when);
-	
 	if (nevent && nevent->next_update > when) {
 		list_del(&nevent->list);
 		update = 1;
-		
 	}
 	
 	if (!nevent || nevent->type != type || update == 1) {
@@ -419,29 +394,21 @@ static void gmp_add_event(
 			if (queued->next_update > nevent->next_update) {
 				list_add(&nevent->list, pos->prev);
 				found = 1;
-				TRACE("NEXT_EVENT id=%d type=%d update=%llu ADDED at before %llu\n", nevent->id, nevent->type, nevent->next_update, queued->next_update);
 				break;
 			}
 		}
 		
 		if (!found) {
 			list_add_tail(&nevent->list, &gmp_env->next_events);
-			TRACE("NEXT_EVENT id=%d type=%d update=%llu ADDED at TAIL\n", nevent->id, nevent->type, nevent->next_update);
 		}
-	} else {
-		//TRACE("EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
-; //printk(KERN_ALERT "EVENT FOUND id = %d type=%d when=%llu, NEW EVENT type=%d when=%llu\n", nevent->id, nevent->type, nevent->next_update, type, when);
 	}
 	
-	TRACE("======START PRINTING EVENT LIST======\n");
-	gmp_print_events(gmp_env, litmus_clock());
-	TRACE("======FINISH PRINTING EVENT LIST======\n");
+	/* gmp_print_events(gmp_env, litmus_clock()); */
 }
 
 void gmp_add_event_after(
 	struct gmp_reservation_environment* gmp_env, lt_t timeout, unsigned int id, event_type_t type)
 {
-	//printk(KERN_ALERT "ADD_EVENT_AFTER id = %d\n", id);
 	gmp_add_event(gmp_env, gmp_env->env.current_time + timeout, id, type);
 }
 
@@ -452,13 +419,10 @@ static void gmp_queue_depleted(
 	struct list_head *pos;
 	struct reservation *queued;
 	int found = 0;
-
-//printk(KERN_ALERT "R%d request to enqueue depleted_list\n", res->id);
 	
 	list_for_each(pos, &gmp_env->depleted_reservations) {
 		queued = list_entry(pos, struct reservation, list);
 		if (queued && (queued->next_replenishment > res->next_replenishment)) {
-//printk(KERN_ALERT "QUEUED R%d %llu\n", queued->id, queued->next_replenishment);
 			list_add(&res->list, pos->prev);
 			found = 1;
 			break;
@@ -468,8 +432,6 @@ static void gmp_queue_depleted(
 	if (!found)
 		list_add_tail(&res->list, &gmp_env->depleted_reservations);
 
-	TRACE("R%d queued to depleted_list\n", res->id);
-//printk(KERN_ALERT "R%d queued to depleted_list\n", res->id);
 	gmp_add_event(gmp_env, res->next_replenishment, res->id, EVENT_REPLENISH);
 }
 
@@ -498,9 +460,9 @@ static void gmp_queue_active(
 	if (res->state == RESERVATION_ACTIVE && check_preempt)
 		gmp_env->schedule_now++;
 
-//#if BUDGET_ENFORCEMENT_AT_C	
+#if BUDGET_ENFORCEMENT_AT_C	
 	gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
-//#endif
+#endif
 	res->event_added = 1;	
 }
 
@@ -508,8 +470,6 @@ static void gmp_queue_reservation(
 	struct gmp_reservation_environment* gmp_env,
 	struct reservation *res)
 {
-
-//printk(KERN_ALERT "DEBUG: Passed %s %d %p R%d STATE %d\n",__FUNCTION__,__LINE__, gmp_env, res->id, res->state);
 	switch (res->state) {
 		case RESERVATION_INACTIVE:
 			list_add(&res->list, &gmp_env->inactive_reservations);
@@ -534,7 +494,7 @@ void gmp_add_new_reservation(
 	gmp_queue_reservation(gmp_env, new_res);
 }
 
-//#if BUDGET_ENFORCEMENT_AT_C
+#if BUDGET_ENFORCEMENT_AT_C
 static void gmp_charge_budget(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t delta)
@@ -547,50 +507,35 @@ static void gmp_charge_budget(
 		/* charge all ACTIVE_IDLE up to the first ACTIVE reservation */
 		res = list_entry(pos, struct reservation, list);
 		if (res->state == RESERVATION_ACTIVE) {
-			TRACE("gmp_charge_budget ACTIVE R%u scheduled_on=%d drain %llu\n", res->id, res->scheduled_on, delta);
 			if (res->scheduled_on != NO_CPU && res->blocked_by_ghost == 0) {
-				TRACE("DRAIN !!\n");
 				drained = 1;
 				res->ops->drain_budget(res, delta);
 			} else {
-				TRACE("NO DRAIN (not scheduled)!!\n");
+				; /* skip: not scheduled or blocked_by_ghost */
 			}
 		} else {
-			//BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
-			if (res->state != RESERVATION_ACTIVE_IDLE)
-				TRACE("BUG!!!!!!!!!!!! gmp_charge_budget()\n");
-			TRACE("gmp_charge_budget INACTIVE R%u drain %llu\n", res->id, delta);
-			//if (res->is_ghost != NO_CPU) {
-				TRACE("DRAIN !!\n");
-				drained = 1;
-				res->ops->drain_budget(res, delta);
-			//}
+			BUG_ON(res->state != RESERVATION_ACTIVE_IDLE);
+			drained = 1;
+			res->ops->drain_budget(res, delta);
 		}
 		if ((res->state == RESERVATION_ACTIVE ||
 			res->state == RESERVATION_ACTIVE_IDLE) && (drained == 1))
 		{
 			/* make sure scheduler is invoked when this reservation expires
 			 * its remaining budget */
-			 TRACE("requesting gmp_scheduler update for reservation %u in %llu nanoseconds\n", res->id, res->cur_budget);
 			 gmp_add_event_after(gmp_env, res->cur_budget, res->id, EVENT_DRAIN);
 			 res->event_added = 1;
 		}
-		//if (encountered_active == 2)
-			/* stop at the first ACTIVE reservation */
-		//	break;
 	}
-	//TRACE("finished charging budgets\n");
 }
-//#else
-/*
+#else
 static void gmp_charge_budget(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t delta)
 {
 	return;
 }
-*/
-//#endif
+#endif
 
 static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 {
@@ -601,20 +546,18 @@ static void gmp_replenish_budgets(struct gmp_reservation_environment* gmp_env)
 		res = list_entry(pos, struct reservation, list);
 		if (res->next_replenishment <= gmp_env->env.current_time) {
 			res->ops->replenish(res);
-			if (res->is_ghost != NO_CPU) {
-				TRACE("R%d replenished! scheduled_on=%d\n", res->id, res->scheduled_on);
-			}
 		} else {
 			/* list is ordered by increasing depletion times */
 			break;
 		}
 	}
-	//TRACE("finished replenishing budgets\n");
 }
 
 #define EPSILON	50
 
-/* return schedule_now */
+/* Return value: number of pending reschedule requests accumulated in
+ * schedule_now (capped at NR_CPUS), or 0 if the time did not advance.
+ */
 int gmp_update_time(
 	struct gmp_reservation_environment* gmp_env,
 	lt_t now)
@@ -625,35 +568,28 @@ int gmp_update_time(
 	/* If the time didn't advance, there is nothing to do.
 	 * This check makes it safe to call sup_advance_time() potentially
 	 * multiple times (e.g., via different code paths. */
-	//TRACE("(gmp_update_time) now: %llu, current_time: %llu\n", now, gmp_env->env.current_time);
 	if (unlikely(now <= gmp_env->env.current_time + EPSILON))
 		return 0;
 
 	delta = now - gmp_env->env.current_time;
 	gmp_env->env.current_time = now;
 
-
-	//gmp_print_events(gmp_env, now);
 	/* deplete budgets by passage of time */
-	//TRACE("CHARGE###\n");
 	gmp_charge_budget(gmp_env, delta);
 
 	/* check if any budgets where replenished */
-	//TRACE("REPLENISH###\n");
 	gmp_replenish_budgets(gmp_env);
-
 	
 	list_for_each_entry_safe(event, next, &gmp_env->next_events, list) {
 		if (event->next_update < now) {
 			list_del(&event->list);
-			TRACE("EVENT at %llu IS DELETED\n", event->next_update);
 			kfree(event);
 		} else {
 			break;
 		}
 	}		
 	
-	//gmp_print_events(gmp_env, litmus_clock());
+	/* gmp_print_events(gmp_env, litmus_clock()); */
 	
 	ret = min(gmp_env->schedule_now, NR_CPUS);
 	gmp_env->schedule_now = 0;
@@ -680,9 +616,6 @@ static void gmp_res_change_state(
 
 	gmp_env = container_of(env, struct gmp_reservation_environment, env);
 
-	TRACE("GMP reservation R%d state %d->%d at %llu\n",
-		res->id, res->state, new_state, env->current_time);
-
 	list_del(&res->list);
 	/* check if we need to reschedule because we lost an active reservation */
 	if (res->state == RESERVATION_ACTIVE)
diff --git a/litmus/sched_mc2.c b/litmus/sched_mc2.c
index 4536556..2e299d8 100644
--- a/litmus/sched_mc2.c
+++ b/litmus/sched_mc2.c
@@ -160,9 +160,9 @@ static void task_departs(struct task_struct *tsk, int job_complete)
 		ce = &state->crit_entries[lv];
 		ce->running = tsk;
 		res->is_ghost = state->cpu;
-//#if BUDGET_ENFORCEMENT_AT_C		
+#if BUDGET_ENFORCEMENT_AT_C		
 		gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
-//#endif
+#endif
 		TRACE_TASK(tsk, "BECOME GHOST at %llu\n", litmus_clock());
 		
 	}		
@@ -397,13 +397,11 @@ static lt_t mc2_update_ghost_state(struct mc2_cpu_state *state)
 	for (lv = 0; lv < NUM_CRIT_LEVELS; lv++) {
 		ce = &state->crit_entries[lv];
 		if (ce->running != NULL) {
-//printk(KERN_ALERT "P%d ce->running : %s/%d\n", state->cpu,  ce->running ? (ce->running)->comm : "null", ce->running ? (ce->running)->pid : 0);
 			tinfo = get_mc2_state(ce->running);
 			if (!tinfo)
 				continue;
 			
 			res = res_find_by_id(state, tinfo->mc2_param.res_id);
-			//BUG_ON(!res);
 			if (!res) {
 				printk(KERN_ALERT "mc2_update_ghost_state(): R%d not found!\n", tinfo->mc2_param.res_id);			
 				return 0;
@@ -434,8 +432,6 @@ static lt_t mc2_update_ghost_state(struct mc2_cpu_state *state)
 						litmus_reschedule(state->cpu);
 				}
 			} else {
-				//TRACE("GHOST NOT FINISH id %d budget %llu\n", res->id, res->cur_budget);
-				//gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
 				if (ret > res->cur_budget) {
 					ret = res->cur_budget;
 				}
@@ -497,11 +493,9 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	TRACE("TIMER FIRED at %llu\n", litmus_clock());
 	raw_spin_lock_irqsave(&_global_env.lock, flags);
 	raw_spin_lock(&state->lock);
-//printk(KERN_ALERT "P%d on_scheduling_timer() hold lock %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);			
 	now = litmus_clock();
 	sup_update_time(&state->sup_env, now);
 	global_schedule_now = gmp_update_time(&_global_env, now);
-//printk(KERN_ALERT "P%d update_time in timer() %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);			
 	remain_budget = mc2_update_ghost_state(state);
 	
 	update = state->sup_env.next_scheduler_update;
@@ -512,7 +506,6 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	}
 	
 	TRACE_CUR("on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d remain_budget:%llu\n", now, update, state->cpu, global_schedule_now, remain_budget);
-//printk(KERN_ALERT "on_scheduling_timer at %llu, upd:%llu (for cpu=%d) g_schedule_now:%d\n", now, update, state->cpu, global_schedule_now);
 	if (update <= now) {
 		litmus_reschedule_local();
 	} else if (update != SUP_NO_SCHEDULER_UPDATE) {
@@ -536,7 +529,7 @@ static enum hrtimer_restart on_scheduling_timer(struct hrtimer *timer)
 	
 	raw_spin_unlock(&state->lock);
 	raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-//printk(KERN_ALERT "P%d on_scheduling_timer() release lock %s/%d\n", state->cpu, current ? (current)->comm : "null", current ? (current)->pid : 0);	
+
 	return restart;
 }
 
@@ -564,35 +557,13 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 					res->blocked_by_ghost = 0;
 					res->is_ghost = NO_CPU;
 					return tsk;
-/*
-					if (likely(!ce->running)) {
-						sup_scheduler_update_after(sup_env, res->cur_budget);
-						res->blocked_by_ghost = 0;
-						res->is_ghost = NO_CPU;
-						return tsk;
-					} else {
-						res->blocked_by_ghost = 1;
-						TRACE_TASK(ce->running, " is GHOST\n");
-					}
-*/
 				}
 			}
 		}
 	}
 	
-	/* no eligible level A or B tasks exists */
-	/* check the ghost job */
-	/*
-	ce = &state->crit_entries[CRIT_LEVEL_C];
-	if (ce->running) {
-		TRACE_TASK(ce->running," is GHOST\n");
-		return NULL;
-	}
-	*/
 	cur_priority = _lowest_prio_cpu.cpu_entries[state->cpu].deadline;
 	
-	TRACE("****** ACTIVE LIST ******\n");
-	TRACE_TASK(_lowest_prio_cpu.cpu_entries[state->cpu].scheduled, "** CURRENT JOB deadline %llu **\n", cur_priority);
 	list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
 		TRACE("R%d deadline=%llu, scheduled_on=%d\n", res->id, res->priority, res->scheduled_on);
 		if (res->state == RESERVATION_ACTIVE && res->scheduled_on == NO_CPU) {
@@ -601,24 +572,16 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 				lv = get_task_crit_level(tsk);
 				if (lv == NUM_CRIT_LEVELS) {
 					gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
-					//res->event_added = 1;
 					return tsk;
 				} else if (lv == CRIT_LEVEL_C) {
-					//ce = &state->crit_entries[lv];
-					//if (likely(!ce->running)) {
-//#if BUDGET_ENFORCEMENT_AT_C						
+#if BUDGET_ENFORCEMENT_AT_C						
 						gmp_add_event_after(&_global_env, res->cur_budget, res->id, EVENT_DRAIN);
-//#endif
+#endif
 						res->event_added = 1;
 						res->blocked_by_ghost = 0;
 						res->is_ghost = NO_CPU;
 						res->scheduled_on = state->cpu;
 						return tsk;
-					//} else {
-					//	res->blocked_by_ghost = 1;
-					//	TRACE_TASK(ce->running, " is GHOST\n");
-					//	return NULL;
-					//}
 				} else {
 					BUG();
 				}
@@ -631,9 +594,6 @@ struct task_struct* mc2_dispatch(struct sup_reservation_environment* sup_env, st
 
 static inline void pre_schedule(struct task_struct *prev, int cpu)
 {
-	if (!prev || !is_realtime(prev))
-		return;
-	
 	do_partition(CRIT_LEVEL_C, cpu);
 }
 
@@ -642,9 +602,7 @@ static inline void post_schedule(struct task_struct *next, int cpu)
 	enum crit_level lev;
 	if ((!next) || !is_realtime(next))
 		return;
-/*	if (!is_realtime(next))
-		lev = NUM_CRIT_LEVELS;
-	else */
+
 	lev = get_task_crit_level(next);
 	do_partition(lev, cpu);
 }
@@ -657,17 +615,13 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 	lt_t now;
 	struct mc2_cpu_state *state = local_cpu_state();
 
-	//pre_schedule(prev, state->cpu);
+	pre_schedule(prev, state->cpu);
 	
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
 	
 	//BUG_ON(state->scheduled && state->scheduled != prev);
 	//BUG_ON(state->scheduled && !is_realtime(prev));
-	if (state->scheduled && state->scheduled != prev)
-		; //printk(KERN_ALERT "BUG1!!!!!!!! %s %s\n", state->scheduled ? (state->scheduled)->comm : "null", prev ? (prev)->comm : "null");
-	if (state->scheduled && !is_realtime(prev))
-		; //printk(KERN_ALERT "BUG2!!!!!!!! \n");
 
 	/* update time */
 	state->sup_env.will_schedule = true;
@@ -713,7 +667,7 @@ static struct task_struct* mc2_schedule(struct task_struct * prev)
 			int cpu;
 			raw_spin_lock(&_global_env.lock);
 			cpu = get_lowest_prio_cpu(res?res->priority:0);
-			//TRACE("LEVEL-C TASK PREEMPTED!! poking CPU %d to reschedule\n", cpu);
+			TRACE("LEVEL-C TASK PREEMPTED!! poking CPU %d to reschedule\n", cpu);
 			if (cpu != NO_CPU) {
 				raw_spin_lock(&_lowest_prio_cpu.lock);
 				_lowest_prio_cpu.cpu_entries[cpu].will_schedule = true;
@@ -767,7 +721,7 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		state = local_cpu_state();
 
 	raw_spin_lock(&_global_env.lock);
-//printk(KERN_ALERT "P%d resume() hold lock\n", state->cpu);	
+
 	/* Requeue only if self-suspension was already processed. */
 	if (tinfo->has_departed)
 	{
@@ -778,9 +732,7 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		if (tinfo->cpu != -1) {
 			sup_update_time(&state->sup_env, litmus_clock());
 		} else {
-			//TRACE("RESUME UPDATE ####\n");
 			gmp_update_time(&_global_env, litmus_clock());
-			//TRACE("RESUME UPDATE $$$$\n");
 		}
 			
 		mc2_update_ghost_state(state);
@@ -788,11 +740,9 @@ static void mc2_task_resume(struct task_struct  *tsk)
 		/* NOTE: drops state->lock */
 		TRACE_TASK(tsk, "mc2_resume()\n");
 		mc2_update_timer_and_unlock(state);	
-//printk(KERN_ALERT "P%d resume() dropped lock\n", state->cpu);			
 	} else {
 		TRACE_TASK(tsk, "resume event ignored, still scheduled\n");
 		raw_spin_unlock(&_global_env.lock);
-//printk(KERN_ALERT "P%d resume() release lock\n", state->cpu);			
 	}
 
 	local_irq_restore(flags);
@@ -833,7 +783,6 @@ static long mc2_complete_job(void)
 		
 		raw_spin_lock(&_global_env.lock);
 		raw_spin_lock(&state->lock);
-//printk(KERN_ALERT "P%d complete() hold lock\n", state->cpu);
 		env = &(state->sup_env.env);
 		
 		res = res_find_by_id(state, tinfo->mc2_param.res_id);
@@ -846,23 +795,11 @@ static long mc2_complete_job(void)
 		
 		/* set next_replenishtime to synchronous release time */
 		res->next_replenishment = tsk_rt(current)->sporadic_release_time;
-/*		
-		if (get_task_crit_level(current) == CRIT_LEVEL_A) {
-			struct table_driven_reservation *tdres;
-			tdres = container_of(res, struct table_driven_reservation, res);
-			tdres->next_interval = 0;
-			tdres->major_cycle_start = tsk_rt(current)->sporadic_release_time;
-			res->next_replenishment += tdres->intervals[0].start;			
-		}
-*/		
 		res->cur_budget = 0;
 		res->env->change_state(res->env, res, RESERVATION_DEPLETED);
-		
-		//TRACE_CUR("CHANGE NEXT_REP = %llu\n NEXT_UPDATE = %llu\n", res->next_replenishment, state->sup_env.next_scheduler_update);
-		
+
 		raw_spin_unlock(&state->lock);
 		raw_spin_unlock(&_global_env.lock);
-//printk(KERN_ALERT "P%d complete() release lock\n", state->cpu);				
 		local_irq_restore(flags);
 		preempt_enable();
 	}
@@ -875,23 +812,19 @@ static long mc2_complete_job(void)
 	next_release = ns_to_ktime(get_release(current));
 	preempt_disable();
 	TRACE_CUR("next_release=%llu\n", get_release(current));
-	//flush_cache();
+
 	if (get_release(current) > litmus_clock()) {
 		/* sleep until next_release */
 		set_current_state(TASK_INTERRUPTIBLE);
 		preempt_enable_no_resched();
 		err = schedule_hrtimeout(&next_release, HRTIMER_MODE_ABS);
-//		if (get_task_crit_level(current) == CRIT_LEVEL_A)
-//			sched_trace_task_release(current);
 	} else {
 		/* release the next job immediately */
 		err = 0;
 		TRACE_CUR("TARDY: release=%llu now=%llu\n", get_release(current), litmus_clock());
 		preempt_enable();
-//		if (get_task_crit_level(current) == CRIT_LEVEL_A)
 		sched_trace_task_release(current);
 	}
-	//l2x0_flush_all();
 	TRACE_CUR("mc2_complete_job returns at %llu\n", litmus_clock());
 
 	return err;
@@ -945,7 +878,7 @@ static long mc2_admit_task(struct task_struct *tsk)
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 	} else if (lv == CRIT_LEVEL_C) {
 		raw_spin_lock_irqsave(&_global_env.lock, flags);
-//printk(KERN_ALERT "admit() hold lock\n");		
+
 		state = local_cpu_state();
 		
 		raw_spin_lock(&state->lock);
@@ -954,7 +887,6 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		/* found the appropriate reservation (or vCPU) */
 		if (res) {
-			TRACE_TASK(tsk, "GMP FOUND RES ID\n");
 			tinfo->mc2_param.crit = mp->crit;
 			tinfo->mc2_param.res_id = mp->res_id;
 			
@@ -970,7 +902,6 @@ static long mc2_admit_task(struct task_struct *tsk)
 
 		raw_spin_unlock(&state->lock);
 		raw_spin_unlock_irqrestore(&_global_env.lock, flags);
-//printk(KERN_ALERT "admit() release lock\n");		
 	}
 	
 	preempt_enable();
@@ -1006,7 +937,7 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 	/* acquire the lock protecting the state and disable interrupts */
 	raw_spin_lock(&_global_env.lock);
 	raw_spin_lock(&state->lock);
-//printk(KERN_ALERT "new() hold lock R%d\n", tinfo->mc2_param.res_id);	
+
 	if (is_running) {
 		state->scheduled = tsk;
 		/* make sure this task should actually be running */
@@ -1023,14 +954,10 @@ static void mc2_task_new(struct task_struct *tsk, int on_runqueue,
 		mc2_update_ghost_state(state);
 		task_arrives(state, tsk);
 		/* NOTE: drops state->lock */
-		TRACE("mc2_new()\n");
-		
 		mc2_update_timer_and_unlock(state);
-//printk(KERN_ALERT "new() dropped lock R%d\n",tinfo->mc2_param.res_id);		
 	} else {
 		raw_spin_unlock(&state->lock);
 		raw_spin_unlock(&_global_env.lock);
-//printk(KERN_ALERT "new() release lock R%d\n",tinfo->mc2_param.res_id);		
 	}
 	local_irq_restore(flags);
 	
@@ -1051,7 +978,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 	struct reservation *res = NULL, *next;
 	struct sup_reservation_environment *sup_env;
 	int found = 0;
-	//enum crit_level lv = get_task_crit_level(current);
 	unsigned long flags;
 	
 	if (cpu == -1) {
@@ -1063,7 +989,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		
 		list_for_each_entry_safe(res, next, &_global_env.depleted_reservations, list) {
 			if (res->id == reservation_id) {
-				TRACE("DESTROY RES FOUND!!!\n");
 				list_del(&res->list);
 				kfree(res);
 				found = 1;
@@ -1073,7 +998,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &_global_env.inactive_reservations, list) {
 				if (res->id == reservation_id) {
-					TRACE("DESTROY RES FOUND!!!\n");
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1084,7 +1008,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &_global_env.active_reservations, list) {
 				if (res->id == reservation_id) {
-					TRACE("DESTROY RES FOUND!!!\n");
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1101,17 +1024,9 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		state = cpu_state_for(cpu);
 		raw_spin_lock_irqsave(&state->lock, flags);
 		
-	//	res = sup_find_by_id(&state->sup_env, reservation_id);
 		sup_env = &state->sup_env;
 		list_for_each_entry_safe(res, next, &sup_env->depleted_reservations, list) {
 			if (res->id == reservation_id) {
-/*
-			if (lv == CRIT_LEVEL_A) {
-					struct table_driven_reservation *tdres;
-					tdres = container_of(res, struct table_driven_reservation, res);
-					kfree(tdres->intervals);
-			}
-*/
 				list_del(&res->list);
 				kfree(res);
 				found = 1;
@@ -1121,12 +1036,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &sup_env->inactive_reservations, list) {
 				if (res->id == reservation_id) {
-/*					if (lv == CRIT_LEVEL_A) {
-						struct table_driven_reservation *tdres;
-						tdres = container_of(res, struct table_driven_reservation, res);
-						kfree(tdres->intervals);
-					}
-*/
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1137,12 +1046,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		if (!found) {
 			list_for_each_entry_safe(res, next, &sup_env->active_reservations, list) {
 				if (res->id == reservation_id) {
-/*					if (lv == CRIT_LEVEL_A) {
-						struct table_driven_reservation *tdres;
-						tdres = container_of(res, struct table_driven_reservation, res);
-						kfree(tdres->intervals);
-					}
-*/
 					list_del(&res->list);
 					kfree(res);
 					found = 1;
@@ -1154,7 +1057,6 @@ static long mc2_reservation_destroy(unsigned int reservation_id, int cpu)
 		raw_spin_unlock_irqrestore(&state->lock, flags);
 	}
 	
-	TRACE("RESERVATION_DESTROY ret = %d\n", ret);
 	return ret;
 }
 
@@ -1196,8 +1098,6 @@ static void mc2_task_exit(struct task_struct *tsk)
 		task_departs(tsk, 0);
 		
 		/* NOTE: drops state->lock */
-		TRACE("mc2_exit()\n");
-
 		mc2_update_timer_and_unlock(state);	
 	} else {
 		raw_spin_unlock(&state->lock);
@@ -1547,7 +1447,6 @@ static void mc2_finish_switch(struct task_struct *prev)
 	struct mc2_cpu_state *state = local_cpu_state();
 	
 	state->scheduled = is_realtime(current) ? current : NULL;
-	//TRACE("FINISH CXS! from %s/%d to %s/%d\n", prev ? (prev)->comm : "null", prev ? (prev)->pid : 0, current ? (current)->comm : "null", current ? (current)->pid : 0);
 }
 
 static long mc2_deactivate_plugin(void)
@@ -1606,7 +1505,6 @@ static long mc2_deactivate_plugin(void)
 
 	
 	while (!list_empty(&_global_env.active_reservations)) {
-		TRACE("RES FOUND!!!\n");
 		res = list_first_entry(
 			&_global_env.active_reservations,
 				struct reservation, list);
@@ -1615,7 +1513,6 @@ static long mc2_deactivate_plugin(void)
 	}
 
 	while (!list_empty(&_global_env.inactive_reservations)) {
-		TRACE("RES FOUND!!!\n");
 		res = list_first_entry(
 			&_global_env.inactive_reservations,
 				struct reservation, list);
@@ -1624,7 +1521,6 @@ static long mc2_deactivate_plugin(void)
 	}
 
 	while (!list_empty(&_global_env.depleted_reservations)) {
-		TRACE("RES FOUND!!!\n");
 		res = list_first_entry(
 			&_global_env.depleted_reservations,
 				struct reservation, list);
@@ -1633,7 +1529,6 @@ static long mc2_deactivate_plugin(void)
 	}
 	
 	while (!list_empty(&_global_env.next_events)) {
-		TRACE("EVENT FOUND!!!\n");
 		event = list_first_entry(
 			&_global_env.next_events,
 				struct next_timer_event, list);
diff --git a/litmus/uncachedev.c b/litmus/uncachedev.c
index 06a6a7c..cf8217ee 100644
--- a/litmus/uncachedev.c
+++ b/litmus/uncachedev.c
@@ -54,8 +54,8 @@ static int litmus_uncache_mmap(struct file* filp, struct vm_area_struct* vma)
 		return -EINVAL;
 
 	/* you can't share it with anyone */
-	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
-		return -EINVAL;
+	/* FIXME: check disabled -- shared mappings are no longer rejected:
+	 * if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) return -EINVAL; */
 
 	/* cannot be expanded, and is not a "normal" page. */
 	vma->vm_flags |= VM_DONTEXPAND;
-- 
1.8.1.2