Attachment 'litmus-rt-2012.1.patch'


   1  Makefile                                    |    4 +-
   2  arch/arm/Kconfig                            |    8 +
   3  arch/arm/include/asm/timex.h                |    2 +
   4  arch/arm/include/asm/unistd.h               |    3 +
   5  arch/arm/kernel/calls.S                     |   12 +
   6  arch/arm/kernel/smp.c                       |    4 +
   7  arch/arm/mach-realview/include/mach/timex.h |   27 +
   8  arch/x86/Kconfig                            |    8 +
   9  arch/x86/include/asm/entry_arch.h           |    1 +
  10  arch/x86/include/asm/feather_trace.h        |   17 +
  11  arch/x86/include/asm/feather_trace_32.h     |   79 ++
  12  arch/x86/include/asm/feather_trace_64.h     |   67 ++
  13  arch/x86/include/asm/hw_irq.h               |    3 +
  14  arch/x86/include/asm/irq_vectors.h          |    7 +
  15  arch/x86/include/asm/processor.h            |    4 +
  16  arch/x86/include/asm/unistd_32.h            |    6 +-
  17  arch/x86/include/asm/unistd_64.h            |    4 +
  18  arch/x86/kernel/Makefile                    |    2 +
  19  arch/x86/kernel/cpu/intel_cacheinfo.c       |   17 +
  20  arch/x86/kernel/entry_64.S                  |    2 +
  21  arch/x86/kernel/ft_event.c                  |  118 +++
  22  arch/x86/kernel/irqinit.c                   |    3 +
  23  arch/x86/kernel/smp.c                       |   31 +
  24  arch/x86/kernel/syscall_table_32.S          |   12 +
  25  fs/exec.c                                   |   13 +-
  26  fs/inode.c                                  |    2 +
  27  include/linux/completion.h                  |    1 +
  28  include/linux/fs.h                          |   21 +-
  29  include/linux/hardirq.h                     |    4 +
  30  include/linux/hrtimer.h                     |   32 +
  31  include/linux/sched.h                       |   19 +-
  32  include/linux/smp.h                         |    5 +
  33  include/linux/tick.h                        |    5 +
  34  include/litmus/affinity.h                   |   80 ++
  35  include/litmus/bheap.h                      |   77 ++
  36  include/litmus/budget.h                     |    8 +
  37  include/litmus/clustered.h                  |   44 ++
  38  include/litmus/debug_trace.h                |   37 +
  39  include/litmus/edf_common.h                 |   25 +
  40  include/litmus/fdso.h                       |   71 ++
  41  include/litmus/feather_buffer.h             |   94 +++
  42  include/litmus/feather_trace.h              |   65 ++
  43  include/litmus/ftdev.h                      |   55 ++
  44  include/litmus/jobs.h                       |    9 +
  45  include/litmus/litmus.h                     |  275 +++++++
  46  include/litmus/litmus_proc.h                |   25 +
  47  include/litmus/locking.h                    |   28 +
  48  include/litmus/preempt.h                    |  164 ++++
  49  include/litmus/rt_domain.h                  |  182 +++++
  50  include/litmus/rt_param.h                   |  209 ++++++
  51  include/litmus/sched_plugin.h               |  111 +++
  52  include/litmus/sched_trace.h                |  200 +++++
  53  include/litmus/srp.h                        |   28 +
  54  include/litmus/trace.h                      |  116 +++
  55  include/litmus/trace_irq.h                  |   21 +
  56  include/litmus/unistd_32.h                  |   21 +
  57  include/litmus/unistd_64.h                  |   33 +
  58  kernel/exit.c                               |    4 +
  59  kernel/fork.c                               |    7 +
  60  kernel/hrtimer.c                            |   95 +++
  61  kernel/printk.c                             |   14 +-
  62  kernel/sched.c                              |  137 ++++-
  63  kernel/sched_fair.c                         |    3 +
  64  kernel/sched_rt.c                           |    2 +-
  65  kernel/time/tick-sched.c                    |   47 ++
  66  litmus/Kconfig                              |  218 ++++++
  67  litmus/Makefile                             |   29 +
  68  litmus/affinity.c                           |   42 ++
  69  litmus/bheap.c                              |  314 ++++++++
  70  litmus/budget.c                             |  111 +++
  71  litmus/clustered.c                          |  111 +++
  72  litmus/ctrldev.c                            |  150 ++++
  73  litmus/edf_common.c                         |  118 +++
  74  litmus/fdso.c                               |  293 ++++++++
  75  litmus/ft_event.c                           |   43 ++
  76  litmus/ftdev.c                              |  439 +++++++++++
  77  litmus/jobs.c                               |   43 ++
  78  litmus/litmus.c                             |  564 ++++++++++++++
  79  litmus/litmus_proc.c                        |  347 +++++++++
  80  litmus/locking.c                            |  139 ++++
  81  litmus/preempt.c                            |  133 ++++
  82  litmus/rt_domain.c                          |  357 +++++++++
  83  litmus/sched_cedf.c                         |  863 ++++++++++++++++++++++
  84  litmus/sched_gsn_edf.c                      | 1030 ++++++++++++++++++++++++++
  85  litmus/sched_litmus.c                       |  325 ++++++++
  86  litmus/sched_pfair.c                        | 1067 +++++++++++++++++++++++++++
  87  litmus/sched_plugin.c                       |  227 ++++++
  88  litmus/sched_psn_edf.c                      |  645 ++++++++++++++++
  89  litmus/sched_task_trace.c                   |  241 ++++++
  90  litmus/sched_trace.c                        |  252 +++++++
  91  litmus/srp.c                                |  295 ++++++++
  92  litmus/sync.c                               |  104 +++
  93  litmus/trace.c                              |  225 ++++++
  94  93 files changed, 11521 insertions(+), 34 deletions(-)
  95 
  96 diff --git a/Makefile b/Makefile
  97 index 6a5bdad..a327725 100644
  98 --- a/Makefile
  99 +++ b/Makefile
 100 @@ -1,7 +1,7 @@
 101  VERSION = 3
 102  PATCHLEVEL = 0
 103  SUBLEVEL = 0
 104 -EXTRAVERSION =
 105 +EXTRAVERSION =-litmus
 106  NAME = Sneaky Weasel
 107  
 108  # *DOCUMENTATION*
 109 @@ -708,7 +708,7 @@ export mod_strip_cmd
 110  
 111  
 112  ifeq ($(KBUILD_EXTMOD),)
 113 -core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/
 114 +core-y		+= kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
 115  
 116  vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 117  		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
 118 diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
 119 index 9adc278..fb228ea 100644
 120 --- a/arch/arm/Kconfig
 121 +++ b/arch/arm/Kconfig
 122 @@ -2040,3 +2040,11 @@ source "security/Kconfig"
 123  source "crypto/Kconfig"
 124  
 125  source "lib/Kconfig"
 126 +
 127 +config ARCH_HAS_SEND_PULL_TIMERS
 128 +	def_bool n
 129 +
 130 +config ARCH_HAS_FEATHER_TRACE
 131 +	def_bool n
 132 +
 133 +source "litmus/Kconfig"
 134 diff --git a/arch/arm/include/asm/timex.h b/arch/arm/include/asm/timex.h
 135 index 3be8de3..8a102a3 100644
 136 --- a/arch/arm/include/asm/timex.h
 137 +++ b/arch/arm/include/asm/timex.h
 138 @@ -16,9 +16,11 @@
 139  
 140  typedef unsigned long cycles_t;
 141  
 142 +#ifndef get_cycles
 143  static inline cycles_t get_cycles (void)
 144  {
 145  	return 0;
 146  }
 147 +#endif
 148  
 149  #endif
 150 diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
 151 index 2c04ed5..0196edf 100644
 152 --- a/arch/arm/include/asm/unistd.h
 153 +++ b/arch/arm/include/asm/unistd.h
 154 @@ -403,6 +403,9 @@
 155  #define __NR_sendmmsg			(__NR_SYSCALL_BASE+374)
 156  #define __NR_setns			(__NR_SYSCALL_BASE+375)
 157  
 158 +#define __NR_LITMUS (__NR_SYSCALL_BASE+376)
 159 +#include <litmus/unistd_32.h>
 160 +
 161  /*
 162   * The following SWIs are ARM private.
 163   */
 164 diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
 165 index 80f7896..ed2ae93 100644
 166 --- a/arch/arm/kernel/calls.S
 167 +++ b/arch/arm/kernel/calls.S
 168 @@ -385,6 +385,18 @@
 169  		CALL(sys_syncfs)
 170  		CALL(sys_sendmmsg)
 171  /* 375 */	CALL(sys_setns)
 172 +		CALL(sys_set_rt_task_param)
 173 +		CALL(sys_get_rt_task_param)
 174 +		CALL(sys_complete_job)
 175 +		CALL(sys_od_open)
 176 +/* 380 */	CALL(sys_od_close)
 177 +		CALL(sys_litmus_lock)
 178 +		CALL(sys_litmus_unlock)
 179 +		CALL(sys_query_job_no)
 180 +		CALL(sys_wait_for_job_release)
 181 +/* 385 */	CALL(sys_wait_for_ts_release)
 182 +		CALL(sys_release_ts)
 183 +		CALL(sys_null_call)
 184  #ifndef syscalls_counted
 185  .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 186  #define syscalls_counted
 187 diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
 188 index e7f92a4..5a57429 100644
 189 --- a/arch/arm/kernel/smp.c
 190 +++ b/arch/arm/kernel/smp.c
 191 @@ -40,6 +40,8 @@
 192  #include <asm/ptrace.h>
 193  #include <asm/localtimer.h>
 194  
 195 +#include <litmus/preempt.h>
 196 +
 197  /*
 198   * as from 2.5, kernels no longer have an init_tasks structure
 199   * so we need some other way of telling a new secondary core
 200 @@ -572,6 +574,8 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
 201  		break;
 202  
 203  	case IPI_RESCHEDULE:
 204 +		/* LITMUS^RT: take action based on scheduler state */
 205 +		sched_state_ipi();
 206  		scheduler_ipi();
 207  		break;
 208  
 209 diff --git a/arch/arm/mach-realview/include/mach/timex.h b/arch/arm/mach-realview/include/mach/timex.h
 210 index 4eeb069..e8bcc40 100644
 211 --- a/arch/arm/mach-realview/include/mach/timex.h
 212 +++ b/arch/arm/mach-realview/include/mach/timex.h
 213 @@ -21,3 +21,30 @@
 214   */
 215  
 216  #define CLOCK_TICK_RATE		(50000000 / 16)
 217 +
 218 +#if defined(CONFIG_MACH_REALVIEW_PB11MP) || defined(CONFIG_MACH_REALVIEW_PB1176)
 219 +
 220 +static inline unsigned long realview_get_arm11_cp15_ccnt(void)
 221 +{
 222 +	unsigned long cycles;
 223 +	/* Read CP15 CCNT register. */
 224 +	asm volatile ("mrc p15, 0, %0, c15, c12, 1" : "=r" (cycles));
 225 +	return cycles;
 226 +}
 227 +
 228 +#define get_cycles realview_get_arm11_cp15_ccnt
 229 +
 230 +#elif defined(CONFIG_MACH_REALVIEW_PBA8)
 231 +
 232 +
 233 +static inline unsigned long realview_get_a8_cp15_ccnt(void)
 234 +{
 235 +	unsigned long cycles;
 236 +	/* Read CP15 CCNT register. */
 237 +	asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
 238 +	return cycles;
 239 +}
 240 +
 241 +#define get_cycles realview_get_a8_cp15_ccnt
 242 +
 243 +#endif
 244 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
 245 index 37357a5..9f5e143 100644
 246 --- a/arch/x86/Kconfig
 247 +++ b/arch/x86/Kconfig
 248 @@ -2166,3 +2166,11 @@ source "crypto/Kconfig"
 249  source "arch/x86/kvm/Kconfig"
 250  
 251  source "lib/Kconfig"
 252 +
 253 +config ARCH_HAS_FEATHER_TRACE
 254 +	def_bool y
 255 +
 256 +config ARCH_HAS_SEND_PULL_TIMERS
 257 +	def_bool y
 258 +
 259 +source "litmus/Kconfig"
 260 diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
 261 index 1cd6d26..3b0d7ef 100644
 262 --- a/arch/x86/include/asm/entry_arch.h
 263 +++ b/arch/x86/include/asm/entry_arch.h
 264 @@ -13,6 +13,7 @@
 265  BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
 266  BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
 267  BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
 268 +BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR)
 269  BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
 270  BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 271  
 272 diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h
 273 new file mode 100644
 274 index 0000000..4fd3163
 275 --- /dev/null
 276 +++ b/arch/x86/include/asm/feather_trace.h
 277 @@ -0,0 +1,17 @@
 278 +#ifndef _ARCH_FEATHER_TRACE_H
 279 +#define _ARCH_FEATHER_TRACE_H
 280 +
 281 +#include <asm/msr.h>
 282 +
 283 +static inline unsigned long long ft_timestamp(void)
 284 +{
 285 +	return __native_read_tsc();
 286 +}
 287 +
 288 +#ifdef CONFIG_X86_32
 289 +#include "feather_trace_32.h"
 290 +#else
 291 +#include "feather_trace_64.h"
 292 +#endif
 293 +
 294 +#endif
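
For illustration, a minimal sketch of measuring an interval in TSC cycles with the ft_timestamp() helper defined above (the wrapper function is hypothetical, not part of the patch):

#include <litmus/feather_trace.h>

/* hypothetical: time a function and return the elapsed cycle count */
static unsigned long long measure_cycles(void (*fn)(void))
{
	unsigned long long start = ft_timestamp();

	fn();
	return ft_timestamp() - start;
}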
 295 diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h
 296 new file mode 100644
 297 index 0000000..70202f9
 298 --- /dev/null
 299 +++ b/arch/x86/include/asm/feather_trace_32.h
 300 @@ -0,0 +1,79 @@
 301 +/* Do not directly include this file. Include feather_trace.h instead */
 302 +
 303 +#define feather_callback __attribute__((regparm(0)))
 304 +
 305 +/*
 306 + * make the compiler reload any register that is not saved in
 307 + * a cdecl function call
 308 + */
 309 +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
 310 +
 311 +#define ft_event(id, callback)                                  \
 312 +        __asm__ __volatile__(                                   \
 313 +            "1: jmp 2f                                    \n\t" \
 314 +	    " call " #callback "                          \n\t" \
 315 +            ".section __event_table, \"aw\"               \n\t" \
 316 +            ".long " #id  ", 0, 1b, 2f                    \n\t" \
 317 +            ".previous                                    \n\t" \
 318 +            "2:                                           \n\t" \
 319 +        : : : CLOBBER_LIST)
 320 +
 321 +#define ft_event0(id, callback)                                 \
 322 +        __asm__ __volatile__(                                   \
 323 +            "1: jmp 2f                                    \n\t" \
 324 +	    " subl $4, %%esp                              \n\t" \
 325 +            " movl $" #id  ", (%%esp)                     \n\t" \
 326 +	    " call " #callback "                          \n\t" \
 327 +	    " addl $4, %%esp                              \n\t" \
 328 +            ".section __event_table, \"aw\"               \n\t" \
 329 +            ".long " #id  ", 0, 1b, 2f                    \n\t" \
 330 +            ".previous                                    \n\t" \
 331 +            "2:                                           \n\t" \
 332 +        : :  : CLOBBER_LIST)
 333 +
 334 +#define ft_event1(id, callback, param)                          \
 335 +        __asm__ __volatile__(                                   \
 336 +            "1: jmp 2f                                    \n\t" \
 337 +	    " subl $8, %%esp                              \n\t" \
 338 +	    " movl %0, 4(%%esp)                           \n\t" \
 339 +            " movl $" #id  ", (%%esp)                     \n\t" \
 340 +	    " call " #callback "                          \n\t" \
 341 +	    " addl $8, %%esp                              \n\t" \
 342 +            ".section __event_table, \"aw\"               \n\t" \
 343 +            ".long " #id  ", 0, 1b, 2f                    \n\t" \
 344 +            ".previous                                    \n\t" \
 345 +            "2:                                           \n\t" \
 346 +        : : "r" (param)  : CLOBBER_LIST)
 347 +
 348 +#define ft_event2(id, callback, param, param2)                  \
 349 +        __asm__ __volatile__(                                   \
 350 +            "1: jmp 2f                                    \n\t" \
 351 +	    " subl $12, %%esp                             \n\t" \
 352 +	    " movl %1, 8(%%esp)                           \n\t" \
 353 +	    " movl %0, 4(%%esp)                           \n\t" \
 354 +            " movl $" #id  ", (%%esp)                     \n\t" \
 355 +	    " call " #callback "                          \n\t" \
 356 +	    " addl $12, %%esp                             \n\t" \
 357 +            ".section __event_table, \"aw\"               \n\t" \
 358 +            ".long " #id  ", 0, 1b, 2f                    \n\t" \
 359 +            ".previous                                    \n\t" \
 360 +            "2:                                           \n\t" \
 361 +        : : "r" (param), "r" (param2)  : CLOBBER_LIST)
 362 +
 363 +
 364 +#define ft_event3(id, callback, p, p2, p3)                      \
 365 +        __asm__ __volatile__(                                   \
 366 +            "1: jmp 2f                                    \n\t" \
 367 +	    " subl $16, %%esp                             \n\t" \
 368 +	    " movl %2, 12(%%esp)                          \n\t" \
 369 +	    " movl %1, 8(%%esp)                           \n\t" \
 370 +	    " movl %0, 4(%%esp)                           \n\t" \
 371 +            " movl $" #id  ", (%%esp)                     \n\t" \
 372 +	    " call " #callback "                          \n\t" \
 373 +	    " addl $16, %%esp                             \n\t" \
 374 +            ".section __event_table, \"aw\"               \n\t" \
 375 +            ".long " #id  ", 0, 1b, 2f                    \n\t" \
 376 +            ".previous                                    \n\t" \
 377 +            "2:                                           \n\t" \
 378 +        : : "r" (p), "r" (p2), "r" (p3)  : CLOBBER_LIST)
 379 +
 380 diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h
 381 new file mode 100644
 382 index 0000000..54ac2ae
 383 --- /dev/null
 384 +++ b/arch/x86/include/asm/feather_trace_64.h
 385 @@ -0,0 +1,67 @@
 386 +/* Do not directly include this file. Include feather_trace.h instead */
 387 +
 388 +/* regparm is the default on x86_64 */
 389 +#define feather_callback
 390 +
 391 +# define _EVENT_TABLE(id,from,to) \
 392 +            ".section __event_table, \"aw\"\n\t" \
 393 +	    ".balign 8\n\t" \
 394 +            ".quad " #id  ", 0, " #from ", " #to " \n\t" \
 395 +            ".previous \n\t"
 396 +
 397 +/*
 398 + * x86_64 callee only owns rbp, rbx, r12 -> r15
 399 + * the called function can freely modify the others
 400 + */
 401 +#define CLOBBER_LIST	"memory", "cc", "rdi", "rsi", "rdx", "rcx", \
 402 +			"r8", "r9", "r10", "r11", "rax"
 403 +
 404 +#define ft_event(id, callback)                                  \
 405 +        __asm__ __volatile__(                                   \
 406 +            "1: jmp 2f                                    \n\t" \
 407 +	    " call " #callback "                          \n\t" \
 408 +            _EVENT_TABLE(id,1b,2f) \
 409 +            "2:                                           \n\t" \
 410 +        : : : CLOBBER_LIST)
 411 +
 412 +#define ft_event0(id, callback)                                 \
 413 +        __asm__ __volatile__(                                   \
 414 +            "1: jmp 2f                                    \n\t" \
 415 +	    " movq $" #id ", %%rdi			  \n\t" \
 416 +	    " call " #callback "                          \n\t" \
 417 +	    _EVENT_TABLE(id,1b,2f) \
 418 +            "2:                                           \n\t" \
 419 +        : :  : CLOBBER_LIST)
 420 +
 421 +#define ft_event1(id, callback, param)                          \
 422 +	__asm__ __volatile__(                                   \
 423 +	    "1: jmp 2f                                    \n\t" \
 424 +	    " movq %0, %%rsi				  \n\t"	\
 425 +	    " movq $" #id ", %%rdi			  \n\t" \
 426 +	    " call " #callback "                          \n\t" \
 427 +	    _EVENT_TABLE(id,1b,2f) \
 428 +	    "2:                                           \n\t" \
 429 +	: : "r" (param)  : CLOBBER_LIST)
 430 +
 431 +#define ft_event2(id, callback, param, param2)                  \
 432 +        __asm__ __volatile__(                                   \
 433 +            "1: jmp 2f                                    \n\t" \
 434 +	    " movq %1, %%rdx				  \n\t"	\
 435 +	    " movq %0, %%rsi				  \n\t"	\
 436 +	    " movq $" #id ", %%rdi			  \n\t" \
 437 +	    " call " #callback "                          \n\t" \
 438 +            _EVENT_TABLE(id,1b,2f) \
 439 +            "2:                                           \n\t" \
 440 +        : : "r" (param), "r" (param2)  : CLOBBER_LIST)
 441 +
 442 +#define ft_event3(id, callback, p, p2, p3)                      \
 443 +        __asm__ __volatile__(                                   \
 444 +            "1: jmp 2f                                    \n\t" \
 445 +	    " movq %2, %%rcx				  \n\t"	\
 446 +	    " movq %1, %%rdx				  \n\t"	\
 447 +	    " movq %0, %%rsi				  \n\t"	\
 448 +	    " movq $" #id ", %%rdi			  \n\t" \
 449 +	    " call " #callback "                          \n\t" \
 450 +            _EVENT_TABLE(id,1b,2f) \
 451 +            "2:                                           \n\t" \
 452 +        : : "r" (p), "r" (p2), "r" (p3)  : CLOBBER_LIST)
 453 diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
 454 index bb9efe8..c490d89 100644
 455 --- a/arch/x86/include/asm/hw_irq.h
 456 +++ b/arch/x86/include/asm/hw_irq.h
 457 @@ -77,6 +77,8 @@ extern void threshold_interrupt(void);
 458  extern void call_function_interrupt(void);
 459  extern void call_function_single_interrupt(void);
 460  
 461 +extern void pull_timers_interrupt(void);
 462 +
 463  /* IOAPIC */
 464  #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
 465  extern unsigned long io_apic_irqs;
 466 @@ -155,6 +157,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 467  extern void smp_reschedule_interrupt(struct pt_regs *);
 468  extern void smp_call_function_interrupt(struct pt_regs *);
 469  extern void smp_call_function_single_interrupt(struct pt_regs *);
 470 +extern void smp_pull_timers_interrupt(struct pt_regs *);
 471  #ifdef CONFIG_X86_32
 472  extern void smp_invalidate_interrupt(struct pt_regs *);
 473  #else
 474 diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
 475 index 6e976ee..99a44cf 100644
 476 --- a/arch/x86/include/asm/irq_vectors.h
 477 +++ b/arch/x86/include/asm/irq_vectors.h
 478 @@ -135,6 +135,13 @@
 479  #define INVALIDATE_TLB_VECTOR_START	\
 480  	(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
 481  
 482 +/*
 483 + * LITMUS^RT pull timers IRQ vector
 484 + * Make sure it's below the above max 32 vectors.
 485 + */
 486 +#define PULL_TIMERS_VECTOR		0xce
 487 +
 488 +
 489  #define NR_VECTORS			 256
 490  
 491  #define FPU_IRQ				  13
 492 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
 493 index 2193715..b844edc 100644
 494 --- a/arch/x86/include/asm/processor.h
 495 +++ b/arch/x86/include/asm/processor.h
 496 @@ -166,6 +166,10 @@ extern void print_cpu_info(struct cpuinfo_x86 *);
 497  extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 498  extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 499  extern unsigned short num_cache_leaves;
 500 +#ifdef CONFIG_SYSFS
 501 +extern int get_shared_cpu_map(cpumask_var_t mask,
 502 +			       unsigned int cpu, int index);
 503 +#endif
 504  
 505  extern void detect_extended_topology(struct cpuinfo_x86 *c);
 506  extern void detect_ht(struct cpuinfo_x86 *c);
 507 diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
 508 index 593485b..2f6e127 100644
 509 --- a/arch/x86/include/asm/unistd_32.h
 510 +++ b/arch/x86/include/asm/unistd_32.h
 511 @@ -353,9 +353,13 @@
 512  #define __NR_sendmmsg		345
 513  #define __NR_setns		346
 514  
 515 +#define __NR_LITMUS		347
 516 +
 517 +#include "litmus/unistd_32.h"
 518 +
 519  #ifdef __KERNEL__
 520  
 521 -#define NR_syscalls 347
 522 +#define NR_syscalls 347 + NR_litmus_syscalls
 523  
 524  #define __ARCH_WANT_IPC_PARSE_VERSION
 525  #define __ARCH_WANT_OLD_READDIR
 526 diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
 527 index 705bf13..e347f07 100644
 528 --- a/arch/x86/include/asm/unistd_64.h
 529 +++ b/arch/x86/include/asm/unistd_64.h
 530 @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg)
 531  #define __NR_setns				308
 532  __SYSCALL(__NR_setns, sys_setns)
 533  
 534 +#define __NR_LITMUS				309
 535 +
 536 +#include "litmus/unistd_64.h"
 537 +
 538  #ifndef __NO_STUBS
 539  #define __ARCH_WANT_OLD_READDIR
 540  #define __ARCH_WANT_OLD_STAT
 541 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
 542 index 90b06d4..d727f8f 100644
 543 --- a/arch/x86/kernel/Makefile
 544 +++ b/arch/x86/kernel/Makefile
 545 @@ -116,6 +116,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 546  obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 547  obj-$(CONFIG_OF)			+= devicetree.o
 548  
 549 +obj-$(CONFIG_FEATHER_TRACE)	+= ft_event.o
 550 +
 551  ###
 552  # 64 bit specific files
 553  ifeq ($(CONFIG_X86_64),y)
 554 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
 555 index c105c53..0bf1264 100644
 556 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c
 557 +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
 558 @@ -747,6 +747,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 559  static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
 560  #define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))
 561  
 562 +/* returns the CPUs that share the cache at the given index with cpu */
 563 +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
 564 +{
 565 +	int ret = 0;
 566 +	struct _cpuid4_info *this_leaf;
 567 +
 568 +	if (index >= num_cache_leaves) {
 569 +		index = num_cache_leaves - 1;
 570 +		ret = index;
 571 +	}
 572 +
 573 +	this_leaf = CPUID4_INFO_IDX(cpu,index);
 574 +	cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map));
 575 +
 576 +	return ret;
 577 +}
 578 +
 579  #ifdef CONFIG_SMP
 580  static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 581  {
 582 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
 583 index 8a445a0..47a4bcd 100644
 584 --- a/arch/x86/kernel/entry_64.S
 585 +++ b/arch/x86/kernel/entry_64.S
 586 @@ -1003,6 +1003,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \
 587  	call_function_interrupt smp_call_function_interrupt
 588  apicinterrupt RESCHEDULE_VECTOR \
 589  	reschedule_interrupt smp_reschedule_interrupt
 590 +apicinterrupt PULL_TIMERS_VECTOR \
 591 +	pull_timers_interrupt smp_pull_timers_interrupt
 592  #endif
 593  
 594  apicinterrupt ERROR_APIC_VECTOR \
 595 diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c
 596 new file mode 100644
 597 index 0000000..37cc332
 598 --- /dev/null
 599 +++ b/arch/x86/kernel/ft_event.c
 600 @@ -0,0 +1,118 @@
 601 +#include <linux/types.h>
 602 +
 603 +#include <litmus/feather_trace.h>
 604 +
 605 +/* the feather trace management functions assume
 606 + * exclusive access to the event table
 607 + */
 608 +
 609 +#ifndef CONFIG_DEBUG_RODATA
 610 +
 611 +#define BYTE_JUMP      0xeb
 612 +#define BYTE_JUMP_LEN  0x02
 613 +
 614 +/* for each event, there is an entry in the event table */
 615 +struct trace_event {
 616 +	long 	id;
 617 +	long	count;
 618 +	long	start_addr;
 619 +	long	end_addr;
 620 +};
 621 +
 622 +extern struct trace_event  __start___event_table[];
 623 +extern struct trace_event  __stop___event_table[];
 624 +
 625 +/* Workaround: if no events are defined, then the event_table section does not
 626 + * exist and the above references cause linker errors. This could probably be
 627 + * fixed by adjusting the linker script, but it is easier to maintain for us if
 628 + * we simply create a dummy symbol in the event table section.
 629 + */
 630 +int __event_table_dummy[0] __attribute__ ((section("__event_table")));
 631 +
 632 +int ft_enable_event(unsigned long id)
 633 +{
 634 +	struct trace_event* te = __start___event_table;
 635 +	int count = 0;
 636 +	char* delta;
 637 +	unsigned char* instr;
 638 +
 639 +	while (te < __stop___event_table) {
 640 +		if (te->id == id && ++te->count == 1) {
 641 +			instr  = (unsigned char*) te->start_addr;
 642 +			/* make sure we don't clobber something wrong */
 643 +			if (*instr == BYTE_JUMP) {
 644 +				delta  = (((unsigned char*) te->start_addr) + 1);
 645 +				*delta = 0;
 646 +			}
 647 +		}
 648 +		if (te->id == id)
 649 +			count++;
 650 +		te++;
 651 +	}
 652 +
 653 +	printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count);
 654 +	return count;
 655 +}
 656 +
 657 +int ft_disable_event(unsigned long id)
 658 +{
 659 +	struct trace_event* te = __start___event_table;
 660 +	int count = 0;
 661 +	char* delta;
 662 +	unsigned char* instr;
 663 +
 664 +	while (te < __stop___event_table) {
 665 +		if (te->id == id && --te->count == 0) {
 666 +			instr  = (unsigned char*) te->start_addr;
 667 +			if (*instr == BYTE_JUMP) {
 668 +				delta  = (((unsigned char*) te->start_addr) + 1);
 669 +				*delta = te->end_addr - te->start_addr -
 670 +					BYTE_JUMP_LEN;
 671 +			}
 672 +		}
 673 +		if (te->id == id)
 674 +			count++;
 675 +		te++;
 676 +	}
 677 +
 678 +	printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count);
 679 +	return count;
 680 +}
 681 +
 682 +int ft_disable_all_events(void)
 683 +{
 684 +	struct trace_event* te = __start___event_table;
 685 +	int count = 0;
 686 +	char* delta;
 687 +	unsigned char* instr;
 688 +
 689 +	while (te < __stop___event_table) {
 690 +		if (te->count) {
 691 +			instr  = (unsigned char*) te->start_addr;
 692 +			if (*instr == BYTE_JUMP) {
 693 +				delta  = (((unsigned char*) te->start_addr)
 694 +					  + 1);
 695 +				*delta = te->end_addr - te->start_addr -
 696 +					BYTE_JUMP_LEN;
 697 +				te->count = 0;
 698 +				count++;
 699 +			}
 700 +		}
 701 +		te++;
 702 +	}
 703 +	return count;
 704 +}
 705 +
 706 +int ft_is_event_enabled(unsigned long id)
 707 +{
 708 +	struct trace_event* te = __start___event_table;
 709 +
 710 +	while (te < __stop___event_table) {
 711 +		if (te->id == id)
 712 +			return te->count;
 713 +		te++;
 714 +	}
 715 +	return 0;
 716 +}
 717 +
 718 +#endif
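
For illustration, a short hedged sketch of driving the patching functions above; the event id is hypothetical, and in the patch these functions are presumably invoked from the Feather-Trace device code (litmus/ftdev.c, litmus/trace.c) rather than called directly:

#include <linux/kernel.h>
#include <litmus/feather_trace.h>

/* hypothetical: enable an event, report how many trace points carry its id,
 * then disable it again */
static void toggle_event(unsigned long id)
{
	int points = ft_enable_event(id);

	if (!ft_is_event_enabled(id))
		printk(KERN_INFO "no trace points carry id %lu\n", id);
	else
		printk(KERN_INFO "enabled %d trace points for id %lu\n",
		       points, id);
	ft_disable_event(id);
}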
 719 diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
 720 index f470e4e..48acf71 100644
 721 --- a/arch/x86/kernel/irqinit.c
 722 +++ b/arch/x86/kernel/irqinit.c
 723 @@ -252,6 +252,9 @@ static void __init smp_intr_init(void)
 724  	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
 725  			call_function_single_interrupt);
 726  
 727 +	/* IPI for hrtimer pulling on remote cpus */
 728 +	alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt);
 729 +
 730  	/* Low priority IPI to cleanup after moving an irq */
 731  	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 732  	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 733 diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
 734 index 013e7eb..ed4c4f5 100644
 735 --- a/arch/x86/kernel/smp.c
 736 +++ b/arch/x86/kernel/smp.c
 737 @@ -23,6 +23,10 @@
 738  #include <linux/cpu.h>
 739  #include <linux/gfp.h>
 740  
 741 +#include <litmus/preempt.h>
 742 +#include <litmus/debug_trace.h>
 743 +#include <litmus/trace.h>
 744 +
 745  #include <asm/mtrr.h>
 746  #include <asm/tlbflush.h>
 747  #include <asm/mmu_context.h>
 748 @@ -118,6 +122,7 @@ static void native_smp_send_reschedule(int cpu)
 749  		WARN_ON(1);
 750  		return;
 751  	}
 752 +	TS_SEND_RESCHED_START(cpu);
 753  	apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
 754  }
 755  
 756 @@ -147,6 +152,16 @@ void native_send_call_func_ipi(const struct cpumask *mask)
 757  	free_cpumask_var(allbutself);
 758  }
 759  
 760 +/* trigger timers on remote cpu */
 761 +void smp_send_pull_timers(int cpu)
 762 +{
 763 +	if (unlikely(cpu_is_offline(cpu))) {
 764 +		WARN_ON(1);
 765 +		return;
 766 +	}
 767 +	apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR);
 768 +}
 769 +
 770  /*
 771   * this function calls the 'stop' function on all other CPUs in the system.
 772   */
 773 @@ -199,8 +214,15 @@ static void native_stop_other_cpus(int wait)
 774  void smp_reschedule_interrupt(struct pt_regs *regs)
 775  {
 776  	ack_APIC_irq();
 777 +	/* LITMUS^RT: this IPI might need to trigger the sched state machine. */
 778 +	sched_state_ipi();
 779  	inc_irq_stat(irq_resched_count);
 780 +	/*
 781 +	 * LITMUS^RT: starting from 3.0, scheduler_ipi() actually does something.
 782 +	 * This may increase IPI latencies compared with previous versions.
 783 +	 */
 784  	scheduler_ipi();
 785 +	TS_SEND_RESCHED_END;
 786  	/*
 787  	 * KVM uses this interrupt to force a cpu out of guest mode
 788  	 */
 789 @@ -224,6 +246,15 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 790  	irq_exit();
 791  }
 792  
 793 +extern void hrtimer_pull(void);
 794 +
 795 +void smp_pull_timers_interrupt(struct pt_regs *regs)
 796 +{
 797 +	ack_APIC_irq();
 798 +	TRACE("pull timer interrupt\n");
 799 +	hrtimer_pull();
 800 +}
 801 +
 802  struct smp_ops smp_ops = {
 803  	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
 804  	.smp_prepare_cpus	= native_smp_prepare_cpus,
 805 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
 806 index fbb0a04..d012622 100644
 807 --- a/arch/x86/kernel/syscall_table_32.S
 808 +++ b/arch/x86/kernel/syscall_table_32.S
 809 @@ -346,3 +346,15 @@ ENTRY(sys_call_table)
 810  	.long sys_syncfs
 811  	.long sys_sendmmsg		/* 345 */
 812  	.long sys_setns
 813 +	.long sys_set_rt_task_param	/* LITMUS^RT 347 */
 814 +	.long sys_get_rt_task_param
 815 +	.long sys_complete_job
 816 +	.long sys_od_open
 817 +	.long sys_od_close
 818 +	.long sys_litmus_lock		/* +5 */
 819 +	.long sys_litmus_unlock
 820 +	.long sys_query_job_no
 821 +	.long sys_wait_for_job_release
 822 +	.long sys_wait_for_ts_release
 823 +	.long sys_release_ts		/* +10 */
 824 +	.long sys_null_call
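
For illustration, a hedged user-space sketch of how the new entries line up on x86-32: sys_null_call is slot 347 + 11 = 358 in the table above. Real applications go through the liblitmus wrappers instead of raw syscall numbers, and the argument convention shown here (a pointer for returning a cycle count, or NULL) is an assumption that is not visible in this patch:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define NR_null_call 358	/* __NR_LITMUS (347) + 11, x86-32 only */

int main(void)
{
	long ret = syscall(NR_null_call, NULL);

	printf("sys_null_call returned %ld\n", ret);
	return 0;
}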
 825 diff --git a/fs/exec.c b/fs/exec.c
 826 index 6075a1e..9984562 100644
 827 --- a/fs/exec.c
 828 +++ b/fs/exec.c
 829 @@ -19,7 +19,7 @@
 830   * current->executable is only used by the procfs.  This allows a dispatch
 831   * table to check for several different types  of binary formats.  We keep
 832   * trying until we recognize the file or we run out of supported binary
 833 - * formats. 
 834 + * formats.
 835   */
 836  
 837  #include <linux/slab.h>
 838 @@ -56,6 +56,8 @@
 839  #include <linux/oom.h>
 840  #include <linux/compat.h>
 841  
 842 +#include <litmus/litmus.h>
 843 +
 844  #include <asm/uaccess.h>
 845  #include <asm/mmu_context.h>
 846  #include <asm/tlb.h>
 847 @@ -85,7 +87,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert)
 848  	insert ? list_add(&fmt->lh, &formats) :
 849  		 list_add_tail(&fmt->lh, &formats);
 850  	write_unlock(&binfmt_lock);
 851 -	return 0;	
 852 +	return 0;
 853  }
 854  
 855  EXPORT_SYMBOL(__register_binfmt);
 856 @@ -1160,7 +1162,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 857  	   group */
 858  
 859  	current->self_exec_id++;
 860 -			
 861 +
 862  	flush_signal_handlers(current, 0);
 863  	flush_old_files(current->files);
 864  }
 865 @@ -1250,8 +1252,8 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 866  	return res;
 867  }
 868  
 869 -/* 
 870 - * Fill the binprm structure from the inode. 
 871 +/*
 872 + * Fill the binprm structure from the inode.
 873   * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
 874   *
 875   * This may be called multiple times for binary chains (scripts for example).
 876 @@ -1459,6 +1461,7 @@ static int do_execve_common(const char *filename,
 877  		goto out_unmark;
 878  
 879  	sched_exec();
 880 +	litmus_exec();
 881  
 882  	bprm->file = file;
 883  	bprm->filename = filename;
 884 diff --git a/fs/inode.c b/fs/inode.c
 885 index 43566d1..dbf0e76 100644
 886 --- a/fs/inode.c
 887 +++ b/fs/inode.c
 888 @@ -308,6 +308,8 @@ void inode_init_once(struct inode *inode)
 889  #ifdef CONFIG_FSNOTIFY
 890  	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
 891  #endif
 892 +	INIT_LIST_HEAD(&inode->i_obj_list);
 893 +	mutex_init(&inode->i_obj_mutex);
 894  }
 895  EXPORT_SYMBOL(inode_init_once);
 896  
 897 diff --git a/include/linux/completion.h b/include/linux/completion.h
 898 index 51494e6..9d72727 100644
 899 --- a/include/linux/completion.h
 900 +++ b/include/linux/completion.h
 901 @@ -90,6 +90,7 @@ extern bool completion_done(struct completion *x);
 902  
 903  extern void complete(struct completion *);
 904  extern void complete_all(struct completion *);
 905 +extern void complete_n(struct completion *, int n);
 906  
 907  /**
 908   * INIT_COMPLETION - reinitialize a completion structure
 909 diff --git a/include/linux/fs.h b/include/linux/fs.h
 910 index b5b9792..8d5834b 100644
 911 --- a/include/linux/fs.h
 912 +++ b/include/linux/fs.h
 913 @@ -17,8 +17,8 @@
 914   * nr_file rlimit, so it's safe to set up a ridiculously high absolute
 915   * upper limit on files-per-process.
 916   *
 917 - * Some programs (notably those using select()) may have to be 
 918 - * recompiled to take full advantage of the new limits..  
 919 + * Some programs (notably those using select()) may have to be
 920 + * recompiled to take full advantage of the new limits..
 921   */
 922  
 923  /* Fixed constants first: */
 924 @@ -172,7 +172,7 @@ struct inodes_stat_t {
 925  #define SEL_EX		4
 926  
 927  /* public flags for file_system_type */
 928 -#define FS_REQUIRES_DEV 1 
 929 +#define FS_REQUIRES_DEV 1
 930  #define FS_BINARY_MOUNTDATA 2
 931  #define FS_HAS_SUBTYPE 4
 932  #define FS_REVAL_DOT	16384	/* Check the paths ".", ".." for staleness */
 933 @@ -480,7 +480,7 @@ struct iattr {
 934   */
 935  #include <linux/quota.h>
 936  
 937 -/** 
 938 +/**
 939   * enum positive_aop_returns - aop return codes with specific semantics
 940   *
 941   * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 942 @@ -490,7 +490,7 @@ struct iattr {
 943   * 			    be a candidate for writeback again in the near
 944   * 			    future.  Other callers must be careful to unlock
 945   * 			    the page if they get this return.  Returned by
 946 - * 			    writepage(); 
 947 + * 			    writepage();
 948   *
 949   * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 950   *  			unlocked it and the page might have been truncated.
 951 @@ -734,6 +734,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
 952  
 953  struct posix_acl;
 954  #define ACL_NOT_CACHED ((void *)(-1))
 955 +struct inode_obj_id_table;
 956  
 957  struct inode {
 958  	/* RCU path lookup touches following: */
 959 @@ -807,6 +808,8 @@ struct inode {
 960  	struct posix_acl	*i_acl;
 961  	struct posix_acl	*i_default_acl;
 962  #endif
 963 +	struct list_head	i_obj_list;
 964 +	struct mutex		i_obj_mutex;
 965  	void			*i_private; /* fs or device private pointer */
 966  };
 967  
 968 @@ -1032,10 +1035,10 @@ static inline int file_check_writeable(struct file *filp)
 969  
 970  #define	MAX_NON_LFS	((1UL<<31) - 1)
 971  
 972 -/* Page cache limit. The filesystems should put that into their s_maxbytes 
 973 -   limits, otherwise bad things can happen in VM. */ 
 974 +/* Page cache limit. The filesystems should put that into their s_maxbytes
 975 +   limits, otherwise bad things can happen in VM. */
 976  #if BITS_PER_LONG==32
 977 -#define MAX_LFS_FILESIZE	(((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) 
 978 +#define MAX_LFS_FILESIZE	(((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
 979  #elif BITS_PER_LONG==64
 980  #define MAX_LFS_FILESIZE 	0x7fffffffffffffffUL
 981  #endif
 982 @@ -2234,7 +2237,7 @@ extern void free_write_pipe(struct file *);
 983  
 984  extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 985  extern struct file * open_exec(const char *);
 986 - 
 987 +
 988  /* fs/dcache.c -- generic fs support functions */
 989  extern int is_subdir(struct dentry *, struct dentry *);
 990  extern int path_is_under(struct path *, struct path *);
 991 diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
 992 index ba36217..e6dd5a4 100644
 993 --- a/include/linux/hardirq.h
 994 +++ b/include/linux/hardirq.h
 995 @@ -6,6 +6,8 @@
 996  #include <linux/ftrace_irq.h>
 997  #include <asm/hardirq.h>
 998  
 999 +#include <litmus/trace_irq.h>
1000 +
1001  /*
1002   * We put the hardirq and softirq counter into the preemption
1003   * counter. The bitmask has the following meaning:
1004 @@ -186,6 +188,7 @@ extern void rcu_nmi_exit(void);
1005  		account_system_vtime(current);		\
1006  		add_preempt_count(HARDIRQ_OFFSET);	\
1007  		trace_hardirq_enter();			\
1008 +		ft_irq_fired();				\
1009  	} while (0)
1010  
1011  /*
1012 @@ -216,6 +219,7 @@ extern void irq_exit(void);
1013  		lockdep_off();					\
1014  		rcu_nmi_enter();				\
1015  		trace_hardirq_enter();				\
1016 +		ft_irq_fired();					\
1017  	} while (0)
1018  
1019  #define nmi_exit()						\
1020 diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
1021 index fd0dc30..d91bba5 100644
1022 --- a/include/linux/hrtimer.h
1023 +++ b/include/linux/hrtimer.h
1024 @@ -174,6 +174,7 @@ enum  hrtimer_base_type {
1025   * @nr_hangs:		Total number of hrtimer interrupt hangs
1026   * @max_hang_time:	Maximum time spent in hrtimer_interrupt
1027   * @clock_base:		array of clock bases for this cpu
1028 + * @to_pull:		LITMUS^RT list of timers to be pulled on this cpu
1029   */
1030  struct hrtimer_cpu_base {
1031  	raw_spinlock_t			lock;
1032 @@ -188,8 +189,32 @@ struct hrtimer_cpu_base {
1033  	ktime_t				max_hang_time;
1034  #endif
1035  	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
1036 +	struct list_head		to_pull;
1037  };
1038  
1039 +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
1040 +
1041 +#define HRTIMER_START_ON_INACTIVE	0
1042 +#define HRTIMER_START_ON_QUEUED		1
1043 +
1044 +/*
1045 + * struct hrtimer_start_on_info - save timer info on remote cpu
1046 + * @list:	list of hrtimer_start_on_info on remote cpu (to_pull)
1047 + * @timer:	timer to be triggered on remote cpu
1048 + * @time:	expiration time for @timer
1049 + * @mode:	timer mode
1050 + * @state:	activity flag
1051 + */
1052 +struct hrtimer_start_on_info {
1053 +	struct list_head	list;
1054 +	struct hrtimer		*timer;
1055 +	ktime_t			time;
1056 +	enum hrtimer_mode	mode;
1057 +	atomic_t		state;
1058 +};
1059 +
1060 +#endif
1061 +
1062  static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
1063  {
1064  	timer->node.expires = time;
1065 @@ -355,6 +380,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
1066  			 unsigned long delta_ns,
1067  			 const enum hrtimer_mode mode, int wakeup);
1068  
1069 +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
1070 +extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info);
1071 +extern int hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
1072 +			struct hrtimer *timer, ktime_t time,
1073 +			const enum hrtimer_mode mode);
1074 +#endif
1075 +
1076  extern int hrtimer_cancel(struct hrtimer *timer);
1077  extern int hrtimer_try_to_cancel(struct hrtimer *timer);
1078  
1079 diff --git a/include/linux/sched.h b/include/linux/sched.h
1080 index 14a6c7b..9c990d1 100644
1081 --- a/include/linux/sched.h
1082 +++ b/include/linux/sched.h
1083 @@ -39,6 +39,7 @@
1084  #define SCHED_BATCH		3
1085  /* SCHED_ISO: reserved but not implemented yet */
1086  #define SCHED_IDLE		5
1087 +#define SCHED_LITMUS		6
1088  /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
1089  #define SCHED_RESET_ON_FORK     0x40000000
1090  
1091 @@ -93,6 +94,9 @@ struct sched_param {
1092  
1093  #include <asm/processor.h>
1094  
1095 +#include <litmus/rt_param.h>
1096 +#include <litmus/preempt.h>
1097 +
1098  struct exec_domain;
1099  struct futex_pi_state;
1100  struct robust_list_head;
1101 @@ -1209,6 +1213,7 @@ struct sched_rt_entity {
1102  };
1103  
1104  struct rcu_node;
1105 +struct od_table_entry;
1106  
1107  enum perf_event_task_context {
1108  	perf_invalid_context = -1,
1109 @@ -1313,9 +1318,9 @@ struct task_struct {
1110  	unsigned long stack_canary;
1111  #endif
1112  
1113 -	/* 
1114 +	/*
1115  	 * pointers to (original) parent process, youngest child, younger sibling,
1116 -	 * older sibling, respectively.  (p->father can be replaced with 
1117 +	 * older sibling, respectively.  (p->father can be replaced with
1118  	 * p->real_parent->pid)
1119  	 */
1120  	struct task_struct *real_parent; /* real parent process */
1121 @@ -1526,6 +1531,13 @@ struct task_struct {
1122  	int make_it_fail;
1123  #endif
1124  	struct prop_local_single dirties;
1125 +
1126 +	/* LITMUS RT parameters and state */
1127 +	struct rt_param rt_param;
1128 +
1129 +	/* references to PI semaphores, etc. */
1130 +	struct od_table_entry *od_table;
1131 +
1132  #ifdef CONFIG_LATENCYTOP
1133  	int latency_record_count;
1134  	struct latency_record latency_record[LT_SAVECOUNT];
1135 @@ -2136,7 +2148,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s
1136  	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
1137  
1138  	return ret;
1139 -}	
1140 +}
1141  
1142  extern void block_all_signals(int (*notifier)(void *priv), void *priv,
1143  			      sigset_t *mask);
1144 @@ -2446,6 +2458,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
1145  static inline void set_tsk_need_resched(struct task_struct *tsk)
1146  {
1147  	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1148 +	sched_state_will_schedule(tsk);
1149  }
1150  
1151  static inline void clear_tsk_need_resched(struct task_struct *tsk)
1152 diff --git a/include/linux/smp.h b/include/linux/smp.h
1153 index 8cc38d3..53b1bee 100644
1154 --- a/include/linux/smp.h
1155 +++ b/include/linux/smp.h
1156 @@ -82,6 +82,11 @@ int smp_call_function_any(const struct cpumask *mask,
1157  			  smp_call_func_t func, void *info, int wait);
1158  
1159  /*
1160 + * sends a 'pull timer' event to a remote CPU
1161 + */
1162 +extern void smp_send_pull_timers(int cpu);
1163 +
1164 +/*
1165   * Generic and arch helpers
1166   */
1167  #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
1168 diff --git a/include/linux/tick.h b/include/linux/tick.h
1169 index b232ccc..1e29bd5 100644
1170 --- a/include/linux/tick.h
1171 +++ b/include/linux/tick.h
1172 @@ -74,6 +74,11 @@ extern int tick_is_oneshot_available(void);
1173  extern struct tick_device *tick_get_device(int cpu);
1174  
1175  # ifdef CONFIG_HIGH_RES_TIMERS
1176 +/* LITMUS^RT tick alignment */
1177 +#define LINUX_DEFAULT_TICKS	0
1178 +#define LITMUS_ALIGNED_TICKS	1
1179 +#define	LITMUS_STAGGERED_TICKS	2
1180 +
1181  extern int tick_init_highres(void);
1182  extern int tick_program_event(ktime_t expires, int force);
1183  extern void tick_setup_sched_timer(void);
1184 diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h
1185 new file mode 100644
1186 index 0000000..ca2e442
1187 --- /dev/null
1188 +++ b/include/litmus/affinity.h
1189 @@ -0,0 +1,80 @@
1190 +#ifndef __LITMUS_AFFINITY_H
1191 +#define __LITMUS_AFFINITY_H
1192 +
1193 +#include <linux/cpumask.h>
1194 +
1195 +/*
1196 +  L1 (instr) = depth 0
1197 +  L1 (data)  = depth 1
1198 +  L2 = depth 2
1199 +  L3 = depth 3
1200 + */
1201 +#define NUM_CACHE_LEVELS 4
1202 +
1203 +struct neighborhood
1204 +{
1205 +	unsigned int size[NUM_CACHE_LEVELS];
1206 +	cpumask_var_t neighbors[NUM_CACHE_LEVELS];
1207 +};
1208 +
1209 +/* topology info is stored redundantly in a big array for fast lookups */
1210 +extern struct neighborhood neigh_info[NR_CPUS];
1211 +
1212 +void init_topology(void); /* called by Litmus module's _init_litmus() */
1213 +
1214 +/* Works like:
1215 +void get_nearest_available_cpu(
1216 +	cpu_entry_t **nearest,
1217 +	cpu_entry_t *start,
1218 +	cpu_entry_t *entries,
1219 +	int release_master)
1220 +
1221 +Set release_master = NO_CPU for no Release Master.
1222 +
1223 +We use a macro here to exploit the fact that C-EDF and G-EDF
1224 +have similar structures for their cpu_entry_t structs, even though
1225 +they do not share a common base-struct.  The macro allows us to
1226 +avoid code duplication.
1227 +
1228 +TODO: Factor out the job-to-processor linking from C/G-EDF into
1229 +a reusable "processor mapping".  (See B.B.'s RTSS'09 paper &
1230 +dissertation.)
1231 + */
1232 +#define get_nearest_available_cpu(nearest, start, entries, release_master) \
1233 +{ \
1234 +	(nearest) = NULL; \
1235 +	if (!(start)->linked) { \
1236 +		(nearest) = (start); \
1237 +	} else { \
1238 +		int __level; \
1239 +		int __cpu; \
1240 +		int __release_master = ((release_master) == NO_CPU) ? -1 : (release_master); \
1241 +		struct neighborhood *__neighbors = &neigh_info[(start)->cpu]; \
1242 +		\
1243 +		for (__level = 0; (__level < NUM_CACHE_LEVELS) && !(nearest); ++__level) { \
1244 +			if (__neighbors->size[__level] > 1) { \
1245 +				for_each_cpu(__cpu, __neighbors->neighbors[__level]) { \
1246 +					if (__cpu != __release_master) { \
1247 +						cpu_entry_t *__entry = &per_cpu((entries), __cpu); \
1248 +						if (!__entry->linked) { \
1249 +							(nearest) = __entry; \
1250 +							break; \
1251 +						} \
1252 +					} \
1253 +				} \
1254 +			} else if (__neighbors->size[__level] == 0) { \
1255 +				break; \
1256 +			} \
1257 +		} \
1258 +	} \
1259 +	\
1260 +	if ((nearest)) { \
1261 +		TRACE("P%d is closest available CPU to P%d\n", \
1262 +				(nearest)->cpu, (start)->cpu); \
1263 +	} else { \
1264 +		TRACE("Could not find an available CPU close to P%d\n", \
1265 +				(start)->cpu); \
1266 +	} \
1267 +}
1268 +
1269 +#endif
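
For illustration, a hedged sketch of the caller-side shape that the macro above expects: the plugin's per-CPU entry type must literally be named cpu_entry_t and provide at least the cpu and linked fields used by the macro, and entries must name a per-CPU variable of that type. NO_CPU and TRACE() are assumed to come from litmus/litmus.h and litmus/debug_trace.h; everything else below is hypothetical:

#include <linux/percpu.h>
#include <linux/sched.h>
#include <litmus/litmus.h>
#include <litmus/debug_trace.h>
#include <litmus/affinity.h>

typedef struct {
	int			cpu;
	struct task_struct	*linked;  /* job currently linked to this CPU */
} cpu_entry_t;

static DEFINE_PER_CPU(cpu_entry_t, entries);

static cpu_entry_t *pick_idle_cpu_near(cpu_entry_t *preferred)
{
	cpu_entry_t *target;

	/* walks outward through the cache levels recorded in neigh_info */
	get_nearest_available_cpu(target, preferred, entries, NO_CPU);
	return target;	/* NULL if no unlinked CPU was found */
}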
1270 diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
1271 new file mode 100644
1272 index 0000000..cf4864a
1273 --- /dev/null
1274 +++ b/include/litmus/bheap.h
1275 @@ -0,0 +1,77 @@
1276 +/* bheaps.h -- Binomial Heaps
1277 + *
1278 + * (c) 2008, 2009 Bjoern Brandenburg
1279 + */
1280 +
1281 +#ifndef BHEAP_H
1282 +#define BHEAP_H
1283 +
1284 +#define NOT_IN_HEAP UINT_MAX
1285 +
1286 +struct bheap_node {
1287 +	struct bheap_node* 	parent;
1288 +	struct bheap_node* 	next;
1289 +	struct bheap_node* 	child;
1290 +
1291 +	unsigned int 		degree;
1292 +	void*			value;
1293 +	struct bheap_node**	ref;
1294 +};
1295 +
1296 +struct bheap {
1297 +	struct bheap_node* 	head;
1298 +	/* We cache the minimum of the heap.
1299 +	 * This speeds up repeated peek operations.
1300 +	 */
1301 +	struct bheap_node*	min;
1302 +};
1303 +
1304 +typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
1305 +
1306 +void bheap_init(struct bheap* heap);
1307 +void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
1308 +
1309 +static inline int bheap_node_in_heap(struct bheap_node* h)
1310 +{
1311 +	return h->degree != NOT_IN_HEAP;
1312 +}
1313 +
1314 +static inline int bheap_empty(struct bheap* heap)
1315 +{
1316 +	return heap->head == NULL && heap->min == NULL;
1317 +}
1318 +
1319 +/* insert (and reinitialize) a node into the heap */
1320 +void bheap_insert(bheap_prio_t higher_prio,
1321 +		 struct bheap* heap,
1322 +		 struct bheap_node* node);
1323 +
1324 +/* merge addition into target */
1325 +void bheap_union(bheap_prio_t higher_prio,
1326 +		struct bheap* target,
1327 +		struct bheap* addition);
1328 +
1329 +struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
1330 +			    struct bheap* heap);
1331 +
1332 +struct bheap_node* bheap_take(bheap_prio_t higher_prio,
1333 +			    struct bheap* heap);
1334 +
1335 +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
1336 +int  bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
1337 +
1338 +void bheap_delete(bheap_prio_t higher_prio,
1339 +		 struct bheap* heap,
1340 +		 struct bheap_node* node);
1341 +
1342 +/* allocate from memcache */
1343 +struct bheap_node* bheap_node_alloc(int gfp_flags);
1344 +void bheap_node_free(struct bheap_node* hn);
1345 +
1346 +/* allocate a heap node for value and insert into the heap */
1347 +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
1348 +	     void* value, int gfp_flags);
1349 +
1350 +void* bheap_take_del(bheap_prio_t higher_prio,
1351 +		    struct bheap* heap);
1352 +#endif
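
For illustration, a hedged sketch of the heap API above using plain integer keys; in the scheduler plugins the comparator is, e.g., edf_ready_order and the node values are task pointers, but all names below are hypothetical:

#include <linux/gfp.h>
#include <litmus/bheap.h>

/* "higher priority" here means a smaller integer key */
static int int_prio(struct bheap_node *a, struct bheap_node *b)
{
	return (long) a->value < (long) b->value;
}

static void bheap_demo(void)
{
	struct bheap heap;
	struct bheap_node *node;

	bheap_init(&heap);
	bheap_add(int_prio, &heap, (void *) 3L, GFP_ATOMIC);
	bheap_add(int_prio, &heap, (void *) 1L, GFP_ATOMIC);

	node = bheap_take(int_prio, &heap);	/* the node carrying 1 */
	bheap_node_free(node);
	/* the node carrying 3 is left in the heap for brevity */
}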
1353 diff --git a/include/litmus/budget.h b/include/litmus/budget.h
1354 new file mode 100644
1355 index 0000000..732530e
1356 --- /dev/null
1357 +++ b/include/litmus/budget.h
1358 @@ -0,0 +1,8 @@
1359 +#ifndef _LITMUS_BUDGET_H_
1360 +#define _LITMUS_BUDGET_H_
1361 +
1362 +/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
1363 + * the next task. */
1364 +void update_enforcement_timer(struct task_struct* t);
1365 +
1366 +#endif
1367 diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h
1368 new file mode 100644
1369 index 0000000..0c18dcb
1370 --- /dev/null
1371 +++ b/include/litmus/clustered.h
1372 @@ -0,0 +1,44 @@
1373 +#ifndef CLUSTERED_H
1374 +#define CLUSTERED_H
1375 +
1376 +/* Which cache level should be used to group CPUs into clusters?
1377 + * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under
1378 + * global scheduling).
1379 + */
1380 +enum cache_level {
1381 +	GLOBAL_CLUSTER = 0,
1382 +	L1_CLUSTER     = 1,
1383 +	L2_CLUSTER     = 2,
1384 +	L3_CLUSTER     = 3
1385 +};
1386 +
1387 +int parse_cache_level(const char *str, enum cache_level *level);
1388 +const char* cache_level_name(enum cache_level level);
1389 +
1390 +/* expose a cache level in a /proc dir */
1391 +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
1392 +					   enum cache_level* level);
1393 +
1394 +
1395 +
1396 +struct scheduling_cluster {
1397 +	unsigned int id;
1398 +	/* list of CPUs that are part of this cluster */
1399 +	struct list_head cpus;
1400 +};
1401 +
1402 +struct cluster_cpu {
1403 +	unsigned int id; /* which CPU is this? */
1404 +	struct list_head cluster_list; /* List of the CPUs in this cluster. */
1405 +	struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. */
1406 +};
1407 +
1408 +int get_cluster_size(enum cache_level level);
1409 +
1410 +int assign_cpus_to_clusters(enum cache_level level,
1411 +			    struct scheduling_cluster* clusters[],
1412 +			    unsigned int num_clusters,
1413 +			    struct cluster_cpu* cpus[],
1414 +			    unsigned int num_cpus);
1415 +
1416 +#endif
1417 diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h
1418 new file mode 100644
1419 index 0000000..48d086d
1420 --- /dev/null
1421 +++ b/include/litmus/debug_trace.h
1422 @@ -0,0 +1,37 @@
1423 +#ifndef LITMUS_DEBUG_TRACE_H
1424 +#define LITMUS_DEBUG_TRACE_H
1425 +
1426 +#ifdef CONFIG_SCHED_DEBUG_TRACE
1427 +void sched_trace_log_message(const char* fmt, ...);
1428 +void dump_trace_buffer(int max);
1429 +#else
1430 +
1431 +#define sched_trace_log_message(fmt, ...)
1432 +
1433 +#endif
1434 +
1435 +extern atomic_t __log_seq_no;
1436 +
1437 +#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER
1438 +#define TRACE_PREFIX "%d P%d [%s@%s:%d]: "
1439 +#define TRACE_ARGS  atomic_add_return(1, &__log_seq_no),	\
1440 +		raw_smp_processor_id(),				\
1441 +		__FUNCTION__, __FILE__, __LINE__
1442 +#else
1443 +#define TRACE_PREFIX "%d P%d: "
1444 +#define TRACE_ARGS  atomic_add_return(1, &__log_seq_no), \
1445 +		raw_smp_processor_id()
1446 +#endif
1447 +
1448 +#define TRACE(fmt, args...)						\
1449 +	sched_trace_log_message(TRACE_PREFIX fmt,			\
1450 +				TRACE_ARGS,  ## args)
1451 +
1452 +#define TRACE_TASK(t, fmt, args...)			\
1453 +	TRACE("(%s/%d:%d) " fmt, (t)->comm, (t)->pid,	\
1454 +	      (t)->rt_param.job_params.job_no,  ##args)
1455 +
1456 +#define TRACE_CUR(fmt, args...) \
1457 +	TRACE_TASK(current, fmt, ## args)
1458 +
1459 +#endif
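
For illustration, a hedged sketch of the macros above in use; the task pointer and lock id are hypothetical placeholders, and output is only produced when CONFIG_SCHED_DEBUG_TRACE is enabled:

#include <linux/sched.h>
#include <linux/smp.h>
#include <litmus/debug_trace.h>

static void trace_examples(struct task_struct *t, int lock_id)
{
	TRACE("plugin state change on P%d\n", raw_smp_processor_id());
	TRACE_TASK(t, "blocked on lock %d\n", lock_id);
	TRACE_CUR("about to suspend\n");
}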
1460 diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
1461 new file mode 100644
1462 index 0000000..bbaf22e
1463 --- /dev/null
1464 +++ b/include/litmus/edf_common.h
1465 @@ -0,0 +1,25 @@
1466 +/*
1467 + * EDF common data structures and utility functions shared by all EDF
1468 + * based scheduler plugins
1469 + */
1470 +
1471 +/* CLEANUP: Add comments and make it less messy.
1472 + *
1473 + */
1474 +
1475 +#ifndef __UNC_EDF_COMMON_H__
1476 +#define __UNC_EDF_COMMON_H__
1477 +
1478 +#include <litmus/rt_domain.h>
1479 +
1480 +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
1481 +		     release_jobs_t release);
1482 +
1483 +int edf_higher_prio(struct task_struct* first,
1484 +		    struct task_struct* second);
1485 +
1486 +int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
1487 +
1488 +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
1489 +
1490 +#endif
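
For illustration, a hedged sketch of a check that a uniprocessor EDF plugin might perform after a job release; the wrapper is hypothetical, and 'scheduled' is the currently running real-time task (or NULL):

#include <linux/sched.h>
#include <litmus/edf_common.h>
#include <litmus/debug_trace.h>

static void check_for_preemption(rt_domain_t *dom, struct task_struct *scheduled)
{
	/* true if nothing real-time is scheduled, or if the head of dom's
	 * ready queue has an earlier deadline than 'scheduled' */
	if (edf_preemption_needed(dom, scheduled)) {
		/* a real plugin would now trigger a reschedule, e.g. via the
		 * sched_state machinery declared in litmus/preempt.h */
		TRACE("preemption needed\n");
	}
}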
1491 diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
1492 new file mode 100644
1493 index 0000000..caf2a1e
1494 --- /dev/null
1495 +++ b/include/litmus/fdso.h
1496 @@ -0,0 +1,71 @@
1497 +/* fdso.h - file descriptor attached shared objects
1498 + *
1499 + * (c) 2007 B. Brandenburg, LITMUS^RT project
1500 + */
1501 +
1502 +#ifndef _LINUX_FDSO_H_
1503 +#define _LINUX_FDSO_H_
1504 +
1505 +#include <linux/list.h>
1506 +#include <asm/atomic.h>
1507 +
1508 +#include <linux/fs.h>
1509 +#include <linux/slab.h>
1510 +
1511 +#define MAX_OBJECT_DESCRIPTORS 32
1512 +
1513 +typedef enum  {
1514 +	MIN_OBJ_TYPE 	= 0,
1515 +
1516 +	FMLP_SEM	= 0,
1517 +	SRP_SEM		= 1,
1518 +
1519 +	MAX_OBJ_TYPE	= 1
1520 +} obj_type_t;
1521 +
1522 +struct inode_obj_id {
1523 +	struct list_head	list;
1524 +	atomic_t		count;
1525 +	struct inode*		inode;
1526 +
1527 +	obj_type_t 		type;
1528 +	void*			obj;
1529 +	unsigned int		id;
1530 +};
1531 +
1532 +struct fdso_ops;
1533 +
1534 +struct od_table_entry {
1535 +	unsigned int		used;
1536 +
1537 +	struct inode_obj_id*	obj;
1538 +	const struct fdso_ops*	class;
1539 +};
1540 +
1541 +struct fdso_ops {
1542 +	int   (*create)(void** obj_ref, obj_type_t type, void* __user);
1543 +	void  (*destroy)(obj_type_t type, void*);
1544 +	int   (*open)	(struct od_table_entry*, void* __user);
1545 +	int   (*close)	(struct od_table_entry*);
1546 +};
1547 +
1548 +/* translate a userspace supplied od into the raw table entry
1549 + * returns NULL if od is invalid
1550 + */
1551 +struct od_table_entry* get_entry_for_od(int od);
1552 +
1553 +/* translate a userspace supplied od into the associated object
1554 + * returns NULL if od is invalid
1555 + */
1556 +static inline void* od_lookup(int od, obj_type_t type)
1557 +{
1558 +	struct od_table_entry* e = get_entry_for_od(od);
1559 +	return e && e->obj->type == type ? e->obj->obj : NULL;
1560 +}
1561 +
1562 +#define lookup_fmlp_sem(od)((struct pi_semaphore*)  od_lookup(od, FMLP_SEM))
1563 +#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
1564 +#define lookup_ics(od)     ((struct ics*)           od_lookup(od, ICS_ID))
1565 +
1566 +
1567 +#endif
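As a usage sketch, a syscall backend that receives an object descriptor from user space resolves it with one of the lookup wrappers and must treat NULL as an invalid, closed, or mistyped descriptor (demo_* is hypothetical; struct pi_semaphore is defined by the FMLP implementation elsewhere in the patch):

	static long demo_fmlp_down(int od)
	{
		struct pi_semaphore *sem = lookup_fmlp_sem(od);

		if (!sem)
			return -EINVAL;	/* closed, out of range, or wrong type */
		/* ... acquire sem ... */
		return 0;
	}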
1568 diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
1569 new file mode 100644
1570 index 0000000..6c18277
1571 --- /dev/null
1572 +++ b/include/litmus/feather_buffer.h
1573 @@ -0,0 +1,94 @@
1574 +#ifndef _FEATHER_BUFFER_H_
1575 +#define _FEATHER_BUFFER_H_
1576 +
1577 +/* requires UINT_MAX and memcpy */
1578 +
1579 +#define SLOT_FREE	0
1580 +#define	SLOT_BUSY 	1
1581 +#define	SLOT_READY	2
1582 +
1583 +struct ft_buffer {
1584 +	unsigned int	slot_count;
1585 +	unsigned int	slot_size;
1586 +
1587 +	int 		free_count;
1588 +	unsigned int 	write_idx;
1589 +	unsigned int 	read_idx;
1590 +
1591 +	char*		slots;
1592 +	void*		buffer_mem;
1593 +	unsigned int	failed_writes;
1594 +};
1595 +
1596 +static inline int init_ft_buffer(struct ft_buffer*	buf,
1597 +				 unsigned int 		slot_count,
1598 +				 unsigned int 		slot_size,
1599 +				 char*			slots,
1600 +				 void* 			buffer_mem)
1601 +{
1602 +	int i = 0;
1603 +	if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
1604 +		/* The slot count must divide UINT_MAX + 1 so that the write
1605 +		 * index correctly wraps around to 0.
1606 +		 */
1607 +		return 0;
1608 +	} else {
1609 +		buf->slot_count    = slot_count;
1610 +		buf->slot_size     = slot_size;
1611 +		buf->slots         = slots;
1612 +		buf->buffer_mem    = buffer_mem;
1613 +		buf->free_count    = slot_count;
1614 +		buf->write_idx     = 0;
1615 +		buf->read_idx      = 0;
1616 +		buf->failed_writes = 0;
1617 +		for (i = 0; i < slot_count; i++)
1618 +			buf->slots[i] = SLOT_FREE;
1619 +		return 1;
1620 +	}
1621 +}
1622 +
1623 +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
1624 +{
1625 +	int free = fetch_and_dec(&buf->free_count);
1626 +	unsigned int idx;
1627 +	if (free <= 0) {
1628 +		fetch_and_inc(&buf->free_count);
1629 +		*ptr = 0;
1630 +		fetch_and_inc(&buf->failed_writes);
1631 +		return 0;
1632 +	} else {
1633 +		idx  = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
1634 +		buf->slots[idx] = SLOT_BUSY;
1635 +		*ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
1636 +		return 1;
1637 +	}
1638 +}
1639 +
1640 +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
1641 +{
1642 +	unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
1643 +	buf->slots[idx]  = SLOT_READY;
1644 +}
1645 +
1646 +
1647 +/* exclusive reader access is assumed */
1648 +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
1649 +{
1650 +	unsigned int idx;
1651 +	if (buf->free_count == buf->slot_count)
1652 +		/* nothing available */
1653 +		return 0;
1654 +	idx = buf->read_idx % buf->slot_count;
1655 +	if (buf->slots[idx] == SLOT_READY) {
1656 +		memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
1657 +		       buf->slot_size);
1658 +		buf->slots[idx] = SLOT_FREE;
1659 +		buf->read_idx++;
1660 +		fetch_and_inc(&buf->free_count);
1661 +		return 1;
1662 +	} else
1663 +		return 0;
1664 +}
1665 +
1666 +
1667 +#endif
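Putting the operations together: writers claim, fill, and publish slots concurrently, while a single reader drains them in order. A sketch under the stated constraint that slot_count divides UINT_MAX + 1 (here 4; all demo_* names are illustrative):

	static char demo_slot_state[4];
	static unsigned long long demo_slot_mem[4];
	static struct ft_buffer demo_buf;

	static void demo_buffer_roundtrip(void)
	{
		unsigned long long *slot, out;

		init_ft_buffer(&demo_buf, 4, sizeof(unsigned long long),
			       demo_slot_state, demo_slot_mem);

		if (ft_buffer_start_write(&demo_buf, (void **) &slot)) {
			*slot = 42;	/* fill the claimed slot */
			ft_buffer_finish_write(&demo_buf, slot);
		}

		/* exactly one reader may do this at a time */
		if (ft_buffer_read(&demo_buf, &out)) {
			/* out now holds 42 */
		}
	}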
1668 diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
1669 new file mode 100644
1670 index 0000000..028dfb2
1671 --- /dev/null
1672 +++ b/include/litmus/feather_trace.h
1673 @@ -0,0 +1,65 @@
1674 +#ifndef _FEATHER_TRACE_H_
1675 +#define _FEATHER_TRACE_H_
1676 +
1677 +#include <asm/atomic.h>
1678 +
1679 +int ft_enable_event(unsigned long id);
1680 +int ft_disable_event(unsigned long id);
1681 +int ft_is_event_enabled(unsigned long id);
1682 +int ft_disable_all_events(void);
1683 +
1684 +/* atomic_* functions are inline anyway */
1685 +static inline int fetch_and_inc(int *val)
1686 +{
1687 +	return atomic_add_return(1, (atomic_t*) val) - 1;
1688 +}
1689 +
1690 +static inline int fetch_and_dec(int *val)
1691 +{
1692 +	return atomic_sub_return(1, (atomic_t*) val) + 1;
1693 +}
1694 +
1695 +/* Don't use rewriting implementation if kernel text pages are read-only.
1696 + * Ftrace gets around this by using the identity mapping, but that's more
1697 + * effort than is warranted right now for Feather-Trace.
1698 + * Eventually, it may make sense to replace Feather-Trace with ftrace.
1699 + */
1700 +#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA)
1701 +
1702 +#include <asm/feather_trace.h>
1703 +
1704 +#else /* !__ARCH_HAS_FEATHER_TRACE */
1705 +
1706 +/* provide default implementation */
1707 +
1708 +#include <asm/timex.h> /* for get_cycles() */
1709 +
1710 +static inline unsigned long long ft_timestamp(void)
1711 +{
1712 +	return get_cycles();
1713 +}
1714 +
1715 +#define feather_callback
1716 +
1717 +#define MAX_EVENTS 1024
1718 +
1719 +extern int ft_events[MAX_EVENTS];
1720 +
1721 +#define ft_event(id, callback) \
1722 +	if (ft_events[id]) callback();
1723 +
1724 +#define ft_event0(id, callback) \
1725 +	if (ft_events[id]) callback(id);
1726 +
1727 +#define ft_event1(id, callback, param) \
1728 +	if (ft_events[id]) callback(id, param);
1729 +
1730 +#define ft_event2(id, callback, param, param2) \
1731 +	if (ft_events[id]) callback(id, param, param2);
1732 +
1733 +#define ft_event3(id, callback, p, p2, p3) \
1734 +	if (ft_events[id]) callback(id, p, p2, p3);
1735 +
1736 +#endif /* __ARCH_HAS_FEATHER_TRACE */
1737 +
1738 +#endif
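In the fallback implementation above, a trace point is just a branch on ft_events[id], which ft_enable_event()/ft_disable_event() flip at run time. A sketch of wiring a callback to an event id (the id 999 and demo_* names are made up; ids must stay below MAX_EVENTS):

	feather_callback void demo_callback(unsigned long id, unsigned long arg)
	{
		/* e.g. record a timestamp or bump a counter */
	}

	static void demo_hot_path(unsigned long arg)
	{
		/* expands to: if (ft_events[999]) demo_callback(999, arg); */
		ft_event1(999, demo_callback, arg);
	}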
1739 diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h
1740 new file mode 100644
1741 index 0000000..0b95987
1742 --- /dev/null
1743 +++ b/include/litmus/ftdev.h
1744 @@ -0,0 +1,55 @@
1745 +#ifndef _LITMUS_FTDEV_H_
1746 +#define	_LITMUS_FTDEV_H_
1747 +
1748 +#include <litmus/feather_trace.h>
1749 +#include <litmus/feather_buffer.h>
1750 +#include <linux/mutex.h>
1751 +#include <linux/cdev.h>
1752 +
1753 +#define FTDEV_ENABLE_CMD 	0
1754 +#define FTDEV_DISABLE_CMD 	1
1755 +
1756 +struct ftdev;
1757 +
1758 +/* return 0 if buffer can be opened, otherwise -$REASON */
1759 +typedef int  (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no);
1760 +/* return 0 on success, otherwise -$REASON */
1761 +typedef int  (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no);
1762 +typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no);
1763 +/* Let devices handle writes from userspace. No synchronization provided. */
1764 +typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from);
1765 +
1766 +struct ftdev_event;
1767 +
1768 +struct ftdev_minor {
1769 +	struct ft_buffer*	buf;
1770 +	unsigned int		readers;
1771 +	struct mutex		lock;
1772 +	/* FIXME: filter for authorized events */
1773 +	struct ftdev_event*	events;
1774 +	struct device*		device;
1775 +	struct ftdev*		ftdev;
1776 +};
1777 +
1778 +struct ftdev {
1779 +	dev_t			major;
1780 +	struct cdev		cdev;
1781 +	struct class*		class;
1782 +	const char*		name;
1783 +	struct ftdev_minor*	minor;
1784 +	unsigned int		minor_cnt;
1785 +	ftdev_alloc_t		alloc;
1786 +	ftdev_free_t		free;
1787 +	ftdev_can_open_t	can_open;
1788 +	ftdev_write_t		write;
1789 +};
1790 +
1791 +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size);
1792 +void free_ft_buffer(struct ft_buffer* buf);
1793 +
1794 +int ftdev_init(	struct ftdev* ftdev, struct module* owner,
1795 +		const int minor_cnt, const char* name);
1796 +void ftdev_exit(struct ftdev* ftdev);
1797 +int register_ftdev(struct ftdev* ftdev);
1798 +
1799 +#endif
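The intended life cycle appears to be: ftdev_init() to set up the device and its minors, fill in the alloc/free (and optionally can_open/write) hooks, then register_ftdev() to expose it as a character device. A hedged sketch with one minor buffer (buffer size, device name, and the use of struct timestamp from trace.h are assumptions):

	static struct ftdev demo_dev;

	static int demo_alloc(struct ftdev *dev, unsigned int idx)
	{
		dev->minor[idx].buf = alloc_ft_buffer(4096, sizeof(struct timestamp));
		return dev->minor[idx].buf ? 0 : -ENOMEM;
	}

	static void demo_free(struct ftdev *dev, unsigned int idx)
	{
		free_ft_buffer(dev->minor[idx].buf);
		dev->minor[idx].buf = NULL;
	}

	static int __init demo_ftdev_init(void)
	{
		int err = ftdev_init(&demo_dev, THIS_MODULE, 1, "demo_trace");
		if (err)
			return err;
		demo_dev.alloc = demo_alloc;
		demo_dev.free  = demo_free;
		return register_ftdev(&demo_dev);
	}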
1800 diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
1801 new file mode 100644
1802 index 0000000..9bd361e
1803 --- /dev/null
1804 +++ b/include/litmus/jobs.h
1805 @@ -0,0 +1,9 @@
1806 +#ifndef __LITMUS_JOBS_H__
1807 +#define __LITMUS_JOBS_H__
1808 +
1809 +void prepare_for_next_period(struct task_struct *t);
1810 +void release_at(struct task_struct *t, lt_t start);
1811 +long complete_job(void);
1812 +
1813 +#endif
1814 +
1815 diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
1816 new file mode 100644
1817 index 0000000..12af222
1818 --- /dev/null
1819 +++ b/include/litmus/litmus.h
1820 @@ -0,0 +1,275 @@
1821 +/*
1822 + * Constant definitions related to
1823 + * scheduling policy.
1824 + */
1825 +
1826 +#ifndef _LINUX_LITMUS_H_
1827 +#define _LINUX_LITMUS_H_
1828 +
1829 +#include <litmus/debug_trace.h>
1830 +
1831 +#ifdef CONFIG_RELEASE_MASTER
1832 +extern atomic_t release_master_cpu;
1833 +#endif
1834 +
1835 +/* in_list - is a given list_head queued on some list?
1836 + */
1837 +static inline int in_list(struct list_head* list)
1838 +{
1839 +	return !(  /* case 1: deleted */
1840 +		   (list->next == LIST_POISON1 &&
1841 +		    list->prev == LIST_POISON2)
1842 +		 ||
1843 +		   /* case 2: initialized */
1844 +		   (list->next == list &&
1845 +		    list->prev == list)
1846 +		);
1847 +}
1848 +
1849 +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
1850 +
1851 +#define NO_CPU			0xffffffff
1852 +
1853 +void litmus_fork(struct task_struct *tsk);
1854 +void litmus_exec(void);
1855 +/* clean up real-time state of a task */
1856 +void exit_litmus(struct task_struct *dead_tsk);
1857 +
1858 +long litmus_admit_task(struct task_struct *tsk);
1859 +void litmus_exit_task(struct task_struct *tsk);
1860 +
1861 +#define is_realtime(t) 		((t)->policy == SCHED_LITMUS)
1862 +#define rt_transition_pending(t) \
1863 +	((t)->rt_param.transition_pending)
1864 +
1865 +#define tsk_rt(t)		(&(t)->rt_param)
1866 +
1867 +/*	Realtime utility macros */
1868 +#define get_rt_flags(t)		(tsk_rt(t)->flags)
1869 +#define set_rt_flags(t,f) 	(tsk_rt(t)->flags=(f))
1870 +#define get_exec_cost(t)  	(tsk_rt(t)->task_params.exec_cost)
1871 +#define get_exec_time(t)	(tsk_rt(t)->job_params.exec_time)
1872 +#define get_rt_period(t)	(tsk_rt(t)->task_params.period)
1873 +#define get_rt_phase(t)		(tsk_rt(t)->task_params.phase)
1874 +#define get_partition(t) 	(tsk_rt(t)->task_params.cpu)
1875 +#define get_deadline(t)		(tsk_rt(t)->job_params.deadline)
1876 +#define get_release(t)		(tsk_rt(t)->job_params.release)
1877 +#define get_class(t)		(tsk_rt(t)->task_params.cls)
1878 +
1879 +#define is_priority_boosted(t)	(tsk_rt(t)->priority_boosted)
1880 +#define get_boost_start(t)	(tsk_rt(t)->boost_start_time)
1881 +
1882 +inline static int budget_exhausted(struct task_struct* t)
1883 +{
1884 +	return get_exec_time(t) >= get_exec_cost(t);
1885 +}
1886 +
1887 +inline static lt_t budget_remaining(struct task_struct* t)
1888 +{
1889 +	if (!budget_exhausted(t))
1890 +		return get_exec_cost(t) - get_exec_time(t);
1891 +	else
1892 +		/* avoid overflow */
1893 +		return 0;
1894 +}
1895 +
1896 +#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
1897 +
1898 +#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
1899 +				      == PRECISE_ENFORCEMENT)
1900 +
1901 +#define is_hrt(t)     		\
1902 +	(tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
1903 +#define is_srt(t)     		\
1904 +	(tsk_rt(t)->task_params.cls == RT_CLASS_SOFT)
1905 +#define is_be(t)      		\
1906 +	(tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT)
1907 +
1908 +/* Our notion of time within LITMUS: kernel monotonic time. */
1909 +static inline lt_t litmus_clock(void)
1910 +{
1911 +	return ktime_to_ns(ktime_get());
1912 +}
1913 +
1914 +/* A macro to convert from nanoseconds to ktime_t. */
1915 +#define ns_to_ktime(t)		ktime_add_ns(ktime_set(0, 0), t)
1916 +
1917 +#define get_domain(t) (tsk_rt(t)->domain)
1918 +
1919 +/* Honor the flag in the preempt_count variable that is set
1920 + * when scheduling is in progress.
1921 + */
1922 +#define is_running(t) 			\
1923 +	((t)->state == TASK_RUNNING || 	\
1924 +	 task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
1925 +
1926 +#define is_blocked(t)       \
1927 +	(!is_running(t))
1928 +#define is_released(t, now)	\
1929 +	(lt_before_eq(get_release(t), now))
1930 +#define is_tardy(t, now)    \
1931 +	(lt_before_eq(tsk_rt(t)->job_params.deadline, now))
1932 +
1933 +/* real-time comparison macros */
1934 +#define earlier_deadline(a, b) (lt_before(\
1935 +	(a)->rt_param.job_params.deadline,\
1936 +	(b)->rt_param.job_params.deadline))
1937 +#define earlier_release(a, b)  (lt_before(\
1938 +	(a)->rt_param.job_params.release,\
1939 +	(b)->rt_param.job_params.release))
1940 +
1941 +void preempt_if_preemptable(struct task_struct* t, int on_cpu);
1942 +
1943 +#ifdef CONFIG_LITMUS_LOCKING
1944 +void srp_ceiling_block(void);
1945 +#else
1946 +#define srp_ceiling_block() /* nothing */
1947 +#endif
1948 +
1949 +#define bheap2task(hn) ((struct task_struct*) hn->value)
1950 +
1951 +#ifdef CONFIG_NP_SECTION
1952 +
1953 +static inline int is_kernel_np(struct task_struct *t)
1954 +{
1955 +	return tsk_rt(t)->kernel_np;
1956 +}
1957 +
1958 +static inline int is_user_np(struct task_struct *t)
1959 +{
1960 +	return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0;
1961 +}
1962 +
1963 +static inline void request_exit_np(struct task_struct *t)
1964 +{
1965 +	if (is_user_np(t)) {
1966 +		/* Set the flag that tells user space to call
1967 +		 * into the kernel at the end of a critical section. */
1968 +		if (likely(tsk_rt(t)->ctrl_page)) {
1969 +			TRACE_TASK(t, "setting delayed_preemption flag\n");
1970 +			tsk_rt(t)->ctrl_page->sched.np.preempt = 1;
1971 +		}
1972 +	}
1973 +}
1974 +
1975 +static inline void make_np(struct task_struct *t)
1976 +{
1977 +	tsk_rt(t)->kernel_np++;
1978 +}
1979 +
1980 +/* Caller should check if preemption is necessary when
1981 + * the function return 0.
1982 + */
1983 +static inline int take_np(struct task_struct *t)
1984 +{
1985 +	return --tsk_rt(t)->kernel_np;
1986 +}
1987 +
1988 +/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */
1989 +static inline int request_exit_np_atomic(struct task_struct *t)
1990 +{
1991 +	union np_flag old, new;
1992 +
1993 +	if (tsk_rt(t)->ctrl_page) {
1994 +		old.raw = tsk_rt(t)->ctrl_page->sched.raw;
1995 +		if (old.np.flag == 0) {
1996 +			/* no longer non-preemptive */
1997 +			return 0;
1998 +		} else if (old.np.preempt) {
1999 +			/* already set, nothing for us to do */
2000 +			return 1;
2001 +		} else {
2002 +			/* non preemptive and flag not set */
2003 +			new.raw = old.raw;
2004 +			new.np.preempt = 1;
2005 +			/* if we get old back, then we atomically set the flag */
2006 +			return cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw;
2007 +			/* If we raced with a concurrent change, then so be
2008 +			 * it. Deliver it by IPI.  We don't want an unbounded
2009 +			 * retry loop here since tasks might exploit that to
2010 +			 * keep the kernel busy indefinitely. */
2011 +		}
2012 +	} else
2013 +		return 0;
2014 +}
2015 +
2016 +#else
2017 +
2018 +static inline int is_kernel_np(struct task_struct* t)
2019 +{
2020 +	return 0;
2021 +}
2022 +
2023 +static inline int is_user_np(struct task_struct* t)
2024 +{
2025 +	return 0;
2026 +}
2027 +
2028 +static inline void request_exit_np(struct task_struct *t)
2029 +{
2030 +	/* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
2031 +	BUG();
2032 +}
2033 +
2034 +static inline int request_exit_np_atomic(struct task_struct *t)
2035 +{
2036 +	return 0;
2037 +}
2038 +
2039 +#endif
2040 +
2041 +static inline void clear_exit_np(struct task_struct *t)
2042 +{
2043 +	if (likely(tsk_rt(t)->ctrl_page))
2044 +		tsk_rt(t)->ctrl_page->sched.np.preempt = 0;
2045 +}
2046 +
2047 +static inline int is_np(struct task_struct *t)
2048 +{
2049 +#ifdef CONFIG_SCHED_DEBUG_TRACE
2050 +	int kernel, user;
2051 +	kernel = is_kernel_np(t);
2052 +	user   = is_user_np(t);
2053 +	if (kernel || user)
2054 +		TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
2055 +			   kernel, user);
2057 +	return kernel || user;
2058 +#else
2059 +	return unlikely(is_kernel_np(t) || is_user_np(t));
2060 +#endif
2061 +}
2062 +
2063 +static inline int is_present(struct task_struct* t)
2064 +{
2065 +	return t && tsk_rt(t)->present;
2066 +}
2067 +
2068 +
2069 +/* make the unit explicit */
2070 +typedef unsigned long quanta_t;
2071 +
2072 +enum round {
2073 +	FLOOR,
2074 +	CEIL
2075 +};
2076 +
2077 +
2078 +/* Tick period is used to convert ns-specified execution
2079 + * costs and periods into tick-based equivalents.
2080 + */
2081 +extern ktime_t tick_period;
2082 +
2083 +static inline quanta_t time2quanta(lt_t time, enum round round)
2084 +{
2085 +	s64  quantum_length = ktime_to_ns(tick_period);
2086 +
2087 +	if (do_div(time, quantum_length) && round == CEIL)
2088 +		time++;
2089 +	return (quanta_t) time;
2090 +}
2091 +
2092 +/* By how much is cpu staggered behind CPU 0? */
2093 +u64 cpu_stagger_offset(int cpu);
2094 +
2095 +#endif
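time2quanta() converts a nanosecond quantity into scheduling quanta, and budget_remaining() saturates at zero, so chaining the two never underflows. A small worked example, assuming tick_period is 1 ms: a task with exec_cost = 2.5 ms that has run for 1 ms has 1.5 ms left, i.e. time2quanta(1500000, FLOOR) == 1 and time2quanta(1500000, CEIL) == 2.

	/* Hypothetical helper: full-or-partial quanta of budget remaining. */
	static quanta_t demo_budget_quanta(struct task_struct *t)
	{
		return time2quanta(budget_remaining(t), CEIL);
	}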
2096 diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h
2097 new file mode 100644
2098 index 0000000..6800e72
2099 --- /dev/null
2100 +++ b/include/litmus/litmus_proc.h
2101 @@ -0,0 +1,25 @@
2102 +#include <litmus/sched_plugin.h>
2103 +#include <linux/proc_fs.h>
2104 +
2105 +int __init init_litmus_proc(void);
2106 +void exit_litmus_proc(void);
2107 +
2108 +/*
2109 + * On success, returns 0 and sets the pointer to the location of the new
2110 + * proc dir entry, otherwise returns an error code and sets pde to NULL.
2111 + */
2112 +long make_plugin_proc_dir(struct sched_plugin* plugin,
2113 +		struct proc_dir_entry** pde);
2114 +
2115 +/*
2116 + * Plugins should deallocate all child proc directory entries before
2117 + * calling this, to avoid memory leaks.
2118 + */
2119 +void remove_plugin_proc_dir(struct sched_plugin* plugin);
2120 +
2121 +
2122 +/* Copy at most ksize-1 bytes from ubuf into kbuf, null-terminate kbuf, and
2123 + * remove a '\n' if present. Returns the number of bytes that were read or
2124 + * -EFAULT. */
2125 +int copy_and_chomp(char *kbuf, unsigned long ksize,
2126 +		   __user const char* ubuf, unsigned long ulength);
2127 diff --git a/include/litmus/locking.h b/include/litmus/locking.h
2128 new file mode 100644
2129 index 0000000..4d7b870
2130 --- /dev/null
2131 +++ b/include/litmus/locking.h
2132 @@ -0,0 +1,28 @@
2133 +#ifndef LITMUS_LOCKING_H
2134 +#define LITMUS_LOCKING_H
2135 +
2136 +struct litmus_lock_ops;
2137 +
2138 +/* Generic base struct for LITMUS^RT userspace semaphores.
2139 + * This structure should be embedded in protocol-specific semaphores.
2140 + */
2141 +struct litmus_lock {
2142 +	struct litmus_lock_ops *ops;
2143 +	int type;
2144 +};
2145 +
2146 +struct litmus_lock_ops {
2147 +	/* Current task tries to obtain / drop a reference to a lock.
2148 +	 * Optional methods, allowed by default. */
2149 +	int (*open)(struct litmus_lock*, void* __user);
2150 +	int (*close)(struct litmus_lock*);
2151 +
2152 +	/* The current task tries to lock/unlock this lock (mandatory methods). */
2153 +	int (*lock)(struct litmus_lock*);
2154 +	int (*unlock)(struct litmus_lock*);
2155 +
2156 +	/* The lock is no longer being referenced (mandatory method). */
2157 +	void (*deallocate)(struct litmus_lock*);
2158 +};
2159 +
2160 +#endif
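Protocol implementations embed struct litmus_lock in their own semaphore type and recover the outer struct with container_of() inside the ops callbacks; srp.h below follows exactly this pattern. A condensed sketch (demo_* names are hypothetical, and the unlock/deallocate bodies are elided):

	struct demo_semaphore {
		struct litmus_lock	litmus_lock;
		struct task_struct	*owner;
		wait_queue_head_t	wait;
	};

	static int demo_lock(struct litmus_lock *l)
	{
		struct demo_semaphore *sem =
			container_of(l, struct demo_semaphore, litmus_lock);

		/* ... block until sem->owner == current ... */
		return 0;
	}

	static struct litmus_lock_ops demo_ops = {
		.lock = demo_lock,
		/* .unlock and .deallocate are mandatory in a real protocol */
	};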
2161 diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h
2162 new file mode 100644
2163 index 0000000..380b886
2164 --- /dev/null
2165 +++ b/include/litmus/preempt.h
2166 @@ -0,0 +1,164 @@
2167 +#ifndef LITMUS_PREEMPT_H
2168 +#define LITMUS_PREEMPT_H
2169 +
2170 +#include <linux/types.h>
2171 +#include <linux/cache.h>
2172 +#include <linux/percpu.h>
2173 +#include <asm/atomic.h>
2174 +
2175 +#include <litmus/debug_trace.h>
2176 +
2177 +extern DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
2178 +
2179 +#ifdef CONFIG_PREEMPT_STATE_TRACE
2180 +const char* sched_state_name(int s);
2181 +#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args)
2182 +#else
2183 +#define TRACE_STATE(fmt, args...) /* ignore */
2184 +#endif
2185 +
2186 +#define VERIFY_SCHED_STATE(x)						\
2187 +	do { int __s = get_sched_state();				\
2188 +		if ((__s & (x)) == 0)					\
2189 +			TRACE_STATE("INVALID s=0x%x (%s) not "		\
2190 +				    "in 0x%x (%s) [%s]\n",		\
2191 +				    __s, sched_state_name(__s),		\
2192 +				    (x), #x, __FUNCTION__);		\
2193 +	} while (0);
2194 +
2195 +#define TRACE_SCHED_STATE_CHANGE(x, y, cpu)				\
2196 +	TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n",			\
2197 +		    cpu,  (x), sched_state_name(x),			\
2198 +		    (y), sched_state_name(y))
2199 +
2200 +
2201 +typedef enum scheduling_state {
2202 +	TASK_SCHEDULED    = (1 << 0),  /* The currently scheduled task is the one that
2203 +					* should be scheduled, and the processor does not
2204 +					* plan to invoke schedule(). */
2205 +	SHOULD_SCHEDULE   = (1 << 1),  /* A remote processor has determined that the
2206 +					* processor should reschedule, but this has not
2207 +					* been communicated yet (IPI still pending). */
2208 +	WILL_SCHEDULE     = (1 << 2),  /* The processor has noticed that it has to
2209 +					* reschedule and will do so shortly. */
2210 +	TASK_PICKED       = (1 << 3),  /* The processor is currently executing schedule(),
2211 +					* has selected a new task to schedule, but has not
2212 +					* yet performed the actual context switch. */
2213 +	PICKED_WRONG_TASK = (1 << 4),  /* The processor has not yet performed the context
2214 +					* switch, but a remote processor has already
2215 +					* determined that a higher-priority task became
2216 +					* eligible after the task was picked. */
2217 +} sched_state_t;
2218 +
2219 +static inline sched_state_t get_sched_state_on(int cpu)
2220 +{
2221 +	return atomic_read(&per_cpu(resched_state, cpu));
2222 +}
2223 +
2224 +static inline sched_state_t get_sched_state(void)
2225 +{
2226 +	return atomic_read(&__get_cpu_var(resched_state));
2227 +}
2228 +
2229 +static inline int is_in_sched_state(int possible_states)
2230 +{
2231 +	return get_sched_state() & possible_states;
2232 +}
2233 +
2234 +static inline int cpu_is_in_sched_state(int cpu, int possible_states)
2235 +{
2236 +	return get_sched_state_on(cpu) & possible_states;
2237 +}
2238 +
2239 +static inline void set_sched_state(sched_state_t s)
2240 +{
2241 +	TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id());
2242 +	atomic_set(&__get_cpu_var(resched_state), s);
2243 +}
2244 +
2245 +static inline int sched_state_transition(sched_state_t from, sched_state_t to)
2246 +{
2247 +	sched_state_t old_state;
2248 +
2249 +	old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to);
2250 +	if (old_state == from) {
2251 +		TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id());
2252 +		return 1;
2253 +	} else
2254 +		return 0;
2255 +}
2256 +
2257 +static inline int sched_state_transition_on(int cpu,
2258 +					    sched_state_t from,
2259 +					    sched_state_t to)
2260 +{
2261 +	sched_state_t old_state;
2262 +
2263 +	old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to);
2264 +	if (old_state == from) {
2265 +		TRACE_SCHED_STATE_CHANGE(from, to, cpu);
2266 +		return 1;
2267 +	} else
2268 +		return 0;
2269 +}
2270 +
2271 +/* Plugins must call this function after they have decided which job to
2272 + * schedule next.  IMPORTANT: this function must be called while still holding
2273 + * the lock that is used to serialize scheduling decisions.
2274 + *
2275 + * (Ideally, we would like to use runqueue locks for this purpose, but that
2276 + * would lead to deadlocks with the migration code.)
2277 + */
2278 +static inline void sched_state_task_picked(void)
2279 +{
2280 +	VERIFY_SCHED_STATE(WILL_SCHEDULE);
2281 +
2282 +	/* WILL_SCHEDULE has only a local transition => simple store is ok */
2283 +	set_sched_state(TASK_PICKED);
2284 +}
2285 +
2286 +static inline void sched_state_entered_schedule(void)
2287 +{
2288 +	/* Update state for the case that we entered schedule() not due to
2289 +	 * set_tsk_need_resched() */
2290 +	set_sched_state(WILL_SCHEDULE);
2291 +}
2292 +
2293 +/* Called by schedule() to check if the scheduling decision is still valid
2294 + * after a context switch. Returns 1 if the CPU needs to reschedule. */
2295 +static inline int sched_state_validate_switch(void)
2296 +{
2297 +	int left_state_ok = 0;
2298 +
2299 +	VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED);
2300 +
2301 +	if (is_in_sched_state(TASK_PICKED)) {
2302 +		/* Might be good; let's try to transition out of this
2303 +		 * state. This must be done atomically since remote processors
2304 +		 * may try to change the state, too. */
2305 +		left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED);
2306 +	}
2307 +
2308 +	if (!left_state_ok) {
2309 +		/* We raced with a higher-priority task arrival => not
2310 +		 * valid. The CPU needs to reschedule. */
2311 +		set_sched_state(WILL_SCHEDULE);
2312 +		return 1;
2313 +	} else
2314 +		return 0;
2315 +}
2316 +
2317 +/* State transition events. See litmus/preempt.c for details. */
2318 +void sched_state_will_schedule(struct task_struct* tsk);
2319 +void sched_state_ipi(void);
2320 +/* Cause a CPU (remote or local) to reschedule. */
2321 +void litmus_reschedule(int cpu);
2322 +void litmus_reschedule_local(void);
2323 +
2324 +#ifdef CONFIG_DEBUG_KERNEL
2325 +void sched_state_plugin_check(void);
2326 +#else
2327 +#define sched_state_plugin_check() /* no check */
2328 +#endif
2329 +
2330 +#endif
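The intended flow through the state machine is: a remote CPU requests a preemption (SHOULD_SCHEDULE), the target acknowledges it (WILL_SCHEDULE), schedule() picks a task (TASK_PICKED), and the pick is either confirmed (TASK_SCHEDULED) or invalidated (PICKED_WRONG_TASK) before the switch completes. A condensed sketch of the plugin-side hand-off (plugin_pick_next() is a hypothetical stand-in for real decision logic):

	static struct task_struct *plugin_pick_next(struct task_struct *prev);

	static struct task_struct *demo_schedule(struct task_struct *prev)
	{
		struct task_struct *next;

		/* entered with the plugin's scheduling lock held */
		next = plugin_pick_next(prev);

		/* WILL_SCHEDULE -> TASK_PICKED, still under the lock */
		sched_state_task_picked();
		return next;
	}

After the context switch, schedule() calls sched_state_validate_switch(); if it returns 1, a higher-priority job arrived in the meantime and the CPU must reschedule.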
2331 diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
2332 new file mode 100644
2333 index 0000000..ac24929
2334 --- /dev/null
2335 +++ b/include/litmus/rt_domain.h
2336 @@ -0,0 +1,182 @@
2337 +/* CLEANUP: Add comments and make it less messy.
2338 + *
2339 + */
2340 +
2341 +#ifndef __UNC_RT_DOMAIN_H__
2342 +#define __UNC_RT_DOMAIN_H__
2343 +
2344 +#include <litmus/bheap.h>
2345 +
2346 +#define RELEASE_QUEUE_SLOTS 127 /* prime */
2347 +
2348 +struct _rt_domain;
2349 +
2350 +typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
2351 +typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
2352 +
2353 +struct release_queue {
2354 +	/* each slot maintains a list of release heaps sorted
2355 +	 * by release time */
2356 +	struct list_head		slot[RELEASE_QUEUE_SLOTS];
2357 +};
2358 +
2359 +typedef struct _rt_domain {
2360 +	/* runnable rt tasks are in here */
2361 +	raw_spinlock_t 			ready_lock;
2362 +	struct bheap	 		ready_queue;
2363 +
2364 +	/* real-time tasks waiting for release are in here */
2365 +	raw_spinlock_t 			release_lock;
2366 +	struct release_queue 		release_queue;
2367 +
2368 +#ifdef CONFIG_RELEASE_MASTER
2369 +	int				release_master;
2370 +#endif
2371 +
2372 +	/* for moving tasks to the release queue */
2373 +	raw_spinlock_t			tobe_lock;
2374 +	struct list_head		tobe_released;
2375 +
2376 +	/* how do we check if we need to kick another CPU? */
2377 +	check_resched_needed_t		check_resched;
2378 +
2379 +	/* how do we release jobs? */
2380 +	release_jobs_t			release_jobs;
2381 +
2382 +	/* how are tasks ordered in the ready queue? */
2383 +	bheap_prio_t			order;
2384 +} rt_domain_t;
2385 +
2386 +struct release_heap {
2387 +	/* list_head for per-time-slot list */
2388 +	struct list_head		list;
2389 +	lt_t				release_time;
2390 +	/* all tasks to be released at release_time */
2391 +	struct bheap			heap;
2392 +	/* used to trigger the release */
2393 +	struct hrtimer			timer;
2394 +
2395 +#ifdef CONFIG_RELEASE_MASTER
2396 +	/* used to delegate releases */
2397 +	struct hrtimer_start_on_info	info;
2398 +#endif
2399 +	/* required for the timer callback */
2400 +	rt_domain_t*			dom;
2401 +};
2402 +
2403 +
2404 +static inline struct task_struct* __next_ready(rt_domain_t* rt)
2405 +{
2406 +	struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
2407 +	if (hn)
2408 +		return bheap2task(hn);
2409 +	else
2410 +		return NULL;
2411 +}
2412 +
2413 +void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
2414 +		    check_resched_needed_t check,
2415 +		    release_jobs_t release);
2416 +
2417 +void __add_ready(rt_domain_t* rt, struct task_struct *new);
2418 +void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
2419 +void __add_release(rt_domain_t* rt, struct task_struct *task);
2420 +
2421 +static inline struct task_struct* __take_ready(rt_domain_t* rt)
2422 +{
2423 +	struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
2424 +	if (hn)
2425 +		return bheap2task(hn);
2426 +	else
2427 +		return NULL;
2428 +}
2429 +
2430 +static inline struct task_struct* __peek_ready(rt_domain_t* rt)
2431 +{
2432 +	struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
2433 +	if (hn)
2434 +		return bheap2task(hn);
2435 +	else
2436 +		return NULL;
2437 +}
2438 +
2439 +static inline int  is_queued(struct task_struct *t)
2440 +{
2441 +	BUG_ON(!tsk_rt(t)->heap_node);
2442 +	return bheap_node_in_heap(tsk_rt(t)->heap_node);
2443 +}
2444 +
2445 +static inline void remove(rt_domain_t* rt, struct task_struct *t)
2446 +{
2447 +	bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
2448 +}
2449 +
2450 +static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
2451 +{
2452 +	unsigned long flags;
2453 +	/* first we need the write lock for rt_ready_queue */
2454 +	raw_spin_lock_irqsave(&rt->ready_lock, flags);
2455 +	__add_ready(rt, new);
2456 +	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
2457 +}
2458 +
2459 +static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
2460 +{
2461 +	unsigned long flags;
2462 +	raw_spin_lock_irqsave(&rt->ready_lock, flags);
2463 +	__merge_ready(rt, tasks);
2464 +	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
2465 +}
2466 +
2467 +static inline struct task_struct* take_ready(rt_domain_t* rt)
2468 +{
2469 +	unsigned long flags;
2470 +	struct task_struct* ret;
2471 +	/* first we need the write lock for rt_ready_queue */
2472 +	raw_spin_lock_irqsave(&rt->ready_lock, flags);
2473 +	ret = __take_ready(rt);
2474 +	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
2475 +	return ret;
2476 +}
2477 +
2478 +
2479 +static inline void add_release(rt_domain_t* rt, struct task_struct *task)
2480 +{
2481 +	unsigned long flags;
2482 +	raw_spin_lock_irqsave(&rt->tobe_lock, flags);
2483 +	__add_release(rt, task);
2484 +	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
2485 +}
2486 +
2487 +#ifdef CONFIG_RELEASE_MASTER
2488 +void __add_release_on(rt_domain_t* rt, struct task_struct *task,
2489 +		      int target_cpu);
2490 +
2491 +static inline void add_release_on(rt_domain_t* rt,
2492 +				  struct task_struct *task,
2493 +				  int target_cpu)
2494 +{
2495 +	unsigned long flags;
2496 +	raw_spin_lock_irqsave(&rt->tobe_lock, flags);
2497 +	__add_release_on(rt, task, target_cpu);
2498 +	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
2499 +}
2500 +#endif
2501 +
2502 +static inline int __jobs_pending(rt_domain_t* rt)
2503 +{
2504 +	return !bheap_empty(&rt->ready_queue);
2505 +}
2506 +
2507 +static inline int jobs_pending(rt_domain_t* rt)
2508 +{
2509 +	unsigned long flags;
2510 +	int ret;
2511 +	/* first we need the write lock for rt_ready_queue */
2512 +	raw_spin_lock_irqsave(&rt->ready_lock, flags);
2513 +	ret = !bheap_empty(&rt->ready_queue);
2514 +	raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
2515 +	return ret;
2516 +}
2517 +
2518 +#endif
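The locked wrappers pair with the double-underscore variants: call add_ready()/take_ready()/add_release() when no domain lock is held, and the __-prefixed forms from callbacks that already run under the appropriate lock. A sketch of both paths (demo_* names are hypothetical, assuming the release path already holds ready_lock as in the default release handling):

	/* Release handler: ready queue already protected, so the unlocked
	 * __merge_ready() is appropriate. */
	static void demo_release_jobs(rt_domain_t *rt, struct bheap *tasks)
	{
		__merge_ready(rt, tasks);
		rt->check_resched(rt);
	}

	/* Wake-up path: no lock held yet, so the wrappers are used. */
	static void demo_requeue(rt_domain_t *rt, struct task_struct *t)
	{
		if (is_released(t, litmus_clock()))
			add_ready(rt, t);
		else
			add_release(rt, t);
	}

is_released() and litmus_clock() come from litmus.h.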
2519 diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
2520 new file mode 100644
2521 index 0000000..d6d7991
2522 --- /dev/null
2523 +++ b/include/litmus/rt_param.h
2524 @@ -0,0 +1,209 @@
2525 +/*
2526 + * Definition of the scheduler plugin interface.
2527 + *
2528 + */
2529 +#ifndef _LINUX_RT_PARAM_H_
2530 +#define _LINUX_RT_PARAM_H_
2531 +
2532 +/* Litmus time type. */
2533 +typedef unsigned long long lt_t;
2534 +
2535 +static inline int lt_after(lt_t a, lt_t b)
2536 +{
2537 +	return ((long long) b) - ((long long) a) < 0;
2538 +}
2539 +#define lt_before(a, b) lt_after(b, a)
2540 +
2541 +static inline int lt_after_eq(lt_t a, lt_t b)
2542 +{
2543 +	return ((long long) a) - ((long long) b) >= 0;
2544 +}
2545 +#define lt_before_eq(a, b) lt_after_eq(b, a)
2546 +
2547 +/* different types of clients */
2548 +typedef enum {
2549 +	RT_CLASS_HARD,
2550 +	RT_CLASS_SOFT,
2551 +	RT_CLASS_BEST_EFFORT
2552 +} task_class_t;
2553 +
2554 +typedef enum {
2555 +	NO_ENFORCEMENT,      /* job may overrun unhindered */
2556 +	QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */
2557 +	PRECISE_ENFORCEMENT  /* budgets are enforced with hrtimers */
2558 +} budget_policy_t;
2559 +
2560 +struct rt_task {
2561 +	lt_t 		exec_cost;
2562 +	lt_t 		period;
2563 +	lt_t		phase;
2564 +	unsigned int	cpu;
2565 +	task_class_t	cls;
2566 +	budget_policy_t budget_policy; /* ignored by pfair */
2567 +};
2568 +
2569 +union np_flag {
2570 +	uint32_t raw;
2571 +	struct {
2572 +		/* Is the task currently in a non-preemptive section? */
2573 +		uint32_t flag:31;
2574 +		/* Should the task call into the scheduler? */
2575 +		uint32_t preempt:1;
2576 +	} np;
2577 +};
2578 +
2579 +/* The definition of the data that is shared between the kernel and real-time
2580 + * tasks via a shared page (see litmus/ctrldev.c).
2581 + *
2582 + * WARNING: User space can write to this, so don't trust
2583 + * the correctness of the fields!
2584 + *
2585 + * This serves two purposes: to enable efficient signaling
2586 + * of non-preemptive sections (user->kernel) and
2587 + * delayed preemptions (kernel->user), and to export
2588 + * some real-time relevant statistics such as preemption and
2589 + * migration data to user space. We can't use a device to export
2590 + * statistics because we want to avoid system call overhead when
2591 + * determining preemption/migration overheads.
2592 + */
2593 +struct control_page {
2594 +	volatile union np_flag sched;
2595 +
2596 +	/* to be extended */
2597 +};
2598 +
2599 +/* don't export internal data structures to user space (liblitmus) */
2600 +#ifdef __KERNEL__
2601 +
2602 +struct _rt_domain;
2603 +struct bheap_node;
2604 +struct release_heap;
2605 +
2606 +struct rt_job {
2607 +	/* Time instant the job was or will be released.  */
2608 +	lt_t	release;
2609 +	/* What is the current deadline? */
2610 +	lt_t   	deadline;
2611 +
2612 +	/* How much service has this job received so far? */
2613 +	lt_t	exec_time;
2614 +
2615 +	/* Which job is this? This is used to let user space
2616 +	 * specify which job to wait for, which is important if jobs
2617 +	 * overrun. If we just call sys_sleep_next_period() then we
2618 +	 * will unintentionally miss jobs after an overrun.
2619 +	 *
2620 +	 * Increase this sequence number when a job is released.
2621 +	 */
2622 +	unsigned int    job_no;
2623 +};
2624 +
2625 +struct pfair_param;
2626 +
2627 +/*	RT task parameters for scheduling extensions
2628 + *	These parameters are inherited during clone and therefore must
2629 + *	be explicitly set up before the task set is launched.
2630 + */
2631 +struct rt_param {
2632 +	/* is the task sleeping? */
2633 +	unsigned int 		flags:8;
2634 +
2635 +	/* do we need to check for srp blocking? */
2636 +	unsigned int		srp_non_recurse:1;
2637 +
2638 +	/* is the task present? (true if it can be scheduled) */
2639 +	unsigned int		present:1;
2640 +
2641 +#ifdef CONFIG_LITMUS_LOCKING
2642 +	/* Is the task being priority-boosted by a locking protocol? */
2643 +	unsigned int		priority_boosted:1;
2644 +	/* If so, when did this start? */
2645 +	lt_t			boost_start_time;
2646 +#endif
2647 +
2648 +	/* user controlled parameters */
2649 +	struct rt_task 		task_params;
2650 +
2651 +	/* timing parameters */
2652 +	struct rt_job 		job_params;
2653 +
2654 +	/* task representing the current "inherited" task
2655 +	 * priority, assigned by inherit_priority and
2656 +	 * return priority in the scheduler plugins.
2657 +	 * could point to self if PI does not result in
2658 +	 * an increased task priority.
2659 +	 */
2660 +	 struct task_struct*	inh_task;
2661 +
2662 +#ifdef CONFIG_NP_SECTION
2663 +	/* For the FMLP under PSN-EDF, it is required to make the task
2664 +	 * non-preemptive from kernel space. In order not to interfere with
2665 +	 * user space, this counter indicates the kernel space np setting.
2666 +	 * kernel_np > 0 => task is non-preemptive
2667 +	 */
2668 +	unsigned int	kernel_np;
2669 +#endif
2670 +
2671 +	/* This field can be used by plugins to store where the task
2672 +	 * is currently scheduled. It is the responsibility of the
2673 +	 * plugin to avoid race conditions.
2674 +	 *
2675 +	 * This is used by GSN-EDF and PFAIR.
2676 +	 */
2677 +	volatile int		scheduled_on;
2678 +
2679 +	/* Is the stack of the task currently in use? This is updated by
2680 +	 * the LITMUS core.
2681 +	 *
2682 +	 * Be careful to avoid deadlocks!
2683 +	 */
2684 +	volatile int		stack_in_use;
2685 +
2686 +	/* This field can be used by plugins to store where the task
2687 +	 * is currently linked. It is the responsibility of the plugin
2688 +	 * to avoid race conditions.
2689 +	 *
2690 +	 * Used by GSN-EDF.
2691 +	 */
2692 +	volatile int		linked_on;
2693 +
2694 +	/* PFAIR/PD^2 state. Allocated on demand. */
2695 +	struct pfair_param*	pfair;
2696 +
2697 +	/* Fields saved before BE->RT transition.
2698 +	 */
2699 +	int old_policy;
2700 +	int old_prio;
2701 +
2702 +	/* ready queue for this task */
2703 +	struct _rt_domain* domain;
2704 +
2705 +	/* heap element for this task
2706 +	 *
2707 +	 * Warning: Don't statically allocate this node. The heap
2708 +	 *          implementation swaps these between tasks, thus after
2709 +	 *          dequeuing from a heap you may end up with a different node
2710 +	 *          than the one you had when enqueuing the task.  For the same
2711 +	 *          reason, don't obtain and store references to this node
2712 +	 *          other than this pointer (which is updated by the heap
2713 +	 *          implementation).
2714 +	 */
2715 +	struct bheap_node*	heap_node;
2716 +	struct release_heap*	rel_heap;
2717 +
2718 +	/* Used by rt_domain to queue task in release list.
2719 +	 */
2720 +	struct list_head list;
2721 +
2722 +	/* Pointer to the page shared between userspace and kernel. */
2723 +	struct control_page * ctrl_page;
2724 +};
2725 +
2726 +/*	Possible RT flags	*/
2727 +#define RT_F_RUNNING		0x00000000
2728 +#define RT_F_SLEEP		0x00000001
2729 +#define RT_F_EXIT_SEM		0x00000008
2730 +
2731 +#endif
2732 +
2733 +#endif
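Two details worth illustrating: the lt_t comparisons use signed subtraction so they remain correct across wrap-around, and union np_flag lets flag and preempt be read or cmpxchg'd together as one 32-bit word (which litmus.h relies on in request_exit_np_atomic()). A short sketch (demo_* is hypothetical):

	static void demo_np_check(struct control_page *cp)
	{
		lt_t now  = litmus_clock();
		lt_t soon = now + 1000000;	/* 1 ms later */

		/* holds even if "soon" numerically wrapped past 2^64 - 1 */
		BUG_ON(!lt_before(now, soon));

		if (cp->sched.np.flag && !cp->sched.np.preempt)
			TRACE_CUR("in a non-preemptive section, "
				  "not yet asked to yield\n");
	}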
2734 diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
2735 new file mode 100644
2736 index 0000000..6e7cabd
2737 --- /dev/null
2738 +++ b/include/litmus/sched_plugin.h
2739 @@ -0,0 +1,111 @@
2740 +/*
2741 + * Definition of the scheduler plugin interface.
2742 + *
2743 + */
2744 +#ifndef _LINUX_SCHED_PLUGIN_H_
2745 +#define _LINUX_SCHED_PLUGIN_H_
2746 +
2747 +#include <linux/sched.h>
2748 +
2749 +#ifdef CONFIG_LITMUS_LOCKING
2750 +#include <litmus/locking.h>
2751 +#endif
2752 +
2753 +/************************ setup/tear down ********************/
2754 +
2755 +typedef long (*activate_plugin_t) (void);
2756 +typedef long (*deactivate_plugin_t) (void);
2757 +
2758 +
2759 +
2760 +/********************* scheduler invocation ******************/
2761 +
2762 +/*  Plugin-specific realtime tick handler */
2763 +typedef void (*scheduler_tick_t) (struct task_struct *cur);
2764 +/* Pick the next task to execute (the plugin's main scheduling decision). */
2765 +typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
2766 +/* Clean up after the task switch has occurred.
2767 + * This function is called after every (even non-rt) task switch.
2768 + */
2769 +typedef void (*finish_switch_t)(struct task_struct *prev);
2770 +
2771 +
2772 +/********************* task state changes ********************/
2773 +
2774 +/* Called to setup a new real-time task.
2775 + * Release the first job, enqueue, etc.
2776 + * Task may already be running.
2777 + */
2778 +typedef void (*task_new_t) (struct task_struct *task,
2779 +			    int on_rq,
2780 +			    int running);
2781 +
2782 +/* Called to re-introduce a task after blocking.
2783 + * Can potentially be called multiple times.
2784 + */
2785 +typedef void (*task_wake_up_t) (struct task_struct *task);
2786 +/* Called to notify the plugin of a blocking real-time task.
2787 + * It will only be called for real-time tasks and before schedule() is called. */
2788 +typedef void (*task_block_t)  (struct task_struct *task);
2789 +/* Called when a real-time task exits or changes to a different scheduling
2790 + * class.
2791 + * Free any allocated resources
2792 + */
2793 +typedef void (*task_exit_t)    (struct task_struct *);
2794 +
2795 +/* Called when the current task attempts to create a new lock of a given
2796 + * protocol type. */
2797 +typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
2798 +				 void* __user config);
2799 +
2800 +
2801 +/********************* sys call backends  ********************/
2802 +/* This function causes the caller to sleep until the next release */
2803 +typedef long (*complete_job_t) (void);
2804 +
2805 +typedef long (*admit_task_t)(struct task_struct* tsk);
2806 +
2807 +typedef void (*release_at_t)(struct task_struct *t, lt_t start);
2808 +
2809 +struct sched_plugin {
2810 +	struct list_head	list;
2811 +	/* 	basic info 		*/
2812 +	char 			*plugin_name;
2813 +
2814 +	/*	setup			*/
2815 +	activate_plugin_t	activate_plugin;
2816 +	deactivate_plugin_t	deactivate_plugin;
2817 +
2818 +	/* 	scheduler invocation 	*/
2819 +	scheduler_tick_t        tick;
2820 +	schedule_t 		schedule;
2821 +	finish_switch_t 	finish_switch;
2822 +
2823 +	/*	syscall backend 	*/
2824 +	complete_job_t 		complete_job;
2825 +	release_at_t		release_at;
2826 +
2827 +	/*	task state changes 	*/
2828 +	admit_task_t		admit_task;
2829 +
2830 +        task_new_t 		task_new;
2831 +	task_wake_up_t		task_wake_up;
2832 +	task_block_t		task_block;
2833 +	task_exit_t 		task_exit;
2834 +
2835 +#ifdef CONFIG_LITMUS_LOCKING
2836 +	/*	locking protocols	*/
2837 +	allocate_lock_t		allocate_lock;
2838 +#endif
2839 +} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
2840 +
2841 +
2842 +extern struct sched_plugin *litmus;
2843 +
2844 +int register_sched_plugin(struct sched_plugin* plugin);
2845 +struct sched_plugin* find_sched_plugin(const char* name);
2846 +int print_sched_plugins(char* buf, int max);
2847 +
2848 +extern struct sched_plugin linux_sched_plugin;
2849 +
2850 +#endif
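A plugin is a statically allocated struct sched_plugin handed to register_sched_plugin(); it is later activated by name through the LITMUS^RT /proc interface added elsewhere in this patch. A skeleton sketch that only fills in the two callbacks shown (real plugins implement the full set; demo_* names are made up):

	static struct task_struct *demo_noop_schedule(struct task_struct *prev)
	{
		sched_state_task_picked();
		return NULL;	/* let a lower scheduling class run */
	}

	static long demo_admit_task(struct task_struct *tsk)
	{
		return 0;	/* accept every task */
	}

	static struct sched_plugin demo_plugin = {
		.plugin_name = "DEMO",
		.schedule    = demo_noop_schedule,
		.admit_task  = demo_admit_task,
	};

	static int __init demo_plugin_register(void)
	{
		return register_sched_plugin(&demo_plugin);
	}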
2851 diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
2852 new file mode 100644
2853 index 0000000..7ca34cb
2854 --- /dev/null
2855 +++ b/include/litmus/sched_trace.h
2856 @@ -0,0 +1,200 @@
2857 +/*
2858 + * sched_trace.h -- record scheduler events to a byte stream for offline analysis.
2859 + */
2860 +#ifndef _LINUX_SCHED_TRACE_H_
2861 +#define _LINUX_SCHED_TRACE_H_
2862 +
2863 +/* all times in nanoseconds */
2864 +
2865 +struct st_trace_header {
2866 +	u8	type;		/* Of what type is this record?  */
2867 +	u8	cpu;		/* On which CPU was it recorded? */
2868 +	u16	pid;		/* PID of the task.              */
2869 +	u32	job;		/* The job sequence number.      */
2870 +};
2871 +
2872 +#define ST_NAME_LEN 16
2873 +struct st_name_data {
2874 +	char	cmd[ST_NAME_LEN];/* The name of the executable of this process. */
2875 +};
2876 +
2877 +struct st_param_data {		/* regular params */
2878 +	u32	wcet;
2879 +	u32	period;
2880 +	u32	phase;
2881 +	u8	partition;
2882 +	u8	class;
2883 +	u8	__unused[2];
2884 +};
2885 +
2886 +struct st_release_data {	/* A job was/is going to be released. */
2887 +	u64	release;	/* What's the release time?              */
2888 +	u64	deadline;	/* By when must it finish?		 */
2889 +};
2890 +
2891 +struct st_assigned_data {	/* A job was assigned to a CPU. 	 */
2892 +	u64	when;
2893 +	u8	target;		/* Where should it execute?	         */
2894 +	u8	__unused[7];
2895 +};
2896 +
2897 +struct st_switch_to_data {	/* A process was switched to on a given CPU.   */
2898 +	u64	when;		/* When did this occur?                        */
2899 +	u32	exec_time;	/* Time the current job has executed.          */
2900 +	u8	__unused[4];
2901 +
2902 +};
2903 +
2904 +struct st_switch_away_data {	/* A process was switched away from on a given CPU. */
2905 +	u64	when;
2906 +	u64	exec_time;
2907 +};
2908 +
2909 +struct st_completion_data {	/* A job completed. */
2910 +	u64	when;
2911 +	u8	forced:1; 	/* Set to 1 if job overran and kernel advanced to the
2912 +				 * next task automatically; set to 0 otherwise.
2913 +				 */
2914 +	u8	__uflags:7;
2915 +	u8	__unused[7];
2916 +};
2917 +
2918 +struct st_block_data {		/* A task blocks. */
2919 +	u64	when;
2920 +	u64	__unused;
2921 +};
2922 +
2923 +struct st_resume_data {		/* A task resumes. */
2924 +	u64	when;
2925 +	u64	__unused;
2926 +};
2927 +
2928 +struct st_action_data {
2929 +	u64	when;
2930 +	u8	action;
2931 +	u8	__unused[7];
2932 +};
2933 +
2934 +struct st_sys_release_data {
2935 +	u64	when;
2936 +	u64	release;
2937 +};
2938 +
2939 +#define DATA(x) struct st_ ## x ## _data x;
2940 +
2941 +typedef enum {
2942 +        ST_NAME = 1,		/* Start at one, so that we can spot
2943 +				 * uninitialized records. */
2944 +	ST_PARAM,
2945 +	ST_RELEASE,
2946 +	ST_ASSIGNED,
2947 +	ST_SWITCH_TO,
2948 +	ST_SWITCH_AWAY,
2949 +	ST_COMPLETION,
2950 +	ST_BLOCK,
2951 +	ST_RESUME,
2952 +	ST_ACTION,
2953 +	ST_SYS_RELEASE
2954 +} st_event_record_type_t;
2955 +
2956 +struct st_event_record {
2957 +	struct st_trace_header hdr;
2958 +	union {
2959 +		u64 raw[2];
2960 +
2961 +		DATA(name);
2962 +		DATA(param);
2963 +		DATA(release);
2964 +		DATA(assigned);
2965 +		DATA(switch_to);
2966 +		DATA(switch_away);
2967 +		DATA(completion);
2968 +		DATA(block);
2969 +		DATA(resume);
2970 +		DATA(action);
2971 +		DATA(sys_release);
2972 +	} data;
2973 +};
2974 +
2975 +#undef DATA
2976 +
2977 +#ifdef __KERNEL__
2978 +
2979 +#include <linux/sched.h>
2980 +#include <litmus/feather_trace.h>
2981 +
2982 +#ifdef CONFIG_SCHED_TASK_TRACE
2983 +
2984 +#define SCHED_TRACE(id, callback, task) \
2985 +	ft_event1(id, callback, task)
2986 +#define SCHED_TRACE2(id, callback, task, xtra) \
2987 +	ft_event2(id, callback, task, xtra)
2988 +
2989 +/* provide prototypes; needed on sparc64 */
2990 +#ifndef NO_TASK_TRACE_DECLS
2991 +feather_callback void do_sched_trace_task_name(unsigned long id,
2992 +					       struct task_struct* task);
2993 +feather_callback void do_sched_trace_task_param(unsigned long id,
2994 +						struct task_struct* task);
2995 +feather_callback void do_sched_trace_task_release(unsigned long id,
2996 +						  struct task_struct* task);
2997 +feather_callback void do_sched_trace_task_switch_to(unsigned long id,
2998 +						    struct task_struct* task);
2999 +feather_callback void do_sched_trace_task_switch_away(unsigned long id,
3000 +						      struct task_struct* task);
3001 +feather_callback void do_sched_trace_task_completion(unsigned long id,
3002 +						     struct task_struct* task,
3003 +						     unsigned long forced);
3004 +feather_callback void do_sched_trace_task_block(unsigned long id,
3005 +						struct task_struct* task);
3006 +feather_callback void do_sched_trace_task_resume(unsigned long id,
3007 +						 struct task_struct* task);
3008 +feather_callback void do_sched_trace_action(unsigned long id,
3009 +					    struct task_struct* task,
3010 +					    unsigned long action);
3011 +feather_callback void do_sched_trace_sys_release(unsigned long id,
3012 +						 lt_t* start);
3013 +
3014 +#endif
3015 +
3016 +#else
3017 +
3018 +#define SCHED_TRACE(id, callback, task)        /* no tracing */
3019 +#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
3020 +
3021 +#endif
3022 +
3023 +
3024 +#define SCHED_TRACE_BASE_ID 500
3025 +
3026 +
3027 +#define sched_trace_task_name(t) \
3028 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t)
3029 +#define sched_trace_task_param(t) \
3030 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t)
3031 +#define sched_trace_task_release(t) \
3032 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t)
3033 +#define sched_trace_task_switch_to(t) \
3034 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t)
3035 +#define sched_trace_task_switch_away(t) \
3036 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t)
3037 +#define sched_trace_task_completion(t, forced) \
3038 +	SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, \
3039 +		     (unsigned long) forced)
3040 +#define sched_trace_task_block(t) \
3041 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t)
3042 +#define sched_trace_task_resume(t) \
3043 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t)
3044 +#define sched_trace_action(t, action) \
3045 +	SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, do_sched_trace_action, t, \
3046 +		     (unsigned long) action);
3047 +/* when is a pointer, it does not need an explicit cast to unsigned long */
3048 +#define sched_trace_sys_release(when) \
3049 +	SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, do_sched_trace_sys_release, when)
3050 +
3051 +
3052 +#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
3053 +
3054 +#endif /* __KERNEL__ */
3055 +
3056 +#endif
3057 diff --git a/include/litmus/srp.h b/include/litmus/srp.h
3058 new file mode 100644
3059 index 0000000..c9a4552
3060 --- /dev/null
3061 +++ b/include/litmus/srp.h
3062 @@ -0,0 +1,28 @@
3063 +#ifndef LITMUS_SRP_H
3064 +#define LITMUS_SRP_H
3065 +
3066 +struct srp_semaphore;
3067 +
3068 +struct srp_priority {
3069 +	struct list_head	list;
3070 +        unsigned int 		priority;
3071 +	pid_t			pid;
3072 +};
3073 +#define list2prio(l) list_entry(l, struct srp_priority, list)
3074 +
3075 +/* struct for uniprocessor SRP "semaphore" */
3076 +struct srp_semaphore {
3077 +	struct litmus_lock litmus_lock;
3078 +	struct srp_priority ceiling;
3079 +	struct task_struct* owner;
3080 +	int cpu; /* cpu associated with this "semaphore" and resource */
3081 +};
3082 +
3083 +/* map a task to its SRP preemption level priority */
3084 +typedef unsigned int (*srp_prioritization_t)(struct task_struct* t);
3085 +/* Must be updated by each plugin that uses SRP.*/
3086 +extern srp_prioritization_t get_srp_prio;
3087 +
3088 +struct srp_semaphore* allocate_srp_semaphore(void);
3089 +
3090 +#endif
3091 diff --git a/include/litmus/trace.h b/include/litmus/trace.h
3092 new file mode 100644
3093 index 0000000..e809376
3094 --- /dev/null
3095 +++ b/include/litmus/trace.h
3096 @@ -0,0 +1,116 @@
3097 +#ifndef _SYS_TRACE_H_
3098 +#define	_SYS_TRACE_H_
3099 +
3100 +#ifdef CONFIG_SCHED_OVERHEAD_TRACE
3101 +
3102 +#include <litmus/feather_trace.h>
3103 +#include <litmus/feather_buffer.h>
3104 +
3105 +
3106 +/*********************** TIMESTAMPS ************************/
3107 +
3108 +enum task_type_marker {
3109 +	TSK_BE,
3110 +	TSK_RT,
3111 +	TSK_UNKNOWN
3112 +};
3113 +
3114 +struct timestamp {
3115 +	uint64_t		timestamp;
3116 +	uint32_t		seq_no;
3117 +	uint8_t			cpu;
3118 +	uint8_t			event;
3119 +	uint8_t			task_type:2;
3120 +	uint8_t			irq_flag:1;
3121 +	uint8_t			irq_count:5;
3122 +};
3123 +
3124 +/* tracing callbacks */
3125 +feather_callback void save_timestamp(unsigned long event);
3126 +feather_callback void save_timestamp_def(unsigned long event, unsigned long type);
3127 +feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
3128 +feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu);
3129 +feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr);
3130 +
3131 +#define TIMESTAMP(id) ft_event0(id, save_timestamp)
3132 +
3133 +#define DTIMESTAMP(id, def)  ft_event1(id, save_timestamp_def, (unsigned long) def)
3134 +
3135 +#define TTIMESTAMP(id, task) \
3136 +	ft_event1(id, save_timestamp_task, (unsigned long) task)
3137 +
3138 +#define CTIMESTAMP(id, cpu) \
3139 +	ft_event1(id, save_timestamp_cpu, (unsigned long) cpu)
3140 +
3141 +#define LTIMESTAMP(id, task) \
3142 +	ft_event1(id, save_task_latency, (unsigned long) task)
3143 +
3144 +#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
3145 +
3146 +#define TIMESTAMP(id)        /* no tracing */
3147 +
3148 +#define DTIMESTAMP(id, def)  /* no tracing */
3149 +
3150 +#define TTIMESTAMP(id, task) /* no tracing */
3151 +
3152 +#define CTIMESTAMP(id, cpu)  /* no tracing */
3153 +
3154 +#define LTIMESTAMP(id, when_ptr) /* no tracing */
3155 +
3156 +#endif
3157 +
3158 +
3159 +/* Convention for timestamps
3160 + * =========================
3161 + *
3162 + * In order to process the trace files with a common tool, we use the following
3163 + * convention to measure execution times: The end time id of a code segment is
3164 + * always the next number after the start time event id.
3165 + */
3166 +
3167 +
3168 +
3169 +#define TS_SCHED_START			DTIMESTAMP(100, TSK_UNKNOWN) /* we only
3170 +								      * care
3171 +								      * about
3172 +								      * next */
3173 +#define TS_SCHED_END(t)			TTIMESTAMP(101, t)
3174 +#define TS_SCHED2_START(t) 		TTIMESTAMP(102, t)
3175 +#define TS_SCHED2_END(t)       		TTIMESTAMP(103, t)
3176 +
3177 +#define TS_CXS_START(t)			TTIMESTAMP(104, t)
3178 +#define TS_CXS_END(t)			TTIMESTAMP(105, t)
3179 +
3180 +#define TS_RELEASE_START		DTIMESTAMP(106, TSK_RT)
3181 +#define TS_RELEASE_END			DTIMESTAMP(107, TSK_RT)
3182 +
3183 +#define TS_TICK_START(t)		TTIMESTAMP(110, t)
3184 +#define TS_TICK_END(t) 			TTIMESTAMP(111, t)
3185 +
3186 +
3187 +#define TS_PLUGIN_SCHED_START		/* TIMESTAMP(120) */  /* currently unused */
3188 +#define TS_PLUGIN_SCHED_END		/* TIMESTAMP(121) */
3189 +
3190 +#define TS_PLUGIN_TICK_START		/* TIMESTAMP(130) */
3191 +#define TS_PLUGIN_TICK_END		/* TIMESTAMP(131) */
3192 +
3193 +#define TS_ENTER_NP_START		TIMESTAMP(140)
3194 +#define TS_ENTER_NP_END			TIMESTAMP(141)
3195 +
3196 +#define TS_EXIT_NP_START		TIMESTAMP(150)
3197 +#define TS_EXIT_NP_END			TIMESTAMP(151)
3198 +
3199 +#define TS_LOCK_START			TIMESTAMP(170)
3200 +#define TS_LOCK_SUSPEND			TIMESTAMP(171)
3201 +#define TS_LOCK_RESUME			TIMESTAMP(172)
3202 +#define TS_LOCK_END			TIMESTAMP(173)
3203 +
3204 +#define TS_UNLOCK_START			TIMESTAMP(180)
3205 +#define TS_UNLOCK_END			TIMESTAMP(181)
3206 +
3207 +#define TS_SEND_RESCHED_START(c)	CTIMESTAMP(190, c)
3208 +#define TS_SEND_RESCHED_END		DTIMESTAMP(191, TSK_UNKNOWN)
3209 +
3210 +#define TS_RELEASE_LATENCY(when)	LTIMESTAMP(208, &(when))
3211 +
3212 +#endif /* !_SYS_TRACE_H_ */
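Following the convention above, an overhead is measured by bracketing a code path with a start/end pair whose event ids differ by one (100/101 for scheduling, 104/105 for context switches, and so on); the userspace tools re-pair the timestamps by id, CPU, and sequence number. A sketch of instrumenting a locking path with the 170/173 pair (demo_* is hypothetical; the 171/172 suspend/resume stamps would be emitted inside the lock implementation around the actual sleep):

	static int demo_instrumented_lock(struct litmus_lock *l)
	{
		int err;

		TS_LOCK_START;
		err = l->ops->lock(l);	/* may suspend internally */
		TS_LOCK_END;
		return err;
	}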
3213 diff --git a/include/litmus/trace_irq.h b/include/litmus/trace_irq.h
3214 new file mode 100644
3215 index 0000000..f18b127
3216 --- /dev/null
3217 +++ b/include/litmus/trace_irq.h
3218 @@ -0,0 +1,21 @@
3219 +#ifndef _LITMUS_TRACE_IRQ_H_
3220 +#define	_LITMUS_TRACE_IRQ_H_
3221 +
3222 +#ifdef CONFIG_SCHED_OVERHEAD_TRACE
3223 +
3224 +extern DEFINE_PER_CPU(atomic_t, irq_fired_count);
3225 +
3226 +static inline void ft_irq_fired(void)
3227 +{
3228 +	/* Only called with preemptions disabled.  */
3229 +	atomic_inc(&__get_cpu_var(irq_fired_count));
3230 +}
3231 +
3232 +
3233 +#else
3234 +
3235 +#define ft_irq_fired() /* nothing to do */
3236 +
3237 +#endif
3238 +
3239 +#endif
3240 diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
3241 new file mode 100644
3242 index 0000000..94264c2
3243 --- /dev/null
3244 +++ b/include/litmus/unistd_32.h
3245 @@ -0,0 +1,21 @@
3246 +/*
3247 + * included from arch/x86/include/asm/unistd_32.h
3248 + *
3249 + * LITMUS^RT syscalls with "relative" numbers
3250 + */
3251 +#define __LSC(x) (__NR_LITMUS + x)
3252 +
3253 +#define __NR_set_rt_task_param	__LSC(0)
3254 +#define __NR_get_rt_task_param	__LSC(1)
3255 +#define __NR_complete_job	__LSC(2)
3256 +#define __NR_od_open		__LSC(3)
3257 +#define __NR_od_close		__LSC(4)
3258 +#define __NR_litmus_lock       	__LSC(5)
3259 +#define __NR_litmus_unlock	__LSC(6)
3260 +#define __NR_query_job_no	__LSC(7)
3261 +#define __NR_wait_for_job_release __LSC(8)
3262 +#define __NR_wait_for_ts_release __LSC(9)
3263 +#define __NR_release_ts		__LSC(10)
3264 +#define __NR_null_call		__LSC(11)
3265 +
3266 +#define NR_litmus_syscalls 12
3267 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
3268 new file mode 100644
3269 index 0000000..d5ced0d
3270 --- /dev/null
3271 +++ b/include/litmus/unistd_64.h
3272 @@ -0,0 +1,33 @@
3273 +/*
3274 + * included from arch/x86/include/asm/unistd_64.h
3275 + *
3276 + * LITMUS^RT syscalls with "relative" numbers
3277 + */
3278 +#define __LSC(x) (__NR_LITMUS + x)
3279 +
3280 +#define __NR_set_rt_task_param			__LSC(0)
3281 +__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param)
3282 +#define __NR_get_rt_task_param			__LSC(1)
3283 +__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param)
3284 +#define __NR_complete_job	  		__LSC(2)
3285 +__SYSCALL(__NR_complete_job, sys_complete_job)
3286 +#define __NR_od_open				__LSC(3)
3287 +__SYSCALL(__NR_od_open, sys_od_open)
3288 +#define __NR_od_close				__LSC(4)
3289 +__SYSCALL(__NR_od_close, sys_od_close)
3290 +#define __NR_litmus_lock	       		__LSC(5)
3291 +__SYSCALL(__NR_litmus_lock, sys_litmus_lock)
3292 +#define __NR_litmus_unlock	       		__LSC(6)
3293 +__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock)
3294 +#define __NR_query_job_no			__LSC(7)
3295 +__SYSCALL(__NR_query_job_no, sys_query_job_no)
3296 +#define __NR_wait_for_job_release		__LSC(8)
3297 +__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release)
3298 +#define __NR_wait_for_ts_release		__LSC(9)
3299 +__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
3300 +#define __NR_release_ts				__LSC(10)
3301 +__SYSCALL(__NR_release_ts, sys_release_ts)
3302 +#define __NR_null_call				__LSC(11)
3303 +__SYSCALL(__NR_null_call, sys_null_call)
3304 +
3305 +#define NR_litmus_syscalls 12
3306 diff --git a/kernel/exit.c b/kernel/exit.c
3307 index f2b321b..64879bd 100644
3308 --- a/kernel/exit.c
3309 +++ b/kernel/exit.c
3310 @@ -57,6 +57,8 @@
3311  #include <asm/pgtable.h>
3312  #include <asm/mmu_context.h>
3313  
3314 +extern void exit_od_table(struct task_struct *t);
3315 +
3316  static void exit_mm(struct task_struct * tsk);
3317  
3318  static void __unhash_process(struct task_struct *p, bool group_dead)
3319 @@ -980,6 +982,8 @@ NORET_TYPE void do_exit(long code)
3320  	if (unlikely(tsk->audit_context))
3321  		audit_free(tsk);
3322  
3323 +	exit_od_table(tsk);
3324 +
3325  	tsk->exit_code = code;
3326  	taskstats_exit(tsk, group_dead);
3327  
3328 diff --git a/kernel/fork.c b/kernel/fork.c
3329 index 0276c30..25c6111 100644
3330 --- a/kernel/fork.c
3331 +++ b/kernel/fork.c
3332 @@ -77,6 +77,9 @@
3333  
3334  #include <trace/events/sched.h>
3335  
3336 +#include <litmus/litmus.h>
3337 +#include <litmus/sched_plugin.h>
3338 +
3339  /*
3340   * Protected counters by write_lock_irq(&tasklist_lock)
3341   */
3342 @@ -191,6 +194,7 @@ void __put_task_struct(struct task_struct *tsk)
3343  	WARN_ON(atomic_read(&tsk->usage));
3344  	WARN_ON(tsk == current);
3345  
3346 +	exit_litmus(tsk);
3347  	exit_creds(tsk);
3348  	delayacct_tsk_free(tsk);
3349  	put_signal_struct(tsk->signal);
3350 @@ -275,6 +279,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
3351  
3352  	tsk->stack = ti;
3353  
3354 +	/* Don't let the new task be a real-time task. */
3355 +	litmus_fork(tsk);
3356 +
3357  	err = prop_local_init_single(&tsk->dirties);
3358  	if (err)
3359  		goto out;
3360 diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
3361 index a9205e3..11e8969 100644
3362 --- a/kernel/hrtimer.c
3363 +++ b/kernel/hrtimer.c
3364 @@ -46,6 +46,8 @@
3365  #include <linux/sched.h>
3366  #include <linux/timer.h>
3367  
3368 +#include <litmus/litmus.h>
3369 +
3370  #include <asm/uaccess.h>
3371  
3372  #include <trace/events/timer.h>
3373 @@ -1026,6 +1028,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
3374  }
3375  EXPORT_SYMBOL_GPL(hrtimer_start);
3376  
3377 +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
3378 +
3379 +/**
3380 + * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info
3381 + */
3382 +void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info)
3383 +{
3384 +	memset(info, 0, sizeof(struct hrtimer_start_on_info));
3385 +	atomic_set(&info->state, HRTIMER_START_ON_INACTIVE);
3386 +}
3387 +
3388 +/**
3389 + *  hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
3390 + */
3391 +void hrtimer_pull(void)
3392 +{
3393 +	struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
3394 +	struct hrtimer_start_on_info *info;
3395 +	struct list_head *pos, *safe, list;
3396 +
3397 +	raw_spin_lock(&base->lock);
3398 +	list_replace_init(&base->to_pull, &list);
3399 +	raw_spin_unlock(&base->lock);
3400 +
3401 +	list_for_each_safe(pos, safe, &list) {
3402 +		info = list_entry(pos, struct hrtimer_start_on_info, list);
3403 +		TRACE("pulled timer 0x%x\n", info->timer);
3404 +		list_del(pos);
3405 +		hrtimer_start(info->timer, info->time, info->mode);
3406 +	}
3407 +}
3408 +
3409 +/**
3410 + *  hrtimer_start_on - trigger timer arming on remote cpu
3411 + *  @cpu:	remote cpu
3412 + *  @info:	save timer information for enqueuing on remote cpu
3413 + *  @timer:	timer to be pulled
3414 + *  @time:	expire time
3415 + *  @mode:	timer mode
3416 + */
3417 +int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
3418 +		struct hrtimer *timer, ktime_t time,
3419 +		const enum hrtimer_mode mode)
3420 +{
3421 +	unsigned long flags;
3422 +	struct hrtimer_cpu_base* base;
3423 +	int in_use = 0, was_empty;
3424 +
3425 +	/* serialize access to info through the timer base */
3426 +	lock_hrtimer_base(timer, &flags);
3427 +
3428 +	in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
3429 +	if (!in_use) {
3430 +		INIT_LIST_HEAD(&info->list);
3431 +		info->timer = timer;
3432 +		info->time  = time;
3433 +		info->mode  = mode;
3434 +		/* mark as in use */
3435 +		atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
3436 +	}
3437 +
3438 +	unlock_hrtimer_base(timer, &flags);
3439 +
3440 +	if (!in_use) {
3441 +		/* initiate pull  */
3442 +		preempt_disable();
3443 +		if (cpu == smp_processor_id()) {
3444 +			/* start timer locally; we may get called
3445 +			 * with rq->lock held, do not wake up anything
3446 +			 */
3447 +			TRACE("hrtimer_start_on: starting on local CPU\n");
3448 +			__hrtimer_start_range_ns(info->timer, info->time,
3449 +						 0, info->mode, 0);
3450 +		} else {
3451 +			TRACE("hrtimer_start_on: pulling to remote CPU\n");
3452 +			base = &per_cpu(hrtimer_bases, cpu);
3453 +			raw_spin_lock_irqsave(&base->lock, flags);
3454 +			was_empty = list_empty(&base->to_pull);
3455 +			list_add(&info->list, &base->to_pull);
3456 +			raw_spin_unlock_irqrestore(&base->lock, flags);
3457 +			if (was_empty)
3458 +				/* only send an IPI if no one else
3459 +				 * has done so already
3460 +				 */
3461 +				smp_send_pull_timers(cpu);
3462 +		}
3463 +		preempt_enable();
3464 +	}
3465 +	return in_use;
3466 +}
3467 +
3468 +#endif
3469  
3470  /**
3471   * hrtimer_try_to_cancel - try to deactivate a timer
3472 @@ -1625,6 +1719,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
3473  	}
3474  
3475  	hrtimer_init_hres(cpu_base);
3476 +	INIT_LIST_HEAD(&cpu_base->to_pull);
3477  }
3478  
3479  #ifdef CONFIG_HOTPLUG_CPU
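
A brief sketch (not part of the patch) of how a caller might use the new interface to arm a release timer on a designated CPU. The container struct and function below are hypothetical; in the actual LITMUS^RT sources the hrtimer_start_on_info is embedded in the caller's own state, e.g. in the release-queue handling of litmus/rt_domain.c.

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	/* Hypothetical container used only for this example. */
	struct example_release {
		struct hrtimer timer;                /* set up elsewhere with hrtimer_init() */
		struct hrtimer_start_on_info info;
	};

	static int example_arm_on_cpu(struct example_release *rel, int cpu, u64 when_ns)
	{
		hrtimer_start_on_info_init(&rel->info);

		/* A non-zero return value means the info record was still in use,
		 * i.e., an earlier request has not yet been processed by the
		 * target CPU, and the timer was therefore not (re)armed. */
		return hrtimer_start_on(cpu, &rel->info, &rel->timer,
					ns_to_ktime(when_ns), HRTIMER_MODE_ABS_PINNED);
	}
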
3480 diff --git a/kernel/printk.c b/kernel/printk.c
3481 index 3518539..b799a2e 100644
3482 --- a/kernel/printk.c
3483 +++ b/kernel/printk.c
3484 @@ -70,6 +70,13 @@ int console_printk[4] = {
3485  };
3486  
3487  /*
3488 + * divert printk() messages when there is a LITMUS^RT debug listener
3489 + */
3490 +#include <litmus/litmus.h>
3491 +int trace_override = 0;
3492 +int trace_recurse  = 0;
3493 +
3494 +/*
3495   * Low level drivers may need that to know if they can schedule in
3496   * their unblank() callback or not. So let's export it.
3497   */
3498 @@ -871,6 +878,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
3499  	/* Emit the output into the temporary buffer */
3500  	printed_len += vscnprintf(printk_buf + printed_len,
3501  				  sizeof(printk_buf) - printed_len, fmt, args);
3502 +	/* if LITMUS^RT tracer is active divert printk() msgs */
3503 +	if (trace_override && !trace_recurse)
3504 +		TRACE("%s", printk_buf);
3505  
3506  	p = printk_buf;
3507  
3508 @@ -947,7 +957,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
3509  	 * Try to acquire and then immediately release the
3510  	 * console semaphore. The release will do all the
3511  	 * actual magic (print out buffers, wake up klogd,
3512 -	 * etc). 
3513 +	 * etc).
3514  	 *
3515  	 * The console_trylock_for_printk() function
3516  	 * will release 'logbuf_lock' regardless of whether it
3517 @@ -1220,7 +1230,7 @@ int printk_needs_cpu(int cpu)
3518  
3519  void wake_up_klogd(void)
3520  {
3521 -	if (waitqueue_active(&log_wait))
3522 +	if (!trace_override && waitqueue_active(&log_wait))
3523  		this_cpu_write(printk_pending, 1);
3524  }
3525  
3526 diff --git a/kernel/sched.c b/kernel/sched.c
3527 index fde6ff9..baaca61 100644
3528 --- a/kernel/sched.c
3529 +++ b/kernel/sched.c
3530 @@ -80,6 +80,11 @@
3531  #include "workqueue_sched.h"
3532  #include "sched_autogroup.h"
3533  
3534 +#include <litmus/sched_trace.h>
3535 +#include <litmus/trace.h>
3536 +
3537 +static void litmus_tick(struct rq*, struct task_struct*);
3538 +
3539  #define CREATE_TRACE_POINTS
3540  #include <trace/events/sched.h>
3541  
3542 @@ -410,6 +415,12 @@ struct rt_rq {
3543  #endif
3544  };
3545  
3546 +/* Litmus related fields in a runqueue */
3547 +struct litmus_rq {
3548 +	unsigned long nr_running;
3549 +	struct task_struct *prev;
3550 +};
3551 +
3552  #ifdef CONFIG_SMP
3553  
3554  /*
3555 @@ -475,6 +486,7 @@ struct rq {
3556  
3557  	struct cfs_rq cfs;
3558  	struct rt_rq rt;
3559 +	struct litmus_rq litmus;
3560  
3561  #ifdef CONFIG_FAIR_GROUP_SCHED
3562  	/* list of leaf cfs_rq on this cpu: */
3563 @@ -1045,6 +1057,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
3564  	raw_spin_lock(&rq->lock);
3565  	update_rq_clock(rq);
3566  	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
3567 +	litmus_tick(rq, rq->curr);
3568  	raw_spin_unlock(&rq->lock);
3569  
3570  	return HRTIMER_NORESTART;
3571 @@ -1773,7 +1786,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
3572  
3573  static const struct sched_class rt_sched_class;
3574  
3575 -#define sched_class_highest (&stop_sched_class)
3576 +#define sched_class_highest (&litmus_sched_class)
3577  #define for_each_class(class) \
3578     for (class = sched_class_highest; class; class = class->next)
3579  
3580 @@ -2031,6 +2044,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
3581  #include "sched_rt.c"
3582  #include "sched_autogroup.c"
3583  #include "sched_stoptask.c"
3584 +#include "../litmus/sched_litmus.c"
3585  #ifdef CONFIG_SCHED_DEBUG
3586  # include "sched_debug.c"
3587  #endif
3588 @@ -2153,6 +2167,10 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
3589  	 * A queue event has occurred, and we're going to schedule.  In
3590  	 * this case, we can save a useless back to back clock update.
3591  	 */
3592 +	/* LITMUS^RT:
3593 +	 * The "disable-clock-update" approach was buggy in Linux 2.6.36.
3594 +	 * The issue has been solved in 2.6.37.
3595 +	 */
3596  	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
3597  		rq->skip_clock_update = 1;
3598  }
3599 @@ -2643,7 +2661,12 @@ static void ttwu_queue(struct task_struct *p, int cpu)
3600  	struct rq *rq = cpu_rq(cpu);
3601  
3602  #if defined(CONFIG_SMP)
3603 -	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
3604 +	/*
3605 +	 * LITMUS^RT: whether to send an IPI to the remote CPU
3606 +	 * is plugin specific.
3607 +	 */
3608 +	if (!is_realtime(p) &&
3609 +			sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
3610  		sched_clock_cpu(cpu); /* sync clocks x-cpu */
3611  		ttwu_queue_remote(p, cpu);
3612  		return;
3613 @@ -2676,6 +2699,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3614  	unsigned long flags;
3615  	int cpu, success = 0;
3616  
3617 +	if (is_realtime(p))
3618 +		TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
3619 +
3620  	smp_wmb();
3621  	raw_spin_lock_irqsave(&p->pi_lock, flags);
3622  	if (!(p->state & state))
3623 @@ -2712,6 +2738,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3624  	 */
3625  	smp_rmb();
3626  
3627 +	/* LITMUS^RT: once the task can be safely referenced by this
3628 +	 * CPU, don't mess with the Linux load-balancing code.
3629 +	 */
3630 +	if (is_realtime(p))
3631 +		goto litmus_out_activate;
3632 +
3633  	p->sched_contributes_to_load = !!task_contributes_to_load(p);
3634  	p->state = TASK_WAKING;
3635  
3636 @@ -2723,12 +2755,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
3637  		wake_flags |= WF_MIGRATED;
3638  		set_task_cpu(p, cpu);
3639  	}
3640 +
3641 +litmus_out_activate:
3642  #endif /* CONFIG_SMP */
3643  
3644  	ttwu_queue(p, cpu);
3645  stat:
3646  	ttwu_stat(p, cpu, wake_flags);
3647  out:
3648 +	if (is_realtime(p))
3649 +		TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
3650  	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3651  
3652  	return success;
3653 @@ -2839,7 +2875,8 @@ void sched_fork(struct task_struct *p)
3654  	 * Revert to default priority/policy on fork if requested.
3655  	 */
3656  	if (unlikely(p->sched_reset_on_fork)) {
3657 -		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
3658 +		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR ||
3659 +		    p->policy == SCHED_LITMUS) {
3660  			p->policy = SCHED_NORMAL;
3661  			p->normal_prio = p->static_prio;
3662  		}
3663 @@ -3050,6 +3087,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3664  	 */
3665  	prev_state = prev->state;
3666  	finish_arch_switch(prev);
3667 +	litmus->finish_switch(prev);
3668 +	prev->rt_param.stack_in_use = NO_CPU;
3669  #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3670  	local_irq_disable();
3671  #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3672 @@ -3079,6 +3118,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3673  {
3674  	if (prev->sched_class->pre_schedule)
3675  		prev->sched_class->pre_schedule(rq, prev);
3676 +
3677 +	/* LITMUS^RT: not a very clean hack. We need to save the prev task
3678 +	 * as our scheduling decisions rely on it (as we drop the rq lock,
3679 +	 * something in prev can change...); there is no way to escape
3680 +	 * this hack apart from modifying pick_next_task(rq, _prev_) or
3681 +	 * falling back on the previous solution of decoupling
3682 +	 * scheduling decisions.
3683 +	 */
3684 +	rq->litmus.prev = prev;
3685  }
3686  
3687  /* rq->lock is NOT held, but preemption is disabled */
3688 @@ -4094,18 +4142,26 @@ void scheduler_tick(void)
3689  
3690  	sched_clock_tick();
3691  
3692 +	TS_TICK_START(current);
3693 +
3694  	raw_spin_lock(&rq->lock);
3695  	update_rq_clock(rq);
3696  	update_cpu_load_active(rq);
3697  	curr->sched_class->task_tick(rq, curr, 0);
3698 +
3699 +	/* litmus_tick may force current to resched */
3700 +	litmus_tick(rq, curr);
3701 +
3702  	raw_spin_unlock(&rq->lock);
3703  
3704  	perf_event_task_tick();
3705  
3706  #ifdef CONFIG_SMP
3707  	rq->idle_at_tick = idle_cpu(cpu);
3708 -	trigger_load_balance(rq, cpu);
3709 +	if (!is_realtime(current))
3710 +		trigger_load_balance(rq, cpu);
3711  #endif
3712 +	TS_TICK_END(current);
3713  }
3714  
3715  notrace unsigned long get_parent_ip(unsigned long addr)
3716 @@ -4225,12 +4281,20 @@ pick_next_task(struct rq *rq)
3717  	/*
3718  	 * Optimization: we know that if all tasks are in
3719  	 * the fair class we can call that function directly:
3720 -	 */
3721 -	if (likely(rq->nr_running == rq->cfs.nr_running)) {
3722 +
3723 +	 * NOT IN LITMUS^RT!
3724 +
3725 +	 * This breaks many assumptions in the plugins.
3726 +	 * Do not uncomment without thinking long and hard
3727 +	 * about how this affects global plugins such as GSN-EDF.
3728 +
3729 +	if (rq->nr_running == rq->cfs.nr_running) {
3730 +		TRACE("taking shortcut in pick_next_task()\n");
3731  		p = fair_sched_class.pick_next_task(rq);
3732  		if (likely(p))
3733  			return p;
3734  	}
3735 +	*/
3736  
3737  	for_each_class(class) {
3738  		p = class->pick_next_task(rq);
3739 @@ -4253,11 +4317,19 @@ asmlinkage void __sched schedule(void)
3740  
3741  need_resched:
3742  	preempt_disable();
3743 +	sched_state_entered_schedule();
3744  	cpu = smp_processor_id();
3745  	rq = cpu_rq(cpu);
3746  	rcu_note_context_switch(cpu);
3747  	prev = rq->curr;
3748  
3749 +	/* LITMUS^RT: quickly re-evaluate the scheduling decision
3750 +	 * if the previous one is no longer valid after CTX.
3751 +	 */
3752 +litmus_need_resched_nonpreemptible:
3753 +	TS_SCHED_START;
3754 +	sched_trace_task_switch_away(prev);
3755 +
3756  	schedule_debug(prev);
3757  
3758  	if (sched_feat(HRTICK))
3759 @@ -4314,7 +4386,10 @@ need_resched:
3760  		rq->curr = next;
3761  		++*switch_count;
3762  
3763 +		TS_SCHED_END(next);
3764 +		TS_CXS_START(next);
3765  		context_switch(rq, prev, next); /* unlocks the rq */
3766 +		TS_CXS_END(current);
3767  		/*
3768  		 * The context switch have flipped the stack from under us
3769  		 * and restored the local variables which were saved when
3770 @@ -4323,14 +4398,23 @@ need_resched:
3771  		 */
3772  		cpu = smp_processor_id();
3773  		rq = cpu_rq(cpu);
3774 -	} else
3775 +	} else {
3776 +		TS_SCHED_END(prev);
3777  		raw_spin_unlock_irq(&rq->lock);
3778 +	}
3779 +
3780 +	sched_trace_task_switch_to(current);
3781  
3782  	post_schedule(rq);
3783  
3784 +	if (sched_state_validate_switch())
3785 +		goto litmus_need_resched_nonpreemptible;
3786 +
3787  	preempt_enable_no_resched();
3788  	if (need_resched())
3789  		goto need_resched;
3790 +
3791 +	srp_ceiling_block();
3792  }
3793  EXPORT_SYMBOL(schedule);
3794  
3795 @@ -4600,6 +4684,17 @@ void complete_all(struct completion *x)
3796  }
3797  EXPORT_SYMBOL(complete_all);
3798  
3799 +void complete_n(struct completion *x, int n)
3800 +{
3801 +	unsigned long flags;
3802 +
3803 +	spin_lock_irqsave(&x->wait.lock, flags);
3804 +	x->done += n;
3805 +	__wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL);
3806 +	spin_unlock_irqrestore(&x->wait.lock, flags);
3807 +}
3808 +EXPORT_SYMBOL(complete_n);
3809 +
3810  static inline long __sched
3811  do_wait_for_common(struct completion *x, long timeout, int state)
3812  {
3813 @@ -5039,7 +5134,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3814  	p->normal_prio = normal_prio(p);
3815  	/* we are holding p->pi_lock already */
3816  	p->prio = rt_mutex_getprio(p);
3817 -	if (rt_prio(p->prio))
3818 +	if (p->policy == SCHED_LITMUS)
3819 +		p->sched_class = &litmus_sched_class;
3820 +	else if (rt_prio(p->prio))
3821  		p->sched_class = &rt_sched_class;
3822  	else
3823  		p->sched_class = &fair_sched_class;
3824 @@ -5087,7 +5184,7 @@ recheck:
3825  
3826  		if (policy != SCHED_FIFO && policy != SCHED_RR &&
3827  				policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3828 -				policy != SCHED_IDLE)
3829 +				policy != SCHED_IDLE && policy != SCHED_LITMUS)
3830  			return -EINVAL;
3831  	}
3832  
3833 @@ -5102,6 +5199,8 @@ recheck:
3834  		return -EINVAL;
3835  	if (rt_policy(policy) != (param->sched_priority != 0))
3836  		return -EINVAL;
3837 +	if (policy == SCHED_LITMUS && policy == p->policy)
3838 +		return -EINVAL;
3839  
3840  	/*
3841  	 * Allow unprivileged RT tasks to decrease priority:
3842 @@ -5145,6 +5244,12 @@ recheck:
3843  			return retval;
3844  	}
3845  
3846 +	if (policy == SCHED_LITMUS) {
3847 +		retval = litmus_admit_task(p);
3848 +		if (retval)
3849 +			return retval;
3850 +	}
3851 +
3852  	/*
3853  	 * make sure no PI-waiters arrive (or leave) while we are
3854  	 * changing the priority of the task:
3855 @@ -5203,10 +5308,19 @@ recheck:
3856  
3857  	p->sched_reset_on_fork = reset_on_fork;
3858  
3859 +	if (p->policy == SCHED_LITMUS)
3860 +		litmus_exit_task(p);
3861 +
3862  	oldprio = p->prio;
3863  	prev_class = p->sched_class;
3864  	__setscheduler(rq, p, policy, param->sched_priority);
3865  
3866 +	if (policy == SCHED_LITMUS) {
3867 +		p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
3868 +		p->rt_param.present = running;
3869 +		litmus->task_new(p, on_rq, running);
3870 +	}
3871 +
3872  	if (running)
3873  		p->sched_class->set_curr_task(rq);
3874  	if (on_rq)
3875 @@ -5374,10 +5488,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3876  	rcu_read_lock();
3877  
3878  	p = find_process_by_pid(pid);
3879 -	if (!p) {
3880 +	/* Don't set affinity if the task was not found or is a LITMUS^RT task */
3881 +	if (!p || is_realtime(p)) {
3882  		rcu_read_unlock();
3883  		put_online_cpus();
3884 -		return -ESRCH;
3885 +		return p ? -EPERM : -ESRCH;
3886  	}
3887  
3888  	/* Prevent p going away */
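
The complete_n() helper added above extends the completion API so that exactly n waiters can be woken by a single call, which is what a synchronous task-set release needs: every task waiting for the release must be woken at the same time instant. A minimal usage sketch follows (not part of the patch; the in-tree user is presumably litmus/sync.c, which backs the wait_for_ts_release()/release_ts() system calls declared earlier):

	#include <linux/completion.h>

	/* Hypothetical completion used only for this example. */
	static DECLARE_COMPLETION(ts_release);

	/* Each participating task blocks here until the release is triggered. */
	static void example_wait_for_release(void)
	{
		wait_for_completion(&ts_release);
	}

	/* The releasing context wakes all n registered waiters in one shot. */
	static void example_release_all(int n_waiting_tasks)
	{
		complete_n(&ts_release, n_waiting_tasks);
	}
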
3889 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
3890 index c768588..334eb47 100644
3891 --- a/kernel/sched_fair.c
3892 +++ b/kernel/sched_fair.c
3893 @@ -1890,6 +1890,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
3894  	int scale = cfs_rq->nr_running >= sched_nr_latency;
3895  	int next_buddy_marked = 0;
3896  
3897 +	if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
3898 +		goto preempt;
3899 +
3900  	if (unlikely(se == pse))
3901  		return;
3902  
3903 diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
3904 index 10d0182..58cf5d1 100644
3905 --- a/kernel/sched_rt.c
3906 +++ b/kernel/sched_rt.c
3907 @@ -1078,7 +1078,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
3908   */
3909  static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
3910  {
3911 -	if (p->prio < rq->curr->prio) {
3912 +	if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
3913  		resched_task(rq->curr);
3914  		return;
3915  	}
3916 diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
3917 index d5097c4..0c0e02f 100644
3918 --- a/kernel/time/tick-sched.c
3919 +++ b/kernel/time/tick-sched.c
3920 @@ -766,12 +766,53 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
3921  }
3922  
3923  /**
3924 + * tick_set_quanta_type - parse the quanta type boot option
3925 + * The default is the standard setup with ticks staggered over the first
3926 + * half of the tick period.
3927 + */
3928 +int quanta_type = LINUX_DEFAULT_TICKS;
3929 +static int __init tick_set_quanta_type(char *str)
3930 +{
3931 +	if (strcmp("aligned", str) == 0) {
3932 +		quanta_type = LITMUS_ALIGNED_TICKS;
3933 +		printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n");
3934 +	}
3935 +	else if (strcmp("staggered", str) == 0) {
3936 +		quanta_type = LITMUS_STAGGERED_TICKS;
3937 +		printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n");
3938 +	}
3939 +	return 1;
3940 +}
3941 +__setup("quanta=", tick_set_quanta_type);
3942 +
3943 +u64 cpu_stagger_offset(int cpu)
3944 +{
3945 +	u64 offset = 0;
3946 +	switch (quanta_type) {
3947 +		case LITMUS_ALIGNED_TICKS:
3948 +			offset = 0;
3949 +			break;
3950 +		case LITMUS_STAGGERED_TICKS:
3951 +			offset = ktime_to_ns(tick_period);
3952 +			do_div(offset, num_possible_cpus());
3953 +			offset *= cpu;
3954 +			break;
3955 +		default:
3956 +			offset = ktime_to_ns(tick_period) >> 1;
3957 +			do_div(offset, num_possible_cpus());
3958 +			offset *= cpu;
3959 +	}
3960 +	return offset;
3961 +}
3962 +
3963 +/**
3964   * tick_setup_sched_timer - setup the tick emulation timer
3965   */
3966  void tick_setup_sched_timer(void)
3967  {
3968  	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
3969  	ktime_t now = ktime_get();
3970 +	u64 offset;
3971  
3972  	/*
3973  	 * Emulate tick processing via per-CPU hrtimers:
3974 @@ -782,6 +823,12 @@ void tick_setup_sched_timer(void)
3975  	/* Get the next period (per cpu) */
3976  	hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
3977  
3978 +	/* Offset must be set correctly to achieve desired quanta type. */
3979 +	offset = cpu_stagger_offset(smp_processor_id());
3980 +
3981 +	/* Add the correct offset to expiration time */
3982 +	hrtimer_add_expires_ns(&ts->sched_timer, offset);
3983 +
3984  	for (;;) {
3985  		hrtimer_forward(&ts->sched_timer, now, tick_period);
3986  		hrtimer_start_expires(&ts->sched_timer,
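
As a worked example of cpu_stagger_offset(): with HZ=1000 (a 1 ms tick_period) on a machine with four possible CPUs, booting with quanta=aligned gives every CPU an offset of 0, quanta=staggered spreads the tick offsets over the full period (0 us, 250 us, 500 us and 750 us for CPUs 0-3), and the default spreads them over the first half of the period (0 us, 125 us, 250 us and 375 us). The option is passed on the kernel command line, e.g. quanta=staggered.
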
3987 diff --git a/litmus/Kconfig b/litmus/Kconfig
3988 new file mode 100644
3989 index 0000000..94b48e1
3990 --- /dev/null
3991 +++ b/litmus/Kconfig
3992 @@ -0,0 +1,218 @@
3993 +menu "LITMUS^RT"
3994 +
3995 +menu "Scheduling"
3996 +
3997 +config PLUGIN_CEDF
3998 +        bool "Clustered-EDF"
3999 +	depends on X86 && SYSFS
4000 +        default y
4001 +        help
4002 +          Include the Clustered EDF (C-EDF) plugin in the kernel.
4003 +          This is appropriate for large platforms with shared caches.
4004 +          On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
4005 +          makes little sense since there aren't any shared caches.
4006 +
4007 +config PLUGIN_PFAIR
4008 +	bool "PFAIR"
4009 +	depends on HIGH_RES_TIMERS && !NO_HZ
4010 +	default y
4011 +	help
4012 +	  Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
4013 +	  The PFAIR plugin requires high resolution timers (for staggered quanta)
4014 +	  and does not support NO_HZ (quanta could be missed when the system is idle).
4015 +
4016 +	  If unsure, say Yes.
4017 +
4018 +config RELEASE_MASTER
4019 +        bool "Release-master Support"
4020 +	depends on ARCH_HAS_SEND_PULL_TIMERS
4021 +	default n
4022 +	help
4023 +           Allow one processor to act as a dedicated interrupt processor
4024 +           that services all timer interrupts, but that does not schedule
4025 +           real-time tasks. See RTSS'09 paper for details
4026 +	   (http://www.cs.unc.edu/~anderson/papers.html).
4027 +           Currently only supported by GSN-EDF.
4028 +
4029 +endmenu
4030 +
4031 +menu "Real-Time Synchronization"
4032 +
4033 +config NP_SECTION
4034 +        bool "Non-preemptive section support"
4035 +	default n
4036 +	help
4037 +	  Allow tasks to become non-preemptable.
4038 +          Note that plugins still need to explicitly support non-preemptivity.
4039 +          Currently, only GSN-EDF and PSN-EDF have such support.
4040 +
4041 +	  This is required to support locking protocols such as the FMLP.
4042 +	  If disabled, all tasks will be considered preemptable at all times.
4043 +
4044 +config LITMUS_LOCKING
4045 +        bool "Support for real-time locking protocols"
4046 +	depends on NP_SECTION
4047 +	default n
4048 +	help
4049 +	  Enable LITMUS^RT's deterministic multiprocessor real-time
4050 +	  locking protocols.
4051 +
4052 +	  Say Yes if you want to include locking protocols such as the FMLP and
4053 +	  Baker's SRP.
4054 +
4055 +endmenu
4056 +
4057 +menu "Performance Enhancements"
4058 +
4059 +config SCHED_CPU_AFFINITY
4060 +	bool "Local Migration Affinity"
4061 +	depends on X86
4062 +	default y
4063 +	help
4064 +	  Rescheduled tasks prefer CPUs near to their previously used CPU.  This
4065 +	  may improve performance through possible preservation of cache affinity.
4066 +
4067 +	  Warning: May make bugs harder to find since tasks may migrate less often.
4068 +
4069 +	  NOTES:
4070 +	  	* Feature is not utilized by PFair/PD^2.
4071 +
4072 +	  Say Yes if unsure.
4073 +
4074 +endmenu
4075 +
4076 +menu "Tracing"
4077 +
4078 +config FEATHER_TRACE
4079 +	bool "Feather-Trace Infrastructure"
4080 +	default y
4081 +	help
4082 +	  Feather-Trace basic tracing infrastructure. Includes device file
4083 +	  driver and instrumentation point support.
4084 +
4085 +	  There are actually two implementations of Feather-Trace.
4086 +	  1) A slower, but portable, default implementation.
4087 +	  2) Architecture-specific implementations that rewrite kernel .text at runtime.
4088 +
4089 +	  If enabled, Feather-Trace will be based on 2) if available (currently only for x86).
4090 +	  However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case
4091 +	  to avoid problems with write-protected .text pages.
4092 +
4093 +	  Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n.
4094 +
4095 +	  Note that this option only enables the basic Feather-Trace infrastructure;
4096 +	  you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
4097 +	  actually enable any events.
4098 +
4099 +config SCHED_TASK_TRACE
4100 +	bool "Trace real-time tasks"
4101 +	depends on FEATHER_TRACE
4102 +	default y
4103 +	help
4104 +	  Include support for the sched_trace_XXX() tracing functions. This
4105 +          allows the collection of real-time task events such as job
4106 +	  completions, job releases, early completions, etc. This results in a
4107 +	  small overhead in the scheduling code. Disable if the overhead is not
4108 +	  acceptable (e.g., benchmarking).
4109 +
4110 +	  Say Yes for debugging.
4111 +	  Say No for overhead tracing.
4112 +
4113 +config SCHED_TASK_TRACE_SHIFT
4114 +       int "Buffer size for sched_trace_xxx() events"
4115 +       depends on SCHED_TASK_TRACE
4116 +       range 8 13
4117 +       default 9
4118 +       help
4119 +
4120 +         Select the buffer size of sched_trace_xxx() events as a power of two.
4121 +	 These buffers are statically allocated as per-CPU data. Each event
4122 +	 requires 24 bytes storage plus one additional flag byte. Too large
4123 +	 buffers can cause issues with the per-cpu allocator (and waste
4124 +	 memory). Too small buffers can cause scheduling events to be lost. The
4125 +	 "right" size is workload dependent and depends on the number of tasks,
4126 +	 each task's period, each task's number of suspensions, and how often
4127 +	 the buffer is flushed.
4128 +
4129 +	 Examples: 12 =>   4k events
4130 +		   10 =>   1k events
4131 +		    8 =>  512 events
4132 +
4133 +config SCHED_OVERHEAD_TRACE
4134 +	bool "Record timestamps for overhead measurements"
4135 +	depends on FEATHER_TRACE
4136 +	default n
4137 +	help
4138 +	  Export event stream for overhead tracing.
4139 +	  Say Yes for overhead tracing.
4140 +
4141 +config SCHED_DEBUG_TRACE
4142 +	bool "TRACE() debugging"
4143 +	default y
4144 +	help
4145 +	  Include support for sched_trace_log_message(), which is used to
4146 +	  implement TRACE(). If disabled, no TRACE() messages will be included
4147 +	  in the kernel, and no overheads due to debugging statements will be
4148 +	  incurred by the scheduler. Disable if the overhead is not acceptable
4149 +	  (e.g. benchmarking).
4150 +
4151 +	  Say Yes for debugging.
4152 +	  Say No for overhead tracing.
4153 +
4154 +config SCHED_DEBUG_TRACE_SHIFT
4155 +       int "Buffer size for TRACE() buffer"
4156 +       depends on SCHED_DEBUG_TRACE
4157 +       range 14 22
4158 +       default 18
4159 +       help
4160 +
4161 +	Select the amount of memory needed for the TRACE() buffer, as a
4162 +	power of two. The TRACE() buffer is global and statically allocated. If
4163 +	the buffer is too small, there will be holes in the TRACE() log if the
4164 +	buffer-flushing task is starved.
4165 +
4166 +	The default should be sufficient for most systems. Increase the buffer
4167 +	size if the log contains holes. Reduce the buffer size when running on
4168 +	a memory-constrained system.
4169 +
4170 +	Examples: 14 =>  16KB
4171 +		  18 => 256KB
4172 +		  20 =>   1MB
4173 +
4174 +        This buffer is exported to userspace using a misc device as
4175 +        'litmus/log'. On a system with default udev rules, a corresponding
4176 +        character device node should be created at /dev/litmus/log. The buffer
4177 +        can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'.
4178 +
4179 +config SCHED_DEBUG_TRACE_CALLER
4180 +       bool "Include [function@file:line] tag in TRACE() log"
4181 +       depends on SCHED_DEBUG_TRACE
4182 +       default n
4183 +       help
4184 +         With this option enabled, TRACE() prepends
4185 +
4186 +	      "[<function name>@<filename>:<line number>]"
4187 +
4188 +	 to each message in the debug log. Enable this to aid in figuring out
4189 +         what was called in which order. The downside is that it adds a lot of
4190 +         clutter.
4191 +
4192 +	 If unsure, say No.
4193 +
4194 +config PREEMPT_STATE_TRACE
4195 +       bool "Trace preemption state machine transitions"
4196 +       depends on SCHED_DEBUG_TRACE
4197 +       default n
4198 +       help
4199 +         With this option enabled, each CPU will log when it transitions
4200 +	 states in the preemption state machine. This state machine is
4201 +	 used to determine how to react to IPIs (avoid races with in-flight IPIs).
4202 +
4203 +	 Warning: this creates a lot of information in the debug trace. Only
4204 +	 recommended when you are debugging preemption-related races.
4205 +
4206 +	 If unsure, say No.
4207 +
4208 +endmenu
4209 +
4210 +endmenu
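
For illustration, a configuration aimed at overhead measurements (rather than debugging) might end up with a .config fragment like the following, in line with the recommendations in the help texts above; the exact selection is only an example:

	CONFIG_PLUGIN_CEDF=y
	CONFIG_PLUGIN_PFAIR=y
	CONFIG_FEATHER_TRACE=y
	CONFIG_SCHED_OVERHEAD_TRACE=y
	# CONFIG_SCHED_TASK_TRACE is not set
	# CONFIG_SCHED_DEBUG_TRACE is not set
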
4211 diff --git a/litmus/Makefile b/litmus/Makefile
4212 new file mode 100644
4213 index 0000000..7338180
4214 --- /dev/null
4215 +++ b/litmus/Makefile
4216 @@ -0,0 +1,29 @@
4217 +#
4218 +# Makefile for LITMUS^RT
4219 +#
4220 +
4221 +obj-y     = sched_plugin.o litmus.o \
4222 +	    preempt.o \
4223 +	    litmus_proc.o \
4224 +	    budget.o \
4225 +	    clustered.o \
4226 +	    jobs.o \
4227 +	    sync.o \
4228 +	    rt_domain.o \
4229 +	    edf_common.o \
4230 +	    fdso.o \
4231 +	    locking.o \
4232 +	    srp.o \
4233 +	    bheap.o \
4234 +	    ctrldev.o \
4235 +	    sched_gsn_edf.o \
4236 +	    sched_psn_edf.o
4237 +
4238 +obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
4239 +obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
4240 +obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
4241 +
4242 +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
4243 +obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
4244 +obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
4245 +obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
4246 diff --git a/litmus/affinity.c b/litmus/affinity.c
4247 new file mode 100644
4248 index 0000000..3fa6dd7
4249 --- /dev/null
4250 +++ b/litmus/affinity.c
4251 @@ -0,0 +1,42 @@
4252 +#include <linux/cpu.h>
4253 +
4254 +#include <litmus/affinity.h>
4255 +
4256 +struct neighborhood neigh_info[NR_CPUS];
4257 +
4258 +/* called by _init_litmus() */
4259 +void init_topology(void) {
4260 +	int cpu;
4261 +	int i;
4262 +	int chk;
4263 +	int depth = num_cache_leaves;
4264 +
4265 +	if (depth > NUM_CACHE_LEVELS)
4266 +		depth = NUM_CACHE_LEVELS;
4267 +
4268 +	for_each_online_cpu(cpu) {
4269 +		for (i = 0; i < depth; ++i) {
4270 +			chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
4271 +			if (chk) {
4272 +				/* failed */
4273 +				neigh_info[cpu].size[i] = 0;
4274 +			} else {
4275 +				/* size = num bits in mask */
4276 +				neigh_info[cpu].size[i] =
4277 +					cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
4278 +			}
4279 +			printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
4280 +							cpu, neigh_info[cpu].size[i], i, 
4281 +							*cpumask_bits(neigh_info[cpu].neighbors[i]));
4282 +		}
4283 +
4284 +		/* set data for non-existent levels */
4285 +		for (; i < NUM_CACHE_LEVELS; ++i) {
4286 +			neigh_info[cpu].size[i] = 0;
4287 +
4288 +			printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
4289 +						cpu, neigh_info[cpu].size[i], i, 0lu);
4290 +		}
4291 +	}
4292 +}
4293 +
4294 diff --git a/litmus/bheap.c b/litmus/bheap.c
4295 new file mode 100644
4296 index 0000000..528af97
4297 --- /dev/null
4298 +++ b/litmus/bheap.c
4299 @@ -0,0 +1,314 @@
4300 +#include "linux/kernel.h"
4301 +#include "litmus/bheap.h"
4302 +
4303 +void bheap_init(struct bheap* heap)
4304 +{
4305 +	heap->head = NULL;
4306 +	heap->min  = NULL;
4307 +}
4308 +
4309 +void bheap_node_init(struct bheap_node** _h, void* value)
4310 +{
4311 +	struct bheap_node* h = *_h;
4312 +	h->parent = NULL;
4313 +	h->next   = NULL;
4314 +	h->child  = NULL;
4315 +	h->degree = NOT_IN_HEAP;
4316 +	h->value  = value;
4317 +	h->ref    = _h;
4318 +}
4319 +
4320 +
4321 +/* make child a subtree of root */
4322 +static void __bheap_link(struct bheap_node* root,
4323 +			struct bheap_node* child)
4324 +{
4325 +	child->parent = root;
4326 +	child->next   = root->child;
4327 +	root->child   = child;
4328 +	root->degree++;
4329 +}
4330 +
4331 +/* merge root lists */
4332 +static  struct bheap_node* __bheap_merge(struct bheap_node* a,
4333 +					     struct bheap_node* b)
4334 +{
4335 +	struct bheap_node* head = NULL;
4336 +	struct bheap_node** pos = &head;
4337 +
4338 +	while (a && b) {
4339 +		if (a->degree < b->degree) {
4340 +			*pos = a;
4341 +			a = a->next;
4342 +		} else {
4343 +			*pos = b;
4344 +			b = b->next;
4345 +		}
4346 +		pos = &(*pos)->next;
4347 +	}
4348 +	if (a)
4349 +		*pos = a;
4350 +	else
4351 +		*pos = b;
4352 +	return head;
4353 +}
4354 +
4355 +/* reverse a linked list of nodes. also clears parent pointer */
4356 +static  struct bheap_node* __bheap_reverse(struct bheap_node* h)
4357 +{
4358 +	struct bheap_node* tail = NULL;
4359 +	struct bheap_node* next;
4360 +
4361 +	if (!h)
4362 +		return h;
4363 +
4364 +	h->parent = NULL;
4365 +	while (h->next) {
4366 +		next    = h->next;
4367 +		h->next = tail;
4368 +		tail    = h;
4369 +		h       = next;
4370 +		h->parent = NULL;
4371 +	}
4372 +	h->next = tail;
4373 +	return h;
4374 +}
4375 +
4376 +static  void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
4377 +			      struct bheap_node** prev, struct bheap_node** node)
4378 +{
4379 +	struct bheap_node *_prev, *cur;
4380 +	*prev = NULL;
4381 +
4382 +	if (!heap->head) {
4383 +		*node = NULL;
4384 +		return;
4385 +	}
4386 +
4387 +	*node = heap->head;
4388 +	_prev = heap->head;
4389 +	cur   = heap->head->next;
4390 +	while (cur) {
4391 +		if (higher_prio(cur, *node)) {
4392 +			*node = cur;
4393 +			*prev = _prev;
4394 +		}
4395 +		_prev = cur;
4396 +		cur   = cur->next;
4397 +	}
4398 +}
4399 +
4400 +static  void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
4401 +				struct bheap_node* h2)
4402 +{
4403 +	struct bheap_node* h1;
4404 +	struct bheap_node *prev, *x, *next;
4405 +	if (!h2)
4406 +		return;
4407 +	h1 = heap->head;
4408 +	if (!h1) {
4409 +		heap->head = h2;
4410 +		return;
4411 +	}
4412 +	h1 = __bheap_merge(h1, h2);
4413 +	prev = NULL;
4414 +	x    = h1;
4415 +	next = x->next;
4416 +	while (next) {
4417 +		if (x->degree != next->degree ||
4418 +		    (next->next && next->next->degree == x->degree)) {
4419 +			/* nothing to do, advance */
4420 +			prev = x;
4421 +			x    = next;
4422 +		} else if (higher_prio(x, next)) {
4423 +			/* x becomes the root of next */
4424 +			x->next = next->next;
4425 +			__bheap_link(x, next);
4426 +		} else {
4427 +			/* next becomes the root of x */
4428 +			if (prev)
4429 +				prev->next = next;
4430 +			else
4431 +				h1 = next;
4432 +			__bheap_link(next, x);
4433 +			x = next;
4434 +		}
4435 +		next = x->next;
4436 +	}
4437 +	heap->head = h1;
4438 +}
4439 +
4440 +static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
4441 +					    struct bheap* heap)
4442 +{
4443 +	struct bheap_node *prev, *node;
4444 +	__bheap_min(higher_prio, heap, &prev, &node);
4445 +	if (!node)
4446 +		return NULL;
4447 +	if (prev)
4448 +		prev->next = node->next;
4449 +	else
4450 +		heap->head = node->next;
4451 +	__bheap_union(higher_prio, heap, __bheap_reverse(node->child));
4452 +	return node;
4453 +}
4454 +
4455 +/* insert (and reinitialize) a node into the heap */
4456 +void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
4457 +		 struct bheap_node* node)
4458 +{
4459 +	struct bheap_node *min;
4460 +	node->child  = NULL;
4461 +	node->parent = NULL;
4462 +	node->next   = NULL;
4463 +	node->degree = 0;
4464 +	if (heap->min && higher_prio(node, heap->min)) {
4465 +		/* swap min cache */
4466 +		min = heap->min;
4467 +		min->child  = NULL;
4468 +		min->parent = NULL;
4469 +		min->next   = NULL;
4470 +		min->degree = 0;
4471 +		__bheap_union(higher_prio, heap, min);
4472 +		heap->min   = node;
4473 +	} else
4474 +		__bheap_union(higher_prio, heap, node);
4475 +}
4476 +
4477 +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
4478 +{
4479 +	struct bheap_node* min;
4480 +	if (heap->min) {
4481 +		min = heap->min;
4482 +		heap->min = NULL;
4483 +		bheap_insert(higher_prio, heap, min);
4484 +	}
4485 +}
4486 +
4487 +/* merge addition into target */
4488 +void bheap_union(bheap_prio_t higher_prio,
4489 +		struct bheap* target, struct bheap* addition)
4490 +{
4491 +	/* first insert any cached minima, if necessary */
4492 +	bheap_uncache_min(higher_prio, target);
4493 +	bheap_uncache_min(higher_prio, addition);
4494 +	__bheap_union(higher_prio, target, addition->head);
4495 +	/* this is a destructive merge */
4496 +	addition->head = NULL;
4497 +}
4498 +
4499 +struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
4500 +			    struct bheap* heap)
4501 +{
4502 +	if (!heap->min)
4503 +		heap->min = __bheap_extract_min(higher_prio, heap);
4504 +	return heap->min;
4505 +}
4506 +
4507 +struct bheap_node* bheap_take(bheap_prio_t higher_prio,
4508 +			    struct bheap* heap)
4509 +{
4510 +	struct bheap_node *node;
4511 +	if (!heap->min)
4512 +		heap->min = __bheap_extract_min(higher_prio, heap);
4513 +	node = heap->min;
4514 +	heap->min = NULL;
4515 +	if (node)
4516 +		node->degree = NOT_IN_HEAP;
4517 +	return node;
4518 +}
4519 +
4520 +int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
4521 +{
4522 +	struct bheap_node  *parent;
4523 +	struct bheap_node** tmp_ref;
4524 +	void* tmp;
4525 +
4526 +	/* bubble up */
4527 +	parent = node->parent;
4528 +	while (parent && higher_prio(node, parent)) {
4529 +		/* swap parent and node */
4530 +		tmp           = parent->value;
4531 +		parent->value = node->value;
4532 +		node->value   = tmp;
4533 +		/* swap references */
4534 +		*(parent->ref) = node;
4535 +		*(node->ref)   = parent;
4536 +		tmp_ref        = parent->ref;
4537 +		parent->ref    = node->ref;
4538 +		node->ref      = tmp_ref;
4539 +		/* step up */
4540 +		node   = parent;
4541 +		parent = node->parent;
4542 +	}
4543 +
4544 +	return parent != NULL;
4545 +}
4546 +
4547 +void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
4548 +		 struct bheap_node* node)
4549 +{
4550 +	struct bheap_node *parent, *prev, *pos;
4551 +	struct bheap_node** tmp_ref;
4552 +	void* tmp;
4553 +
4554 +	if (heap->min != node) {
4555 +		/* bubble up */
4556 +		parent = node->parent;
4557 +		while (parent) {
4558 +			/* swap parent and node */
4559 +			tmp           = parent->value;
4560 +			parent->value = node->value;
4561 +			node->value   = tmp;
4562 +			/* swap references */
4563 +			*(parent->ref) = node;
4564 +			*(node->ref)   = parent;
4565 +			tmp_ref        = parent->ref;
4566 +			parent->ref    = node->ref;
4567 +			node->ref      = tmp_ref;
4568 +			/* step up */
4569 +			node   = parent;
4570 +			parent = node->parent;
4571 +		}
4572 +		/* now delete:
4573 +		 * first find prev */
4574 +		prev = NULL;
4575 +		pos  = heap->head;
4576 +		while (pos != node) {
4577 +			prev = pos;
4578 +			pos  = pos->next;
4579 +		}
4580 +		/* we have prev, now remove node */
4581 +		if (prev)
4582 +			prev->next = node->next;
4583 +		else
4584 +			heap->head = node->next;
4585 +		__bheap_union(higher_prio, heap, __bheap_reverse(node->child));
4586 +	} else
4587 +		heap->min = NULL;
4588 +	node->degree = NOT_IN_HEAP;
4589 +}
4590 +
4591 +/* allocate a heap node for value and insert into the heap */
4592 +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
4593 +	     void* value, int gfp_flags)
4594 +{
4595 +	struct bheap_node* hn = bheap_node_alloc(gfp_flags);
4596 +	if (likely(hn)) {
4597 +		bheap_node_init(&hn, value);
4598 +		bheap_insert(higher_prio, heap, hn);
4599 +	}
4600 +	return hn != NULL;
4601 +}
4602 +
4603 +void* bheap_take_del(bheap_prio_t higher_prio,
4604 +		    struct bheap* heap)
4605 +{
4606 +	struct bheap_node* hn = bheap_take(higher_prio, heap);
4607 +	void* ret = NULL;
4608 +	if (hn) {
4609 +		ret = hn->value;
4610 +		bheap_node_free(hn);
4611 +	}
4612 +	return ret;
4613 +}
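
A minimal usage sketch (not part of the patch) of the binomial heap API above. Only functions defined in this file are called; struct bheap, struct bheap_node (whose value field is a void pointer), the bheap_prio_t comparator type and the node allocator used internally by bheap_add() are declared in include/litmus/bheap.h. The element type and comparator are made up for the example; in the scheduler the heap typically holds tasks ordered by edf_ready_order().

	#include <linux/gfp.h>
	#include <litmus/bheap.h>

	/* Hypothetical element type for the example. */
	struct example_item {
		int priority;	/* smaller value = higher priority */
	};

	/* bheap_prio_t comparator: non-zero if a has higher priority than b. */
	static int example_higher_prio(struct bheap_node *a, struct bheap_node *b)
	{
		struct example_item *ia = a->value;
		struct example_item *ib = b->value;
		return ia->priority < ib->priority;
	}

	static struct example_item *example_heap_use(struct example_item *x,
						     struct example_item *y)
	{
		struct bheap heap;

		bheap_init(&heap);
		bheap_add(example_higher_prio, &heap, x, GFP_ATOMIC);
		bheap_add(example_higher_prio, &heap, y, GFP_ATOMIC);

		/* Removes and returns the value pointer of the highest-priority node. */
		return bheap_take_del(example_higher_prio, &heap);
	}
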
4614 diff --git a/litmus/budget.c b/litmus/budget.c
4615 new file mode 100644
4616 index 0000000..310e9a3
4617 --- /dev/null
4618 +++ b/litmus/budget.c
4619 @@ -0,0 +1,111 @@
4620 +#include <linux/sched.h>
4621 +#include <linux/percpu.h>
4622 +#include <linux/hrtimer.h>
4623 +
4624 +#include <litmus/litmus.h>
4625 +#include <litmus/preempt.h>
4626 +
4627 +struct enforcement_timer {
4628 +	/* The enforcement timer is used to accurately police
4629 +	 * slice budgets. */
4630 +	struct hrtimer		timer;
4631 +	int			armed;
4632 +};
4633 +
4634 +DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
4635 +
4636 +static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
4637 +{
4638 +	struct enforcement_timer* et = container_of(timer,
4639 +						    struct enforcement_timer,
4640 +						    timer);
4641 +	unsigned long flags;
4642 +
4643 +	local_irq_save(flags);
4644 +	TRACE("enforcement timer fired.\n");
4645 +	et->armed = 0;
4646 +	/* activate scheduler */
4647 +	litmus_reschedule_local();
4648 +	local_irq_restore(flags);
4649 +
4650 +	return  HRTIMER_NORESTART;
4651 +}
4652 +
4653 +/* assumes called with IRQs off */
4654 +static void cancel_enforcement_timer(struct enforcement_timer* et)
4655 +{
4656 +	int ret;
4657 +
4658 +	TRACE("cancelling enforcement timer.\n");
4659 +
4660 +	/* Since interrupts are disabled and et->armed is only
4661 +	 * modified locally, we do not need any locks.
4662 +	 */
4663 +
4664 +	if (et->armed) {
4665 +		ret = hrtimer_try_to_cancel(&et->timer);
4666 +		/* Should never be inactive. */
4667 +		BUG_ON(ret == 0);
4668 +		/* Should never be running concurrently. */
4669 +		BUG_ON(ret == -1);
4670 +
4671 +		et->armed = 0;
4672 +	}
4673 +}
4674 +
4675 +/* assumes called with IRQs off */
4676 +static void arm_enforcement_timer(struct enforcement_timer* et,
4677 +				  struct task_struct* t)
4678 +{
4679 +	lt_t when_to_fire;
4680 +	TRACE_TASK(t, "arming enforcement timer.\n");
4681 +
4682 +	/* Calling this when there is no budget left for the task
4683 +	 * makes no sense, unless the task is non-preemptive. */
4684 +	BUG_ON(budget_exhausted(t) && (!is_np(t)));
4685 +
4686 +	/* __hrtimer_start_range_ns() cancels the timer
4687 +	 * anyway, so we don't have to check whether it is still armed */
4688 +
4689 +	if (likely(!is_np(t))) {
4690 +		when_to_fire = litmus_clock() + budget_remaining(t);
4691 +		__hrtimer_start_range_ns(&et->timer,
4692 +					 ns_to_ktime(when_to_fire),
4693 +					 0 /* delta */,
4694 +					 HRTIMER_MODE_ABS_PINNED,
4695 +					 0 /* no wakeup */);
4696 +		et->armed = 1;
4697 +	}
4698 +}
4699 +
4700 +
4701 +/* expects to be called with IRQs off */
4702 +void update_enforcement_timer(struct task_struct* t)
4703 +{
4704 +	struct enforcement_timer* et = &__get_cpu_var(budget_timer);
4705 +
4706 +	if (t && budget_precisely_enforced(t)) {
4707 +		/* Make sure we call into the scheduler when this budget
4708 +		 * expires. */
4709 +		arm_enforcement_timer(et, t);
4710 +	} else if (et->armed) {
4711 +		/* Make sure we don't cause unnecessary interrupts. */
4712 +		cancel_enforcement_timer(et);
4713 +	}
4714 +}
4715 +
4716 +
4717 +static int __init init_budget_enforcement(void)
4718 +{
4719 +	int cpu;
4720 +	struct enforcement_timer* et;
4721 +
4722 +	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
4723 +		et = &per_cpu(budget_timer, cpu);
4724 +		hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
4725 +		et->timer.function = on_enforcement_timeout;
4726 +	}
4727 +	return 0;
4728 +}
4729 +
4730 +module_init(init_budget_enforcement);
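
A sketch (not part of the patch) of the intended call pattern for update_enforcement_timer(): the scheduling path invokes it, with interrupts off, whenever a CPU switches to a different task, so that the per-CPU timer always reflects the budget of whatever runs next. The surrounding function below is hypothetical; the actual call site is presumably in the LITMUS^RT scheduling class (litmus/sched_litmus.c), which is part of this patch but not shown in this excerpt.

	#include <linux/sched.h>
	#include <litmus/budget.h>

	/* Hypothetical: called with the runqueue lock held and IRQs off,
	 * right after the plugin has picked the task to run next. */
	static void example_after_pick(struct task_struct *next)
	{
		/* Arms the enforcement timer if 'next' uses precise budget
		 * enforcement, or cancels a previously armed timer otherwise. */
		update_enforcement_timer(next);
	}
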
4731 diff --git a/litmus/clustered.c b/litmus/clustered.c
4732 new file mode 100644
4733 index 0000000..6fe1b51
4734 --- /dev/null
4735 +++ b/litmus/clustered.c
4736 @@ -0,0 +1,111 @@
4737 +#include <linux/gfp.h>
4738 +#include <linux/cpumask.h>
4739 +#include <linux/list.h>
4740 +
4741 +#include <litmus/clustered.h>
4742 +
4743 +#ifndef CONFIG_X86
4744 +/* fake get_shared_cpu_map() on non-x86 architectures */
4745 +
4746 +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
4747 +{
4748 +	if (index != 1)
4749 +		return 1;
4750 +	else {
4751 +		/* Fake L1: CPU is all by itself. */
4752 +		cpumask_clear(mask);
4753 +		cpumask_set_cpu(cpu, mask);
4754 +		return 0;
4755 +	}
4756 +}
4757 +
4758 +#endif
4759 +
4760 +int get_cluster_size(enum cache_level level)
4761 +{
4762 +	cpumask_var_t mask;
4763 +	int ok;
4764 +	int num_cpus;
4765 +
4766 +	if (level == GLOBAL_CLUSTER)
4767 +		return num_online_cpus();
4768 +	else {
4769 +		if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
4770 +			return -ENOMEM;
4771 +		/* assumes CPU 0 is representative of all CPUs */
4772 +		ok = get_shared_cpu_map(mask, 0, level);
4773 +		/* ok == 0 means we got the map; otherwise it's an invalid cache level */
4774 +		if (ok == 0)
4775 +			num_cpus = cpumask_weight(mask);
4776 +		free_cpumask_var(mask);
4777 +
4778 +		if (ok == 0)
4779 +			return num_cpus;
4780 +		else
4781 +			return -EINVAL;
4782 +	}
4783 +}
4784 +
4785 +int assign_cpus_to_clusters(enum cache_level level,
4786 +			    struct scheduling_cluster* clusters[],
4787 +			    unsigned int num_clusters,
4788 +			    struct cluster_cpu* cpus[],
4789 +			    unsigned int num_cpus)
4790 +{
4791 +	cpumask_var_t mask;
4792 +	unsigned int i, free_cluster = 0, low_cpu;
4793 +	int err = 0;
4794 +
4795 +	if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
4796 +		return -ENOMEM;
4797 +
4798 +	/* clear cluster pointers */
4799 +	for (i = 0; i < num_cpus; i++) {
4800 +		cpus[i]->id      = i;
4801 +		cpus[i]->cluster = NULL;
4802 +	}
4803 +
4804 +	/* initialize clusters */
4805 +	for (i = 0; i < num_clusters; i++) {
4806 +		clusters[i]->id = i;
4807 +		INIT_LIST_HEAD(&clusters[i]->cpus);
4808 +	}
4809 +
4810 +	/* Assign each CPU. Two assumptions are made:
4811 +	 * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
4812 +	 * 2) All cpus that belong to some cluster are online.
4813 +	 */
4814 +	for_each_online_cpu(i) {
4815 +		/* get lowest-id CPU in cluster */
4816 +		if (level != GLOBAL_CLUSTER) {
4817 +			err = get_shared_cpu_map(mask, cpus[i]->id, level);
4818 +			if (err != 0) {
4819 +				/* ugh... wrong cache level? Either caller screwed up
4820 +				 * or the CPU topology is weird. */
4821 +				printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
4822 +				       level, err);
4823 +				err = -EINVAL;
4824 +				goto out;
4825 +			}
4826 +			low_cpu = cpumask_first(mask);
4827 +		} else
4828 +			low_cpu = 0;
4829 +		if (low_cpu == i) {
4830 +			/* caller must provide an appropriate number of clusters */
4831 +			BUG_ON(free_cluster >= num_clusters);
4832 +
4833 +			/* create new cluster */
4834 +			cpus[i]->cluster = clusters[free_cluster++];
4835 +		} else {
4836 +			/* low_cpu points to the right cluster
4837 +			 * Assumption: low_cpu is actually online and was processed earlier. */
4838 +			cpus[i]->cluster = cpus[low_cpu]->cluster;
4839 +		}
4840 +		/* enqueue in cpus list */
4841 +		list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
4842 +		printk(KERN_INFO "Assigning CPU%u to cluster %u.\n", i, cpus[i]->cluster->id);
4843 +	}
4844 +out:
4845 +	free_cpumask_var(mask);
4846 +	return err;
4847 +}
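
As a worked example of the assignment loop above: on a hypothetical quad-core system where CPUs 0 and 1 share one L2 cache and CPUs 2 and 3 share another, requesting L2-level clustering makes CPUs 0 and 2 (the lowest-numbered CPU in each cache domain) create clusters 0 and 1, while CPUs 1 and 3 join the cluster of their lowest-numbered neighbor; get_cluster_size() reports 2 CPUs per cluster for that level. With GLOBAL_CLUSTER, every CPU joins the single cluster created by CPU 0.
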
4848 diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
4849 new file mode 100644
4850 index 0000000..6677a67
4851 --- /dev/null
4852 +++ b/litmus/ctrldev.c
4853 @@ -0,0 +1,150 @@
4854 +#include <linux/sched.h>
4855 +#include <linux/mm.h>
4856 +#include <linux/fs.h>
4857 +#include <linux/miscdevice.h>
4858 +#include <linux/module.h>
4859 +
4860 +#include <litmus/litmus.h>
4861 +
4862 +/* only one page for now, but we might want to add a RO version at some point */
4863 +
4864 +#define CTRL_NAME        "litmus/ctrl"
4865 +
4866 +/* allocate t->rt_param.ctrl_page*/
4867 +static int alloc_ctrl_page(struct task_struct *t)
4868 +{
4869 +	int err = 0;
4870 +
4871 +	/* only allocate if the task doesn't have one yet */
4872 +	if (!tsk_rt(t)->ctrl_page) {
4873 +		tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
4874 +		if (!tsk_rt(t)->ctrl_page)
4875 +			err = -ENOMEM;
4876 +		/* will get de-allocated in task teardown */
4877 +		TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
4878 +			   tsk_rt(t)->ctrl_page);
4879 +	}
4880 +	return err;
4881 +}
4882 +
4883 +static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
4884 +{
4885 +	int err;
4886 +	unsigned long pfn;
4887 +
4888 +	struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
4889 +
4890 +	/* Increase ref count. Is decreased when vma is destroyed. */
4891 +	get_page(ctrl);
4892 +
4893 +	/* compute page frame number */
4894 +	pfn = page_to_pfn(ctrl);
4895 +
4896 +	TRACE_CUR(CTRL_NAME
4897 +		  ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n",
4898 +		  tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start,
4899 +		  vma->vm_page_prot);
4900 +
4901 +	/* Map it into the vma. Make sure to use PAGE_SHARED, otherwise
4902 +	 * userspace actually gets a copy-on-write page. */
4903 +	err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED);
4904 +
4905 +	if (err)
4906 +		TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err);
4907 +
4908 +	return err;
4909 +}
4910 +
4911 +static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
4912 +{
4913 +	TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
4914 +		  vma->vm_flags, vma->vm_page_prot);
4915 +
4916 +	TRACE_CUR(CTRL_NAME
4917 +		  ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
4918 +		  (void*) vma->vm_start, (void*) vma->vm_end, vma,
4919 +		  vma->vm_private_data, current->comm,
4920 +		  current->pid);
4921 +}
4922 +
4923 +static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
4924 +				      struct vm_fault* vmf)
4925 +{
4926 +	/* This function should never be called, since
4927 +	 * all pages should have been mapped by mmap()
4928 +	 * already. */
4929 +	TRACE_CUR("%s flags=0x%x\n", __FUNCTION__, vma->vm_flags);
4930 +
4931 +	/* nope, you only get one page */
4932 +	return VM_FAULT_SIGBUS;
4933 +}
4934 +
4935 +static struct vm_operations_struct litmus_ctrl_vm_ops = {
4936 +	.close = litmus_ctrl_vm_close,
4937 +	.fault = litmus_ctrl_vm_fault,
4938 +};
4939 +
4940 +static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
4941 +{
4942 +	int err = 0;
4943 +
4944 +	/* first make sure mapper knows what he's doing */
4945 +
4946 +	/* you can only get one page */
4947 +	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
4948 +		return -EINVAL;
4949 +
4950 +	/* you can only map the "first" page */
4951 +	if (vma->vm_pgoff != 0)
4952 +		return -EINVAL;
4953 +
4954 +	/* you can't share it with anyone */
4955 +	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
4956 +		return -EINVAL;
4957 +
4958 +	vma->vm_ops = &litmus_ctrl_vm_ops;
4959 +	/* this mapping should not be kept across forks,
4960 +	 * and cannot be expanded */
4961 +	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
4962 +
4963 +	err = alloc_ctrl_page(current);
4964 +	if (!err)
4965 +		err = map_ctrl_page(current, vma);
4966 +
4967 +	TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
4968 +		  __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
4969 +
4970 +	return err;
4971 +}
4972 +
4973 +static struct file_operations litmus_ctrl_fops = {
4974 +	.owner = THIS_MODULE,
4975 +	.mmap  = litmus_ctrl_mmap,
4976 +};
4977 +
4978 +static struct miscdevice litmus_ctrl_dev = {
4979 +	.name  = CTRL_NAME,
4980 +	.minor = MISC_DYNAMIC_MINOR,
4981 +	.fops  = &litmus_ctrl_fops,
4982 +};
4983 +
4984 +static int __init init_litmus_ctrl_dev(void)
4985 +{
4986 +	int err;
4987 +
4988 +	BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
4989 +
4990 +	printk("Initializing LITMUS^RT control device.\n");
4991 +	err = misc_register(&litmus_ctrl_dev);
4992 +	if (err)
4993 +		printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
4994 +	return err;
4995 +}
4996 +
4997 +static void __exit exit_litmus_ctrl_dev(void)
4998 +{
4999 +	misc_deregister(&litmus_ctrl_dev);
5000 +}
5001 +
5002 +module_init(init_litmus_ctrl_dev);
5003 +module_exit(exit_litmus_ctrl_dev);
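
From userspace, the control page is obtained by mapping exactly one page of the device at offset 0 with a private mapping, matching the checks in litmus_ctrl_mmap() above. A minimal, self-contained sketch follows (normally this is hidden behind liblitmus; the /dev/litmus/ctrl path assumes the same default udev naming as the /dev/litmus/log device mentioned in the Kconfig help):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		long page_size = sysconf(_SC_PAGESIZE);
		int fd = open("/dev/litmus/ctrl", O_RDWR);
		void *ctrl_page;

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* One page, offset 0, private mapping: anything else is
		 * rejected by litmus_ctrl_mmap() with -EINVAL. */
		ctrl_page = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE, fd, 0);
		if (ctrl_page == MAP_FAILED) {
			perror("mmap");
			close(fd);
			return 1;
		}

		/* ctrl_page now aliases this task's control_page in the kernel. */
		munmap(ctrl_page, page_size);
		close(fd);
		return 0;
	}
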
5004 diff --git a/litmus/edf_common.c b/litmus/edf_common.c
5005 new file mode 100644
5006 index 0000000..9b44dc2
5007 --- /dev/null
5008 +++ b/litmus/edf_common.c
5009 @@ -0,0 +1,118 @@
5010 +/*
5011 + * kernel/edf_common.c
5012 + *
5013 + * Common functions for EDF based scheduler.
5014 + */
5015 +
5016 +#include <linux/percpu.h>
5017 +#include <linux/sched.h>
5018 +#include <linux/list.h>
5019 +
5020 +#include <litmus/litmus.h>
5021 +#include <litmus/sched_plugin.h>
5022 +#include <litmus/sched_trace.h>
5023 +
5024 +#include <litmus/edf_common.h>
5025 +
5026 +/* edf_higher_prio -  returns true if first has a higher EDF priority
5027 + *                    than second. Deadline ties are broken by PID.
5028 + *
5029 + * both first and second may be NULL
5030 + */
5031 +int edf_higher_prio(struct task_struct* first,
5032 +		    struct task_struct* second)
5033 +{
5034 +	struct task_struct *first_task = first;
5035 +	struct task_struct *second_task = second;
5036 +
5037 +	/* There is no point in comparing a task to itself. */
5038 +	if (first && first == second) {
5039 +		TRACE_TASK(first,
5040 +			   "WARNING: pointless edf priority comparison.\n");
5041 +		return 0;
5042 +	}
5043 +
5044 +
5045 +	/* check for NULL tasks */
5046 +	if (!first || !second)
5047 +		return first && !second;
5048 +
5049 +#ifdef CONFIG_LITMUS_LOCKING
5050 +
5051 +	/* Check for inherited priorities. Change task
5052 +	 * used for comparison in such a case.
5053 +	 */
5054 +	if (unlikely(first->rt_param.inh_task))
5055 +		first_task = first->rt_param.inh_task;
5056 +	if (unlikely(second->rt_param.inh_task))
5057 +		second_task = second->rt_param.inh_task;
5058 +
5059 +	/* Check for priority boosting. Tie-break by start of boosting.
5060 +	 */
5061 +	if (unlikely(is_priority_boosted(first_task))) {
5062 +		/* first_task is boosted, how about second_task? */
5063 +		if (!is_priority_boosted(second_task) ||
5064 +		    lt_before(get_boost_start(first_task),
5065 +			      get_boost_start(second_task)))
5066 +			return 1;
5067 +		else
5068 +			return 0;
5069 +	} else if (unlikely(is_priority_boosted(second_task)))
5070 +		/* second_task is boosted, first is not */
5071 +		return 0;
5072 +
5073 +#endif
5074 +
5075 +
5076 +	return !is_realtime(second_task)  ||
5077 +
5078 +		/* is the deadline of the first task earlier?
5079 +		 * Then it has higher priority.
5080 +		 */
5081 +		earlier_deadline(first_task, second_task) ||
5082 +
5083 +		/* Do we have a deadline tie?
5084 +		 * Then break by PID.
5085 +		 */
5086 +		(get_deadline(first_task) == get_deadline(second_task) &&
5087 +	        (first_task->pid < second_task->pid ||
5088 +
5089 +		/* If the PIDs are the same then the task with the inherited
5090 +		 * priority wins.
5091 +		 */
5092 +		(first_task->pid == second_task->pid &&
5093 +		 !second->rt_param.inh_task)));
5094 +}
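For illustration, the tie-break chain encoded by the return expression above can be read as: earlier deadline wins; equal deadlines fall back to the lower PID; equal PIDs prefer the task without an inherited priority. A minimal standalone model (the struct, the field values, and the omission of the boosting and non-real-time cases are simplifications, not kernel code):

#include <stdio.h>

/* Hypothetical stand-in for the task_struct fields edf_higher_prio() reads. */
struct model_task {
	unsigned long long deadline;	/* get_deadline()            */
	int pid;			/* task_struct::pid          */
	int has_inh_task;		/* rt_param.inh_task != NULL */
};

static int model_edf_higher_prio(const struct model_task *a,
				 const struct model_task *b)
{
	return a->deadline < b->deadline ||
		(a->deadline == b->deadline &&
		 (a->pid < b->pid ||
		  (a->pid == b->pid && !b->has_inh_task)));
}

int main(void)
{
	struct model_task t1 = { 100, 42, 0 }, t2 = { 100, 17, 0 };
	/* Equal deadlines: the lower PID (17) wins the tie-break. */
	printf("%d\n", model_edf_higher_prio(&t2, &t1));	/* prints 1 */
	return 0;
}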
5095 +
5096 +int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
5097 +{
5098 +	return edf_higher_prio(bheap2task(a), bheap2task(b));
5099 +}
5100 +
5101 +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
5102 +		      release_jobs_t release)
5103 +{
5104 +	rt_domain_init(rt,  edf_ready_order, resched, release);
5105 +}
5106 +
5107 +/* edf_preemption_needed - check whether the task t needs to be preempted;
5108 + *                   call only with irqs disabled and with the ready_lock acquired
5109 + *                   THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
5110 + */
5111 +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
5112 +{
5113 +	/* we need the read lock for edf_ready_queue */
5114 +	/* no need to preempt if there is nothing pending */
5115 +	if (!__jobs_pending(rt))
5116 +		return 0;
5117 +	/* we need to reschedule if t doesn't exist */
5118 +	if (!t)
5119 +		return 1;
5120 +
5121 +	/* NOTE: We cannot check for non-preemptibility since we
5122 +	 *       don't know what address space we're currently in.
5123 +	 */
5124 +
5125 +	/* make sure to get non-rt stuff out of the way */
5126 +	return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
5127 +}
5128 diff --git a/litmus/fdso.c b/litmus/fdso.c
5129 new file mode 100644
5130 index 0000000..aa7b384
5131 --- /dev/null
5132 +++ b/litmus/fdso.c
5133 @@ -0,0 +1,293 @@
5134 +/* fdso.c - file descriptor attached shared objects
5135 + *
5136 + * (c) 2007 B. Brandenburg, LITMUS^RT project
5137 + *
5138 + * Notes:
5139 + *   - object descriptor (OD) tables are not cloned during a fork.
5140 + *   - objects are created on-demand, and freed after the last reference
5141 + *     is dropped.
5142 + *   - for now, object types are hard coded.
5143 + *   - As long as we have live objects, we keep a reference to the inode.
5144 + */
5145 +
5146 +#include <linux/errno.h>
5147 +#include <linux/sched.h>
5148 +#include <linux/mutex.h>
5149 +#include <linux/file.h>
5150 +#include <asm/uaccess.h>
5151 +
5152 +#include <litmus/fdso.h>
5153 +
5154 +extern struct fdso_ops generic_lock_ops;
5155 +
5156 +static const struct fdso_ops* fdso_ops[] = {
5157 +	&generic_lock_ops, /* FMLP_SEM */
5158 +	&generic_lock_ops, /* SRP_SEM */
5159 +};
5160 +
5161 +static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
5162 +{
5163 +	if (fdso_ops[type]->create)
5164 +		return fdso_ops[type]->create(obj_ref, type, config);
5165 +	else
5166 +		return -EINVAL;
5167 +}
5168 +
5169 +static void fdso_destroy(obj_type_t type, void* obj)
5170 +{
5171 +	fdso_ops[type]->destroy(type, obj);
5172 +}
5173 +
5174 +static int fdso_open(struct od_table_entry* entry, void* __user config)
5175 +{
5176 +	if (fdso_ops[entry->obj->type]->open)
5177 +		return fdso_ops[entry->obj->type]->open(entry, config);
5178 +	else
5179 +		return 0;
5180 +}
5181 +
5182 +static int fdso_close(struct od_table_entry* entry)
5183 +{
5184 +	if (fdso_ops[entry->obj->type]->close)
5185 +		return fdso_ops[entry->obj->type]->close(entry);
5186 +	else
5187 +		return 0;
5188 +}
5189 +
5190 +/* inode must be locked already */
5191 +static int alloc_inode_obj(struct inode_obj_id** obj_ref,
5192 +			   struct inode* inode,
5193 +			   obj_type_t type,
5194 +			   unsigned int id,
5195 +			   void* __user config)
5196 +{
5197 +	struct inode_obj_id* obj;
5198 +	void* raw_obj;
5199 +	int err;
5200 +
5201 +	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
5202 +	if (!obj) {
5203 +		return -ENOMEM;
5204 +	}
5205 +
5206 +	err = fdso_create(&raw_obj, type, config);
5207 +	if (err != 0) {
5208 +		kfree(obj);
5209 +		return err;
5210 +	}
5211 +
5212 +	INIT_LIST_HEAD(&obj->list);
5213 +	atomic_set(&obj->count, 1);
5214 +	obj->type  = type;
5215 +	obj->id    = id;
5216 +	obj->obj   = raw_obj;
5217 +	obj->inode = inode;
5218 +
5219 +	list_add(&obj->list, &inode->i_obj_list);
5220 +	atomic_inc(&inode->i_count);
5221 +
5222 +	printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
5223 +
5224 +	*obj_ref = obj;
5225 +	return 0;
5226 +}
5227 +
5228 +/* inode must be locked already */
5229 +static struct inode_obj_id* get_inode_obj(struct inode* inode,
5230 +					  obj_type_t type,
5231 +					  unsigned int id)
5232 +{
5233 +	struct list_head* pos;
5234 +	struct inode_obj_id* obj = NULL;
5235 +
5236 +	list_for_each(pos, &inode->i_obj_list) {
5237 +		obj = list_entry(pos, struct inode_obj_id, list);
5238 +		if (obj->id == id && obj->type == type) {
5239 +			atomic_inc(&obj->count);
5240 +			return obj;
5241 +		}
5242 +	}
5243 +	printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
5244 +	return NULL;
5245 +}
5246 +
5247 +
5248 +static void put_inode_obj(struct inode_obj_id* obj)
5249 +{
5250 +	struct inode* inode;
5251 +	int let_go = 0;
5252 +
5253 +	inode = obj->inode;
5254 +	if (atomic_dec_and_test(&obj->count)) {
5255 +
5256 +		mutex_lock(&inode->i_obj_mutex);
5257 +		/* no new references can be obtained */
5258 +		if (!atomic_read(&obj->count)) {
5259 +			list_del(&obj->list);
5260 +			fdso_destroy(obj->type, obj->obj);
5261 +			kfree(obj);
5262 +			let_go = 1;
5263 +		}
5264 +		mutex_unlock(&inode->i_obj_mutex);
5265 +		if (let_go)
5266 +			iput(inode);
5267 +	}
5268 +}
5269 +
5270 +static struct od_table_entry*  get_od_entry(struct task_struct* t)
5271 +{
5272 +	struct od_table_entry* table;
5273 +	int i;
5274 +
5275 +
5276 +	table = t->od_table;
5277 +	if (!table) {
5278 +		table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
5279 +				GFP_KERNEL);
5280 +		t->od_table = table;
5281 +	}
5282 +
5283 +	for (i = 0; table &&  i < MAX_OBJECT_DESCRIPTORS; i++)
5284 +		if (!table[i].used) {
5285 +			table[i].used = 1;
5286 +			return table + i;
5287 +		}
5288 +	return NULL;
5289 +}
5290 +
5291 +static int put_od_entry(struct od_table_entry* od)
5292 +{
5293 +	put_inode_obj(od->obj);
5294 +	od->used = 0;
5295 +	return 0;
5296 +}
5297 +
5298 +void exit_od_table(struct task_struct* t)
5299 +{
5300 +	int i;
5301 +
5302 +	if (t->od_table) {
5303 +		for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
5304 +			if (t->od_table[i].used)
5305 +				put_od_entry(t->od_table + i);
5306 +		kfree(t->od_table);
5307 +		t->od_table = NULL;
5308 +	}
5309 +}
5310 +
5311 +static int do_sys_od_open(struct file* file, obj_type_t type, int id,
5312 +			  void* __user config)
5313 +{
5314 +	int idx = 0, err = 0;
5315 +	struct inode* inode;
5316 +	struct inode_obj_id* obj = NULL;
5317 +	struct od_table_entry* entry;
5318 +
5319 +	inode = file->f_dentry->d_inode;
5320 +
5321 +	entry = get_od_entry(current);
5322 +	if (!entry)
5323 +		return -ENOMEM;
5324 +
5325 +	mutex_lock(&inode->i_obj_mutex);
5326 +	obj = get_inode_obj(inode, type, id);
5327 +	if (!obj)
5328 +		err = alloc_inode_obj(&obj, inode, type, id, config);
5329 +	if (err != 0) {
5330 +		obj = NULL;
5331 +		idx = err;
5332 +		entry->used = 0;
5333 +	} else {
5334 +		entry->obj   = obj;
5335 +		entry->class = fdso_ops[type];
5336 +		idx = entry - current->od_table;
5337 +	}
5338 +
5339 +	mutex_unlock(&inode->i_obj_mutex);
5340 +
5341 +	/* open only if creation succeeded */
5342 +	if (!err)
5343 +		err = fdso_open(entry, config);
5344 +	if (err < 0) {
5345 +		/* The class rejected the open call.
5346 +		 * We need to clean up and tell user space.
5347 +		 */
5348 +		if (obj)
5349 +			put_od_entry(entry);
5350 +		idx = err;
5351 +	}
5352 +
5353 +	return idx;
5354 +}
5355 +
5356 +
5357 +struct od_table_entry* get_entry_for_od(int od)
5358 +{
5359 +	struct task_struct *t = current;
5360 +
5361 +	if (!t->od_table)
5362 +		return NULL;
5363 +	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
5364 +		return NULL;
5365 +	if (!t->od_table[od].used)
5366 +		return NULL;
5367 +	return t->od_table + od;
5368 +}
5369 +
5370 +
5371 +asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
5372 +{
5373 +	int ret = 0;
5374 +	struct file*  file;
5375 +
5376 +	/*
5377 +	   1) get file from fd, get inode from file
5378 +	   2) lock inode
5379 +	   3) try to lookup object
5380 +	   4) if not present create and enqueue object, inc inode refcnt
5381 +	   5) increment refcnt of object
5382 +	   6) alloc od_table_entry, setup ptrs
5383 +	   7) unlock inode
5384 +	   8) return offset in od_table as OD
5385 +	 */
5386 +
5387 +	if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
5388 +		ret = -EINVAL;
5389 +		goto out;
5390 +	}
5391 +
5392 +	file = fget(fd);
5393 +	if (!file) {
5394 +		ret = -EBADF;
5395 +		goto out;
5396 +	}
5397 +
5398 +	ret = do_sys_od_open(file, type, obj_id, config);
5399 +
5400 +	fput(file);
5401 +
5402 +out:
5403 +	return ret;
5404 +}
5405 +
5406 +
5407 +asmlinkage long sys_od_close(int od)
5408 +{
5409 +	int ret = -EINVAL;
5410 +	struct task_struct *t = current;
5411 +
5412 +	if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
5413 +		return ret;
5414 +
5415 +	if (!t->od_table || !t->od_table[od].used)
5416 +		return ret;
5417 +
5418 +
5419 +	/* give the class a chance to reject the close
5420 +	 */
5421 +	ret = fdso_close(t->od_table + od);
5422 +	if (ret == 0)
5423 +		ret = put_od_entry(t->od_table + od);
5424 +
5425 +	return ret;
5426 +}
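For illustration, a hedged user-space sketch of the object-descriptor life cycle implemented above: a task opens a shared file as the naming context, attaches an object of a given type and id to its inode via sys_od_open(), and later releases it via sys_od_close(). The __NR_od_open/__NR_od_close constants and the FMLP_SEM type are assumed to come from this patch's litmus/unistd_*.h and litmus/fdso.h headers and are deliberately not redefined here; the file path and object id are placeholders.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int fd, od;

	/* Any file shared by the cooperating tasks serves as the namespace. */
	fd = open("/tmp/litmus_ns", O_RDONLY | O_CREAT, 0666);
	if (fd < 0)
		return 1;

	/* Attach object id 0 of type FMLP_SEM to the file's inode. */
	od = syscall(__NR_od_open, fd, FMLP_SEM, 0, NULL);
	if (od < 0) {
		perror("od_open");
		return 1;
	}

	/* ... use the object descriptor, e.g., with sys_litmus_lock() ... */

	syscall(__NR_od_close, od);
	close(fd);
	return 0;
}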
5427 diff --git a/litmus/ft_event.c b/litmus/ft_event.c
5428 new file mode 100644
5429 index 0000000..399a07b
5430 --- /dev/null
5431 +++ b/litmus/ft_event.c
5432 @@ -0,0 +1,43 @@
5433 +#include <linux/types.h>
5434 +
5435 +#include <litmus/feather_trace.h>
5436 +
5437 +#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA)
5438 +/* provide dummy implementation */
5439 +
5440 +int ft_events[MAX_EVENTS];
5441 +
5442 +int ft_enable_event(unsigned long id)
5443 +{
5444 +	if (id < MAX_EVENTS) {
5445 +		ft_events[id]++;
5446 +		return 1;
5447 +	} else
5448 +		return 0;
5449 +}
5450 +
5451 +int ft_disable_event(unsigned long id)
5452 +{
5453 +	if (id < MAX_EVENTS && ft_events[id]) {
5454 +		ft_events[id]--;
5455 +		return 1;
5456 +	} else
5457 +		return 0;
5458 +}
5459 +
5460 +int ft_disable_all_events(void)
5461 +{
5462 +	int i;
5463 +
5464 +	for (i = 0; i < MAX_EVENTS; i++)
5465 +		ft_events[i] = 0;
5466 +
5467 +	return MAX_EVENTS;
5468 +}
5469 +
5470 +int ft_is_event_enabled(unsigned long id)
5471 +{
5472 +	return 	id < MAX_EVENTS && ft_events[id];
5473 +}
5474 +
5475 +#endif
5476 diff --git a/litmus/ftdev.c b/litmus/ftdev.c
5477 new file mode 100644
5478 index 0000000..06fcf4c
5479 --- /dev/null
5480 +++ b/litmus/ftdev.c
5481 @@ -0,0 +1,439 @@
5482 +#include <linux/sched.h>
5483 +#include <linux/fs.h>
5484 +#include <linux/slab.h>
5485 +#include <linux/cdev.h>
5486 +#include <asm/uaccess.h>
5487 +#include <linux/module.h>
5488 +#include <linux/device.h>
5489 +
5490 +#include <litmus/litmus.h>
5491 +#include <litmus/feather_trace.h>
5492 +#include <litmus/ftdev.h>
5493 +
5494 +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
5495 +{
5496 +	struct ft_buffer* buf;
5497 +	size_t total = (size + 1) * count;
5498 +	char* mem;
5499 +	int order = 0, pages = 1;
5500 +
5501 +	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
5502 +	if (!buf)
5503 +		return NULL;
5504 +
5505 +	total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
5506 +	while (pages < total) {
5507 +		order++;
5508 +		pages *= 2;
5509 +	}
5510 +
5511 +	mem = (char*) __get_free_pages(GFP_KERNEL, order);
5512 +	if (!mem) {
5513 +		kfree(buf);
5514 +		return NULL;
5515 +	}
5516 +
5517 +	if (!init_ft_buffer(buf, count, size,
5518 +			    mem + (count * size),  /* markers at the end */
5519 +			    mem)) {                /* buffer objects     */
5520 +		free_pages((unsigned long) mem, order);
5521 +		kfree(buf);
5522 +		return NULL;
5523 +	}
5524 +	return buf;
5525 +}
5526 +
5527 +void free_ft_buffer(struct ft_buffer* buf)
5528 +{
5529 +	int order = 0, pages = 1;
5530 +	size_t total;
5531 +
5532 +	if (buf) {
5533 +		total = (buf->slot_size + 1) * buf->slot_count;
5534 +		total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
5535 +		while (pages < total) {
5536 +			order++;
5537 +			pages *= 2;
5538 +		}
5539 +		free_pages((unsigned long) buf->buffer_mem, order);
5540 +		kfree(buf);
5541 +	}
5542 +}
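For illustration of the sizing logic shared by alloc_ft_buffer() and free_ft_buffer(): each of the count slots needs size payload bytes plus one status byte for its marker, the byte total is rounded up to whole pages, and the page count is rounded up to a power of two because __get_free_pages() allocates by order. A standalone sketch assuming a 4 KiB page size:

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL	/* illustrative; the kernel uses PAGE_SIZE */

static int buffer_order(unsigned int count, unsigned long size)
{
	unsigned long total = (size + 1) * count;	/* payload + marker bytes */
	unsigned long needed = total / SKETCH_PAGE_SIZE +
		(total % SKETCH_PAGE_SIZE != 0);	/* round up to pages */
	unsigned long pages = 1;
	int order = 0;

	while (pages < needed) {			/* round up to 2^order */
		order++;
		pages *= 2;
	}
	return order;
}

int main(void)
{
	/* 1000 slots of 16 bytes -> 17000 bytes -> 5 pages -> order 3 (8 pages). */
	printf("order = %d\n", buffer_order(1000, 16));
	return 0;
}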
5543 +
5544 +struct ftdev_event {
5545 +	int id;
5546 +	struct ftdev_event* next;
5547 +};
5548 +
5549 +static int activate(struct ftdev_event** chain, int id)
5550 +{
5551 +	struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
5552 +	if (ev) {
5553 +		printk(KERN_INFO
5554 +		       "Enabling feather-trace event %d.\n", (int) id);
5555 +		ft_enable_event(id);
5556 +		ev->id = id;
5557 +		ev->next = *chain;
5558 +		*chain    = ev;
5559 +	}
5560 +	return ev ? 0 : -ENOMEM;
5561 +}
5562 +
5563 +static void deactivate(struct ftdev_event** chain, int id)
5564 +{
5565 +	struct ftdev_event **cur = chain;
5566 +	struct ftdev_event *nxt;
5567 +	while (*cur) {
5568 +		if ((*cur)->id == id) {
5569 +			nxt   = (*cur)->next;
5570 +			kfree(*cur);
5571 +			*cur  = nxt;
5572 +			printk(KERN_INFO
5573 +			       "Disabling feather-trace event %d.\n", (int) id);
5574 +			ft_disable_event(id);
5575 +			break;
5576 +		}
5577 +		cur = &(*cur)->next;
5578 +	}
5579 +}
5580 +
5581 +static int ftdev_open(struct inode *in, struct file *filp)
5582 +{
5583 +	struct ftdev* ftdev;
5584 +	struct ftdev_minor* ftdm;
5585 +	unsigned int buf_idx = iminor(in);
5586 +	int err = 0;
5587 +
5588 +	ftdev = container_of(in->i_cdev, struct ftdev, cdev);
5589 +
5590 +	if (buf_idx >= ftdev->minor_cnt) {
5591 +		err = -ENODEV;
5592 +		goto out;
5593 +	}
5594 +	if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
5595 +		goto out;
5596 +
5597 +	ftdm = ftdev->minor + buf_idx;
5598 +	ftdm->ftdev = ftdev;
5599 +	filp->private_data = ftdm;
5600 +
5601 +	if (mutex_lock_interruptible(&ftdm->lock)) {
5602 +		err = -ERESTARTSYS;
5603 +		goto out;
5604 +	}
5605 +
5606 +	if (!ftdm->readers && ftdev->alloc)
5607 +		err = ftdev->alloc(ftdev, buf_idx);
5608 +	if (0 == err)
5609 +		ftdm->readers++;
5610 +
5611 +	mutex_unlock(&ftdm->lock);
5612 +out:
5613 +	return err;
5614 +}
5615 +
5616 +static int ftdev_release(struct inode *in, struct file *filp)
5617 +{
5618 +	struct ftdev* ftdev;
5619 +	struct ftdev_minor* ftdm;
5620 +	unsigned int buf_idx = iminor(in);
5621 +	int err = 0;
5622 +
5623 +	ftdev = container_of(in->i_cdev, struct ftdev, cdev);
5624 +
5625 +	if (buf_idx >= ftdev->minor_cnt) {
5626 +		err = -ENODEV;
5627 +		goto out;
5628 +	}
5629 +	ftdm = ftdev->minor + buf_idx;
5630 +
5631 +	if (mutex_lock_interruptible(&ftdm->lock)) {
5632 +		err = -ERESTARTSYS;
5633 +		goto out;
5634 +	}
5635 +
5636 +	if (ftdm->readers == 1) {
5637 +		while (ftdm->events)
5638 +			deactivate(&ftdm->events, ftdm->events->id);
5639 +
5640 +		/* wait for any pending events to complete */
5641 +		set_current_state(TASK_UNINTERRUPTIBLE);
5642 +		schedule_timeout(HZ);
5643 +
5644 +		printk(KERN_ALERT "Failed trace writes: %u\n",
5645 +		       ftdm->buf->failed_writes);
5646 +
5647 +		if (ftdev->free)
5648 +			ftdev->free(ftdev, buf_idx);
5649 +	}
5650 +
5651 +	ftdm->readers--;
5652 +	mutex_unlock(&ftdm->lock);
5653 +out:
5654 +	return err;
5655 +}
5656 +
5657 +/* based on ft_buffer_read
5658 + * @returns < 0 : page fault
5659 + *          = 0 : no data available
5660 + *          = 1 : one slot copied
5661 + */
5662 +static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
5663 +{
5664 +	unsigned int idx;
5665 +	int err = 0;
5666 +	if (buf->free_count != buf->slot_count) {
5667 +		/* data available */
5668 +		idx = buf->read_idx % buf->slot_count;
5669 +		if (buf->slots[idx] == SLOT_READY) {
5670 +			err = copy_to_user(dest, ((char*) buf->buffer_mem) +
5671 +					   idx * buf->slot_size,
5672 +					   buf->slot_size);
5673 +			if (err == 0) {
5674 +				/* copy ok */
5675 +				buf->slots[idx] = SLOT_FREE;
5676 +				buf->read_idx++;
5677 +				fetch_and_inc(&buf->free_count);
5678 +				err = 1;
5679 +			}
5680 +		}
5681 +	}
5682 +	return err;
5683 +}
5684 +
5685 +static ssize_t ftdev_read(struct file *filp,
5686 +			  char __user *to, size_t len, loff_t *f_pos)
5687 +{
5688 +	/* We ignore f_pos; reads are strictly sequential. */
5689 +
5690 +	ssize_t err = 0;
5691 +	size_t chunk;
5692 +	int copied;
5693 +	struct ftdev_minor* ftdm = filp->private_data;
5694 +
5695 +	if (mutex_lock_interruptible(&ftdm->lock)) {
5696 +		err = -ERESTARTSYS;
5697 +		goto out;
5698 +	}
5699 +
5700 +
5701 +	chunk = ftdm->buf->slot_size;
5702 +	while (len >= chunk) {
5703 +		copied = ft_buffer_copy_to_user(ftdm->buf, to);
5704 +		if (copied == 1) {
5705 +			len    -= chunk;
5706 +			to     += chunk;
5707 +			err    += chunk;
5708 +	        } else if (err == 0 && copied == 0 && ftdm->events) {
5709 +			/* Only wait if there are any events enabled and only
5710 +			 * if we haven't copied some data yet. We cannot wait
5711 +			 * here with copied data because that data would get
5712 +			 * lost if the task is interrupted (e.g., killed).
5713 +			 */
5714 +			set_current_state(TASK_INTERRUPTIBLE);
5715 +			schedule_timeout(50);
5716 +			if (signal_pending(current)) {
5717 +				if (err == 0)
5718 +					/* nothing read yet, signal problem */
5719 +					err = -ERESTARTSYS;
5720 +				break;
5721 +			}
5722 +		} else if (copied < 0) {
5723 +			/* page fault */
5724 +			err = copied;
5725 +			break;
5726 +		} else
5727 +			/* nothing left to get, return to user space */
5728 +			break;
5729 +	}
5730 +	mutex_unlock(&ftdm->lock);
5731 +out:
5732 +	return err;
5733 +}
5734 +
5735 +static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
5736 +{
5737 +	long err = -ENOIOCTLCMD;
5738 +	struct ftdev_minor* ftdm = filp->private_data;
5739 +
5740 +	if (mutex_lock_interruptible(&ftdm->lock)) {
5741 +		err = -ERESTARTSYS;
5742 +		goto out;
5743 +	}
5744 +
5745 +	/* FIXME: check id against list of acceptable events */
5746 +
5747 +	switch (cmd) {
5748 +	case  FTDEV_ENABLE_CMD:
5749 +		if (activate(&ftdm->events, arg))
5750 +			err = -ENOMEM;
5751 +		else
5752 +			err = 0;
5753 +		break;
5754 +
5755 +	case FTDEV_DISABLE_CMD:
5756 +		deactivate(&ftdm->events, arg);
5757 +		err = 0;
5758 +		break;
5759 +
5760 +	default:
5761 +		printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg);
5762 +	};
5763 +
5764 +	mutex_unlock(&ftdm->lock);
5765 +out:
5766 +	return err;
5767 +}
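For illustration, a hedged user-space consumer of the read and ioctl paths above: enable an event, then pull whole slot-sized records in a loop. FTDEV_ENABLE_CMD and FTDEV_DISABLE_CMD are assumed to come from this patch's litmus/ftdev.h, the device path reflects the "litmus/%s%d" name used by device_create() in register_ftdev() below, and the event id and slot size are placeholders that must match the actual buffer.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define EXAMPLE_EVENT_ID  501	/* placeholder event id              */
#define EXAMPLE_SLOT_SIZE 16	/* must match the buffer's slot_size */

int main(void)
{
	char rec[EXAMPLE_SLOT_SIZE];
	ssize_t n;
	int fd = open("/dev/litmus/ft_trace0", O_RDWR);	/* path is an assumption */

	if (fd < 0)
		return 1;

	ioctl(fd, FTDEV_ENABLE_CMD, EXAMPLE_EVENT_ID);

	/* ftdev_read() only hands out whole slots, so always ask for >= slot_size. */
	while ((n = read(fd, rec, sizeof(rec))) > 0) {
		/* ... consume one record of n bytes ... */
	}

	ioctl(fd, FTDEV_DISABLE_CMD, EXAMPLE_EVENT_ID);
	close(fd);
	return 0;
}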
5768 +
5769 +static ssize_t ftdev_write(struct file *filp, const char __user *from,
5770 +			   size_t len, loff_t *f_pos)
5771 +{
5772 +	struct ftdev_minor* ftdm = filp->private_data;
5773 +	ssize_t err = -EINVAL;
5774 +	struct ftdev* ftdev = ftdm->ftdev;
5775 +
5776 +	/* dispatch write to buffer-specific code, if available */
5777 +	if (ftdev->write)
5778 +		err = ftdev->write(ftdm->buf, len, from);
5779 +
5780 +	return err;
5781 +}
5782 +
5783 +struct file_operations ftdev_fops = {
5784 +	.owner   = THIS_MODULE,
5785 +	.open    = ftdev_open,
5786 +	.release = ftdev_release,
5787 +	.write   = ftdev_write,
5788 +	.read    = ftdev_read,
5789 +	.unlocked_ioctl = ftdev_ioctl,
5790 +};
5791 +
5792 +int ftdev_init(	struct ftdev* ftdev, struct module* owner,
5793 +		const int minor_cnt, const char* name)
5794 +{
5795 +	int i, err;
5796 +
5797 +	BUG_ON(minor_cnt < 1);
5798 +
5799 +	cdev_init(&ftdev->cdev, &ftdev_fops);
5800 +	ftdev->name = name;
5801 +	ftdev->minor_cnt = minor_cnt;
5802 +	ftdev->cdev.owner = owner;
5803 +	ftdev->cdev.ops = &ftdev_fops;
5804 +	ftdev->alloc    = NULL;
5805 +	ftdev->free     = NULL;
5806 +	ftdev->can_open = NULL;
5807 +	ftdev->write	= NULL;
5808 +
5809 +	ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
5810 +			GFP_KERNEL);
5811 +	if (!ftdev->minor) {
5812 +		printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n",
5813 +			ftdev->name);
5814 +		err = -ENOMEM;
5815 +		goto err_out;
5816 +	}
5817 +
5818 +	for (i = 0; i < ftdev->minor_cnt; i++) {
5819 +		mutex_init(&ftdev->minor[i].lock);
5820 +		ftdev->minor[i].readers = 0;
5821 +		ftdev->minor[i].buf     = NULL;
5822 +		ftdev->minor[i].events  = NULL;
5823 +	}
5824 +
5825 +	ftdev->class = class_create(owner, ftdev->name);
5826 +	if (IS_ERR(ftdev->class)) {
5827 +		err = PTR_ERR(ftdev->class);
5828 +		printk(KERN_WARNING "ftdev(%s): "
5829 +			"Could not create device class.\n", ftdev->name);
5830 +		goto err_dealloc;
5831 +	}
5832 +
5833 +	return 0;
5834 +
5835 +err_dealloc:
5836 +	kfree(ftdev->minor);
5837 +err_out:
5838 +	return err;
5839 +}
5840 +
5841 +/*
5842 + * Destroy minor devices up to, but not including, up_to.
5843 + */
5844 +static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to)
5845 +{
5846 +	dev_t minor_cntr;
5847 +
5848 +	if (up_to < 1)
5849 +		up_to = (ftdev->minor_cnt < 1) ? 0 : ftdev->minor_cnt;
5850 +
5851 +	for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr)
5852 +		device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr));
5853 +}
5854 +
5855 +void ftdev_exit(struct ftdev* ftdev)
5856 +{
5857 +	printk("ftdev(%s): Exiting\n", ftdev->name);
5858 +	ftdev_device_destroy(ftdev, -1);
5859 +	cdev_del(&ftdev->cdev);
5860 +	unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
5861 +	class_destroy(ftdev->class);
5862 +	kfree(ftdev->minor);
5863 +}
5864 +
5865 +int register_ftdev(struct ftdev* ftdev)
5866 +{
5867 +	struct device **device;
5868 +	dev_t trace_dev_tmp, minor_cntr;
5869 +	int err;
5870 +
5871 +	err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt,
5872 +			ftdev->name);
5873 +	if (err) {
5874 +		printk(KERN_WARNING "ftdev(%s): "
5875 +		       "Could not allocate char. device region (%d minors)\n",
5876 +		       ftdev->name, ftdev->minor_cnt);
5877 +		goto err_out;
5878 +	}
5879 +
5880 +	ftdev->major = MAJOR(trace_dev_tmp);
5881 +
5882 +	err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt);
5883 +	if (err) {
5884 +		printk(KERN_WARNING "ftdev(%s): "
5885 +		       "Could not add cdev for major %u with %u minor(s).\n",
5886 +		       ftdev->name, ftdev->major, ftdev->minor_cnt);
5887 +		goto err_unregister;
5888 +	}
5889 +
5890 +	/* create the minor device(s) */
5891 +	for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr)
5892 +	{
5893 +		trace_dev_tmp = MKDEV(ftdev->major, minor_cntr);
5894 +		device = &ftdev->minor[minor_cntr].device;
5895 +
5896 +		*device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL,
5897 +				"litmus/%s%d", ftdev->name, minor_cntr);
5898 +		if (IS_ERR(*device)) {
5899 +			err = PTR_ERR(*device);
5900 +			printk(KERN_WARNING "ftdev(%s): "
5901 +				"Could not create device major/minor number "
5902 +				"%u/%u\n", ftdev->name, ftdev->major,
5903 +				minor_cntr);
5904 +			printk(KERN_WARNING "ftdev(%s): "
5905 +				"will attempt deletion of allocated devices.\n",
5906 +				ftdev->name);
5907 +			goto err_minors;
5908 +		}
5909 +	}
5910 +
5911 +	return 0;
5912 +
5913 +err_minors:
5914 +	ftdev_device_destroy(ftdev, minor_cntr);
5915 +	cdev_del(&ftdev->cdev);
5916 +err_unregister:
5917 +	unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
5918 +err_out:
5919 +	return err;
5920 +}
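For illustration, a hedged kernel-side sketch of how a tracing backend might tie the pieces above together: provide alloc/free callbacks that manage a per-minor ft_buffer, then hand the device to ftdev_init() and register_ftdev(). Only the ftdev/ft_buffer API shown in this file is used; the callback prototypes are inferred from their call sites above, and all my_* names and the slot geometry are hypothetical.

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/module.h>

#include <litmus/ftdev.h>

#define MY_SLOT_COUNT 4096	/* placeholder buffer geometry */
#define MY_SLOT_SIZE  16

static struct ftdev my_ftdev;

static int my_alloc(struct ftdev *dev, unsigned int idx)
{
	dev->minor[idx].buf = alloc_ft_buffer(MY_SLOT_COUNT, MY_SLOT_SIZE);
	return dev->minor[idx].buf ? 0 : -ENOMEM;
}

static void my_free(struct ftdev *dev, unsigned int idx)
{
	free_ft_buffer(dev->minor[idx].buf);
	dev->minor[idx].buf = NULL;
}

static int __init my_trace_init(void)
{
	int err = ftdev_init(&my_ftdev, THIS_MODULE, 1, "my_trace");

	if (err)
		return err;
	my_ftdev.alloc = my_alloc;
	my_ftdev.free  = my_free;
	return register_ftdev(&my_ftdev);
}
module_init(my_trace_init);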
5921 diff --git a/litmus/jobs.c b/litmus/jobs.c
5922 new file mode 100644
5923 index 0000000..36e3146
5924 --- /dev/null
5925 +++ b/litmus/jobs.c
5926 @@ -0,0 +1,43 @@
5927 +/* litmus/jobs.c - common job control code
5928 + */
5929 +
5930 +#include <linux/sched.h>
5931 +
5932 +#include <litmus/litmus.h>
5933 +#include <litmus/jobs.h>
5934 +
5935 +void prepare_for_next_period(struct task_struct *t)
5936 +{
5937 +	BUG_ON(!t);
5938 +	/* prepare next release */
5939 +	t->rt_param.job_params.release   = t->rt_param.job_params.deadline;
5940 +	t->rt_param.job_params.deadline += get_rt_period(t);
5941 +	t->rt_param.job_params.exec_time = 0;
5942 +	/* update job sequence number */
5943 +	t->rt_param.job_params.job_no++;
5944 +
5945 +	/* don't confuse Linux */
5946 +	t->rt.time_slice = 1;
5947 +}
5948 +
5949 +void release_at(struct task_struct *t, lt_t start)
5950 +{
5951 +	t->rt_param.job_params.deadline = start;
5952 +	prepare_for_next_period(t);
5953 +	set_rt_flags(t, RT_F_RUNNING);
5954 +}
5955 +
5956 +
5957 +/*
5958 + *	Deactivate current task until the beginning of the next period.
5959 + */
5960 +long complete_job(void)
5961 +{
5962 +	/* Mark that we do not execute anymore */
5963 +	set_rt_flags(current, RT_F_SLEEP);
5964 +	/* call schedule(); this will return when a new job arrives.
5965 +	 * It also takes care of preparing for the next release.
5966 +	 */
5967 +	schedule();
5968 +	return 0;
5969 +}
5970 diff --git a/litmus/litmus.c b/litmus/litmus.c
5971 new file mode 100644
5972 index 0000000..3013901
5973 --- /dev/null
5974 +++ b/litmus/litmus.c
5975 @@ -0,0 +1,564 @@
5976 +/*
5977 + * litmus.c -- Implementation of the LITMUS syscalls,
5978 + *             the LITMUS initialization code,
5979 + *             and the procfs interface.
5980 + */
5981 +#include <asm/uaccess.h>
5982 +#include <linux/uaccess.h>
5983 +#include <linux/sysrq.h>
5984 +#include <linux/sched.h>
5985 +#include <linux/module.h>
5986 +#include <linux/slab.h>
5987 +
5988 +#include <litmus/litmus.h>
5989 +#include <litmus/bheap.h>
5990 +#include <litmus/trace.h>
5991 +#include <litmus/rt_domain.h>
5992 +#include <litmus/litmus_proc.h>
5993 +#include <litmus/sched_trace.h>
5994 +
5995 +#ifdef CONFIG_SCHED_CPU_AFFINITY
5996 +#include <litmus/affinity.h>
5997 +#endif
5998 +
5999 +/* Number of RT tasks that exist in the system */
6000 +atomic_t rt_task_count 		= ATOMIC_INIT(0);
6001 +static DEFINE_RAW_SPINLOCK(task_transition_lock);
6002 +/* synchronize plugin switching */
6003 +atomic_t cannot_use_plugin	= ATOMIC_INIT(0);
6004 +
6005 +/* Give log messages sequential IDs. */
6006 +atomic_t __log_seq_no = ATOMIC_INIT(0);
6007 +
6008 +#ifdef CONFIG_RELEASE_MASTER
6009 +/* current master CPU for handling timer IRQs */
6010 +atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
6011 +#endif
6012 +
6013 +static struct kmem_cache * bheap_node_cache;
6014 +extern struct kmem_cache * release_heap_cache;
6015 +
6016 +struct bheap_node* bheap_node_alloc(int gfp_flags)
6017 +{
6018 +	return kmem_cache_alloc(bheap_node_cache, gfp_flags);
6019 +}
6020 +
6021 +void bheap_node_free(struct bheap_node* hn)
6022 +{
6023 +	kmem_cache_free(bheap_node_cache, hn);
6024 +}
6025 +
6026 +struct release_heap* release_heap_alloc(int gfp_flags);
6027 +void release_heap_free(struct release_heap* rh);
6028 +
6029 +/*
6030 + * sys_set_rt_task_param
6031 + * @pid: PID of the task whose scheduling parameters are to be changed
6032 + * @param: New real-time extension parameters such as the execution cost and
6033 + *         period
6034 + * Syscall for manipulating a task's RT extension params
6035 + * Returns EFAULT  if param is NULL.
6036 + *         ESRCH   if pid does not correspond
6037 + *	           to a valid task.
6038 + *	   EINVAL  if either period or execution cost is <= 0
6039 + *	   EBUSY   if the task is already a real-time task
6040 + *	   0       on success
6041 + *
6042 + * Only non-real-time tasks may be configured with this system call
6043 + * to avoid races with the scheduler. In practice, this means that a
6044 + * task's parameters must be set _before_ calling sys_prepare_rt_task()
6045 + *
6046 + * find_task_by_vpid() assumes that we are in the same namespace of the
6047 + * target.
6048 + */
6049 +asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
6050 +{
6051 +	struct rt_task tp;
6052 +	struct task_struct *target;
6053 +	int retval = -EINVAL;
6054 +
6055 +	printk("Setting up rt task parameters for process %d.\n", pid);
6056 +
6057 +	if (pid < 0 || param == 0) {
6058 +		goto out;
6059 +	}
6060 +	if (copy_from_user(&tp, param, sizeof(tp))) {
6061 +		retval = -EFAULT;
6062 +		goto out;
6063 +	}
6064 +
6065 +	/* Task search and manipulation must be protected */
6066 +	read_lock_irq(&tasklist_lock);
6067 +	if (!(target = find_task_by_vpid(pid))) {
6068 +		retval = -ESRCH;
6069 +		goto out_unlock;
6070 +	}
6071 +
6072 +	if (is_realtime(target)) {
6073 +		/* The task is already a real-time task.
6074 +		 * We cannot allow parameter changes at this point.
6075 +		 */
6076 +		retval = -EBUSY;
6077 +		goto out_unlock;
6078 +	}
6079 +
6080 +	if (tp.exec_cost <= 0)
6081 +		goto out_unlock;
6082 +	if (tp.period <= 0)
6083 +		goto out_unlock;
6084 +	if (!cpu_online(tp.cpu))
6085 +		goto out_unlock;
6086 +	if (tp.period < tp.exec_cost)
6087 +	{
6088 +		printk(KERN_INFO "litmus: real-time task %d rejected "
6089 +		       "because wcet > period\n", pid);
6090 +		goto out_unlock;
6091 +	}
6092 +	if (	tp.cls != RT_CLASS_HARD &&
6093 +		tp.cls != RT_CLASS_SOFT &&
6094 +		tp.cls != RT_CLASS_BEST_EFFORT)
6095 +	{
6096 +		printk(KERN_INFO "litmus: real-time task %d rejected "
6097 +				 "because its class is invalid\n", pid);
6098 +		goto out_unlock;
6099 +	}
6100 +	if (tp.budget_policy != NO_ENFORCEMENT &&
6101 +	    tp.budget_policy != QUANTUM_ENFORCEMENT &&
6102 +	    tp.budget_policy != PRECISE_ENFORCEMENT)
6103 +	{
6104 +		printk(KERN_INFO "litmus: real-time task %d rejected "
6105 +		       "because unsupported budget enforcement policy "
6106 +		       "specified (%d)\n",
6107 +		       pid, tp.budget_policy);
6108 +		goto out_unlock;
6109 +	}
6110 +
6111 +	target->rt_param.task_params = tp;
6112 +
6113 +	retval = 0;
6114 +      out_unlock:
6115 +	read_unlock_irq(&tasklist_lock);
6116 +      out:
6117 +	return retval;
6118 +}
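For illustration, a hedged user-space sketch of the caller's side: fill in struct rt_task so that it passes the checks above and issue the syscall while the target is still a best-effort task. The field names and the RT_CLASS_SOFT/PRECISE_ENFORCEMENT constants follow the validation code above (litmus/rt_param.h); the __NR_set_rt_task_param number, the user-space visibility of that header, and the nanosecond time base are assumptions.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <litmus/rt_param.h>	/* assumed to be visible to user space */

static int set_params(pid_t pid)
{
	struct rt_task tp;

	memset(&tp, 0, sizeof(tp));
	tp.exec_cost     = 10000000ULL;		/* 10 ms WCET (assuming ns)  */
	tp.period        = 100000000ULL;	/* 100 ms period             */
	tp.cpu           = 0;			/* must be an online CPU     */
	tp.cls           = RT_CLASS_SOFT;
	tp.budget_policy = PRECISE_ENFORCEMENT;

	/* Must be issued before the task becomes a real-time task. */
	return syscall(__NR_set_rt_task_param, pid, &tp);
}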
6119 +
6120 +/*
6121 + * Getter of task's RT params
6122 + *   returns EINVAL if param is NULL or pid is invalid
6123 + *   returns ESRCH  if pid does not correspond to a valid task
6124 + *   returns EFAULT if copying of parameters has failed.
6125 + *
6126 + *   find_task_by_vpid() assumes that we are in the same namespace of the
6127 + *   target.
6128 + */
6129 +asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
6130 +{
6131 +	int retval = -EINVAL;
6132 +	struct task_struct *source;
6133 +	struct rt_task lp;
6134 +	if (param == 0 || pid < 0)
6135 +		goto out;
6136 +	read_lock(&tasklist_lock);
6137 +	if (!(source = find_task_by_vpid(pid))) {
6138 +		retval = -ESRCH;
6139 +		goto out_unlock;
6140 +	}
6141 +	lp = source->rt_param.task_params;
6142 +	read_unlock(&tasklist_lock);
6143 +	/* Do copying outside the lock */
6144 +	retval =
6145 +	    copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
6146 +	return retval;
6147 +      out_unlock:
6148 +	read_unlock(&tasklist_lock);
6149 +      out:
6150 +	return retval;
6151 +
6152 +}
6153 +
6154 +/*
6155 + *	This is the crucial function for the periodic task implementation.
6156 + *	It checks whether the task is periodic and whether such a sleep
6157 + *	is permitted, and then calls the plugin-specific sleep, which puts
6158 + *	the task into a wait array.
6159 + *	returns 0 on successful wakeup
6160 + *	returns EPERM if current conditions do not permit such sleep
6161 + *	returns EINVAL if current task is not able to go to sleep
6162 + */
6163 +asmlinkage long sys_complete_job(void)
6164 +{
6165 +	int retval = -EPERM;
6166 +	if (!is_realtime(current)) {
6167 +		retval = -EINVAL;
6168 +		goto out;
6169 +	}
6170 +	/* Task with negative or zero period cannot sleep */
6171 +	if (get_rt_period(current) <= 0) {
6172 +		retval = -EINVAL;
6173 +		goto out;
6174 +	}
6175 +	/* The plugin has to put the task into an
6176 +	 * appropriate queue and call schedule
6177 +	 */
6178 +	retval = litmus->complete_job();
6179 +      out:
6180 +	return retval;
6181 +}
6182 +
6183 +/*	This is an "improved" version of sys_complete_job that
6184 + *      addresses the problem of unintentionally missing a job after
6185 + *      an overrun.
6186 + *
6187 + *	returns 0 on successful wakeup
6188 + *	returns EPERM if current conditions do not permit such sleep
6189 + *	returns EINVAL if current task is not able to go to sleep
6190 + */
6191 +asmlinkage long sys_wait_for_job_release(unsigned int job)
6192 +{
6193 +	int retval = -EPERM;
6194 +	if (!is_realtime(current)) {
6195 +		retval = -EINVAL;
6196 +		goto out;
6197 +	}
6198 +
6199 +	/* Task with negative or zero period cannot sleep */
6200 +	if (get_rt_period(current) <= 0) {
6201 +		retval = -EINVAL;
6202 +		goto out;
6203 +	}
6204 +
6205 +	retval = 0;
6206 +
6207 +	/* first wait until we have "reached" the desired job
6208 +	 *
6209 +	 * This implementation has at least two problems:
6210 +	 *
6211 +	 * 1) It doesn't gracefully handle the wrap around of
6212 +	 *    job_no. Since LITMUS is a prototype, this is not much
6213 +	 *    of a problem right now.
6214 +	 *
6215 +	 * 2) It is theoretically racy if a job release occurs
6216 +	 *    between checking job_no and calling sleep_next_period().
6217 +	 *    A proper solution would require adding another callback
6218 +	 *    in the plugin structure and testing the condition with
6219 +	 *    interrupts disabled.
6220 +	 *
6221 +	 * FIXME: At least problem 2 should be taken care of eventually.
6222 +	 */
6223 +	while (!retval && job > current->rt_param.job_params.job_no)
6224 +		/* If the last job overran then job <= job_no and we
6225 +		 * don't send the task to sleep.
6226 +		 */
6227 +		retval = litmus->complete_job();
6228 +      out:
6229 +	return retval;
6230 +}
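For illustration, how the two completion syscalls above are typically combined into a periodic main loop: sys_complete_job() suspends until the next release, while sys_wait_for_job_release() lets a task that overran catch up to a specific job number without sleeping. The __NR_* constants are assumed to come from this patch's litmus/unistd_*.h headers; do_one_job() is a placeholder.

#include <unistd.h>
#include <sys/syscall.h>

static void do_one_job(void)
{
	/* placeholder: the application's per-job work */
}

static void periodic_loop(unsigned int jobs)
{
	unsigned int i;

	for (i = 0; i < jobs; i++) {
		do_one_job();
		/* Suspend until the next job is released. */
		syscall(__NR_complete_job);
	}
}

static void overrun_aware_loop(unsigned int first_job, unsigned int jobs)
{
	unsigned int next = first_job;

	while (jobs--) {
		/* Returns immediately if job `next' was already released,
		 * i.e., if the previous job overran its period. */
		syscall(__NR_wait_for_job_release, next++);
		do_one_job();
	}
}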
6231 +
6232 +/*	This is a helper syscall to query the current job sequence number.
6233 + *
6234 + *	returns 0 on successful query
6235 + *	returns EPERM if task is not a real-time task.
6236 + *      returns EFAULT if &job is not a valid pointer.
6237 + */
6238 +asmlinkage long sys_query_job_no(unsigned int __user *job)
6239 +{
6240 +	int retval = -EPERM;
6241 +	if (is_realtime(current))
6242 +		retval = put_user(current->rt_param.job_params.job_no, job);
6243 +
6244 +	return retval;
6245 +}
6246 +
6247 +/* sys_null_call() is only used for determining raw system call
6248 + * overheads (kernel entry, kernel exit). It has no useful side effects.
6249 + * If ts is non-NULL, then the current Feather-Trace time is recorded.
6250 + */
6251 +asmlinkage long sys_null_call(cycles_t __user *ts)
6252 +{
6253 +	long ret = 0;
6254 +	cycles_t now;
6255 +
6256 +	if (ts) {
6257 +		now = get_cycles();
6258 +		ret = put_user(now, ts);
6259 +	}
6260 +
6261 +	return ret;
6262 +}
6263 +
6264 +/* p is a real-time task. Re-init its state as a best-effort task. */
6265 +static void reinit_litmus_state(struct task_struct* p, int restore)
6266 +{
6267 +	struct rt_task  user_config = {};
6268 +	void*  ctrl_page     = NULL;
6269 +
6270 +	if (restore) {
6271 +		/* Save the user-space provided configuration data
6272 +		 * and the allocated control page. */
6273 +		user_config = p->rt_param.task_params;
6274 +		ctrl_page   = p->rt_param.ctrl_page;
6275 +	}
6276 +
6277 +	/* We probably should not be inheriting any task's priority
6278 +	 * at this point in time.
6279 +	 */
6280 +	WARN_ON(p->rt_param.inh_task);
6281 +
6282 +	/* Cleanup everything else. */
6283 +	memset(&p->rt_param, 0, sizeof(p->rt_param));
6284 +
6285 +	/* Restore preserved fields. */
6286 +	if (restore) {
6287 +		p->rt_param.task_params = user_config;
6288 +		p->rt_param.ctrl_page   = ctrl_page;
6289 +	}
6290 +}
6291 +
6292 +long litmus_admit_task(struct task_struct* tsk)
6293 +{
6294 +	long retval = 0;
6295 +	unsigned long flags;
6296 +
6297 +	BUG_ON(is_realtime(tsk));
6298 +
6299 +	if (get_rt_period(tsk) == 0 ||
6300 +	    get_exec_cost(tsk) > get_rt_period(tsk)) {
6301 +		TRACE_TASK(tsk, "litmus admit: invalid task parameters "
6302 +			   "(%lu, %lu)\n",
6303 +		           get_exec_cost(tsk), get_rt_period(tsk));
6304 +		retval = -EINVAL;
6305 +		goto out;
6306 +	}
6307 +
6308 +	if (!cpu_online(get_partition(tsk))) {
6309 +		TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
6310 +			   get_partition(tsk));
6311 +		retval = -EINVAL;
6312 +		goto out;
6313 +	}
6314 +
6315 +	INIT_LIST_HEAD(&tsk_rt(tsk)->list);
6316 +
6317 +	/* avoid scheduler plugin changing underneath us */
6318 +	raw_spin_lock_irqsave(&task_transition_lock, flags);
6319 +
6320 +	/* allocate heap node for this task */
6321 +	tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
6322 +	tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
6323 +
6324 +	if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
6325 +		printk(KERN_WARNING "litmus: no more heap node memory!?\n");
6326 +
6327 +		bheap_node_free(tsk_rt(tsk)->heap_node);
6328 +		release_heap_free(tsk_rt(tsk)->rel_heap);
6329 +
6330 +		retval = -ENOMEM;
6331 +		goto out_unlock;
6332 +	} else {
6333 +		bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
6334 +	}
6335 +
6336 +	retval = litmus->admit_task(tsk);
6337 +
6338 +	if (!retval) {
6339 +		sched_trace_task_name(tsk);
6340 +		sched_trace_task_param(tsk);
6341 +		atomic_inc(&rt_task_count);
6342 +	}
6343 +
6344 +out_unlock:
6345 +	raw_spin_unlock_irqrestore(&task_transition_lock, flags);
6346 +out:
6347 +	return retval;
6348 +}
6349 +
6350 +void litmus_exit_task(struct task_struct* tsk)
6351 +{
6352 +	if (is_realtime(tsk)) {
6353 +		sched_trace_task_completion(tsk, 1);
6354 +
6355 +		litmus->task_exit(tsk);
6356 +
6357 +		BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
6358 +	        bheap_node_free(tsk_rt(tsk)->heap_node);
6359 +		release_heap_free(tsk_rt(tsk)->rel_heap);
6360 +
6361 +		atomic_dec(&rt_task_count);
6362 +		reinit_litmus_state(tsk, 1);
6363 +	}
6364 +}
6365 +
6366 +/* IPI callback to synchronize plugin switching */
6367 +static void synch_on_plugin_switch(void* info)
6368 +{
6369 +	atomic_inc(&cannot_use_plugin);
6370 +	while (atomic_read(&cannot_use_plugin) > 0)
6371 +		cpu_relax();
6372 +}
6373 +
6374 +/* Switching a plugin in use is tricky.
6375 + * We must make sure that no real-time tasks exist
6376 + * (and that none is created in parallel) and that the plugin is not
6377 + * currently in use on any processor (in theory).
6378 + */
6379 +int switch_sched_plugin(struct sched_plugin* plugin)
6380 +{
6381 +	unsigned long flags;
6382 +	int ret = 0;
6383 +
6384 +	BUG_ON(!plugin);
6385 +
6386 +	/* forbid other cpus to use the plugin */
6387 +	atomic_set(&cannot_use_plugin, 1);
6388 +	/* send IPI to force other CPUs to synch with us */
6389 +	smp_call_function(synch_on_plugin_switch, NULL, 0);
6390 +
6391 +	/* wait until all other CPUs have started synch */
6392 +	while (atomic_read(&cannot_use_plugin) < num_online_cpus())
6393 +		cpu_relax();
6394 +
6395 +	/* stop task transitions */
6396 +	raw_spin_lock_irqsave(&task_transition_lock, flags);
6397 +
6398 +	/* don't switch if there are active real-time tasks */
6399 +	if (atomic_read(&rt_task_count) == 0) {
6400 +		ret = litmus->deactivate_plugin();
6401 +		if (0 != ret)
6402 +			goto out;
6403 +		ret = plugin->activate_plugin();
6404 +		if (0 != ret) {
6405 +			printk(KERN_INFO "Can't activate %s (%d).\n",
6406 +			       plugin->plugin_name, ret);
6407 +			plugin = &linux_sched_plugin;
6408 +		}
6409 +		printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
6410 +		litmus = plugin;
6411 +	} else
6412 +		ret = -EBUSY;
6413 +out:
6414 +	raw_spin_unlock_irqrestore(&task_transition_lock, flags);
6415 +	atomic_set(&cannot_use_plugin, 0);
6416 +	return ret;
6417 +}
6418 +
6419 +/* Called upon fork.
6420 + * p is the newly forked task.
6421 + */
6422 +void litmus_fork(struct task_struct* p)
6423 +{
6424 +	if (is_realtime(p)) {
6425 +		/* clean out any litmus related state, don't preserve anything */
6426 +		reinit_litmus_state(p, 0);
6427 +		/* Don't let the child be a real-time task.  */
6428 +		p->sched_reset_on_fork = 1;
6429 +	} else
6430 +		/* non-rt tasks might have ctrl_page set */
6431 +		tsk_rt(p)->ctrl_page = NULL;
6432 +
6433 +	/* od tables are never inherited across a fork */
6434 +	p->od_table = NULL;
6435 +}
6436 +
6437 +/* Called upon execve().
6438 + * current is doing the exec.
6439 + * Don't let address space specific stuff leak.
6440 + */
6441 +void litmus_exec(void)
6442 +{
6443 +	struct task_struct* p = current;
6444 +
6445 +	if (is_realtime(p)) {
6446 +		WARN_ON(p->rt_param.inh_task);
6447 +		if (tsk_rt(p)->ctrl_page) {
6448 +			free_page((unsigned long) tsk_rt(p)->ctrl_page);
6449 +			tsk_rt(p)->ctrl_page = NULL;
6450 +		}
6451 +	}
6452 +}
6453 +
6454 +void exit_litmus(struct task_struct *dead_tsk)
6455 +{
6456 +	/* We also allow non-RT tasks to
6457 +	 * allocate control pages, e.g., to take
6458 +	 * overhead measurements with non-RT tasks.
6459 +	 * So check whether we need to free the page
6460 +	 * in any case.
6461 +	 */
6462 +	if (tsk_rt(dead_tsk)->ctrl_page) {
6463 +		TRACE_TASK(dead_tsk,
6464 +			   "freeing ctrl_page %p\n",
6465 +			   tsk_rt(dead_tsk)->ctrl_page);
6466 +		free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
6467 +	}
6468 +
6469 +	/* main cleanup only for RT tasks */
6470 +	if (is_realtime(dead_tsk))
6471 +		litmus_exit_task(dead_tsk);
6472 +}
6473 +
6474 +
6475 +#ifdef CONFIG_MAGIC_SYSRQ
6476 +int sys_kill(int pid, int sig);
6477 +
6478 +static void sysrq_handle_kill_rt_tasks(int key)
6479 +{
6480 +	struct task_struct *t;
6481 +	read_lock(&tasklist_lock);
6482 +	for_each_process(t) {
6483 +		if (is_realtime(t)) {
6484 +			sys_kill(t->pid, SIGKILL);
6485 +		}
6486 +	}
6487 +	read_unlock(&tasklist_lock);
6488 +}
6489 +
6490 +static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
6491 +	.handler	= sysrq_handle_kill_rt_tasks,
6492 +	.help_msg	= "quit-rt-tasks(X)",
6493 +	.action_msg	= "sent SIGKILL to all LITMUS^RT real-time tasks",
6494 +};
6495 +#endif
6496 +
6497 +extern struct sched_plugin linux_sched_plugin;
6498 +
6499 +static int __init _init_litmus(void)
6500 +{
6501 +	/*      Common initializers,
6502 +	 *      mode change lock is used to enforce single mode change
6503 +	 *      operation.
6504 +	 */
6505 +	printk("Starting LITMUS^RT kernel\n");
6506 +
6507 +	BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t));
6508 +
6509 +	register_sched_plugin(&linux_sched_plugin);
6510 +
6511 +	bheap_node_cache    = KMEM_CACHE(bheap_node, SLAB_PANIC);
6512 +	release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
6513 +
6514 +#ifdef CONFIG_MAGIC_SYSRQ
6515 +	/* offer some debugging help */
6516 +	if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
6517 +		printk("Registered kill rt tasks magic sysrq.\n");
6518 +	else
6519 +		printk("Could not register kill rt tasks magic sysrq.\n");
6520 +#endif
6521 +
6522 +	init_litmus_proc();
6523 +
6524 +#ifdef CONFIG_SCHED_CPU_AFFINITY
6525 +	init_topology();
6526 +#endif
6527 +
6528 +	return 0;
6529 +}
6530 +
6531 +static void _exit_litmus(void)
6532 +{
6533 +	exit_litmus_proc();
6534 +	kmem_cache_destroy(bheap_node_cache);
6535 +	kmem_cache_destroy(release_heap_cache);
6536 +}
6537 +
6538 +module_init(_init_litmus);
6539 +module_exit(_exit_litmus);
6540 diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
6541 new file mode 100644
6542 index 0000000..4bf725a
6543 --- /dev/null
6544 +++ b/litmus/litmus_proc.c
6545 @@ -0,0 +1,347 @@
6546 +/*
6547 + * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
6548 + */
6549 +
6550 +#include <linux/sched.h>
6551 +#include <linux/uaccess.h>
6552 +
6553 +#include <litmus/litmus.h>
6554 +#include <litmus/litmus_proc.h>
6555 +
6556 +#include <litmus/clustered.h>
6557 +
6558 +/* in litmus/litmus.c */
6559 +extern atomic_t rt_task_count;
6560 +
6561 +static struct proc_dir_entry *litmus_dir = NULL,
6562 +	*curr_file = NULL,
6563 +	*stat_file = NULL,
6564 +	*plugs_dir = NULL,
6565 +#ifdef CONFIG_RELEASE_MASTER
6566 +	*release_master_file = NULL,
6567 +#endif
6568 +	*plugs_file = NULL;
6569 +
6570 +/* in litmus/sync.c */
6571 +int count_tasks_waiting_for_release(void);
6572 +
6573 +static int proc_read_stats(char *page, char **start,
6574 +			   off_t off, int count,
6575 +			   int *eof, void *data)
6576 +{
6577 +	int len;
6578 +
6579 +	len = snprintf(page, PAGE_SIZE,
6580 +		       "real-time tasks   = %d\n"
6581 +		       "ready for release = %d\n",
6582 +		       atomic_read(&rt_task_count),
6583 +		       count_tasks_waiting_for_release());
6584 +	return len;
6585 +}
6586 +
6587 +static int proc_read_plugins(char *page, char **start,
6588 +			   off_t off, int count,
6589 +			   int *eof, void *data)
6590 +{
6591 +	int len;
6592 +
6593 +	len = print_sched_plugins(page, PAGE_SIZE);
6594 +	return len;
6595 +}
6596 +
6597 +static int proc_read_curr(char *page, char **start,
6598 +			  off_t off, int count,
6599 +			  int *eof, void *data)
6600 +{
6601 +	int len;
6602 +
6603 +	len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
6604 +	return len;
6605 +}
6606 +
6607 +/* in litmus/litmus.c */
6608 +int switch_sched_plugin(struct sched_plugin*);
6609 +
6610 +static int proc_write_curr(struct file *file,
6611 +			   const char *buffer,
6612 +			   unsigned long count,
6613 +			   void *data)
6614 +{
6615 +	int len, ret;
6616 +	char name[65];
6617 +	struct sched_plugin* found;
6618 +
6619 +	len = copy_and_chomp(name, sizeof(name), buffer, count);
6620 +	if (len < 0)
6621 +		return len;
6622 +
6623 +	found = find_sched_plugin(name);
6624 +
6625 +	if (found) {
6626 +		ret = switch_sched_plugin(found);
6627 +		if (ret != 0)
6628 +			printk(KERN_INFO "Could not switch plugin: %d\n", ret);
6629 +	} else
6630 +		printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
6631 +
6632 +	return len;
6633 +}
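For illustration, the user-facing side of this handler: switching the active scheduler amounts to writing a plugin name into /proc/litmus/active_plugin (created in init_litmus_proc() below), and /proc/litmus/plugins/loaded lists the names that will be accepted. A minimal sketch; "GSN-EDF" is only an example name, and the switch is refused while real-time tasks exist (see switch_sched_plugin() above).

#include <stdio.h>

/* Request a plugin switch by name; succeeds only if no real-time tasks
 * are currently admitted. */
static int set_active_plugin(const char *name)
{
	FILE *f = fopen("/proc/litmus/active_plugin", "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", name);
	return fclose(f);
}

int main(void)
{
	return set_active_plugin("GSN-EDF");	/* example plugin name */
}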
6634 +
6635 +#ifdef CONFIG_RELEASE_MASTER
6636 +static int proc_read_release_master(char *page, char **start,
6637 +				    off_t off, int count,
6638 +				    int *eof, void *data)
6639 +{
6640 +	int len, master;
6641 +	master = atomic_read(&release_master_cpu);
6642 +	if (master == NO_CPU)
6643 +		len = snprintf(page, PAGE_SIZE, "NO_CPU\n");
6644 +	else
6645 +		len = snprintf(page, PAGE_SIZE, "%d\n", master);
6646 +	return len;
6647 +}
6648 +
6649 +static int proc_write_release_master(struct file *file,
6650 +				     const char *buffer,
6651 +				     unsigned long count,
6652 +				     void *data)
6653 +{
6654 +	int cpu, err, len, online = 0;
6655 +	char msg[64];
6656 +
6657 +	len = copy_and_chomp(msg, sizeof(msg), buffer, count);
6658 +
6659 +	if (len < 0)
6660 +		return len;
6661 +
6662 +	if (strcmp(msg, "NO_CPU") == 0)
6663 +		atomic_set(&release_master_cpu, NO_CPU);
6664 +	else {
6665 +		err = sscanf(msg, "%d", &cpu);
6666 +		if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
6667 +			atomic_set(&release_master_cpu, cpu);
6668 +		} else {
6669 +			TRACE("invalid release master: '%s' "
6670 +			      "(err:%d cpu:%d online:%d)\n",
6671 +			      msg, err, cpu, online);
6672 +			len = -EINVAL;
6673 +		}
6674 +	}
6675 +	return len;
6676 +}
6677 +#endif
6678 +
6679 +int __init init_litmus_proc(void)
6680 +{
6681 +	litmus_dir = proc_mkdir("litmus", NULL);
6682 +	if (!litmus_dir) {
6683 +		printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
6684 +		return -ENOMEM;
6685 +	}
6686 +
6687 +	curr_file = create_proc_entry("active_plugin",
6688 +				      0644, litmus_dir);
6689 +	if (!curr_file) {
6690 +		printk(KERN_ERR "Could not allocate active_plugin "
6691 +		       "procfs entry.\n");
6692 +		return -ENOMEM;
6693 +	}
6694 +	curr_file->read_proc  = proc_read_curr;
6695 +	curr_file->write_proc = proc_write_curr;
6696 +
6697 +#ifdef CONFIG_RELEASE_MASTER
6698 +	release_master_file = create_proc_entry("release_master",
6699 +						0644, litmus_dir);
6700 +	if (!release_master_file) {
6701 +		printk(KERN_ERR "Could not allocate release_master "
6702 +		       "procfs entry.\n");
6703 +		return -ENOMEM;
6704 +	}
6705 +	release_master_file->read_proc = proc_read_release_master;
6706 +	release_master_file->write_proc  = proc_write_release_master;
6707 +#endif
6708 +
6709 +	stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
6710 +					   proc_read_stats, NULL);
6711 +
6712 +	plugs_dir = proc_mkdir("plugins", litmus_dir);
6713 +	if (!plugs_dir){
6714 +		printk(KERN_ERR "Could not allocate plugins directory "
6715 +				"procfs entry.\n");
6716 +		return -ENOMEM;
6717 +	}
6718 +
6719 +	plugs_file = create_proc_read_entry("loaded", 0444, plugs_dir,
6720 +					   proc_read_plugins, NULL);
6721 +
6722 +	return 0;
6723 +}
6724 +
6725 +void exit_litmus_proc(void)
6726 +{
6727 +	if (plugs_file)
6728 +		remove_proc_entry("loaded", plugs_dir);
6729 +	if (plugs_dir)
6730 +		remove_proc_entry("plugins", litmus_dir);
6731 +	if (stat_file)
6732 +		remove_proc_entry("stats", litmus_dir);
6733 +	if (curr_file)
6734 +		remove_proc_entry("active_plugin", litmus_dir);
6735 +#ifdef CONFIG_RELEASE_MASTER
6736 +	if (release_master_file)
6737 +		remove_proc_entry("release_master", litmus_dir);
6738 +#endif
6739 +	if (litmus_dir)
6740 +		remove_proc_entry("litmus", NULL);
6741 +}
6742 +
6743 +long make_plugin_proc_dir(struct sched_plugin* plugin,
6744 +		struct proc_dir_entry** pde_in)
6745 +{
6746 +	struct proc_dir_entry *pde_new = NULL;
6747 +	long rv;
6748 +
6749 +	if (!plugin || !plugin->plugin_name){
6750 +		printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
6751 +				__func__);
6752 +		rv = -EINVAL;
6753 +		goto out_no_pde;
6754 +	}
6755 +
6756 +	if (!plugs_dir){
6757 +		printk(KERN_ERR "Could not make plugin sub-directory, because "
6758 +				"/proc/litmus/plugins does not exist.\n");
6759 +		rv = -ENOENT;
6760 +		goto out_no_pde;
6761 +	}
6762 +
6763 +	pde_new = proc_mkdir(plugin->plugin_name, plugs_dir);
6764 +	if (!pde_new){
6765 +		printk(KERN_ERR "Could not make plugin sub-directory: "
6766 +				"out of memory?\n");
6767 +		rv = -ENOMEM;
6768 +		goto out_no_pde;
6769 +	}
6770 +
6771 +	rv = 0;
6772 +	*pde_in = pde_new;
6773 +	goto out_ok;
6774 +
6775 +out_no_pde:
6776 +	*pde_in = NULL;
6777 +out_ok:
6778 +	return rv;
6779 +}
6780 +
6781 +void remove_plugin_proc_dir(struct sched_plugin* plugin)
6782 +{
6783 +	if (!plugin || !plugin->plugin_name){
6784 +		printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
6785 +				__func__);
6786 +		return;
6787 +	}
6788 +	remove_proc_entry(plugin->plugin_name, plugs_dir);
6789 +}
6790 +
6791 +
6792 +
6793 +/* misc. I/O helper functions */
6794 +
6795 +int copy_and_chomp(char *kbuf, unsigned long ksize,
6796 +		   __user const char* ubuf, unsigned long ulength)
6797 +{
6798 +	/* caller must provide buffer space */
6799 +	BUG_ON(!ksize);
6800 +
6801 +	ksize--; /* leave space for null byte */
6802 +
6803 +	if (ksize > ulength)
6804 +		ksize = ulength;
6805 +
6806 +	if(copy_from_user(kbuf, ubuf, ksize))
6807 +		return -EFAULT;
6808 +
6809 +	kbuf[ksize] = '\0';
6810 +
6811 +	/* chomp kbuf */
6812 +	if (ksize > 0 && kbuf[ksize - 1] == '\n')
6813 +		kbuf[ksize - 1] = '\0';
6814 +
6815 +	return ksize;
6816 +}
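To make the helper's contract concrete: at most ksize-1 bytes are consumed, the kernel buffer is always NUL-terminated, and a single trailing newline is chomped, so the 8 bytes written by `echo GSN-EDF > /proc/litmus/active_plugin` arrive as the string "GSN-EDF" with a return value of 8. A standalone model with copy_from_user() replaced by memcpy() for illustration:

#include <stdio.h>
#include <string.h>

static int model_copy_and_chomp(char *kbuf, unsigned long ksize,
				const char *ubuf, unsigned long ulength)
{
	ksize--;				/* leave space for the NUL byte    */
	if (ksize > ulength)
		ksize = ulength;
	memcpy(kbuf, ubuf, ksize);		/* stands in for copy_from_user() */
	kbuf[ksize] = '\0';
	if (ksize > 0 && kbuf[ksize - 1] == '\n')
		kbuf[ksize - 1] = '\0';		/* chomp one trailing newline      */
	return ksize;
}

int main(void)
{
	char name[65];

	/* Mirrors a write of "GSN-EDF\n": prints: 8 'GSN-EDF' */
	printf("%d '%s'\n",
	       model_copy_and_chomp(name, sizeof(name), "GSN-EDF\n", 8), name);
	return 0;
}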
6817 +
6818 +/* helper functions for clustered plugins */
6819 +static const char* cache_level_names[] = {
6820 +	"ALL",
6821 +	"L1",
6822 +	"L2",
6823 +	"L3",
6824 +};
6825 +
6826 +int parse_cache_level(const char *cache_name, enum cache_level *level)
6827 +{
6828 +	int err = -EINVAL;
6829 +	int i;
6830 +	/* do a quick and dirty comparison to find the cluster size */
6831 +	for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++)
6832 +		if (!strcmp(cache_name, cache_level_names[i])) {
6833 +			*level = (enum cache_level) i;
6834 +			err = 0;
6835 +			break;
6836 +		}
6837 +	return err;
6838 +}
6839 +
6840 +const char* cache_level_name(enum cache_level level)
6841 +{
6842 +	int idx = level;
6843 +
6844 +	if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER)
6845 +		return cache_level_names[idx];
6846 +	else
6847 +		return "INVALID";
6848 +}
6849 +
6850 +
6851 +/* proc file interface to configure the cluster size */
6852 +static int proc_read_cluster_size(char *page, char **start,
6853 +				  off_t off, int count,
6854 +				  int *eof, void *data)
6855 +{
6856 +	return snprintf(page, PAGE_SIZE, "%s\n",
6857 +			cache_level_name(*((enum cache_level*) data)));
6858 +}
6859 +
6860 +static int proc_write_cluster_size(struct file *file,
6861 +				   const char *buffer,
6862 +				   unsigned long count,
6863 +				   void *data)
6864 +{
6865 +	int len;
6866 +	char cache_name[8];
6867 +
6868 +	len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
6869 +
6870 +	if (len > 0 && parse_cache_level(cache_name, (enum cache_level*) data))
6871 +		printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
6872 +
6873 +	return len;
6874 +}
6875 +
6876 +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
6877 +					   enum cache_level* level)
6878 +{
6879 +	struct proc_dir_entry* cluster_file;
6880 +
6881 +	cluster_file = create_proc_entry("cluster", 0644, parent);
6882 +	if (!cluster_file) {
6883 +		printk(KERN_ERR "Could not allocate %s/cluster "
6884 +		       "procfs entry.\n", parent->name);
6885 +	} else {
6886 +		cluster_file->read_proc = proc_read_cluster_size;
6887 +		cluster_file->write_proc = proc_write_cluster_size;
6888 +		cluster_file->data = level;
6889 +	}
6890 +	return cluster_file;
6891 +}
6892 +
6893 diff --git a/litmus/locking.c b/litmus/locking.c
6894 new file mode 100644
6895 index 0000000..0c1aa6a
6896 --- /dev/null
6897 +++ b/litmus/locking.c
6898 @@ -0,0 +1,139 @@
6899 +#include <litmus/fdso.h>
6900 +
6901 +#ifdef CONFIG_LITMUS_LOCKING
6902 +
6903 +#include <litmus/sched_plugin.h>
6904 +#include <litmus/trace.h>
6905 +
6906 +static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
6907 +static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
6908 +static int close_generic_lock(struct od_table_entry* entry);
6909 +static void destroy_generic_lock(obj_type_t type, void* sem);
6910 +
6911 +struct fdso_ops generic_lock_ops = {
6912 +	.create  = create_generic_lock,
6913 +	.open    = open_generic_lock,
6914 +	.close   = close_generic_lock,
6915 +	.destroy = destroy_generic_lock
6916 +};
6917 +
6918 +static inline bool is_lock(struct od_table_entry* entry)
6919 +{
6920 +	return entry->class == &generic_lock_ops;
6921 +}
6922 +
6923 +static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
6924 +{
6925 +	BUG_ON(!is_lock(entry));
6926 +	return (struct litmus_lock*) entry->obj->obj;
6927 +}
6928 +
6929 +static  int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
6930 +{
6931 +	struct litmus_lock* lock;
6932 +	int err;
6933 +
6934 +	err = litmus->allocate_lock(&lock, type, arg);
6935 +	if (err == 0)
6936 +		*obj_ref = lock;
6937 +	return err;
6938 +}
6939 +
6940 +static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
6941 +{
6942 +	struct litmus_lock* lock = get_lock(entry);
6943 +	if (lock->ops->open)
6944 +		return lock->ops->open(lock, arg);
6945 +	else
6946 +		return 0; /* default: any task can open it */
6947 +}
6948 +
6949 +static int close_generic_lock(struct od_table_entry* entry)
6950 +{
6951 +	struct litmus_lock* lock = get_lock(entry);
6952 +	if (lock->ops->close)
6953 +		return lock->ops->close(lock);
6954 +	else
6955 +		return 0; /* default: closing succeeds */
6956 +}
6957 +
6958 +static void destroy_generic_lock(obj_type_t type, void* obj)
6959 +{
6960 +	struct litmus_lock* lock = (struct litmus_lock*) obj;
6961 +	lock->ops->deallocate(lock);
6962 +}
6963 +
6964 +asmlinkage long sys_litmus_lock(int lock_od)
6965 +{
6966 +	long err = -EINVAL;
6967 +	struct od_table_entry* entry;
6968 +	struct litmus_lock* l;
6969 +
6970 +	TS_LOCK_START;
6971 +
6972 +	entry = get_entry_for_od(lock_od);
6973 +	if (entry && is_lock(entry)) {
6974 +		l = get_lock(entry);
6975 +		TRACE_CUR("attempts to lock 0x%p\n", l);
6976 +		err = l->ops->lock(l);
6977 +	}
6978 +
6979 +	/* Note: the task may have been suspended or preempted in between!  Take
6980 +	 * this into account when computing overheads. */
6981 +	TS_LOCK_END;
6982 +
6983 +	return err;
6984 +}
6985 +
6986 +asmlinkage long sys_litmus_unlock(int lock_od)
6987 +{
6988 +	long err = -EINVAL;
6989 +	struct od_table_entry* entry;
6990 +	struct litmus_lock* l;
6991 +
6992 +	TS_UNLOCK_START;
6993 +
6994 +	entry = get_entry_for_od(lock_od);
6995 +	if (entry && is_lock(entry)) {
6996 +		l = get_lock(entry);
6997 +		TRACE_CUR("attempts to unlock 0x%p\n", l);
6998 +		err = l->ops->unlock(l);
6999 +	}
7000 +
7001 +	/* Note: task may have been preempted in between!  Take this into
7002 +	 * account when computing overheads. */
7003 +	TS_UNLOCK_END;
7004 +
7005 +	return err;
7006 +}
7007 +
7008 +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
7009 +{
7010 +	wait_queue_t* q;
7011 +	struct task_struct* t = NULL;
7012 +
7013 +	if (waitqueue_active(wq)) {
7014 +		q = list_entry(wq->task_list.next,
7015 +			       wait_queue_t, task_list);
7016 +		t = (struct task_struct*) q->private;
7017 +		__remove_wait_queue(wq, q);
7018 +	}
7019 +	return(t);
7020 +}
7021 +
7022 +
7023 +#else
7024 +
7025 +struct fdso_ops generic_lock_ops = {};
7026 +
7027 +asmlinkage long sys_litmus_lock(int sem_od)
7028 +{
7029 +	return -ENOSYS;
7030 +}
7031 +
7032 +asmlinkage long sys_litmus_unlock(int sem_od)
7033 +{
7034 +	return -ENOSYS;
7035 +}
7036 +
7037 +#endif
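
The generic layer above only dispatches through lock->ops; the actual protocol semantics are supplied by each plugin. As an illustration (not part of the diff), a minimal non-blocking lock type, assuming the struct litmus_lock/struct litmus_lock_ops layout added by this patch in include/litmus/locking.h:

#include <linux/slab.h>
#include <litmus/locking.h>

struct dummy_lock {
	struct litmus_lock litmus_lock;	/* generic part, embedded first */
};

static int dummy_lock_acquire(struct litmus_lock* l)
{
	return 0;	/* never blocks, so sys_litmus_lock() returns 0 */
}

static int dummy_lock_release(struct litmus_lock* l)
{
	return 0;
}

static void dummy_lock_free(struct litmus_lock* l)
{
	kfree(container_of(l, struct dummy_lock, litmus_lock));
}

static struct litmus_lock_ops dummy_lock_ops = {
	/* .open/.close left NULL: the defaults in open/close_generic_lock() apply */
	.lock       = dummy_lock_acquire,
	.unlock     = dummy_lock_release,
	.deallocate = dummy_lock_free,
};

A plugin's allocate_lock() callback would kmalloc a struct dummy_lock, point its litmus_lock.ops at &dummy_lock_ops, and hand it back through create_generic_lock() above.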
7038 diff --git a/litmus/preempt.c b/litmus/preempt.c
7039 new file mode 100644
7040 index 0000000..5704d0b
7041 --- /dev/null
7042 +++ b/litmus/preempt.c
7043 @@ -0,0 +1,133 @@
7044 +#include <linux/sched.h>
7045 +
7046 +#include <litmus/litmus.h>
7047 +#include <litmus/preempt.h>
7048 +
7049 +/* The rescheduling state of each processor.
7050 + */
7051 +DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
7052 +
7053 +void sched_state_will_schedule(struct task_struct* tsk)
7054 +{
7055 +	/* Litmus hack: we only care about processor-local invocations of
7056 +	 * set_tsk_need_resched(). We can't reliably set the flag remotely
7057 +	 * since it might race with other updates to the scheduling state.  We
7058 +	 * can't rely on the runqueue lock protecting updates to the sched
7059 +	 * state since processors do not acquire the runqueue locks for all
7060 +	 * updates to the sched state (to avoid acquiring two runqueue locks at
7061 +	 * the same time). Further, if tsk is residing on a remote processor,
7062 +	 * then that processor doesn't actually know yet that it is going to
7063 +	 * reschedule; it still must receive an IPI (unless a local invocation
7064 +	 * races).
7065 +	 */
7066 +	if (likely(task_cpu(tsk) == smp_processor_id())) {
7067 +		VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
7068 +		if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
7069 +			set_sched_state(PICKED_WRONG_TASK);
7070 +		else
7071 +			set_sched_state(WILL_SCHEDULE);
7072 +	} else
7073 +		/* Litmus tasks should never be subject to a remote
7074 +		 * set_tsk_need_resched(). */
7075 +		BUG_ON(is_realtime(tsk));
7076 +#ifdef CONFIG_PREEMPT_STATE_TRACE
7077 +	TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
7078 +		   __builtin_return_address(0));
7079 +#endif
7080 +}
7081 +
7082 +/* Called by the IPI handler after another CPU called smp_send_reschedule(). */
7083 +void sched_state_ipi(void)
7084 +{
7085 +	/* If the IPI was slow, we might be in any state right now. The IPI is
7086 +	 * only meaningful if we are in SHOULD_SCHEDULE. */
7087 +	if (is_in_sched_state(SHOULD_SCHEDULE)) {
7088 +		/* Cause scheduler to be invoked.
7089 +		 * This will cause a transition to WILL_SCHEDULE. */
7090 +		set_tsk_need_resched(current);
7091 +		TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
7092 +			    current->comm, current->pid);
7093 +	} else {
7094 +		/* ignore */
7095 +		TRACE_STATE("ignoring IPI in state %x (%s)\n",
7096 +			    get_sched_state(),
7097 +			    sched_state_name(get_sched_state()));
7098 +	}
7099 +}
7100 +
7101 +/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
7102 + * hold the lock that is used to serialize scheduling decisions. */
7103 +void litmus_reschedule(int cpu)
7104 +{
7105 +	int picked_transition_ok = 0;
7106 +	int scheduled_transition_ok = 0;
7107 +
7108 +	/* The (remote) CPU could be in any state. */
7109 +
7110 +	/* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
7111 +	 * is not aware of the need to reschedule at this point. */
7112 +
7113 +	/* is a context switch in progress? */
7114 +	if (cpu_is_in_sched_state(cpu, TASK_PICKED))
7115 +		picked_transition_ok = sched_state_transition_on(
7116 +			cpu, TASK_PICKED, PICKED_WRONG_TASK);
7117 +
7118 +	if (!picked_transition_ok &&
7119 +	    cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
7120 +		/* We either raced with the end of the context switch, or the
7121 +		 * CPU was in TASK_SCHEDULED anyway. */
7122 +		scheduled_transition_ok = sched_state_transition_on(
7123 +			cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
7124 +	}
7125 +
7126 +	/* If the CPU was in state TASK_SCHEDULED, then we need to cause the
7127 +	 * scheduler to be invoked. */
7128 +	if (scheduled_transition_ok) {
7129 +		if (smp_processor_id() == cpu)
7130 +			set_tsk_need_resched(current);
7131 +		else
7132 +			smp_send_reschedule(cpu);
7133 +	}
7134 +
7135 +	TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
7136 +		    __FUNCTION__,
7137 +		    picked_transition_ok,
7138 +		    scheduled_transition_ok);
7139 +}
7140 +
7141 +void litmus_reschedule_local(void)
7142 +{
7143 +	if (is_in_sched_state(TASK_PICKED))
7144 +		set_sched_state(PICKED_WRONG_TASK);
7145 +	else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
7146 +		set_sched_state(WILL_SCHEDULE);
7147 +		set_tsk_need_resched(current);
7148 +	}
7149 +}
7150 +
7151 +#ifdef CONFIG_DEBUG_KERNEL
7152 +
7153 +void sched_state_plugin_check(void)
7154 +{
7155 +	if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
7156 +		TRACE("!!!! plugin did not call sched_state_task_picked()! "
7157 +		      "Calling sched_state_task_picked() is mandatory---fix this.\n");
7158 +		set_sched_state(TASK_PICKED);
7159 +	}
7160 +}
7161 +
7162 +#define NAME_CHECK(x) case x:  return #x
7163 +const char* sched_state_name(int s)
7164 +{
7165 +	switch (s) {
7166 +		NAME_CHECK(TASK_SCHEDULED);
7167 +		NAME_CHECK(SHOULD_SCHEDULE);
7168 +		NAME_CHECK(WILL_SCHEDULE);
7169 +		NAME_CHECK(TASK_PICKED);
7170 +		NAME_CHECK(PICKED_WRONG_TASK);
7171 +	default:
7172 +		return "UNKNOWN";
7173 +	};
7174 +}
7175 +
7176 +#endif
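
To summarize the contract this state machine imposes on plugins (cedf_schedule() later in this patch follows the same pattern), here is an illustrative sketch that is not part of the diff; the toy_* names are made up:

static struct task_struct* toy_schedule(struct task_struct* prev)
{
	struct task_struct* next = NULL;

	/* ... pick next while holding the plugin's scheduling lock ... */

	/* Mandatory: transition to TASK_PICKED; otherwise
	 * sched_state_plugin_check() above complains and fixes it up. */
	sched_state_task_picked();
	return next;
}

static void toy_trigger_resched(int cpu)
{
	/* Caller must hold the lock that serializes scheduling decisions.
	 * litmus_reschedule() marks a CPU mid-context-switch as
	 * PICKED_WRONG_TASK, or forces a scheduler invocation (IPI if
	 * remote) when the CPU is in TASK_SCHEDULED. */
	litmus_reschedule(cpu);
}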
7177 diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
7178 new file mode 100644
7179 index 0000000..d405854
7180 --- /dev/null
7181 +++ b/litmus/rt_domain.c
7182 @@ -0,0 +1,357 @@
7183 +/*
7184 + * litmus/rt_domain.c
7185 + *
7186 + * LITMUS real-time infrastructure. This file contains the
7187 + * functions that manipulate RT domains. RT domains are an abstraction
7188 + * of a ready queue and a release queue.
7189 + */
7190 +
7191 +#include <linux/percpu.h>
7192 +#include <linux/sched.h>
7193 +#include <linux/list.h>
7194 +#include <linux/slab.h>
7195 +
7196 +#include <litmus/litmus.h>
7197 +#include <litmus/sched_plugin.h>
7198 +#include <litmus/sched_trace.h>
7199 +
7200 +#include <litmus/rt_domain.h>
7201 +
7202 +#include <litmus/trace.h>
7203 +
7204 +#include <litmus/bheap.h>
7205 +
7206 +/* Uncomment when debugging timer races... */
7207 +#if 0
7208 +#define VTRACE_TASK TRACE_TASK
7209 +#define VTRACE TRACE
7210 +#else
7211 +#define VTRACE_TASK(t, fmt, args...) /* shut up */
7212 +#define VTRACE(fmt, args...) /* be quiet already */
7213 +#endif
7214 +
7215 +static int dummy_resched(rt_domain_t *rt)
7216 +{
7217 +	return 0;
7218 +}
7219 +
7220 +static int dummy_order(struct bheap_node* a, struct bheap_node* b)
7221 +{
7222 +	return 0;
7223 +}
7224 +
7225 +/* default implementation: use default lock */
7226 +static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
7227 +{
7228 +	merge_ready(rt, tasks);
7229 +}
7230 +
7231 +static unsigned int time2slot(lt_t time)
7232 +{
7233 +	return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
7234 +}
7235 +
7236 +static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
7237 +{
7238 +	unsigned long flags;
7239 +	struct release_heap* rh;
7240 +	rh = container_of(timer, struct release_heap, timer);
7241 +
7242 +	TS_RELEASE_LATENCY(rh->release_time);
7243 +
7244 +	VTRACE("on_release_timer(0x%p) starts.\n", timer);
7245 +
7246 +	TS_RELEASE_START;
7247 +
7248 +
7249 +	raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
7250 +	VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
7251 +	/* remove from release queue */
7252 +	list_del(&rh->list);
7253 +	raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
7254 +	VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
7255 +
7256 +	/* call release callback */
7257 +	rh->dom->release_jobs(rh->dom, &rh->heap);
7258 +	/* WARNING: rh can be referenced from other CPUs from now on. */
7259 +
7260 +	TS_RELEASE_END;
7261 +
7262 +	VTRACE("on_release_timer(0x%p) ends.\n", timer);
7263 +
7264 +	return  HRTIMER_NORESTART;
7265 +}
7266 +
7267 +/* allocated in litmus.c */
7268 +struct kmem_cache * release_heap_cache;
7269 +
7270 +struct release_heap* release_heap_alloc(int gfp_flags)
7271 +{
7272 +	struct release_heap* rh;
7273 +	rh= kmem_cache_alloc(release_heap_cache, gfp_flags);
7274 +	if (rh) {
7275 +		/* initialize timer */
7276 +		hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
7277 +		rh->timer.function = on_release_timer;
7278 +	}
7279 +	return rh;
7280 +}
7281 +
7282 +void release_heap_free(struct release_heap* rh)
7283 +{
7284 +	/* make sure timer is no longer in use */
7285 +	hrtimer_cancel(&rh->timer);
7286 +	kmem_cache_free(release_heap_cache, rh);
7287 +}
7288 +
7289 +/* Caller must hold release lock.
7290 + * Will return heap for given time. If no such heap exists prior to
7291 + * the invocation it will be created.
7292 + */
7293 +static struct release_heap* get_release_heap(rt_domain_t *rt,
7294 +					     struct task_struct* t,
7295 +					     int use_task_heap)
7296 +{
7297 +	struct list_head* pos;
7298 +	struct release_heap* heap = NULL;
7299 +	struct release_heap* rh;
7300 +	lt_t release_time = get_release(t);
7301 +	unsigned int slot = time2slot(release_time);
7302 +
7303 +	/* initialize pos for the case that the list is empty */
7304 +	pos = rt->release_queue.slot[slot].next;
7305 +	list_for_each(pos, &rt->release_queue.slot[slot]) {
7306 +		rh = list_entry(pos, struct release_heap, list);
7307 +		if (release_time == rh->release_time) {
7308 +			/* perfect match -- this happens on hyperperiod
7309 +			 * boundaries
7310 +			 */
7311 +			heap = rh;
7312 +			break;
7313 +		} else if (lt_before(release_time, rh->release_time)) {
7314 +			/* we need to insert a new node since rh is
7315 +			 * already in the future
7316 +			 */
7317 +			break;
7318 +		}
7319 +	}
7320 +	if (!heap && use_task_heap) {
7321 +		/* use pre-allocated release heap */
7322 +		rh = tsk_rt(t)->rel_heap;
7323 +
7324 +		rh->dom = rt;
7325 +		rh->release_time = release_time;
7326 +
7327 +		/* add to release queue */
7328 +		list_add(&rh->list, pos->prev);
7329 +		heap = rh;
7330 +	}
7331 +	return heap;
7332 +}
7333 +
7334 +static void reinit_release_heap(struct task_struct* t)
7335 +{
7336 +	struct release_heap* rh;
7337 +
7338 +	/* use pre-allocated release heap */
7339 +	rh = tsk_rt(t)->rel_heap;
7340 +
7341 +	/* Make sure it is safe to use.  The timer callback could still
7342 +	 * be executing on another CPU; hrtimer_cancel() will wait
7343 +	 * until the timer callback has completed.  However, under no
7344 +	 * circumstances should the timer be active (= yet to be
7345 +	 * triggered).
7346 +	 *
7347 +	 * WARNING: If the CPU still holds the release_lock at this point,
7348 +	 *          deadlock may occur!
7349 +	 */
7350 +	BUG_ON(hrtimer_cancel(&rh->timer));
7351 +
7352 +	/* initialize */
7353 +	bheap_init(&rh->heap);
7354 +#ifdef CONFIG_RELEASE_MASTER
7355 +	atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
7356 +#endif
7357 +}
7358 +/* arm_release_timer() - start local release timer or trigger
7359 + *     remote timer (pull timer)
7360 + *
7361 + * Called by add_release() with:
7362 + * - tobe_lock taken
7363 + * - IRQ disabled
7364 + */
7365 +#ifdef CONFIG_RELEASE_MASTER
7366 +#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
7367 +static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
7368 +#else
7369 +static void arm_release_timer(rt_domain_t *_rt)
7370 +#endif
7371 +{
7372 +	rt_domain_t *rt = _rt;
7373 +	struct list_head list;
7374 +	struct list_head *pos, *safe;
7375 +	struct task_struct* t;
7376 +	struct release_heap* rh;
7377 +
7378 +	VTRACE("arm_release_timer() at %llu\n", litmus_clock());
7379 +	list_replace_init(&rt->tobe_released, &list);
7380 +
7381 +	list_for_each_safe(pos, safe, &list) {
7382 +		/* pick task off the work list */
7383 +		t = list_entry(pos, struct task_struct, rt_param.list);
7384 +		sched_trace_task_release(t);
7385 +		list_del(pos);
7386 +
7387 +		/* put into release heap while holding release_lock */
7388 +		raw_spin_lock(&rt->release_lock);
7389 +		VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
7390 +
7391 +		rh = get_release_heap(rt, t, 0);
7392 +		if (!rh) {
7393 +			/* need to use our own, but drop lock first */
7394 +			raw_spin_unlock(&rt->release_lock);
7395 +			VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
7396 +				    &rt->release_lock);
7397 +
7398 +			reinit_release_heap(t);
7399 +			VTRACE_TASK(t, "release_heap ready\n");
7400 +
7401 +			raw_spin_lock(&rt->release_lock);
7402 +			VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
7403 +				    &rt->release_lock);
7404 +
7405 +			rh = get_release_heap(rt, t, 1);
7406 +		}
7407 +		bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
7408 +		VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
7409 +
7410 +		raw_spin_unlock(&rt->release_lock);
7411 +		VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
7412 +
7413 +		/* To avoid arming the timer multiple times, we only let the
7414 +		 * owner do the arming (which is the "first" task to reference
7415 +		 * this release_heap anyway).
7416 +		 */
7417 +		if (rh == tsk_rt(t)->rel_heap) {
7418 +			VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
7419 +			/* we cannot arm the timer using hrtimer_start()
7420 +			 * as it may deadlock on rq->lock
7421 +			 *
7422 +			 * PINNED mode is ok on both local and remote CPU
7423 +			 */
7424 +#ifdef CONFIG_RELEASE_MASTER
7425 +			if (rt->release_master == NO_CPU &&
7426 +			    target_cpu == NO_CPU)
7427 +#endif
7428 +				__hrtimer_start_range_ns(&rh->timer,
7429 +						ns_to_ktime(rh->release_time),
7430 +						0, HRTIMER_MODE_ABS_PINNED, 0);
7431 +#ifdef CONFIG_RELEASE_MASTER
7432 +			else
7433 +				hrtimer_start_on(
7434 +					/* target_cpu overrides release master */
7435 +					(target_cpu != NO_CPU ?
7436 +					 target_cpu : rt->release_master),
7437 +					&rh->info, &rh->timer,
7438 +					ns_to_ktime(rh->release_time),
7439 +					HRTIMER_MODE_ABS_PINNED);
7440 +#endif
7441 +		} else
7442 +			VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
7443 +	}
7444 +}
7445 +
7446 +void rt_domain_init(rt_domain_t *rt,
7447 +		    bheap_prio_t order,
7448 +		    check_resched_needed_t check,
7449 +		    release_jobs_t release
7450 +		   )
7451 +{
7452 +	int i;
7453 +
7454 +	BUG_ON(!rt);
7455 +	if (!check)
7456 +		check = dummy_resched;
7457 +	if (!release)
7458 +		release = default_release_jobs;
7459 +	if (!order)
7460 +		order = dummy_order;
7461 +
7462 +#ifdef CONFIG_RELEASE_MASTER
7463 +	rt->release_master = NO_CPU;
7464 +#endif
7465 +
7466 +	bheap_init(&rt->ready_queue);
7467 +	INIT_LIST_HEAD(&rt->tobe_released);
7468 +	for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
7469 +		INIT_LIST_HEAD(&rt->release_queue.slot[i]);
7470 +
7471 +	raw_spin_lock_init(&rt->ready_lock);
7472 +	raw_spin_lock_init(&rt->release_lock);
7473 +	raw_spin_lock_init(&rt->tobe_lock);
7474 +
7475 +	rt->check_resched 	= check;
7476 +	rt->release_jobs	= release;
7477 +	rt->order		= order;
7478 +}
7479 +
7480 +/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
7481 + * @new:       the newly released task
7482 + */
7483 +void __add_ready(rt_domain_t* rt, struct task_struct *new)
7484 +{
7485 +	TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n",
7486 +	      new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
7487 +	      get_release(new), litmus_clock());
7488 +
7489 +	BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
7490 +
7491 +	bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
7492 +	rt->check_resched(rt);
7493 +}
7494 +
7495 +/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
7496 + * @tasks      - the newly released tasks
7497 + */
7498 +void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
7499 +{
7500 +	bheap_union(rt->order, &rt->ready_queue, tasks);
7501 +	rt->check_resched(rt);
7502 +}
7503 +
7504 +
7505 +#ifdef CONFIG_RELEASE_MASTER
7506 +void __add_release_on(rt_domain_t* rt, struct task_struct *task,
7507 +		      int target_cpu)
7508 +{
7509 +	TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
7510 +		   get_release(task), target_cpu);
7511 +	list_add(&tsk_rt(task)->list, &rt->tobe_released);
7512 +	task->rt_param.domain = rt;
7513 +
7514 +	/* start release timer */
7515 +	TS_SCHED2_START(task);
7516 +
7517 +	arm_release_timer_on(rt, target_cpu);
7518 +
7519 +	TS_SCHED2_END(task);
7520 +}
7521 +#endif
7522 +
7523 +/* add_release - add a real-time task to the rt release queue.
7524 + * @task:        the sleeping task
7525 + */
7526 +void __add_release(rt_domain_t* rt, struct task_struct *task)
7527 +{
7528 +	TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
7529 +	list_add(&tsk_rt(task)->list, &rt->tobe_released);
7530 +	task->rt_param.domain = rt;
7531 +
7532 +	/* start release timer */
7533 +	TS_SCHED2_START(task);
7534 +
7535 +	arm_release_timer(rt);
7536 +
7537 +	TS_SCHED2_END(task);
7538 +}
7539 +
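Putting the pieces together: as an illustration (not part of the diff), a minimal sketch of how a plugin sets up and feeds an rt_domain. The toy_* names are made up, t is assumed to be an already-admitted real-time task, and the locking follows the rules stated above (__add_ready() under ready_lock, __add_release() under tobe_lock with IRQs off):

static rt_domain_t toy_domain;

static int toy_check_resched(rt_domain_t *rt)
{
	/* a real plugin would check for necessary preemptions here */
	return 0;
}

static void toy_domain_setup(void)
{
	/* NULL order/release fall back to dummy_order()/default_release_jobs() */
	rt_domain_init(&toy_domain, NULL, toy_check_resched, NULL);
}

static void toy_job_arrival(struct task_struct *t)
{
	unsigned long flags;

	if (is_released(t, litmus_clock())) {
		raw_spin_lock_irqsave(&toy_domain.ready_lock, flags);
		__add_ready(&toy_domain, t);
		raw_spin_unlock_irqrestore(&toy_domain.ready_lock, flags);
	} else {
		raw_spin_lock_irqsave(&toy_domain.tobe_lock, flags);
		__add_release(&toy_domain, t);	/* arms the release timer */
		raw_spin_unlock_irqrestore(&toy_domain.tobe_lock, flags);
	}
}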
7540 diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
7541 new file mode 100644
7542 index 0000000..480c62b
7543 --- /dev/null
7544 +++ b/litmus/sched_cedf.c
7545 @@ -0,0 +1,863 @@
7546 +/*
7547 + * litmus/sched_cedf.c
7548 + *
7549 + * Implementation of the C-EDF scheduling algorithm.
7550 + *
7551 + * This implementation is based on G-EDF:
7552 + * - CPUs are clustered around L2 or L3 caches.
7553 + * - Cluster topology is automatically detected (this is arch dependent
7554 + *   and currently works only on x86 --- and only with modern
7555 + *   cpus that export cpuid4 information)
7556 + * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
7557 + *   the programmer needs to be aware of the topology to place tasks
7558 + *   in the desired cluster
7559 + * - the default configuration is a single global cluster (GLOBAL_CLUSTER);
7560 + *   supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all
7561 + *   online cpus are placed in a single cluster).
7562 + *
7563 + *   For details on functions, take a look at sched_gsn_edf.c
7564 + *
7565 + * Currently, we do not support changes in the number of online cpus.
7566 + * If the num_online_cpus() dynamically changes, the plugin is broken.
7567 + *
7568 + * This version uses the simple approach and serializes all scheduling
7569 + * decisions by the use of a queue lock. This is probably not the
7570 + * best way to do it, but it should suffice for now.
7571 + */
7572 +
7573 +#include <linux/spinlock.h>
7574 +#include <linux/percpu.h>
7575 +#include <linux/sched.h>
7576 +#include <linux/slab.h>
7577 +
7578 +#include <linux/module.h>
7579 +
7580 +#include <litmus/litmus.h>
7581 +#include <litmus/jobs.h>
7582 +#include <litmus/preempt.h>
7583 +#include <litmus/sched_plugin.h>
7584 +#include <litmus/edf_common.h>
7585 +#include <litmus/sched_trace.h>
7586 +
7587 +#include <litmus/clustered.h>
7588 +
7589 +#include <litmus/bheap.h>
7590 +
7591 +#ifdef CONFIG_SCHED_CPU_AFFINITY
7592 +#include <litmus/affinity.h>
7593 +#endif
7594 +
7595 +/* to configure the cluster size */
7596 +#include <litmus/litmus_proc.h>
7597 +#include <linux/uaccess.h>
7598 +
7599 +/* Reference configuration variable. Determines which cache level is used to
7600 + * group CPUs into clusters.  GLOBAL_CLUSTER, which is the default, means that
7601 + * all CPUs form a single cluster (just like GSN-EDF).
7602 + */
7603 +static enum cache_level cluster_config = GLOBAL_CLUSTER;
7604 +
7605 +struct clusterdomain;
7606 +
7607 +/* cpu_entry_t - maintain the linked and scheduled state
7608 + *
7609 + * A cpu also contains a pointer to the cedf_domain_t cluster
7610 + * that owns it (struct clusterdomain*)
7611 + */
7612 +typedef struct  {
7613 +	int 			cpu;
7614 +	struct clusterdomain*	cluster;	/* owning cluster */
7615 +	struct task_struct*	linked;		/* only RT tasks */
7616 +	struct task_struct*	scheduled;	/* only RT tasks */
7617 +	atomic_t		will_schedule;	/* prevent unneeded IPIs */
7618 +	struct bheap_node*	hn;
7619 +} cpu_entry_t;
7620 +
7621 +/* one cpu_entry_t per CPU */
7622 +DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
7623 +
7624 +#define set_will_schedule() \
7625 +	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
7626 +#define clear_will_schedule() \
7627 +	(atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
7628 +#define test_will_schedule(cpu) \
7629 +	(atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
7630 +
7631 +/*
7632 + * In C-EDF there is a cedf domain _per_ cluster
7633 + * The number of clusters is dynamically determined according to the
7634 + * total cpu number and the cluster size
7635 + */
7636 +typedef struct clusterdomain {
7637 +	/* rt_domain for this cluster */
7638 +	rt_domain_t	domain;
7639 +	/* cpus in this cluster */
7640 +	cpu_entry_t*	*cpus;
7641 +	/* map of this cluster cpus */
7642 +	cpumask_var_t	cpu_map;
7643 +	/* the cpus queue themselves according to priority in here */
7644 +	struct bheap_node *heap_node;
7645 +	struct bheap      cpu_heap;
7646 +	/* lock for this cluster */
7647 +#define cluster_lock domain.ready_lock
7648 +} cedf_domain_t;
7649 +
7650 +/* a cedf_domain per cluster; allocation is done at init/activation time */
7651 +cedf_domain_t *cedf;
7652 +
7653 +#define remote_cluster(cpu)	((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
7654 +#define task_cpu_cluster(task)	remote_cluster(get_partition(task))
7655 +
7656 +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
7657 + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
7658 + * information during the initialization of the plugin (e.g., topology)
7659 +#define WANT_ALL_SCHED_EVENTS
7660 + */
7661 +#define VERBOSE_INIT
7662 +
7663 +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
7664 +{
7665 +	cpu_entry_t *a, *b;
7666 +	a = _a->value;
7667 +	b = _b->value;
7668 +	/* Note that a and b are inverted: we want the lowest-priority CPU at
7669 +	 * the top of the heap.
7670 +	 */
7671 +	return edf_higher_prio(b->linked, a->linked);
7672 +}
7673 +
7674 +/* update_cpu_position - Move the cpu entry to the correct place to maintain
7675 + *                       order in the cpu queue. Caller must hold cedf lock.
7676 + */
7677 +static void update_cpu_position(cpu_entry_t *entry)
7678 +{
7679 +	cedf_domain_t *cluster = entry->cluster;
7680 +
7681 +	if (likely(bheap_node_in_heap(entry->hn)))
7682 +		bheap_delete(cpu_lower_prio,
7683 +				&cluster->cpu_heap,
7684 +				entry->hn);
7685 +
7686 +	bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn);
7687 +}
7688 +
7689 +/* caller must hold cedf lock */
7690 +static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
7691 +{
7692 +	struct bheap_node* hn;
7693 +	hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
7694 +	return hn->value;
7695 +}
7696 +
7697 +
7698 +/* link_task_to_cpu - Update the link of a CPU.
7699 + *                    Handles the case where the to-be-linked task is already
7700 + *                    scheduled on a different CPU.
7701 + */
7702 +static noinline void link_task_to_cpu(struct task_struct* linked,
7703 +				      cpu_entry_t *entry)
7704 +{
7705 +	cpu_entry_t *sched;
7706 +	struct task_struct* tmp;
7707 +	int on_cpu;
7708 +
7709 +	BUG_ON(linked && !is_realtime(linked));
7710 +
7711 +	/* Currently linked task is set to be unlinked. */
7712 +	if (entry->linked) {
7713 +		entry->linked->rt_param.linked_on = NO_CPU;
7714 +	}
7715 +
7716 +	/* Link new task to CPU. */
7717 +	if (linked) {
7718 +		set_rt_flags(linked, RT_F_RUNNING);
7719 +		/* handle task is already scheduled somewhere! */
7720 +		on_cpu = linked->rt_param.scheduled_on;
7721 +		if (on_cpu != NO_CPU) {
7722 +			sched = &per_cpu(cedf_cpu_entries, on_cpu);
7723 +			/* this should only happen if not linked already */
7724 +			BUG_ON(sched->linked == linked);
7725 +
7726 +			/* If we are already scheduled on the CPU to which we
7727 +			 * wanted to link, we don't need to do the swap --
7728 +			 * we just link ourselves to the CPU and depend on
7729 +			 * the caller to get things right.
7730 +			 */
7731 +			if (entry != sched) {
7732 +				TRACE_TASK(linked,
7733 +					   "already scheduled on %d, updating link.\n",
7734 +					   sched->cpu);
7735 +				tmp = sched->linked;
7736 +				linked->rt_param.linked_on = sched->cpu;
7737 +				sched->linked = linked;
7738 +				update_cpu_position(sched);
7739 +				linked = tmp;
7740 +			}
7741 +		}
7742 +		if (linked) /* might be NULL due to swap */
7743 +			linked->rt_param.linked_on = entry->cpu;
7744 +	}
7745 +	entry->linked = linked;
7746 +#ifdef WANT_ALL_SCHED_EVENTS
7747 +	if (linked)
7748 +		TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
7749 +	else
7750 +		TRACE("NULL linked to %d.\n", entry->cpu);
7751 +#endif
7752 +	update_cpu_position(entry);
7753 +}
7754 +
7755 +/* unlink - Make sure a task is not linked any longer to an entry
7756 + *          where it was linked before. Must hold cedf_lock.
7757 + */
7758 +static noinline void unlink(struct task_struct* t)
7759 +{
7760 +    	cpu_entry_t *entry;
7761 +
7762 +	if (t->rt_param.linked_on != NO_CPU) {
7763 +		/* unlink */
7764 +		entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
7765 +		t->rt_param.linked_on = NO_CPU;
7766 +		link_task_to_cpu(NULL, entry);
7767 +	} else if (is_queued(t)) {
7768 +		/* This is an interesting situation: t is scheduled,
7769 +		 * but was just recently unlinked.  It cannot be
7770 +		 * linked anywhere else (because then it would have
7771 +		 * been relinked to this CPU), thus it must be in some
7772 +		 * queue. We must remove it from the list in this
7773 +		 * case.
7774 +		 *
7775 +		 * In the C-EDF case it should be somewhere in the queue for
7776 +		 * its domain, so we can get the domain using
7777 +		 * task_cpu_cluster().
7778 +		 */
7779 +		remove(&(task_cpu_cluster(t))->domain, t);
7780 +	}
7781 +}
7782 +
7783 +
7784 +/* preempt - force a CPU to reschedule
7785 + */
7786 +static void preempt(cpu_entry_t *entry)
7787 +{
7788 +	preempt_if_preemptable(entry->scheduled, entry->cpu);
7789 +}
7790 +
7791 +/* requeue - Put an unlinked task into gsn-edf domain.
7792 + *           Caller must hold cedf_lock.
7793 + */
7794 +static noinline void requeue(struct task_struct* task)
7795 +{
7796 +	cedf_domain_t *cluster = task_cpu_cluster(task);
7797 +	BUG_ON(!task);
7798 +	/* sanity check before insertion */
7799 +	BUG_ON(is_queued(task));
7800 +
7801 +	if (is_released(task, litmus_clock()))
7802 +		__add_ready(&cluster->domain, task);
7803 +	else {
7804 +		/* it has got to wait */
7805 +		add_release(&cluster->domain, task);
7806 +	}
7807 +}
7808 +
7809 +#ifdef CONFIG_SCHED_CPU_AFFINITY
7810 +static cpu_entry_t* cedf_get_nearest_available_cpu(
7811 +				cedf_domain_t *cluster, cpu_entry_t *start)
7812 +{
7813 +	cpu_entry_t *affinity;
7814 +
7815 +	get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
7816 +#ifdef CONFIG_RELEASE_MASTER
7817 +		cluster->domain.release_master
7818 +#else
7819 +		NO_CPU
7820 +#endif
7821 +		);
7822 +
7823 +	/* make sure CPU is in our cluster */
7824 +	if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map))
7825 +		return(affinity);
7826 +	else
7827 +		return(NULL);
7828 +}
7829 +#endif
7830 +
7831 +
7832 +/* check for any necessary preemptions */
7833 +static void check_for_preemptions(cedf_domain_t *cluster)
7834 +{
7835 +	struct task_struct *task;
7836 +	cpu_entry_t *last;
7837 +
7838 +	for(last = lowest_prio_cpu(cluster);
7839 +	    edf_preemption_needed(&cluster->domain, last->linked);
7840 +	    last = lowest_prio_cpu(cluster)) {
7841 +		/* preemption necessary */
7842 +		task = __take_ready(&cluster->domain);
7843 +		TRACE("check_for_preemptions: attempting to link task %d to %d\n",
7844 +		      task->pid, last->cpu);
7845 +#ifdef CONFIG_SCHED_CPU_AFFINITY
7846 +		{
7847 +			cpu_entry_t *affinity =
7848 +					cedf_get_nearest_available_cpu(cluster,
7849 +						&per_cpu(cedf_cpu_entries, task_cpu(task)));
7850 +			if(affinity)
7851 +				last = affinity;
7852 +			else if(last->linked)
7853 +				requeue(last->linked);
7854 +		}
7855 +#else
7856 +		if (last->linked)
7857 +			requeue(last->linked);
7858 +#endif
7859 +		link_task_to_cpu(task, last);
7860 +		preempt(last);
7861 +	}
7862 +}
7863 +
7864 +/* cedf_job_arrival: task is either resumed or released */
7865 +static noinline void cedf_job_arrival(struct task_struct* task)
7866 +{
7867 +	cedf_domain_t *cluster = task_cpu_cluster(task);
7868 +	BUG_ON(!task);
7869 +
7870 +	requeue(task);
7871 +	check_for_preemptions(cluster);
7872 +}
7873 +
7874 +static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
7875 +{
7876 +	cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
7877 +	unsigned long flags;
7878 +
7879 +	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
7880 +
7881 +	__merge_ready(&cluster->domain, tasks);
7882 +	check_for_preemptions(cluster);
7883 +
7884 +	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
7885 +}
7886 +
7887 +/* caller holds cedf_lock */
7888 +static noinline void job_completion(struct task_struct *t, int forced)
7889 +{
7890 +	BUG_ON(!t);
7891 +
7892 +	sched_trace_task_completion(t, forced);
7893 +
7894 +	TRACE_TASK(t, "job_completion().\n");
7895 +
7896 +	/* set flags */
7897 +	set_rt_flags(t, RT_F_SLEEP);
7898 +	/* prepare for next period */
7899 +	prepare_for_next_period(t);
7900 +	if (is_released(t, litmus_clock()))
7901 +		sched_trace_task_release(t);
7902 +	/* unlink */
7903 +	unlink(t);
7904 +	/* requeue
7905 +	 * But don't requeue a blocking task. */
7906 +	if (is_running(t))
7907 +		cedf_job_arrival(t);
7908 +}
7909 +
7910 +/* cedf_tick - this function is called for every local timer
7911 + *                         interrupt.
7912 + *
7913 + *                   checks whether the current task has expired and checks
7914 + *                   whether we need to preempt it if it has not expired
7915 + */
7916 +static void cedf_tick(struct task_struct* t)
7917 +{
7918 +	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
7919 +		if (!is_np(t)) {
7920 +			/* np tasks will be preempted when they become
7921 +			 * preemptable again
7922 +			 */
7923 +			litmus_reschedule_local();
7924 +			set_will_schedule();
7925 +			TRACE("cedf_scheduler_tick: "
7926 +			      "%d is preemptable "
7927 +			      " => FORCE_RESCHED\n", t->pid);
7928 +		} else if (is_user_np(t)) {
7929 +			TRACE("cedf_scheduler_tick: "
7930 +			      "%d is non-preemptable, "
7931 +			      "preemption delayed.\n", t->pid);
7932 +			request_exit_np(t);
7933 +		}
7934 +	}
7935 +}
7936 +
7937 +/* Getting schedule() right is a bit tricky. schedule() may not make any
7938 + * assumptions on the state of the current task since it may be called for a
7939 + * number of reasons. The reasons include a scheduler_tick() determined that it
7940 + * was necessary, because sys_exit_np() was called, because some Linux
7941 + * subsystem determined so, or even (in the worst case) because there is a bug
7942 + * hidden somewhere. Thus, we must take extreme care to determine what the
7943 + * current state is.
7944 + *
7945 + * The CPU could currently be scheduling a task (or not), be linked (or not).
7946 + *
7947 + * The following assertions for the scheduled task could hold:
7948 + *
7949 + *      - !is_running(scheduled)        // the job blocks
7950 + *	- scheduled->timeslice == 0	// the job completed (forcefully)
7951 + *	- get_rt_flags() == RT_F_SLEEP	// the job completed (by syscall)
7952 + * 	- linked != scheduled		// we need to reschedule (for any reason)
7953 + * 	- is_np(scheduled)		// rescheduling must be delayed,
7954 + *					   sys_exit_np must be requested
7955 + *
7956 + * Any of these can occur together.
7957 + */
7958 +static struct task_struct* cedf_schedule(struct task_struct * prev)
7959 +{
7960 +	cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
7961 +	cedf_domain_t *cluster = entry->cluster;
7962 +	int out_of_time, sleep, preempt, np, exists, blocks;
7963 +	struct task_struct* next = NULL;
7964 +
7965 +#ifdef CONFIG_RELEASE_MASTER
7966 +	/* Bail out early if we are the release master.
7967 +	 * The release master never schedules any real-time tasks.
7968 +	 */
7969 +	if (unlikely(cluster->domain.release_master == entry->cpu)) {
7970 +		sched_state_task_picked();
7971 +		return NULL;
7972 +	}
7973 +#endif
7974 +
7975 +	raw_spin_lock(&cluster->cluster_lock);
7976 +	clear_will_schedule();
7977 +
7978 +	/* sanity checking */
7979 +	BUG_ON(entry->scheduled && entry->scheduled != prev);
7980 +	BUG_ON(entry->scheduled && !is_realtime(prev));
7981 +	BUG_ON(is_realtime(prev) && !entry->scheduled);
7982 +
7983 +	/* (0) Determine state */
7984 +	exists      = entry->scheduled != NULL;
7985 +	blocks      = exists && !is_running(entry->scheduled);
7986 +	out_of_time = exists &&
7987 +				  budget_enforced(entry->scheduled) &&
7988 +				  budget_exhausted(entry->scheduled);
7989 +	np 	    = exists && is_np(entry->scheduled);
7990 +	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
7991 +	preempt     = entry->scheduled != entry->linked;
7992 +
7993 +#ifdef WANT_ALL_SCHED_EVENTS
7994 +	TRACE_TASK(prev, "invoked cedf_schedule.\n");
7995 +#endif
7996 +
7997 +	if (exists)
7998 +		TRACE_TASK(prev,
7999 +			   "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
8000 +			   "state:%d sig:%d\n",
8001 +			   blocks, out_of_time, np, sleep, preempt,
8002 +			   prev->state, signal_pending(prev));
8003 +	if (entry->linked && preempt)
8004 +		TRACE_TASK(prev, "will be preempted by %s/%d\n",
8005 +			   entry->linked->comm, entry->linked->pid);
8006 +
8007 +
8008 +	/* If a task blocks we have no choice but to reschedule.
8009 +	 */
8010 +	if (blocks)
8011 +		unlink(entry->scheduled);
8012 +
8013 +	/* Request a sys_exit_np() call if we would like to preempt but cannot.
8014 +	 * We need to make sure to update the link structure anyway in case
8015 +	 * that we are still linked. Multiple calls to request_exit_np() don't
8016 +	 * hurt.
8017 +	 */
8018 +	if (np && (out_of_time || preempt || sleep)) {
8019 +		unlink(entry->scheduled);
8020 +		request_exit_np(entry->scheduled);
8021 +	}
8022 +
8023 +	/* Any task that is preemptable and either exhausts its execution
8024 +	 * budget or wants to sleep completes. We may have to reschedule after
8025 +	 * this. Don't do a job completion if we block (can't have timers running
8026 +	 * for blocked jobs). Preemptions go first for the same reason.
8027 +	 */
8028 +	if (!np && (out_of_time || sleep) && !blocks && !preempt)
8029 +		job_completion(entry->scheduled, !sleep);
8030 +
8031 +	/* Link pending task if we became unlinked.
8032 +	 */
8033 +	if (!entry->linked)
8034 +		link_task_to_cpu(__take_ready(&cluster->domain), entry);
8035 +
8036 +	/* The final scheduling decision. Do we need to switch for some reason?
8037 +	 * If linked is different from scheduled, then select linked as next.
8038 +	 */
8039 +	if ((!np || blocks) &&
8040 +	    entry->linked != entry->scheduled) {
8041 +		/* Schedule a linked job? */
8042 +		if (entry->linked) {
8043 +			entry->linked->rt_param.scheduled_on = entry->cpu;
8044 +			next = entry->linked;
8045 +		}
8046 +		if (entry->scheduled) {
8047 +			/* not gonna be scheduled soon */
8048 +			entry->scheduled->rt_param.scheduled_on = NO_CPU;
8049 +			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
8050 +		}
8051 +	} else
8052 +		/* Only override Linux scheduler if we have a real-time task
8053 +		 * scheduled that needs to continue.
8054 +		 */
8055 +		if (exists)
8056 +			next = prev;
8057 +
8058 +	sched_state_task_picked();
8059 +	raw_spin_unlock(&cluster->cluster_lock);
8060 +
8061 +#ifdef WANT_ALL_SCHED_EVENTS
8062 +	TRACE("cedf_lock released, next=0x%p\n", next);
8063 +
8064 +	if (next)
8065 +		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
8066 +	else if (exists && !next)
8067 +		TRACE("becomes idle at %llu.\n", litmus_clock());
8068 +#endif
8069 +
8070 +
8071 +	return next;
8072 +}
8073 +
8074 +
8075 +/* _finish_switch - we just finished the switch away from prev
8076 + */
8077 +static void cedf_finish_switch(struct task_struct *prev)
8078 +{
8079 +	cpu_entry_t* 	entry = &__get_cpu_var(cedf_cpu_entries);
8080 +
8081 +	entry->scheduled = is_realtime(current) ? current : NULL;
8082 +#ifdef WANT_ALL_SCHED_EVENTS
8083 +	TRACE_TASK(prev, "switched away from\n");
8084 +#endif
8085 +}
8086 +
8087 +
8088 +/*	Prepare a task for running in RT mode
8089 + */
8090 +static void cedf_task_new(struct task_struct * t, int on_rq, int running)
8091 +{
8092 +	unsigned long 		flags;
8093 +	cpu_entry_t* 		entry;
8094 +	cedf_domain_t*		cluster;
8095 +
8096 +	TRACE("gsn edf: task new %d\n", t->pid);
8097 +
8098 +	/* the cluster doesn't change even if t is running */
8099 +	cluster = task_cpu_cluster(t);
8100 +
8101 +	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
8102 +
8103 +	/* setup job params */
8104 +	release_at(t, litmus_clock());
8105 +
8106 +	if (running) {
8107 +		entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
8108 +		BUG_ON(entry->scheduled);
8109 +
8110 +#ifdef CONFIG_RELEASE_MASTER
8111 +		if (entry->cpu != cluster->domain.release_master) {
8112 +#endif
8113 +			entry->scheduled = t;
8114 +			tsk_rt(t)->scheduled_on = task_cpu(t);
8115 +#ifdef CONFIG_RELEASE_MASTER
8116 +		} else {
8117 +			/* do not schedule on release master */
8118 +			preempt(entry); /* force resched */
8119 +			tsk_rt(t)->scheduled_on = NO_CPU;
8120 +		}
8121 +#endif
8122 +	} else {
8123 +		t->rt_param.scheduled_on = NO_CPU;
8124 +	}
8125 +	t->rt_param.linked_on          = NO_CPU;
8126 +
8127 +	cedf_job_arrival(t);
8128 +	raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
8129 +}
8130 +
8131 +static void cedf_task_wake_up(struct task_struct *task)
8132 +{
8133 +	unsigned long flags;
8134 +	lt_t now;
8135 +	cedf_domain_t *cluster;
8136 +
8137 +	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
8138 +
8139 +	cluster = task_cpu_cluster(task);
8140 +
8141 +	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
8142 +	/* We need to take suspensions because of semaphores into
8143 +	 * account! If a job resumes after being suspended due to acquiring
8144 +	 * a semaphore, it should never be treated as a new job release.
8145 +	 */
8146 +	if (get_rt_flags(task) == RT_F_EXIT_SEM) {
8147 +		set_rt_flags(task, RT_F_RUNNING);
8148 +	} else {
8149 +		now = litmus_clock();
8150 +		if (is_tardy(task, now)) {
8151 +			/* new sporadic release */
8152 +			release_at(task, now);
8153 +			sched_trace_task_release(task);
8154 +		}
8155 +		else {
8156 +			if (task->rt.time_slice) {
8157 +				/* came back in time before deadline
8158 +				 */
8159 +				set_rt_flags(task, RT_F_RUNNING);
8160 +			}
8161 +		}
8162 +	}
8163 +	cedf_job_arrival(task);
8164 +	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
8165 +}
8166 +
8167 +static void cedf_task_block(struct task_struct *t)
8168 +{
8169 +	unsigned long flags;
8170 +	cedf_domain_t *cluster;
8171 +
8172 +	TRACE_TASK(t, "block at %llu\n", litmus_clock());
8173 +
8174 +	cluster = task_cpu_cluster(t);
8175 +
8176 +	/* unlink if necessary */
8177 +	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
8178 +	unlink(t);
8179 +	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
8180 +
8181 +	BUG_ON(!is_realtime(t));
8182 +}
8183 +
8184 +
8185 +static void cedf_task_exit(struct task_struct * t)
8186 +{
8187 +	unsigned long flags;
8188 +	cedf_domain_t *cluster = task_cpu_cluster(t);
8189 +
8190 +	/* unlink if necessary */
8191 +	raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
8192 +	unlink(t);
8193 +	if (tsk_rt(t)->scheduled_on != NO_CPU) {
8194 +		cpu_entry_t *cpu;
8195 +		cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
8196 +		cpu->scheduled = NULL;
8197 +		tsk_rt(t)->scheduled_on = NO_CPU;
8198 +	}
8199 +	raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
8200 +
8201 +	BUG_ON(!is_realtime(t));
8202 +	TRACE_TASK(t, "RIP\n");
8203 +}
8204 +
8205 +static long cedf_admit_task(struct task_struct* tsk)
8206 +{
8207 +	return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
8208 +}
8209 +
8210 +/* total number of clusters */
8211 +static int num_clusters;
8212 +/* we do not support clusters of different sizes */
8213 +static unsigned int cluster_size;
8214 +
8215 +#ifdef VERBOSE_INIT
8216 +static void print_cluster_topology(cpumask_var_t mask, int cpu)
8217 +{
8218 +	int chk;
8219 +	char buf[255];
8220 +
8221 +	chk = cpulist_scnprintf(buf, 254, mask);
8222 +	buf[chk] = '\0';
8223 +	printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
8224 +
8225 +}
8226 +#endif
8227 +
8228 +static int clusters_allocated = 0;
8229 +
8230 +static void cleanup_cedf(void)
8231 +{
8232 +	int i;
8233 +
8234 +	if (clusters_allocated) {
8235 +		for (i = 0; i < num_clusters; i++) {
8236 +			kfree(cedf[i].cpus);
8237 +			kfree(cedf[i].heap_node);
8238 +			free_cpumask_var(cedf[i].cpu_map);
8239 +		}
8240 +
8241 +		kfree(cedf);
8242 +	}
8243 +}
8244 +
8245 +static long cedf_activate_plugin(void)
8246 +{
8247 +	int i, j, cpu, ccpu, cpu_count;
8248 +	cpu_entry_t *entry;
8249 +
8250 +	cpumask_var_t mask;
8251 +	int chk = 0;
8252 +
8253 +	/* de-allocate old clusters, if any */
8254 +	cleanup_cedf();
8255 +
8256 +	printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n",
8257 +			cluster_config);
8258 +
8259 +	/* need to get cluster_size first */
8260 +	if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
8261 +		return -ENOMEM;
8262 +
8263 +	if (unlikely(cluster_config == GLOBAL_CLUSTER)) {
8264 +		cluster_size = num_online_cpus();
8265 +	} else {
8266 +		chk = get_shared_cpu_map(mask, 0, cluster_config);
8267 +		if (chk) {
8268 +			/* if chk != 0 then it is the max allowed index */
8269 +			printk(KERN_INFO "C-EDF: Cluster configuration = %d "
8270 +			       "is not supported on this hardware.\n",
8271 +			       cluster_config);
8272 +			/* User should notice that the configuration failed, so
8273 +			 * let's bail out. */
8274 +			return -EINVAL;
8275 +		}
8276 +
8277 +		cluster_size = cpumask_weight(mask);
8278 +	}
8279 +
8280 +	if ((num_online_cpus() % cluster_size) != 0) {
8281 +		/* this can't be right, some cpus are left out */
8282 +		printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
8283 +				num_online_cpus(), cluster_size);
8284 +		return -1;
8285 +	}
8286 +
8287 +	num_clusters = num_online_cpus() / cluster_size;
8288 +	printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
8289 +			num_clusters, cluster_size);
8290 +
8291 +	/* initialize clusters */
8292 +	cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
8293 +	for (i = 0; i < num_clusters; i++) {
8294 +
8295 +		cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
8296 +				GFP_ATOMIC);
8297 +		cedf[i].heap_node = kmalloc(
8298 +				cluster_size * sizeof(struct bheap_node),
8299 +				GFP_ATOMIC);
8300 +		bheap_init(&(cedf[i].cpu_heap));
8301 +		edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
8302 +
8303 +		if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
8304 +			return -ENOMEM;
8305 +#ifdef CONFIG_RELEASE_MASTER
8306 +		cedf[i].domain.release_master = atomic_read(&release_master_cpu);
8307 +#endif
8308 +	}
8309 +
8310 +	/* cycle through cluster and add cpus to them */
8311 +	for (i = 0; i < num_clusters; i++) {
8312 +
8313 +		for_each_online_cpu(cpu) {
8314 +			/* check if the cpu is already in a cluster */
8315 +			for (j = 0; j < num_clusters; j++)
8316 +				if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
8317 +					break;
8318 +			/* if it is in a cluster go to next cpu */
8319 +			if (j < num_clusters &&
8320 +					cpumask_test_cpu(cpu, cedf[j].cpu_map))
8321 +				continue;
8322 +
8323 +			/* this cpu isn't in any cluster */
8324 +			/* get the shared cpus */
8325 +			if (unlikely(cluster_config == GLOBAL_CLUSTER))
8326 +				cpumask_copy(mask, cpu_online_mask);
8327 +			else
8328 +				get_shared_cpu_map(mask, cpu, cluster_config);
8329 +
8330 +			cpumask_copy(cedf[i].cpu_map, mask);
8331 +#ifdef VERBOSE_INIT
8332 +			print_cluster_topology(mask, cpu);
8333 +#endif
8334 +			/* add cpus to current cluster and init cpu_entry_t */
8335 +			cpu_count = 0;
8336 +			for_each_cpu(ccpu, cedf[i].cpu_map) {
8337 +
8338 +				entry = &per_cpu(cedf_cpu_entries, ccpu);
8339 +				cedf[i].cpus[cpu_count] = entry;
8340 +				atomic_set(&entry->will_schedule, 0);
8341 +				entry->cpu = ccpu;
8342 +				entry->cluster = &cedf[i];
8343 +				entry->hn = &(cedf[i].heap_node[cpu_count]);
8344 +				bheap_node_init(&entry->hn, entry);
8345 +
8346 +				cpu_count++;
8347 +
8348 +				entry->linked = NULL;
8349 +				entry->scheduled = NULL;
8350 +#ifdef CONFIG_RELEASE_MASTER
8351 +				/* only add CPUs that should schedule jobs */
8352 +				if (entry->cpu != entry->cluster->domain.release_master)
8353 +#endif
8354 +					update_cpu_position(entry);
8355 +			}
8356 +			/* done with this cluster */
8357 +			break;
8358 +		}
8359 +	}
8360 +
8361 +	free_cpumask_var(mask);
8362 +	clusters_allocated = 1;
8363 +	return 0;
8364 +}
8365 +
8366 +/*	Plugin object	*/
8367 +static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
8368 +	.plugin_name		= "C-EDF",
8369 +	.finish_switch		= cedf_finish_switch,
8370 +	.tick			= cedf_tick,
8371 +	.task_new		= cedf_task_new,
8372 +	.complete_job		= complete_job,
8373 +	.task_exit		= cedf_task_exit,
8374 +	.schedule		= cedf_schedule,
8375 +	.task_wake_up		= cedf_task_wake_up,
8376 +	.task_block		= cedf_task_block,
8377 +	.admit_task		= cedf_admit_task,
8378 +	.activate_plugin	= cedf_activate_plugin,
8379 +};
8380 +
8381 +static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
8382 +
8383 +static int __init init_cedf(void)
8384 +{
8385 +	int err, fs;
8386 +
8387 +	err = register_sched_plugin(&cedf_plugin);
8388 +	if (!err) {
8389 +		fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir);
8390 +		if (!fs)
8391 +			cluster_file = create_cluster_file(cedf_dir, &cluster_config);
8392 +		else
8393 +			printk(KERN_ERR "Could not allocate C-EDF procfs dir.\n");
8394 +	}
8395 +	return err;
8396 +}
8397 +
8398 +static void clean_cedf(void)
8399 +{
8400 +	cleanup_cedf();
8401 +	if (cluster_file)
8402 +		remove_proc_entry("cluster", cedf_dir);
8403 +	if (cedf_dir)
8404 +		remove_plugin_proc_dir(&cedf_plugin);
8405 +}
8406 +
8407 +module_init(init_cedf);
8408 +module_exit(clean_cedf);
8409 diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
8410 new file mode 100644
8411 index 0000000..6ed504f
8412 --- /dev/null
8413 +++ b/litmus/sched_gsn_edf.c
8414 @@ -0,0 +1,1030 @@
8415 +/*
8416 + * litmus/sched_gsn_edf.c
8417 + *
8418 + * Implementation of the GSN-EDF scheduling algorithm.
8419 + *
8420 + * This version uses the simple approach and serializes all scheduling
8421 + * decisions by the use of a queue lock. This is probably not the
8422 + * best way to do it, but it should suffice for now.
8423 + */
8424 +
8425 +#include <linux/spinlock.h>
8426 +#include <linux/percpu.h>
8427 +#include <linux/sched.h>
8428 +#include <linux/slab.h>
8429 +
8430 +#include <litmus/litmus.h>
8431 +#include <litmus/jobs.h>
8432 +#include <litmus/sched_plugin.h>
8433 +#include <litmus/edf_common.h>
8434 +#include <litmus/sched_trace.h>
8435 +#include <litmus/trace.h>
8436 +
8437 +#include <litmus/preempt.h>
8438 +
8439 +#include <litmus/bheap.h>
8440 +
8441 +#ifdef CONFIG_SCHED_CPU_AFFINITY
8442 +#include <litmus/affinity.h>
8443 +#endif
8444 +
8445 +#include <linux/module.h>
8446 +
8447 +/* Overview of GSN-EDF operations.
8448 + *
8449 + * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
8450 + * description only covers how the individual operations are implemented in
8451 + * LITMUS.
8452 + *
8453 + * link_task_to_cpu(T, cpu) 	- Low-level operation to update the linkage
8454 + *                                structure (NOT the actually scheduled
8455 + *                                task). If there is another linked task To
8456 + *                                already it will set To->linked_on = NO_CPU
8457 + *                                (thereby removing its association with this
8458 + *                                CPU). However, it will not requeue the
8459 + *                                previously linked task (if any). It will set
8460 + *                                T's state to RT_F_RUNNING and check whether
8461 + *                                it is already running somewhere else. If T
8462 + *                                is scheduled somewhere else it will link
8463 + *                                it to that CPU instead (and pull the linked
8464 + *                                task to cpu). T may be NULL.
8465 + *
8466 + * unlink(T)			- Unlink removes T from all scheduler data
8467 + *                                structures. If it is linked to some CPU it
8468 + *                                will link NULL to that CPU. If it is
8469 + *                                currently queued in the gsnedf queue it will
8470 + *                                be removed from the rt_domain. It is safe to
8471 + *                                call unlink(T) if T is not linked. T may not
8472 + *                                be NULL.
8473 + *
8474 + * requeue(T)			- Requeue will insert T into the appropriate
8475 + *                                queue. If the system is in real-time mode and
8476 + *                                T is released already, it will go into the
8477 + *                                ready queue. If the system is not in
8478 + *                                real-time mode, T will go into the
8479 + *                                release queue. If T's release time is in the
8480 + *                                future, it will go into the release
8481 + *                                queue. That means that T's release time/job
8482 + *                                no/etc. has to be updated before requeue(T) is
8483 + *                                called. It is not safe to call requeue(T)
8484 + *                                when T is already queued. T may not be NULL.
8485 + *
8486 + * gsnedf_job_arrival(T)	- This is the catch-all function when T enters
8487 + *                                the system after either a suspension or at a
8488 + *                                job release. It will queue T (which means it
8489 + *                                is not safe to call gsnedf_job_arrival(T) if
8490 + *                                T is already queued) and then check whether a
8491 + *                                preemption is necessary. If a preemption is
8492 + *                                necessary it will update the linkage
8493 + *                                accordingly and cause schedule() to be called
8494 + *                                (either with an IPI or need_resched). It is
8495 + *                                safe to call gsnedf_job_arrival(T) if T's
8496 + *                                next job has not been actually released yet
8497 + *                                (release time in the future). T will be put
8498 + *                                on the release queue in that case.
8499 + *
8500 + * job_completion(T)		- Take care of everything that needs to be done
8501 + *                                to prepare T for its next release and place
8502 + *                                it in the right queue with
8503 + *                                gsnedf_job_arrival().
8504 + *
8505 + *
8506 + * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
8507 + * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
8508 + * the functions will automatically propagate a pending task from the ready queue
8509 + * to the now-empty link. This is the job of the calling function (by means of
8510 + * __take_ready).
8511 + */
8512 +
8513 +
8514 +/* cpu_entry_t - maintain the linked and scheduled state
8515 + */
8516 +typedef struct  {
8517 +	int 			cpu;
8518 +	struct task_struct*	linked;		/* only RT tasks */
8519 +	struct task_struct*	scheduled;	/* only RT tasks */
8520 +	struct bheap_node*	hn;
8521 +} cpu_entry_t;
8522 +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
8523 +
8524 +cpu_entry_t* gsnedf_cpus[NR_CPUS];
8525 +
8526 +/* the cpus queue themselves according to priority in here */
8527 +static struct bheap_node gsnedf_heap_node[NR_CPUS];
8528 +static struct bheap      gsnedf_cpu_heap;
8529 +
8530 +static rt_domain_t gsnedf;
8531 +#define gsnedf_lock (gsnedf.ready_lock)
8532 +
8533 +
8534 +/* Uncomment this if you want to see all scheduling decisions in the
8535 + * TRACE() log.
8536 +#define WANT_ALL_SCHED_EVENTS
8537 + */
8538 +
8539 +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
8540 +{
8541 +	cpu_entry_t *a, *b;
8542 +	a = _a->value;
8543 +	b = _b->value;
8544 +	/* Note that a and b are inverted: we want the lowest-priority CPU at
8545 +	 * the top of the heap.
8546 +	 */
8547 +	return edf_higher_prio(b->linked, a->linked);
8548 +}
8549 +
8550 +/* update_cpu_position - Move the cpu entry to the correct place to maintain
8551 + *                       order in the cpu queue. Caller must hold gsnedf lock.
8552 + */
8553 +static void update_cpu_position(cpu_entry_t *entry)
8554 +{
8555 +	if (likely(bheap_node_in_heap(entry->hn)))
8556 +		bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
8557 +	bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
8558 +}
8559 +
8560 +/* caller must hold gsnedf lock */
8561 +static cpu_entry_t* lowest_prio_cpu(void)
8562 +{
8563 +	struct bheap_node* hn;
8564 +	hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
8565 +	return hn->value;
8566 +}
8567 +
8568 +
8569 +/* link_task_to_cpu - Update the link of a CPU.
8570 + *                    Handles the case where the to-be-linked task is already
8571 + *                    scheduled on a different CPU.
8572 + */
8573 +static noinline void link_task_to_cpu(struct task_struct* linked,
8574 +				      cpu_entry_t *entry)
8575 +{
8576 +	cpu_entry_t *sched;
8577 +	struct task_struct* tmp;
8578 +	int on_cpu;
8579 +
8580 +	BUG_ON(linked && !is_realtime(linked));
8581 +
8582 +	/* Currently linked task is set to be unlinked. */
8583 +	if (entry->linked) {
8584 +		entry->linked->rt_param.linked_on = NO_CPU;
8585 +	}
8586 +
8587 +	/* Link new task to CPU. */
8588 +	if (linked) {
8589 +		set_rt_flags(linked, RT_F_RUNNING);
8590 +		/* handle task is already scheduled somewhere! */
8591 +		on_cpu = linked->rt_param.scheduled_on;
8592 +		if (on_cpu != NO_CPU) {
8593 +			sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
8594 +			/* this should only happen if not linked already */
8595 +			BUG_ON(sched->linked == linked);
8596 +
8597 +			/* If we are already scheduled on the CPU to which we
8598 +			 * wanted to link, we don't need to do the swap --
8599 +			 * we just link ourselves to the CPU and depend on
8600 +			 * the caller to get things right.
8601 +			 */
8602 +			if (entry != sched) {
8603 +				TRACE_TASK(linked,
8604 +					   "already scheduled on %d, updating link.\n",
8605 +					   sched->cpu);
8606 +				tmp = sched->linked;
8607 +				linked->rt_param.linked_on = sched->cpu;
8608 +				sched->linked = linked;
8609 +				update_cpu_position(sched);
8610 +				linked = tmp;
8611 +			}
8612 +		}
8613 +		if (linked) /* might be NULL due to swap */
8614 +			linked->rt_param.linked_on = entry->cpu;
8615 +	}
8616 +	entry->linked = linked;
8617 +#ifdef WANT_ALL_SCHED_EVENTS
8618 +	if (linked)
8619 +		TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
8620 +	else
8621 +		TRACE("NULL linked to %d.\n", entry->cpu);
8622 +#endif
8623 +	update_cpu_position(entry);
8624 +}
8625 +
8626 +/* unlink - Make sure a task is not linked any longer to an entry
8627 + *          where it was linked before. Must hold gsnedf_lock.
8628 + */
8629 +static noinline void unlink(struct task_struct* t)
8630 +{
8631 +	cpu_entry_t *entry;
8632 +
8633 +	if (t->rt_param.linked_on != NO_CPU) {
8634 +		/* unlink */
8635 +		entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
8636 +		t->rt_param.linked_on = NO_CPU;
8637 +		link_task_to_cpu(NULL, entry);
8638 +	} else if (is_queued(t)) {
8639 +		/* This is an interesting situation: t is scheduled,
8640 +		 * but was just recently unlinked.  It cannot be
8641 +		 * linked anywhere else (because then it would have
8642 +		 * been relinked to this CPU), thus it must be in some
8643 +		 * queue. We must remove it from the list in this
8644 +		 * case.
8645 +		 */
8646 +		remove(&gsnedf, t);
8647 +	}
8648 +}
8649 +
8650 +
8651 +/* preempt - force a CPU to reschedule
8652 + */
8653 +static void preempt(cpu_entry_t *entry)
8654 +{
8655 +	preempt_if_preemptable(entry->scheduled, entry->cpu);
8656 +}
8657 +
8658 +/* requeue - Put an unlinked task into gsn-edf domain.
8659 + *           Caller must hold gsnedf_lock.
8660 + */
8661 +static noinline void requeue(struct task_struct* task)
8662 +{
8663 +	BUG_ON(!task);
8664 +	/* sanity check before insertion */
8665 +	BUG_ON(is_queued(task));
8666 +
8667 +	if (is_released(task, litmus_clock()))
8668 +		__add_ready(&gsnedf, task);
8669 +	else {
8670 +		/* it has got to wait */
8671 +		add_release(&gsnedf, task);
8672 +	}
8673 +}
8674 +
8675 +#ifdef CONFIG_SCHED_CPU_AFFINITY
8676 +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
8677 +{
8678 +	cpu_entry_t *affinity;
8679 +
8680 +	get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
8681 +#ifdef CONFIG_RELEASE_MASTER
8682 +			gsnedf.release_master
8683 +#else
8684 +			NO_CPU
8685 +#endif
8686 +			);
8687 +
8688 +	return(affinity);
8689 +}
8690 +#endif
8691 +
8692 +/* check for any necessary preemptions */
8693 +static void check_for_preemptions(void)
8694 +{
8695 +	struct task_struct *task;
8696 +	cpu_entry_t *last;
8697 +
8698 +	for (last = lowest_prio_cpu();
8699 +	     edf_preemption_needed(&gsnedf, last->linked);
8700 +	     last = lowest_prio_cpu()) {
8701 +		/* preemption necessary */
8702 +		task = __take_ready(&gsnedf);
8703 +		TRACE("check_for_preemptions: attempting to link task %d to %d\n",
8704 +		      task->pid, last->cpu);
8705 +
8706 +#ifdef CONFIG_SCHED_CPU_AFFINITY
8707 +		{
8708 +			cpu_entry_t *affinity =
8709 +					gsnedf_get_nearest_available_cpu(
8710 +						&per_cpu(gsnedf_cpu_entries, task_cpu(task)));
8711 +			if (affinity)
8712 +				last = affinity;
8713 +			else if (last->linked)
8714 +				requeue(last->linked);
8715 +		}
8716 +#else
8717 +		if (last->linked)
8718 +			requeue(last->linked);
8719 +#endif
8720 +
8721 +		link_task_to_cpu(task, last);
8722 +		preempt(last);
8723 +	}
8724 +}
8725 +
8726 +/* gsnedf_job_arrival: task is either resumed or released */
8727 +static noinline void gsnedf_job_arrival(struct task_struct* task)
8728 +{
8729 +	BUG_ON(!task);
8730 +
8731 +	requeue(task);
8732 +	check_for_preemptions();
8733 +}
8734 +
8735 +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
8736 +{
8737 +	unsigned long flags;
8738 +
8739 +	raw_spin_lock_irqsave(&gsnedf_lock, flags);
8740 +
8741 +	__merge_ready(rt, tasks);
8742 +	check_for_preemptions();
8743 +
8744 +	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
8745 +}
8746 +
8747 +/* caller holds gsnedf_lock */
8748 +static noinline void job_completion(struct task_struct *t, int forced)
8749 +{
8750 +	BUG_ON(!t);
8751 +
8752 +	sched_trace_task_completion(t, forced);
8753 +
8754 +	TRACE_TASK(t, "job_completion().\n");
8755 +
8756 +	/* set flags */
8757 +	set_rt_flags(t, RT_F_SLEEP);
8758 +	/* prepare for next period */
8759 +	prepare_for_next_period(t);
8760 +	if (is_released(t, litmus_clock()))
8761 +		sched_trace_task_release(t);
8762 +	/* unlink */
8763 +	unlink(t);
8764 +	/* requeue
8765 +	 * But don't requeue a blocking task. */
8766 +	if (is_running(t))
8767 +		gsnedf_job_arrival(t);
8768 +}
8769 +
8770 +/* gsnedf_tick - this function is called for every local timer
8771 + *                         interrupt.
8772 + *
8773 + *                   checks whether the current task's budget is exhausted and, if
8774 + *                   so, preempts it (or asks it to leave its non-preemptive section)
8775 + */
8776 +static void gsnedf_tick(struct task_struct* t)
8777 +{
8778 +	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
8779 +		if (!is_np(t)) {
8780 +			/* np tasks will be preempted when they become
8781 +			 * preemptable again
8782 +			 */
8783 +			litmus_reschedule_local();
8784 +			TRACE("gsnedf_scheduler_tick: "
8785 +			      "%d is preemptable "
8786 +			      " => FORCE_RESCHED\n", t->pid);
8787 +		} else if (is_user_np(t)) {
8788 +			TRACE("gsnedf_scheduler_tick: "
8789 +			      "%d is non-preemptable, "
8790 +			      "preemption delayed.\n", t->pid);
8791 +			request_exit_np(t);
8792 +		}
8793 +	}
8794 +}
8795 +
8796 +/* Getting schedule() right is a bit tricky. schedule() may not make any
8797 + * assumptions on the state of the current task since it may be called for a
8798 + * number of reasons: because scheduler_tick() determined that it was
8799 + * necessary, because sys_exit_np() was called, because some Linux
8800 + * subsystem determined so, or even (in the worst case) because there is a bug
8801 + * hidden somewhere. Thus, we must take extreme care to determine what the
8802 + * current state is.
8803 + *
8804 + * The CPU could currently be scheduling a task (or not) and be linked to a task (or not).
8805 + *
8806 + * The following assertions for the scheduled task could hold:
8807 + *
8808 + *      - !is_running(scheduled)        // the job blocks
8809 + *	- scheduled->timeslice == 0	// the job completed (forcefully)
8810 + *	- get_rt_flag() == RT_F_SLEEP	// the job completed (by syscall)
8811 + * 	- linked != scheduled		// we need to reschedule (for any reason)
8812 + * 	- is_np(scheduled)		// rescheduling must be delayed,
8813 + *					   sys_exit_np must be requested
8814 + *
8815 + * Any of these can occur together.
8816 + */
8817 +static struct task_struct* gsnedf_schedule(struct task_struct * prev)
8818 +{
8819 +	cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
8820 +	int out_of_time, sleep, preempt, np, exists, blocks;
8821 +	struct task_struct* next = NULL;
8822 +
8823 +#ifdef CONFIG_RELEASE_MASTER
8824 +	/* Bail out early if we are the release master.
8825 +	 * The release master never schedules any real-time tasks.
8826 +	 */
8827 +	if (unlikely(gsnedf.release_master == entry->cpu)) {
8828 +		sched_state_task_picked();
8829 +		return NULL;
8830 +	}
8831 +#endif
8832 +
8833 +	raw_spin_lock(&gsnedf_lock);
8834 +
8835 +	/* sanity checking */
8836 +	BUG_ON(entry->scheduled && entry->scheduled != prev);
8837 +	BUG_ON(entry->scheduled && !is_realtime(prev));
8838 +	BUG_ON(is_realtime(prev) && !entry->scheduled);
8839 +
8840 +	/* (0) Determine state */
8841 +	exists      = entry->scheduled != NULL;
8842 +	blocks      = exists && !is_running(entry->scheduled);
8843 +	out_of_time = exists &&
8844 +				  budget_enforced(entry->scheduled) &&
8845 +				  budget_exhausted(entry->scheduled);
8846 +	np 	    = exists && is_np(entry->scheduled);
8847 +	sleep	    = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
8848 +	preempt     = entry->scheduled != entry->linked;
8849 +
8850 +#ifdef WANT_ALL_SCHED_EVENTS
8851 +	TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
8852 +#endif
8853 +
8854 +	if (exists)
8855 +		TRACE_TASK(prev,
8856 +			   "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
8857 +			   "state:%d sig:%d\n",
8858 +			   blocks, out_of_time, np, sleep, preempt,
8859 +			   prev->state, signal_pending(prev));
8860 +	if (entry->linked && preempt)
8861 +		TRACE_TASK(prev, "will be preempted by %s/%d\n",
8862 +			   entry->linked->comm, entry->linked->pid);
8863 +
8864 +
8865 +	/* If a task blocks we have no choice but to reschedule.
8866 +	 */
8867 +	if (blocks)
8868 +		unlink(entry->scheduled);
8869 +
8870 +	/* Request a sys_exit_np() call if we would like to preempt but cannot.
8871 +	 * We need to make sure to update the link structure anyway in case
8872 +	 * that we are still linked. Multiple calls to request_exit_np() don't
8873 +	 * hurt.
8874 +	 */
8875 +	if (np && (out_of_time || preempt || sleep)) {
8876 +		unlink(entry->scheduled);
8877 +		request_exit_np(entry->scheduled);
8878 +	}
8879 +
8880 +	/* Any preemptable task that either exhausts its execution budget or
8881 +	 * wants to sleep completes its current job. We may have to reschedule
8882 +	 * after this. Don't do a job completion if we block (can't have timers
8883 +	 * running for blocked jobs). Preemptions go first for the same reason.
8884 +	 */
8885 +	if (!np && (out_of_time || sleep) && !blocks && !preempt)
8886 +		job_completion(entry->scheduled, !sleep);
8887 +
8888 +	/* Link pending task if we became unlinked.
8889 +	 */
8890 +	if (!entry->linked)
8891 +		link_task_to_cpu(__take_ready(&gsnedf), entry);
8892 +
8893 +	/* The final scheduling decision. Do we need to switch for some reason?
8894 +	 * If linked is different from scheduled, then select linked as next.
8895 +	 */
8896 +	if ((!np || blocks) &&
8897 +	    entry->linked != entry->scheduled) {
8898 +		/* Schedule a linked job? */
8899 +		if (entry->linked) {
8900 +			entry->linked->rt_param.scheduled_on = entry->cpu;
8901 +			next = entry->linked;
8902 +			TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id());
8903 +		}
8904 +		if (entry->scheduled) {
8905 +			/* not gonna be scheduled soon */
8906 +			entry->scheduled->rt_param.scheduled_on = NO_CPU;
8907 +			TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
8908 +		}
8909 +	} else
8910 +		/* Only override Linux scheduler if we have a real-time task
8911 +		 * scheduled that needs to continue.
8912 +		 */
8913 +		if (exists)
8914 +			next = prev;
8915 +
8916 +	sched_state_task_picked();
8917 +
8918 +	raw_spin_unlock(&gsnedf_lock);
8919 +
8920 +#ifdef WANT_ALL_SCHED_EVENTS
8921 +	TRACE("gsnedf_lock released, next=0x%p\n", next);
8922 +
8923 +	if (next)
8924 +		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
8925 +	else if (exists && !next)
8926 +		TRACE("becomes idle at %llu.\n", litmus_clock());
8927 +#endif
8928 +
8929 +
8930 +	return next;
8931 +}
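+/* Editorial summary (not part of the original patch): the decision logic of
+ * gsnedf_schedule() above, restated compactly for readability:
+ *
+ *   blocks                                   -> unlink(scheduled)
+ *   np && (out_of_time || preempt || sleep)  -> unlink() + request_exit_np()
+ *   !np && (out_of_time || sleep)
+ *       && !blocks && !preempt               -> job_completion(scheduled, !sleep)
+ *   !linked                                  -> link_task_to_cpu(__take_ready(&gsnedf), entry)
+ *   (!np || blocks) && linked != scheduled   -> next = linked (NULL => go idle)
+ *   otherwise                                -> next = prev, if prev is a real-time task
+ */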
8932 +
8933 +
8934 +/* _finish_switch - we just finished the switch away from prev
8935 + */
8936 +static void gsnedf_finish_switch(struct task_struct *prev)
8937 +{
8938 +	cpu_entry_t* 	entry = &__get_cpu_var(gsnedf_cpu_entries);
8939 +
8940 +	entry->scheduled = is_realtime(current) ? current : NULL;
8941 +#ifdef WANT_ALL_SCHED_EVENTS
8942 +	TRACE_TASK(prev, "switched away from\n");
8943 +#endif
8944 +}
8945 +
8946 +
8947 +/*	Prepare a task for running in RT mode
8948 + */
8949 +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
8950 +{
8951 +	unsigned long 		flags;
8952 +	cpu_entry_t* 		entry;
8953 +
8954 +	TRACE("gsn edf: task new %d\n", t->pid);
8955 +
8956 +	raw_spin_lock_irqsave(&gsnedf_lock, flags);
8957 +
8958 +	/* setup job params */
8959 +	release_at(t, litmus_clock());
8960 +
8961 +	if (running) {
8962 +		entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
8963 +		BUG_ON(entry->scheduled);
8964 +
8965 +#ifdef CONFIG_RELEASE_MASTER
8966 +		if (entry->cpu != gsnedf.release_master) {
8967 +#endif
8968 +			entry->scheduled = t;
8969 +			tsk_rt(t)->scheduled_on = task_cpu(t);
8970 +#ifdef CONFIG_RELEASE_MASTER
8971 +		} else {
8972 +			/* do not schedule on release master */
8973 +			preempt(entry); /* force resched */
8974 +			tsk_rt(t)->scheduled_on = NO_CPU;
8975 +		}
8976 +#endif
8977 +	} else {
8978 +		t->rt_param.scheduled_on = NO_CPU;
8979 +	}
8980 +	t->rt_param.linked_on          = NO_CPU;
8981 +
8982 +	gsnedf_job_arrival(t);
8983 +	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
8984 +}
8985 +
8986 +static void gsnedf_task_wake_up(struct task_struct *task)
8987 +{
8988 +	unsigned long flags;
8989 +	lt_t now;
8990 +
8991 +	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
8992 +
8993 +	raw_spin_lock_irqsave(&gsnedf_lock, flags);
8994 +	/* We need to take suspensions because of semaphores into
8995 +	 * account! If a job resumes after being suspended due to acquiring
8996 +	 * a semaphore, it should never be treated as a new job release.
8997 +	 */
8998 +	if (get_rt_flags(task) == RT_F_EXIT_SEM) {
8999 +		set_rt_flags(task, RT_F_RUNNING);
9000 +	} else {
9001 +		now = litmus_clock();
9002 +		if (is_tardy(task, now)) {
9003 +			/* new sporadic release */
9004 +			release_at(task, now);
9005 +			sched_trace_task_release(task);
9006 +		}
9007 +		else {
9008 +			if (task->rt.time_slice) {
9009 +				/* came back in time before deadline
9010 +				*/
9011 +				set_rt_flags(task, RT_F_RUNNING);
9012 +			}
9013 +		}
9014 +	}
9015 +	gsnedf_job_arrival(task);
9016 +	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
9017 +}
9018 +
9019 +static void gsnedf_task_block(struct task_struct *t)
9020 +{
9021 +	unsigned long flags;
9022 +
9023 +	TRACE_TASK(t, "block at %llu\n", litmus_clock());
9024 +
9025 +	/* unlink if necessary */
9026 +	raw_spin_lock_irqsave(&gsnedf_lock, flags);
9027 +	unlink(t);
9028 +	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
9029 +
9030 +	BUG_ON(!is_realtime(t));
9031 +}
9032 +
9033 +
9034 +static void gsnedf_task_exit(struct task_struct * t)
9035 +{
9036 +	unsigned long flags;
9037 +
9038 +	/* unlink if necessary */
9039 +	raw_spin_lock_irqsave(&gsnedf_lock, flags);
9040 +	unlink(t);
9041 +	if (tsk_rt(t)->scheduled_on != NO_CPU) {
9042 +		gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
9043 +		tsk_rt(t)->scheduled_on = NO_CPU;
9044 +	}
9045 +	raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
9046 +
9047 +	BUG_ON(!is_realtime(t));
9048 +	TRACE_TASK(t, "RIP\n");
9049 +}
9050 +
9051 +
9052 +static long gsnedf_admit_task(struct task_struct* tsk)
9053 +{
9054 +	return 0;
9055 +}
9056 +
9057 +#ifdef CONFIG_LITMUS_LOCKING
9058 +
9059 +#include <litmus/fdso.h>
9060 +
9061 +/* called with IRQs off */
9062 +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
9063 +{
9064 +	int linked_on;
9065 +	int check_preempt = 0;
9066 +
9067 +	raw_spin_lock(&gsnedf_lock);
9068 +
9069 +	TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid);
9070 +	tsk_rt(t)->inh_task = prio_inh;
9071 +
9072 +	linked_on  = tsk_rt(t)->linked_on;
9073 +
9074 +	/* If it is scheduled, then we need to reorder the CPU heap. */
9075 +	if (linked_on != NO_CPU) {
9076 +		TRACE_TASK(t, "%s: linked  on %d\n",
9077 +			   __FUNCTION__, linked_on);
9078 +		/* Holder is scheduled; need to re-order CPUs.
9079 +		 * We can't use heap_decrease() here since
9080 +		 * the cpu_heap is ordered in reverse direction, so
9081 +		 * it is actually an increase. */
9082 +		bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
9083 +			    gsnedf_cpus[linked_on]->hn);
9084 +		bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
9085 +			    gsnedf_cpus[linked_on]->hn);
9086 +	} else {
9087 +		/* holder may be queued: first stop queue changes */
9088 +		raw_spin_lock(&gsnedf.release_lock);
9089 +		if (is_queued(t)) {
9090 +			TRACE_TASK(t, "%s: is queued\n",
9091 +				   __FUNCTION__);
9092 +			/* We need to update the position of holder in some
9093 +			 * heap. Note that this could be a release heap if
9094 +			 * budget enforcement is used and this job overran. */
9095 +			check_preempt =
9096 +				!bheap_decrease(edf_ready_order,
9097 +					       tsk_rt(t)->heap_node);
9098 +		} else {
9099 +			/* Nothing to do: if it is not queued and not linked
9100 +			 * then it is either sleeping or currently being moved
9101 +			 * by other code (e.g., a timer interrupt handler) that
9102 +			 * will use the correct priority when enqueuing the
9103 +			 * task. */
9104 +			TRACE_TASK(t, "%s: is NOT queued => Done.\n",
9105 +				   __FUNCTION__);
9106 +		}
9107 +		raw_spin_unlock(&gsnedf.release_lock);
9108 +
9109 +		/* If holder was enqueued in a release heap, then the following
9110 +		 * preemption check is pointless, but we can't easily detect
9111 +		 * that case. If you want to fix this, then consider that
9112 +		 * simply adding a state flag requires O(n) time to update when
9113 +		 * releasing n tasks, which conflicts with the goal to have
9114 +		 * O(log n) merges. */
9115 +		if (check_preempt) {
9116 +			/* heap_decrease() hit the top level of the heap: make
9117 +			 * sure preemption checks get the right task, not the
9118 +			 * potentially stale cache. */
9119 +			bheap_uncache_min(edf_ready_order,
9120 +					 &gsnedf.ready_queue);
9121 +			check_for_preemptions();
9122 +		}
9123 +	}
9124 +
9125 +	raw_spin_unlock(&gsnedf_lock);
9126 +}
9127 +
9128 +/* called with IRQs off */
9129 +static void clear_priority_inheritance(struct task_struct* t)
9130 +{
9131 +	raw_spin_lock(&gsnedf_lock);
9132 +
9133 +	/* A job only stops inheriting a priority when it releases a
9134 +	 * resource. Thus we can make the following assumption. */
9135 +	BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
9136 +
9137 +	TRACE_TASK(t, "priority restored\n");
9138 +	tsk_rt(t)->inh_task = NULL;
9139 +
9140 +	/* Check if rescheduling is necessary. We can't use heap_decrease()
9141 +	 * since the priority was effectively lowered. */
9142 +	unlink(t);
9143 +	gsnedf_job_arrival(t);
9144 +
9145 +	raw_spin_unlock(&gsnedf_lock);
9146 +}
9147 +
9148 +
9149 +/* ******************** FMLP support ********************** */
9150 +
9151 +/* struct for semaphore with priority inheritance */
9152 +struct fmlp_semaphore {
9153 +	struct litmus_lock litmus_lock;
9154 +
9155 +	/* current resource holder */
9156 +	struct task_struct *owner;
9157 +
9158 +	/* highest-priority waiter */
9159 +	struct task_struct *hp_waiter;
9160 +
9161 +	/* FIFO queue of waiting tasks */
9162 +	wait_queue_head_t wait;
9163 +};
9164 +
9165 +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
9166 +{
9167 +	return container_of(lock, struct fmlp_semaphore, litmus_lock);
9168 +}
9169 +
9170 +/* caller is responsible for locking */
9171 +struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem,
9172 +				   struct task_struct* skip)
9173 +{
9174 +	struct list_head	*pos;
9175 +	struct task_struct 	*queued, *found = NULL;
9176 +
9177 +	list_for_each(pos, &sem->wait.task_list) {
9178 +		queued  = (struct task_struct*) list_entry(pos, wait_queue_t,
9179 +							   task_list)->private;
9180 +
9181 +		/* Compare task prios, find high prio task. */
9182 +		if (queued != skip && edf_higher_prio(queued, found))
9183 +			found = queued;
9184 +	}
9185 +	return found;
9186 +}
9187 +
9188 +int gsnedf_fmlp_lock(struct litmus_lock* l)
9189 +{
9190 +	struct task_struct* t = current;
9191 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
9192 +	wait_queue_t wait;
9193 +	unsigned long flags;
9194 +
9195 +	if (!is_realtime(t))
9196 +		return -EPERM;
9197 +
9198 +	spin_lock_irqsave(&sem->wait.lock, flags);
9199 +
9200 +	if (sem->owner) {
9201 +		/* resource is not free => must suspend and wait */
9202 +
9203 +		init_waitqueue_entry(&wait, t);
9204 +
9205 +		/* FIXME: interruptible would be nice some day */
9206 +		set_task_state(t, TASK_UNINTERRUPTIBLE);
9207 +
9208 +		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
9209 +
9210 +		/* check if we need to activate priority inheritance */
9211 +		if (edf_higher_prio(t, sem->hp_waiter)) {
9212 +			sem->hp_waiter = t;
9213 +			if (edf_higher_prio(t, sem->owner))
9214 +				set_priority_inheritance(sem->owner, sem->hp_waiter);
9215 +		}
9216 +
9217 +		TS_LOCK_SUSPEND;
9218 +
9219 +		/* release lock before sleeping */
9220 +		spin_unlock_irqrestore(&sem->wait.lock, flags);
9221 +
9222 +		/* We depend on the FIFO order.  Thus, we don't need to recheck
9223 +		 * when we wake up; we are guaranteed to have the lock since
9224 +		 * there is only one wake up per release.
9225 +		 */
9226 +
9227 +		schedule();
9228 +
9229 +		TS_LOCK_RESUME;
9230 +
9231 +		/* Since we hold the lock, no other task will change
9232 +		 * ->owner. We can thus check it without acquiring the spin
9233 +		 * lock. */
9234 +		BUG_ON(sem->owner != t);
9235 +	} else {
9236 +		/* it's ours now */
9237 +		sem->owner = t;
9238 +
9239 +		spin_unlock_irqrestore(&sem->wait.lock, flags);
9240 +	}
9241 +
9242 +	return 0;
9243 +}
9244 +
9245 +int gsnedf_fmlp_unlock(struct litmus_lock* l)
9246 +{
9247 +	struct task_struct *t = current, *next;
9248 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
9249 +	unsigned long flags;
9250 +	int err = 0;
9251 +
9252 +	spin_lock_irqsave(&sem->wait.lock, flags);
9253 +
9254 +	if (sem->owner != t) {
9255 +		err = -EINVAL;
9256 +		goto out;
9257 +	}
9258 +
9259 +	/* check if there are jobs waiting for this resource */
9260 +	next = __waitqueue_remove_first(&sem->wait);
9261 +	if (next) {
9262 +		/* next becomes the resource holder */
9263 +		sem->owner = next;
9264 +		TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
9265 +
9266 +		/* determine new hp_waiter if necessary */
9267 +		if (next == sem->hp_waiter) {
9268 +			TRACE_TASK(next, "was highest-prio waiter\n");
9269 +			/* next has the highest priority --- it doesn't need to
9270 +			 * inherit.  However, we need to make sure that the
9271 +			 * next-highest priority in the queue is reflected in
9272 +			 * hp_waiter. */
9273 +			sem->hp_waiter = find_hp_waiter(sem, next);
9274 +			if (sem->hp_waiter)
9275 +				TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n");
9276 +			else
9277 +				TRACE("no further waiters\n");
9278 +		} else {
9279 +			/* Well, if next is not the highest-priority waiter,
9280 +			 * then it ought to inherit the highest-priority
9281 +			 * waiter's priority. */
9282 +			set_priority_inheritance(next, sem->hp_waiter);
9283 +		}
9284 +
9285 +		/* wake up next */
9286 +		wake_up_process(next);
9287 +	} else
9288 +		/* becomes available */
9289 +		sem->owner = NULL;
9290 +
9291 +	/* we lose the benefit of priority inheritance (if any) */
9292 +	if (tsk_rt(t)->inh_task)
9293 +		clear_priority_inheritance(t);
9294 +
9295 +out:
9296 +	spin_unlock_irqrestore(&sem->wait.lock, flags);
9297 +
9298 +	return err;
9299 +}
9300 +
9301 +int gsnedf_fmlp_close(struct litmus_lock* l)
9302 +{
9303 +	struct task_struct *t = current;
9304 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
9305 +	unsigned long flags;
9306 +
9307 +	int owner;
9308 +
9309 +	spin_lock_irqsave(&sem->wait.lock, flags);
9310 +
9311 +	owner = sem->owner == t;
9312 +
9313 +	spin_unlock_irqrestore(&sem->wait.lock, flags);
9314 +
9315 +	if (owner)
9316 +		gsnedf_fmlp_unlock(l);
9317 +
9318 +	return 0;
9319 +}
9320 +
9321 +void gsnedf_fmlp_free(struct litmus_lock* lock)
9322 +{
9323 +	kfree(fmlp_from_lock(lock));
9324 +}
9325 +
9326 +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
9327 +	.close  = gsnedf_fmlp_close,
9328 +	.lock   = gsnedf_fmlp_lock,
9329 +	.unlock = gsnedf_fmlp_unlock,
9330 +	.deallocate = gsnedf_fmlp_free,
9331 +};
9332 +
9333 +static struct litmus_lock* gsnedf_new_fmlp(void)
9334 +{
9335 +	struct fmlp_semaphore* sem;
9336 +
9337 +	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
9338 +	if (!sem)
9339 +		return NULL;
9340 +
9341 +	sem->owner   = NULL;
9342 +	sem->hp_waiter = NULL;
9343 +	init_waitqueue_head(&sem->wait);
9344 +	sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops;
9345 +
9346 +	return &sem->litmus_lock;
9347 +}
9348 +
9349 +/* **** lock constructor **** */
9350 +
9351 +
9352 +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
9353 +				 void* __user unused)
9354 +{
9355 +	int err = -ENXIO;
9356 +
9357 +	/* GSN-EDF currently only supports the FMLP for global resources. */
9358 +	switch (type) {
9359 +
9360 +	case FMLP_SEM:
9361 +		/* Flexible Multiprocessor Locking Protocol */
9362 +		*lock = gsnedf_new_fmlp();
9363 +		if (*lock)
9364 +			err = 0;
9365 +		else
9366 +			err = -ENOMEM;
9367 +		break;
9368 +
9369 +	}
9370 +
9371 +	return err;
9372 +}
9373 +
9374 +#endif
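+/* Editorial sketch (not part of the original patch): how a task would use the
+ * FMLP support above from user space. This assumes the liblitmus helpers
+ * open_fmlp_sem(), litmus_lock(), litmus_unlock() and od_close(); names and
+ * signatures may differ between liblitmus versions.
+ *
+ *	fd = open("my_locks", O_RDONLY | O_CREAT, S_IRUSR);
+ *	od = open_fmlp_sem(fd, 0);    (FMLP_SEM resource #0 on this inode)
+ *	litmus_lock(od);              (may suspend; the owner inherits our priority)
+ *	critical_section();
+ *	litmus_unlock(od);            (ownership passes to the next FIFO waiter)
+ *	od_close(od);
+ */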
9375 +
9376 +
9377 +static long gsnedf_activate_plugin(void)
9378 +{
9379 +	int cpu;
9380 +	cpu_entry_t *entry;
9381 +
9382 +	bheap_init(&gsnedf_cpu_heap);
9383 +#ifdef CONFIG_RELEASE_MASTER
9384 +	gsnedf.release_master = atomic_read(&release_master_cpu);
9385 +#endif
9386 +
9387 +	for_each_online_cpu(cpu) {
9388 +		entry = &per_cpu(gsnedf_cpu_entries, cpu);
9389 +		bheap_node_init(&entry->hn, entry);
9390 +		entry->linked    = NULL;
9391 +		entry->scheduled = NULL;
9392 +#ifdef CONFIG_RELEASE_MASTER
9393 +		if (cpu != gsnedf.release_master) {
9394 +#endif
9395 +			TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
9396 +			update_cpu_position(entry);
9397 +#ifdef CONFIG_RELEASE_MASTER
9398 +		} else {
9399 +			TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
9400 +		}
9401 +#endif
9402 +	}
9403 +	return 0;
9404 +}
9405 +
9406 +/*	Plugin object	*/
9407 +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
9408 +	.plugin_name		= "GSN-EDF",
9409 +	.finish_switch		= gsnedf_finish_switch,
9410 +	.tick			= gsnedf_tick,
9411 +	.task_new		= gsnedf_task_new,
9412 +	.complete_job		= complete_job,
9413 +	.task_exit		= gsnedf_task_exit,
9414 +	.schedule		= gsnedf_schedule,
9415 +	.task_wake_up		= gsnedf_task_wake_up,
9416 +	.task_block		= gsnedf_task_block,
9417 +	.admit_task		= gsnedf_admit_task,
9418 +	.activate_plugin	= gsnedf_activate_plugin,
9419 +#ifdef CONFIG_LITMUS_LOCKING
9420 +	.allocate_lock		= gsnedf_allocate_lock,
9421 +#endif
9422 +};
9423 +
9424 +
9425 +static int __init init_gsn_edf(void)
9426 +{
9427 +	int cpu;
9428 +	cpu_entry_t *entry;
9429 +
9430 +	bheap_init(&gsnedf_cpu_heap);
9431 +	/* initialize CPU state */
9432 +	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
9433 +		entry = &per_cpu(gsnedf_cpu_entries, cpu);
9434 +		gsnedf_cpus[cpu] = entry;
9435 +		entry->cpu 	 = cpu;
9436 +		entry->hn        = &gsnedf_heap_node[cpu];
9437 +		bheap_node_init(&entry->hn, entry);
9438 +	}
9439 +	edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
9440 +	return register_sched_plugin(&gsn_edf_plugin);
9441 +}
9442 +
9443 +
9444 +module_init(init_gsn_edf);
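+/* Editorial sketch (not part of the original patch): how a user-space task is
+ * typically provisioned to run under this plugin, assuming the liblitmus
+ * helpers init_litmus(), set_rt_task_param(), task_mode() and
+ * sleep_next_period(); field names and units may differ between liblitmus
+ * versions, and do_one_job() is a placeholder for the application's work.
+ *
+ *	struct rt_task param;
+ *	memset(&param, 0, sizeof(param));
+ *	param.exec_cost = 10000000;    (10 ms worst-case execution time, in ns)
+ *	param.period    = 100000000;   (100 ms period, in ns)
+ *	param.cls       = RT_CLASS_SOFT;
+ *
+ *	init_litmus();
+ *	set_rt_task_param(gettid(), &param);
+ *	task_mode(LITMUS_RT_TASK);     (admission via gsnedf_admit_task()/task_new())
+ *	while (do_one_job())
+ *		sleep_next_period();   (complete_job(): wait for the next release)
+ *	task_mode(BACKGROUND_TASK);    (teardown via gsnedf_task_exit())
+ */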
9445 diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
9446 new file mode 100644
9447 index 0000000..5a15ce9
9448 --- /dev/null
9449 +++ b/litmus/sched_litmus.c
9450 @@ -0,0 +1,325 @@
9451 +/* This file is included from kernel/sched.c */
9452 +
9453 +#include <litmus/litmus.h>
9454 +#include <litmus/budget.h>
9455 +#include <litmus/sched_plugin.h>
9456 +#include <litmus/preempt.h>
9457 +
9458 +static void update_time_litmus(struct rq *rq, struct task_struct *p)
9459 +{
9460 +	u64 delta = rq->clock - p->se.exec_start;
9461 +	if (unlikely((s64)delta < 0))
9462 +		delta = 0;
9463 +	/* per job counter */
9464 +	p->rt_param.job_params.exec_time += delta;
9465 +	/* task counter */
9466 +	p->se.sum_exec_runtime += delta;
9467 +	/* sched_clock() */
9468 +	p->se.exec_start = rq->clock;
9469 +	cpuacct_charge(p, delta);
9470 +}
9471 +
9472 +static void double_rq_lock(struct rq *rq1, struct rq *rq2);
9473 +static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
9474 +
9475 +/*
9476 + * litmus_tick gets called by scheduler_tick() with HZ freq
9477 + * Interrupts are disabled
9478 + */
9479 +static void litmus_tick(struct rq *rq, struct task_struct *p)
9480 +{
9481 +	TS_PLUGIN_TICK_START;
9482 +
9483 +	if (is_realtime(p))
9484 +		update_time_litmus(rq, p);
9485 +
9486 +	/* plugin tick */
9487 +	litmus->tick(p);
9488 +
9489 +	TS_PLUGIN_TICK_END;
9490 +
9491 +	return;
9492 +}
9493 +
9494 +static struct task_struct *
9495 +litmus_schedule(struct rq *rq, struct task_struct *prev)
9496 +{
9497 +	struct rq* other_rq;
9498 +	struct task_struct *next;
9499 +
9500 +	long was_running;
9501 +	lt_t _maybe_deadlock = 0;
9502 +
9503 +	/* let the plugin schedule */
9504 +	next = litmus->schedule(prev);
9505 +
9506 +	sched_state_plugin_check();
9507 +
9508 +	/* check if a global plugin pulled a task from a different RQ */
9509 +	if (next && task_rq(next) != rq) {
9510 +		/* we need to migrate the task */
9511 +		other_rq = task_rq(next);
9512 +		TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
9513 +
9514 +		/* while we drop the lock, the prev task could change its
9515 +		 * state
9516 +		 */
9517 +		was_running = is_running(prev);
9518 +		mb();
9519 +		raw_spin_unlock(&rq->lock);
9520 +
9521 +		/* Don't race with a concurrent switch.  This could deadlock in
9522 +		 * the case of cross or circular migrations.  It's the job of
9523 +		 * the plugin to make sure that doesn't happen.
9524 +		 */
9525 +		TRACE_TASK(next, "stack_in_use=%d\n",
9526 +			   next->rt_param.stack_in_use);
9527 +		if (next->rt_param.stack_in_use != NO_CPU) {
9528 +			TRACE_TASK(next, "waiting to deschedule\n");
9529 +			_maybe_deadlock = litmus_clock();
9530 +		}
9531 +		while (next->rt_param.stack_in_use != NO_CPU) {
9532 +			cpu_relax();
9533 +			mb();
9534 +			if (next->rt_param.stack_in_use == NO_CPU)
9535 +				TRACE_TASK(next,"descheduled. Proceeding.\n");
9536 +
9537 +			if (lt_before(_maybe_deadlock + 10000000,
9538 +				      litmus_clock())) {
9539 +				/* We've been spinning for 10ms.
9540 +				 * Something can't be right!
9541 +				 * Let's abandon the task and bail out; at least
9542 +				 * we will have debug info instead of a hard
9543 +				 * deadlock.
9544 +				 */
9545 +				TRACE_TASK(next,"stack too long in use. "
9546 +					   "Deadlock?\n");
9547 +				next = NULL;
9548 +
9549 +				/* bail out */
9550 +				raw_spin_lock(&rq->lock);
9551 +				return next;
9552 +			}
9553 +		}
9554 +#ifdef  __ARCH_WANT_UNLOCKED_CTXSW
9555 +		if (next->oncpu)
9556 +			TRACE_TASK(next, "waiting for !oncpu");
9557 +		while (next->oncpu) {
9558 +			cpu_relax();
9559 +			mb();
9560 +		}
9561 +#endif
9562 +		double_rq_lock(rq, other_rq);
9563 +		mb();
9564 +		if (is_realtime(prev) && is_running(prev) != was_running) {
9565 +			TRACE_TASK(prev,
9566 +				   "state changed while we dropped"
9567 +				   " the lock: is_running=%d, was_running=%d\n",
9568 +				   is_running(prev), was_running);
9569 +			if (is_running(prev) && !was_running) {
9570 +				/* prev task became unblocked
9571 +				/* The prev task became unblocked while we
9572 +				 * dropped the lock; simulate the normal
9573 +				 * sequence of events for the scheduler plugin.
9574 +				litmus->task_block(prev);
9575 +				litmus->task_wake_up(prev);
9576 +			}
9577 +		}
9578 +
9579 +		set_task_cpu(next, smp_processor_id());
9580 +
9581 +		/* DEBUG: now that we have the lock we need to make sure a
9582 +		 *  couple of things still hold:
9583 +		 *  - it is still a real-time task
9584 +		 *  - it is still runnable (could have been stopped)
9585 +		 * If either is violated, then the active plugin is
9586 +		 * doing something wrong.
9587 +		 */
9588 +		if (!is_realtime(next) || !is_running(next)) {
9589 +			/* BAD BAD BAD */
9590 +			TRACE_TASK(next,"BAD: migration invariant FAILED: "
9591 +				   "rt=%d running=%d\n",
9592 +				   is_realtime(next),
9593 +				   is_running(next));
9594 +			/* drop the task */
9595 +			next = NULL;
9596 +		}
9597 +		/* release the other CPU's runqueue, but keep ours */
9598 +		raw_spin_unlock(&other_rq->lock);
9599 +	}
9600 +	if (next) {
9601 +		next->rt_param.stack_in_use = rq->cpu;
9602 +		next->se.exec_start = rq->clock;
9603 +	}
9604 +
9605 +	update_enforcement_timer(next);
9606 +	return next;
9607 +}
9608 +
9609 +static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
9610 +				int flags)
9611 +{
9612 +	if (flags & ENQUEUE_WAKEUP) {
9613 +		sched_trace_task_resume(p);
9614 +		tsk_rt(p)->present = 1;
9615 +		/* LITMUS^RT plugins need to update the state
9616 +		 * _before_ making it available in global structures.
9617 +		 * Linux gets away with being lazy about the task state
9618 +		 * update. We can't do that, hence we update the task
9619 +		 * state already here.
9620 +		 *
9621 +		 * WARNING: this needs to be re-evaluated when porting
9622 +		 *          to newer kernel versions.
9623 +		 */
9624 +		p->state = TASK_RUNNING;
9625 +		litmus->task_wake_up(p);
9626 +
9627 +		rq->litmus.nr_running++;
9628 +	} else
9629 +		TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
9630 +}
9631 +
9632 +static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
9633 +				int flags)
9634 +{
9635 +	if (flags & DEQUEUE_SLEEP) {
9636 +		litmus->task_block(p);
9637 +		tsk_rt(p)->present = 0;
9638 +		sched_trace_task_block(p);
9639 +
9640 +		rq->litmus.nr_running--;
9641 +	} else
9642 +		TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
9643 +}
9644 +
9645 +static void yield_task_litmus(struct rq *rq)
9646 +{
9647 +	BUG_ON(rq->curr != current);
9648 +	/* sched_yield() is called to trigger delayed preemptions.
9649 +	 * Thus, mark the current task as needing to be rescheduled.
9650 +	 * This will cause the scheduler plugin to be invoked, which can
9651 +	 * then determine if a preemption is still required.
9652 +	 */
9653 +	clear_exit_np(current);
9654 +	litmus_reschedule_local();
9655 +}
9656 +
9657 +/* Plugins are responsible for this.
9658 + */
9659 +static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
9660 +{
9661 +}
9662 +
9663 +static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
9664 +{
9665 +}
9666 +
9667 +static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
9668 +{
9669 +	update_time_litmus(rq, prev);
9670 +	if (!is_running(prev))
9671 +		tsk_rt(prev)->present = 0;
9672 +}
9673 +
9674 +/* pick_next_task_litmus() - wrapper around litmus_schedule()
9675 + *
9676 + * returns the next task to be scheduled
9677 + */
9678 +static struct task_struct *pick_next_task_litmus(struct rq *rq)
9679 +{
9680 +	/* get the to-be-switched-out task (prev) */
9681 +	struct task_struct *prev = rq->litmus.prev;
9682 +	struct task_struct *next;
9683 +
9684 +	/* if not called from schedule() but from somewhere
9685 +	 * else (e.g., migration), return now!
9686 +	 */
9687 +	if (!rq->litmus.prev)
9688 +		return NULL;
9689 +
9690 +	rq->litmus.prev = NULL;
9691 +
9692 +	TS_PLUGIN_SCHED_START;
9693 +	next = litmus_schedule(rq, prev);
9694 +	TS_PLUGIN_SCHED_END;
9695 +
9696 +	return next;
9697 +}
9698 +
9699 +static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
9700 +{
9701 +	/* nothing to do; tick-related work is done by litmus_tick() */
9702 +	return;
9703 +}
9704 +
9705 +static void switched_to_litmus(struct rq *rq, struct task_struct *p)
9706 +{
9707 +}
9708 +
9709 +static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
9710 +				int oldprio)
9711 +{
9712 +}
9713 +
9714 +unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
9715 +{
9716 +	/* return infinity */
9717 +	return 0;
9718 +}
9719 +
9720 +/* This is called when a task becomes a real-time task, either due to a SCHED_*
9721 + * class transition or due to PI mutex inheritance. We don't handle Linux PI
9722 + * mutex inheritance yet (and probably never will). Use LITMUS^RT-provided
9723 + * synchronization primitives instead.
9724 + */
9725 +static void set_curr_task_litmus(struct rq *rq)
9726 +{
9727 +	rq->curr->se.exec_start = rq->clock;
9728 +}
9729 +
9730 +
9731 +#ifdef CONFIG_SMP
9732 +/* execve tries to rebalance the task in this scheduling domain.
9733 + * We don't care about the scheduling domain; this can get called from
9734 + * exec, fork, and wakeup.
9735 + */
9736 +static int
9737 +select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
9738 +{
9739 +	/* preemption is already disabled.
9740 +	 * We don't want to change cpu here
9741 +	 */
9742 +	return task_cpu(p);
9743 +}
9744 +#endif
9745 +
9746 +static const struct sched_class litmus_sched_class = {
9747 +	/* From 34f971f6 the stop/migrate worker threads have a class on
9748 +	 * their own, which is the highest prio class. We don't support
9749 +	 * cpu-hotplug or cpu throttling. This allows LITMUS^RT to use up to
9750 +	 * 1.0 CPU capacity.
9751 +	 */
9752 +	.next			= &stop_sched_class,
9753 +	.enqueue_task		= enqueue_task_litmus,
9754 +	.dequeue_task		= dequeue_task_litmus,
9755 +	.yield_task		= yield_task_litmus,
9756 +
9757 +	.check_preempt_curr	= check_preempt_curr_litmus,
9758 +
9759 +	.pick_next_task		= pick_next_task_litmus,
9760 +	.put_prev_task		= put_prev_task_litmus,
9761 +
9762 +#ifdef CONFIG_SMP
9763 +	.select_task_rq		= select_task_rq_litmus,
9764 +
9765 +	.pre_schedule		= pre_schedule_litmus,
9766 +#endif
9767 +
9768 +	.set_curr_task          = set_curr_task_litmus,
9769 +	.task_tick		= task_tick_litmus,
9770 +
9771 +	.get_rr_interval	= get_rr_interval_litmus,
9772 +
9773 +	.prio_changed		= prio_changed_litmus,
9774 +	.switched_to		= switched_to_litmus,
9775 +};
9776 diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
9777 new file mode 100644
9778 index 0000000..16f1065
9779 --- /dev/null
9780 +++ b/litmus/sched_pfair.c
9781 @@ -0,0 +1,1067 @@
9782 +/*
9783 + * litmus/sched_pfair.c
9784 + *
9785 + * Implementation of the PD^2 pfair scheduling algorithm. This
9786 + * implementation realizes "early releasing," i.e., it is work-conserving.
9787 + *
9788 + */
9789 +
9790 +#include <asm/div64.h>
9791 +#include <linux/delay.h>
9792 +#include <linux/module.h>
9793 +#include <linux/spinlock.h>
9794 +#include <linux/percpu.h>
9795 +#include <linux/sched.h>
9796 +#include <linux/list.h>
9797 +#include <linux/slab.h>
9798 +
9799 +#include <litmus/litmus.h>
9800 +#include <litmus/jobs.h>
9801 +#include <litmus/preempt.h>
9802 +#include <litmus/rt_domain.h>
9803 +#include <litmus/sched_plugin.h>
9804 +#include <litmus/sched_trace.h>
9805 +
9806 +#include <litmus/bheap.h>
9807 +
9808 +/* to configure the cluster size */
9809 +#include <litmus/litmus_proc.h>
9810 +
9811 +#include <litmus/clustered.h>
9812 +
9813 +static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
9814 +
9815 +struct subtask {
9816 +	/* measured in quanta relative to job release */
9817 +	quanta_t release;
9818 +	quanta_t deadline;
9819 +	quanta_t overlap; /* called "b bit" by PD^2 */
9820 +	quanta_t group_deadline;
9821 +};
9822 +
9823 +struct pfair_param   {
9824 +	quanta_t	quanta;       /* number of subtasks */
9825 +	quanta_t	cur;          /* index of current subtask */
9826 +
9827 +	quanta_t	release;      /* in quanta */
9828 +	quanta_t	period;       /* in quanta */
9829 +
9830 +	quanta_t	last_quantum; /* when scheduled last */
9831 +	int		last_cpu;     /* where scheduled last */
9832 +
9833 +	struct pfair_cluster* cluster; /* where this task is scheduled */
9834 +
9835 +	struct subtask subtasks[0];   /* allocate together with pfair_param */
9836 +};
9837 +
9838 +#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
9839 +
9840 +struct pfair_state {
9841 +	struct cluster_cpu topology;
9842 +
9843 +	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
9844 +				        * the time */
9845 +	volatile quanta_t local_tick;  /* What tick is the local CPU currently
9846 +				        * executing? Updated only by the local
9847 +				        * CPU. In QEMU, this may lag behind the
9848 +				        * current tick. In a real system, with
9849 +				        * proper timers and aligned quanta,
9850 +				        * that should only be the case for a
9851 +				        * very short time after the time
9852 +				        * advanced. With staggered quanta, it
9853 +				        * will lag for the duration of the
9854 +				        * offset.
9855 +					*/
9856 +
9857 +	struct task_struct* linked;    /* the task that should be executing */
9858 +	struct task_struct* local;     /* the local copy of linked          */
9859 +	struct task_struct* scheduled; /* what is actually scheduled        */
9860 +
9861 +	lt_t offset;			/* stagger offset */
9862 +	unsigned int missed_updates;
9863 +	unsigned int missed_quanta;
9864 +};
9865 +
9866 +struct pfair_cluster {
9867 +	struct scheduling_cluster topology;
9868 +
9869 +	/* The "global" time in this cluster. */
9870 +	quanta_t pfair_time; /* the "official" PFAIR clock */
9871 +
9872 +	/* The ready queue for this cluster. */
9873 +	rt_domain_t pfair;
9874 +
9875 +	/* The set of jobs that should have their release enacted at the next
9876 +	 * quantum boundary.
9877 +	 */
9878 +	struct bheap release_queue;
9879 +	raw_spinlock_t release_lock;
9880 +};
9881 +
9882 +#define RT_F_REQUEUE 0x2
9883 +
9884 +static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
9885 +{
9886 +	return container_of(state->topology.cluster, struct pfair_cluster, topology);
9887 +}
9888 +
9889 +static inline int cpu_id(struct pfair_state* state)
9890 +{
9891 +	return state->topology.id;
9892 +}
9893 +
9894 +static inline struct pfair_state* from_cluster_list(struct list_head* pos)
9895 +{
9896 +	return list_entry(pos, struct pfair_state, topology.cluster_list);
9897 +}
9898 +
9899 +static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
9900 +{
9901 +	return container_of(rt, struct pfair_cluster, pfair);
9902 +}
9903 +
9904 +static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
9905 +{
9906 +	/* The ready_lock is used to serialize all scheduling events. */
9907 +	return &cluster->pfair.ready_lock;
9908 +}
9909 +
9910 +static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
9911 +{
9912 +	return cluster_lock(cpu_cluster(state));
9913 +}
9914 +
9915 +DEFINE_PER_CPU(struct pfair_state, pfair_state);
9916 +struct pfair_state **pstate; /* shortcut */
9917 +
9918 +static struct pfair_cluster* pfair_clusters;
9919 +static int num_pfair_clusters;
9920 +
9921 +/* Enable for lots of trace info.
9922 + * #define PFAIR_DEBUG
9923 + */
9924 +
9925 +#ifdef PFAIR_DEBUG
9926 +#define PTRACE_TASK(t, f, args...)  TRACE_TASK(t, f, ## args)
9927 +#define PTRACE(f, args...) TRACE(f, ## args)
9928 +#else
9929 +#define PTRACE_TASK(t, f, args...)
9930 +#define PTRACE(f, args...)
9931 +#endif
9932 +
9933 +/* gcc will inline all of these accessor functions... */
9934 +static struct subtask* cur_subtask(struct task_struct* t)
9935 +{
9936 +	return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
9937 +}
9938 +
9939 +static quanta_t cur_deadline(struct task_struct* t)
9940 +{
9941 +	return cur_subtask(t)->deadline +  tsk_pfair(t)->release;
9942 +}
9943 +
9944 +static quanta_t cur_release(struct task_struct* t)
9945 +{
9946 +	/* This is early releasing: only the release of the first subtask
9947 +	 * counts. */
9948 +	return tsk_pfair(t)->release;
9949 +}
9950 +
9951 +static quanta_t cur_overlap(struct task_struct* t)
9952 +{
9953 +	return cur_subtask(t)->overlap;
9954 +}
9955 +
9956 +static quanta_t cur_group_deadline(struct task_struct* t)
9957 +{
9958 +	quanta_t gdl = cur_subtask(t)->group_deadline;
9959 +	if (gdl)
9960 +		return gdl + tsk_pfair(t)->release;
9961 +	else
9962 +		return gdl;
9963 +}
9964 +
9965 +
9966 +static int pfair_higher_prio(struct task_struct* first,
9967 +			     struct task_struct* second)
9968 +{
9969 +	return  /* first task must exist */
9970 +		first && (
9971 +		/* Does the second task exist and is it a real-time task?  If
9972 +		 * not, the first task (which is a RT task) has higher
9973 +		 * priority.
9974 +		 */
9975 +		!second || !is_realtime(second)  ||
9976 +
9977 +		/* Is the (subtask) deadline of the first task earlier?
9978 +		 * Then it has higher priority.
9979 +		 */
9980 +		time_before(cur_deadline(first), cur_deadline(second)) ||
9981 +
9982 +		/* Do we have a deadline tie?
9983 +		 * Then break by B-bit.
9984 +		 */
9985 +		(cur_deadline(first) == cur_deadline(second) &&
9986 +		 (cur_overlap(first) > cur_overlap(second) ||
9987 +
9988 +		/* Do we have a B-bit tie?
9989 +		 * Then break by group deadline.
9990 +		 */
9991 +		(cur_overlap(first) == cur_overlap(second) &&
9992 +		 (time_after(cur_group_deadline(first),
9993 +			     cur_group_deadline(second)) ||
9994 +
9995 +		/* Do we have a group deadline tie?
9996 +		 * Then break by PID, which are unique.
9997 +		 */
9998 +		(cur_group_deadline(first) ==
9999 +		 cur_group_deadline(second) &&
10000 +		 first->pid < second->pid))))));
10001 +}
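+/* Editorial example (not part of the original patch): for two subtasks with
+ * equal deadlines, the one with b-bit 1 beats the one with b-bit 0; if both
+ * have b-bit 1, the one with the *later* group deadline wins (note the
+ * time_after() above); if those are equal as well, the lower PID wins, which
+ * makes the order total.
+ */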
10002 +
10003 +int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
10004 +{
10005 +	return pfair_higher_prio(bheap2task(a), bheap2task(b));
10006 +}
10007 +
10008 +static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
10009 +{
10010 +	struct pfair_cluster* cluster = from_domain(rt);
10011 +	unsigned long flags;
10012 +
10013 +	raw_spin_lock_irqsave(&cluster->release_lock, flags);
10014 +
10015 +	bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
10016 +
10017 +	raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
10018 +}
10019 +
10020 +static void prepare_release(struct task_struct* t, quanta_t at)
10021 +{
10022 +	tsk_pfair(t)->release    = at;
10023 +	tsk_pfair(t)->cur        = 0;
10024 +}
10025 +
10026 +/* pull released tasks from the release queue */
10027 +static void poll_releases(struct pfair_cluster* cluster)
10028 +{
10029 +	raw_spin_lock(&cluster->release_lock);
10030 +	__merge_ready(&cluster->pfair, &cluster->release_queue);
10031 +	raw_spin_unlock(&cluster->release_lock);
10032 +}
10033 +
10034 +static void check_preempt(struct task_struct* t)
10035 +{
10036 +	int cpu = NO_CPU;
10037 +	if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
10038 +	    tsk_rt(t)->present) {
10039 +		/* the task can be scheduled and
10040 +		 * is not scheduled where it ought to be scheduled
10041 +		 */
10042 +		cpu = tsk_rt(t)->linked_on != NO_CPU ?
10043 +			tsk_rt(t)->linked_on         :
10044 +			tsk_rt(t)->scheduled_on;
10045 +		PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
10046 +			   tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
10047 +		/* preempt */
10048 +		litmus_reschedule(cpu);
10049 +	}
10050 +}
10051 +
10052 +/* caller must hold pfair.ready_lock */
10053 +static void drop_all_references(struct task_struct *t)
10054 +{
10055 +	int cpu;
10056 +	struct pfair_state* s;
10057 +	struct pfair_cluster* cluster;
10058 +	if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
10059 +		/* It must be in the ready queue; drop references isn't called
10060 +		 * when the job is in a release queue. */
10061 +		cluster = tsk_pfair(t)->cluster;
10062 +		bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
10063 +			     tsk_rt(t)->heap_node);
10064 +	}
10065 +	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
10066 +		s = &per_cpu(pfair_state, cpu);
10067 +		if (s->linked == t)
10068 +			s->linked = NULL;
10069 +		if (s->local == t)
10070 +			s->local = NULL;
10071 +		if (s->scheduled == t)
10072 +			s->scheduled = NULL;
10073 +	}
10074 +	/* make sure we don't have a stale linked_on field */
10075 +	tsk_rt(t)->linked_on = NO_CPU;
10076 +}
10077 +
10078 +static void pfair_prepare_next_period(struct task_struct* t)
10079 +{
10080 +	struct pfair_param* p = tsk_pfair(t);
10081 +
10082 +	prepare_for_next_period(t);
10083 +	set_rt_flags(t, RT_F_RUNNING);
10084 +	p->release += p->period;
10085 +}
10086 +
10087 +/* returns 1 if the task needs to go the release queue */
10088 +static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
10089 +{
10090 +	struct pfair_param* p = tsk_pfair(t);
10091 +	int to_relq;
10092 +	p->cur = (p->cur + 1) % p->quanta;
10093 +	if (!p->cur) {
10094 +		if (tsk_rt(t)->present) {
10095 +			/* The job overran; we start a new budget allocation. */
10096 +			pfair_prepare_next_period(t);
10097 +		} else {
10098 +			/* remove task from system until it wakes */
10099 +			drop_all_references(t);
10100 +			tsk_rt(t)->flags = RT_F_REQUEUE;
10101 +			TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
10102 +				   cpu, p->cur);
10103 +			return 0;
10104 +		}
10105 +	}
10106 +	to_relq = time_after(cur_release(t), time);
10107 +	TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
10108 +		   cpu, p->cur, to_relq, cur_release(t), time);
10109 +	return to_relq;
10110 +}
10111 +
10112 +static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
10113 +{
10114 +	struct task_struct* l;
10115 +	struct pfair_param* p;
10116 +	struct list_head* pos;
10117 +	struct pfair_state* cpu;
10118 +
10119 +	list_for_each(pos, &cluster->topology.cpus) {
10120 +		cpu = from_cluster_list(pos);
10121 +		l = cpu->linked;
10122 +		cpu->missed_updates += cpu->linked != cpu->local;
10123 +		if (l) {
10124 +			p = tsk_pfair(l);
10125 +			p->last_quantum = time;
10126 +			p->last_cpu     =  cpu_id(cpu);
10127 +			if (advance_subtask(time, l, cpu_id(cpu))) {
10128 +				//cpu->linked = NULL;
10129 +				PTRACE_TASK(l, "should go to release queue. "
10130 +					    "scheduled_on=%d present=%d\n",
10131 +					    tsk_rt(l)->scheduled_on,
10132 +					    tsk_rt(l)->present);
10133 +			}
10134 +		}
10135 +	}
10136 +}
10137 +
10138 +static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
10139 +{
10140 +	int cpu;
10141 +	if (tsk_rt(t)->scheduled_on != NO_CPU) {
10142 +		/* always observe scheduled_on linkage */
10143 +		default_cpu = tsk_rt(t)->scheduled_on;
10144 +	} else if (tsk_pfair(t)->last_quantum == time - 1) {
10145 +		/* back2back quanta */
10146 +		/* Only observe last_quantum if no scheduled_on is in the way.
10147 +		 * This should only kick in if a CPU missed quanta, and that
10148 +		 * *should* only happen in QEMU.
10149 +		 */
10150 +		cpu = tsk_pfair(t)->last_cpu;
10151 +		if (!pstate[cpu]->linked ||
10152 +		    tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
10153 +			default_cpu = cpu;
10154 +		}
10155 +	}
10156 +	return default_cpu;
10157 +}
10158 +
10159 +/* returns one if linking was redirected */
10160 +static int pfair_link(quanta_t time, int cpu,
10161 +		      struct task_struct* t)
10162 +{
10163 +	int target = target_cpu(time, t, cpu);
10164 +	struct task_struct* prev  = pstate[cpu]->linked;
10165 +	struct task_struct* other;
10166 +	struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
10167 +
10168 +	if (target != cpu) {
10169 +		BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
10170 +		other = pstate[target]->linked;
10171 +		pstate[target]->linked = t;
10172 +		tsk_rt(t)->linked_on   = target;
10173 +		if (!other)
10174 +			/* linked ok, but reschedule this CPU */
10175 +			return 1;
10176 +		if (target < cpu) {
10177 +			/* link other to cpu instead */
10178 +			tsk_rt(other)->linked_on = cpu;
10179 +			pstate[cpu]->linked      = other;
10180 +			if (prev) {
10181 +				/* prev got pushed back into the ready queue */
10182 +				tsk_rt(prev)->linked_on = NO_CPU;
10183 +				__add_ready(&cluster->pfair, prev);
10184 +			}
10185 +			/* we are done with this cpu */
10186 +			return 0;
10187 +		} else {
10188 +			/* re-add other, its original CPU was not considered yet */
10189 +			tsk_rt(other)->linked_on = NO_CPU;
10190 +			__add_ready(&cluster->pfair, other);
10191 +			/* reschedule this CPU */
10192 +			return 1;
10193 +		}
10194 +	} else {
10195 +		pstate[cpu]->linked  = t;
10196 +		tsk_rt(t)->linked_on = cpu;
10197 +		if (prev) {
10198 +			/* prev got pushed back into the ready queue */
10199 +			tsk_rt(prev)->linked_on = NO_CPU;
10200 +			__add_ready(&cluster->pfair, prev);
10201 +		}
10202 +		/* we are done with this CPU */
10203 +		return 0;
10204 +	}
10205 +}
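+/* Editorial example (not part of the original patch): schedule_subtasks()
+ * below walks the cluster's CPUs in order. Suppose CPU 2 pulls task T from
+ * the ready queue but T is still scheduled on CPU 0: target_cpu() redirects T
+ * to CPU 0. Since 0 < 2, whatever CPU 0 had linked in this pass is linked to
+ * CPU 2 instead, and CPU 2's previous link is pushed back to the ready queue.
+ * Had the target been a CPU *after* CPU 2, that CPU's current link would be
+ * pushed back to the ready queue (it has not been considered yet) and CPU 2
+ * would retry with the next ready task.
+ */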
10206 +
10207 +static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
10208 +{
10209 +	int retry;
10210 +	struct list_head *pos;
10211 +	struct pfair_state *cpu_state;
10212 +
10213 +	list_for_each(pos, &cluster->topology.cpus) {
10214 +		cpu_state = from_cluster_list(pos);
10215 +		retry = 1;
10216 +#ifdef CONFIG_RELEASE_MASTER
10217 +		/* skip release master */
10218 +		if (cluster->pfair.release_master == cpu_id(cpu_state))
10219 +			continue;
10220 +#endif
10221 +		while (retry) {
10222 +			if (pfair_higher_prio(__peek_ready(&cluster->pfair),
10223 +					      cpu_state->linked))
10224 +				retry = pfair_link(time, cpu_id(cpu_state),
10225 +						   __take_ready(&cluster->pfair));
10226 +			else
10227 +				retry = 0;
10228 +		}
10229 +	}
10230 +}
10231 +
10232 +static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
10233 +{
10234 +	struct pfair_state *cpu;
10235 +	struct list_head* pos;
10236 +
10237 +	/* called with interrupts disabled */
10238 +	PTRACE("--- Q %lu at %llu PRE-SPIN\n",
10239 +	       time, litmus_clock());
10240 +	raw_spin_lock(cluster_lock(cluster));
10241 +	PTRACE("<<< Q %lu at %llu\n",
10242 +	       time, litmus_clock());
10243 +
10244 +	sched_trace_quantum_boundary();
10245 +
10246 +	advance_subtasks(cluster, time);
10247 +	poll_releases(cluster);
10248 +	schedule_subtasks(cluster, time);
10249 +
10250 +	list_for_each(pos, &cluster->topology.cpus) {
10251 +		cpu = from_cluster_list(pos);
10252 +		if (cpu->linked)
10253 +			PTRACE_TASK(cpu->linked,
10254 +				    " linked on %d.\n", cpu_id(cpu));
10255 +		else
10256 +			PTRACE("(null) linked on %d.\n", cpu_id(cpu));
10257 +	}
10258 +	/* We are done. Advance time. */
10259 +	mb();
10260 +	list_for_each(pos, &cluster->topology.cpus) {
10261 +		cpu = from_cluster_list(pos);
10262 +		if (cpu->local_tick != cpu->cur_tick) {
10263 +			TRACE("BAD Quantum not acked on %d "
10264 +			      "(l:%lu c:%lu p:%lu)\n",
10265 +			      cpu_id(cpu),
10266 +			      cpu->local_tick,
10267 +			      cpu->cur_tick,
10268 +			      cluster->pfair_time);
10269 +			cpu->missed_quanta++;
10270 +		}
10271 +		cpu->cur_tick = time;
10272 +	}
10273 +	PTRACE(">>> Q %lu at %llu\n",
10274 +	       time, litmus_clock());
10275 +	raw_spin_unlock(cluster_lock(cluster));
10276 +}
10277 +
10278 +static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
10279 +{
10280 +	quanta_t loc;
10281 +
10282 +	goto first; /* skip mb() on first iteration */
10283 +	do {
10284 +		cpu_relax();
10285 +		mb();
10286 +	first:	loc = state->cur_tick;
10287 +		/* FIXME: what if loc > cur? */
10288 +	} while (time_before(loc, q));
10289 +	PTRACE("observed cur_tick:%lu >= q:%lu\n",
10290 +	       loc, q);
10291 +}
10292 +
10293 +static quanta_t current_quantum(struct pfair_state* state)
10294 +{
10295 +	lt_t t = litmus_clock() - state->offset;
10296 +	return time2quanta(t, FLOOR);
10297 +}
10298 +
10299 +static void catchup_quanta(quanta_t from, quanta_t target,
10300 +			   struct pfair_state* state)
10301 +{
10302 +	quanta_t cur = from, time;
10303 +	TRACE("+++< BAD catching up quanta from %lu to %lu\n",
10304 +	      from, target);
10305 +	while (time_before(cur, target)) {
10306 +		wait_for_quantum(cur, state);
10307 +		cur++;
10308 +		time = cmpxchg(&cpu_cluster(state)->pfair_time,
10309 +			       cur - 1,   /* expected */
10310 +			       cur        /* next     */
10311 +			);
10312 +		if (time == cur - 1)
10313 +			schedule_next_quantum(cpu_cluster(state), cur);
10314 +	}
10315 +	TRACE("+++> catching up done\n");
10316 +}
10317 +
10318 +/* pfair_tick - this function is called for every local timer
10319 + *                         interrupt.
10320 + */
10321 +static void pfair_tick(struct task_struct* t)
10322 +{
10323 +	struct pfair_state* state = &__get_cpu_var(pfair_state);
10324 +	quanta_t time, cur;
10325 +	int retry = 10;
10326 +
10327 +	do {
10328 +		cur  = current_quantum(state);
10329 +		PTRACE("q %lu at %llu\n", cur, litmus_clock());
10330 +
10331 +		/* Attempt to advance time. First CPU to get here
10332 +		 * will prepare the next quantum.
10333 +		 */
10334 +		time = cmpxchg(&cpu_cluster(state)->pfair_time,
10335 +			       cur - 1,   /* expected */
10336 +			       cur        /* next     */
10337 +			);
10338 +		if (time == cur - 1) {
10339 +			/* exchange succeeded */
10340 +			wait_for_quantum(cur - 1, state);
10341 +			schedule_next_quantum(cpu_cluster(state), cur);
10342 +			retry = 0;
10343 +		} else if (time_before(time, cur - 1)) {
10344 +			/* the whole system missed a tick !? */
10345 +			catchup_quanta(time, cur, state);
10346 +			retry--;
10347 +		} else if (time_after(time, cur)) {
10348 +			/* our timer lagging behind!? */
10349 +			TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
10350 +			retry--;
10351 +		} else {
10352 +			/* Some other CPU already started scheduling
10353 +			 * this quantum. Let it do its job and then update.
10354 +			 */
10355 +			retry = 0;
10356 +		}
10357 +	} while (retry);
10358 +
10359 +	/* Spin locally until time advances. */
10360 +	wait_for_quantum(cur, state);
10361 +
10362 +	/* copy assignment */
10363 +	/* FIXME: what if we race with a future update? Corrupted state? */
10364 +	state->local      = state->linked;
10365 +	/* signal that we are done */
10366 +	mb();
10367 +	state->local_tick = state->cur_tick;
10368 +
10369 +	if (state->local != current
10370 +	    && (is_realtime(current) || is_present(state->local)))
10371 +		litmus_reschedule_local();
10372 +}
10373 +
10374 +static int safe_to_schedule(struct task_struct* t, int cpu)
10375 +{
10376 +	int where = tsk_rt(t)->scheduled_on;
10377 +	if (where != NO_CPU && where != cpu) {
10378 +		TRACE_TASK(t, "BAD: can't be scheduled on %d, "
10379 +			   "scheduled already on %d.\n", cpu, where);
10380 +		return 0;
10381 +	} else
10382 +		return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
10383 +}
10384 +
10385 +static struct task_struct* pfair_schedule(struct task_struct * prev)
10386 +{
10387 +	struct pfair_state* state = &__get_cpu_var(pfair_state);
10388 +	struct pfair_cluster* cluster = cpu_cluster(state);
10389 +	int blocks, completion, out_of_time;
10390 +	struct task_struct* next = NULL;
10391 +
10392 +#ifdef CONFIG_RELEASE_MASTER
10393 +	/* Bail out early if we are the release master.
10394 +	 * The release master never schedules any real-time tasks.
10395 +	 */
10396 +	if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
10397 +		sched_state_task_picked();
10398 +		return NULL;
10399 +	}
10400 +#endif
10401 +
10402 +	raw_spin_lock(cpu_lock(state));
10403 +
10404 +	blocks      = is_realtime(prev) && !is_running(prev);
10405 +	completion  = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP;
10406 +	out_of_time = is_realtime(prev) && time_after(cur_release(prev),
10407 +						      state->local_tick);
10408 +
10409 +	if (is_realtime(prev))
10410 +	    PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
10411 +			blocks, completion, out_of_time);
10412 +
10413 +	if (completion) {
10414 +		sched_trace_task_completion(prev, 0);
10415 +		pfair_prepare_next_period(prev);
10416 +		prepare_release(prev, cur_release(prev));
10417 +	}
10418 +
10419 +	if (!blocks && (completion || out_of_time)) {
10420 +		drop_all_references(prev);
10421 +		sched_trace_task_release(prev);
10422 +		add_release(&cluster->pfair, prev);
10423 +	}
10424 +
10425 +	if (state->local && safe_to_schedule(state->local, cpu_id(state)))
10426 +		next = state->local;
10427 +
10428 +	if (prev != next) {
10429 +		tsk_rt(prev)->scheduled_on = NO_CPU;
10430 +		if (next)
10431 +			tsk_rt(next)->scheduled_on = cpu_id(state);
10432 +	}
10433 +	sched_state_task_picked();
10434 +	raw_spin_unlock(cpu_lock(state));
10435 +
10436 +	if (next)
10437 +		TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
10438 +			   tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
10439 +	else if (is_realtime(prev))
10440 +		TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
10441 +
10442 +	return next;
10443 +}
10444 +
10445 +static void pfair_task_new(struct task_struct * t, int on_rq, int running)
10446 +{
10447 +	unsigned long flags;
10448 +	struct pfair_cluster* cluster;
10449 +
10450 +	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
10451 +
10452 +	cluster = tsk_pfair(t)->cluster;
10453 +
10454 +	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
10455 +
10456 +	prepare_release(t, cluster->pfair_time + 1);
10457 +
10458 +	t->rt_param.scheduled_on = NO_CPU;
10459 +
10460 +	if (running) {
10461 +#ifdef CONFIG_RELEASE_MASTER
10462 +		if (task_cpu(t) != cluster->pfair.release_master)
10463 +#endif
10464 +			t->rt_param.scheduled_on = task_cpu(t);
10465 +		__add_ready(&cluster->pfair, t);
10466 +	}
10467 +
10468 +	check_preempt(t);
10469 +
10470 +	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
10471 +}
10472 +
10473 +static void pfair_task_wake_up(struct task_struct *t)
10474 +{
10475 +	unsigned long flags;
10476 +	lt_t now;
10477 +	int requeue = 0;
10478 +	struct pfair_cluster* cluster;
10479 +
10480 +	cluster = tsk_pfair(t)->cluster;
10481 +
10482 +	TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
10483 +		   litmus_clock(), cur_release(t), cluster->pfair_time);
10484 +
10485 +	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
10486 +
10487 +	/* If a task blocks and wakes before its next job release,
10488 +	 * then it may resume if it is currently linked somewhere
10489 +	 * (as if it never blocked at all). Otherwise, we have a
10490 +	 * new sporadic job release.
10491 +	 */
10492 +	requeue = tsk_rt(t)->flags == RT_F_REQUEUE;
10493 +	now = litmus_clock();
10494 +	if (lt_before(get_deadline(t), now)) {
10495 +		TRACE_TASK(t, "sporadic release!\n");
10496 +		release_at(t, now);
10497 +		prepare_release(t, time2quanta(now, CEIL));
10498 +		sched_trace_task_release(t);
10499 +	}
10500 +
10501 +	/* only add to ready queue if the task isn't still linked somewhere */
10502 +	if (requeue) {
10503 +		TRACE_TASK(t, "requeueing required\n");
10504 +		tsk_rt(t)->flags = RT_F_RUNNING;
10505 +		__add_ready(&cluster->pfair, t);
10506 +	}
10507 +
10508 +	check_preempt(t);
10509 +
10510 +	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
10511 +	TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
10512 +}
10513 +
10514 +static void pfair_task_block(struct task_struct *t)
10515 +{
10516 +	BUG_ON(!is_realtime(t));
10517 +	TRACE_TASK(t, "blocks at %llu, state:%d\n",
10518 +		   litmus_clock(), t->state);
10519 +}
10520 +
10521 +static void pfair_task_exit(struct task_struct * t)
10522 +{
10523 +	unsigned long flags;
10524 +	struct pfair_cluster *cluster;
10525 +
10526 +	BUG_ON(!is_realtime(t));
10527 +
10528 +	cluster = tsk_pfair(t)->cluster;
10529 +
10530 +	/* Remove the task from the release or ready queue, and ensure
10531 +	 * that it is not the scheduled task for ANY CPU. We
10532 +	 * do this blanket check because occasionally, when
10533 +	 * tasks exit while blocked, the task_cpu of the task
10534 +	 * might not be the same as the CPU that the PFAIR scheduler
10535 +	 * has chosen for it.
10536 +	 */
10537 +	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
10538 +
10539 +	TRACE_TASK(t, "RIP, state:%d\n", t->state);
10540 +	drop_all_references(t);
10541 +
10542 +	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
10543 +
10544 +	kfree(t->rt_param.pfair);
10545 +	t->rt_param.pfair = NULL;
10546 +}
10547 +
10548 +
10549 +static void pfair_release_at(struct task_struct* task, lt_t start)
10550 +{
10551 +	unsigned long flags;
10552 +	quanta_t release;
10553 +
10554 +	struct pfair_cluster *cluster;
10555 +
10556 +	cluster = tsk_pfair(task)->cluster;
10557 +
10558 +	BUG_ON(!is_realtime(task));
10559 +
10560 +	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
10561 +	release_at(task, start);
10562 +	release = time2quanta(start, CEIL);
10563 +
10564 +	TRACE_TASK(task, "sys release at %lu\n", release);
10565 +
10566 +	drop_all_references(task);
10567 +	prepare_release(task, release);
10568 +	add_release(&cluster->pfair, task);
10569 +
10570 +	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
10571 +}
10572 +
10573 +static void init_subtask(struct subtask* sub, unsigned long i,
10574 +			 lt_t quanta, lt_t period)
10575 +{
10576 +	/* since i is zero-based, the formulas are shifted by one */
10577 +	lt_t tmp;
10578 +
10579 +	/* release */
10580 +	tmp = period * i;
10581 +	do_div(tmp, quanta); /* floor */
10582 +	sub->release = (quanta_t) tmp;
10583 +
10584 +	/* deadline */
10585 +	tmp = period * (i + 1);
10586 +	if (do_div(tmp, quanta)) /* ceil */
10587 +		tmp++;
10588 +	sub->deadline = (quanta_t) tmp;
10589 +
10590 +	/* next release */
10591 +	tmp = period * (i + 1);
10592 +	do_div(tmp, quanta); /* floor */
10593 +	sub->overlap =  sub->deadline - (quanta_t) tmp;
10594 +
10595 +	/* Group deadline.
10596 +	 * Based on the formula given in Uma's thesis.
10597 +	 */
10598 +	if (2 * quanta >= period) {
10599 +		/* heavy */
10600 +		tmp = (sub->deadline - (i + 1)) * period;
10601 +		if (period > quanta &&
10602 +		    do_div(tmp, (period - quanta))) /* ceil */
10603 +			tmp++;
10604 +		sub->group_deadline = (quanta_t) tmp;
10605 +	} else
10606 +		sub->group_deadline = 0;
10607 +}
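+
+/* Worked example (illustrative values, not part of the original patch):
+ * for a task with quanta = 2 and period = 3 (weight 2/3, i.e., "heavy"),
+ * the formulas above yield
+ *   subtask 0: release = 0, deadline = 2, b-bit = 1, group deadline = 3;
+ *   subtask 1: release = 1, deadline = 3, b-bit = 0, group deadline = 3.
+ */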
10608 +
10609 +static void dump_subtasks(struct task_struct* t)
10610 +{
10611 +	unsigned long i;
10612 +	for (i = 0; i < t->rt_param.pfair->quanta; i++)
10613 +		TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
10614 +			   i + 1,
10615 +			   t->rt_param.pfair->subtasks[i].release,
10616 +			   t->rt_param.pfair->subtasks[i].deadline,
10617 +			   t->rt_param.pfair->subtasks[i].overlap,
10618 +			   t->rt_param.pfair->subtasks[i].group_deadline);
10619 +}
10620 +
10621 +static long pfair_admit_task(struct task_struct* t)
10622 +{
10623 +	lt_t quanta;
10624 +	lt_t period;
10625 +	s64  quantum_length = ktime_to_ns(tick_period);
10626 +	struct pfair_param* param;
10627 +	unsigned long i;
10628 +
10629 +	/* first check that the task is in the right cluster */
10630 +	if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
10631 +	    cpu_cluster(pstate[task_cpu(t)]))
10632 +		return -EINVAL;
10633 +
10634 +	/* Pfair is a tick-based method, so the time
10635 +	 * of interest is jiffies. Calculate tick-based
10636 +	 * times for everything.
10637 +	 * (Ceiling of exec cost, floor of period.)
10638 +	 */
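+	/* For illustration (assuming a 1 ms scheduling quantum, i.e., HZ=1000):
+	 * a task with a 2.5 ms execution cost and a 10 ms period is admitted
+	 * with quanta = 3 and period = 10; since 10 ms is a multiple of the
+	 * quantum, no warning is printed below. */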
10639 +
10641 +	period = get_rt_period(t);
10642 +
10643 +	quanta = time2quanta(get_exec_cost(t), CEIL);
10644 +
10645 +	if (do_div(period, quantum_length))
10646 +		printk(KERN_WARNING
10647 +		       "The period of %s/%d is not a multiple of %llu.\n",
10648 +		       t->comm, t->pid, (unsigned long long) quantum_length);
10649 +
10650 +	if (quanta == period) {
10651 +		/* special case: task has weight 1.0 */
10652 +		printk(KERN_INFO
10653 +		       "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
10654 +		       t->comm, t->pid, quanta, period);
10655 +		quanta = 1;
10656 +		period = 1;
10657 +	}
10658 +
10659 +	param = kmalloc(sizeof(*param) +
10660 +			quanta * sizeof(struct subtask), GFP_ATOMIC);
10661 +
10662 +	if (!param)
10663 +		return -ENOMEM;
10664 +
10665 +	param->quanta  = quanta;
10666 +	param->cur     = 0;
10667 +	param->release = 0;
10668 +	param->period  = period;
10669 +
10670 +	param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
10671 +
10672 +	for (i = 0; i < quanta; i++)
10673 +		init_subtask(param->subtasks + i, i, quanta, period);
10674 +
10675 +	if (t->rt_param.pfair)
10676 +		/* get rid of stale allocation */
10677 +		kfree(t->rt_param.pfair);
10678 +
10679 +	t->rt_param.pfair = param;
10680 +
10681 +	/* spew out some debug info */
10682 +	dump_subtasks(t);
10683 +
10684 +	return 0;
10685 +}
10686 +
10687 +static void pfair_init_cluster(struct pfair_cluster* cluster)
10688 +{
10689 +	rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
10690 +	bheap_init(&cluster->release_queue);
10691 +	raw_spin_lock_init(&cluster->release_lock);
10692 +	INIT_LIST_HEAD(&cluster->topology.cpus);
10693 +}
10694 +
10695 +static void cleanup_clusters(void)
10696 +{
10697 +	int i;
10698 +
10699 +	if (num_pfair_clusters)
10700 +		kfree(pfair_clusters);
10701 +	pfair_clusters = NULL;
10702 +	num_pfair_clusters = 0;
10703 +
10704 +	/* avoid stale pointers */
10705 +	for (i = 0; i < num_online_cpus(); i++) {
10706 +		pstate[i]->topology.cluster = NULL;
10707 +		printk(KERN_INFO "P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
10708 +		       pstate[i]->missed_updates, pstate[i]->missed_quanta);
10709 +	}
10710 +}
10711 +
10712 +static long pfair_activate_plugin(void)
10713 +{
10714 +	int err, i;
10715 +	struct pfair_state* state;
10716 +	struct pfair_cluster* cluster;
10717 +	quanta_t now;
10718 +	int cluster_size;
10719 +	struct cluster_cpu* cpus[NR_CPUS];
10720 +	struct scheduling_cluster* clust[NR_CPUS];
10721 +
10722 +	cluster_size = get_cluster_size(pfair_cluster_level);
10723 +
10724 +	if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
10725 +		return -EINVAL;
10726 +
10727 +	num_pfair_clusters = num_online_cpus() / cluster_size;
10728 +
10729 +	pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
10730 +	if (!pfair_clusters) {
10731 +		num_pfair_clusters = 0;
10732 +		printk(KERN_ERR "Could not allocate Pfair clusters!\n");
10733 +		return -ENOMEM;
10734 +	}
10735 +
10736 +	state = &__get_cpu_var(pfair_state);
10737 +	now = current_quantum(state);
10738 +	TRACE("Activating PFAIR at q=%lu\n", now);
10739 +
10740 +	for (i = 0; i < num_pfair_clusters; i++) {
10741 +		cluster = &pfair_clusters[i];
10742 +		pfair_init_cluster(cluster);
10743 +		cluster->pfair_time = now;
10744 +		clust[i] = &cluster->topology;
10745 +#ifdef CONFIG_RELEASE_MASTER
10746 +		cluster->pfair.release_master = atomic_read(&release_master_cpu);
10747 +#endif
10748 +	}
10749 +
10750 +	for (i = 0; i < num_online_cpus(); i++)  {
10751 +		state = &per_cpu(pfair_state, i);
10752 +		state->cur_tick   = now;
10753 +		state->local_tick = now;
10754 +		state->missed_quanta = 0;
10755 +		state->missed_updates = 0;
10756 +		state->offset     = cpu_stagger_offset(i);
10757 +		printk(KERN_INFO "cpus[%d] set; %d\n", i, num_online_cpus());
10758 +		cpus[i] = &state->topology;
10759 +	}
10760 +
10761 +	err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
10762 +				      cpus, num_online_cpus());
10763 +
10764 +	if (err < 0)
10765 +		cleanup_clusters();
10766 +
10767 +	return err;
10768 +}
10769 +
10770 +static long pfair_deactivate_plugin(void)
10771 +{
10772 +	cleanup_clusters();
10773 +	return 0;
10774 +}
10775 +
10776 +/*	Plugin object	*/
10777 +static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
10778 +	.plugin_name		= "PFAIR",
10779 +	.tick			= pfair_tick,
10780 +	.task_new		= pfair_task_new,
10781 +	.task_exit		= pfair_task_exit,
10782 +	.schedule		= pfair_schedule,
10783 +	.task_wake_up		= pfair_task_wake_up,
10784 +	.task_block		= pfair_task_block,
10785 +	.admit_task		= pfair_admit_task,
10786 +	.release_at		= pfair_release_at,
10787 +	.complete_job		= complete_job,
10788 +	.activate_plugin	= pfair_activate_plugin,
10789 +	.deactivate_plugin	= pfair_deactivate_plugin,
10790 +};
10791 +
10792 +
10793 +static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
10794 +
10795 +static int __init init_pfair(void)
10796 +{
10797 +	int cpu, err, fs;
10798 +	struct pfair_state *state;
10799 +
10800 +	/*
10801 +	 * Initialize the "pstate" shortcut array for per-CPU pfair state.
10802 +	 * There could be a problem if a CPU were removed while we are doing
10803 +	 * this initialization, or if CPUs were added or removed later, but
10804 +	 * we don't support CPU hotplug at the moment anyway.
10805 +	 */
10806 +	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+	if (!pstate)
+		return -ENOMEM;
10807 +
10808 +	/* initialize CPU state */
10809 +	for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
10810 +		state = &per_cpu(pfair_state, cpu);
10811 +		state->topology.id = cpu;
10812 +		state->cur_tick   = 0;
10813 +		state->local_tick = 0;
10814 +		state->linked     = NULL;
10815 +		state->local      = NULL;
10816 +		state->scheduled  = NULL;
10817 +		state->missed_quanta = 0;
10818 +		state->offset     = cpu_stagger_offset(cpu);
10819 +		pstate[cpu] = state;
10820 +	}
10821 +
10822 +	pfair_clusters = NULL;
10823 +	num_pfair_clusters = 0;
10824 +
10825 +	err = register_sched_plugin(&pfair_plugin);
10826 +	if (!err) {
10827 +		fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
10828 +		if (!fs)
10829 +			cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
10830 +		else
10831 +			printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
10832 +	}
10833 +
10834 +	return err;
10835 +}
10836 +
10837 +static void __exit clean_pfair(void)
10838 +{
10839 +	kfree(pstate);
10840 +
10841 +	if (cluster_file)
10842 +		remove_proc_entry("cluster", pfair_dir);
10843 +	if (pfair_dir)
10844 +		remove_plugin_proc_dir(&pfair_plugin);
10845 +}
10846 +
10847 +module_init(init_pfair);
10848 +module_exit(clean_pfair);
10849 diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
10850 new file mode 100644
10851 index 0000000..00a1900
10852 --- /dev/null
10853 +++ b/litmus/sched_plugin.c
10854 @@ -0,0 +1,227 @@
10855 +/* sched_plugin.c -- core infrastructure for the scheduler plugin system
10856 + *
10857 + * This file includes the initialization of the plugin system, the no-op Linux
10858 + * scheduler plugin, some dummy functions, and some helper functions.
10859 + */
10860 +
10861 +#include <linux/list.h>
10862 +#include <linux/spinlock.h>
10863 +#include <linux/sched.h>
10864 +
10865 +#include <litmus/litmus.h>
10866 +#include <litmus/sched_plugin.h>
10867 +#include <litmus/preempt.h>
10868 +#include <litmus/jobs.h>
10869 +
10870 +/*
10871 + * Generic function to trigger preemption on either local or remote cpu
10872 + * from scheduler plugins. The key feature is that this function is
10873 + * non-preemptive section aware and does not invoke the scheduler / send
10874 + * IPIs if the to-be-preempted task is actually non-preemptive.
10875 + */
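+/* Decision logic, as implemented below:
+ *  - no real-time task (t == NULL): always reschedule;
+ *  - local CPU, task in a user-space non-preemptive section: flag an exit
+ *    request instead of rescheduling;
+ *  - local CPU, task in a kernel non-preemptive section: do nothing;
+ *  - local CPU, task preemptable: reschedule;
+ *  - remote CPU: send an IPI unless the task is in a kernel non-preemptive
+ *    section or its user-space exit-request flag could be set atomically.
+ */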
10876 +void preempt_if_preemptable(struct task_struct* t, int cpu)
10877 +{
10878 +	/* t is the real-time task executing on CPU cpu. If t is NULL, then
10879 +	 * cpu is currently scheduling background work.
10880 +	 */
10881 +
10882 +	int reschedule = 0;
10883 +
10884 +	if (!t)
10885 +		/* move non-real-time task out of the way */
10886 +		reschedule = 1;
10887 +	else {
10888 +		if (smp_processor_id() == cpu) {
10889 +			/* local CPU case */
10890 +			/* check if we need to poke userspace */
10891 +			if (is_user_np(t))
10892 +				/* Yes, poke it. This doesn't have to be atomic since
10893 +				 * the task is definitely not executing. */
10894 +				request_exit_np(t);
10895 +			else if (!is_kernel_np(t))
10896 +				/* only if we are allowed to preempt the
10897 +				 * currently-executing task */
10898 +				reschedule = 1;
10899 +		} else {
10900 +			/* Remote CPU case.  Only notify if it's not a kernel
10901 +			 * NP section and if we didn't set the userspace
10902 +			 * flag. */
10903 +			reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
10904 +		}
10905 +	}
10906 +	if (likely(reschedule))
10907 +		litmus_reschedule(cpu);
10908 +}
10909 +
10910 +
10911 +/*************************************************************
10912 + *                   Dummy plugin functions                  *
10913 + *************************************************************/
10914 +
10915 +static void litmus_dummy_finish_switch(struct task_struct * prev)
10916 +{
10917 +}
10918 +
10919 +static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
10920 +{
10921 +	sched_state_task_picked();
10922 +	return NULL;
10923 +}
10924 +
10925 +static void litmus_dummy_tick(struct task_struct* tsk)
10926 +{
10927 +}
10928 +
10929 +static long litmus_dummy_admit_task(struct task_struct* tsk)
10930 +{
10931 +	printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
10932 +		tsk->comm, tsk->pid);
10933 +	return -EINVAL;
10934 +}
10935 +
10936 +static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
10937 +{
10938 +}
10939 +
10940 +static void litmus_dummy_task_wake_up(struct task_struct *task)
10941 +{
10942 +}
10943 +
10944 +static void litmus_dummy_task_block(struct task_struct *task)
10945 +{
10946 +}
10947 +
10948 +static void litmus_dummy_task_exit(struct task_struct *task)
10949 +{
10950 +}
10951 +
10952 +static long litmus_dummy_complete_job(void)
10953 +{
10954 +	return -ENOSYS;
10955 +}
10956 +
10957 +static long litmus_dummy_activate_plugin(void)
10958 +{
10959 +	return 0;
10960 +}
10961 +
10962 +static long litmus_dummy_deactivate_plugin(void)
10963 +{
10964 +	return 0;
10965 +}
10966 +
10967 +#ifdef CONFIG_LITMUS_LOCKING
10968 +
10969 +static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
10970 +				       void* __user config)
10971 +{
10972 +	return -ENXIO;
10973 +}
10974 +
10975 +#endif
10976 +
10977 +
10978 +/* The default scheduler plugin. It doesn't do anything and lets Linux do its
10979 + * job.
10980 + */
10981 +struct sched_plugin linux_sched_plugin = {
10982 +	.plugin_name = "Linux",
10983 +	.tick = litmus_dummy_tick,
10984 +	.task_new   = litmus_dummy_task_new,
10985 +	.task_exit = litmus_dummy_task_exit,
10986 +	.task_wake_up = litmus_dummy_task_wake_up,
10987 +	.task_block = litmus_dummy_task_block,
10988 +	.complete_job = litmus_dummy_complete_job,
10989 +	.schedule = litmus_dummy_schedule,
10990 +	.finish_switch = litmus_dummy_finish_switch,
10991 +	.activate_plugin = litmus_dummy_activate_plugin,
10992 +	.deactivate_plugin = litmus_dummy_deactivate_plugin,
10993 +#ifdef CONFIG_LITMUS_LOCKING
10994 +	.allocate_lock = litmus_dummy_allocate_lock,
10995 +#endif
10996 +	.admit_task = litmus_dummy_admit_task
10997 +};
10998 +
10999 +/*
11000 + *	The reference to the current plugin that is used to schedule tasks
11001 + *	within the system. It stores references to the actual function
11002 + *	implementations and should be initialized by calling "init_***_plugin()".
11003 + */
11004 +struct sched_plugin *litmus = &linux_sched_plugin;
11005 +
11006 +/* the list of registered scheduling plugins */
11007 +static LIST_HEAD(sched_plugins);
11008 +static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
11009 +
11010 +#define CHECK(func) {\
11011 +	if (!plugin->func) \
11012 +		plugin->func = litmus_dummy_ ## func;}
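+
+/* For example, CHECK(tick) substitutes litmus_dummy_tick for plugin->tick
+ * if the plugin left that callback NULL, so plugins only need to implement
+ * the callbacks they actually care about.
+ */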
11013 +
11014 +/* FIXME: get reference to module  */
11015 +int register_sched_plugin(struct sched_plugin* plugin)
11016 +{
11017 +	printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
11018 +	       plugin->plugin_name);
11019 +
11020 +	/* make sure we don't trip over null pointers later */
11021 +	CHECK(finish_switch);
11022 +	CHECK(schedule);
11023 +	CHECK(tick);
11024 +	CHECK(task_wake_up);
11025 +	CHECK(task_exit);
11026 +	CHECK(task_block);
11027 +	CHECK(task_new);
11028 +	CHECK(complete_job);
11029 +	CHECK(activate_plugin);
11030 +	CHECK(deactivate_plugin);
11031 +#ifdef CONFIG_LITMUS_LOCKING
11032 +	CHECK(allocate_lock);
11033 +#endif
11034 +	CHECK(admit_task);
11035 +
11036 +	if (!plugin->release_at)
11037 +		plugin->release_at = release_at;
11038 +
11039 +	raw_spin_lock(&sched_plugins_lock);
11040 +	list_add(&plugin->list, &sched_plugins);
11041 +	raw_spin_unlock(&sched_plugins_lock);
11042 +
11043 +	return 0;
11044 +}
11045 +
11046 +
11047 +/* FIXME: reference counting, etc. */
11048 +struct sched_plugin* find_sched_plugin(const char* name)
11049 +{
11050 +	struct list_head *pos;
11051 +	struct sched_plugin *plugin;
11052 +
11053 +	raw_spin_lock(&sched_plugins_lock);
11054 +	list_for_each(pos, &sched_plugins) {
11055 +		plugin = list_entry(pos, struct sched_plugin, list);
11056 +		if (!strcmp(plugin->plugin_name, name))
11057 +		    goto out_unlock;
11058 +	}
11059 +	plugin = NULL;
11060 +
11061 +out_unlock:
11062 +	raw_spin_unlock(&sched_plugins_lock);
11063 +	return plugin;
11064 +}
11065 +
11066 +int print_sched_plugins(char* buf, int max)
11067 +{
11068 +	int count = 0;
11069 +	struct list_head *pos;
11070 +	struct sched_plugin *plugin;
11071 +
11072 +	raw_spin_lock(&sched_plugins_lock);
11073 +	list_for_each(pos, &sched_plugins) {
11074 +		plugin = list_entry(pos, struct sched_plugin, list);
11075 +		count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
11076 +		if (max - count <= 0)
11077 +			break;
11078 +	}
11079 +	raw_spin_unlock(&sched_plugins_lock);
11080 +	return 	count;
11081 +}
11082 diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
11083 new file mode 100644
11084 index 0000000..8e4a22d
11085 --- /dev/null
11086 +++ b/litmus/sched_psn_edf.c
11087 @@ -0,0 +1,645 @@
11088 +/*
11089 + * litmus/sched_psn_edf.c
11090 + *
11091 + * Implementation of the PSN-EDF scheduler plugin.
11092 + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
11093 + *
11094 + * Suspensions and non-preemptable sections are supported.
11095 + * Priority inheritance is not supported.
11096 + */
11097 +
11098 +#include <linux/percpu.h>
11099 +#include <linux/sched.h>
11100 +#include <linux/list.h>
11101 +#include <linux/spinlock.h>
11102 +#include <linux/module.h>
11103 +
11104 +#include <litmus/litmus.h>
11105 +#include <litmus/jobs.h>
11106 +#include <litmus/preempt.h>
11107 +#include <litmus/sched_plugin.h>
11108 +#include <litmus/edf_common.h>
11109 +#include <litmus/sched_trace.h>
11110 +#include <litmus/trace.h>
11111 +
11112 +typedef struct {
11113 +	rt_domain_t 		domain;
11114 +	int          		cpu;
11115 +	struct task_struct* 	scheduled; /* only RT tasks */
11116 +/*
11117 + * scheduling lock slock
11118 + * protects the domain and serializes scheduling decisions
11119 + */
11120 +#define slock domain.ready_lock
11121 +
11122 +} psnedf_domain_t;
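+
+/* Note: slock is just an alias for the rt_domain's ready_lock, so holding
+ * pedf->slock also serializes access to this partition's ready queue. */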
11123 +
11124 +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
11125 +
11126 +#define local_edf		(&__get_cpu_var(psnedf_domains).domain)
11127 +#define local_pedf		(&__get_cpu_var(psnedf_domains))
11128 +#define remote_edf(cpu)		(&per_cpu(psnedf_domains, cpu).domain)
11129 +#define remote_pedf(cpu)	(&per_cpu(psnedf_domains, cpu))
11130 +#define task_edf(task)		remote_edf(get_partition(task))
11131 +#define task_pedf(task)		remote_pedf(get_partition(task))
11132 +
11133 +
11134 +static void psnedf_domain_init(psnedf_domain_t* pedf,
11135 +			       check_resched_needed_t check,
11136 +			       release_jobs_t release,
11137 +			       int cpu)
11138 +{
11139 +	edf_domain_init(&pedf->domain, check, release);
11140 +	pedf->cpu      		= cpu;
11141 +	pedf->scheduled		= NULL;
11142 +}
11143 +
11144 +static void requeue(struct task_struct* t, rt_domain_t *edf)
11145 +{
11146 +	if (t->state != TASK_RUNNING)
11147 +		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
11148 +
11149 +	set_rt_flags(t, RT_F_RUNNING);
11150 +	if (is_released(t, litmus_clock()))
11151 +		__add_ready(edf, t);
11152 +	else
11153 +		add_release(edf, t); /* it has got to wait */
11154 +}
11155 +
11156 +/* we assume the lock is being held */
11157 +static void preempt(psnedf_domain_t *pedf)
11158 +{
11159 +	preempt_if_preemptable(pedf->scheduled, pedf->cpu);
11160 +}
11161 +
11162 +#ifdef CONFIG_LITMUS_LOCKING
11163 +
11164 +static void boost_priority(struct task_struct* t)
11165 +{
11166 +	unsigned long		flags;
11167 +	psnedf_domain_t* 	pedf = task_pedf(t);
11168 +	lt_t			now;
11169 +
11170 +	raw_spin_lock_irqsave(&pedf->slock, flags);
11171 +	now = litmus_clock();
11172 +
11173 +	TRACE_TASK(t, "priority boosted at %llu\n", now);
11174 +
11175 +	tsk_rt(t)->priority_boosted = 1;
11176 +	tsk_rt(t)->boost_start_time = now;
11177 +
11178 +	if (pedf->scheduled != t) {
11179 +		/* holder may be queued: first stop queue changes */
11180 +		raw_spin_lock(&pedf->domain.release_lock);
11181 +		if (is_queued(t) &&
11182 +		    /* If it is queued, then we need to re-order. */
11183 +		    bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
11184 +		    /* If we bubbled to the top, then we need to check for preemptions. */
11185 +		    edf_preemption_needed(&pedf->domain, pedf->scheduled))
11186 +				preempt(pedf);
11187 +		raw_spin_unlock(&pedf->domain.release_lock);
11188 +	} /* else: nothing to do since the job is not queued while scheduled */
11189 +
11190 +	raw_spin_unlock_irqrestore(&pedf->slock, flags);
11191 +}
11192 +
11193 +static void unboost_priority(struct task_struct* t)
11194 +{
11195 +	unsigned long		flags;
11196 +	psnedf_domain_t* 	pedf = task_pedf(t);
11197 +	lt_t			now;
11198 +
11199 +	raw_spin_lock_irqsave(&pedf->slock, flags);
11200 +	now = litmus_clock();
11201 +
11202 +	/* assumption: this only happens when the job is scheduled */
11203 +	BUG_ON(pedf->scheduled != t);
11204 +
11205 +	TRACE_TASK(t, "priority restored at %llu\n", now);
11206 +
11207 +	/* priority boosted jobs must be scheduled */
11208 +	BUG_ON(pedf->scheduled != t);
11209 +
11210 +	tsk_rt(t)->priority_boosted = 0;
11211 +	tsk_rt(t)->boost_start_time = 0;
11212 +
11213 +	/* check if this changes anything */
11214 +	if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
11215 +		preempt(pedf);
11216 +
11217 +	raw_spin_unlock_irqrestore(&pedf->slock, flags);
11218 +}
11219 +
11220 +#endif
11221 +
11222 +/* This check is trivial in partitioned systems as we only have to consider
11223 + * the CPU of the partition.
11224 + */
11225 +static int psnedf_check_resched(rt_domain_t *edf)
11226 +{
11227 +	psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
11228 +
11229 +	/* because this is a callback from rt_domain_t we already hold
11230 +	 * the necessary lock for the ready queue
11231 +	 */
11232 +	if (edf_preemption_needed(edf, pedf->scheduled)) {
11233 +		preempt(pedf);
11234 +		return 1;
11235 +	} else
11236 +		return 0;
11237 +}
11238 +
11239 +static void job_completion(struct task_struct* t, int forced)
11240 +{
11241 +	sched_trace_task_completion(t,forced);
11242 +	TRACE_TASK(t, "job_completion().\n");
11243 +
11244 +	set_rt_flags(t, RT_F_SLEEP);
11245 +	prepare_for_next_period(t);
11246 +}
11247 +
11248 +static void psnedf_tick(struct task_struct *t)
11249 +{
11250 +	psnedf_domain_t *pedf = local_pedf;
11251 +
11252 +	/* Check for inconsistency. We don't need the lock for this since
11253 +	 * ->scheduled is only changed in schedule, which obviously is not
11254 +	 *  executing in parallel on this CPU
11255 +	 */
11256 +	BUG_ON(is_realtime(t) && t != pedf->scheduled);
11257 +
11258 +	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
11259 +		if (!is_np(t)) {
11260 +			litmus_reschedule_local();
11261 +			TRACE("psnedf_scheduler_tick: "
11262 +			      "%d is preemptable "
11263 +			      " => FORCE_RESCHED\n", t->pid);
11264 +		} else if (is_user_np(t)) {
11265 +			TRACE("psnedf_scheduler_tick: "
11266 +			      "%d is non-preemptable, "
11267 +			      "preemption delayed.\n", t->pid);
11268 +			request_exit_np(t);
11269 +		}
11270 +	}
11271 +}
11272 +
11273 +static struct task_struct* psnedf_schedule(struct task_struct * prev)
11274 +{
11275 +	psnedf_domain_t* 	pedf = local_pedf;
11276 +	rt_domain_t*		edf  = &pedf->domain;
11277 +	struct task_struct*	next;
11278 +
11279 +	int 			out_of_time, sleep, preempt,
11280 +				np, exists, blocks, resched;
11281 +
11282 +	raw_spin_lock(&pedf->slock);
11283 +
11284 +	/* sanity checking
11285 +	 * Unlike under GSN-EDF, when a task exits (is dead),
11286 +	 * pedf->scheduled may be NULL while prev _is_ real-time.
11287 +	 */
11288 +	BUG_ON(pedf->scheduled && pedf->scheduled != prev);
11289 +	BUG_ON(pedf->scheduled && !is_realtime(prev));
11290 +
11291 +	/* (0) Determine state */
11292 +	exists      = pedf->scheduled != NULL;
11293 +	blocks      = exists && !is_running(pedf->scheduled);
11294 +	out_of_time = exists &&
11295 +				  budget_enforced(pedf->scheduled) &&
11296 +				  budget_exhausted(pedf->scheduled);
11297 +	np 	    = exists && is_np(pedf->scheduled);
11298 +	sleep	    = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
11299 +	preempt     = edf_preemption_needed(edf, prev);
11300 +
11301 +	/* If we need to preempt do so.
11302 +	 * The following checks set resched to 1 in case of special
11303 +	 * circumstances.
11304 +	 */
11305 +	resched = preempt;
11306 +
11307 +	/* If a task blocks we have no choice but to reschedule.
11308 +	 */
11309 +	if (blocks)
11310 +		resched = 1;
11311 +
11312 +	/* Request a sys_exit_np() call if we would like to preempt but cannot.
11313 +	 * Multiple calls to request_exit_np() don't hurt.
11314 +	 */
11315 +	if (np && (out_of_time || preempt || sleep))
11316 +		request_exit_np(pedf->scheduled);
11317 +
11318 +	/* Any task that is preemptable and either exhausts its execution
11319 +	 * budget or wants to sleep completes. We may have to reschedule after
11320 +	 * this.
11321 +	 */
11322 +	if (!np && (out_of_time || sleep) && !blocks) {
11323 +		job_completion(pedf->scheduled, !sleep);
11324 +		resched = 1;
11325 +	}
11326 +
11327 +	/* The final scheduling decision. Do we need to switch for some reason?
11328 +	 * Switch if we are in RT mode and have no task or if we need to
11329 +	 * resched.
11330 +	 */
11331 +	next = NULL;
11332 +	if ((!np || blocks) && (resched || !exists)) {
11333 +		/* When preempting a task that does not block, then
11334 +		 * re-insert it into either the ready queue or the
11335 +		 * release queue (if it completed). requeue() picks
11336 +		 * the appropriate queue.
11337 +		 */
11338 +		if (pedf->scheduled && !blocks)
11339 +			requeue(pedf->scheduled, edf);
11340 +		next = __take_ready(edf);
11341 +	} else
11342 +		/* Only override Linux scheduler if we have a real-time task
11343 +		 * scheduled that needs to continue.
11344 +		 */
11345 +		if (exists)
11346 +			next = prev;
11347 +
11348 +	if (next) {
11349 +		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
11350 +		set_rt_flags(next, RT_F_RUNNING);
11351 +	} else {
11352 +		TRACE("becoming idle at %llu\n", litmus_clock());
11353 +	}
11354 +
11355 +	pedf->scheduled = next;
11356 +	sched_state_task_picked();
11357 +	raw_spin_unlock(&pedf->slock);
11358 +
11359 +	return next;
11360 +}
11361 +
11362 +
11363 +/*	Prepare a task for running in RT mode
11364 + */
11365 +static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
11366 +{
11367 +	rt_domain_t* 		edf  = task_edf(t);
11368 +	psnedf_domain_t* 	pedf = task_pedf(t);
11369 +	unsigned long		flags;
11370 +
11371 +	TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
11372 +		   t->rt_param.task_params.cpu);
11373 +
11374 +	/* setup job parameters */
11375 +	release_at(t, litmus_clock());
11376 +
11377 +	/* The task should be running in the queue, otherwise signal
11378 +	 * code will try to wake it up with fatal consequences.
11379 +	 */
11380 +	raw_spin_lock_irqsave(&pedf->slock, flags);
11381 +	if (running) {
11382 +		/* there shouldn't be anything else running at the time */
11383 +		BUG_ON(pedf->scheduled);
11384 +		pedf->scheduled = t;
11385 +	} else {
11386 +		requeue(t, edf);
11387 +		/* maybe we have to reschedule */
11388 +		preempt(pedf);
11389 +	}
11390 +	raw_spin_unlock_irqrestore(&pedf->slock, flags);
11391 +}
11392 +
11393 +static void psnedf_task_wake_up(struct task_struct *task)
11394 +{
11395 +	unsigned long		flags;
11396 +	psnedf_domain_t* 	pedf = task_pedf(task);
11397 +	rt_domain_t* 		edf  = task_edf(task);
11398 +	lt_t			now;
11399 +
11400 +	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
11401 +	raw_spin_lock_irqsave(&pedf->slock, flags);
11402 +	BUG_ON(is_queued(task));
11403 +	now = litmus_clock();
11404 +	if (is_tardy(task, now)
11405 +#ifdef CONFIG_LITMUS_LOCKING
11406 +	/* We need to take suspensions because of semaphores into
11407 +	 * account! If a job resumes after being suspended due to acquiring
11408 +	 * a semaphore, it should never be treated as a new job release.
11409 +	 */
11410 +	    && !is_priority_boosted(task)
11411 +#endif
11412 +		) {
11413 +		/* new sporadic release */
11414 +		release_at(task, now);
11415 +		sched_trace_task_release(task);
11416 +	}
11417 +
11418 +	/* Only add to ready queue if it is not the currently-scheduled
11419 +	 * task. This could be the case if a task was woken up concurrently
11420 +	 * on a remote CPU before the executing CPU got around to actually
11421 +	 * de-scheduling the task, i.e., wake_up() raced with schedule()
11422 +	 * and won.
11423 +	 */
11424 +	if (pedf->scheduled != task)
11425 +		requeue(task, edf);
11426 +
11427 +	raw_spin_unlock_irqrestore(&pedf->slock, flags);
11428 +	TRACE_TASK(task, "wake up done\n");
11429 +}
11430 +
11431 +static void psnedf_task_block(struct task_struct *t)
11432 +{
11433 +	/* only running tasks can block, thus t is in no queue */
11434 +	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
11435 +
11436 +	BUG_ON(!is_realtime(t));
11437 +	BUG_ON(is_queued(t));
11438 +}
11439 +
11440 +static void psnedf_task_exit(struct task_struct * t)
11441 +{
11442 +	unsigned long flags;
11443 +	psnedf_domain_t* 	pedf = task_pedf(t);
11444 +	rt_domain_t*		edf;
11445 +
11446 +	raw_spin_lock_irqsave(&pedf->slock, flags);
11447 +	if (is_queued(t)) {
11448 +		/* dequeue */
11449 +		edf  = task_edf(t);
11450 +		remove(edf, t);
11451 +	}
11452 +	if (pedf->scheduled == t)
11453 +		pedf->scheduled = NULL;
11454 +
11455 +	TRACE_TASK(t, "RIP, now reschedule\n");
11456 +
11457 +	preempt(pedf);
11458 +	raw_spin_unlock_irqrestore(&pedf->slock, flags);
11459 +}
11460 +
11461 +#ifdef CONFIG_LITMUS_LOCKING
11462 +
11463 +#include <litmus/fdso.h>
11464 +#include <litmus/srp.h>
11465 +
11466 +/* ******************** SRP support ************************ */
11467 +
11468 +static unsigned int psnedf_get_srp_prio(struct task_struct* t)
11469 +{
11470 +	/* assumes implicit deadlines */
11471 +	return get_rt_period(t);
11472 +}
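+
+/* Under EDF with implicit deadlines, a task's preemption level is determined
+ * by its relative deadline, i.e., its period, which is why the period itself
+ * serves as the SRP priority here.
+ */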
11473 +
11474 +/* ******************** FMLP support ********************** */
11475 +
11476 +/* struct for semaphore with priority inheritance */
11477 +struct fmlp_semaphore {
11478 +	struct litmus_lock litmus_lock;
11479 +
11480 +	/* current resource holder */
11481 +	struct task_struct *owner;
11482 +
11483 +	/* FIFO queue of waiting tasks */
11484 +	wait_queue_head_t wait;
11485 +};
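+
+/* Protocol implemented by the lock/unlock operations below: the semaphore
+ * owner is priority-boosted for the duration of its critical section, and
+ * contending jobs block in FIFO order on ->wait. On unlock, the boost is
+ * removed from the old owner and transferred to the head of the FIFO queue,
+ * which becomes the new owner before being woken up.
+ */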
11486 +
11487 +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
11488 +{
11489 +	return container_of(lock, struct fmlp_semaphore, litmus_lock);
11490 +}
11491 +int psnedf_fmlp_lock(struct litmus_lock* l)
11492 +{
11493 +	struct task_struct* t = current;
11494 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
11495 +	wait_queue_t wait;
11496 +	unsigned long flags;
11497 +
11498 +	if (!is_realtime(t))
11499 +		return -EPERM;
11500 +
11501 +	spin_lock_irqsave(&sem->wait.lock, flags);
11502 +
11503 +	if (sem->owner) {
11504 +		/* resource is not free => must suspend and wait */
11505 +
11506 +		init_waitqueue_entry(&wait, t);
11507 +
11508 +		/* FIXME: interruptible would be nice some day */
11509 +		set_task_state(t, TASK_UNINTERRUPTIBLE);
11510 +
11511 +		__add_wait_queue_tail_exclusive(&sem->wait, &wait);
11512 +
11513 +		TS_LOCK_SUSPEND;
11514 +
11515 +		/* release lock before sleeping */
11516 +		spin_unlock_irqrestore(&sem->wait.lock, flags);
11517 +
11518 +		/* We depend on the FIFO order.  Thus, we don't need to recheck
11519 +		 * when we wake up; we are guaranteed to have the lock since
11520 +		 * there is only one wake up per release.
11521 +		 */
11522 +
11523 +		schedule();
11524 +
11525 +		TS_LOCK_RESUME;
11526 +
11527 +		/* Since we hold the lock, no other task will change
11528 +		 * ->owner. We can thus check it without acquiring the spin
11529 +		 * lock. */
11530 +		BUG_ON(sem->owner != t);
11531 +	} else {
11532 +		/* it's ours now */
11533 +		sem->owner = t;
11534 +
11535 +		/* mark the task as priority-boosted. */
11536 +		boost_priority(t);
11537 +
11538 +		spin_unlock_irqrestore(&sem->wait.lock, flags);
11539 +	}
11540 +
11541 +	return 0;
11542 +}
11543 +
11544 +int psnedf_fmlp_unlock(struct litmus_lock* l)
11545 +{
11546 +	struct task_struct *t = current, *next;
11547 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
11548 +	unsigned long flags;
11549 +	int err = 0;
11550 +
11551 +	spin_lock_irqsave(&sem->wait.lock, flags);
11552 +
11553 +	if (sem->owner != t) {
11554 +		err = -EINVAL;
11555 +		goto out;
11556 +	}
11557 +
11558 +	/* we lose the benefit of priority boosting */
11559 +
11560 +	unboost_priority(t);
11561 +
11562 +	/* check if there are jobs waiting for this resource */
11563 +	next = __waitqueue_remove_first(&sem->wait);
11564 +	if (next) {
11565 +		/* boost next job */
11566 +		boost_priority(next);
11567 +
11568 +		/* next becomes the resource holder */
11569 +		sem->owner = next;
11570 +
11571 +		/* wake up next */
11572 +		wake_up_process(next);
11573 +	} else
11574 +		/* resource becomes available */
11575 +		sem->owner = NULL;
11576 +
11577 +out:
11578 +	spin_unlock_irqrestore(&sem->wait.lock, flags);
11579 +	return err;
11580 +}
11581 +
11582 +int psnedf_fmlp_close(struct litmus_lock* l)
11583 +{
11584 +	struct task_struct *t = current;
11585 +	struct fmlp_semaphore *sem = fmlp_from_lock(l);
11586 +	unsigned long flags;
11587 +
11588 +	int owner;
11589 +
11590 +	spin_lock_irqsave(&sem->wait.lock, flags);
11591 +
11592 +	owner = sem->owner == t;
11593 +
11594 +	spin_unlock_irqrestore(&sem->wait.lock, flags);
11595 +
11596 +	if (owner)
11597 +		psnedf_fmlp_unlock(l);
11598 +
11599 +	return 0;
11600 +}
11601 +
11602 +void psnedf_fmlp_free(struct litmus_lock* lock)
11603 +{
11604 +	kfree(fmlp_from_lock(lock));
11605 +}
11606 +
11607 +static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
11608 +	.close  = psnedf_fmlp_close,
11609 +	.lock   = psnedf_fmlp_lock,
11610 +	.unlock = psnedf_fmlp_unlock,
11611 +	.deallocate = psnedf_fmlp_free,
11612 +};
11613 +
11614 +static struct litmus_lock* psnedf_new_fmlp(void)
11615 +{
11616 +	struct fmlp_semaphore* sem;
11617 +
11618 +	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
11619 +	if (!sem)
11620 +		return NULL;
11621 +
11622 +	sem->owner   = NULL;
11623 +	init_waitqueue_head(&sem->wait);
11624 +	sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;
11625 +
11626 +	return &sem->litmus_lock;
11627 +}
11628 +
11629 +/* **** lock constructor **** */
11630 +
11631 +
11632 +static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
11633 +				 void* __user unused)
11634 +{
11635 +	int err = -ENXIO;
11636 +	struct srp_semaphore* srp;
11637 +
11638 +	/* PSN-EDF currently supports the SRP for local resources and the FMLP
11639 +	 * for global resources. */
11640 +	switch (type) {
11641 +	case FMLP_SEM:
11642 +		/* Flexible Multiprocessor Locking Protocol */
11643 +		*lock = psnedf_new_fmlp();
11644 +		if (*lock)
11645 +			err = 0;
11646 +		else
11647 +			err = -ENOMEM;
11648 +		break;
11649 +
11650 +	case SRP_SEM:
11651 +		/* Baker's Stack Resource Policy */
11652 +		srp = allocate_srp_semaphore();
11653 +		if (srp) {
11654 +			*lock = &srp->litmus_lock;
11655 +			err = 0;
11656 +		} else
11657 +			err = -ENOMEM;
11658 +		break;
11659 +	};
11660 +
11661 +	return err;
11662 +}
11663 +
11664 +#endif
11665 +
11666 +
11667 +static long psnedf_activate_plugin(void)
11668 +{
11669 +#ifdef CONFIG_RELEASE_MASTER
11670 +	int cpu;
11671 +
11672 +	for_each_online_cpu(cpu) {
11673 +		remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
11674 +	}
11675 +#endif
11676 +
11677 +#ifdef CONFIG_LITMUS_LOCKING
11678 +	get_srp_prio = psnedf_get_srp_prio;
11679 +#endif
11680 +
11681 +	return 0;
11682 +}
11683 +
11684 +static long psnedf_admit_task(struct task_struct* tsk)
11685 +{
11686 +	if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
11687 +#ifdef CONFIG_RELEASE_MASTER
11688 +	    /* don't allow tasks on release master CPU */
11689 +	     && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
11690 +#endif
11691 +		)
11692 +		return 0;
11693 +	else
11694 +		return -EINVAL;
11695 +}
11696 +
11697 +/*	Plugin object	*/
11698 +static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
11699 +	.plugin_name		= "PSN-EDF",
11700 +	.tick			= psnedf_tick,
11701 +	.task_new		= psnedf_task_new,
11702 +	.complete_job		= complete_job,
11703 +	.task_exit		= psnedf_task_exit,
11704 +	.schedule		= psnedf_schedule,
11705 +	.task_wake_up		= psnedf_task_wake_up,
11706 +	.task_block		= psnedf_task_block,
11707 +	.admit_task		= psnedf_admit_task,
11708 +	.activate_plugin	= psnedf_activate_plugin,
11709 +#ifdef CONFIG_LITMUS_LOCKING
11710 +	.allocate_lock		= psnedf_allocate_lock,
11711 +#endif
11712 +};
11713 +
11714 +
11715 +static int __init init_psn_edf(void)
11716 +{
11717 +	int i;
11718 +
11719 +	/* We do not really want to support CPU hotplug, do we? ;)
11720 +	 * However, if we ever decide to do so,
11721 +	 * we cannot use num_online_cpus() here.
11722 +	 */
11723 +	for (i = 0; i < num_online_cpus(); i++) {
11724 +		psnedf_domain_init(remote_pedf(i),
11725 +				   psnedf_check_resched,
11726 +				   NULL, i);
11727 +	}
11728 +	return register_sched_plugin(&psn_edf_plugin);
11729 +}
11730 +
11731 +module_init(init_psn_edf);
11732 +
11733 diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
11734 new file mode 100644
11735 index 0000000..5ef8d09
11736 --- /dev/null
11737 +++ b/litmus/sched_task_trace.c
11738 @@ -0,0 +1,241 @@
11739 +/*
11740 + * sched_task_trace.c -- record scheduling events to a byte stream
11741 + */
11742 +
11743 +#define NO_TASK_TRACE_DECLS
11744 +
11745 +#include <linux/module.h>
11746 +#include <linux/sched.h>
11747 +#include <linux/percpu.h>
11748 +
11749 +#include <litmus/ftdev.h>
11750 +#include <litmus/litmus.h>
11751 +
11752 +#include <litmus/sched_trace.h>
11753 +#include <litmus/feather_trace.h>
11754 +#include <litmus/ftdev.h>
11755 +
11756 +
11757 +#define NO_EVENTS		(1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
11758 +
11759 +#define now() litmus_clock()
11760 +
11761 +struct local_buffer {
11762 +	struct st_event_record record[NO_EVENTS];
11763 +	char   flag[NO_EVENTS];
11764 +	struct ft_buffer ftbuf;
11765 +};
11766 +
11767 +DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
11768 +
11769 +static struct ftdev st_dev;
11770 +
11771 +static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
11772 +{
11773 +	return cpu_online(cpu) ? 0 : -ENODEV;
11774 +}
11775 +
11776 +static int __init init_sched_task_trace(void)
11777 +{
11778 +	struct local_buffer* buf;
11779 +	int i, ok = 0, err;
11780 +	printk("Allocated %u sched_trace_xxx() events per CPU "
11781 +	       "(buffer size: %d bytes)\n",
11782 +	       NO_EVENTS, (int) sizeof(struct local_buffer));
11783 +
11784 +	err = ftdev_init(&st_dev, THIS_MODULE,
11785 +			num_online_cpus(), "sched_trace");
11786 +	if (err)
11787 +		goto err_out;
11788 +
11789 +	for (i = 0; i < st_dev.minor_cnt; i++) {
11790 +		buf = &per_cpu(st_event_buffer, i);
11791 +		ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
11792 +				     sizeof(struct st_event_record),
11793 +				     buf->flag,
11794 +				     buf->record);
11795 +		st_dev.minor[i].buf = &buf->ftbuf;
11796 +	}
11797 +	if (ok == st_dev.minor_cnt) {
11798 +		st_dev.can_open = st_dev_can_open;
11799 +		err = register_ftdev(&st_dev);
11800 +		if (err)
11801 +			goto err_dealloc;
11802 +	} else {
11803 +		err = -EINVAL;
11804 +		goto err_dealloc;
11805 +	}
11806 +
11807 +	return 0;
11808 +
11809 +err_dealloc:
11810 +	ftdev_exit(&st_dev);
11811 +err_out:
11812 +	printk(KERN_WARNING "Could not register sched_trace module\n");
11813 +	return err;
11814 +}
11815 +
11816 +static void __exit exit_sched_task_trace(void)
11817 +{
11818 +	ftdev_exit(&st_dev);
11819 +}
11820 +
11821 +module_init(init_sched_task_trace);
11822 +module_exit(exit_sched_task_trace);
11823 +
11824 +
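+/* get_record() disables preemption via get_cpu_var() and, on success, returns
+ * with preemption still disabled so that the record is filled in on the CPU
+ * that owns the buffer; the matching put_record() finishes the write and
+ * re-enables preemption via put_cpu_var(). On failure, get_record() re-enables
+ * preemption itself and returns NULL.
+ */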
11825 +static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
11826 +{
11827 +	struct st_event_record* rec = NULL;
11828 +	struct local_buffer* buf;
11829 +
11830 +	buf = &get_cpu_var(st_event_buffer);
11831 +	if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
11832 +		rec->hdr.type = type;
11833 +		rec->hdr.cpu  = smp_processor_id();
11834 +		rec->hdr.pid  = t ? t->pid : 0;
11835 +		rec->hdr.job  = t ? t->rt_param.job_params.job_no : 0;
11836 +	} else {
11837 +		put_cpu_var(st_event_buffer);
11838 +	}
11839 +	/* rec will be NULL if it failed */
11840 +	return rec;
11841 +}
11842 +
11843 +static inline void put_record(struct st_event_record* rec)
11844 +{
11845 +	struct local_buffer* buf;
11846 +	buf = &__get_cpu_var(st_event_buffer);
11847 +	ft_buffer_finish_write(&buf->ftbuf, rec);
11848 +	put_cpu_var(st_event_buffer);
11849 +}
11850 +
11851 +feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
11852 +{
11853 +	struct task_struct *t = (struct task_struct*) _task;
11854 +	struct st_event_record* rec = get_record(ST_NAME, t);
11855 +	int i;
11856 +	if (rec) {
11857 +		for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
11858 +			rec->data.name.cmd[i] = t->comm[i];
11859 +		put_record(rec);
11860 +	}
11861 +}
11862 +
11863 +feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
11864 +{
11865 +	struct task_struct *t = (struct task_struct*) _task;
11866 +	struct st_event_record* rec = get_record(ST_PARAM, t);
11867 +	if (rec) {
11868 +		rec->data.param.wcet      = get_exec_cost(t);
11869 +		rec->data.param.period    = get_rt_period(t);
11870 +		rec->data.param.phase     = get_rt_phase(t);
11871 +		rec->data.param.partition = get_partition(t);
11872 +		rec->data.param.class     = get_class(t);
11873 +		put_record(rec);
11874 +	}
11875 +}
11876 +
11877 +feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
11878 +{
11879 +	struct task_struct *t = (struct task_struct*) _task;
11880 +	struct st_event_record* rec = get_record(ST_RELEASE, t);
11881 +	if (rec) {
11882 +		rec->data.release.release  = get_release(t);
11883 +		rec->data.release.deadline = get_deadline(t);
11884 +		put_record(rec);
11885 +	}
11886 +}
11887 +
11888 +/* skipped: st_assigned_data, we don't use it atm */
11889 +
11890 +feather_callback void do_sched_trace_task_switch_to(unsigned long id,
11891 +						    unsigned long _task)
11892 +{
11893 +	struct task_struct *t = (struct task_struct*) _task;
11894 +	struct st_event_record* rec;
11895 +	if (is_realtime(t)) {
11896 +		rec = get_record(ST_SWITCH_TO, t);
11897 +		if (rec) {
11898 +			rec->data.switch_to.when      = now();
11899 +			rec->data.switch_to.exec_time = get_exec_time(t);
11900 +			put_record(rec);
11901 +		}
11902 +	}
11903 +}
11904 +
11905 +feather_callback void do_sched_trace_task_switch_away(unsigned long id,
11906 +						      unsigned long _task)
11907 +{
11908 +	struct task_struct *t = (struct task_struct*) _task;
11909 +	struct st_event_record* rec;
11910 +	if (is_realtime(t)) {
11911 +		rec = get_record(ST_SWITCH_AWAY, t);
11912 +		if (rec) {
11913 +			rec->data.switch_away.when      = now();
11914 +			rec->data.switch_away.exec_time = get_exec_time(t);
11915 +			put_record(rec);
11916 +		}
11917 +	}
11918 +}
11919 +
11920 +feather_callback void do_sched_trace_task_completion(unsigned long id,
11921 +						     unsigned long _task,
11922 +						     unsigned long forced)
11923 +{
11924 +	struct task_struct *t = (struct task_struct*) _task;
11925 +	struct st_event_record* rec = get_record(ST_COMPLETION, t);
11926 +	if (rec) {
11927 +		rec->data.completion.when   = now();
11928 +		rec->data.completion.forced = forced;
11929 +		put_record(rec);
11930 +	}
11931 +}
11932 +
11933 +feather_callback void do_sched_trace_task_block(unsigned long id,
11934 +						unsigned long _task)
11935 +{
11936 +	struct task_struct *t = (struct task_struct*) _task;
11937 +	struct st_event_record* rec = get_record(ST_BLOCK, t);
11938 +	if (rec) {
11939 +		rec->data.block.when      = now();
11940 +		put_record(rec);
11941 +	}
11942 +}
11943 +
11944 +feather_callback void do_sched_trace_task_resume(unsigned long id,
11945 +						 unsigned long _task)
11946 +{
11947 +	struct task_struct *t = (struct task_struct*) _task;
11948 +	struct st_event_record* rec = get_record(ST_RESUME, t);
11949 +	if (rec) {
11950 +		rec->data.resume.when      = now();
11951 +		put_record(rec);
11952 +	}
11953 +}
11954 +
11955 +feather_callback void do_sched_trace_sys_release(unsigned long id,
11956 +						 unsigned long _start)
11957 +{
11958 +	lt_t *start = (lt_t*) _start;
11959 +	struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
11960 +	if (rec) {
11961 +		rec->data.sys_release.when    = now();
11962 +		rec->data.sys_release.release = *start;
11963 +		put_record(rec);
11964 +	}
11965 +}
11966 +
11967 +feather_callback void do_sched_trace_action(unsigned long id,
11968 +					    unsigned long _task,
11969 +					    unsigned long action)
11970 +{
11971 +	struct task_struct *t = (struct task_struct*) _task;
11972 +	struct st_event_record* rec = get_record(ST_ACTION, t);
11973 +
11974 +	if (rec) {
11975 +		rec->data.action.when   = now();
11976 +		rec->data.action.action = action;
11977 +		put_record(rec);
11978 +	}
11979 +}
11980 diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
11981 new file mode 100644
11982 index 0000000..f4171fd
11983 --- /dev/null
11984 +++ b/litmus/sched_trace.c
11985 @@ -0,0 +1,252 @@
11986 +/*
11987 + * sched_trace.c -- record scheduling events to a byte stream.
11988 + */
11989 +#include <linux/spinlock.h>
11990 +#include <linux/mutex.h>
11991 +
11992 +#include <linux/fs.h>
11993 +#include <linux/slab.h>
11994 +#include <linux/miscdevice.h>
11995 +#include <asm/uaccess.h>
11996 +#include <linux/module.h>
11997 +#include <linux/sysrq.h>
11998 +
11999 +#include <linux/kfifo.h>
12000 +
12001 +#include <litmus/sched_trace.h>
12002 +#include <litmus/litmus.h>
12003 +
12004 +#define SCHED_TRACE_NAME "litmus/log"
12005 +
12006 +/* Compute size of TRACE() buffer */
12007 +#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT)
12008 +
12009 +/* Max length of one read from the buffer */
12010 +#define MAX_READ_LEN (64 * 1024)
12011 +
12012 +/* Max length for one write --- by TRACE() --- to the buffer. This is used to
12013 + * allocate a per-cpu buffer for printf() formatting. */
12014 +#define MSG_SIZE 255
12015 +
12016 +
12017 +static DEFINE_MUTEX(reader_mutex);
12018 +static atomic_t reader_cnt = ATOMIC_INIT(0);
12019 +static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE);
12020 +
12021 +
12022 +static DEFINE_RAW_SPINLOCK(log_buffer_lock);
12023 +static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
12024 +
12025 +/*
12026 + * sched_trace_log_message - Write to the trace buffer (log_buffer)
12027 + *
12028 + * This is the only function accessing the log_buffer from inside the
12029 + * kernel for writing.
12030 + * Concurrent access to sched_trace_log_message() must be serialized using
12031 + * log_buffer_lock.
12032 + * The maximum length of a formatted message is MSG_SIZE (255) characters.
12033 + */
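+/* Note: interrupts are disabled while the per-CPU fmt_buffer is in use so
+ * that a TRACE() call from an interrupt handler on the same CPU cannot
+ * clobber a partially formatted message.
+ */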
12034 +void sched_trace_log_message(const char* fmt, ...)
12035 +{
12036 +	unsigned long 	flags;
12037 +	va_list 	args;
12038 +	size_t		len;
12039 +	char*		buf;
12040 +
12041 +	if (!atomic_read(&reader_cnt))
12042 +		/* early exit if nobody is listening */
12043 +		return;
12044 +
12045 +	va_start(args, fmt);
12046 +	local_irq_save(flags);
12047 +
12048 +	/* format message */
12049 +	buf = __get_cpu_var(fmt_buffer);
12050 +	len = vscnprintf(buf, MSG_SIZE, fmt, args);
12051 +
12052 +	raw_spin_lock(&log_buffer_lock);
12053 +	/* Don't copy the trailing null byte, we don't want null bytes in a
12054 +	 * text file.
12055 +	 */
12056 +	kfifo_in(&debug_buffer, buf, len);
12057 +	raw_spin_unlock(&log_buffer_lock);
12058 +
12059 +	local_irq_restore(flags);
12060 +	va_end(args);
12061 +}
12062 +
12063 +
12064 +/*
12065 + * log_read - Read the trace buffer
12066 + *
12067 + * This function is called as a file operation from userspace.
12068 + * Readers can sleep. Access is serialized through reader_mutex
12069 + */
12070 +static ssize_t log_read(struct file *filp,
12071 +			char __user *to, size_t len,
12072 +			loff_t *f_pos)
12073 +{
12074 +	/* we ignore f_pos, this is strictly sequential */
12075 +
12076 +	ssize_t error = -EINVAL;
12077 +	char* mem;
12078 +
12079 +	if (mutex_lock_interruptible(&reader_mutex)) {
12080 +		error = -ERESTARTSYS;
12081 +		goto out;
12082 +	}
12083 +
12084 +	if (len > MAX_READ_LEN)
12085 +		len = MAX_READ_LEN;
12086 +
12087 +	mem = kmalloc(len, GFP_KERNEL);
12088 +	if (!mem) {
12089 +		error = -ENOMEM;
12090 +		goto out_unlock;
12091 +	}
12092 +
12093 +	error = kfifo_out(&debug_buffer, mem, len);
12094 +	while (!error) {
12095 +		set_current_state(TASK_INTERRUPTIBLE);
12096 +		schedule_timeout(110);
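+		/* There is no wait queue here; the reader simply polls the
+		 * kfifo. 110 jiffies is roughly 110 ms at HZ=1000 (about
+		 * 1.1 s at HZ=100). */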
12097 +		if (signal_pending(current))
12098 +			error = -ERESTARTSYS;
12099 +		else
12100 +			error = kfifo_out(&debug_buffer, mem, len);
12101 +	}
12102 +
12103 +	if (error > 0 && copy_to_user(to, mem, error))
12104 +		error = -EFAULT;
12105 +
12106 +	kfree(mem);
12107 + out_unlock:
12108 +	mutex_unlock(&reader_mutex);
12109 + out:
12110 +	return error;
12111 +}
12112 +
12113 +/*
12114 + * Enable redirection of printk() messages to the trace buffer.
12115 + * Defined in kernel/printk.c
12116 + */
12117 +extern int trace_override;
12118 +extern int trace_recurse;
12119 +
12120 +/*
12121 + * log_open - open the global log message ring buffer.
12122 + */
12123 +static int log_open(struct inode *in, struct file *filp)
12124 +{
12125 +	int error = -EINVAL;
12126 +
12127 +	if (mutex_lock_interruptible(&reader_mutex)) {
12128 +		error = -ERESTARTSYS;
12129 +		goto out;
12130 +	}
12131 +
12132 +	atomic_inc(&reader_cnt);
12133 +	error = 0;
12134 +
12135 +	printk(KERN_DEBUG
12136 +	       "sched_trace kfifo with buffer starting at: 0x%p\n",
12137 +	       debug_buffer.buf);
12138 +
12139 +	/* override printk() */
12140 +	trace_override++;
12141 +
12142 +	mutex_unlock(&reader_mutex);
12143 + out:
12144 +	return error;
12145 +}
12146 +
12147 +static int log_release(struct inode *in, struct file *filp)
12148 +{
12149 +	int error = -EINVAL;
12150 +
12151 +	if (mutex_lock_interruptible(&reader_mutex)) {
12152 +		error = -ERESTARTSYS;
12153 +		goto out;
12154 +	}
12155 +
12156 +	atomic_dec(&reader_cnt);
12157 +
12158 +	/* release printk() overriding */
12159 +	trace_override--;
12160 +
12161 +	printk(KERN_DEBUG "sched_trace kfifo released\n");
12162 +
12163 +	mutex_unlock(&reader_mutex);
12164 + out:
12165 +	return error;
12166 +}
12167 +
12168 +/*
12169 + * log_fops  - The file operations for accessing the global LITMUS log message
12170 + *             buffer.
12171 + *
12172 + * Except for opening the device file it uses the same operations as trace_fops.
12173 + */
12174 +static struct file_operations log_fops = {
12175 +	.owner   = THIS_MODULE,
12176 +	.open    = log_open,
12177 +	.release = log_release,
12178 +	.read    = log_read,
12179 +};
12180 +
12181 +static struct miscdevice litmus_log_dev = {
12182 +	.name    = SCHED_TRACE_NAME,
12183 +	.minor   = MISC_DYNAMIC_MINOR,
12184 +	.fops    = &log_fops,
12185 +};
12186 +
12187 +#ifdef CONFIG_MAGIC_SYSRQ
12188 +void dump_trace_buffer(int max)
12189 +{
12190 +	char line[80];
12191 +	int len;
12192 +	int count = 0;
12193 +
12194 +	/* potential, but very unlikely, race... */
12195 +	trace_recurse = 1;
12196 +	while ((max == 0 || count++ < max) &&
12197 +	       (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) {
12198 +		line[len] = '\0';
12199 +		printk("%s", line);
12200 +	}
12201 +	trace_recurse = 0;
12202 +}
12203 +
12204 +static void sysrq_dump_trace_buffer(int key)
12205 +{
12206 +	dump_trace_buffer(100);
12207 +}
12208 +
12209 +static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
12210 +	.handler	= sysrq_dump_trace_buffer,
12211 +	.help_msg	= "dump-trace-buffer(Y)",
12212 +	.action_msg	= "writing content of TRACE() buffer",
12213 +};
12214 +#endif
12215 +
12216 +static int __init init_sched_trace(void)
12217 +{
12218 +	printk("Initializing TRACE() device\n");
12219 +
12220 +#ifdef CONFIG_MAGIC_SYSRQ
12221 +	/* offer some debugging help */
12222 +	if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
12223 +		printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
12224 +	else
12225 +		printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
12226 +#endif
12227 +
12228 +	return misc_register(&litmus_log_dev);
12229 +}
12230 +
12231 +static void __exit exit_sched_trace(void)
12232 +{
12233 +	misc_deregister(&litmus_log_dev);
12234 +}
12235 +
12236 +module_init(init_sched_trace);
12237 +module_exit(exit_sched_trace);
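The TRACE() log above is consumed through an ordinary character device: log_open() increments reader_cnt (which arms sched_trace_log_message()) and enables the printk() redirection, while log_read() drains the kfifo, sleeping for roughly 110 jiffies between polls when the buffer is empty. A minimal userspace reader is sketched below; it assumes that SCHED_TRACE_NAME (defined elsewhere in this patch) causes the misc device to appear as /dev/litmus/log, which is only a conventional location.

	/* trace_log_reader.c -- minimal sketch of a TRACE() log consumer.
	 * The device path is an assumption about how SCHED_TRACE_NAME is
	 * exposed; adjust it to the actual node on the target system.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		int fd = open("/dev/litmus/log", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* log_read() blocks in-kernel until messages are available */
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		close(fd);
		return 0;
	}

In practice, "cat /dev/litmus/log > trace.txt" achieves the same thing; the point is only that the interface is strictly sequential and ignores f_pos.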
12238 diff --git a/litmus/srp.c b/litmus/srp.c
12239 new file mode 100644
12240 index 0000000..2ed4ec1
12241 --- /dev/null
12242 +++ b/litmus/srp.c
12243 @@ -0,0 +1,295 @@
12244 +/* ************************************************************************** */
12245 +/*                          STACK RESOURCE POLICY                             */
12246 +/* ************************************************************************** */
12247 +
12248 +#include <asm/atomic.h>
12249 +#include <linux/sched.h>
12250 +#include <linux/wait.h>
12251 +
12252 +#include <litmus/litmus.h>
12253 +#include <litmus/sched_plugin.h>
12254 +#include <litmus/fdso.h>
12255 +#include <litmus/trace.h>
12256 +
12257 +
12258 +#ifdef CONFIG_LITMUS_LOCKING
12259 +
12260 +#include <litmus/srp.h>
12261 +
12262 +srp_prioritization_t get_srp_prio;
12263 +
12264 +struct srp {
12265 +	struct list_head	ceiling;
12266 +	wait_queue_head_t	ceiling_blocked;
12267 +};
12268 +#define system_ceiling(srp) list2prio(srp->ceiling.next)
12269 +#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
12270 +
12271 +#define UNDEF_SEM -2
12272 +
12273 +atomic_t srp_objects_in_use = ATOMIC_INIT(0);
12274 +
12275 +DEFINE_PER_CPU(struct srp, srp);
12276 +
12277 +/* Initialize SRP semaphores at boot time. */
12278 +static int __init srp_init(void)
12279 +{
12280 +	int i;
12281 +
12282 +	printk("Initializing SRP per-CPU ceilings...");
12283 +	for (i = 0; i < NR_CPUS; i++) {
12284 +		init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
12285 +		INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
12286 +	}
12287 +	printk(" done!\n");
12288 +
12289 +	return 0;
12290 +}
12291 +module_init(srp_init);
12292 +
12293 +/* SRP priority comparison: smaller numeric values denote higher priority;
12294 + * ties are broken by smaller PID. Special case: priority == 0 <=> no priority.
12295 + */
12296 +static int srp_higher_prio(struct srp_priority* first,
12297 +			   struct srp_priority* second)
12298 +{
12299 +	if (!first->priority)
12300 +		return 0;
12301 +	else
12302 +		return  !second->priority ||
12303 +			first->priority < second->priority || (
12304 +			first->priority == second->priority &&
12305 +			first->pid < second->pid);
12306 +}
12307 +
12308 +
12309 +static int srp_exceeds_ceiling(struct task_struct* first,
12310 +			       struct srp* srp)
12311 +{
12312 +	struct srp_priority prio;
12313 +
12314 +	if (list_empty(&srp->ceiling))
12315 +		return 1;
12316 +	else {
12317 +		prio.pid = first->pid;
12318 +		prio.priority = get_srp_prio(first);
12319 +		return srp_higher_prio(&prio, system_ceiling(srp)) ||
12320 +			ceiling2sem(system_ceiling(srp))->owner == first;
12321 +	}
12322 +}
12323 +
12324 +static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
12325 +{
12326 +	struct list_head *pos;
12327 +	if (in_list(&prio->list)) {
12328 +		printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
12329 +		       "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
12330 +		return;
12331 +	}
12332 +	list_for_each(pos, &srp->ceiling)
12333 +		if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
12334 +			__list_add(&prio->list, pos->prev, pos);
12335 +			return;
12336 +		}
12337 +
12338 +	list_add_tail(&prio->list, &srp->ceiling);
12339 +}
12340 +
12341 +
12342 +static int lock_srp_semaphore(struct litmus_lock* l)
12343 +{
12344 +	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
12345 +
12346 +	if (!is_realtime(current))
12347 +		return -EPERM;
12348 +
12349 +	preempt_disable();
12350 +
12351 +	/* Update ceiling. */
12352 +	srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
12353 +
12354 +	/* SRP invariant: all resources available */
12355 +	BUG_ON(sem->owner != NULL);
12356 +
12357 +	sem->owner = current;
12358 +	TRACE_CUR("acquired srp 0x%p\n", sem);
12359 +
12360 +	preempt_enable();
12361 +
12362 +	return 0;
12363 +}
12364 +
12365 +static int unlock_srp_semaphore(struct litmus_lock* l)
12366 +{
12367 +	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
12368 +	int err = 0;
12369 +
12370 +	preempt_disable();
12371 +
12372 +	if (sem->owner != current) {
12373 +		err = -EINVAL;
12374 +	} else {
12375 +		/* Determine new system priority ceiling for this CPU. */
12376 +		BUG_ON(!in_list(&sem->ceiling.list));
12377 +
12378 +		list_del(&sem->ceiling.list);
12379 +		sem->owner = NULL;
12380 +
12381 +		/* Wake tasks on this CPU, if they exceed current ceiling. */
12382 +		TRACE_CUR("released srp 0x%p\n", sem);
12383 +		wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
12384 +	}
12385 +
12386 +	preempt_enable();
12387 +	return err;
12388 +}
12389 +
12390 +static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
12391 +{
12392 +	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
12393 +	int err = 0;
12394 +	struct task_struct* t = current;
12395 +	struct srp_priority t_prio;
12396 +
12397 +	if (!is_realtime(t))
12398 +		return -EPERM;
12399 +
12400 +	TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
12401 +
12402 +	preempt_disable();
12403 +
12404 +	if (sem->owner != NULL)
12405 +		err = -EBUSY;
12406 +
12407 +	if (err == 0) {
12408 +		if (sem->cpu == UNDEF_SEM)
12409 +			sem->cpu = get_partition(t);
12410 +		else if (sem->cpu != get_partition(t))
12411 +			err = -EPERM;
12412 +	}
12413 +
12414 +	if (err == 0) {
12415 +		t_prio.priority = get_srp_prio(t);
12416 +		t_prio.pid      = t->pid;
12417 +		if (srp_higher_prio(&t_prio, &sem->ceiling)) {
12418 +			sem->ceiling.priority = t_prio.priority;
12419 +			sem->ceiling.pid      = t_prio.pid;
12420 +		}
12421 +	}
12422 +
12423 +	preempt_enable();
12424 +
12425 +	return err;
12426 +}
12427 +
12428 +static int close_srp_semaphore(struct litmus_lock* l)
12429 +{
12430 +	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
12431 +	int err = 0;
12432 +
12433 +	preempt_disable();
12434 +
12435 +	if (sem->owner == current)
12436 +		unlock_srp_semaphore(l);
12437 +
12438 +	preempt_enable();
12439 +
12440 +	return err;
12441 +}
12442 +
12443 +static void deallocate_srp_semaphore(struct litmus_lock* l)
12444 +{
12445 +	struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
12446 +	atomic_dec(&srp_objects_in_use);
12447 +	kfree(sem);
12448 +}
12449 +
12450 +static struct litmus_lock_ops srp_lock_ops = {
12451 +	.open   = open_srp_semaphore,
12452 +	.close  = close_srp_semaphore,
12453 +	.lock   = lock_srp_semaphore,
12454 +	.unlock = unlock_srp_semaphore,
12455 +	.deallocate = deallocate_srp_semaphore,
12456 +};
12457 +
12458 +struct srp_semaphore* allocate_srp_semaphore(void)
12459 +{
12460 +	struct srp_semaphore* sem;
12461 +
12462 +	sem = kmalloc(sizeof(*sem), GFP_KERNEL);
12463 +	if (!sem)
12464 +		return NULL;
12465 +
12466 +	INIT_LIST_HEAD(&sem->ceiling.list);
12467 +	sem->ceiling.priority = 0;
12468 +	sem->cpu     = UNDEF_SEM;
12469 +	sem->owner   = NULL;
12470 +
12471 +	sem->litmus_lock.ops = &srp_lock_ops;
12472 +
12473 +	atomic_inc(&srp_objects_in_use);
12474 +	return sem;
12475 +}
12476 +
12477 +static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
12478 +		       void *key)
12479 +{
12480 +	int cpu = smp_processor_id();
12481 +	struct task_struct *tsk = wait->private;
12482 +	if (cpu != get_partition(tsk))
12483 +		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
12484 +			   get_partition(tsk));
12485 +	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
12486 +		return default_wake_function(wait, mode, sync, key);
12487 +	return 0;
12488 +}
12489 +
12490 +static void do_ceiling_block(struct task_struct *tsk)
12491 +{
12492 +	wait_queue_t wait = {
12493 +		.private   = tsk,
12494 +		.func      = srp_wake_up,
12495 +		.task_list = {NULL, NULL}
12496 +	};
12497 +
12498 +	tsk->state = TASK_UNINTERRUPTIBLE;
12499 +	add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
12500 +	tsk->rt_param.srp_non_recurse = 1;
12501 +	preempt_enable_no_resched();
12502 +	schedule();
12503 +	preempt_disable();
12504 +	tsk->rt_param.srp_non_recurse = 0;
12505 +	remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
12506 +}
12507 +
12508 +/* Wait for current task priority to exceed system-wide priority ceiling.
12509 + * FIXME: the hotpath should be inline.
12510 + */
12511 +void srp_ceiling_block(void)
12512 +{
12513 +	struct task_struct *tsk = current;
12514 +
12515 +	/* Only applies to real-time tasks, but optimize for RT tasks. */
12516 +	if (unlikely(!is_realtime(tsk)))
12517 +		return;
12518 +
12519 +	/* Avoid recursive ceiling blocking. */
12520 +	if (unlikely(tsk->rt_param.srp_non_recurse))
12521 +		return;
12522 +
12523 +	/* Bail out early if there aren't any SRP resources around. */
12524 +	if (likely(!atomic_read(&srp_objects_in_use)))
12525 +		return;
12526 +
12527 +	preempt_disable();
12528 +	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
12529 +		TRACE_CUR("is priority ceiling blocked.\n");
12530 +		while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
12531 +			do_ceiling_block(tsk);
12532 +		TRACE_CUR("finally exceeds system ceiling.\n");
12533 +	} else
12534 +		TRACE_CUR("is not priority ceiling blocked\n");
12535 +	preempt_enable();
12536 +}
12537 +
12538 +#endif
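The ordering implemented by srp_higher_prio() is the heart of the ceiling test above: smaller numeric values win, equal values fall back to the smaller PID, and a priority of 0 means "no priority" and never wins. The stand-alone sketch below re-implements just that comparison in userspace to make the rules concrete; struct srp_priority is re-declared locally for illustration only (the real definition, which also embeds the ceiling list head, lives in include/litmus/srp.h).

	/* srp_prio_demo.c -- illustrates the srp_higher_prio() ordering rules. */
	#include <assert.h>

	struct srp_priority { unsigned int priority; int pid; };

	static int srp_higher_prio(struct srp_priority *first,
				   struct srp_priority *second)
	{
		if (!first->priority)
			return 0;	/* "no priority" never wins */
		return !second->priority ||
			first->priority < second->priority ||
			(first->priority == second->priority &&
			 first->pid < second->pid);
	}

	int main(void)
	{
		struct srp_priority a    = { .priority = 10, .pid = 100 };
		struct srp_priority b    = { .priority = 20, .pid =  50 };
		struct srp_priority c    = { .priority = 10, .pid =  50 };
		struct srp_priority none = { .priority =  0, .pid =   1 };

		assert( srp_higher_prio(&a, &b));	/* smaller value wins    */
		assert(!srp_higher_prio(&b, &a));
		assert( srp_higher_prio(&c, &a));	/* tie: smaller PID wins */
		assert(!srp_higher_prio(&none, &a));	/* 0 means no priority   */
		assert( srp_higher_prio(&a, &none));
		return 0;
	}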
12539 diff --git a/litmus/sync.c b/litmus/sync.c
12540 new file mode 100644
12541 index 0000000..bf75fde
12542 --- /dev/null
12543 +++ b/litmus/sync.c
12544 @@ -0,0 +1,104 @@
12545 +/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
12546 + *
12547 + *
12548 + */
12549 +
12550 +#include <asm/atomic.h>
12551 +#include <asm/uaccess.h>
12552 +#include <linux/spinlock.h>
12553 +#include <linux/list.h>
12554 +#include <linux/sched.h>
12555 +#include <linux/completion.h>
12556 +
12557 +#include <litmus/litmus.h>
12558 +#include <litmus/sched_plugin.h>
12559 +#include <litmus/jobs.h>
12560 +
12561 +#include <litmus/sched_trace.h>
12562 +
12563 +static DECLARE_COMPLETION(ts_release);
12564 +
12565 +static long do_wait_for_ts_release(void)
12566 +{
12567 +	long ret = 0;
12568 +
12569 +	/* If the interruption races with a release, the completion object
12570 +	 * may have a non-zero counter. To avoid this problem, this should
12571 +	 * be replaced by wait_for_completion().
12572 +	 *
12573 +	 * For debugging purposes, this is interruptible for now.
12574 +	 */
12575 +	ret = wait_for_completion_interruptible(&ts_release);
12576 +
12577 +	return ret;
12578 +}
12579 +
12580 +int count_tasks_waiting_for_release(void)
12581 +{
12582 +	unsigned long flags;
12583 +	int task_count = 0;
12584 +	struct list_head *pos;
12585 +
12586 +	spin_lock_irqsave(&ts_release.wait.lock, flags);
12587 +	list_for_each(pos, &ts_release.wait.task_list) {
12588 +		task_count++;
12589 +	}
12590 +	spin_unlock_irqrestore(&ts_release.wait.lock, flags);
12591 +
12592 +	return task_count;
12593 +}
12594 +
12595 +static long do_release_ts(lt_t start)
12596 +{
12597 +	int  task_count = 0;
12598 +	unsigned long flags;
12599 +	struct list_head	*pos;
12600 +	struct task_struct 	*t;
12601 +
12602 +
12603 +	spin_lock_irqsave(&ts_release.wait.lock, flags);
12604 +	TRACE("<<<<<< synchronous task system release >>>>>>\n");
12605 +
12606 +	sched_trace_sys_release(&start);
12607 +	list_for_each(pos, &ts_release.wait.task_list) {
12608 +		t = (struct task_struct*) list_entry(pos,
12609 +						     struct __wait_queue,
12610 +						     task_list)->private;
12611 +		task_count++;
12612 +		litmus->release_at(t, start + t->rt_param.task_params.phase);
12613 +		sched_trace_task_release(t);
12614 +	}
12615 +
12616 +	spin_unlock_irqrestore(&ts_release.wait.lock, flags);
12617 +
12618 +	complete_n(&ts_release, task_count);
12619 +
12620 +	return task_count;
12621 +}
12622 +
12623 +
12624 +asmlinkage long sys_wait_for_ts_release(void)
12625 +{
12626 +	long ret = -EPERM;
12627 +	struct task_struct *t = current;
12628 +
12629 +	if (is_realtime(t))
12630 +		ret = do_wait_for_ts_release();
12631 +
12632 +	return ret;
12633 +}
12634 +
12635 +
12636 +asmlinkage long sys_release_ts(lt_t __user *__delay)
12637 +{
12638 +	long ret;
12639 +	lt_t delay;
12640 +
12641 +	/* FIXME: check capabilities... */
12642 +
12643 +	ret = copy_from_user(&delay, __delay, sizeof(delay));
12644 +	if (ret == 0)
12645 +		ret = do_release_ts(litmus_clock() + delay);
12646 +
12647 +	return ret;
12648 +}
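Taken together, sys_wait_for_ts_release() and sys_release_ts() implement a barrier-style start protocol: each real-time task blocks on the ts_release completion, and a coordinator wakes all of them with a common release time, offset per task by its phase. The sketch below shows the intended usage pattern from userspace; it assumes the liblitmus wrappers wait_for_ts_release() and release_ts(), which are userspace-library conventions rather than part of this patch.

	/* Sketch of the synchronous task-system release handshake, assuming
	 * liblitmus wrappers around the two system calls above.
	 */
	#include <litmus.h>		/* liblitmus userspace header */

	/* Called by each real-time task once its parameters are set up
	 * and it has entered real-time mode. */
	void rt_task_start(void)
	{
		wait_for_ts_release();	/* blocks until the coordinator fires */
		/* the first job begins at (release time + task phase) */
	}

	/* Called once by a coordinating (non-real-time) process. */
	void coordinator(void)
	{
		lt_t delay = 1000000;	/* lt_t is in nanoseconds: 1 ms */
		release_ts(&delay);	/* releases every waiting task */
	}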
12649 diff --git a/litmus/trace.c b/litmus/trace.c
12650 new file mode 100644
12651 index 0000000..3c35c52
12652 --- /dev/null
12653 +++ b/litmus/trace.c
12654 @@ -0,0 +1,225 @@
12655 +#include <linux/sched.h>
12656 +#include <linux/module.h>
12657 +#include <linux/uaccess.h>
12658 +
12659 +#include <litmus/ftdev.h>
12660 +#include <litmus/litmus.h>
12661 +#include <litmus/trace.h>
12662 +
12663 +/******************************************************************************/
12664 +/*                          Allocation                                        */
12665 +/******************************************************************************/
12666 +
12667 +static struct ftdev overhead_dev;
12668 +
12669 +#define trace_ts_buf overhead_dev.minor[0].buf
12670 +
12671 +static unsigned int ts_seq_no = 0;
12672 +
12673 +DEFINE_PER_CPU(atomic_t, irq_fired_count);
12674 +
12675 +static inline void clear_irq_fired(void)
12676 +{
12677 +	atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
12678 +}
12679 +
12680 +static inline unsigned int get_and_clear_irq_fired(void)
12681 +{
12682 +	/* This is potentially not atomic  since we might migrate if
12683 +	 * preemptions are not disabled. As a tradeoff between
12684 +	 * accuracy and tracing overheads, this seems acceptable.
12685 +	 * If it proves to be a problem, then one could add a callback
12686 +	 * from the migration code to invalidate irq_fired_count.
12687 +	 */
12688 +	return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
12689 +}
12690 +
12691 +static inline void __save_irq_flags(struct timestamp *ts)
12692 +{
12693 +	unsigned int irq_count;
12694 +
12695 +	irq_count     = get_and_clear_irq_fired();
12696 +	/* Store how many interrupts occurred. */
12697 +	ts->irq_count = irq_count;
12698 +	/* Extra flag because ts->irq_count overflows quickly. */
12699 +	ts->irq_flag  = irq_count > 0;
12700 +}
12701 +
12702 +static inline void __save_timestamp_cpu(unsigned long event,
12703 +					uint8_t type, uint8_t cpu)
12704 +{
12705 +	unsigned int seq_no;
12706 +	struct timestamp *ts;
12707 +	seq_no = fetch_and_inc((int *) &ts_seq_no);
12708 +	if (ft_buffer_start_write(trace_ts_buf, (void**)  &ts)) {
12709 +		ts->event     = event;
12710 +		ts->seq_no    = seq_no;
12711 +		ts->cpu       = cpu;
12712 +		ts->task_type = type;
12713 +		__save_irq_flags(ts);
12714 +		barrier();
12715 +		/* prevent re-ordering of ft_timestamp() */
12716 +		ts->timestamp = ft_timestamp();
12717 +		ft_buffer_finish_write(trace_ts_buf, ts);
12718 +	}
12719 +}
12720 +
12721 +static void __add_timestamp_user(struct timestamp *pre_recorded)
12722 +{
12723 +	unsigned int seq_no;
12724 +	struct timestamp *ts;
12725 +	seq_no = fetch_and_inc((int *) &ts_seq_no);
12726 +	if (ft_buffer_start_write(trace_ts_buf, (void**)  &ts)) {
12727 +		*ts = *pre_recorded;
12728 +		ts->seq_no = seq_no;
12729 +		__save_irq_flags(ts);
12730 +		ft_buffer_finish_write(trace_ts_buf, ts);
12731 +	}
12732 +}
12733 +
12734 +static inline void __save_timestamp(unsigned long event,
12735 +				   uint8_t type)
12736 +{
12737 +	__save_timestamp_cpu(event, type, raw_smp_processor_id());
12738 +}
12739 +
12740 +feather_callback void save_timestamp(unsigned long event)
12741 +{
12742 +	__save_timestamp(event, TSK_UNKNOWN);
12743 +}
12744 +
12745 +feather_callback void save_timestamp_def(unsigned long event,
12746 +					 unsigned long type)
12747 +{
12748 +	__save_timestamp(event, (uint8_t) type);
12749 +}
12750 +
12751 +feather_callback void save_timestamp_task(unsigned long event,
12752 +					  unsigned long t_ptr)
12753 +{
12754 +	int rt = is_realtime((struct task_struct *) t_ptr);
12755 +	__save_timestamp(event, rt ? TSK_RT : TSK_BE);
12756 +}
12757 +
12758 +feather_callback void save_timestamp_cpu(unsigned long event,
12759 +					 unsigned long cpu)
12760 +{
12761 +	__save_timestamp_cpu(event, TSK_UNKNOWN, cpu);
12762 +}
12763 +
12764 +feather_callback void save_task_latency(unsigned long event,
12765 +					unsigned long when_ptr)
12766 +{
12767 +	lt_t now = litmus_clock();
12768 +	lt_t *when = (lt_t*) when_ptr;
12769 +	unsigned int seq_no;
12770 +	int cpu = raw_smp_processor_id();
12771 +	struct timestamp *ts;
12772 +
12773 +	seq_no = fetch_and_inc((int *) &ts_seq_no);
12774 +	if (ft_buffer_start_write(trace_ts_buf, (void**)  &ts)) {
12775 +		ts->event     = event;
12776 +		ts->timestamp = now - *when;
12777 +		ts->seq_no    = seq_no;
12778 +		ts->cpu       = cpu;
12779 +		ts->task_type = TSK_RT;
12780 +		__save_irq_flags(ts);
12781 +		ft_buffer_finish_write(trace_ts_buf, ts);
12782 +	}
12783 +}
12784 +
12785 +/******************************************************************************/
12786 +/*                        DEVICE FILE DRIVER                                  */
12787 +/******************************************************************************/
12788 +
12789 +/*
12790 + * Should be 8M; this is about the most we can request from the buddy
12791 + * allocator in one contiguous chunk (MAX_ORDER), and we might not get even that much.
12792 + */
12793 +#define NO_TIMESTAMPS (2 << 16)
12794 +
12795 +static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
12796 +{
12797 +	unsigned int count = NO_TIMESTAMPS;
12798 +
12799 +	/* An overhead-tracing timestamp should be exactly 16 bytes long. */
12800 +	BUILD_BUG_ON(sizeof(struct timestamp) != 16);
12801 +
12802 +	while (count && !trace_ts_buf) {
12803 +		printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
12804 +		ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
12805 +		count /= 2;
12806 +	}
12807 +	return ftdev->minor[idx].buf ? 0 : -ENOMEM;
12808 +}
12809 +
12810 +static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
12811 +{
12812 +	free_ft_buffer(ftdev->minor[idx].buf);
12813 +	ftdev->minor[idx].buf = NULL;
12814 +}
12815 +
12816 +static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
12817 +					 const char __user *from)
12818 +{
12819 +	ssize_t consumed = 0;
12820 +	struct timestamp ts;
12821 +
12822 +	/* don't give us partial timestamps */
12823 +	if (len % sizeof(ts))
12824 +		return -EINVAL;
12825 +
12826 +	while (len >= sizeof(ts)) {
12827 +		if (copy_from_user(&ts, from, sizeof(ts))) {
12828 +			consumed = -EFAULT;
12829 +			goto out;
12830 +		}
12831 +		len  -= sizeof(ts);
12832 +		from += sizeof(ts);
12833 +		consumed += sizeof(ts);
12834 +
12835 +		__add_timestamp_user(&ts);
12836 +	}
12837 +
12838 +out:
12839 +	return consumed;
12840 +}
12841 +
12842 +static int __init init_ft_overhead_trace(void)
12843 +{
12844 +	int err, cpu;
12845 +
12846 +	printk("Initializing Feather-Trace overhead tracing device.\n");
12847 +	err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace");
12848 +	if (err)
12849 +		goto err_out;
12850 +
12851 +	overhead_dev.alloc = alloc_timestamp_buffer;
12852 +	overhead_dev.free  = free_timestamp_buffer;
12853 +	overhead_dev.write = write_timestamp_from_user;
12854 +
12855 +	err = register_ftdev(&overhead_dev);
12856 +	if (err)
12857 +		goto err_dealloc;
12858 +
12859 +	/* initialize IRQ flags */
12860 +	for (cpu = 0; cpu < NR_CPUS; cpu++)  {
12861 +		atomic_set(&per_cpu(irq_fired_count, cpu), 0);
12862 +	}
12863 +
12864 +	return 0;
12865 +
12866 +err_dealloc:
12867 +	ftdev_exit(&overhead_dev);
12868 +err_out:
12869 +	printk(KERN_WARNING "Could not register ft_trace module.\n");
12870 +	return err;
12871 +}
12872 +
12873 +static void __exit exit_ft_overhead_trace(void)
12874 +{
12875 +	ftdev_exit(&overhead_dev);
12876 +}
12877 +
12878 +module_init(init_ft_overhead_trace);
12879 +module_exit(exit_ft_overhead_trace);
12880 
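The timestamps recorded by trace.c are binary, fixed-size records: the BUILD_BUG_ON() above pins struct timestamp to exactly 16 bytes, and each record carries the event ID, sequence number, CPU, task type, and the interrupt bookkeeping filled in by __save_irq_flags(). Normally the Feather-Trace userspace tools (ftcat/ft2csv) capture and decode them; the sketch below only exercises the raw transport by hex-dumping whole 16-byte records. The device path is an assumption about how ftdev exposes minor 0 of the "ft_trace" device, and the actual field layout is defined in include/litmus/trace.h.

	/* ft_raw_dump.c -- hex-dump raw 16-byte timestamp records.
	 * The path below is an assumed location of ftdev minor 0.
	 */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint8_t rec[16];	/* sizeof(struct timestamp) == 16 */
		ssize_t n;
		int fd = open("/dev/litmus/ft_trace0", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* assume the device hands out whole records at a time */
		while ((n = read(fd, rec, sizeof(rec))) == (ssize_t) sizeof(rec)) {
			for (unsigned i = 0; i < sizeof(rec); i++)
				printf("%02x%c", rec[i],
				       i == sizeof(rec) - 1 ? '\n' : ' ');
		}
		close(fd);
		return 0;
	}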
