Attachment 'litmus-rt-semi-part-with-edfos.patch'

Download

   1 diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
   2 index 6b4ffedb93c9..dd78ef687c5e 100644
   3 --- a/arch/x86/vdso/Makefile
   4 +++ b/arch/x86/vdso/Makefile
   5 @@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
   6  
   7  export CPPFLAGS_vdso.lds += -P -C
   8  
   9 -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
  10 +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
  11  		      	-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
  12  
  13  $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
  14 @@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y)		+= sysenter
  15  vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
  16  
  17  CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
  18 -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
  19 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
  20  
  21  # This makes sure the $(obj) subdirectory exists even though vdso32/
  22  # is not a kbuild sub-make subdirectory.
  23 diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
  24 index 5d20276e44f4..867239875eef 100644
  25 --- a/include/litmus/litmus.h
  26 +++ b/include/litmus/litmus.h
  27 @@ -88,7 +88,7 @@ inline static int budget_exhausted(struct task_struct* t)
  28  inline static lt_t budget_remaining(struct task_struct* t)
  29  {
  30  	if (!budget_exhausted(t))
  31 -		return get_exec_time(t) - get_exec_cost(t);
  32 +		return get_exec_cost(t) - get_exec_time(t);
  33  	else
  34  		/* avoid overflow */
  35  		return 0;
  36 diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
  37 index a7a183f34a80..bc3bbd07ef26 100644
  38 --- a/include/litmus/rt_param.h
  39 +++ b/include/litmus/rt_param.h
  40 @@ -1,3 +1,6 @@
  41 +#include <linux/threads.h>
  42 +#include <litmus/bheap.h>
  43 +
  44  /*
  45   * Definition of the scheduler plugin interface.
  46   *
  47 @@ -33,6 +36,91 @@ typedef enum {
  48  	PRECISE_ENFORCEMENT  /* NOT IMPLEMENTED - enforced with hrtimers */
  49  } budget_policy_t;
  50  
  51 +
  52 +/* The parameters for EDF-Fm scheduling algorithm.
  53 + * Each task may be fixed or migratory. Migratory tasks may
   54 + * migrate between 2 (contiguous) CPUs only. NR_CPUS_EDF_FM = 2.
  55 + */
  56 +#define NR_CPUS_EDF_FM	2
  57 +#define NR_CPUS_EDF_OS 24
  58 +
  59 +struct edffm_params {
  60 +	/* EDF-fm where can a migratory task execute? */
  61 +	unsigned int	cpus[NR_CPUS_EDF_FM];
  62 +	/* how many cpus are used by this task?
  63 +	 * fixed = 0, migratory = (NR_CPUS_EDF_FM - 1)
  64 +	 * Efficient way to allow writing cpus[nr_cpus].
  65 +	 */
  66 +	unsigned int	nr_cpus;
  67 +	/* Fraction of this task exec_cost that each CPU should handle.
  68 +	 * We keep the fraction divided in num/denom : a matrix of
   69 + * (2 rows) x (NR_CPUS_EDF_FM columns).
   70 + * The first row holds the numerators of the fractions.
   71 + * The second row holds the denominators.
  72 +	 * In EDF-fm this is a 2*2 matrix
  73 +	 */
  74 +	lt_t		fraction[2][NR_CPUS_EDF_FM];
  75 +};
  76 +
  77 +struct edfos_params {
   78 +	/* The first CPU. */
   79 +	unsigned int first_cpu;
   80 +	/* Whether this task is a migrating task */
  81 +	unsigned int migrat;
  82 +	/* Time of next subtask release or deadline */
  83 +	int heap_data[NR_CPUS_EDF_OS];
  84 +	/* Fraction of this task exec_cost that each CPU should handle.
  85 +	 * We keep the fraction divided in num/denom : a matrix of
  86 +	 * (NR_CPUS_EDF_OS rows) x (2 columns).
  87 +	 * The first column is the numerator of the fraction.
  88 +	 * The second column is the denominator.
  89 +	 */
  90 +	lt_t		fraction[NR_CPUS_EDF_OS][2];
  91 +	struct bheap release_queue;
  92 +	struct bheap ready_queue;
  93 +};
  94 +
  95 +/* Parameters for NPS-F semi-partitioned scheduling algorithm.
  96 + * Each (cpu, budget) entry defines the share ('budget' in ns, a % of
  97 + * the slot_length) of the notional processor on the CPU 'cpu'.
  98 + * This structure is used by the library - syscall interface in order
  99 + * to go through the overhead of a syscall only once per server.
 100 + */
 101 +struct npsf_budgets {
 102 +	int	cpu;
 103 +	lt_t	budget;
 104 +};
 105 +
 106 +/* The parameters for the EDF-WM semi-partitioned scheduler.
 107 + * Each task may be split across multiple cpus. Each per-cpu allocation
 108 + * is called a 'slice'.
 109 + */
 110 +#define MAX_EDF_WM_SLICES 24
 111 +#define MIN_EDF_WM_SLICE_SIZE 50000 /* .05 millisecond = 50us */
 112 +
 113 +struct edf_wm_slice {
 114 +	/* on which CPU is this slice allocated */
 115 +	unsigned int cpu;
 116 +	/* relative deadline from job release (not from slice release!) */
 117 +	lt_t deadline;
 118 +	/* budget of this slice; must be precisely enforced */
 119 +	lt_t budget;
 120 +	/* offset of this slice relative to the job release */
 121 +	lt_t offset;
 122 +};
 123 +
 124 +/* If a job is not sliced across multiple CPUs, then
 125 + * count is set to zero and none of the slices is used.
 126 + * This implies that count == 1 is illegal.
 127 + */
 128 +struct edf_wm_params {
 129 +	/* enumeration of all slices */
 130 +	struct edf_wm_slice slices[MAX_EDF_WM_SLICES];
 131 +
 132 +	/* how many slices are defined? */
 133 +	unsigned int count;
 134 +};
 135 +
 136  struct rt_task {
 137  	lt_t 		exec_cost;
 138  	lt_t 		period;
 139 @@ -40,6 +128,25 @@ struct rt_task {
 140  	unsigned int	cpu;
 141  	task_class_t	cls;
 142  	budget_policy_t budget_policy; /* ignored by pfair */
 143 +
 144 +	/* parameters used by the semi-partitioned algorithms */
 145 +	union {
 146 +		/* EDF-Fm; defined in sched_edf_fm.c */
 147 +		struct edffm_params fm;
 148 +
 149 +		/* EDF-os; defined in sched_edf_os.c */
 150 +		struct edfos_params os;
 151 +
 152 +		/* NPS-F; defined in sched_npsf.c
 153 +		 * id for the server (notional processor) that holds
  154 +		 * this task; the same npsf_id can be assigned to "the same"
 155 +		 * server split on different cpus
 156 +		 */
 157 +		int npsf_id;
 158 +
 159 +		/* EDF-WM; defined in sched_edf_wm.c */
 160 +		struct edf_wm_params wm;
 161 +	} semi_part;
 162  };
 163  
 164  /* The definition of the data that is shared between the kernel and real-time
 165 @@ -184,6 +291,27 @@ struct rt_param {
 166  
 167  	/* Pointer to the page shared between userspace and kernel. */
 168  	struct control_page * ctrl_page;
 169 +
 170 +	/* runtime info for the semi-part plugins */
 171 +	union {
 172 +		/* EDF-Fm and EDF-os runtime information
 173 +		 * number of jobs handled by this cpu
 174 +		 * (to determine next cpu for a migrating task)
 175 +		 */
 176 +		unsigned int	cpu_job_no[NR_CPUS_EDF_OS];
 177 +
 178 +		/* EDF-WM runtime information */
 179 +		struct {
 180 +			/* at which exec time did the current slice start? */
 181 +			lt_t exec_time;
 182 +			/* when did the job suspend? */
 183 +			lt_t suspend_time;
 184 +			/* cached job parameters */
 185 +			lt_t job_release, job_deadline;
 186 +			/* pointer to the current slice */
 187 +			struct edf_wm_slice* slice;
 188 +		} wm;
 189 +	} semi_part;
 190  };
 191  
 192  /*	Possible RT flags	*/
 193 diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
 194 index 9c1c9f28ba79..7ea9176624ff 100644
 195 --- a/include/litmus/sched_plugin.h
 196 +++ b/include/litmus/sched_plugin.h
 197 @@ -6,6 +6,8 @@
 198  #define _LINUX_SCHED_PLUGIN_H_
 199  
 200  #include <linux/sched.h>
 201 +/* NSEC_PER... conversions */
 202 +#include <linux/time.h>
 203  
 204  /* struct for semaphore with priority inheritance */
 205  struct pi_semaphore {
 206 @@ -136,6 +138,9 @@ extern struct sched_plugin *litmus;
 207  /* cluster size: cache_index = 2 L2, cache_index = 3 L3 */
 208  extern int cluster_cache_index;
 209  
 210 +/* Slot length (ns) for NPS-F semi-part. algo */
 211 +extern lt_t npsf_slot_length;
 212 +
 213  int register_sched_plugin(struct sched_plugin* plugin);
 214  struct sched_plugin* find_sched_plugin(const char* name);
 215  int print_sched_plugins(char* buf, int max);
 216 diff --git a/include/litmus/trace.h b/include/litmus/trace.h
 217 index b32c71180774..6afbf96ef9e4 100644
 218 --- a/include/litmus/trace.h
 219 +++ b/include/litmus/trace.h
 220 @@ -78,6 +78,8 @@ feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu)
 221  #define TS_TICK_START(t)		TTIMESTAMP(110, t)
 222  #define TS_TICK_END(t) 			TTIMESTAMP(111, t)
 223  
 224 +#define TS_PULL_TIMER_START		TIMESTAMP(112)
 225 +#define TS_PULL_TIMER_END		TIMESTAMP(113)
 226  
 227  #define TS_PLUGIN_SCHED_START		/* TIMESTAMP(120) */  /* currently unused */
 228  #define TS_PLUGIN_SCHED_END		/* TIMESTAMP(121) */
 229 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
 230 index f0618e75348d..4e82c52722c8 100644
 231 --- a/include/litmus/unistd_64.h
 232 +++ b/include/litmus/unistd_64.h
 233 @@ -33,5 +33,7 @@ __SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
 234  __SYSCALL(__NR_release_ts, sys_release_ts)
 235  #define __NR_null_call				__LSC(13)
 236  __SYSCALL(__NR_null_call, sys_null_call)
 237 +#define __NR_add_server				__LSC(14)
 238 +__SYSCALL(__NR_add_server, sys_add_server)
 239  
 240 -#define NR_litmus_syscalls 14
 241 +#define NR_litmus_syscalls 15
 242 diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
 243 index fdf95968e517..23d3712012f4 100644
 244 --- a/kernel/hrtimer.c
 245 +++ b/kernel/hrtimer.c
 246 @@ -47,6 +47,7 @@
 247  #include <linux/timer.h>
 248  
 249  #include <litmus/litmus.h>
 250 +#include <litmus/trace.h>
 251  
 252  #include <asm/uaccess.h>
 253  
 254 @@ -1063,6 +1064,7 @@ void hrtimer_pull(void)
 255  	struct hrtimer_start_on_info *info;
 256  	struct list_head *pos, *safe, list;
 257  
 258 +	TS_PULL_TIMER_START;
 259  	raw_spin_lock(&base->lock);
 260  	list_replace_init(&base->to_pull, &list);
 261  	raw_spin_unlock(&base->lock);
 262 @@ -1073,6 +1075,7 @@ void hrtimer_pull(void)
 263  		list_del(pos);
 264  		hrtimer_start(info->timer, info->time, info->mode);
 265  	}
 266 +	TS_PULL_TIMER_END;
 267  }
 268  
 269  /**
 270 diff --git a/litmus/Makefile b/litmus/Makefile
 271 index f301d2842e43..b243093abc6d 100644
 272 --- a/litmus/Makefile
 273 +++ b/litmus/Makefile
 274 @@ -14,7 +14,11 @@ obj-y     = sched_plugin.o litmus.o \
 275  	    bheap.o \
 276  	    ctrldev.o \
 277  	    sched_gsn_edf.o \
 278 -	    sched_psn_edf.o
 279 +	    sched_psn_edf.o \
 280 +	    sched_edf_wm.o \
 281 +	    sched_npsf.o \
 282 +	    sched_edf_fm.o \
 283 +		sched_edf_os.o
 284  
 285  obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
 286  obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
 287 diff --git a/litmus/litmus.c b/litmus/litmus.c
 288 index b04a42b0da9c..2f780222d8e8 100644
 289 --- a/litmus/litmus.c
 290 +++ b/litmus/litmus.c
 291 @@ -632,6 +632,55 @@ static int proc_write_cluster_size(struct file *file,
 292  	return len;
 293  }
 294  
 295 +static int proc_read_npsf_slot_length(char *page, char **start,
 296 +				    off_t off, int count,
 297 +				    int *eof, void *data)
 298 +{
 299 +	return snprintf(page, PAGE_SIZE, "%d us\n",
 300 +			(int) (npsf_slot_length / NSEC_PER_USEC));
 301 +}
 302 +
 303 +extern void npsf_hrtimers_cleanup(void);
 304 +/* NPS-F slot length in us.
 305 + *
 306 + * Writing 0 as npsf_slot_length will trigger the removal of the
 307 + * hrtimers for the domain_reschedule_tick() in the NPS-F plugin.
 308 + */
 309 +static int proc_write_npsf_slot_length(struct file *file,
 310 +				     const char *buffer,
 311 +				     unsigned long count,
 312 +				     void *data)
 313 +{
 314 +	int err, slot_length;
 315 +	char msg[64];
 316 +
 317 +	if (count > 63)
 318 +		return -EINVAL;
 319 +
 320 +	if (copy_from_user(msg, buffer, count))
 321 +		return -EFAULT;
 322 +
 323 +	/* terminate */
 324 +	msg[count] = '\0';
 325 +	/* chomp */
 326 +	if (count > 1 && msg[count - 1] == '\n')
 327 +		msg[count - 1] = '\0';
 328 +
 329 +	err = sscanf(msg, "%d", &slot_length);
 330 +
 331 +	if (err == 1) {
 332 +		if (!slot_length) {
 333 +			npsf_hrtimers_cleanup();
 334 +			/* reset to default */
 335 +			slot_length = 5000;
 336 +		}
 337 +		npsf_slot_length = (lt_t)((lt_t) slot_length * NSEC_PER_USEC);
 338 +		return count;
 339 +	}
 340 +
 341 +	return -EINVAL;
 342 +}
 343 +
 344  #ifdef CONFIG_RELEASE_MASTER
 345  static int proc_read_release_master(char *page, char **start,
 346  				    off_t off, int count,
 347 @@ -691,7 +740,8 @@ static struct proc_dir_entry *litmus_dir = NULL,
 348  #ifdef CONFIG_RELEASE_MASTER
 349  	*release_master_file = NULL,
 350  #endif
 351 -	*clus_cache_idx_file = NULL;
 352 +	*clus_cache_idx_file = NULL,
 353 +	*npsf_slot_length_file = NULL;
 354  
 355  static int __init init_litmus_proc(void)
 356  {
 357 @@ -733,6 +783,16 @@ static int __init init_litmus_proc(void)
 358  	clus_cache_idx_file->read_proc = proc_read_cluster_size;
 359  	clus_cache_idx_file->write_proc = proc_write_cluster_size;
 360  
 361 +	npsf_slot_length_file = create_proc_entry("npsf_slot_length",
 362 +						0644, litmus_dir);
 363 +	if (!npsf_slot_length_file) {
 364 +		printk(KERN_ERR "Could not allocate npsf_slot_length "
 365 +		       "procfs entry.\n");
 366 +		return -ENOMEM;
 367 +	}
 368 +	npsf_slot_length_file->read_proc = proc_read_npsf_slot_length;
 369 +	npsf_slot_length_file->write_proc = proc_write_npsf_slot_length;
 370 +
 371  	stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
 372  					   proc_read_stats, NULL);
 373  
 374 @@ -752,6 +812,8 @@ static void exit_litmus_proc(void)
 375  		remove_proc_entry("active_plugin", litmus_dir);
 376  	if (clus_cache_idx_file)
 377  		remove_proc_entry("cluster_cache", litmus_dir);
 378 +	if (npsf_slot_length_file)
 379 +		remove_proc_entry("npsf_slot_length", litmus_dir);
 380  #ifdef CONFIG_RELEASE_MASTER
 381  	if (release_master_file)
 382  		remove_proc_entry("release_master", litmus_dir);
 383 diff --git a/litmus/sched_edf_fm.c b/litmus/sched_edf_fm.c
 384 new file mode 100644
 385 index 000000000000..0465220f9dbb
 386 --- /dev/null
 387 +++ b/litmus/sched_edf_fm.c
 388 @@ -0,0 +1,571 @@
 389 +/*
 390 + * litmus/sched_edf_fm.c
 391 + *
 392 + * Implementation of the EDF-fm scheduling algorithm.
 393 + */
 394 +
 395 +#include <linux/percpu.h>
 396 +#include <linux/sched.h>
 397 +#include <linux/list.h>
 398 +#include <linux/spinlock.h>
 399 +
 400 +#include <linux/module.h>
 401 +
 402 +#include <litmus/litmus.h>
 403 +#include <litmus/jobs.h>
 404 +#include <litmus/sched_plugin.h>
 405 +#include <litmus/edf_common.h>
 406 +
 407 +typedef struct {
 408 +	rt_domain_t 		domain;
 409 +	int          		cpu;
 410 +	struct task_struct* 	scheduled; /* only RT tasks */
 411 +/* domain lock */
 412 +#define slock domain.ready_lock
 413 +} edffm_domain_t;
 414 +
 415 +DEFINE_PER_CPU(edffm_domain_t, edffm_domains);
 416 +
 417 +#define local_edffm		(&__get_cpu_var(edffm_domains))
 418 +#define remote_edf(cpu)		(&per_cpu(edffm_domains, cpu).domain)
 419 +#define remote_edffm(cpu)	(&per_cpu(edffm_domains, cpu))
 420 +#define task_edf(task)		remote_edf(get_partition(task))
 421 +#define task_edffm(task)	remote_edffm(get_partition(task))
 422 +
 423 +#define edffm_params(t)		(t->rt_param.task_params.semi_part.fm)
 424 +
 425 +/* Is the task a migratory task? */
 426 +#define is_migrat_task(task)	(edffm_params(task).nr_cpus)
 427 +/* t is on the wrong CPU (it should be requeued properly) */
 428 +#define wrong_cpu(t)	is_migrat_task((t)) && task_cpu((t)) != get_partition((t))
 429 +/* Get next CPU */
 430 +#define migrat_next_cpu(t)	\
 431 +	((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
 432 +		edffm_params(t).cpus[1] : \
 433 +		edffm_params(t).cpus[0])
 434 +/* Get current cpu */
 435 +#define migrat_cur_cpu(t)	\
 436 +	((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
 437 +		edffm_params(t).cpus[0] : \
 438 +		edffm_params(t).cpus[1])
 439 +/* Manipulate share for current cpu */
 440 +#define cur_cpu_fract_num(t)	\
 441 +	((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
 442 +		edffm_params(t).fraction[0][0] : \
 443 +		edffm_params(t).fraction[0][1])
 444 +#define cur_cpu_fract_den(t)	\
 445 +	((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
 446 +		edffm_params(t).fraction[1][0] : \
 447 +		edffm_params(t).fraction[1][1])
 448 +/* Get job number for current cpu */
 449 +#define cur_cpu_job_no(t)	\
 450 +	((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
 451 +		tsk_rt(t)->semi_part.cpu_job_no[0] : \
 452 +		tsk_rt(t)->semi_part.cpu_job_no[1])
 453 +/* What is the current cpu position in the array? */
 454 +#define edffm_cpu_pos(cpu,t)    \
 455 +	((cpu == edffm_params(t).cpus[0]) ? \
 456 +	 0 : 1)
 457 +
 458 +/*
 459 + * EDF-fm: migratory tasks have higher prio than fixed, EDF in both classes.
 460 + * (Both first and second may be NULL).
 461 + */
 462 +int edffm_higher_prio(struct task_struct* first, struct task_struct* second)
 463 +{
 464 +	if ((first && edffm_params(first).nr_cpus) ||
 465 +			(second && edffm_params(second).nr_cpus)) {
 466 +		if ((first && edffm_params(first).nr_cpus) &&
 467 +			(second && edffm_params(second).nr_cpus))
 468 +			/* both are migrating */
 469 +			return edf_higher_prio(first, second);
 470 +
 471 +		if (first && edffm_params(first).nr_cpus)
 472 +			/* first is migrating */
 473 +			return 1;
 474 +		else
 475 +			/* second is migrating */
 476 +			return 0;
 477 +	}
 478 +
 479 +	/* both are fixed or not real time */
 480 +	return edf_higher_prio(first, second);
 481 +}
 482 +
 483 +int edffm_ready_order(struct bheap_node* a, struct bheap_node* b)
 484 +{
 485 +	return edffm_higher_prio(bheap2task(a), bheap2task(b));
 486 +}
 487 +
 488 +/* need_to_preempt - check whether the task t needs to be preempted
 489 + *                   call only with irqs disabled and with ready_lock acquired
 490 + */
 491 +int edffm_preemption_needed(rt_domain_t* rt, struct task_struct *t)
 492 +{
 493 +	/* we need the read lock for edf_ready_queue */
 494 +	/* no need to preempt if there is nothing pending */
 495 +	if (!__jobs_pending(rt))
 496 +		return 0;
 497 +	/* we need to reschedule if t doesn't exist */
 498 +	if (!t)
 499 +		return 1;
 500 +
 501 +	/* make sure to get non-rt stuff out of the way */
 502 +	return !is_realtime(t) || edffm_higher_prio(__next_ready(rt), t);
 503 +}
 504 +
 505 +/* we assume the lock is being held */
 506 +static void preempt(edffm_domain_t *edffm)
 507 +{
 508 +	preempt_if_preemptable(edffm->scheduled, edffm->cpu);
 509 +}
 510 +
 511 +static void edffm_release_jobs(rt_domain_t* rt, struct bheap* tasks)
 512 +{
 513 +	unsigned long flags;
 514 +	edffm_domain_t *edffm = container_of(rt, edffm_domain_t, domain);
 515 +
 516 +	raw_spin_lock_irqsave(&edffm->slock, flags);
 517 +
 518 +	__merge_ready(rt, tasks);
 519 +
 520 +	if (edffm_preemption_needed(rt, edffm->scheduled))
 521 +		preempt(edffm);
 522 +
 523 +	raw_spin_unlock_irqrestore(&edffm->slock, flags);
 524 +}
 525 +
 526 +/* EDF-fm uses the "release_master" field to force the next release for
 527 + * the task 'task' to happen on a remote CPU. The remote cpu for task is
 528 + * previously set up during job_completion() taking into consideration
 529 + * whether a task is a migratory task or not.
 530 + */
 531 +static inline void
 532 +edffm_add_release_remote(struct task_struct *task)
 533 +{
 534 +	unsigned long flags;
 535 +	rt_domain_t *rt = task_edf(task);
 536 +
 537 +	raw_spin_lock_irqsave(&rt->tobe_lock, flags);
 538 +
 539 +	/* "modify" destination cpu */
 540 +	rt->release_master = get_partition(task);
 541 +
 542 +	TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
 543 +			smp_processor_id(), task_cpu(task), rt->release_master);
 544 +
 545 +	/* trigger future release */
 546 +	__add_release(rt, task);
 547 +
 548 +	/* reset proper release_master and unlock */
 549 +	rt->release_master = NO_CPU;
 550 +	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
 551 +}
 552 +
 553 +/* perform double ready_queue locking in an orderwise fashion
 554 + * this is called with: interrupt disabled and rq->lock held (from
 555 + * schedule())
 556 + */
 557 +static noinline void double_domain_lock(edffm_domain_t *dom1, edffm_domain_t *dom2)
 558 +{
 559 +	if (dom1 == dom2) {
 560 +		/* fake */
 561 +		raw_spin_lock(&dom1->slock);
 562 +	} else {
 563 +		if (dom1 < dom2) {
 564 +			raw_spin_lock(&dom1->slock);
 565 +			raw_spin_lock(&dom2->slock);
 566 +			TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
 567 +		} else {
 568 +			raw_spin_lock(&dom2->slock);
 569 +			raw_spin_lock(&dom1->slock);
 570 +			TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
 571 +		}
 572 +	}
 573 +}
 574 +
 575 +/* Directly insert a task in a remote ready queue. This function
 576 + * should only be called if this task is a migrating task and its
 577 + * last job for this CPU just completed (a new one is released for
 578 + * a remote CPU), but the new job is already tardy.
 579 + */
 580 +static noinline void insert_task_in_remote_ready(struct task_struct *task)
 581 +{
 582 +	edffm_domain_t *this = remote_edffm(task_cpu(task));
 583 +	edffm_domain_t *remote = remote_edffm(get_partition(task));
 584 +
 585 +	BUG_ON(get_partition(task) != remote->cpu);
 586 +
 587 +	TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
 588 +			this->cpu, remote->cpu);
 589 +	TRACE_TASK(task, "Inserting in remote ready queue\n");
 590 +
 591 +	WARN_ON(!irqs_disabled());
 592 +
 593 +	raw_spin_unlock(&this->slock);
 594 +	mb();
 595 +	TRACE_TASK(task,"edffm_lock %d released\n", this->cpu);
 596 +
 597 +	/* lock both ready queues */
 598 +	double_domain_lock(this, remote);
 599 +	mb();
 600 +
 601 +	__add_ready(&remote->domain, task);
 602 +
 603 +	/* release remote but keep ours */
 604 +	raw_spin_unlock(&remote->slock);
 605 +	TRACE_TASK(task,"edffm_lock %d released\n", remote->cpu);
 606 +
 607 +	/* ask remote cpu to reschedule, we are already rescheduling on this */
 608 +	preempt(remote);
 609 +}
 610 +
 611 +static void requeue(struct task_struct* t, rt_domain_t *edf)
 612 +{
 613 +	if (t->state != TASK_RUNNING)
 614 +		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
 615 +
 616 +	set_rt_flags(t, RT_F_RUNNING);
 617 +	if (is_released(t, litmus_clock())) {
 618 +		if (wrong_cpu(t)) {
 619 +			/* this should only happen if t just completed, but
 620 +			 * its next release is already tardy, so it should be
 621 +			 * migrated and inserted in the remote ready queue
 622 +			 */
 623 +			TRACE_TASK(t, "Migrating task already released, "
 624 +				       "move from P%d to P%d\n",
 625 +					task_cpu(t), get_partition(t));
 626 +
 627 +			insert_task_in_remote_ready(t);
 628 +		} else {
 629 +			/* not a migrat task or the job is on the right CPU */
 630 +			__add_ready(edf, t);
 631 +		}
 632 +	} else {
 633 +		if (wrong_cpu(t)) {
 634 +
 635 +			TRACE_TASK(t, "Migrating task, adding remote release\n");
 636 +			edffm_add_release_remote(t);
 637 +		} else {
 638 +			TRACE_TASK(t, "Adding local release\n");
 639 +			add_release(edf, t);
 640 +		}
 641 +	}
 642 +}
 643 +
 644 +/* Update statistics for the _current_ job.
 645 + * 	- job_no was incremented _before_ starting this job
 646 + * 	(release_at / prepare_for_next_period)
 647 + * 	- cpu_job_no is incremented when the job completes
 648 + */
 649 +static void update_job_counter(struct task_struct *t)
 650 +{
 651 +	int cpu_pos;
 652 +
 653 +	/* Which CPU counter should be incremented? */
 654 +	cpu_pos = edffm_cpu_pos(t->rt_param.task_params.cpu, t);
 655 +	t->rt_param.semi_part.cpu_job_no[cpu_pos]++;
 656 +
 657 +	TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
 658 +			t->rt_param.job_params.job_no, cpu_pos, cur_cpu_job_no(t),
 659 +			t->rt_param.task_params.cpu);
 660 +}
 661 +
 662 +/* What is the next cpu for this job? (eq. 8, in EDF-Fm paper) */
 663 +static int next_cpu_for_job(struct task_struct *t)
 664 +{
 665 +	BUG_ON(!is_migrat_task(t));
 666 +
 667 +	TRACE_TASK(t, "%u = %u * %u / %u\n",
 668 +			t->rt_param.job_params.job_no, cur_cpu_job_no(t),
 669 +			cur_cpu_fract_den(t), cur_cpu_fract_num(t));
 670 +	if ((t->rt_param.job_params.job_no) ==
 671 +			(((lt_t) cur_cpu_job_no(t) * cur_cpu_fract_den(t)) /
 672 +			cur_cpu_fract_num(t)))
 673 +		return edffm_params(t).cpus[0];
 674 +
 675 +	return edffm_params(t).cpus[1];
 676 +}
 677 +
 678 +/* If needed (the share for task t on this CPU is exhausted), updates
 679 + * the task_params.cpu for the _migrating_ task t
 680 + */
 681 +static void change_migrat_cpu_if_needed(struct task_struct *t)
 682 +{
 683 +	BUG_ON(!is_migrat_task(t));
 684 +	/* EDF-fm: if it is a migrating task and it has already executed
 685 +	 * the required number of jobs on this CPU, we need to move it
 686 +	 * on its next CPU; changing the cpu here will affect the requeue
 687 +	 * and the next release
 688 +	 */
 689 +	if (unlikely(next_cpu_for_job(t) != migrat_cur_cpu(t))) {
 690 +
 691 +		tsk_rt(t)->task_params.cpu = migrat_next_cpu(t);
 692 +		TRACE_TASK(t, "EDF-fm: will migrate job %d -> %d\n",
 693 +			task_cpu(t), tsk_rt(t)->task_params.cpu);
 694 +		return;
 695 +	}
 696 +
 697 +	TRACE_TASK(t, "EDF-fm: job will stay on %d -> %d\n",
 698 +			task_cpu(t), tsk_rt(t)->task_params.cpu);
 699 +}
 700 +
 701 +static void job_completion(struct task_struct* t, int forced)
 702 +{
 703 +	sched_trace_task_completion(t,forced);
 704 +	TRACE_TASK(t, "job_completion().\n");
 705 +
 706 +	if (unlikely(is_migrat_task(t))) {
 707 +		update_job_counter(t);
 708 +		change_migrat_cpu_if_needed(t);
 709 +	}
 710 +
 711 +	set_rt_flags(t, RT_F_SLEEP);
 712 +	prepare_for_next_period(t);
 713 +}
 714 +
 715 +static void edffm_tick(struct task_struct *t)
 716 +{
 717 +	edffm_domain_t *edffm = local_edffm;
 718 +
 719 +	BUG_ON(is_realtime(t) && t != edffm->scheduled);
 720 +
 721 +	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
 722 +		set_tsk_need_resched(t);
 723 +		TRACE("edffm_scheduler_tick: "
 724 +			"%d is preemptable "
 725 +			" => FORCE_RESCHED\n", t->pid);
 726 +	}
 727 +}
 728 +
 729 +static struct task_struct* edffm_schedule(struct task_struct * prev)
 730 +{
 731 +	edffm_domain_t* 	edffm = local_edffm;
 732 +	rt_domain_t*		edf  = &edffm->domain;
 733 +	struct task_struct*	next;
 734 +
 735 +	int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;
 736 +
 737 +	raw_spin_lock(&edffm->slock);
 738 +
 739 +	BUG_ON(edffm->scheduled && edffm->scheduled != prev);
 740 +	BUG_ON(edffm->scheduled && !is_realtime(prev));
 741 +
 742 +	/* (0) Determine state */
 743 +	exists      = edffm->scheduled != NULL;
 744 +	blocks      = exists && !is_running(edffm->scheduled);
 745 +	out_of_time = exists &&
 746 +				  budget_enforced(edffm->scheduled) &&
 747 +				  budget_exhausted(edffm->scheduled);
 748 +	sleep	    = exists && get_rt_flags(edffm->scheduled) == RT_F_SLEEP;
 749 +	change_cpu  = exists && wrong_cpu(edffm->scheduled);
 750 +	preempt     = edffm_preemption_needed(edf, prev);
 751 +
 752 +	BUG_ON(blocks && change_cpu);
 753 +
 754 +	if (exists)
 755 +		TRACE_TASK(prev,
 756 +			   "blocks:%d out_of_time:%d sleep:%d preempt:%d "
 757 +			   "wrong_cpu:%d state:%d sig:%d\n",
 758 +			   blocks, out_of_time, sleep, preempt,
 759 +			   change_cpu, prev->state, signal_pending(prev));
 760 +
 761 +	/* If we need to preempt do so. */
 762 +	resched = preempt;
 763 +
 764 +	/* If a task blocks we have no choice but to reschedule. */
 765 +	if (blocks)
 766 +		resched = 1;
 767 +
 768 +	/* If a task has just woken up, it was tardy and the wake up
 769 +	 * raced with this schedule, a new job has already been released,
 770 +	 * but scheduled should be enqueued on a remote ready queue, and a
 771 +	 * new task should be selected for the current queue.
 772 +	 */
 773 +	if (change_cpu)
 774 +		resched = 1;
 775 +
 776 +	/* Any task that is preemptable and either exhausts its execution
 777 +	 * budget or wants to sleep completes. We may have to reschedule after
 778 +	 * this.
 779 +	 */
 780 +	if ((out_of_time || sleep) && !blocks) {
 781 +		job_completion(edffm->scheduled, !sleep);
 782 +		resched = 1;
 783 +	}
 784 +
 785 +	/* The final scheduling decision. Do we need to switch for some reason?
 786 +	 * Switch if we are in RT mode and have no task or if we need to
 787 +	 * resched.
 788 +	 */
 789 +	next = NULL;
 790 +	if (resched || !exists) {
 791 +
 792 +		if (edffm->scheduled && !blocks)
 793 +			requeue(edffm->scheduled, edf);
 794 +		next = __take_ready(edf);
 795 +	} else
 796 +		/* Only override Linux scheduler if we have a real-time task
 797 +		 * scheduled that needs to continue.
 798 +		 */
 799 +		if (exists)
 800 +			next = prev;
 801 +
 802 +	if (next) {
 803 +		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
 804 +		set_rt_flags(next, RT_F_RUNNING);
 805 +	} else {
 806 +		TRACE("becoming idle at %llu\n", litmus_clock());
 807 +	}
 808 +
 809 +	edffm->scheduled = next;
 810 +	raw_spin_unlock(&edffm->slock);
 811 +
 812 +	return next;
 813 +}
 814 +
 815 +/*	Prepare a task for running in RT mode
 816 + */
 817 +static void edffm_task_new(struct task_struct * t, int on_rq, int running)
 818 +{
 819 +	rt_domain_t* 		edf  = task_edf(t);
 820 +	edffm_domain_t* 	edffm = task_edffm(t);
 821 +	unsigned long		flags;
 822 +
 823 +	TRACE_TASK(t, "EDF-fm: task new, cpu = %d\n",
 824 +		   t->rt_param.task_params.cpu);
 825 +
 826 +	release_at(t, litmus_clock());
 827 +	update_job_counter(t);
 828 +
 829 +	/* The task should be running in the queue, otherwise signal
 830 +	 * code will try to wake it up with fatal consequences.
 831 +	 */
 832 +	raw_spin_lock_irqsave(&edffm->slock, flags);
 833 +	if (running) {
 834 +		/* there shouldn't be anything else running at the time */
 835 +		BUG_ON(edffm->scheduled);
 836 +		edffm->scheduled = t;
 837 +	} else {
 838 +		requeue(t, edf);
 839 +		/* maybe we have to reschedule */
 840 +		preempt(edffm);
 841 +	}
 842 +	raw_spin_unlock_irqrestore(&edffm->slock, flags);
 843 +}
 844 +
 845 +static void edffm_task_wake_up(struct task_struct *task)
 846 +{
 847 +	unsigned long		flags;
 848 +	edffm_domain_t* 	edffm = task_edffm(task);
 849 +	rt_domain_t* 		edf  = task_edf(task);
 850 +	lt_t			now;
 851 +
 852 +	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
 853 +
 854 +	TRACE_TASK(task, "acquire edffm %d\n", edffm->cpu);
 855 +	raw_spin_lock_irqsave(&edffm->slock, flags);
 856 +
 857 +	BUG_ON(edffm != task_edffm(task));
 858 +	BUG_ON(is_queued(task));
 859 +
 860 +	now = litmus_clock();
 861 +	if (is_tardy(task, now)) {
 862 +		if (unlikely(is_migrat_task(task))) {
 863 +			/* a new job will be released.
 864 +			 * Update current job counter */
 865 +			update_job_counter(task);
 866 +			/* Switch CPU if needed */
 867 +			change_migrat_cpu_if_needed(task);
 868 +		}
 869 +		/* new sporadic release */
 870 +		TRACE_TASK(task, "release new\n");
 871 +		release_at(task, now);
 872 +		sched_trace_task_release(task);
 873 +	}
 874 +
 875 +	/* Only add to ready queue if it is not the currently-scheduled
 876 +	 * task. This could be the case if a task was woken up concurrently
 877 +	 * on a remote CPU before the executing CPU got around to actually
 878 +	 * de-scheduling the task, i.e., wake_up() raced with schedule()
 879 +	 * and won.
 880 +	 */
 881 +	if (edffm->scheduled != task)
 882 +		requeue(task, edf);
 883 +
 884 +	raw_spin_unlock_irqrestore(&edffm->slock, flags);
 885 +	TRACE_TASK(task, "release edffm %d\n", edffm->cpu);
 886 +	TRACE_TASK(task, "wake up done\n");
 887 +}
 888 +
 889 +static void edffm_task_block(struct task_struct *t)
 890 +{
 891 +	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
 892 +
 893 +	BUG_ON(!is_realtime(t));
 894 +	if (is_queued(t)) {
 895 +		edffm_domain_t *edffm = local_edffm;
 896 +		TRACE_TASK(t, "task blocked, race with wakeup, "
 897 +				"remove from queue %d\n", edffm->cpu);
 898 +		remove(&edffm->domain, t);
 899 +	}
 900 +}
 901 +
/* Plugin callback: the task leaves real-time mode.
 * Takes it off the ready queue and the CPU, then forces a reschedule so
 * the partition can pick a replacement. */
static void edffm_task_exit(struct task_struct * t)
{
	unsigned long flags;
	edffm_domain_t* 	edffm = task_edffm(t);
	rt_domain_t*		edf;

	raw_spin_lock_irqsave(&edffm->slock, flags);
	if (is_queued(t)) {
		/* dequeue */
		edf  = task_edf(t);
		remove(edf, t);
	}
	if (edffm->scheduled == t)
		edffm->scheduled = NULL;

	TRACE_TASK(t, "RIP\n");

	/* pick a new task on this partition */
	preempt(edffm);
	raw_spin_unlock_irqrestore(&edffm->slock, flags);
}
 922 +
 923 +static long edffm_admit_task(struct task_struct* tsk)
 924 +{
 925 +	return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
 926 +}
 927 +
/*	Plugin object	*/
/* EDF-fm plugin descriptor: wires the callbacks above into the LITMUS^RT
 * scheduler plugin framework. */
static struct sched_plugin edffm_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "EDF-fm",
	.tick			= edffm_tick,
	.task_new		= edffm_task_new,
	.complete_job		= complete_job,
	.task_exit		= edffm_task_exit,
	.schedule		= edffm_schedule,
	.task_wake_up		= edffm_task_wake_up,
	.task_block		= edffm_task_block,
	.admit_task		= edffm_admit_task
};
 940 +
/* Module init: set up one EDF domain per online CPU and register the
 * plugin with LITMUS^RT. */
static int __init init_edffm(void)
{
	int i;
	edffm_domain_t *edffm;

	/* Note, broken if num_online_cpus() may change */
	for (i = 0; i < num_online_cpus(); i++) {
		edffm = remote_edffm(i);
		edffm->cpu = i;
		edffm->scheduled = NULL;
		/* no release-master check needed; releases handled locally */
		rt_domain_init(&edffm->domain, edffm_ready_order, NULL,
			       edffm_release_jobs);
	}

	return register_sched_plugin(&edffm_plugin);
}
 957 +
 958 +module_init(init_edffm);
 959 +
 960 diff --git a/litmus/sched_edf_os.c b/litmus/sched_edf_os.c
 961 new file mode 100644
 962 index 000000000000..e021d22b5129
 963 --- /dev/null
 964 +++ b/litmus/sched_edf_os.c
 965 @@ -0,0 +1,660 @@
 966 +/*
 967 + * litmus/sched_edf_os.c
 968 + *
 969 + * Implementation of the EDF-os scheduling algorithm.
 970 + */
 971 +
 972 +#include <linux/percpu.h>
 973 +#include <linux/sched.h>
 974 +#include <linux/list.h>
 975 +#include <linux/spinlock.h>
 976 +
 977 +#include <linux/module.h>
 978 +
 979 +#include <litmus/litmus.h>
 980 +#include <litmus/jobs.h>
 981 +#include <litmus/sched_plugin.h>
 982 +#include <litmus/edf_common.h>
 983 +
 984 +typedef struct {
 985 +	rt_domain_t 		domain;
 986 +	int          		cpu;
 987 +	struct task_struct* 	scheduled; /* only RT tasks */
 988 +/* domain lock */
 989 +#define slock domain.ready_lock
 990 +} edfos_domain_t;
 991 +
 992 +DEFINE_PER_CPU(edfos_domain_t, edfos_domains);
 993 +
 994 +#define local_edfos		(&__get_cpu_var(edfos_domains))
 995 +#define remote_edf(cpu)		(&per_cpu(edfos_domains, cpu).domain)
 996 +#define remote_edfos(cpu)	(&per_cpu(edfos_domains, cpu))
 997 +#define task_edf(task)		remote_edf(get_partition(task))
 998 +#define task_edfos(task)	remote_edfos(get_partition(task))
 999 +
1000 +#define edfos_params(t)		(t->rt_param.task_params.semi_part.os)
1001 +
1002 +/* Is the task a migratory task? */
1003 +#define is_migrat_task(task)	(edfos_params(task).migrat)
1004 +/* t is on the wrong CPU (it should be requeued properly) */
1005 +#define wrong_cpu(t)		is_migrat_task((t)) \
1006 +				&& task_cpu((t)) != get_partition((t))
1007 +/* Manipulate share for current cpu */
1008 +#define cur_cpu_fract_num(t)	edfos_params(t).fraction[get_partition(t)][0]
1009 +#define cur_cpu_fract_den(t)	edfos_params(t).fraction[get_partition(t)][1]
1010 +/* Get job number for current cpu */
1011 +#define cur_cpu_job_no(t)	\
1012 +	tsk_rt(t)->semi_part.cpu_job_no[get_partition(t)]
1013 +
1014 +/*
1015 + * EDF-os: migratory tasks have higher prio than fixed, EDF in both classes.
1016 + * (Both first and second may be NULL).
1017 + */
1018 +int edfos_higher_prio(struct task_struct* first, struct task_struct* second)
1019 +{
1020 +	if ((first && edfos_params(first).migrat) ||
1021 +			(second && edfos_params(second).migrat)) {
1022 +		if ((first && edfos_params(first).migrat) &&
1023 +		    (second && edfos_params(second).migrat))
1024 +		{
1025 +			/* both are migrating */
1026 +			if (edfos_params(first).first_cpu <
1027 +			    edfos_params(second).first_cpu)
1028 +				return 1;
1029 +			else
1030 +				return 0;
1031 +		}
1032 +
1033 +		if (first && edfos_params(first).migrat)
1034 +			/* first is migrating */
1035 +			return 1;
1036 +		else
1037 +			/* second is migrating */
1038 +			return 0;
1039 +	}
1040 +
1041 +	/* both are fixed or not real time */
1042 +	return edf_higher_prio(first, second);
1043 +}
1044 +
/* bheap comparator for the ready queue: delegates to EDF-os priority. */
int edfos_ready_order(struct bheap_node* a, struct bheap_node* b)
{
	struct task_struct *ta = bheap2task(a);
	struct task_struct *tb = bheap2task(b);

	return edfos_higher_prio(ta, tb);
}
1049 +
1050 +static int fakepfair_ready_order(struct bheap_node* a, struct bheap_node* b)
1051 +{
1052 +	return *((int*)a->value) < *((int*)b->value);
1053 +}
1054 +
1055 +/* need_to_preempt - check whether the task t needs to be preempted
1056 + *                   call only with irqs disabled and with ready_lock acquired
1057 + */
1058 +int edfos_preemption_needed(rt_domain_t* rt, struct task_struct *t)
1059 +{
1060 +	/* we need the read lock for edf_ready_queue */
1061 +	/* no need to preempt if there is nothing pending */
1062 +	if (!__jobs_pending(rt))
1063 +		return 0;
1064 +	/* we need to reschedule if t doesn't exist */
1065 +	if (!t)
1066 +		return 1;
1067 +
1068 +	/* make sure to get non-rt stuff out of the way */
1069 +	return !is_realtime(t) || edfos_higher_prio(__next_ready(rt), t);
1070 +}
1071 +
/* we assume the lock is being held */
static void preempt(edfos_domain_t *edfos)
{
	/* Ask edfos->cpu to reschedule; preempt_if_preemptable() performs
	 * the per-task non-preemptive-section check for us. */
	preempt_if_preemptable(edfos->scheduled, edfos->cpu);
}
1077 +
/* rt_domain callback: a batch of jobs reached their release time.
 * Merge them into the ready queue and preempt the local CPU if one of
 * them outranks the currently scheduled task. */
static void edfos_release_jobs(rt_domain_t* rt, struct bheap* tasks)
{
	unsigned long flags;
	edfos_domain_t *edfos = container_of(rt, edfos_domain_t, domain);

	raw_spin_lock_irqsave(&edfos->slock, flags);

	__merge_ready(rt, tasks);

	if (edfos_preemption_needed(rt, edfos->scheduled))
		preempt(edfos);

	raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
1092 +
/* EDF-os uses the "release_master" field to force the next release for
 * the task 'task' to happen on a remote CPU. The remote cpu for task is
 * previously set up during job_completion() taking into consideration
 * whether a task is a migratory task or not.
 */
static inline void
edfos_add_release_remote(struct task_struct *task)
{
	unsigned long flags;
	rt_domain_t *rt = task_edf(task);

	raw_spin_lock_irqsave(&rt->tobe_lock, flags);

	/* "modify" destination cpu */
	rt->release_master = get_partition(task);

	TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
			smp_processor_id(), task_cpu(task), rt->release_master);

	/* trigger future release */
	__add_release(rt, task);

	/* reset proper release_master and unlock
	 * (tobe_lock is held for the whole sequence, so the temporary
	 * release_master override cannot be observed by other CPUs) */
	rt->release_master = NO_CPU;
	raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
}
1119 +
/* perform double ready_queue locking in an ordered fashion
 * this is called with: interrupt disabled and rq->lock held (from
 * schedule())
 *
 * Locks are always taken in ascending address order so that two CPUs
 * locking the same pair of domains cannot deadlock against each other.
 */
static noinline void double_domain_lock(edfos_domain_t *dom1, edfos_domain_t *dom2)
{
	if (dom1 == dom2) {
		/* fake: same domain, take the lock only once */
		raw_spin_lock(&dom1->slock);
	} else {
		if (dom1 < dom2) {
			raw_spin_lock(&dom1->slock);
			raw_spin_lock(&dom2->slock);
			TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
		} else {
			raw_spin_lock(&dom2->slock);
			raw_spin_lock(&dom1->slock);
			TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
		}
	}
}
1141 +
/* Directly insert a task in a remote ready queue. This function
 * should only be called if this task is a migrating task and its
 * last job for this CPU just completed (a new one is released for
 * a remote CPU), but the new job is already tardy.
 *
 * Called with the local domain lock held; temporarily drops it so both
 * locks can be re-acquired in address order, and returns with the local
 * lock held and the remote lock released.
 */
static noinline void insert_task_in_remote_ready(struct task_struct *task)
{
	edfos_domain_t *this = remote_edfos(task_cpu(task));
	edfos_domain_t *remote = remote_edfos(get_partition(task));

	BUG_ON(get_partition(task) != remote->cpu);

	TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
			this->cpu, remote->cpu);
	TRACE_TASK(task, "Inserting in remote ready queue\n");

	WARN_ON(!irqs_disabled());

	/* drop the local lock to avoid lock-order inversion below */
	raw_spin_unlock(&this->slock);
	mb();
	TRACE_TASK(task,"edfos_lock %d released\n", this->cpu);

	/* lock both ready queues */
	double_domain_lock(this, remote);
	mb();

	__add_ready(&remote->domain, task);

	/* release remote but keep ours */
	raw_spin_unlock(&remote->slock);
	TRACE_TASK(task,"edfos_lock %d released\n", remote->cpu);

	/* ask remote cpu to reschedule, we are already rescheduling on this */
	preempt(remote);
}
1177 +
/* Put a task back into the system after (de)scheduling or wake-up:
 * already-released jobs go on a ready queue (local or remote, depending
 * on where the job must run), future jobs are armed as releases. */
static void requeue(struct task_struct* t, rt_domain_t *edf)
{
	if (t->state != TASK_RUNNING)
		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");

	set_rt_flags(t, RT_F_RUNNING);
	if (is_released(t, litmus_clock())) {
		if (wrong_cpu(t)) {
			/* this should only happen if t just completed, but
			 * its next release is already tardy, so it should be
			 * migrated and inserted in the remote ready queue
			 */
			TRACE_TASK(t, "Migrating task already released, "
				       "move from P%d to P%d\n",
					task_cpu(t), get_partition(t));

			insert_task_in_remote_ready(t);
		} else {
			/* not a migrat task or the job is on the right CPU */
			__add_ready(edf, t);
		}
	} else {
		if (wrong_cpu(t)) {

			TRACE_TASK(t, "Migrating task, adding remote release\n");
			edfos_add_release_remote(t);
		} else {
			TRACE_TASK(t, "Adding local release\n");
			add_release(edf, t);
		}
	}
}
1210 +
/* Update statistics for the _current_ job.
 * 	- job_no was incremented _before_ starting this job
 * 	(release_at / prepare_for_next_period)
 * 	- cpu_job_no is incremented when the job completes
 */
static void update_job_counter(struct task_struct *t)
{
	/* count one more job executed on the task's current partition */
	t->rt_param.semi_part.cpu_job_no[get_partition(t)]++;

	TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
			t->rt_param.job_params.job_no, get_partition(t),
			cur_cpu_job_no(t), t->rt_param.task_params.cpu);
}
1224 +
1225 +
1226 +static int compute_pfair_deadline(lt_t wt_num, lt_t wt_den,
1227 +				  unsigned int job_no)
1228 +{	
1229 +	lt_t num;
1230 +	num = job_no * wt_den;
1231 +	if (do_div(num, wt_num))
1232 +		num++;
1233 +	return (int)num;
1234 +}
1235 +
1236 +static int compute_pfair_release(lt_t wt_num, lt_t wt_den,
1237 +				 unsigned int job_no)
1238 +{
1239 +	lt_t num;
1240 +	num = (job_no - 1) * wt_den;
1241 +	do_div(num, wt_num);
1242 +	return (int)num;
1243 +}
1244 +
/* Pick the CPU on which the next job of migratory task t must run.
 * Implements a "fake Pfair" schedule over the task's per-CPU shares:
 * each CPU is a subtask stream with pseudo-releases/-deadlines computed
 * from its share fraction; heap_data[cpu] holds that CPU's current key,
 * sitting in either the release_queue (not yet eligible) or the
 * ready_queue (eligible, ordered by pseudo-deadline). */
static int next_cpu_for_job(struct task_struct *t)
{
	unsigned int cpu;
	lt_t next_rel;
	struct bheap_node* node;
	BUG_ON(!is_migrat_task(t));

	/* Process any new subtask releases. */
	node = bheap_peek(fakepfair_ready_order,
			  &edfos_params(t).release_queue);
	while (node && *((int*)node->value) <= tsk_rt(t)->job_params.job_no) {
		node = bheap_take(fakepfair_ready_order,
				  &edfos_params(t).release_queue);
		BUG_ON(!node);
		/* recover the CPU index from the node's slot in heap_data */
		cpu = ((int*)node->value) - edfos_params(t).heap_data;
		/* node moves to the ready queue keyed by its deadline */
		*((int*)node->value) = compute_pfair_deadline(
				edfos_params(t).fraction[cpu][0],
				edfos_params(t).fraction[cpu][1],
				tsk_rt(t)->semi_part.cpu_job_no[cpu] + 1);
		bheap_insert(fakepfair_ready_order,
			     &edfos_params(t).ready_queue, node);
		node = bheap_peek(fakepfair_ready_order,
				  &edfos_params(t).release_queue);
	}

	/* Choose the next Pfair subtask. */
	node = bheap_take(fakepfair_ready_order,
			  &edfos_params(t).ready_queue);
	BUG_ON(!node);
	cpu = ((int*)node->value) - edfos_params(t).heap_data;

	/* re-arm this CPU's following subtask */
	next_rel = compute_pfair_release(edfos_params(t).fraction[cpu][0],
					 edfos_params(t).fraction[cpu][1],
					 tsk_rt(t)->semi_part.cpu_job_no[cpu]
					 + 1);
	if (next_rel <= tsk_rt(t)->job_params.job_no)
	{
		/* Next subtask already released. */
		*((int*)node->value) = compute_pfair_deadline(
					edfos_params(t).fraction[cpu][0],
					edfos_params(t).fraction[cpu][1],
					tsk_rt(t)->semi_part.cpu_job_no[cpu] +
					1);
		bheap_insert(fakepfair_ready_order,
			     &edfos_params(t).ready_queue, node);
	}
	else
	{
		/* Next subtask not yet released. */
		*((int*)node->value) = next_rel;
		bheap_insert(fakepfair_ready_order,
			     &edfos_params(t).release_queue, node);
	}

	TRACE_TASK(t, "%u = %u * %u / %u\n",
			t->rt_param.job_params.job_no, cur_cpu_job_no(t),
			cur_cpu_fract_den(t), cur_cpu_fract_num(t));
	return cpu;
}
1304 +
/* If needed (the share for task t on this CPU is exhausted), updates
 * the task_params.cpu for the _migrating_ task t
 */
static void change_migrat_cpu_if_needed(struct task_struct *t)
{
	int cpu;
	BUG_ON(!is_migrat_task(t));
	/* EDF-os: if it is a migrating task and it has already executed
	 * the required number of jobs on this CPU, we need to move it
	 * on its next CPU; changing the cpu here will affect the requeue
	 * and the next release
	 */
	cpu = next_cpu_for_job(t);
	if (unlikely(cpu != get_partition(t))) {
		/* retarget the task; requeue()/release code reads this */
		tsk_rt(t)->task_params.cpu = cpu;
		TRACE_TASK(t, "EDF-os: will migrate job %d -> %d\n",
			task_cpu(t), tsk_rt(t)->task_params.cpu);
		return;
	}

	TRACE_TASK(t, "EDF-os: job will stay on %d -> %d\n",
			task_cpu(t), tsk_rt(t)->task_params.cpu);
}
1328 +
/* A job finished (voluntarily, or 'forced' by budget exhaustion).
 * For migratory tasks, account the job and possibly retarget the next
 * one to another CPU before setting up the next period. */
static void job_completion(struct task_struct* t, int forced)
{
	sched_trace_task_completion(t,forced);
	TRACE_TASK(t, "job_completion().\n");

	if (unlikely(is_migrat_task(t))) {
		update_job_counter(t);
		change_migrat_cpu_if_needed(t);
	}

	set_rt_flags(t, RT_F_SLEEP);
	prepare_for_next_period(t);
}
1342 +
/* Timer-tick callback: force a reschedule when the running real-time
 * task has exhausted its enforced budget. */
static void edfos_tick(struct task_struct *t)
{
	edfos_domain_t *edfos = local_edfos;

	/* only the locally scheduled task may be real-time here */
	BUG_ON(is_realtime(t) && t != edfos->scheduled);

	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
		set_tsk_need_resched(t);
		TRACE("edfos_scheduler_tick: "
			"%d is preemptable "
			" => FORCE_RESCHED\n", t->pid);
	}
}
1356 +
/* Core scheduling decision for this partition. Determines why we were
 * invoked (block / budget exhaustion / completion / preemption /
 * migration), completes or requeues the previous task accordingly, and
 * returns the next task to run (NULL = idle). */
static struct task_struct* edfos_schedule(struct task_struct * prev)
{
	edfos_domain_t* 	edfos = local_edfos;
	rt_domain_t*		edf  = &edfos->domain;
	struct task_struct*	next;

	int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;

	raw_spin_lock(&edfos->slock);

	/* sanity checking */
	BUG_ON(edfos->scheduled && edfos->scheduled != prev);
	BUG_ON(edfos->scheduled && !is_realtime(prev));

	/* (0) Determine state */
	exists      = edfos->scheduled != NULL;
	blocks      = exists && !is_running(edfos->scheduled);
	out_of_time = exists &&
				  budget_enforced(edfos->scheduled) &&
				  budget_exhausted(edfos->scheduled);
	sleep	    = exists && get_rt_flags(edfos->scheduled) == RT_F_SLEEP;
	change_cpu  = exists && wrong_cpu(edfos->scheduled);
	preempt     = edfos_preemption_needed(edf, prev);

	/* a blocked task cannot simultaneously need migration */
	BUG_ON(blocks && change_cpu);

	if (exists)
		TRACE_TASK(prev,
			   "blocks:%d out_of_time:%d sleep:%d preempt:%d "
			   "wrong_cpu:%d state:%d sig:%d\n",
			   blocks, out_of_time, sleep, preempt,
			   change_cpu, prev->state, signal_pending(prev));

	/* If we need to preempt do so. */
	resched = preempt;

	/* If a task blocks we have no choice but to reschedule. */
	if (blocks)
		resched = 1;

	/* If a task has just woken up, it was tardy and the wake up
	 * raced with this schedule, a new job has already been released,
	 * but scheduled should be enqueued on a remote ready queue, and a
	 * new task should be selected for the current queue.
	 */
	if (change_cpu)
		resched = 1;

	/* Any task that is preemptable and either exhausts its execution
	 * budget or wants to sleep completes. We may have to reschedule after
	 * this.
	 */
	if ((out_of_time || sleep) && !blocks) {
		job_completion(edfos->scheduled, !sleep);
		resched = 1;
	}

	/* The final scheduling decision. Do we need to switch for some reason?
	 * Switch if we are in RT mode and have no task or if we need to
	 * resched.
	 */
	next = NULL;
	if (resched || !exists) {

		if (edfos->scheduled && !blocks)
			requeue(edfos->scheduled, edf);
		next = __take_ready(edf);
	} else
		/* Only override Linux scheduler if we have a real-time task
		 * scheduled that needs to continue.
		 */
		if (exists)
			next = prev;

	if (next) {
		TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
		set_rt_flags(next, RT_F_RUNNING);
	} else {
		TRACE("becoming idle at %llu\n", litmus_clock());
	}

	edfos->scheduled = next;
	raw_spin_unlock(&edfos->slock);

	return next;
}
1442 +
/*	Prepare a task for running in RT mode
 *
 * For a migratory task this also initializes the fake-Pfair heaps: the
 * initial CPU's subtask is armed as a future release (release for its
 * second window), every other CPU with a non-zero share starts out as
 * already-released with its first pseudo-deadline.
 */
static void edfos_task_new(struct task_struct * t, int on_rq, int running)
{
	rt_domain_t* 		edf  = task_edf(t);
	edfos_domain_t* 	edfos = task_edfos(t);
	unsigned long		flags;
	unsigned int		i;

	if (edfos_params(t).migrat) {
		bheap_init(&edfos_params(t).release_queue);
		bheap_init(&edfos_params(t).ready_queue);
		for (i = 0; i < NR_CPUS_EDF_OS; i++) {
			if (i == t->rt_param.task_params.cpu) {
				/* Initial CPU - setup next release. */
				edfos_params(t).heap_data[i] =
					compute_pfair_release(
					edfos_params(t).fraction[i][0],
					edfos_params(t).fraction[i][1], 2);
				bheap_add(fakepfair_ready_order,
					  &edfos_params(t).release_queue,
					  &edfos_params(t).heap_data[i],
					  GFP_ATOMIC);
			}
			else if (edfos_params(t).fraction[i][0] > 0) {
				/* Non-initial CPU - already released, setup
				 * deadline.
				 */
				edfos_params(t).heap_data[i] =
					compute_pfair_deadline(
					edfos_params(t).fraction[i][0],
					edfos_params(t).fraction[i][1], 1);
				bheap_add(fakepfair_ready_order,
					  &edfos_params(t).ready_queue,
					  &edfos_params(t).heap_data[i],
					  GFP_ATOMIC);
			}
		}
	}

	TRACE_TASK(t, "EDF-os: task new, cpu = %d\n",
		   t->rt_param.task_params.cpu);

	/* first job is released now and counted on this partition */
	release_at(t, litmus_clock());
	update_job_counter(t);

	/* The task should be running in the queue, otherwise signal
	 * code will try to wake it up with fatal consequences.
	 */
	raw_spin_lock_irqsave(&edfos->slock, flags);
	if (running) {
		/* there shouldn't be anything else running at the time */
		BUG_ON(edfos->scheduled);
		edfos->scheduled = t;
	} else {
		requeue(t, edf);
		/* maybe we have to reschedule */
		preempt(edfos);
	}
	raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
1504 +
/* Plugin callback: a previously blocked task resumes.
 * If the task woke up past its deadline a fresh sporadic job is released
 * (possibly on a different CPU for migratory tasks); then the task is
 * requeued unless it is still nominally scheduled (wake/schedule race).
 */
static void edfos_task_wake_up(struct task_struct *task)
{
	unsigned long		flags;
	edfos_domain_t* 	edfos = task_edfos(task);
	rt_domain_t* 		edf  = task_edf(task);
	lt_t			now;

	TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());

	TRACE_TASK(task, "acquire edfos %d\n", edfos->cpu);
	raw_spin_lock_irqsave(&edfos->slock, flags);

	/* a suspended task cannot be on a ready queue */
	BUG_ON(edfos != task_edfos(task));
	BUG_ON(is_queued(task));

	now = litmus_clock();
	if (is_tardy(task, now)) {
		if (unlikely(is_migrat_task(task))) {
			/* a new job will be released.
			 * Update current job counter */
			update_job_counter(task);
			/* Switch CPU if needed */
			change_migrat_cpu_if_needed(task);
		}
		/* new sporadic release */
		TRACE_TASK(task, "release new\n");
		release_at(task, now);
		sched_trace_task_release(task);
	}

	/* Only add to ready queue if it is not the currently-scheduled
	 * task. This could be the case if a task was woken up concurrently
	 * on a remote CPU before the executing CPU got around to actually
	 * de-scheduling the task, i.e., wake_up() raced with schedule()
	 * and won.
	 */
	if (edfos->scheduled != task)
		requeue(task, edf);

	raw_spin_unlock_irqrestore(&edfos->slock, flags);
	TRACE_TASK(task, "release edfos %d\n", edfos->cpu);
	TRACE_TASK(task, "wake up done\n");
}
1548 +
1549 +static void edfos_task_block(struct task_struct *t)
1550 +{
1551 +	TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
1552 +
1553 +	BUG_ON(!is_realtime(t));
1554 +	if (is_queued(t)) {
1555 +		edfos_domain_t *edfos = local_edfos;
1556 +		TRACE_TASK(t, "task blocked, race with wakeup, "
1557 +				"remove from queue %d\n", edfos->cpu);
1558 +		remove(&edfos->domain, t);
1559 +	}
1560 +}
1561 +
/* Plugin callback: the task leaves real-time mode.
 * Dequeues it, drains its fake-Pfair heaps, and forces a reschedule. */
static void edfos_task_exit(struct task_struct * t)
{
	unsigned long flags;
	edfos_domain_t* 	edfos = task_edfos(t);
	rt_domain_t*		edf;

	raw_spin_lock_irqsave(&edfos->slock, flags);
	if (is_queued(t)) {
		/* dequeue */
		edf  = task_edf(t);
		remove(edf, t);
	}
	if (edfos->scheduled == t)
		edfos->scheduled = NULL;

	/* Deallocate heap nodes.
	 * NOTE(review): the heaps are only bheap_init()'d for migratory
	 * tasks (see edfos_task_new()); presumably the embedded bheap
	 * structs are zeroed for fixed tasks so draining is a no-op —
	 * confirm. */
	while (bheap_take_del(fakepfair_ready_order,
			      &edfos_params(t).release_queue)) {}
	while (bheap_take_del(fakepfair_ready_order,
			      &edfos_params(t).ready_queue)) {}

	TRACE_TASK(t, "RIP\n");

	/* pick a new task on this partition */
	preempt(edfos);
	raw_spin_unlock_irqrestore(&edfos->slock, flags);
}
1588 +
1589 +static long edfos_admit_task(struct task_struct* tsk)
1590 +{
1591 +	return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
1592 +}
1593 +
/*	Plugin object	*/
/* EDF-os plugin descriptor: wires the callbacks above into the LITMUS^RT
 * scheduler plugin framework. */
static struct sched_plugin edfos_plugin __cacheline_aligned_in_smp = {
	.plugin_name		= "EDF-os",
	.tick			= edfos_tick,
	.task_new		= edfos_task_new,
	.complete_job		= complete_job,
	.task_exit		= edfos_task_exit,
	.schedule		= edfos_schedule,
	.task_wake_up		= edfos_task_wake_up,
	.task_block		= edfos_task_block,
	.admit_task		= edfos_admit_task
};
1606 +
/* Module init: set up one EDF domain per online CPU and register the
 * plugin with LITMUS^RT. */
static int __init init_edfos(void)
{
	int i;
	edfos_domain_t *edfos;

	/* Note, broken if num_online_cpus() may change */
	for (i = 0; i < num_online_cpus(); i++) {
		edfos = remote_edfos(i);
		edfos->cpu = i;
		edfos->scheduled = NULL;
		/* no resched-check callback; releases preempt directly */
		rt_domain_init(&edfos->domain, edfos_ready_order, NULL,
			       edfos_release_jobs);
	}

	return register_sched_plugin(&edfos_plugin);
}
1623 +
1624 +module_init(init_edfos);
1625 +
1626 diff --git a/litmus/sched_edf_wm.c b/litmus/sched_edf_wm.c
1627 new file mode 100644
1628 index 000000000000..8b7be32b40dd
1629 --- /dev/null
1630 +++ b/litmus/sched_edf_wm.c
1631 @@ -0,0 +1,688 @@
1632 +/* EDF-WM: based on PSN-EDF.
1633 + */
1634 +
1635 +#include <linux/percpu.h>
1636 +#include <linux/sched.h>
1637 +#include <linux/list.h>
1638 +#include <linux/spinlock.h>
1639 +
1640 +#include <linux/module.h>
1641 +
1642 +#include <litmus/litmus.h>
1643 +#include <litmus/jobs.h>
1644 +#include <litmus/sched_plugin.h>
1645 +#include <litmus/edf_common.h>
1646 +
/* Per-CPU scheduling state for EDF-WM: a local EDF ready queue plus the
 * task currently scheduled on this CPU. */
typedef struct {
	rt_domain_t 		domain;
	int          		cpu;
	struct task_struct* 	scheduled; /* only RT tasks */

/*
 * scheduling lock slock
 * protects the domain and serializes scheduling decisions
 */
#define slock domain.ready_lock

} wm_domain_t;

DEFINE_PER_CPU(wm_domain_t, wm_domains);

/* tracing helper tagged with the domain's CPU */
#define TRACE_DOM(dom, fmt, args...) \
	TRACE("(wm_domains[%d]) " fmt, (dom)->cpu, ##args)


#define local_domain         (&__get_cpu_var(wm_domains))
#define remote_domain(cpu)   (&per_cpu(wm_domains, cpu))
#define domain_of_task(task) (remote_domain(get_partition(task)))
1669 +
1670 +static int is_sliced_task(struct task_struct* t)
1671 +{
1672 +	return tsk_rt(t)->task_params.semi_part.wm.count;
1673 +}
1674 +
1675 +static struct edf_wm_slice* get_last_slice(struct task_struct* t)
1676 +{
1677 +	int idx = tsk_rt(t)->task_params.semi_part.wm.count - 1;
1678 +	return tsk_rt(t)->task_params.semi_part.wm.slices + idx;
1679 +}
1680 +
/* Overwrite the job-level parameters with the current slice's values so
 * generic EDF code schedules the slice transparently. */
static void compute_slice_params(struct task_struct* t)
{
	struct rt_param* p = tsk_rt(t);
	/* Here we do a little trick to make the generic EDF code
	 * play well with job slices. We overwrite the job-level
	 * release and deadline fields with the slice-specific values
	 * so that we can enqueue this task in an EDF rt_domain_t
	 * without issue. The actual values are cached in the semi_part.wm
	 * structure. */
	p->job_params.deadline = p->semi_part.wm.job_release +
		p->semi_part.wm.slice->deadline;
	p->job_params.release  = p->semi_part.wm.job_release +
		p->semi_part.wm.slice->offset;

	/* Similarly, we play a trick on the cpu field. */
	p->task_params.cpu = p->semi_part.wm.slice->cpu;

	/* update the per-slice budget reference */
	p->semi_part.wm.exec_time = p->job_params.exec_time;
}
1701 +
/* Complete a whole sliced job: restore the real (job-level) release and
 * deadline before handing off to the generic completion code, then cache
 * the freshly computed values for the next job's slices. */
static void complete_sliced_job(struct task_struct* t)
{
	struct rt_param* p = tsk_rt(t);

	/* We need to undo our trickery to the
	 * job parameters (see above). */
	p->job_params.release  = p->semi_part.wm.job_release;
	p->job_params.deadline = p->semi_part.wm.job_deadline;

	/* Ok, now let generic code do the actual work. */
	prepare_for_next_period(t);

	/* And finally cache the updated parameters. */
	p->semi_part.wm.job_release = p->job_params.release;
	p->semi_part.wm.job_deadline = p->job_params.deadline;
}
1718 +
1719 +static lt_t slice_exec_time(struct task_struct* t)
1720 +{
1721 +	struct rt_param* p = tsk_rt(t);
1722 +
1723 +	/* Compute how much execution time has been consumed
1724 +	 * since last slice advancement. */
1725 +	return p->job_params.exec_time - p->semi_part.wm.exec_time;
1726 +}
1727 +
/* Budget of the slice the task is currently executing in. */
static lt_t slice_budget(struct task_struct* t)
{
	return tsk_rt(t)->semi_part.wm.slice->budget;
}
1732 +
/* True once the current slice has consumed at least its budget. */
static int slice_budget_exhausted(struct task_struct* t)
{
	return !(slice_exec_time(t) < slice_budget(t));
}
1737 +
/* assumes positive remainder; overflows otherwise */
static lt_t slice_budget_remaining(struct task_struct* t)
{
	/* callers must ensure !slice_budget_exhausted(t) first */
	return slice_budget(t) - slice_exec_time(t);
}
1743 +
/* Budget check dispatch: per-slice accounting for sliced tasks,
 * plain job-budget accounting otherwise. */
static int wm_budget_exhausted(struct task_struct* t)
{
	return is_sliced_task(t) ? slice_budget_exhausted(t)
				 : budget_exhausted(t);
}
1751 +
/* Move a sliced task to its next slice; wrapping past the last slice
 * (or an explicit completion signal, which rewinds to slice 0) ends the
 * whole job via complete_sliced_job(). */
static void advance_next_slice(struct task_struct* t, int completion_signaled)
{
	int idx;
	struct rt_param* p = tsk_rt(t);

	/* make sure this is actually a sliced job */
	BUG_ON(!is_sliced_task(t));
	BUG_ON(is_queued(t));

	/* determine index of current slice */
	idx = p->semi_part.wm.slice -
		p->task_params.semi_part.wm.slices;

	TRACE_TASK(t, "advancing slice %d; excess=%lluns; "
		   "completion_signaled=%d.\n",
		   idx, slice_exec_time(t) - slice_budget(t),
		   completion_signaled);

	if (completion_signaled)
		idx = 0;
	else
		/* increment and wrap around, if necessary */
		idx = (idx + 1) % p->task_params.semi_part.wm.count;

	/* point to next slice */
	p->semi_part.wm.slice =
		p->task_params.semi_part.wm.slices + idx;

	/* Check if we need to update essential job parameters. */
	if (!idx) {
		/* job completion */
		sched_trace_task_completion(t, !completion_signaled);
		TRACE_TASK(t, "completed sliced job"
			   "(signaled:%d)\n", completion_signaled);
		complete_sliced_job(t);
	}

	/* Update job parameters for new slice. */
	compute_slice_params(t);
}
1792 +
/* assumes time_passed does not advance past the last slice */
/* Charge time_passed of execution to a sliced task, advancing through as
 * many fully consumed slices as necessary and crediting the remainder to
 * the final (current) slice. */
static void fast_forward_slices(struct task_struct* t, lt_t time_passed)
{
	TRACE_TASK(t, "fast forwarding %lluns\n", time_passed);

	/* this is NOT the slice version */
	BUG_ON(budget_remaining(t) <= time_passed);

	if (wm_budget_exhausted(t)) {
		/* This can happen if a suspension raced
		 * with a normal slice advancement. wm_schedule()
		 * does not process out_of_time when a task blocks. */
		TRACE_TASK(t, "block raced with out_of_time?\n");
		advance_next_slice(t, 0);
	}

	while (time_passed &&
	       time_passed >= slice_budget_remaining(t)) {
		/* slice completely exhausted */
		time_passed -= slice_budget_remaining(t);
		tsk_rt(t)->job_params.exec_time +=
			slice_budget_remaining(t);

		BUG_ON(!slice_budget_exhausted(t));
		BUG_ON(slice_budget_remaining(t) != 0);
		BUG_ON(tsk_rt(t)->semi_part.wm.slice == get_last_slice(t));

		advance_next_slice(t, 0);
	}
	/* add remainder to exec cost */
	tsk_rt(t)->job_params.exec_time += time_passed;
}
1825 +
/* we assume the lock is being held */
static void preempt(wm_domain_t *dom)
{
	TRACE_DOM(dom, "will be preempted.\n");
	/* We pass NULL as the task since non-preemptive sections are not
	 * supported in this plugin, so per-task checks are not needed. */
	preempt_if_preemptable(NULL, dom->cpu);
}
1834 +
1835 +static void wm_domain_init(wm_domain_t* dom,
1836 +			   check_resched_needed_t check,
1837 +			   release_jobs_t release,
1838 +			   int cpu)
1839 +{
1840 +	edf_domain_init(&dom->domain, check, release);
1841 +	dom->cpu      		= cpu;
1842 +	dom->scheduled		= NULL;
1843 +}
1844 +
/* Requeue a task on the domain of its (possibly remote) partition; uses
 * the locking variants of add_ready/add_release_on since the target
 * domain's lock is not held by the caller. */
static void wm_requeue_remote(struct task_struct *t)
{
	wm_domain_t *dom = domain_of_task(t);

	set_rt_flags(t, RT_F_RUNNING);
	if (is_released(t, litmus_clock()))
		/* acquires necessary lock */
		add_ready(&dom->domain, t);
	else
		/* force timer on remote CPU */
		add_release_on(&dom->domain, t, get_partition(t));
}
1857 +
1858 +static void wm_requeue_local(struct task_struct* t, rt_domain_t *edf)
1859 +{
1860 +	if (t->state != TASK_RUNNING)
1861 +		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
1862 +
1863 +	set_rt_flags(t, RT_F_RUNNING);
1864 +	if (is_released(t, litmus_clock()))
1865 +		__add_ready(edf, t);
1866 +	else
1867 +		add_release(edf, t); /* it has got to wait */
1868 +}
1869 +
1870 +static int wm_check_resched(rt_domain_t *edf)
1871 +{
1872 +	wm_domain_t *dom = container_of(edf, wm_domain_t, domain);
1873 +
1874 +	/* because this is a callback from rt_domain_t we already hold
1875 +	 * the necessary lock for the ready queue
1876 +	 */
1877 +	if (edf_preemption_needed(edf, dom->scheduled)) {
1878 +		preempt(dom);
1879 +		return 1;
1880 +	} else
1881 +		return 0;
1882 +}
1883 +
1884 +static void regular_job_completion(struct task_struct* t, int forced)
1885 +{
1886 +	sched_trace_task_completion(t, forced);
1887 +	TRACE_TASK(t, "job_completion().\n");
1888 +
1889 +	set_rt_flags(t, RT_F_SLEEP);
1890 +	prepare_for_next_period(t);
1891 +}
1892 +
1893 +static void wm_job_or_slice_completion(struct task_struct* t,
1894 +				       int completion_signaled)
1895 +{
1896 +	if (is_sliced_task(t))
1897 +		advance_next_slice(t, completion_signaled);
1898 +	else
1899 +		regular_job_completion(t, !completion_signaled);
1900 +}
1901 +
1902 +static void wm_tick(struct task_struct *t)
1903 +{
1904 +	wm_domain_t *dom = local_domain;
1905 +
1906 +	/* Check for inconsistency. We don't need the lock for this since
1907 +	 * ->scheduled is only changed in schedule, which obviously is not
1908 +	 *  executing in parallel on this CPU
1909 +	 */
1910 +	BUG_ON(is_realtime(t) && t != dom->scheduled);
1911 +
1912 +	if (is_realtime(t) && budget_enforced(t) && wm_budget_exhausted(t)) {
1913 +		set_tsk_need_resched(t);
1914 +		TRACE_DOM(dom, "budget of %d exhausted in tick\n",
1915 +			  t->pid);
1916 +	}
1917 +}
1918 +
1919 +static struct task_struct* wm_schedule(struct task_struct * prev)
1920 +{
1921 +	wm_domain_t		*dom = local_domain;
1922 +	rt_domain_t		*edf = &dom->domain;
1923 +	struct task_struct	*next, *migrate = NULL;
1924 +
1925 +	int out_of_time, sleep, preempt, wrong_cpu, exists, blocks, resched;
1926 +
1927 +	raw_spin_lock(&dom->slock);
1928 +
1929 +	/* Sanity checking:
1930 +	 * When a task exits (dead) dom->scheduled may be null
1931 +	 * and prev _is_ realtime. */
1932 +	BUG_ON(dom->scheduled && dom->scheduled != prev);
1933 +	BUG_ON(dom->scheduled && !is_realtime(prev));
1934 +
1935 +	/* (0) Determine state */
1936 +	exists      = dom->scheduled != NULL;
1937 +	wrong_cpu   = exists && get_partition(dom->scheduled) != dom->cpu;
1938 +	blocks      = exists && !is_running(dom->scheduled);
1939 +	out_of_time = exists
1940 +		&& budget_enforced(dom->scheduled)
1941 +		&& wm_budget_exhausted(dom->scheduled);
1942 +	sleep	    = exists && get_rt_flags(dom->scheduled) == RT_F_SLEEP;
1943 +	preempt     = edf_preemption_needed(edf, prev);
1944 +
1945 +	/* If we need to preempt do so.
1946 +	 * The following checks set resched to 1 in case of special
1947 +	 * circumstances.
1948 +	 */
1949 +	resched = preempt;
1950 +
1951 +
1952 +	if (exists)
1953 +		TRACE_TASK(prev,
1954 +			   "blocks:%d out_of_time:%d sleep:%d preempt:%d "
1955 +			   "wrong_cpu:%d state:%d sig:%d\n",
1956 +			   blocks, out_of_time, sleep, preempt, wrong_cpu,
1957 +			   prev->state, signal_pending(prev));
1958 +
1959 +	/* If a task blocks we have no choice but to reschedule.
1960 +	 */
1961 +	if (blocks)
1962 +		resched = 1;
1963 +
1964 +	/* This can happen if sliced task was moved to the next slice
1965 +	 * by the wake_up() code path while still being scheduled.
1966 +	 */
1967 +	if (wrong_cpu)
1968 +		resched = 1;
1969 +
1970 +	/* Any task that is preemptable and either exhausts its execution
1971 +	 * budget or wants to sleep completes. We may have to reschedule after
1972 +	 * this.
1973 +	 */
1974 +	if ((out_of_time || sleep) && !blocks) {
1975 +		wm_job_or_slice_completion(dom->scheduled, sleep);
1976 +		resched = 1;
1977 +	}
1978 +
1979 +	/* The final scheduling decision. Do we need to switch for some reason?
1980 +	 * Switch if we are in RT mode and have no task or if we need to
1981 +	 * resched.
1982 +	 */
1983 +	next = NULL;
1984 +	if (resched || !exists) {
1985 +		if (dom->scheduled && !blocks) {
1986 +			if (get_partition(dom->scheduled) == dom->cpu)
1987 +				/* local task */
1988 +				wm_requeue_local(dom->scheduled, edf);
1989 +			else
1990 +				/* not local anymore; wait until we drop the
1991 +				 * ready queue lock */
1992 +				migrate = dom->scheduled;
1993 +		}
1994 +		next = __take_ready(edf);
1995 +	} else
1996 +		/* Only override Linux scheduler if we have a real-time task
1997 +		 * scheduled that needs to continue. */
1998 +		if (exists)
1999 +			next = prev;
2000 +
2001 +	if (next) {
2002 +		TRACE_TASK(next, "scheduled at %llu (state:%d/%d)\n", litmus_clock(),
2003 +			   next->state, is_running(next));
2004 +		set_rt_flags(next, RT_F_RUNNING);
2005 +	} else if (exists) {
2006 +		TRACE("becoming idle at %llu\n", litmus_clock());
2007 +	}
2008 +
2009 +	dom->scheduled = next;
2010 +	raw_spin_unlock(&dom->slock);
2011 +
2012 +	/* check if we need to push the previous task onto another queue */
2013 +	if (migrate) {
2014 +		TRACE_TASK(migrate, "schedule-initiated migration to %d\n",
2015 +			   get_partition(migrate));
2016 +		wm_requeue_remote(migrate);
2017 +	}
2018 +
2019 +	return next;
2020 +}
2021 +
2022 +
2023 +/*	Prepare a task for running in RT mode
2024 + */
2025 +static void wm_task_new(struct task_struct * t, int on_rq, int running)
2026 +{
2027 +	wm_domain_t* dom = domain_of_task(t);
2028 +	rt_domain_t* edf = &dom->domain;
2029 +	unsigned long flags;
2030 +
2031 +	TRACE_TASK(t, "edf-wm: task new, cpu = %d\n",
2032 +		   t->rt_param.task_params.cpu);
2033 +
2034 +	/* setup job parameters */
2035 +	release_at(t, litmus_clock());
2036 +
2037 +	/* The task should be running in the queue, otherwise signal
2038 +	 * code will try to wake it up with fatal consequences.
2039 +	 */
2040 +	raw_spin_lock_irqsave(&dom->slock, flags);
2041 +
2042 +	if (is_sliced_task(t)) {
2043 +		/* make sure parameters are initialized consistently */
2044 +		tsk_rt(t)->semi_part.wm.exec_time = 0;
2045 +		tsk_rt(t)->semi_part.wm.job_release  = get_release(t);
2046 +		tsk_rt(t)->semi_part.wm.job_deadline = get_deadline(t);
2047 +		tsk_rt(t)->semi_part.wm.slice = tsk_rt(t)->task_params.semi_part.wm.slices;
2048 +		tsk_rt(t)->job_params.exec_time = 0;
2049 +	}
2050 +
2051 +	if (running) {
2052 +		/* there shouldn't be anything else running at the time */
2053 +		BUG_ON(dom->scheduled);
2054 +		dom->scheduled = t;
2055 +	} else {
2056 +		wm_requeue_local(t, edf);
2057 +		/* maybe we have to reschedule */
2058 +		preempt(dom);
2059 +	}
2060 +	raw_spin_unlock_irqrestore(&dom->slock, flags);
2061 +}
2062 +
2063 +static void wm_release_at(struct task_struct *t, lt_t start)
2064 +{
2065 +	struct rt_param* p = tsk_rt(t);
2066 +
2067 +	if (is_sliced_task(t)) {
2068 +		/* simulate wrapping to the first slice */
2069 +		p->semi_part.wm.job_deadline = start;
2070 +		p->semi_part.wm.slice = get_last_slice(t);
2071 +		/* FIXME: creates bogus completion event... */
2072 +		advance_next_slice(t, 0);
2073 +		set_rt_flags(t, RT_F_RUNNING);
2074 +	} else
2075 +		/* generic code handles it */
2076 +		release_at(t, start);
2077 +}
2078 +
2079 +static lt_t wm_earliest_release(struct task_struct *t, lt_t now)
2080 +{
2081 +	lt_t deadline;
2082 +	if (is_sliced_task(t))
2083 +		deadline = tsk_rt(t)->semi_part.wm.job_deadline;
2084 +	else
2085 +		deadline = get_deadline(t);
2086 +	if (lt_before(deadline, now))
2087 +		return now;
2088 +	else
2089 +		return deadline;
2090 +}
2091 +
2092 +static void wm_task_wake_up(struct task_struct *t)
2093 +{
2094 +	unsigned long flags;
2095 +	wm_domain_t* dom = domain_of_task(t);
2096 +	rt_domain_t* edf = &dom->domain;
2097 +	struct rt_param* p = tsk_rt(t);
2098 +	lt_t now, sleep_time;
2099 +	int migrate = 0;
2100 +
2101 +	raw_spin_lock_irqsave(&dom->slock, flags);
2102 +	BUG_ON(is_queued(t));
2103 +
2104 +	now = litmus_clock();
2105 +
2106 +	sleep_time = now - p->semi_part.wm.suspend_time;
2107 +
2108 +	TRACE_TASK(t, "wake_up at %llu after %llu, still-scheduled:%d\n",
2109 +		   now, sleep_time, dom->scheduled == t);
2110 +
2111 +	/* account sleep time as execution time */
2112 +	if (get_exec_time(t) + sleep_time >= get_exec_cost(t)) {
2113 +		/* new sporadic release */
2114 +		TRACE_TASK(t, "new sporadic release\n");
2115 +		wm_release_at(t, wm_earliest_release(t, now));
2116 +		sched_trace_task_release(t);
2117 +	} else if (is_sliced_task(t)) {
2118 +		/* figure out which slice we should be executing on */
2119 +		fast_forward_slices(t, sleep_time);
2120 +		/* can't be exhausted now */
2121 +		BUG_ON(wm_budget_exhausted(t));
2122 +	} else {
2123 +		/* simply add to the execution time */
2124 +		tsk_rt(t)->job_params.exec_time += sleep_time;
2125 +	}
2126 +
2127 +
2128 +	/* Only add to ready queue if it is not the currently-scheduled
2129 +	 * task. This could be the case if a task was woken up concurrently
2130 +	 * on a remote CPU before the executing CPU got around to actually
2131 +	 * de-scheduling the task, i.e., wake_up() raced with schedule()
2132 +	 * and won.
2133 +	 */
2134 +	if (dom->scheduled != t) {
2135 +		if (get_partition(t) == dom->cpu)
2136 +			wm_requeue_local(t, edf);
2137 +		else
2138 +			/* post-pone migration until after unlocking */
2139 +			migrate = 1;
2140 +	}
2141 +
2142 +	raw_spin_unlock_irqrestore(&dom->slock, flags);
2143 +
2144 +	if (migrate) {
2145 +		TRACE_TASK(t, "wake_up-initiated migration to %d\n",
2146 +			   get_partition(t));
2147 +		wm_requeue_remote(t);
2148 +	}
2149 +
2150 +	TRACE_TASK(t, "wake up done\n");
2151 +}
2152 +
2153 +static void wm_task_block(struct task_struct *t)
2154 +{
2155 +	wm_domain_t* dom = domain_of_task(t);
2156 +	unsigned long flags;
2157 +	lt_t now = litmus_clock();
2158 +
2159 +	TRACE_TASK(t, "block at %llu, state=%d\n", now, t->state);
2160 +
2161 +	tsk_rt(t)->semi_part.wm.suspend_time = now;
2162 +
2163 +	raw_spin_lock_irqsave(&dom->slock, flags);
2164 +	if (is_queued(t)) {
2165 +		TRACE_TASK(t, "still queued; migration invariant failed?\n");
2166 +		remove(&dom->domain, t);
2167 +	}
2168 +	raw_spin_unlock_irqrestore(&dom->slock, flags);
2169 +
2170 +	BUG_ON(!is_realtime(t));
2171 +}
2172 +
2173 +static void wm_task_exit(struct task_struct * t)
2174 +{
2175 +	unsigned long flags;
2176 +	wm_domain_t* dom = domain_of_task(t);
2177 +	rt_domain_t* edf = &dom->domain;
2178 +
2179 +	raw_spin_lock_irqsave(&dom->slock, flags);
2180 +	if (is_queued(t)) {
2181 +		/* dequeue */
2182 +		remove(edf, t);
2183 +	}
2184 +	if (dom->scheduled == t)
2185 +		dom->scheduled = NULL;
2186 +
2187 +	TRACE_TASK(t, "RIP, now reschedule\n");
2188 +
2189 +	preempt(dom);
2190 +	raw_spin_unlock_irqrestore(&dom->slock, flags);
2191 +}
2192 +
2193 +static long wm_check_params(struct task_struct *t)
2194 +{
2195 +	struct rt_param* p = tsk_rt(t);
2196 +	struct edf_wm_params* wm = &p->task_params.semi_part.wm;
2197 +	int i;
2198 +	lt_t tmp;
2199 +
2200 +	if (!is_sliced_task(t)) {
2201 +		/* regular task; nothing to check */
2202 +		TRACE_TASK(t, "accepted regular (non-sliced) task with "
2203 +			   "%d slices\n",
2204 +			   wm->count);
2205 +		return 0;
2206 +	}
2207 +
2208 +	/* (1) Either not sliced, or more than 1 slice. */
2209 +	if (wm->count == 1 || wm->count > MAX_EDF_WM_SLICES) {
2210 +		TRACE_TASK(t, "bad number of slices (%u) \n",
2211 +			   wm->count);
2212 +		return -EINVAL;
2213 +	}
2214 +
2215 +	/* (2) The partition has to agree with the first slice. */
2216 +	if (get_partition(t) != wm->slices[0].cpu) {
2217 +		TRACE_TASK(t, "partition and first slice CPU differ "
2218 +			   "(%d != %d)\n", get_partition(t), wm->slices[0].cpu);
2219 +		return -EINVAL;
2220 +	}
2221 +
2222 +	/* (3) The total budget must agree. */
2223 +	for (i = 0, tmp = 0; i < wm->count; i++)
2224 +		tmp += wm->slices[i].budget;
2225 +	if (get_exec_cost(t) != tmp) {
2226 +		TRACE_TASK(t, "total budget and sum of slice budgets differ\n");
2227 +		return -EINVAL;
2228 +	}
2229 +
2230 +	/* (4) The release of each slice must not precede the previous
2231 +	 *     deadline. */
2232 +	for (i = 0; i < wm->count - 1; i++)
2233 +		if (wm->slices[i].deadline > wm->slices[i + 1].offset) {
2234 +			TRACE_TASK(t, "slice %d overlaps with slice %d\n",
2235 +				   i, i + 1);
2236 +			return -EINVAL;
2237 +		}
2238 +
2239 +	/* (5) The budget of each slice must fit within [offset, deadline] */
2240 +	for (i = 0; i < wm->count; i++)
2241 +		if (lt_before(wm->slices[i].deadline, wm->slices[i].offset) ||
2242 +		    wm->slices[i].deadline - wm->slices[i].offset <
2243 +		    wm->slices[i].budget) {
2244 +			TRACE_TASK(t, "slice %d is overloaded\n", i);
2245 +			return -EINVAL;
2246 +		}
2247 +
2248 +	/* (6) The budget of each slice must exceed the minimum budget size. */
2249 +	for (i = 0; i < wm->count; i++)
2250 +		if (wm->slices[i].budget < MIN_EDF_WM_SLICE_SIZE) {
2251 +			TRACE_TASK(t, "slice %d is too short\n", i);
2252 +			return -EINVAL;
2253 +		}
2254 +
2255 +	/* (7) The CPU of each slice must be different from the previous CPU. */
2256 +	for (i = 0; i < wm->count - 1; i++)
2257 +		if (wm->slices[i].cpu == wm->slices[i + 1].cpu) {
2258 +			TRACE_TASK(t, "slice %d does not migrate\n", i);
2259 +			return -EINVAL;
2260 +		}
2261 +
2262 +	/* (8) The CPU of each slice must be online. */
2263 +	for (i = 0; i < wm->count; i++)
2264 +		if (!cpu_online(wm->slices[i].cpu)) {
2265 +			TRACE_TASK(t, "slice %d is allocated on offline CPU\n",
2266 +				   i);
2267 +			return -EINVAL;
2268 +		}
2269 +
2270 +	/* (9) A sliced task's budget must be precisely enforced. */
2271 +	if (!budget_precisely_enforced(t)) {
2272 +		TRACE_TASK(t, "budget is not precisely enforced "
2273 +			   "(policy: %d).\n",
2274 +			   tsk_rt(t)->task_params.budget_policy);
2275 +		return -EINVAL;
2276 +	}
2277 +
2278 +	TRACE_TASK(t, "accepted sliced task with %d slices\n",
2279 +		   wm->count);
2280 +
2281 +	return 0;
2282 +}
2283 +
2284 +static long wm_admit_task(struct task_struct* t)
2285 +{
2286 +	return task_cpu(t) == get_partition(t) ? wm_check_params(t) : -EINVAL;
2287 +}
2288 +
2289 +/*	Plugin object	*/
2290 +static struct sched_plugin edf_wm_plugin __cacheline_aligned_in_smp = {
2291 +	.plugin_name		= "EDF-WM",
2292 +	.tick			= wm_tick,
2293 +	.task_new		= wm_task_new,
2294 +	.complete_job		= complete_job,
2295 +	.task_exit		= wm_task_exit,
2296 +	.schedule		= wm_schedule,
2297 +	.release_at		= wm_release_at,
2298 +	.task_wake_up		= wm_task_wake_up,
2299 +	.task_block		= wm_task_block,
2300 +	.admit_task		= wm_admit_task
2301 +};
2302 +
2303 +
2304 +static int __init init_edf_wm(void)
2305 +{
2306 +	int i;
2307 +
2308 +	/* FIXME: breaks with CPU hotplug
2309 +	 */
2310 +	for (i = 0; i < num_online_cpus(); i++) {
2311 +		wm_domain_init(remote_domain(i),
2312 +			       wm_check_resched,
2313 +			       NULL, i);
2314 +	}
2315 +	return register_sched_plugin(&edf_wm_plugin);
2316 +}
2317 +
2318 +module_init(init_edf_wm);
2319 +
2320 diff --git a/litmus/sched_npsf.c b/litmus/sched_npsf.c
2321 new file mode 100644
2322 index 000000000000..aad99c7e447c
2323 --- /dev/null
2324 +++ b/litmus/sched_npsf.c
2325 @@ -0,0 +1,1185 @@
2326 +/*
2327 + * litmus/sched_npsf.c
2328 + *
2329 + * Implementation of the NPS-F scheduling algorithm.
2330 + *
2331 + * A _server_ may span on multiple _reserves_ on different CPUs.
2332 + *
2333 + *                      *                      1
2334 + * +--------------+  +--> +--------------+  +--> +--------------+
2335 + * | cpu_entry_t  |  |    | npsf_reserve |  |    | npsf_server  |
2336 + * +--------------+  |    +--------------+  |    +--------------+
2337 + * |              |1 |    |              |1 |    |              |
2338 + * | cpu_reserve  |--+   1|       server |--+   1|              |
2339 + * |              |   +---| cpu          |   +---| curr_reserve |
2340 + * +--------------+ <-+   +--------------+ <-+   +--------------+
2341 + *                  1                      *
2342 + */
2343 +
2344 +#include <asm/uaccess.h>
2345 +#include <linux/percpu.h>
2346 +#include <linux/sched.h>
2347 +#include <linux/list.h>
2348 +#include <linux/spinlock.h>
2349 +#include <linux/slab.h>
2350 +
2351 +#include <linux/module.h>
2352 +
2353 +#include <litmus/litmus.h>
2354 +#include <litmus/jobs.h>
2355 +#include <litmus/sched_plugin.h>
2356 +#include <litmus/edf_common.h>
2357 +
2358 +/* Be extra verbose (log spam) */
2359 +#define NPSF_VERBOSE
2360 +
2361 +#ifdef NPSF_VERBOSE
2362 +#define npsf_printk(fmt, arg...) printk(KERN_INFO fmt, ##arg)
2363 +#else
2364 +#define npsf_printk(fmt, arg...)
2365 +#endif
2366 +
2367 +struct npsf_reserve;
2368 +
2369 +/* cpu_entry_t
2370 + *
2371 + * Each cpu has a list of reserves assigned on the cpu.
2372 + * Each reserve has a pointer to its server (Notional processor)
2373 + * that may be shared among multiple reserves.
2374 + */
2375 +typedef struct  {
2376 +	/* lock to protect cpu_reserve and list changes */
2377 +	raw_spinlock_t		cpu_res_lock;
2378 +	/* the reserve currently executing on this cpu */
2379 +	struct npsf_reserve	*cpu_reserve;
2380 +	/* list of reserves on this cpu */
2381 +	struct list_head	npsf_reserves;
2382 +	/* cpu ID */
2383 +	int 			cpu;
2384 +	/* timer to control reserve switching */
2385 +	struct hrtimer		timer;
2386 +	/* virtual timer expiring (wrt time_origin) */
2387 +	lt_t			should_expire;
2388 +	/* delegate timer firing to proper cpu */
2389 +	struct hrtimer_start_on_info	info;
2390 +	/* FIXME: the ids for servers should be an increasing int >=0 */
2391 +	int			last_seen_npsf_id;
2392 +} cpu_entry_t;
2393 +
2394 +/* one cpu_entry_t per CPU */
2395 +DEFINE_PER_CPU(cpu_entry_t, npsf_cpu_entries);
2396 +
2397 +/* This is the "notional processor" (i.e., simple server) abstraction. */
2398 +typedef struct npsf_server {
2399 +	/* shared among reserves */
2400 +	rt_domain_t		dom;
2401 +	/* the real-time task that this server *SHOULD* be scheduling */
2402 +	struct task_struct	*highest_prio;
2403 +	/* current reserve where this dom is executing */
2404 +	struct npsf_reserve	*curr_reserve;
2405 +	/* The "first" reserve for this server in a time slot.
2406 +	 * For non-migrating servers this will always be the same as curr_reserve. */
2407 +	struct npsf_reserve *first_reserve;
2408 +	/* Prevent a race between the last CPU in a reserve chain and the first. */
2409 +	int first_cpu_wants_ipi;
2410 +	/* rt_domain_t lock + npsf_server_t lock */
2411 +#define lock dom.ready_lock
2412 +} npsf_server_t;
2413 +
2414 +typedef struct npsf_reserve {
2415 +	/* Pointer to the server for this reserve: a server may be shared among
2416 +	 * multiple cpus with different budget per cpu, but same npsf_id. */
2417 +	npsf_server_t		*server;
2418 +	/* we queue here in npsf_reserves */
2419 +	struct list_head	node;
2420 +	/* budget of this npsf_id on this cpu */
2421 +	lt_t			budget;
2422 +	/* cpu for this (portion of) server */
2423 +	cpu_entry_t		*cpu;
2424 +	/* id of this server, it is the same for the
2425 +	 * same server on different cpus */
2426 +	int 			npsf_id;
2427 +	/* Can be used to identify if a reserve continues
2428 +	 * next npsf in the chain, needed for proper server deletion */
2429 +	struct npsf_reserve 	*next_npsf;
2430 +	/* flag that is true if the reserve is currently scheduled */
2431 +	int			is_currently_scheduled;
2432 +} npsf_reserve_t;
2433 +
2434 +/* synchronization point to start moving and switching servers only
2435 + * when all servers have been properly set up by the user.
2436 + */
2437 +static atomic_t all_servers_added;
2438 +static atomic_t timers_activated = ATOMIC_INIT(0);
2439 +
2440 +/* Virtual time starts here */
2441 +static lt_t time_origin;
2442 +
2443 +/* save number of online cpus seen at init time */
2444 +static unsigned int _online_cpus = 1;
2445 +
2446 +#define no_reserves(entry)	(list_empty(&((entry)->npsf_reserves)))
2447 +#define local_entry		(&__get_cpu_var(npsf_cpu_entries))
2448 +#define remote_entry(cpu)	(&per_cpu(npsf_cpu_entries, (cpu)))
2449 +
2450 +#define server_from_dom(domain)	(container_of((domain), npsf_server_t, dom))
2451 +
2452 +/* task_entry uses get_partition() therefore we must take care of
2453 + * updating correctly the task_params.cpu whenever we switch task,
2454 + * otherwise we'll deadlock.
2455 + */
2456 +#define task_entry(task)	remote_entry(get_partition(task))
2457 +#define domain_edf(npsf)	(&((npsf)->server->dom))
2458 +
2459 +#define task_npsfid(task)	((task)->rt_param.task_params.semi_part.npsf_id)
2460 +
2461 +static inline int owns_server(npsf_reserve_t *npsf)
2462 +{
2463 +	return (npsf->server->curr_reserve == npsf);
2464 +}
2465 +
2466 +/* utility functions to get next and prev domains; must hold entry lock */
2467 +static inline npsf_reserve_t* local_next_reserve(npsf_reserve_t *curr,
2468 +		cpu_entry_t *entry)
2469 +{
2470 +	return (list_is_last(&curr->node, &entry->npsf_reserves)) ?
2471 +		list_entry(entry->npsf_reserves.next, npsf_reserve_t, node) :
2472 +		list_entry(curr->node.next, npsf_reserve_t, node);
2473 +
2474 +}
2475 +
2476 +static inline npsf_reserve_t* local_prev_reserve(npsf_reserve_t *curr,
2477 +		cpu_entry_t *entry)
2478 +{
2479 +	return ((curr->node.prev == &entry->npsf_reserves) ?
2480 +		list_entry(entry->npsf_reserves.prev, npsf_reserve_t, node) :
2481 +		list_entry(curr->node.prev, npsf_reserve_t, node));
2482 +}
2483 +static void requeue(struct task_struct* t, rt_domain_t *edf)
2484 +{
2485 +	if (t->state != TASK_RUNNING)
2486 +		TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
2487 +
2488 +	BUG_ON(is_queued(t));
2489 +
2490 +	set_rt_flags(t, RT_F_RUNNING);
2491 +	if (is_released(t, litmus_clock()))
2492 +		__add_ready(edf, t);
2493 +	else
2494 +		add_release(edf, t); /* it has got to wait */
2495 +}
2496 +
2497 +/* we assume the lock is being held */
2498 +static void preempt(npsf_reserve_t *npsf)
2499 +{
2500 +	/* Since we do not support non-preemptable sections,
2501 +	 * we don't need to pass in a task. If we call this,
2502 +	 * we want the remote CPU to reschedule, no matter what.
2503 +	 */
2504 +	preempt_if_preemptable(NULL, npsf->cpu->cpu);
2505 +}
2506 +
2507 +
2508 +static void npsf_preempt_if_server_is_scheduled(npsf_server_t* srv)
2509 +{
2510 +	npsf_reserve_t *reserve = srv->curr_reserve;
2511 +	if (reserve->is_currently_scheduled) {
2512 +		preempt(reserve);
2513 +	}
2514 +}
2515 +
2516 +/* assumes lock is held by caller */
2517 +static void npsf_reschedule_server(npsf_server_t* srv)
2518 +{
2519 +	struct task_struct* hp = srv->highest_prio;
2520 +	rt_domain_t* edf = &srv->dom;
2521 +
2522 +	if (edf_preemption_needed(edf, hp)) {
2523 +		srv->highest_prio = __take_ready(edf);
2524 +		if (hp) {
2525 +			TRACE_TASK(hp, "requeue: no longer highest prio\n");
2526 +			requeue(hp, edf);
2527 +		}
2528 +		npsf_preempt_if_server_is_scheduled(srv);
2529 +	}
2530 +}
2531 +
2532 +static void npsf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
2533 +{
2534 +	npsf_server_t *srv = server_from_dom(rt);
2535 +	unsigned long flags;
2536 +
2537 +	raw_spin_lock_irqsave(&srv->lock, flags);
2538 +
2539 +	__merge_ready(rt, tasks);
2540 +	npsf_reschedule_server(srv);
2541 +
2542 +	raw_spin_unlock_irqrestore(&srv->lock, flags);
2543 +}
2544 +
2545 +static void job_completion(struct task_struct* t, int forced)
2546 +{
2547 +	sched_trace_task_completion(t, forced);
2548 +	TRACE_TASK(t, "job_completion().\n");
2549 +
2550 +	set_rt_flags(t, RT_F_SLEEP);
2551 +	prepare_for_next_period(t);
2552 +}
2553 +
2554 +/* When did this slot start ? */
2555 +static inline lt_t slot_begin(lt_t now)
2556 +{
2557 +	return (((now - time_origin) / npsf_slot_length)
2558 +			* npsf_slot_length + time_origin);
2559 +}
2560 +
2561 +/* Compute the delta from the beginning of the current slot. */
2562 +static inline lt_t delta_from_slot_begin(lt_t now)
2563 +{
2564 +	return (now - slot_begin(now));
2565 +}
2566 +
2567 +/* Given an offset into a slot, return the corresponding eligible reserve.
2568 + * The output param reservation_end is used to return the (relative) time at which
2569 + * the returned reserve ends.
2570 + */
2571 +static npsf_reserve_t* get_reserve_for_offset(cpu_entry_t *entry, lt_t offset,
2572 +                                              lt_t *reservation_end)
2573 +{
2574 +	npsf_reserve_t *tmp;
2575 +
2576 +	*reservation_end = 0;
2577 +
2578 +	/* linear search through all reserves, figure out which one is the last one
2579 +	 * to become eligible before delta */
2580 +	list_for_each_entry(tmp, &entry->npsf_reserves, node) {
2581 +		*reservation_end += tmp->budget;
2582 +
2583 +		/* We are always "late". Found tmp is the right one */
2584 +		if ((*reservation_end > offset))
2585 +			return tmp;
2586 +	}
2587 +
2588 +	/* error: we should never fall off the reserve list */
2589 +	BUG();
2590 +	return NULL;
2591 +}
2592 +
2593 +/* Determine which reserve is eligible based on the current time.
2594 + */
2595 +static npsf_reserve_t* get_current_reserve(cpu_entry_t *entry)
2596 +{
2597 +	lt_t reservation_end;
2598 +	lt_t offset = delta_from_slot_begin(litmus_clock());
2599 +	return get_reserve_for_offset(entry, offset, &reservation_end);
2600 +}
2601 +
2602 +/* This is used to ensure that we are "always" late, i.e., to make
2603 + * sure that the timer jitter is always positive. This should
2604 + * only trigger in KVM (or in real machines with bad TSC drift after
2605 + * an IPI).
2606 + *
2607 + * ATM proper tracing for this event is done in reserve_switch_tick().
2608 + */
2609 +static noinline ktime_t catchup_time(lt_t from, lt_t target)
2610 +{
2611 +	while(lt_before(from, target)) {
2612 +		from = litmus_clock();
2613 +
2614 +		mb();
2615 +		cpu_relax();
2616 +	}
2617 +
2618 +	return ns_to_ktime(from);
2619 +}
2620 +
2621 +
2622 +/* compute the next ABSOLUTE timer value */
2623 +static lt_t get_next_reserve_switch_time(void)
2624 +{
2625 +	cpu_entry_t *entry = local_entry;
2626 +	lt_t now        = litmus_clock();
2627 +	lt_t slot_start = slot_begin(now);
2628 +	lt_t offset     = now - slot_start;
2629 +	lt_t next_time;
2630 +	npsf_reserve_t* reserve;
2631 +
2632 +	/* compute the absolute litmus time of the next reserve switch */
2633 +	reserve = get_reserve_for_offset(entry, offset, &next_time);
2634 +	/* get_reserve_for_offset returns a relative start time; let's make it
2635 +	   absolute */
2636 +	next_time += slot_start;
2637 +
2638 +	/* Let's see if we need to skip the next timer. */
2639 +	reserve = local_next_reserve(reserve, entry);
2640 +	/* if the next reserve is a continuing reserve
2641 +	 * (i.e., if it belongs to a migrating server),
2642 +	 * then we skip the timer event because we will
2643 +	 * receive an IPI from the previous processor instead. */
2644 +	if (reserve->server->first_reserve != reserve) {
2645 +		/* it is indeed not the first reserve */
2646 +		next_time += reserve->budget;
2647 +	}
2648 +
2649 +	return next_time;
2650 +}
2651 +
2652 +/* This is the callback for reserve-switching interrupts.
2653 + * The timer is reprogrammed to expire at the beginning of every logical
2654 + * reserve (i.e., a continuing reserve may be split among different CPUs
2655 + * but is a _single_ logical reserve). get_next_reserve_switch_time()
2656 + * will return the right next_expire time.
2657 + */
2658 +static enum hrtimer_restart reserve_switch_tick(struct hrtimer *timer)
2659 +{
2660 +	unsigned long flags;
2661 +	cpu_entry_t *entry;
2662 +	/* we are using CLOCK_MONOTONIC */
2663 +	ktime_t now = ktime_get();
2664 +	ktime_t delta;
2665 +	int late;
2666 +
2667 +	entry = container_of(timer, cpu_entry_t, timer);
2668 +	raw_spin_lock_irqsave(&entry->cpu_res_lock, flags);
2669 +
2670 +	/* jitter wrt virtual time */
2671 +	delta = ktime_sub(now, ns_to_ktime(entry->should_expire));
2672 +	late = (ktime_to_ns(delta) >= 0) ? 1 : 0;
2673 +
2674 +#ifdef NPSF_VERBOSE
2675 +	if (entry->cpu_reserve && atomic_read(&all_servers_added))
2676 +		TRACE("(npsf_id: %d) tick starts at %Ld, "
2677 +		      "now - should_expire: %Ld\n",
2678 +		      entry->cpu_reserve->npsf_id,
2679 +		      ktime_to_ns(now), ktime_to_ns(delta));
2680 +#endif
2681 +	/* if the timer expires earlier than the should_expire time,
2682 +	 * we delay the switching until time it's synchronized with
2683 +	 * the switch boundary. Otherwise next reserve will execute
2684 +	 * longer (wrong).
2685 +	 */
2686 +	if (!late) {
2687 +		TRACE("+++ Timer fired early, waiting...\n");
2688 +		now = catchup_time(ktime_to_ns(now), entry->should_expire);
2689 +
2690 +		delta = ktime_sub(now, ns_to_ktime(entry->should_expire));
2691 +		TRACE("+++ done, tick restarts at %Ld, "
2692 +		      "now - should_expire: %Ld\n",
2693 +		      ktime_to_ns(now), ktime_to_ns(delta));
2694 +	}
2695 +
2696 +	BUG_ON(!atomic_read(&all_servers_added));
2697 +	BUG_ON(no_reserves(entry));
2698 +
2699 +	/* Compute the next time that we need to be notified. */
2700 +	entry->should_expire = get_next_reserve_switch_time();
2701 +
2702 +	/* kindly ask the Penguin to let us know... */
2703 +	hrtimer_set_expires(timer, ns_to_ktime(entry->should_expire));
2704 +
2705 +	/* set resched flag to reschedule local cpu */
2706 +	set_need_resched();
2707 +
2708 +	raw_spin_unlock_irqrestore(&entry->cpu_res_lock, flags);
2709 +#ifdef NPSF_VERBOSE
2710 +	if (atomic_read(&all_servers_added))
2711 +		TRACE("(npsf_id: %d) tick ends at %Ld, should_expire: %llu\n",
2712 +		      entry->cpu_reserve->npsf_id, ktime_to_ns(ktime_get()),
2713 +		      entry->should_expire);
2714 +#endif
2715 +
2716 +	return HRTIMER_RESTART;
2717 +}
2718 +
2719 +static void npsf_scheduler_tick(struct task_struct *t)
2720 +{
2721 +	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
2722 +		set_tsk_need_resched(t);
2723 +		TRACE("npsf_tick: %d is preemptable "
2724 +				" => FORCE_RESCHED\n", t->pid);
2725 +	}
2726 +}
2727 +
2728 +/* Assumption: caller holds srv lock and prev belongs to
2729 + * the currently-scheduled reservation.
2730 + */
2731 +static void npsf_schedule_server(struct task_struct* prev,
2732 +				 cpu_entry_t *entry)
2733 +{
2734 +	npsf_server_t* srv = entry->cpu_reserve->server;
2735 +
2736 +	int out_of_time, sleep, exists, blocks;
2737 +
2738 +	exists      = is_realtime(prev);
2739 +	blocks      = exists && !is_running(prev);
2740 +	out_of_time = exists &&
2741 +		budget_enforced(prev) &&
2742 +		budget_exhausted(prev);
2743 +	sleep	    = exists && get_rt_flags(prev) == RT_F_SLEEP;
2744 +
2745 +	if (exists)
2746 +		TRACE_TASK(prev, "(npsf_id %d) blocks:%d "
2747 +			   "out_of_time:%d sleep:%d state:%d sig:%d\n",
2748 +			   task_npsfid(prev),
2749 +			   blocks, out_of_time, sleep,
2750 +			   prev->state,
2751 +			   signal_pending(prev));
2752 +
2753 +	/* Any task that is preemptable and either exhausts its
2754 +	 * execution budget or wants to sleep completes. We may have
2755 +	 * to reschedule after this.
2756 +	 */
2757 +	if ((out_of_time || sleep) && !blocks) {
2758 +		job_completion(prev, !sleep);
2759 +
2760 +		if (srv->highest_prio != prev) {
2761 +			BUG_ON(!is_queued(prev));
2762 +			remove(&srv->dom, prev);
2763 +		}
2764 +
2765 +		requeue(prev, &srv->dom);
2766 +
2767 +		if (srv->highest_prio == prev)
2768 +			srv->highest_prio = __take_ready(&srv->dom);
2769 +	}
2770 +
2771 +	BUG_ON(blocks && prev == srv->highest_prio);
2772 +//	BUG_ON(!srv->highest_prio && jobs_pending(&srv->dom));
2773 +}
2774 +
2775 +static void npsf_notify_next_cpu(npsf_reserve_t *npsf_prev)
2776 +{
2777 +	npsf_server_t *srv;
2778 +
2779 +	if (unlikely(npsf_prev->next_npsf != npsf_prev)) {
2780 +		/* This reserve is actually shared. Let's update its 'owner'
2781 +		 * and notify the next CPU. */
2782 +		srv = npsf_prev->server;
2783 +		raw_spin_lock(&srv->lock);
2784 +		srv->curr_reserve = npsf_prev->next_npsf;
2785 +		if (srv->first_reserve != srv->curr_reserve ||
2786 +		    srv->first_cpu_wants_ipi) {
2787 +			/* send an IPI to notify next CPU in chain */
2788 +			srv->first_cpu_wants_ipi = 0;
2789 +			TRACE("sending IPI\n");
2790 +			preempt(srv->curr_reserve);
2791 +		}
2792 +		raw_spin_unlock(&srv->lock);
2793 +	}
2794 +}
2795 +
2796 +static struct task_struct* npsf_schedule(struct task_struct * prev)
2797 +{
2798 +	npsf_reserve_t *npsf_prev, *npsf_next;
2799 +	npsf_server_t *srv_prev, *srv_next;
2800 +	cpu_entry_t *entry = local_entry;
2801 +	struct task_struct *next;
2802 +
2803 +	int reserve_switch;
2804 +
2805 +	/* servers not ready yet, yield to linux */
2806 +	if (!atomic_read(&all_servers_added))
2807 +		return NULL;
2808 +
2809 +#ifdef NPSF_VERBOSE
2810 +	TRACE_TASK(prev, "schedule\n");
2811 +#endif
2812 +	raw_spin_lock(&entry->cpu_res_lock);
2813 +
2814 +	BUG_ON(no_reserves(entry));
2815 +
2816 +	/* step 1: what are we currently serving? */
2817 +	npsf_prev = entry->cpu_reserve;
2818 +	srv_prev  = npsf_prev->server;
2819 +
2820 +	/* step 2: what SHOULD we be currently serving? */
2821 +	npsf_next = get_current_reserve(entry);
2822 +	srv_next  = npsf_next->server;
2823 +
2824 +	/* TODO second measuring point for IPI receiving
2825 +	 * if (!srv_next->measure_wait_IPI) --- the remote reset
2826 +	 * 	trace_time_end.
2827 +	 */
2828 +	raw_spin_lock(&srv_prev->lock);
2829 +
2830 +
2831 +	/* step 3: update prev server */
2832 +	if (is_realtime(prev) && task_npsfid(prev) == entry->cpu_reserve->npsf_id)
2833 +		npsf_schedule_server(prev, entry);
2834 +	else if (is_realtime(prev))
2835 +		TRACE_TASK(prev, "npsf_id %d != cpu_reserve npsf_id %d\n",
2836 +				task_npsfid(prev), entry->cpu_reserve->npsf_id);
2837 +
2838 +	/* step 4: determine if we need to switch to another reserve */
2839 +	reserve_switch = npsf_prev != npsf_next;
2840 +
2841 +	if (!reserve_switch) {
2842 +		/* easy case: just enact what the server scheduler decided */
2843 +		next = srv_prev->highest_prio;
2844 +
2845 +		/* Unlock AFTER observing highest_prio to avoid races with
2846 +		 * remote rescheduling activity. */
2847 +		raw_spin_unlock(&srv_prev->lock);
2848 +	} else {
2849 +		/* In this case we have a reserve switch.  We are done with the
2850 +		 * previous server, so release its lock. */
2851 +		TRACE("switch reserve npsf_id %d -> npsf_id %d\n",
2852 +				npsf_prev->npsf_id, npsf_next->npsf_id);
2853 +		npsf_prev->is_currently_scheduled = 0;
2854 +		raw_spin_unlock(&srv_prev->lock);
2855 +
2856 +		/* Move on to the next server. */
2857 +
2858 +		raw_spin_lock(&srv_next->lock);
2859 +		npsf_next->is_currently_scheduled = 1;
2860 +
2861 +		/* make sure we are owner of a server (if it is shared) */
2862 +		if (unlikely(srv_next->curr_reserve != npsf_next)) {
2863 +			/* We raced with the previous owner.  Let's schedule
2864 +			 * the previous reserve for now. The previous owner
2865 +			 * will send us an IPI when the server has been pushed
2866 +			 * to us.
2867 +			 */
2868 +			TRACE("(npsf_id %d) raced with previous server owner\n",
2869 +			      npsf_next->npsf_id);
2870 +
2871 +			/* check if we are the first CPU, in which case we need
2872 +			 * to request a notification explicitly */
2873 +			if (srv_next->first_reserve == npsf_next)
2874 +				srv_next->first_cpu_wants_ipi = 1;
2875 +
2876 +			npsf_next->is_currently_scheduled = 0;
2877 +			raw_spin_unlock(&srv_next->lock);
2878 +
2879 +			/* just keep the previous reserve one more time */
2880 +			raw_spin_lock(&srv_prev->lock);
2881 +
2882 +			npsf_prev->is_currently_scheduled = 1;
2883 +			/* Note that there is not a race condition here.
2884 +			 * Since curr_reserve didn't point yet to this reserve,
2885 +			 * no processor would have observed the one in npsf_next.
2886 +			 * A processor might have observed the flag being zero
2887 +			 * in npsf_prev and decided not to send an IPI, which
2888 +			 * doesn't matter since we are going to reschedule
2889 +			 * below anyway. */
2890 +
2891 +			next = srv_prev->highest_prio;
2892 +
2893 +			raw_spin_unlock(&srv_prev->lock);
2894 +
2895 +			/* TODO first measuring point for '0'-switching time
2896 +			 * remote is not ready yet and will send us an IPI
2897 +			 * when it's done.
2898 +			 * local:
2899 +			 * 	srv_next->measure_wait_IPI = 1;
2900 +			 * remote before sending IPI:
2901 +			 * 	if (srv_next->measure_wait_IPI) reset;
2902 +			 */
2903 +		} else {
2904 +			/* invariant: srv->highest_prio is always the
2905 +			 * highest-priority job in the server, and it is always
2906 +			 * runnable. Any update to the server must maintain
2907 +			 * this invariant. */
2908 +			next = srv_next->highest_prio;
2909 +
2910 +			entry->cpu_reserve = npsf_next;
2911 +			raw_spin_unlock(&srv_next->lock);
2912 +
2913 +			/* send an IPI (if necessary) */
2914 +			npsf_notify_next_cpu(npsf_prev);
2915 +		}
2916 +
2917 +	}
2918 +
2919 +	if (next) {
2920 +		TRACE_TASK(next, "(npsf_id %d) scheduled at %llu\n",
2921 +			   task_npsfid(next), litmus_clock());
2922 +		set_rt_flags(next, RT_F_RUNNING);
2923 +		/* The TASK_RUNNING flag is set by the Penguin _way_ after
2924 +		 * activating a task. This doesn't matter much to Linux as
2925 +		 * the rq lock will prevent any changes, but it matters to
2926 +		 * us. It is possible for a remote cpu waking up this task
2927 +		 * to requeue the task before it's runnable, send an IPI here,
2928 +		 * we schedule that task (still "not-runnable"), and only
2929 +		 * before the real execution of next, the running flag is set.
2930 +		 */
2931 +		if (!is_running(next))
2932 +			TRACE_TASK(next, "BAD: !TASK_RUNNING\n");
2933 +	} else {
2934 +		/* FIXME npsf_id is wrong if reserve switch but "switching back"
2935 +		 * if we race */
2936 +		TRACE("(npsf_id %d) becoming idle at %llu\n",
2937 +		      reserve_switch ? npsf_next->npsf_id : npsf_prev->npsf_id,
2938 +		      litmus_clock());
2939 +	}
2940 +
2941 +	raw_spin_unlock(&entry->cpu_res_lock);
2942 +
2943 +	return next;
2944 +}
2945 +
2946 +/*	Prepare a task for running in RT mode
2947 + *
2948 + *	We can only be sure that the cpu is a right one (admit checks
2949 + *	against tasks released on a cpu that doesn't host the right npsf_id)
2950 + *	but we _cannot_ be sure that:
2951 + *	1) the found npsf is the reserve currently running on this cpu.
2952 + *	2) the current reserve (the one in charge of scheduling) is not
2953 + *	running on a different cpu.
2954 + */
2955 +static void npsf_task_new(struct task_struct * t, int on_rq, int running)
2956 +{
2957 +	npsf_reserve_t *npsf;
2958 +	npsf_server_t *srv;
2959 +	cpu_entry_t *entry = task_entry(t);
2960 +	rt_domain_t *edf;
2961 +	unsigned long flags;
2962 +
2963 +	BUG_ON(no_reserves(entry));
2964 +
2965 +	/* search the proper npsf_server where to add the new task */
2966 +	list_for_each_entry(npsf, &entry->npsf_reserves, node) {
2967 +		if (npsf->npsf_id == task_npsfid(t))
2968 +			break;
2969 +	}
2970 +
2971 +
2972 +	srv = npsf->server;
2973 +
2974 +	/* The task should be running in the queue, otherwise signal
2975 +	 * code will try to wake it up with fatal consequences.
2976 +	 */
2977 +	raw_spin_lock_irqsave(&entry->cpu_res_lock, flags);
2978 +	raw_spin_lock(&srv->lock);
2979 +
2980 +	edf = domain_edf(npsf);
2981 +	tsk_rt(t)->domain = edf;
2982 +
2983 +	TRACE_TASK(t, "task_new: P%d, task_npsfid %d, "
2984 +			"npsf->npsf_id %d, entry->cpu %d\n",
2985 +			t->rt_param.task_params.cpu, task_npsfid(t),
2986 +			npsf->npsf_id, entry->cpu);
2987 +
2988 +	/* setup job parameters */
2989 +	release_at(t, litmus_clock());
2990 +
2991 +	/* There are four basic scenarios that could happen:
2992 +	 *  1) the server is on another cpu and scheduled;
2993 +	 *  2) the server is on another cpu and not scheduled;
2994 +	 *  3) the server is on this cpu and scheduled; and
2995