Attachment 'litmus-rt-semi-part-with-edfos.patch'
1 diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
2 index 6b4ffedb93c9..dd78ef687c5e 100644
3 --- a/arch/x86/vdso/Makefile
4 +++ b/arch/x86/vdso/Makefile
5 @@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
6
7 export CPPFLAGS_vdso.lds += -P -C
8
9 -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
10 +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
11 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
12
13 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
14 @@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter
15 vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
16
17 CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
18 -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
19 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
20
21 # This makes sure the $(obj) subdirectory exists even though vdso32/
22 # is not a kbuild sub-make subdirectory.
23 diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
24 index 5d20276e44f4..867239875eef 100644
25 --- a/include/litmus/litmus.h
26 +++ b/include/litmus/litmus.h
27 @@ -88,7 +88,7 @@ inline static int budget_exhausted(struct task_struct* t)
28 inline static lt_t budget_remaining(struct task_struct* t)
29 {
30 if (!budget_exhausted(t))
31 - return get_exec_time(t) - get_exec_cost(t);
32 + return get_exec_cost(t) - get_exec_time(t);
33 else
34 /* avoid overflow */
35 return 0;
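Note on the hunk above: it corrects an operand swap — the remaining budget is the allotted exec_cost minus the exec_time consumed so far; the old order would underflow the unsigned lt_t value whenever the budget was not yet exhausted, which is exactly the case guarded by budget_exhausted(). A minimal stand-alone illustration of the corrected arithmetic (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        /* lt_t is an unsigned 64-bit nanosecond count in LITMUS^RT */
        unsigned long long exec_cost = 10000000ULL; /* 10 ms allotted */
        unsigned long long exec_time =  4000000ULL; /*  4 ms consumed */

        /* budget_remaining(): cost - time, valid while time < cost */
        printf("remaining = %llu ns\n", exec_cost - exec_time);
        return 0;
    }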
36 diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
37 index a7a183f34a80..bc3bbd07ef26 100644
38 --- a/include/litmus/rt_param.h
39 +++ b/include/litmus/rt_param.h
40 @@ -1,3 +1,6 @@
41 +#include <linux/threads.h>
42 +#include <litmus/bheap.h>
43 +
44 /*
45 * Definition of the scheduler plugin interface.
46 *
47 @@ -33,6 +36,91 @@ typedef enum {
48 PRECISE_ENFORCEMENT /* NOT IMPLEMENTED - enforced with hrtimers */
49 } budget_policy_t;
50
51 +
52 +/* The parameters for the EDF-fm scheduling algorithm.
53 + * Each task may be fixed or migratory. A migratory task may
54 + * migrate between 2 (contiguous) CPUs only (NR_CPUS_EDF_FM = 2).
55 + */
56 +#define NR_CPUS_EDF_FM 2
57 +#define NR_CPUS_EDF_OS 24
58 +
59 +struct edffm_params {
60 + /* EDF-fm where can a migratory task execute? */
61 + unsigned int cpus[NR_CPUS_EDF_FM];
62 + /* how many cpus are used by this task?
63 + * fixed = 0, migratory = (NR_CPUS_EDF_FM - 1)
64 + * Efficient way to allow writing cpus[nr_cpus].
65 + */
66 + unsigned int nr_cpus;
67 + /* Fraction of this task exec_cost that each CPU should handle.
68 + * We keep the fraction as num/denom: a matrix of
69 + * (2 rows) x (NR_CPUS_EDF_FM columns), one column per CPU.
70 + * The first row holds the numerators of the fractions.
71 + * The second row holds the denominators.
72 + * Since NR_CPUS_EDF_FM = 2, in EDF-fm this is a 2x2 matrix.
73 + */
74 + lt_t fraction[2][NR_CPUS_EDF_FM];
75 +};
76 +
77 +struct edfos_params {
78 + /* The first CPU. */
79 + unsigned int first_cpu;
80 + /* Whether this task is a migrating task */
81 + unsigned int migrat;
82 + /* Time of next subtask release or deadline */
83 + int heap_data[NR_CPUS_EDF_OS];
84 + /* Fraction of this task exec_cost that each CPU should handle.
85 + * We keep the fraction divided in num/denom : a matrix of
86 + * (NR_CPUS_EDF_OS rows) x (2 columns).
87 + * The first column is the numerator of the fraction.
88 + * The second column is the denominator.
89 + */
90 + lt_t fraction[NR_CPUS_EDF_OS][2];
91 + struct bheap release_queue;
92 + struct bheap ready_queue;
93 +};
94 +
95 +/* Parameters for NPS-F semi-partitioned scheduling algorithm.
96 + * Each (cpu, budget) entry defines the share ('budget' in ns, a % of
97 + * the slot_length) of the notional processor on the CPU 'cpu'.
98 + * This structure is used by the library/syscall interface so that
99 + * the overhead of a syscall is paid only once per server.
100 + */
101 +struct npsf_budgets {
102 + int cpu;
103 + lt_t budget;
104 +};
105 +
106 +/* The parameters for the EDF-WM semi-partitioned scheduler.
107 + * Each task may be split across multiple cpus. Each per-cpu allocation
108 + * is called a 'slice'.
109 + */
110 +#define MAX_EDF_WM_SLICES 24
111 +#define MIN_EDF_WM_SLICE_SIZE 50000 /* .05 millisecond = 50us */
112 +
113 +struct edf_wm_slice {
114 + /* on which CPU is this slice allocated */
115 + unsigned int cpu;
116 + /* relative deadline from job release (not from slice release!) */
117 + lt_t deadline;
118 + /* budget of this slice; must be precisely enforced */
119 + lt_t budget;
120 + /* offset of this slice relative to the job release */
121 + lt_t offset;
122 +};
123 +
124 +/* If a job is not sliced across multiple CPUs, then
125 + * count is set to zero and none of the slices is used.
126 + * This implies that count == 1 is illegal.
127 + */
128 +struct edf_wm_params {
129 + /* enumeration of all slices */
130 + struct edf_wm_slice slices[MAX_EDF_WM_SLICES];
131 +
132 + /* how many slices are defined? */
133 + unsigned int count;
134 +};
135 +
136 struct rt_task {
137 lt_t exec_cost;
138 lt_t period;
139 @@ -40,6 +128,25 @@ struct rt_task {
140 unsigned int cpu;
141 task_class_t cls;
142 budget_policy_t budget_policy; /* ignored by pfair */
143 +
144 + /* parameters used by the semi-partitioned algorithms */
145 + union {
146 + /* EDF-Fm; defined in sched_edf_fm.c */
147 + struct edffm_params fm;
148 +
149 + /* EDF-os; defined in sched_edf_os.c */
150 + struct edfos_params os;
151 +
152 + /* NPS-F; defined in sched_npsf.c
153 + * id for the server (notional processor) that holds
154 + * this task; the same npsf_id can be assigned to "the same"
155 + * server split on different cpus
156 + */
157 + int npsf_id;
158 +
159 + /* EDF-WM; defined in sched_edf_wm.c */
160 + struct edf_wm_params wm;
161 + } semi_part;
162 };
163
164 /* The definition of the data that is shared between the kernel and real-time
165 @@ -184,6 +291,27 @@ struct rt_param {
166
167 /* Pointer to the page shared between userspace and kernel. */
168 struct control_page * ctrl_page;
169 +
170 + /* runtime info for the semi-part plugins */
171 + union {
172 + /* EDF-Fm and EDF-os runtime information
173 + * number of jobs handled by this cpu
174 + * (to determine next cpu for a migrating task)
175 + */
176 + unsigned int cpu_job_no[NR_CPUS_EDF_OS];
177 +
178 + /* EDF-WM runtime information */
179 + struct {
180 + /* at which exec time did the current slice start? */
181 + lt_t exec_time;
182 + /* when did the job suspend? */
183 + lt_t suspend_time;
184 + /* cached job parameters */
185 + lt_t job_release, job_deadline;
186 + /* pointer to the current slice */
187 + struct edf_wm_slice* slice;
188 + } wm;
189 + } semi_part;
190 };
191
192 /* Possible RT flags */
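The additions to rt_param.h above only define the data layout; the semi-partitioned parameters are expected to be filled in by user space and passed to the kernel through the usual rt_task interface. Below is a hedged sketch (not part of the patch, all values invented) of describing a job split into two EDF-WM slices on CPUs 0 and 1, using only the fields defined above; how the parameters are actually submitted (e.g. via liblitmus) is outside this patch:

    /* Assumes <string.h> and the <litmus/rt_param.h> definitions added above. */
    static void fill_wm_params(struct rt_task *params)
    {
        memset(params, 0, sizeof(*params));
        params->exec_cost = 6000000;   /* 6 ms per job (ns) */
        params->period    = 20000000;  /* 20 ms period (ns) */
        params->cpu       = 0;         /* assumption: partition of the first slice */

        params->semi_part.wm.count = 2; /* 0 = not sliced; count == 1 is illegal */
        params->semi_part.wm.slices[0] = (struct edf_wm_slice) {
            .cpu = 0, .offset = 0,      /* starts at the job release */
            .deadline = 10000000,       /* relative to the job release */
            .budget = 3000000
        };
        params->semi_part.wm.slices[1] = (struct edf_wm_slice) {
            .cpu = 1, .offset = 10000000,
            .deadline = 20000000,
            .budget = 3000000           /* budgets sum to exec_cost */
        };
    }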
193 diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
194 index 9c1c9f28ba79..7ea9176624ff 100644
195 --- a/include/litmus/sched_plugin.h
196 +++ b/include/litmus/sched_plugin.h
197 @@ -6,6 +6,8 @@
198 #define _LINUX_SCHED_PLUGIN_H_
199
200 #include <linux/sched.h>
201 +/* NSEC_PER... conversions */
202 +#include <linux/time.h>
203
204 /* struct for semaphore with priority inheritance */
205 struct pi_semaphore {
206 @@ -136,6 +138,9 @@ extern struct sched_plugin *litmus;
207 /* cluster size: cache_index = 2 L2, cache_index = 3 L3 */
208 extern int cluster_cache_index;
209
210 +/* Slot length (ns) for NPS-F semi-part. algo */
211 +extern lt_t npsf_slot_length;
212 +
213 int register_sched_plugin(struct sched_plugin* plugin);
214 struct sched_plugin* find_sched_plugin(const char* name);
215 int print_sched_plugins(char* buf, int max);
216 diff --git a/include/litmus/trace.h b/include/litmus/trace.h
217 index b32c71180774..6afbf96ef9e4 100644
218 --- a/include/litmus/trace.h
219 +++ b/include/litmus/trace.h
220 @@ -78,6 +78,8 @@ feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu)
221 #define TS_TICK_START(t) TTIMESTAMP(110, t)
222 #define TS_TICK_END(t) TTIMESTAMP(111, t)
223
224 +#define TS_PULL_TIMER_START TIMESTAMP(112)
225 +#define TS_PULL_TIMER_END TIMESTAMP(113)
226
227 #define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */
228 #define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */
229 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
230 index f0618e75348d..4e82c52722c8 100644
231 --- a/include/litmus/unistd_64.h
232 +++ b/include/litmus/unistd_64.h
233 @@ -33,5 +33,7 @@ __SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
234 __SYSCALL(__NR_release_ts, sys_release_ts)
235 #define __NR_null_call __LSC(13)
236 __SYSCALL(__NR_null_call, sys_null_call)
237 +#define __NR_add_server __LSC(14)
238 +__SYSCALL(__NR_add_server, sys_add_server)
239
240 -#define NR_litmus_syscalls 14
241 +#define NR_litmus_syscalls 15
242 diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
243 index fdf95968e517..23d3712012f4 100644
244 --- a/kernel/hrtimer.c
245 +++ b/kernel/hrtimer.c
246 @@ -47,6 +47,7 @@
247 #include <linux/timer.h>
248
249 #include <litmus/litmus.h>
250 +#include <litmus/trace.h>
251
252 #include <asm/uaccess.h>
253
254 @@ -1063,6 +1064,7 @@ void hrtimer_pull(void)
255 struct hrtimer_start_on_info *info;
256 struct list_head *pos, *safe, list;
257
258 + TS_PULL_TIMER_START;
259 raw_spin_lock(&base->lock);
260 list_replace_init(&base->to_pull, &list);
261 raw_spin_unlock(&base->lock);
262 @@ -1073,6 +1075,7 @@ void hrtimer_pull(void)
263 list_del(pos);
264 hrtimer_start(info->timer, info->time, info->mode);
265 }
266 + TS_PULL_TIMER_END;
267 }
268
269 /**
270 diff --git a/litmus/Makefile b/litmus/Makefile
271 index f301d2842e43..b243093abc6d 100644
272 --- a/litmus/Makefile
273 +++ b/litmus/Makefile
274 @@ -14,7 +14,11 @@ obj-y = sched_plugin.o litmus.o \
275 bheap.o \
276 ctrldev.o \
277 sched_gsn_edf.o \
278 - sched_psn_edf.o
279 + sched_psn_edf.o \
280 + sched_edf_wm.o \
281 + sched_npsf.o \
282 + sched_edf_fm.o \
283 + sched_edf_os.o
284
285 obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
286 obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
287 diff --git a/litmus/litmus.c b/litmus/litmus.c
288 index b04a42b0da9c..2f780222d8e8 100644
289 --- a/litmus/litmus.c
290 +++ b/litmus/litmus.c
291 @@ -632,6 +632,55 @@ static int proc_write_cluster_size(struct file *file,
292 return len;
293 }
294
295 +static int proc_read_npsf_slot_length(char *page, char **start,
296 + off_t off, int count,
297 + int *eof, void *data)
298 +{
299 + return snprintf(page, PAGE_SIZE, "%d us\n",
300 + (int) (npsf_slot_length / NSEC_PER_USEC));
301 +}
302 +
303 +extern void npsf_hrtimers_cleanup(void);
304 +/* NPS-F slot length in us.
305 + *
306 + * Writing 0 as npsf_slot_length will trigger the removal of the
307 + * hrtimers for the domain_reschedule_tick() in the NPS-F plugin.
308 + */
309 +static int proc_write_npsf_slot_length(struct file *file,
310 + const char *buffer,
311 + unsigned long count,
312 + void *data)
313 +{
314 + int err, slot_length;
315 + char msg[64];
316 +
317 + if (count > 63)
318 + return -EINVAL;
319 +
320 + if (copy_from_user(msg, buffer, count))
321 + return -EFAULT;
322 +
323 + /* terminate */
324 + msg[count] = '\0';
325 + /* chomp */
326 + if (count > 1 && msg[count - 1] == '\n')
327 + msg[count - 1] = '\0';
328 +
329 + err = sscanf(msg, "%d", &slot_length);
330 +
331 + if (err == 1) {
332 + if (!slot_length) {
333 + npsf_hrtimers_cleanup();
334 + /* reset to default */
335 + slot_length = 5000;
336 + }
337 + npsf_slot_length = (lt_t)((lt_t) slot_length * NSEC_PER_USEC);
338 + return count;
339 + }
340 +
341 + return -EINVAL;
342 +}
343 +
344 #ifdef CONFIG_RELEASE_MASTER
345 static int proc_read_release_master(char *page, char **start,
346 off_t off, int count,
347 @@ -691,7 +740,8 @@ static struct proc_dir_entry *litmus_dir = NULL,
348 #ifdef CONFIG_RELEASE_MASTER
349 *release_master_file = NULL,
350 #endif
351 - *clus_cache_idx_file = NULL;
352 + *clus_cache_idx_file = NULL,
353 + *npsf_slot_length_file = NULL;
354
355 static int __init init_litmus_proc(void)
356 {
357 @@ -733,6 +783,16 @@ static int __init init_litmus_proc(void)
358 clus_cache_idx_file->read_proc = proc_read_cluster_size;
359 clus_cache_idx_file->write_proc = proc_write_cluster_size;
360
361 + npsf_slot_length_file = create_proc_entry("npsf_slot_length",
362 + 0644, litmus_dir);
363 + if (!npsf_slot_length_file) {
364 + printk(KERN_ERR "Could not allocate npsf_slot_length "
365 + "procfs entry.\n");
366 + return -ENOMEM;
367 + }
368 + npsf_slot_length_file->read_proc = proc_read_npsf_slot_length;
369 + npsf_slot_length_file->write_proc = proc_write_npsf_slot_length;
370 +
371 stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
372 proc_read_stats, NULL);
373
374 @@ -752,6 +812,8 @@ static void exit_litmus_proc(void)
375 remove_proc_entry("active_plugin", litmus_dir);
376 if (clus_cache_idx_file)
377 remove_proc_entry("cluster_cache", litmus_dir);
378 + if (npsf_slot_length_file)
379 + remove_proc_entry("npsf_slot_length", litmus_dir);
380 #ifdef CONFIG_RELEASE_MASTER
381 if (release_master_file)
382 remove_proc_entry("release_master", litmus_dir);
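For reference, the proc entry created above accepts the NPS-F slot length in microseconds and reports it back in the same unit; writing 0 tears down the plugin's hrtimers and resets the length to the 5000 us default. A small user-space sketch (not part of the patch; it assumes the LITMUS^RT proc directory appears as /proc/litmus):

    #include <stdio.h>

    int main(void)
    {
        /* Set a 5 ms NPS-F slot; equivalent to
         * echo 5000 > /proc/litmus/npsf_slot_length */
        FILE *f = fopen("/proc/litmus/npsf_slot_length", "w");
        if (!f)
            return 1;
        fprintf(f, "5000\n");  /* interpreted as microseconds */
        return fclose(f) ? 1 : 0;
    }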
383 diff --git a/litmus/sched_edf_fm.c b/litmus/sched_edf_fm.c
384 new file mode 100644
385 index 000000000000..0465220f9dbb
386 --- /dev/null
387 +++ b/litmus/sched_edf_fm.c
388 @@ -0,0 +1,571 @@
389 +/*
390 + * litmus/sched_edf_fm.c
391 + *
392 + * Implementation of the EDF-fm scheduling algorithm.
393 + */
394 +
395 +#include <linux/percpu.h>
396 +#include <linux/sched.h>
397 +#include <linux/list.h>
398 +#include <linux/spinlock.h>
399 +
400 +#include <linux/module.h>
401 +
402 +#include <litmus/litmus.h>
403 +#include <litmus/jobs.h>
404 +#include <litmus/sched_plugin.h>
405 +#include <litmus/edf_common.h>
406 +
407 +typedef struct {
408 + rt_domain_t domain;
409 + int cpu;
410 + struct task_struct* scheduled; /* only RT tasks */
411 +/* domain lock */
412 +#define slock domain.ready_lock
413 +} edffm_domain_t;
414 +
415 +DEFINE_PER_CPU(edffm_domain_t, edffm_domains);
416 +
417 +#define local_edffm (&__get_cpu_var(edffm_domains))
418 +#define remote_edf(cpu) (&per_cpu(edffm_domains, cpu).domain)
419 +#define remote_edffm(cpu) (&per_cpu(edffm_domains, cpu))
420 +#define task_edf(task) remote_edf(get_partition(task))
421 +#define task_edffm(task) remote_edffm(get_partition(task))
422 +
423 +#define edffm_params(t) (t->rt_param.task_params.semi_part.fm)
424 +
425 +/* Is the task a migratory task? */
426 +#define is_migrat_task(task) (edffm_params(task).nr_cpus)
427 +/* t is on the wrong CPU (it should be requeued properly) */
428 +#define wrong_cpu(t) is_migrat_task((t)) && task_cpu((t)) != get_partition((t))
429 +/* Get next CPU */
430 +#define migrat_next_cpu(t) \
431 + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
432 + edffm_params(t).cpus[1] : \
433 + edffm_params(t).cpus[0])
434 +/* Get current cpu */
435 +#define migrat_cur_cpu(t) \
436 + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
437 + edffm_params(t).cpus[0] : \
438 + edffm_params(t).cpus[1])
439 +/* Manipulate share for current cpu */
440 +#define cur_cpu_fract_num(t) \
441 + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
442 + edffm_params(t).fraction[0][0] : \
443 + edffm_params(t).fraction[0][1])
444 +#define cur_cpu_fract_den(t) \
445 + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
446 + edffm_params(t).fraction[1][0] : \
447 + edffm_params(t).fraction[1][1])
448 +/* Get job number for current cpu */
449 +#define cur_cpu_job_no(t) \
450 + ((tsk_rt(t)->task_params.cpu == edffm_params(t).cpus[0]) ? \
451 + tsk_rt(t)->semi_part.cpu_job_no[0] : \
452 + tsk_rt(t)->semi_part.cpu_job_no[1])
453 +/* What is the current cpu position in the array? */
454 +#define edffm_cpu_pos(cpu,t) \
455 + ((cpu == edffm_params(t).cpus[0]) ? \
456 + 0 : 1)
457 +
458 +/*
459 + * EDF-fm: migratory tasks have higher prio than fixed, EDF in both classes.
460 + * (Both first and second may be NULL).
461 + */
462 +int edffm_higher_prio(struct task_struct* first, struct task_struct* second)
463 +{
464 + if ((first && edffm_params(first).nr_cpus) ||
465 + (second && edffm_params(second).nr_cpus)) {
466 + if ((first && edffm_params(first).nr_cpus) &&
467 + (second && edffm_params(second).nr_cpus))
468 + /* both are migrating */
469 + return edf_higher_prio(first, second);
470 +
471 + if (first && edffm_params(first).nr_cpus)
472 + /* first is migrating */
473 + return 1;
474 + else
475 + /* second is migrating */
476 + return 0;
477 + }
478 +
479 + /* both are fixed or not real time */
480 + return edf_higher_prio(first, second);
481 +}
482 +
483 +int edffm_ready_order(struct bheap_node* a, struct bheap_node* b)
484 +{
485 + return edffm_higher_prio(bheap2task(a), bheap2task(b));
486 +}
487 +
488 +/* need_to_preempt - check whether the task t needs to be preempted
489 + * call only with irqs disabled and with ready_lock acquired
490 + */
491 +int edffm_preemption_needed(rt_domain_t* rt, struct task_struct *t)
492 +{
493 + /* we need the read lock for edf_ready_queue */
494 + /* no need to preempt if there is nothing pending */
495 + if (!__jobs_pending(rt))
496 + return 0;
497 + /* we need to reschedule if t doesn't exist */
498 + if (!t)
499 + return 1;
500 +
501 + /* make sure to get non-rt stuff out of the way */
502 + return !is_realtime(t) || edffm_higher_prio(__next_ready(rt), t);
503 +}
504 +
505 +/* we assume the lock is being held */
506 +static void preempt(edffm_domain_t *edffm)
507 +{
508 + preempt_if_preemptable(edffm->scheduled, edffm->cpu);
509 +}
510 +
511 +static void edffm_release_jobs(rt_domain_t* rt, struct bheap* tasks)
512 +{
513 + unsigned long flags;
514 + edffm_domain_t *edffm = container_of(rt, edffm_domain_t, domain);
515 +
516 + raw_spin_lock_irqsave(&edffm->slock, flags);
517 +
518 + __merge_ready(rt, tasks);
519 +
520 + if (edffm_preemption_needed(rt, edffm->scheduled))
521 + preempt(edffm);
522 +
523 + raw_spin_unlock_irqrestore(&edffm->slock, flags);
524 +}
525 +
526 +/* EDF-fm uses the "release_master" field to force the next release for
527 + * the task 'task' to happen on a remote CPU. The remote cpu for task is
528 + * previously set up during job_completion() taking into consideration
529 + * whether a task is a migratory task or not.
530 + */
531 +static inline void
532 +edffm_add_release_remote(struct task_struct *task)
533 +{
534 + unsigned long flags;
535 + rt_domain_t *rt = task_edf(task);
536 +
537 + raw_spin_lock_irqsave(&rt->tobe_lock, flags);
538 +
539 + /* "modify" destination cpu */
540 + rt->release_master = get_partition(task);
541 +
542 + TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
543 + smp_processor_id(), task_cpu(task), rt->release_master);
544 +
545 + /* trigger future release */
546 + __add_release(rt, task);
547 +
548 + /* reset proper release_master and unlock */
549 + rt->release_master = NO_CPU;
550 + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
551 +}
552 +
553 +/* Perform double ready_queue locking in a fixed order (by domain address)
554 + * to avoid deadlock; called with interrupts disabled and rq->lock held (from
555 + * schedule())
556 + */
557 +static noinline void double_domain_lock(edffm_domain_t *dom1, edffm_domain_t *dom2)
558 +{
559 + if (dom1 == dom2) {
560 + /* fake */
561 + raw_spin_lock(&dom1->slock);
562 + } else {
563 + if (dom1 < dom2) {
564 + raw_spin_lock(&dom1->slock);
565 + raw_spin_lock(&dom2->slock);
566 + TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
567 + } else {
568 + raw_spin_lock(&dom2->slock);
569 + raw_spin_lock(&dom1->slock);
570 + TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
571 + }
572 + }
573 +}
574 +
575 +/* Directly insert a task in a remote ready queue. This function
576 + * should only be called if this task is a migrating task and its
577 + * last job for this CPU just completed (a new one is released for
578 + * a remote CPU), but the new job is already tardy.
579 + */
580 +static noinline void insert_task_in_remote_ready(struct task_struct *task)
581 +{
582 + edffm_domain_t *this = remote_edffm(task_cpu(task));
583 + edffm_domain_t *remote = remote_edffm(get_partition(task));
584 +
585 + BUG_ON(get_partition(task) != remote->cpu);
586 +
587 + TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
588 + this->cpu, remote->cpu);
589 + TRACE_TASK(task, "Inserting in remote ready queue\n");
590 +
591 + WARN_ON(!irqs_disabled());
592 +
593 + raw_spin_unlock(&this->slock);
594 + mb();
595 + TRACE_TASK(task,"edffm_lock %d released\n", this->cpu);
596 +
597 + /* lock both ready queues */
598 + double_domain_lock(this, remote);
599 + mb();
600 +
601 + __add_ready(&remote->domain, task);
602 +
603 + /* release remote but keep ours */
604 + raw_spin_unlock(&remote->slock);
605 + TRACE_TASK(task,"edffm_lock %d released\n", remote->cpu);
606 +
607 + /* ask remote cpu to reschedule, we are already rescheduling on this */
608 + preempt(remote);
609 +}
610 +
611 +static void requeue(struct task_struct* t, rt_domain_t *edf)
612 +{
613 + if (t->state != TASK_RUNNING)
614 + TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
615 +
616 + set_rt_flags(t, RT_F_RUNNING);
617 + if (is_released(t, litmus_clock())) {
618 + if (wrong_cpu(t)) {
619 + /* this should only happen if t just completed, but
620 + * its next release is already tardy, so it should be
621 + * migrated and inserted in the remote ready queue
622 + */
623 + TRACE_TASK(t, "Migrating task already released, "
624 + "move from P%d to P%d\n",
625 + task_cpu(t), get_partition(t));
626 +
627 + insert_task_in_remote_ready(t);
628 + } else {
629 + /* not a migratory task, or the job is on the right CPU */
630 + __add_ready(edf, t);
631 + }
632 + } else {
633 + if (wrong_cpu(t)) {
634 +
635 + TRACE_TASK(t, "Migrating task, adding remote release\n");
636 + edffm_add_release_remote(t);
637 + } else {
638 + TRACE_TASK(t, "Adding local release\n");
639 + add_release(edf, t);
640 + }
641 + }
642 +}
643 +
644 +/* Update statistics for the _current_ job.
645 + * - job_no was incremented _before_ starting this job
646 + * (release_at / prepare_for_next_period)
647 + * - cpu_job_no is incremented when the job completes
648 + */
649 +static void update_job_counter(struct task_struct *t)
650 +{
651 + int cpu_pos;
652 +
653 + /* Which CPU counter should be incremented? */
654 + cpu_pos = edffm_cpu_pos(t->rt_param.task_params.cpu, t);
655 + t->rt_param.semi_part.cpu_job_no[cpu_pos]++;
656 +
657 + TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
658 + t->rt_param.job_params.job_no, cpu_pos, cur_cpu_job_no(t),
659 + t->rt_param.task_params.cpu);
660 +}
661 +
662 +/* What is the next cpu for this job? (eq. 8, in EDF-Fm paper) */
663 +static int next_cpu_for_job(struct task_struct *t)
664 +{
665 + BUG_ON(!is_migrat_task(t));
666 +
667 + TRACE_TASK(t, "%u = %u * %u / %u\n",
668 + t->rt_param.job_params.job_no, cur_cpu_job_no(t),
669 + cur_cpu_fract_den(t), cur_cpu_fract_num(t));
670 + if ((t->rt_param.job_params.job_no) ==
671 + (((lt_t) cur_cpu_job_no(t) * cur_cpu_fract_den(t)) /
672 + cur_cpu_fract_num(t)))
673 + return edffm_params(t).cpus[0];
674 +
675 + return edffm_params(t).cpus[1];
676 +}
677 +
678 +/* If needed (the share for task t on this CPU is exhausted), updates
679 + * the task_params.cpu for the _migrating_ task t
680 + */
681 +static void change_migrat_cpu_if_needed(struct task_struct *t)
682 +{
683 + BUG_ON(!is_migrat_task(t));
684 + /* EDF-fm: if it is a migrating task and it has already executed
685 + * the required number of jobs on this CPU, we need to move it
686 + * on its next CPU; changing the cpu here will affect the requeue
687 + * and the next release
688 + */
689 + if (unlikely(next_cpu_for_job(t) != migrat_cur_cpu(t))) {
690 +
691 + tsk_rt(t)->task_params.cpu = migrat_next_cpu(t);
692 + TRACE_TASK(t, "EDF-fm: will migrate job %d -> %d\n",
693 + task_cpu(t), tsk_rt(t)->task_params.cpu);
694 + return;
695 + }
696 +
697 + TRACE_TASK(t, "EDF-fm: job will stay on %d -> %d\n",
698 + task_cpu(t), tsk_rt(t)->task_params.cpu);
699 +}
700 +
701 +static void job_completion(struct task_struct* t, int forced)
702 +{
703 + sched_trace_task_completion(t,forced);
704 + TRACE_TASK(t, "job_completion().\n");
705 +
706 + if (unlikely(is_migrat_task(t))) {
707 + update_job_counter(t);
708 + change_migrat_cpu_if_needed(t);
709 + }
710 +
711 + set_rt_flags(t, RT_F_SLEEP);
712 + prepare_for_next_period(t);
713 +}
714 +
715 +static void edffm_tick(struct task_struct *t)
716 +{
717 + edffm_domain_t *edffm = local_edffm;
718 +
719 + BUG_ON(is_realtime(t) && t != edffm->scheduled);
720 +
721 + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
722 + set_tsk_need_resched(t);
723 + TRACE("edffm_scheduler_tick: "
724 + "%d is preemptable "
725 + " => FORCE_RESCHED\n", t->pid);
726 + }
727 +}
728 +
729 +static struct task_struct* edffm_schedule(struct task_struct * prev)
730 +{
731 + edffm_domain_t* edffm = local_edffm;
732 + rt_domain_t* edf = &edffm->domain;
733 + struct task_struct* next;
734 +
735 + int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;
736 +
737 + raw_spin_lock(&edffm->slock);
738 +
739 + BUG_ON(edffm->scheduled && edffm->scheduled != prev);
740 + BUG_ON(edffm->scheduled && !is_realtime(prev));
741 +
742 + /* (0) Determine state */
743 + exists = edffm->scheduled != NULL;
744 + blocks = exists && !is_running(edffm->scheduled);
745 + out_of_time = exists &&
746 + budget_enforced(edffm->scheduled) &&
747 + budget_exhausted(edffm->scheduled);
748 + sleep = exists && get_rt_flags(edffm->scheduled) == RT_F_SLEEP;
749 + change_cpu = exists && wrong_cpu(edffm->scheduled);
750 + preempt = edffm_preemption_needed(edf, prev);
751 +
752 + BUG_ON(blocks && change_cpu);
753 +
754 + if (exists)
755 + TRACE_TASK(prev,
756 + "blocks:%d out_of_time:%d sleep:%d preempt:%d "
757 + "wrong_cpu:%d state:%d sig:%d\n",
758 + blocks, out_of_time, sleep, preempt,
759 + change_cpu, prev->state, signal_pending(prev));
760 +
761 + /* If we need to preempt do so. */
762 + resched = preempt;
763 +
764 + /* If a task blocks we have no choice but to reschedule. */
765 + if (blocks)
766 + resched = 1;
767 +
768 + /* If a task has just woken up, was tardy, and the wake-up raced with
769 + * this invocation of schedule(), then a new job has already been
770 + * released; the previously scheduled task must be enqueued on a remote
771 + * ready queue, and a new task must be selected for the current queue.
772 + */
773 + if (change_cpu)
774 + resched = 1;
775 +
776 + /* Any task that is preemptable and either exhausts its execution
777 + * budget or wants to sleep completes. We may have to reschedule after
778 + * this.
779 + */
780 + if ((out_of_time || sleep) && !blocks) {
781 + job_completion(edffm->scheduled, !sleep);
782 + resched = 1;
783 + }
784 +
785 + /* The final scheduling decision. Do we need to switch for some reason?
786 + * Switch if we are in RT mode and have no task or if we need to
787 + * resched.
788 + */
789 + next = NULL;
790 + if (resched || !exists) {
791 +
792 + if (edffm->scheduled && !blocks)
793 + requeue(edffm->scheduled, edf);
794 + next = __take_ready(edf);
795 + } else
796 + /* Only override Linux scheduler if we have a real-time task
797 + * scheduled that needs to continue.
798 + */
799 + if (exists)
800 + next = prev;
801 +
802 + if (next) {
803 + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
804 + set_rt_flags(next, RT_F_RUNNING);
805 + } else {
806 + TRACE("becoming idle at %llu\n", litmus_clock());
807 + }
808 +
809 + edffm->scheduled = next;
810 + raw_spin_unlock(&edffm->slock);
811 +
812 + return next;
813 +}
814 +
815 +/* Prepare a task for running in RT mode
816 + */
817 +static void edffm_task_new(struct task_struct * t, int on_rq, int running)
818 +{
819 + rt_domain_t* edf = task_edf(t);
820 + edffm_domain_t* edffm = task_edffm(t);
821 + unsigned long flags;
822 +
823 + TRACE_TASK(t, "EDF-fm: task new, cpu = %d\n",
824 + t->rt_param.task_params.cpu);
825 +
826 + release_at(t, litmus_clock());
827 + update_job_counter(t);
828 +
829 + /* The task should be running in the queue, otherwise signal
830 + * code will try to wake it up with fatal consequences.
831 + */
832 + raw_spin_lock_irqsave(&edffm->slock, flags);
833 + if (running) {
834 + /* there shouldn't be anything else running at the time */
835 + BUG_ON(edffm->scheduled);
836 + edffm->scheduled = t;
837 + } else {
838 + requeue(t, edf);
839 + /* maybe we have to reschedule */
840 + preempt(edffm);
841 + }
842 + raw_spin_unlock_irqrestore(&edffm->slock, flags);
843 +}
844 +
845 +static void edffm_task_wake_up(struct task_struct *task)
846 +{
847 + unsigned long flags;
848 + edffm_domain_t* edffm = task_edffm(task);
849 + rt_domain_t* edf = task_edf(task);
850 + lt_t now;
851 +
852 + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
853 +
854 + TRACE_TASK(task, "acquire edffm %d\n", edffm->cpu);
855 + raw_spin_lock_irqsave(&edffm->slock, flags);
856 +
857 + BUG_ON(edffm != task_edffm(task));
858 + BUG_ON(is_queued(task));
859 +
860 + now = litmus_clock();
861 + if (is_tardy(task, now)) {
862 + if (unlikely(is_migrat_task(task))) {
863 + /* a new job will be released.
864 + * Update current job counter */
865 + update_job_counter(task);
866 + /* Switch CPU if needed */
867 + change_migrat_cpu_if_needed(task);
868 + }
869 + /* new sporadic release */
870 + TRACE_TASK(task, "release new\n");
871 + release_at(task, now);
872 + sched_trace_task_release(task);
873 + }
874 +
875 + /* Only add to ready queue if it is not the currently-scheduled
876 + * task. This could be the case if a task was woken up concurrently
877 + * on a remote CPU before the executing CPU got around to actually
878 + * de-scheduling the task, i.e., wake_up() raced with schedule()
879 + * and won.
880 + */
881 + if (edffm->scheduled != task)
882 + requeue(task, edf);
883 +
884 + raw_spin_unlock_irqrestore(&edffm->slock, flags);
885 + TRACE_TASK(task, "release edffm %d\n", edffm->cpu);
886 + TRACE_TASK(task, "wake up done\n");
887 +}
888 +
889 +static void edffm_task_block(struct task_struct *t)
890 +{
891 + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
892 +
893 + BUG_ON(!is_realtime(t));
894 + if (is_queued(t)) {
895 + edffm_domain_t *edffm = local_edffm;
896 + TRACE_TASK(t, "task blocked, race with wakeup, "
897 + "remove from queue %d\n", edffm->cpu);
898 + remove(&edffm->domain, t);
899 + }
900 +}
901 +
902 +static void edffm_task_exit(struct task_struct * t)
903 +{
904 + unsigned long flags;
905 + edffm_domain_t* edffm = task_edffm(t);
906 + rt_domain_t* edf;
907 +
908 + raw_spin_lock_irqsave(&edffm->slock, flags);
909 + if (is_queued(t)) {
910 + /* dequeue */
911 + edf = task_edf(t);
912 + remove(edf, t);
913 + }
914 + if (edffm->scheduled == t)
915 + edffm->scheduled = NULL;
916 +
917 + TRACE_TASK(t, "RIP\n");
918 +
919 + preempt(edffm);
920 + raw_spin_unlock_irqrestore(&edffm->slock, flags);
921 +}
922 +
923 +static long edffm_admit_task(struct task_struct* tsk)
924 +{
925 + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
926 +}
927 +
928 +/* Plugin object */
929 +static struct sched_plugin edffm_plugin __cacheline_aligned_in_smp = {
930 + .plugin_name = "EDF-fm",
931 + .tick = edffm_tick,
932 + .task_new = edffm_task_new,
933 + .complete_job = complete_job,
934 + .task_exit = edffm_task_exit,
935 + .schedule = edffm_schedule,
936 + .task_wake_up = edffm_task_wake_up,
937 + .task_block = edffm_task_block,
938 + .admit_task = edffm_admit_task
939 +};
940 +
941 +static int __init init_edffm(void)
942 +{
943 + int i;
944 + edffm_domain_t *edffm;
945 +
946 + /* Note, broken if num_online_cpus() may change */
947 + for (i = 0; i < num_online_cpus(); i++) {
948 + edffm = remote_edffm(i);
949 + edffm->cpu = i;
950 + edffm->scheduled = NULL;
951 + rt_domain_init(&edffm->domain, edffm_ready_order, NULL,
952 + edffm_release_jobs);
953 + }
954 +
955 + return register_sched_plugin(&edffm_plugin);
956 +}
957 +
958 +module_init(init_edffm);
959 +
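A hedged sketch (not part of the patch, values invented) of how user space might describe a migratory EDF-fm task whose jobs are split 2/3 on CPU 0 and 1/3 on CPU 1, consistent with the edffm_params layout in rt_param.h and the cur_cpu_fract_* accessors above; the submission path (e.g. liblitmus) is not shown here:

    /* Assumes the <litmus/rt_param.h> definitions from this patch;
     * exec_cost, period, etc. are filled in elsewhere. */
    static void fill_fm_params(struct rt_task *params)
    {
        params->cpu = 0;  /* assumption: the task starts on cpus[0] */

        params->semi_part.fm.nr_cpus = NR_CPUS_EDF_FM - 1; /* non-zero => migratory */
        params->semi_part.fm.cpus[0] = 0;
        params->semi_part.fm.cpus[1] = 1;
        /* row 0 = numerators, row 1 = denominators, one column per CPU */
        params->semi_part.fm.fraction[0][0] = 2;  /* CPU 0: 2/3 of the jobs */
        params->semi_part.fm.fraction[1][0] = 3;
        params->semi_part.fm.fraction[0][1] = 1;  /* CPU 1: 1/3 of the jobs */
        params->semi_part.fm.fraction[1][1] = 3;
    }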
960 diff --git a/litmus/sched_edf_os.c b/litmus/sched_edf_os.c
961 new file mode 100644
962 index 000000000000..e021d22b5129
963 --- /dev/null
964 +++ b/litmus/sched_edf_os.c
965 @@ -0,0 +1,660 @@
966 +/*
967 + * litmus/sched_edf_os.c
968 + *
969 + * Implementation of the EDF-os scheduling algorithm.
970 + */
971 +
972 +#include <linux/percpu.h>
973 +#include <linux/sched.h>
974 +#include <linux/list.h>
975 +#include <linux/spinlock.h>
976 +
977 +#include <linux/module.h>
978 +
979 +#include <litmus/litmus.h>
980 +#include <litmus/jobs.h>
981 +#include <litmus/sched_plugin.h>
982 +#include <litmus/edf_common.h>
983 +
984 +typedef struct {
985 + rt_domain_t domain;
986 + int cpu;
987 + struct task_struct* scheduled; /* only RT tasks */
988 +/* domain lock */
989 +#define slock domain.ready_lock
990 +} edfos_domain_t;
991 +
992 +DEFINE_PER_CPU(edfos_domain_t, edfos_domains);
993 +
994 +#define local_edfos (&__get_cpu_var(edfos_domains))
995 +#define remote_edf(cpu) (&per_cpu(edfos_domains, cpu).domain)
996 +#define remote_edfos(cpu) (&per_cpu(edfos_domains, cpu))
997 +#define task_edf(task) remote_edf(get_partition(task))
998 +#define task_edfos(task) remote_edfos(get_partition(task))
999 +
1000 +#define edfos_params(t) (t->rt_param.task_params.semi_part.os)
1001 +
1002 +/* Is the task a migratory task? */
1003 +#define is_migrat_task(task) (edfos_params(task).migrat)
1004 +/* t is on the wrong CPU (it should be requeued properly) */
1005 +#define wrong_cpu(t) is_migrat_task((t)) \
1006 + && task_cpu((t)) != get_partition((t))
1007 +/* Manipulate share for current cpu */
1008 +#define cur_cpu_fract_num(t) edfos_params(t).fraction[get_partition(t)][0]
1009 +#define cur_cpu_fract_den(t) edfos_params(t).fraction[get_partition(t)][1]
1010 +/* Get job number for current cpu */
1011 +#define cur_cpu_job_no(t) \
1012 + tsk_rt(t)->semi_part.cpu_job_no[get_partition(t)]
1013 +
1014 +/*
1015 + * EDF-os: migratory tasks have higher prio than fixed, EDF in both classes.
1016 + * (Both first and second may be NULL).
1017 + */
1018 +int edfos_higher_prio(struct task_struct* first, struct task_struct* second)
1019 +{
1020 + if ((first && edfos_params(first).migrat) ||
1021 + (second && edfos_params(second).migrat)) {
1022 + if ((first && edfos_params(first).migrat) &&
1023 + (second && edfos_params(second).migrat))
1024 + {
1025 + /* both are migrating */
1026 + if (edfos_params(first).first_cpu <
1027 + edfos_params(second).first_cpu)
1028 + return 1;
1029 + else
1030 + return 0;
1031 + }
1032 +
1033 + if (first && edfos_params(first).migrat)
1034 + /* first is migrating */
1035 + return 1;
1036 + else
1037 + /* second is migrating */
1038 + return 0;
1039 + }
1040 +
1041 + /* both are fixed or not real time */
1042 + return edf_higher_prio(first, second);
1043 +}
1044 +
1045 +int edfos_ready_order(struct bheap_node* a, struct bheap_node* b)
1046 +{
1047 + return edfos_higher_prio(bheap2task(a), bheap2task(b));
1048 +}
1049 +
1050 +static int fakepfair_ready_order(struct bheap_node* a, struct bheap_node* b)
1051 +{
1052 + return *((int*)a->value) < *((int*)b->value);
1053 +}
1054 +
1055 +/* need_to_preempt - check whether the task t needs to be preempted
1056 + * call only with irqs disabled and with ready_lock acquired
1057 + */
1058 +int edfos_preemption_needed(rt_domain_t* rt, struct task_struct *t)
1059 +{
1060 + /* we need the read lock for edf_ready_queue */
1061 + /* no need to preempt if there is nothing pending */
1062 + if (!__jobs_pending(rt))
1063 + return 0;
1064 + /* we need to reschedule if t doesn't exist */
1065 + if (!t)
1066 + return 1;
1067 +
1068 + /* make sure to get non-rt stuff out of the way */
1069 + return !is_realtime(t) || edfos_higher_prio(__next_ready(rt), t);
1070 +}
1071 +
1072 +/* we assume the lock is being held */
1073 +static void preempt(edfos_domain_t *edfos)
1074 +{
1075 + preempt_if_preemptable(edfos->scheduled, edfos->cpu);
1076 +}
1077 +
1078 +static void edfos_release_jobs(rt_domain_t* rt, struct bheap* tasks)
1079 +{
1080 + unsigned long flags;
1081 + edfos_domain_t *edfos = container_of(rt, edfos_domain_t, domain);
1082 +
1083 + raw_spin_lock_irqsave(&edfos->slock, flags);
1084 +
1085 + __merge_ready(rt, tasks);
1086 +
1087 + if (edfos_preemption_needed(rt, edfos->scheduled))
1088 + preempt(edfos);
1089 +
1090 + raw_spin_unlock_irqrestore(&edfos->slock, flags);
1091 +}
1092 +
1093 +/* EDF-os uses the "release_master" field to force the next release for
1094 + * the task 'task' to happen on a remote CPU. The remote cpu for task is
1095 + * previously set up during job_completion() taking into consideration
1096 + * whether a task is a migratory task or not.
1097 + */
1098 +static inline void
1099 +edfos_add_release_remote(struct task_struct *task)
1100 +{
1101 + unsigned long flags;
1102 + rt_domain_t *rt = task_edf(task);
1103 +
1104 + raw_spin_lock_irqsave(&rt->tobe_lock, flags);
1105 +
1106 + /* "modify" destination cpu */
1107 + rt->release_master = get_partition(task);
1108 +
1109 + TRACE_TASK(task, "Add remote release: smp_proc_id = %d, cpu = %d, remote = %d\n",
1110 + smp_processor_id(), task_cpu(task), rt->release_master);
1111 +
1112 + /* trigger future release */
1113 + __add_release(rt, task);
1114 +
1115 + /* reset proper release_master and unlock */
1116 + rt->release_master = NO_CPU;
1117 + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
1118 +}
1119 +
1120 +/* Perform double ready_queue locking in a fixed order (by domain address)
1121 + * to avoid deadlock; called with interrupts disabled and rq->lock held (from
1122 + * schedule())
1123 + */
1124 +static noinline void double_domain_lock(edfos_domain_t *dom1, edfos_domain_t *dom2)
1125 +{
1126 + if (dom1 == dom2) {
1127 + /* fake */
1128 + raw_spin_lock(&dom1->slock);
1129 + } else {
1130 + if (dom1 < dom2) {
1131 + raw_spin_lock(&dom1->slock);
1132 + raw_spin_lock(&dom2->slock);
1133 + TRACE("acquired %d and %d\n", dom1->cpu, dom2->cpu);
1134 + } else {
1135 + raw_spin_lock(&dom2->slock);
1136 + raw_spin_lock(&dom1->slock);
1137 + TRACE("acquired %d and %d\n", dom2->cpu, dom1->cpu);
1138 + }
1139 + }
1140 +}
1141 +
1142 +/* Directly insert a task in a remote ready queue. This function
1143 + * should only be called if this task is a migrating task and its
1144 + * last job for this CPU just completed (a new one is released for
1145 + * a remote CPU), but the new job is already tardy.
1146 + */
1147 +static noinline void insert_task_in_remote_ready(struct task_struct *task)
1148 +{
1149 + edfos_domain_t *this = remote_edfos(task_cpu(task));
1150 + edfos_domain_t *remote = remote_edfos(get_partition(task));
1151 +
1152 + BUG_ON(get_partition(task) != remote->cpu);
1153 +
1154 + TRACE_TASK(task, "Migrate From P%d -> To P%d\n",
1155 + this->cpu, remote->cpu);
1156 + TRACE_TASK(task, "Inserting in remote ready queue\n");
1157 +
1158 + WARN_ON(!irqs_disabled());
1159 +
1160 + raw_spin_unlock(&this->slock);
1161 + mb();
1162 + TRACE_TASK(task,"edfos_lock %d released\n", this->cpu);
1163 +
1164 + /* lock both ready queues */
1165 + double_domain_lock(this, remote);
1166 + mb();
1167 +
1168 + __add_ready(&remote->domain, task);
1169 +
1170 + /* release remote but keep ours */
1171 + raw_spin_unlock(&remote->slock);
1172 + TRACE_TASK(task,"edfos_lock %d released\n", remote->cpu);
1173 +
1174 + /* ask remote cpu to reschedule, we are already rescheduling on this */
1175 + preempt(remote);
1176 +}
1177 +
1178 +static void requeue(struct task_struct* t, rt_domain_t *edf)
1179 +{
1180 + if (t->state != TASK_RUNNING)
1181 + TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
1182 +
1183 + set_rt_flags(t, RT_F_RUNNING);
1184 + if (is_released(t, litmus_clock())) {
1185 + if (wrong_cpu(t)) {
1186 + /* this should only happen if t just completed, but
1187 + * its next release is already tardy, so it should be
1188 + * migrated and inserted in the remote ready queue
1189 + */
1190 + TRACE_TASK(t, "Migrating task already released, "
1191 + "move from P%d to P%d\n",
1192 + task_cpu(t), get_partition(t));
1193 +
1194 + insert_task_in_remote_ready(t);
1195 + } else {
1196 + /* not a migratory task, or the job is on the right CPU */
1197 + __add_ready(edf, t);
1198 + }
1199 + } else {
1200 + if (wrong_cpu(t)) {
1201 +
1202 + TRACE_TASK(t, "Migrating task, adding remote release\n");
1203 + edfos_add_release_remote(t);
1204 + } else {
1205 + TRACE_TASK(t, "Adding local release\n");
1206 + add_release(edf, t);
1207 + }
1208 + }
1209 +}
1210 +
1211 +/* Update statistics for the _current_ job.
1212 + * - job_no was incremented _before_ starting this job
1213 + * (release_at / prepare_for_next_period)
1214 + * - cpu_job_no is incremented when the job completes
1215 + */
1216 +static void update_job_counter(struct task_struct *t)
1217 +{
1218 + t->rt_param.semi_part.cpu_job_no[get_partition(t)]++;
1219 +
1220 + TRACE_TASK(t, "job_no = %d, cpu_job_no(pos %d) = %d, cpu %d\n",
1221 + t->rt_param.job_params.job_no, get_partition(t),
1222 + cur_cpu_job_no(t), t->rt_param.task_params.cpu);
1223 +}
1224 +
1225 +
1226 +static int compute_pfair_deadline(lt_t wt_num, lt_t wt_den,
1227 + unsigned int job_no)
1228 +{
1229 + lt_t num;
1230 + num = job_no * wt_den;
1231 + if (do_div(num, wt_num))
1232 + num++;
1233 + return (int)num;
1234 +}
1235 +
1236 +static int compute_pfair_release(lt_t wt_num, lt_t wt_den,
1237 + unsigned int job_no)
1238 +{
1239 + lt_t num;
1240 + num = (job_no - 1) * wt_den;
1241 + do_div(num, wt_num);
1242 + return (int)num;
1243 +}
1244 +
1245 +static int next_cpu_for_job(struct task_struct *t)
1246 +{
1247 + unsigned int cpu;
1248 + lt_t next_rel;
1249 + struct bheap_node* node;
1250 + BUG_ON(!is_migrat_task(t));
1251 +
1252 + /* Process any new subtask releases. */
1253 + node = bheap_peek(fakepfair_ready_order,
1254 + &edfos_params(t).release_queue);
1255 + while (node && *((int*)node->value) <= tsk_rt(t)->job_params.job_no) {
1256 + node = bheap_take(fakepfair_ready_order,
1257 + &edfos_params(t).release_queue);
1258 + BUG_ON(!node);
1259 + cpu = ((int*)node->value) - edfos_params(t).heap_data;
1260 + *((int*)node->value) = compute_pfair_deadline(
1261 + edfos_params(t).fraction[cpu][0],
1262 + edfos_params(t).fraction[cpu][1],
1263 + tsk_rt(t)->semi_part.cpu_job_no[cpu] + 1);
1264 + bheap_insert(fakepfair_ready_order,
1265 + &edfos_params(t).ready_queue, node);
1266 + node = bheap_peek(fakepfair_ready_order,
1267 + &edfos_params(t).release_queue);
1268 + }
1269 +
1270 + /* Choose the next Pfair subtask. */
1271 + node = bheap_take(fakepfair_ready_order,
1272 + &edfos_params(t).ready_queue);
1273 + BUG_ON(!node);
1274 + cpu = ((int*)node->value) - edfos_params(t).heap_data;
1275 +
1276 + next_rel = compute_pfair_release(edfos_params(t).fraction[cpu][0],
1277 + edfos_params(t).fraction[cpu][1],
1278 + tsk_rt(t)->semi_part.cpu_job_no[cpu]
1279 + + 1);
1280 + if (next_rel <= tsk_rt(t)->job_params.job_no)
1281 + {
1282 + /* Next subtask already released. */
1283 + *((int*)node->value) = compute_pfair_deadline(
1284 + edfos_params(t).fraction[cpu][0],
1285 + edfos_params(t).fraction[cpu][1],
1286 + tsk_rt(t)->semi_part.cpu_job_no[cpu] +
1287 + 1);
1288 + bheap_insert(fakepfair_ready_order,
1289 + &edfos_params(t).ready_queue, node);
1290 + }
1291 + else
1292 + {
1293 + /* Next subtask not yet released. */
1294 + *((int*)node->value) = next_rel;
1295 + bheap_insert(fakepfair_ready_order,
1296 + &edfos_params(t).release_queue, node);
1297 + }
1298 +
1299 + TRACE_TASK(t, "%u = %u * %u / %u\n",
1300 + t->rt_param.job_params.job_no, cur_cpu_job_no(t),
1301 + cur_cpu_fract_den(t), cur_cpu_fract_num(t));
1302 + return cpu;
1303 +}
1304 +
1305 +/* If needed (the share for task t on this CPU is exhausted), updates
1306 + * the task_params.cpu for the _migrating_ task t
1307 + */
1308 +static void change_migrat_cpu_if_needed(struct task_struct *t)
1309 +{
1310 + int cpu;
1311 + BUG_ON(!is_migrat_task(t));
1312 + /* EDF-os: if it is a migrating task and it has already executed
1313 + * the required number of jobs on this CPU, we need to move it
1314 + * on its next CPU; changing the cpu here will affect the requeue
1315 + * and the next release
1316 + */
1317 + cpu = next_cpu_for_job(t);
1318 + if (unlikely(cpu != get_partition(t))) {
1319 + tsk_rt(t)->task_params.cpu = cpu;
1320 + TRACE_TASK(t, "EDF-os: will migrate job %d -> %d\n",
1321 + task_cpu(t), tsk_rt(t)->task_params.cpu);
1322 + return;
1323 + }
1324 +
1325 + TRACE_TASK(t, "EDF-os: job will stay on %d -> %d\n",
1326 + task_cpu(t), tsk_rt(t)->task_params.cpu);
1327 +}
1328 +
1329 +static void job_completion(struct task_struct* t, int forced)
1330 +{
1331 + sched_trace_task_completion(t,forced);
1332 + TRACE_TASK(t, "job_completion().\n");
1333 +
1334 + if (unlikely(is_migrat_task(t))) {
1335 + update_job_counter(t);
1336 + change_migrat_cpu_if_needed(t);
1337 + }
1338 +
1339 + set_rt_flags(t, RT_F_SLEEP);
1340 + prepare_for_next_period(t);
1341 +}
1342 +
1343 +static void edfos_tick(struct task_struct *t)
1344 +{
1345 + edfos_domain_t *edfos = local_edfos;
1346 +
1347 + BUG_ON(is_realtime(t) && t != edfos->scheduled);
1348 +
1349 + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
1350 + set_tsk_need_resched(t);
1351 + TRACE("edfos_scheduler_tick: "
1352 + "%d is preemptable "
1353 + " => FORCE_RESCHED\n", t->pid);
1354 + }
1355 +}
1356 +
1357 +static struct task_struct* edfos_schedule(struct task_struct * prev)
1358 +{
1359 + edfos_domain_t* edfos = local_edfos;
1360 + rt_domain_t* edf = &edfos->domain;
1361 + struct task_struct* next;
1362 +
1363 + int out_of_time, sleep, preempt, exists, blocks, change_cpu, resched;
1364 +
1365 + raw_spin_lock(&edfos->slock);
1366 +
1367 + BUG_ON(edfos->scheduled && edfos->scheduled != prev);
1368 + BUG_ON(edfos->scheduled && !is_realtime(prev));
1369 +
1370 + /* (0) Determine state */
1371 + exists = edfos->scheduled != NULL;
1372 + blocks = exists && !is_running(edfos->scheduled);
1373 + out_of_time = exists &&
1374 + budget_enforced(edfos->scheduled) &&
1375 + budget_exhausted(edfos->scheduled);
1376 + sleep = exists && get_rt_flags(edfos->scheduled) == RT_F_SLEEP;
1377 + change_cpu = exists && wrong_cpu(edfos->scheduled);
1378 + preempt = edfos_preemption_needed(edf, prev);
1379 +
1380 + BUG_ON(blocks && change_cpu);
1381 +
1382 + if (exists)
1383 + TRACE_TASK(prev,
1384 + "blocks:%d out_of_time:%d sleep:%d preempt:%d "
1385 + "wrong_cpu:%d state:%d sig:%d\n",
1386 + blocks, out_of_time, sleep, preempt,
1387 + change_cpu, prev->state, signal_pending(prev));
1388 +
1389 + /* If we need to preempt do so. */
1390 + resched = preempt;
1391 +
1392 + /* If a task blocks we have no choice but to reschedule. */
1393 + if (blocks)
1394 + resched = 1;
1395 +
1396 + /* If a task has just woken up, was tardy, and the wake-up raced with
1397 + * this invocation of schedule(), then a new job has already been
1398 + * released; the previously scheduled task must be enqueued on a remote
1399 + * ready queue, and a new task must be selected for the current queue.
1400 + */
1401 + if (change_cpu)
1402 + resched = 1;
1403 +
1404 + /* Any task that is preemptable and either exhausts its execution
1405 + * budget or wants to sleep completes. We may have to reschedule after
1406 + * this.
1407 + */
1408 + if ((out_of_time || sleep) && !blocks) {
1409 + job_completion(edfos->scheduled, !sleep);
1410 + resched = 1;
1411 + }
1412 +
1413 + /* The final scheduling decision. Do we need to switch for some reason?
1414 + * Switch if we are in RT mode and have no task or if we need to
1415 + * resched.
1416 + */
1417 + next = NULL;
1418 + if (resched || !exists) {
1419 +
1420 + if (edfos->scheduled && !blocks)
1421 + requeue(edfos->scheduled, edf);
1422 + next = __take_ready(edf);
1423 + } else
1424 + /* Only override Linux scheduler if we have a real-time task
1425 + * scheduled that needs to continue.
1426 + */
1427 + if (exists)
1428 + next = prev;
1429 +
1430 + if (next) {
1431 + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
1432 + set_rt_flags(next, RT_F_RUNNING);
1433 + } else {
1434 + TRACE("becoming idle at %llu\n", litmus_clock());
1435 + }
1436 +
1437 + edfos->scheduled = next;
1438 + raw_spin_unlock(&edfos->slock);
1439 +
1440 + return next;
1441 +}
1442 +
1443 +/* Prepare a task for running in RT mode
1444 + */
1445 +static void edfos_task_new(struct task_struct * t, int on_rq, int running)
1446 +{
1447 + rt_domain_t* edf = task_edf(t);
1448 + edfos_domain_t* edfos = task_edfos(t);
1449 + unsigned long flags;
1450 + unsigned int i;
1451 +
1452 + if (edfos_params(t).migrat) {
1453 + bheap_init(&edfos_params(t).release_queue);
1454 + bheap_init(&edfos_params(t).ready_queue);
1455 + for (i = 0; i < NR_CPUS_EDF_OS; i++) {
1456 + if (i == t->rt_param.task_params.cpu) {
1457 + /* Initial CPU - setup next release. */
1458 + edfos_params(t).heap_data[i] =
1459 + compute_pfair_release(
1460 + edfos_params(t).fraction[i][0],
1461 + edfos_params(t).fraction[i][1], 2);
1462 + bheap_add(fakepfair_ready_order,
1463 + &edfos_params(t).release_queue,
1464 + &edfos_params(t).heap_data[i],
1465 + GFP_ATOMIC);
1466 + }
1467 + else if (edfos_params(t).fraction[i][0] > 0) {
1468 + /* Non-initial CPU - already released, setup
1469 + * deadline.
1470 + */
1471 + edfos_params(t).heap_data[i] =
1472 + compute_pfair_deadline(
1473 + edfos_params(t).fraction[i][0],
1474 + edfos_params(t).fraction[i][1], 1);
1475 + bheap_add(fakepfair_ready_order,
1476 + &edfos_params(t).ready_queue,
1477 + &edfos_params(t).heap_data[i],
1478 + GFP_ATOMIC);
1479 + }
1480 + }
1481 + }
1482 +
1483 + TRACE_TASK(t, "EDF-os: task new, cpu = %d\n",
1484 + t->rt_param.task_params.cpu);
1485 +
1486 + release_at(t, litmus_clock());
1487 + update_job_counter(t);
1488 +
1489 + /* The task should be running in the queue, otherwise signal
1490 + * code will try to wake it up with fatal consequences.
1491 + */
1492 + raw_spin_lock_irqsave(&edfos->slock, flags);
1493 + if (running) {
1494 + /* there shouldn't be anything else running at the time */
1495 + BUG_ON(edfos->scheduled);
1496 + edfos->scheduled = t;
1497 + } else {
1498 + requeue(t, edf);
1499 + /* maybe we have to reschedule */
1500 + preempt(edfos);
1501 + }
1502 + raw_spin_unlock_irqrestore(&edfos->slock, flags);
1503 +}
1504 +
1505 +static void edfos_task_wake_up(struct task_struct *task)
1506 +{
1507 + unsigned long flags;
1508 + edfos_domain_t* edfos = task_edfos(task);
1509 + rt_domain_t* edf = task_edf(task);
1510 + lt_t now;
1511 +
1512 + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
1513 +
1514 + TRACE_TASK(task, "acquire edfos %d\n", edfos->cpu);
1515 + raw_spin_lock_irqsave(&edfos->slock, flags);
1516 +
1517 + BUG_ON(edfos != task_edfos(task));
1518 + BUG_ON(is_queued(task));
1519 +
1520 + now = litmus_clock();
1521 + if (is_tardy(task, now)) {
1522 + if (unlikely(is_migrat_task(task))) {
1523 + /* a new job will be released.
1524 + * Update current job counter */
1525 + update_job_counter(task);
1526 + /* Switch CPU if needed */
1527 + change_migrat_cpu_if_needed(task);
1528 + }
1529 + /* new sporadic release */
1530 + TRACE_TASK(task, "release new\n");
1531 + release_at(task, now);
1532 + sched_trace_task_release(task);
1533 + }
1534 +
1535 + /* Only add to ready queue if it is not the currently-scheduled
1536 + * task. This could be the case if a task was woken up concurrently
1537 + * on a remote CPU before the executing CPU got around to actually
1538 + * de-scheduling the task, i.e., wake_up() raced with schedule()
1539 + * and won.
1540 + */
1541 + if (edfos->scheduled != task)
1542 + requeue(task, edf);
1543 +
1544 + raw_spin_unlock_irqrestore(&edfos->slock, flags);
1545 + TRACE_TASK(task, "release edfos %d\n", edfos->cpu);
1546 + TRACE_TASK(task, "wake up done\n");
1547 +}
1548 +
1549 +static void edfos_task_block(struct task_struct *t)
1550 +{
1551 + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
1552 +
1553 + BUG_ON(!is_realtime(t));
1554 + if (is_queued(t)) {
1555 + edfos_domain_t *edfos = local_edfos;
1556 + TRACE_TASK(t, "task blocked, race with wakeup, "
1557 + "remove from queue %d\n", edfos->cpu);
1558 + remove(&edfos->domain, t);
1559 + }
1560 +}
1561 +
1562 +static void edfos_task_exit(struct task_struct * t)
1563 +{
1564 + unsigned long flags;
1565 + edfos_domain_t* edfos = task_edfos(t);
1566 + rt_domain_t* edf;
1567 +
1568 + raw_spin_lock_irqsave(&edfos->slock, flags);
1569 + if (is_queued(t)) {
1570 + /* dequeue */
1571 + edf = task_edf(t);
1572 + remove(edf, t);
1573 + }
1574 + if (edfos->scheduled == t)
1575 + edfos->scheduled = NULL;
1576 +
1577 + /* Deallocate heap nodes. */
1578 + while (bheap_take_del(fakepfair_ready_order,
1579 + &edfos_params(t).release_queue)) {}
1580 + while (bheap_take_del(fakepfair_ready_order,
1581 + &edfos_params(t).ready_queue)) {}
1582 +
1583 + TRACE_TASK(t, "RIP\n");
1584 +
1585 + preempt(edfos);
1586 + raw_spin_unlock_irqrestore(&edfos->slock, flags);
1587 +}
1588 +
1589 +static long edfos_admit_task(struct task_struct* tsk)
1590 +{
1591 + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
1592 +}
1593 +
1594 +/* Plugin object */
1595 +static struct sched_plugin edfos_plugin __cacheline_aligned_in_smp = {
1596 + .plugin_name = "EDF-os",
1597 + .tick = edfos_tick,
1598 + .task_new = edfos_task_new,
1599 + .complete_job = complete_job,
1600 + .task_exit = edfos_task_exit,
1601 + .schedule = edfos_schedule,
1602 + .task_wake_up = edfos_task_wake_up,
1603 + .task_block = edfos_task_block,
1604 + .admit_task = edfos_admit_task
1605 +};
1606 +
1607 +static int __init init_edfos(void)
1608 +{
1609 + int i;
1610 + edfos_domain_t *edfos;
1611 +
1612 + /* Note, broken if num_online_cpus() may change */
1613 + for (i = 0; i < num_online_cpus(); i++) {
1614 + edfos = remote_edfos(i);
1615 + edfos->cpu = i;
1616 + edfos->scheduled = NULL;
1617 + rt_domain_init(&edfos->domain, edfos_ready_order, NULL,
1618 + edfos_release_jobs);
1619 + }
1620 +
1621 + return register_sched_plugin(&edfos_plugin);
1622 +}
1623 +
1624 +module_init(init_edfos);
1625 +
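The helpers compute_pfair_release()/compute_pfair_deadline() above are the standard Pfair subtask windows: for a per-CPU share w = wt_num/wt_den, subtask j (counting from 1) is released at floor((j-1)/w) and due at ceil(j/w), measured in job numbers of the migrating task. A stand-alone restatement with a worked example (not part of the patch):

    #include <stdio.h>

    /* Same arithmetic as the kernel helpers, without do_div(). */
    static unsigned long long pfair_release(unsigned long long num,
                                            unsigned long long den,
                                            unsigned int j)
    {
        return (j - 1) * den / num;           /* floor((j-1)/w), w = num/den */
    }

    static unsigned long long pfair_deadline(unsigned long long num,
                                             unsigned long long den,
                                             unsigned int j)
    {
        return (j * den + num - 1) / num;     /* ceil(j/w) */
    }

    int main(void)
    {
        /* Share 3/5 on one CPU: subtask 1 -> [0, 2), 2 -> [1, 4), 3 -> [3, 5) */
        unsigned int j;
        for (j = 1; j <= 3; j++)
            printf("subtask %u: release %llu, deadline %llu\n",
                   j, pfair_release(3, 5, j), pfair_deadline(3, 5, j));
        return 0;
    }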
1626 diff --git a/litmus/sched_edf_wm.c b/litmus/sched_edf_wm.c
1627 new file mode 100644
1628 index 000000000000..8b7be32b40dd
1629 --- /dev/null
1630 +++ b/litmus/sched_edf_wm.c
1631 @@ -0,0 +1,688 @@
1632 +/* EDF-WM: based on PSN-EDF.
1633 + */
1634 +
1635 +#include <linux/percpu.h>
1636 +#include <linux/sched.h>
1637 +#include <linux/list.h>
1638 +#include <linux/spinlock.h>
1639 +
1640 +#include <linux/module.h>
1641 +
1642 +#include <litmus/litmus.h>
1643 +#include <litmus/jobs.h>
1644 +#include <litmus/sched_plugin.h>
1645 +#include <litmus/edf_common.h>
1646 +
1647 +typedef struct {
1648 + rt_domain_t domain;
1649 + int cpu;
1650 + struct task_struct* scheduled; /* only RT tasks */
1651 +
1652 +/*
1653 + * scheduling lock slock
1654 + * protects the domain and serializes scheduling decisions
1655 + */
1656 +#define slock domain.ready_lock
1657 +
1658 +} wm_domain_t;
1659 +
1660 +DEFINE_PER_CPU(wm_domain_t, wm_domains);
1661 +
1662 +#define TRACE_DOM(dom, fmt, args...) \
1663 + TRACE("(wm_domains[%d]) " fmt, (dom)->cpu, ##args)
1664 +
1665 +
1666 +#define local_domain (&__get_cpu_var(wm_domains))
1667 +#define remote_domain(cpu) (&per_cpu(wm_domains, cpu))
1668 +#define domain_of_task(task) (remote_domain(get_partition(task)))
1669 +
1670 +static int is_sliced_task(struct task_struct* t)
1671 +{
1672 + return tsk_rt(t)->task_params.semi_part.wm.count;
1673 +}
1674 +
1675 +static struct edf_wm_slice* get_last_slice(struct task_struct* t)
1676 +{
1677 + int idx = tsk_rt(t)->task_params.semi_part.wm.count - 1;
1678 + return tsk_rt(t)->task_params.semi_part.wm.slices + idx;
1679 +}
1680 +
1681 +static void compute_slice_params(struct task_struct* t)
1682 +{
1683 + struct rt_param* p = tsk_rt(t);
1684 + /* Here we do a little trick to make the generic EDF code
1685 + * play well with job slices. We overwrite the job-level
1686 + * release and deadline fields with the slice-specific values
1687 + * so that we can enqueue this task in an EDF rt_domain_t
1688 + * without issue. The actual values are cached in the semi_part.wm
1689 + * structure. */
1690 + p->job_params.deadline = p->semi_part.wm.job_release +
1691 + p->semi_part.wm.slice->deadline;
1692 + p->job_params.release = p->semi_part.wm.job_release +
1693 + p->semi_part.wm.slice->offset;
1694 +
1695 + /* Similarly, we play a trick on the cpu field. */
1696 + p->task_params.cpu = p->semi_part.wm.slice->cpu;
1697 +
1698 + /* update the per-slice budget reference */
1699 + p->semi_part.wm.exec_time = p->job_params.exec_time;
1700 +}
1701 +
1702 +static void complete_sliced_job(struct task_struct* t)
1703 +{
1704 + struct rt_param* p = tsk_rt(t);
1705 +
1706 + /* We need to undo our trickery to the
1707 + * job parameters (see above). */
1708 + p->job_params.release = p->semi_part.wm.job_release;
1709 + p->job_params.deadline = p->semi_part.wm.job_deadline;
1710 +
1711 + /* Ok, now let generic code do the actual work. */
1712 + prepare_for_next_period(t);
1713 +
1714 + /* And finally cache the updated parameters. */
1715 + p->semi_part.wm.job_release = p->job_params.release;
1716 + p->semi_part.wm.job_deadline = p->job_params.deadline;
1717 +}
1718 +
1719 +static lt_t slice_exec_time(struct task_struct* t)
1720 +{
1721 + struct rt_param* p = tsk_rt(t);
1722 +
1723 + /* Compute how much execution time has been consumed
1724 + * since last slice advancement. */
1725 + return p->job_params.exec_time - p->semi_part.wm.exec_time;
1726 +}
1727 +
1728 +static lt_t slice_budget(struct task_struct* t)
1729 +{
1730 + return tsk_rt(t)->semi_part.wm.slice->budget;
1731 +}
1732 +
1733 +static int slice_budget_exhausted(struct task_struct* t)
1734 +{
1735 + return slice_exec_time(t) >= slice_budget(t);
1736 +}
1737 +
1738 +/* assumes positive remainder; overflows otherwise */
1739 +static lt_t slice_budget_remaining(struct task_struct* t)
1740 +{
1741 + return slice_budget(t) - slice_exec_time(t);
1742 +}
1743 +
1744 +static int wm_budget_exhausted(struct task_struct* t)
1745 +{
1746 + if (is_sliced_task(t))
1747 + return slice_budget_exhausted(t);
1748 + else
1749 + return budget_exhausted(t);
1750 +}
1751 +
1752 +static void advance_next_slice(struct task_struct* t, int completion_signaled)
1753 +{
1754 + int idx;
1755 + struct rt_param* p = tsk_rt(t);
1756 +
1757 + /* make sure this is actually a sliced job */
1758 + BUG_ON(!is_sliced_task(t));
1759 + BUG_ON(is_queued(t));
1760 +
1761 + /* determine index of current slice */
1762 + idx = p->semi_part.wm.slice -
1763 + p->task_params.semi_part.wm.slices;
1764 +
1765 + TRACE_TASK(t, "advancing slice %d; excess=%lluns; "
1766 + "completion_signaled=%d.\n",
1767 + idx, slice_exec_time(t) - slice_budget(t),
1768 + completion_signaled);
1769 +
1770 + if (completion_signaled)
1771 + idx = 0;
1772 + else
1773 + /* increment and wrap around, if necessary */
1774 + idx = (idx + 1) % p->task_params.semi_part.wm.count;
1775 +
1776 + /* point to next slice */
1777 + p->semi_part.wm.slice =
1778 + p->task_params.semi_part.wm.slices + idx;
1779 +
1780 + /* Check if we need to update essential job parameters. */
1781 + if (!idx) {
1782 + /* job completion */
1783 + sched_trace_task_completion(t, !completion_signaled);
1784 + TRACE_TASK(t, "completed sliced job "
1785 + "(signaled:%d)\n", completion_signaled);
1786 + complete_sliced_job(t);
1787 + }
1788 +
1789 + /* Update job parameters for new slice. */
1790 + compute_slice_params(t);
1791 +}
1792 +
1793 +/* assumes time_passed does not advance past the last slice */
1794 +static void fast_forward_slices(struct task_struct* t, lt_t time_passed)
1795 +{
1796 + TRACE_TASK(t, "fast forwarding %lluns\n", time_passed);
1797 +
1798 + /* this is NOT the slice version */
1799 + BUG_ON(budget_remaining(t) <= time_passed);
1800 +
1801 + if (wm_budget_exhausted(t)) {
1802 + /* This can happen if a suspension raced
1803 + * with a normal slice advancement. wm_schedule()
1804 + * does not process out_of_time when a task blocks. */
1805 + TRACE_TASK(t, "block raced with out_of_time?\n");
1806 + advance_next_slice(t, 0);
1807 + }
1808 +
1809 + while (time_passed &&
1810 + time_passed >= slice_budget_remaining(t)) {
1811 + /* slice completely exhausted */
1812 + time_passed -= slice_budget_remaining(t);
1813 + tsk_rt(t)->job_params.exec_time +=
1814 + slice_budget_remaining(t);
1815 +
1816 + BUG_ON(!slice_budget_exhausted(t));
1817 + BUG_ON(slice_budget_remaining(t) != 0);
1818 + BUG_ON(tsk_rt(t)->semi_part.wm.slice == get_last_slice(t));
1819 +
1820 + advance_next_slice(t, 0);
1821 + }
1822 + /* add remainder to exec cost */
1823 + tsk_rt(t)->job_params.exec_time += time_passed;
1824 +}
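+/* Worked example (hypothetical numbers, for illustration only):
+ * a task sleeps for 7 time units while its current slice still has
+ * 3 units of budget left and the following slice has a budget of 10.
+ * fast_forward_slices() charges the 3 remaining units, advances to
+ * the next slice, and then adds the leftover 4 units there, so after
+ * the wake-up the task is accounted exactly as if it had executed
+ * during its sleep time (see wm_task_wake_up() below).
+ */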
1825 +
1826 +/* we assume the lock is being held */
1827 +static void preempt(wm_domain_t *dom)
1828 +{
1829 + TRACE_DOM(dom, "will be preempted.\n");
1830 + /* We pass NULL as the task since non-preemptive sections are not
1831 + * supported in this plugin, so per-task checks are not needed. */
1832 + preempt_if_preemptable(NULL, dom->cpu);
1833 +}
1834 +
1835 +static void wm_domain_init(wm_domain_t* dom,
1836 + check_resched_needed_t check,
1837 + release_jobs_t release,
1838 + int cpu)
1839 +{
1840 + edf_domain_init(&dom->domain, check, release);
1841 + dom->cpu = cpu;
1842 + dom->scheduled = NULL;
1843 +}
1844 +
1845 +static void wm_requeue_remote(struct task_struct *t)
1846 +{
1847 + wm_domain_t *dom = domain_of_task(t);
1848 +
1849 + set_rt_flags(t, RT_F_RUNNING);
1850 + if (is_released(t, litmus_clock()))
1851 + /* acquires necessary lock */
1852 + add_ready(&dom->domain, t);
1853 + else
1854 + /* force timer on remote CPU */
1855 + add_release_on(&dom->domain, t, get_partition(t));
1856 +}
1857 +
1858 +static void wm_requeue_local(struct task_struct* t, rt_domain_t *edf)
1859 +{
1860 + if (t->state != TASK_RUNNING)
1861 + TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
1862 +
1863 + set_rt_flags(t, RT_F_RUNNING);
1864 + if (is_released(t, litmus_clock()))
1865 + __add_ready(edf, t);
1866 + else
1867 + add_release(edf, t); /* it has got to wait */
1868 +}
1869 +
1870 +static int wm_check_resched(rt_domain_t *edf)
1871 +{
1872 + wm_domain_t *dom = container_of(edf, wm_domain_t, domain);
1873 +
1874 + /* because this is a callback from rt_domain_t we already hold
1875 + * the necessary lock for the ready queue
1876 + */
1877 + if (edf_preemption_needed(edf, dom->scheduled)) {
1878 + preempt(dom);
1879 + return 1;
1880 + } else
1881 + return 0;
1882 +}
1883 +
1884 +static void regular_job_completion(struct task_struct* t, int forced)
1885 +{
1886 + sched_trace_task_completion(t, forced);
1887 + TRACE_TASK(t, "job_completion().\n");
1888 +
1889 + set_rt_flags(t, RT_F_SLEEP);
1890 + prepare_for_next_period(t);
1891 +}
1892 +
1893 +static void wm_job_or_slice_completion(struct task_struct* t,
1894 + int completion_signaled)
1895 +{
1896 + if (is_sliced_task(t))
1897 + advance_next_slice(t, completion_signaled);
1898 + else
1899 + regular_job_completion(t, !completion_signaled);
1900 +}
1901 +
1902 +static void wm_tick(struct task_struct *t)
1903 +{
1904 + wm_domain_t *dom = local_domain;
1905 +
1906 + /* Check for inconsistency. We don't need the lock for this since
1907 + * ->scheduled is only changed in schedule, which obviously is not
1908 + * executing in parallel on this CPU
1909 + */
1910 + BUG_ON(is_realtime(t) && t != dom->scheduled);
1911 +
1912 + if (is_realtime(t) && budget_enforced(t) && wm_budget_exhausted(t)) {
1913 + set_tsk_need_resched(t);
1914 + TRACE_DOM(dom, "budget of %d exhausted in tick\n",
1915 + t->pid);
1916 + }
1917 +}
1918 +
1919 +static struct task_struct* wm_schedule(struct task_struct * prev)
1920 +{
1921 + wm_domain_t *dom = local_domain;
1922 + rt_domain_t *edf = &dom->domain;
1923 + struct task_struct *next, *migrate = NULL;
1924 +
1925 + int out_of_time, sleep, preempt, wrong_cpu, exists, blocks, resched;
1926 +
1927 + raw_spin_lock(&dom->slock);
1928 +
1929 + /* Sanity checking:
1930 + * When a task exits (is dead), dom->scheduled may be NULL
1931 + * while prev _is_ a real-time task. */
1932 + BUG_ON(dom->scheduled && dom->scheduled != prev);
1933 + BUG_ON(dom->scheduled && !is_realtime(prev));
1934 +
1935 + /* (0) Determine state */
1936 + exists = dom->scheduled != NULL;
1937 + wrong_cpu = exists && get_partition(dom->scheduled) != dom->cpu;
1938 + blocks = exists && !is_running(dom->scheduled);
1939 + out_of_time = exists
1940 + && budget_enforced(dom->scheduled)
1941 + && wm_budget_exhausted(dom->scheduled);
1942 + sleep = exists && get_rt_flags(dom->scheduled) == RT_F_SLEEP;
1943 + preempt = edf_preemption_needed(edf, prev);
1944 +
1945 + /* If we need to preempt do so.
1946 + * The following checks set resched to 1 in case of special
1947 + * circumstances.
1948 + */
1949 + resched = preempt;
1950 +
1951 +
1952 + if (exists)
1953 + TRACE_TASK(prev,
1954 + "blocks:%d out_of_time:%d sleep:%d preempt:%d "
1955 + "wrong_cpu:%d state:%d sig:%d\n",
1956 + blocks, out_of_time, sleep, preempt, wrong_cpu,
1957 + prev->state, signal_pending(prev));
1958 +
1959 + /* If a task blocks we have no choice but to reschedule.
1960 + */
1961 + if (blocks)
1962 + resched = 1;
1963 +
1964 + /* This can happen if a sliced task was moved to the next slice
1965 + * by the wake_up() code path while it was still scheduled.
1966 + */
1967 + if (wrong_cpu)
1968 + resched = 1;
1969 +
1970 + /* Any task that is preemptable and either exhausts its execution
1971 + * budget or wants to sleep completes. We may have to reschedule after
1972 + * this.
1973 + */
1974 + if ((out_of_time || sleep) && !blocks) {
1975 + wm_job_or_slice_completion(dom->scheduled, sleep);
1976 + resched = 1;
1977 + }
1978 +
1979 + /* The final scheduling decision. Do we need to switch for some reason?
1980 + * Switch if we are in RT mode and have no task or if we need to
1981 + * resched.
1982 + */
1983 + next = NULL;
1984 + if (resched || !exists) {
1985 + if (dom->scheduled && !blocks) {
1986 + if (get_partition(dom->scheduled) == dom->cpu)
1987 + /* local task */
1988 + wm_requeue_local(dom->scheduled, edf);
1989 + else
1990 + /* not local anymore; wait until we drop the
1991 + * ready queue lock */
1992 + migrate = dom->scheduled;
1993 + }
1994 + next = __take_ready(edf);
1995 + } else
1996 + /* Only override Linux scheduler if we have a real-time task
1997 + * scheduled that needs to continue. */
1998 + if (exists)
1999 + next = prev;
2000 +
2001 + if (next) {
2002 + TRACE_TASK(next, "scheduled at %llu (state:%d/%d)\n", litmus_clock(),
2003 + next->state, is_running(next));
2004 + set_rt_flags(next, RT_F_RUNNING);
2005 + } else if (exists) {
2006 + TRACE("becoming idle at %llu\n", litmus_clock());
2007 + }
2008 +
2009 + dom->scheduled = next;
2010 + raw_spin_unlock(&dom->slock);
2011 +
2012 + /* check if we need to push the previous task onto another queue */
2013 + if (migrate) {
2014 + TRACE_TASK(migrate, "schedule-initiated migration to %d\n",
2015 + get_partition(migrate));
2016 + wm_requeue_remote(migrate);
2017 + }
2018 +
2019 + return next;
2020 +}
2021 +
2022 +
2023 +/* Prepare a task for running in RT mode
2024 + */
2025 +static void wm_task_new(struct task_struct * t, int on_rq, int running)
2026 +{
2027 + wm_domain_t* dom = domain_of_task(t);
2028 + rt_domain_t* edf = &dom->domain;
2029 + unsigned long flags;
2030 +
2031 + TRACE_TASK(t, "edf-wm: task new, cpu = %d\n",
2032 + t->rt_param.task_params.cpu);
2033 +
2034 + /* setup job parameters */
2035 + release_at(t, litmus_clock());
2036 +
2037 + /* The task should be running in the queue, otherwise signal
2038 + * code will try to wake it up with fatal consequences.
2039 + */
2040 + raw_spin_lock_irqsave(&dom->slock, flags);
2041 +
2042 + if (is_sliced_task(t)) {
2043 + /* make sure parameters are initialized consistently */
2044 + tsk_rt(t)->semi_part.wm.exec_time = 0;
2045 + tsk_rt(t)->semi_part.wm.job_release = get_release(t);
2046 + tsk_rt(t)->semi_part.wm.job_deadline = get_deadline(t);
2047 + tsk_rt(t)->semi_part.wm.slice = tsk_rt(t)->task_params.semi_part.wm.slices;
2048 + tsk_rt(t)->job_params.exec_time = 0;
2049 + }
2050 +
2051 + if (running) {
2052 + /* there shouldn't be anything else running at the time */
2053 + BUG_ON(dom->scheduled);
2054 + dom->scheduled = t;
2055 + } else {
2056 + wm_requeue_local(t, edf);
2057 + /* maybe we have to reschedule */
2058 + preempt(dom);
2059 + }
2060 + raw_spin_unlock_irqrestore(&dom->slock, flags);
2061 +}
2062 +
2063 +static void wm_release_at(struct task_struct *t, lt_t start)
2064 +{
2065 + struct rt_param* p = tsk_rt(t);
2066 +
2067 + if (is_sliced_task(t)) {
2068 + /* simulate wrapping to the first slice */
2069 + p->semi_part.wm.job_deadline = start;
2070 + p->semi_part.wm.slice = get_last_slice(t);
2071 + /* FIXME: creates bogus completion event... */
2072 + advance_next_slice(t, 0);
2073 + set_rt_flags(t, RT_F_RUNNING);
2074 + } else
2075 + /* generic code handles it */
2076 + release_at(t, start);
2077 +}
2078 +
2079 +static lt_t wm_earliest_release(struct task_struct *t, lt_t now)
2080 +{
2081 + lt_t deadline;
2082 + if (is_sliced_task(t))
2083 + deadline = tsk_rt(t)->semi_part.wm.job_deadline;
2084 + else
2085 + deadline = get_deadline(t);
2086 + if (lt_before(deadline, now))
2087 + return now;
2088 + else
2089 + return deadline;
2090 +}
2091 +
2092 +static void wm_task_wake_up(struct task_struct *t)
2093 +{
2094 + unsigned long flags;
2095 + wm_domain_t* dom = domain_of_task(t);
2096 + rt_domain_t* edf = &dom->domain;
2097 + struct rt_param* p = tsk_rt(t);
2098 + lt_t now, sleep_time;
2099 + int migrate = 0;
2100 +
2101 + raw_spin_lock_irqsave(&dom->slock, flags);
2102 + BUG_ON(is_queued(t));
2103 +
2104 + now = litmus_clock();
2105 +
2106 + sleep_time = now - p->semi_part.wm.suspend_time;
2107 +
2108 + TRACE_TASK(t, "wake_up at %llu after %llu, still-scheduled:%d\n",
2109 + now, sleep_time, dom->scheduled == t);
2110 +
2111 + /* account sleep time as execution time */
2112 + if (get_exec_time(t) + sleep_time >= get_exec_cost(t)) {
2113 + /* new sporadic release */
2114 + TRACE_TASK(t, "new sporadic release\n");
2115 + wm_release_at(t, wm_earliest_release(t, now));
2116 + sched_trace_task_release(t);
2117 + } else if (is_sliced_task(t)) {
2118 + /* figure out which slice we should be executing on */
2119 + fast_forward_slices(t, sleep_time);
2120 + /* can't be exhausted now */
2121 + BUG_ON(wm_budget_exhausted(t));
2122 + } else {
2123 + /* simply add to the execution time */
2124 + tsk_rt(t)->job_params.exec_time += sleep_time;
2125 + }
2126 +
2127 +
2128 + /* Only add to ready queue if it is not the currently-scheduled
2129 + * task. This could be the case if a task was woken up concurrently
2130 + * on a remote CPU before the executing CPU got around to actually
2131 + * de-scheduling the task, i.e., wake_up() raced with schedule()
2132 + * and won.
2133 + */
2134 + if (dom->scheduled != t) {
2135 + if (get_partition(t) == dom->cpu)
2136 + wm_requeue_local(t, edf);
2137 + else
2138 + /* post-pone migration until after unlocking */
2139 + migrate = 1;
2140 + }
2141 +
2142 + raw_spin_unlock_irqrestore(&dom->slock, flags);
2143 +
2144 + if (migrate) {
2145 + TRACE_TASK(t, "wake_up-initiated migration to %d\n",
2146 + get_partition(t));
2147 + wm_requeue_remote(t);
2148 + }
2149 +
2150 + TRACE_TASK(t, "wake up done\n");
2151 +}
2152 +
2153 +static void wm_task_block(struct task_struct *t)
2154 +{
2155 + wm_domain_t* dom = domain_of_task(t);
2156 + unsigned long flags;
2157 + lt_t now = litmus_clock();
2158 +
2159 + TRACE_TASK(t, "block at %llu, state=%d\n", now, t->state);
2160 +
2161 + tsk_rt(t)->semi_part.wm.suspend_time = now;
2162 +
2163 + raw_spin_lock_irqsave(&dom->slock, flags);
2164 + if (is_queued(t)) {
2165 + TRACE_TASK(t, "still queued; migration invariant failed?\n");
2166 + remove(&dom->domain, t);
2167 + }
2168 + raw_spin_unlock_irqrestore(&dom->slock, flags);
2169 +
2170 + BUG_ON(!is_realtime(t));
2171 +}
2172 +
2173 +static void wm_task_exit(struct task_struct * t)
2174 +{
2175 + unsigned long flags;
2176 + wm_domain_t* dom = domain_of_task(t);
2177 + rt_domain_t* edf = &dom->domain;
2178 +
2179 + raw_spin_lock_irqsave(&dom->slock, flags);
2180 + if (is_queued(t)) {
2181 + /* dequeue */
2182 + remove(edf, t);
2183 + }
2184 + if (dom->scheduled == t)
2185 + dom->scheduled = NULL;
2186 +
2187 + TRACE_TASK(t, "RIP, now reschedule\n");
2188 +
2189 + preempt(dom);
2190 + raw_spin_unlock_irqrestore(&dom->slock, flags);
2191 +}
2192 +
2193 +static long wm_check_params(struct task_struct *t)
2194 +{
2195 + struct rt_param* p = tsk_rt(t);
2196 + struct edf_wm_params* wm = &p->task_params.semi_part.wm;
2197 + int i;
2198 + lt_t tmp;
2199 +
2200 + if (!is_sliced_task(t)) {
2201 + /* regular task; nothing to check */
2202 + TRACE_TASK(t, "accepted regular (non-sliced) task with "
2203 + "%d slices\n",
2204 + wm->count);
2205 + return 0;
2206 + }
2207 +
2208 + /* (1) Either not sliced, or more than 1 slice. */
2209 + if (wm->count == 1 || wm->count > MAX_EDF_WM_SLICES) {
2210 + TRACE_TASK(t, "bad number of slices (%u) \n",
2211 + wm->count);
2212 + return -EINVAL;
2213 + }
2214 +
2215 + /* (2) The partition has to agree with the first slice. */
2216 + if (get_partition(t) != wm->slices[0].cpu) {
2217 + TRACE_TASK(t, "partition and first slice CPU differ "
2218 + "(%d != %d)\n", get_partition(t), wm->slices[0].cpu);
2219 + return -EINVAL;
2220 + }
2221 +
2222 + /* (3) The total budget must agree. */
2223 + for (i = 0, tmp = 0; i < wm->count; i++)
2224 + tmp += wm->slices[i].budget;
2225 + if (get_exec_cost(t) != tmp) {
2226 + TRACE_TASK(t, "total budget and sum of slice budgets differ\n");
2227 + return -EINVAL;
2228 + }
2229 +
2230 + /* (4) The release of each slice must not precede the previous
2231 + * deadline. */
2232 + for (i = 0; i < wm->count - 1; i++)
2233 + if (wm->slices[i].deadline > wm->slices[i + 1].offset) {
2234 + TRACE_TASK(t, "slice %d overlaps with slice %d\n",
2235 + i, i + 1);
2236 + return -EINVAL;
2237 + }
2238 +
2239 + /* (5) The budget of each slice must fit within [offset, deadline] */
2240 + for (i = 0; i < wm->count; i++)
2241 + if (lt_before(wm->slices[i].deadline, wm->slices[i].offset) ||
2242 + wm->slices[i].deadline - wm->slices[i].offset <
2243 + wm->slices[i].budget) {
2244 + TRACE_TASK(t, "slice %d is overloaded\n", i);
2245 + return -EINVAL;
2246 + }
2247 +
2248 + /* (6) The budget of each slice must exceed the minimum budget size. */
2249 + for (i = 0; i < wm->count; i++)
2250 + if (wm->slices[i].budget < MIN_EDF_WM_SLICE_SIZE) {
2251 + TRACE_TASK(t, "slice %d is too short\n", i);
2252 + return -EINVAL;
2253 + }
2254 +
2255 + /* (7) The CPU of each slice must be different from the previous CPU. */
2256 + for (i = 0; i < wm->count - 1; i++)
2257 + if (wm->slices[i].cpu == wm->slices[i + 1].cpu) {
2258 + TRACE_TASK(t, "slice %d does not migrate\n", i);
2259 + return -EINVAL;
2260 + }
2261 +
2262 + /* (8) The CPU of each slice must be online. */
2263 + for (i = 0; i < wm->count; i++)
2264 + if (!cpu_online(wm->slices[i].cpu)) {
2265 + TRACE_TASK(t, "slice %d is allocated on offline CPU\n",
2266 + i);
2267 + return -EINVAL;
2268 + }
2269 +
2270 + /* (9) A sliced task's budget must be precisely enforced. */
2271 + if (!budget_precisely_enforced(t)) {
2272 + TRACE_TASK(t, "budget is not precisely enforced "
2273 + "(policy: %d).\n",
2274 + tsk_rt(t)->task_params.budget_policy);
2275 + return -EINVAL;
2276 + }
2277 +
2278 + TRACE_TASK(t, "accepted sliced task with %d slices\n",
2279 + wm->count);
2280 +
2281 + return 0;
2282 +}
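+/* Example of a slice layout that passes the checks above (hypothetical
+ * values in nanoseconds; assumes MIN_EDF_WM_SLICE_SIZE is smaller than
+ * the budgets used here and that at least CPUs 0 and 1 are online):
+ *
+ *	.semi_part.wm = {
+ *		.count = 2,
+ *		.slices = {
+ *			{ .offset = 0,       .deadline = 3000000,
+ *			  .budget = 2000000, .cpu = 0 },
+ *			{ .offset = 3000000, .deadline = 7000000,
+ *			  .budget = 1000000, .cpu = 1 },
+ *		},
+ *	},
+ *
+ * with task_params.cpu = 0 (the first slice's CPU), exec_cost = 3000000
+ * (the sum of the slice budgets), and a budget policy that requests
+ * precise enforcement.
+ */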
2283 +
2284 +static long wm_admit_task(struct task_struct* t)
2285 +{
2286 + return task_cpu(t) == get_partition(t) ? wm_check_params(t) : -EINVAL;
2287 +}
2288 +
2289 +/* Plugin object */
2290 +static struct sched_plugin edf_wm_plugin __cacheline_aligned_in_smp = {
2291 + .plugin_name = "EDF-WM",
2292 + .tick = wm_tick,
2293 + .task_new = wm_task_new,
2294 + .complete_job = complete_job,
2295 + .task_exit = wm_task_exit,
2296 + .schedule = wm_schedule,
2297 + .release_at = wm_release_at,
2298 + .task_wake_up = wm_task_wake_up,
2299 + .task_block = wm_task_block,
2300 + .admit_task = wm_admit_task
2301 +};
2302 +
2303 +
2304 +static int __init init_edf_wm(void)
2305 +{
2306 + int i;
2307 +
2308 + /* FIXME: breaks with CPU hotplug
2309 + */
2310 + for (i = 0; i < num_online_cpus(); i++) {
2311 + wm_domain_init(remote_domain(i),
2312 + wm_check_resched,
2313 + NULL, i);
2314 + }
2315 + return register_sched_plugin(&edf_wm_plugin);
2316 +}
2317 +
2318 +module_init(init_edf_wm);
2319 +
2320 diff --git a/litmus/sched_npsf.c b/litmus/sched_npsf.c
2321 new file mode 100644
2322 index 000000000000..aad99c7e447c
2323 --- /dev/null
2324 +++ b/litmus/sched_npsf.c
2325 @@ -0,0 +1,1185 @@
2326 +/*
2327 + * litmus/sched_npsf.c
2328 + *
2329 + * Implementation of the NPS-F scheduling algorithm.
2330 + *
2331 + * A _server_ may span multiple _reserves_ on different CPUs.
2332 + *
2333 + * * 1
2334 + * +--------------+ +--> +--------------+ +--> +--------------+
2335 + * | cpu_entry_t | | | npsf_reserve | | | npsf_server |
2336 + * +--------------+ | +--------------+ | +--------------+
2337 + * | |1 | | |1 | | |
2338 + * | cpu_reserve |--+ 1| server |--+ 1| |
2339 + * | | +---| cpu | +---| curr_reserve |
2340 + * +--------------+ <-+ +--------------+ <-+ +--------------+
2341 + * 1 *
2342 + */
2343 +
2344 +#include <asm/uaccess.h>
2345 +#include <linux/percpu.h>
2346 +#include <linux/sched.h>
2347 +#include <linux/list.h>
2348 +#include <linux/spinlock.h>
2349 +#include <linux/slab.h>
2350 +
2351 +#include <linux/module.h>
2352 +
2353 +#include <litmus/litmus.h>
2354 +#include <litmus/jobs.h>
2355 +#include <litmus/sched_plugin.h>
2356 +#include <litmus/edf_common.h>
2357 +
2358 +/* Be extra verbose (log spam) */
2359 +#define NPSF_VERBOSE
2360 +
2361 +#ifdef NPSF_VERBOSE
2362 +#define npsf_printk(fmt, arg...) printk(KERN_INFO fmt, ##arg)
2363 +#else
2364 +#define npsf_printk(fmt, arg...)
2365 +#endif
2366 +
2367 +struct npsf_reserve;
2368 +
2369 +/* cpu_entry_t
2370 + *
2371 + * Each cpu has a list of the reserves assigned to it.
2372 + * Each reserve has a pointer to its server (notional processor),
2373 + * which may be shared among multiple reserves.
2374 + */
2375 +typedef struct {
2376 + /* lock to protect cpu_reserve and list changes */
2377 + raw_spinlock_t cpu_res_lock;
2378 + /* the reserve currently executing on this cpu */
2379 + struct npsf_reserve *cpu_reserve;
2380 + /* list of reserves on this cpu */
2381 + struct list_head npsf_reserves;
2382 + /* cpu ID */
2383 + int cpu;
2384 + /* timer to control reserve switching */
2385 + struct hrtimer timer;
2386 + /* virtual timer expiring (wrt time_origin) */
2387 + lt_t should_expire;
2388 + /* delegate timer firing to proper cpu */
2389 + struct hrtimer_start_on_info info;
2390 + /* FIXME: the ids for servers should be an increasing int >=0 */
2391 + int last_seen_npsf_id;
2392 +} cpu_entry_t;
2393 +
2394 +/* one cpu_entry_t per CPU */
2395 +DEFINE_PER_CPU(cpu_entry_t, npsf_cpu_entries);
2396 +
2397 +/* This is the "notional processor" (i.e., simple server) abstraction. */
2398 +typedef struct npsf_server {
2399 + /* shared among reserves */
2400 + rt_domain_t dom;
2401 + /* the real-time task that this server *SHOULD* be scheduling */
2402 + struct task_struct *highest_prio;
2403 + /* current reserve where this dom is executing */
2404 + struct npsf_reserve *curr_reserve;
2405 + /* The "first" reserve for this server in a time slot.
2406 + * For non-migrating servers this will always be the same as curr_reserve. */
2407 + struct npsf_reserve *first_reserve;
2408 + /* Prevent a race between the last CPU in a reserve chain and the first. */
2409 + int first_cpu_wants_ipi;
2410 + /* rt_domain_t lock + npsf_server_t lock */
2411 +#define lock dom.ready_lock
2412 +} npsf_server_t;
2413 +
2414 +typedef struct npsf_reserve {
2415 + /* Pointer to the server for this reserve: a server may be shared among
2416 + * multiple cpus with different budget per cpu, but same npsf_id. */
2417 + npsf_server_t *server;
2418 + /* we queue here in npsf_reserves */
2419 + struct list_head node;
2420 + /* budget of this npsf_id on this cpu */
2421 + lt_t budget;
2422 + /* cpu for this (portion of) server */
2423 + cpu_entry_t *cpu;
2424 + /* id of this server, it is the same for the
2425 + * same server on different cpus */
2426 + int npsf_id;
2427 + /* Next npsf in the chain; can be used to identify whether a reserve
2428 + * continues on another cpu. Needed for proper server deletion. */
2429 + struct npsf_reserve *next_npsf;
2430 + /* flag that is true if the reserve is currently scheduled */
2431 + int is_currently_scheduled;
2432 +} npsf_reserve_t;
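+/* Illustration of the intended setup (hypothetical budgets): a server
+ * with npsf_id 3 may be split into two reserves, e.g. 2 ms on CPU 0
+ * and 1 ms on CPU 1 per slot. Both reserves share the same
+ * npsf_server_t (and hence the same rt_domain_t); curr_reserve points
+ * to whichever reserve currently owns the server, and next_npsf links
+ * the CPU 0 reserve to the CPU 1 reserve and back, so that ownership
+ * is handed over with an IPI at each reserve boundary.
+ */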
2433 +
2434 +/* synchronization point to start moving and switching servers only
2435 + * when all servers have been properly set up by the user.
2436 + */
2437 +static atomic_t all_servers_added;
2438 +static atomic_t timers_activated = ATOMIC_INIT(0);
2439 +
2440 +/* Virtual time starts here */
2441 +static lt_t time_origin;
2442 +
2443 +/* save number of online cpus seen at init time */
2444 +static unsigned int _online_cpus = 1;
2445 +
2446 +#define no_reserves(entry) (list_empty(&((entry)->npsf_reserves)))
2447 +#define local_entry (&__get_cpu_var(npsf_cpu_entries))
2448 +#define remote_entry(cpu) (&per_cpu(npsf_cpu_entries, (cpu)))
2449 +
2450 +#define server_from_dom(domain) (container_of((domain), npsf_server_t, dom))
2451 +
2452 +/* task_entry uses get_partition(), therefore we must take care to
2453 + * update task_params.cpu correctly whenever we switch tasks,
2454 + * otherwise we'll deadlock.
2455 + */
2456 +#define task_entry(task) remote_entry(get_partition(task))
2457 +#define domain_edf(npsf) (&((npsf)->server->dom))
2458 +
2459 +#define task_npsfid(task) ((task)->rt_param.task_params.semi_part.npsf_id)
2460 +
2461 +static inline int owns_server(npsf_reserve_t *npsf)
2462 +{
2463 + return (npsf->server->curr_reserve == npsf);
2464 +}
2465 +
2466 +/* utility functions to get next and prev domains; must hold entry lock */
2467 +static inline npsf_reserve_t* local_next_reserve(npsf_reserve_t *curr,
2468 + cpu_entry_t *entry)
2469 +{
2470 + return (list_is_last(&curr->node, &entry->npsf_reserves)) ?
2471 + list_entry(entry->npsf_reserves.next, npsf_reserve_t, node) :
2472 + list_entry(curr->node.next, npsf_reserve_t, node);
2473 +
2474 +}
2475 +
2476 +static inline npsf_reserve_t* local_prev_reserve(npsf_reserve_t *curr,
2477 + cpu_entry_t *entry)
2478 +{
2479 + return ((curr->node.prev == &entry->npsf_reserves) ?
2480 + list_entry(entry->npsf_reserves.prev, npsf_reserve_t, node) :
2481 + list_entry(curr->node.prev, npsf_reserve_t, node));
2482 +}
2483 +static void requeue(struct task_struct* t, rt_domain_t *edf)
2484 +{
2485 + if (t->state != TASK_RUNNING)
2486 + TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
2487 +
2488 + BUG_ON(is_queued(t));
2489 +
2490 + set_rt_flags(t, RT_F_RUNNING);
2491 + if (is_released(t, litmus_clock()))
2492 + __add_ready(edf, t);
2493 + else
2494 + add_release(edf, t); /* it has got to wait */
2495 +}
2496 +
2497 +/* we assume the lock is being held */
2498 +static void preempt(npsf_reserve_t *npsf)
2499 +{
2500 + /* Since we do not support non-preemptable sections,
2501 + * we don't need to pass in a task. If we call this,
2502 + * we want the remote CPU to reschedule, no matter what.
2503 + */
2504 + preempt_if_preemptable(NULL, npsf->cpu->cpu);
2505 +}
2506 +
2507 +
2508 +static void npsf_preempt_if_server_is_scheduled(npsf_server_t* srv)
2509 +{
2510 + npsf_reserve_t *reserve = srv->curr_reserve;
2511 + if (reserve->is_currently_scheduled) {
2512 + preempt(reserve);
2513 + }
2514 +}
2515 +
2516 +/* assumes lock is held by caller */
2517 +static void npsf_reschedule_server(npsf_server_t* srv)
2518 +{
2519 + struct task_struct* hp = srv->highest_prio;
2520 + rt_domain_t* edf = &srv->dom;
2521 +
2522 + if (edf_preemption_needed(edf, hp)) {
2523 + srv->highest_prio = __take_ready(edf);
2524 + if (hp) {
2525 + TRACE_TASK(hp, "requeue: no longer highest prio\n");
2526 + requeue(hp, edf);
2527 + }
2528 + npsf_preempt_if_server_is_scheduled(srv);
2529 + }
2530 +}
2531 +
2532 +static void npsf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
2533 +{
2534 + npsf_server_t *srv = server_from_dom(rt);
2535 + unsigned long flags;
2536 +
2537 + raw_spin_lock_irqsave(&srv->lock, flags);
2538 +
2539 + __merge_ready(rt, tasks);
2540 + npsf_reschedule_server(srv);
2541 +
2542 + raw_spin_unlock_irqrestore(&srv->lock, flags);
2543 +}
2544 +
2545 +static void job_completion(struct task_struct* t, int forced)
2546 +{
2547 + sched_trace_task_completion(t, forced);
2548 + TRACE_TASK(t, "job_completion().\n");
2549 +
2550 + set_rt_flags(t, RT_F_SLEEP);
2551 + prepare_for_next_period(t);
2552 +}
2553 +
2554 +/* When did this slot start? */
2555 +static inline lt_t slot_begin(lt_t now)
2556 +{
2557 + return (((now - time_origin) / npsf_slot_length)
2558 + * npsf_slot_length + time_origin);
2559 +}
2560 +
2561 +/* Compute the delta from the beginning of the current slot. */
2562 +static inline lt_t delta_from_slot_begin(lt_t now)
2563 +{
2564 + return (now - slot_begin(now));
2565 +}
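+/* Worked example (hypothetical values): with time_origin = 100 and
+ * npsf_slot_length = 50, a call at now = 237 yields
+ *	slot_begin(237)            = ((237 - 100) / 50) * 50 + 100 = 200,
+ *	delta_from_slot_begin(237) = 237 - 200 = 37,
+ * i.e., the current slot started at 200 and we are 37 time units into it.
+ */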
2566 +
2567 +/* Given an offset into a slot, return the corresponding eligible reserve.
2568 + * The output param reservation_end is used to return the (relative) time at which
2569 + * the returned reserve ends.
2570 + */
2571 +static npsf_reserve_t* get_reserve_for_offset(cpu_entry_t *entry, lt_t offset,
2572 + lt_t *reservation_end)
2573 +{
2574 + npsf_reserve_t *tmp;
2575 +
2576 + *reservation_end = 0;
2577 +
2578 + /* Linear search through all reserves; figure out which one is the last
2579 + * to become eligible before the given offset. */
2580 + list_for_each_entry(tmp, &entry->npsf_reserves, node) {
2581 + *reservation_end += tmp->budget;
2582 +
2583 + /* We are always "late". Found tmp is the right one */
2584 + if ((*reservation_end > offset))
2585 + return tmp;
2586 + }
2587 +
2588 + /* error: we should never fall off the end of the reserve list */
2589 + BUG();
2590 + return NULL;
2591 +}
2592 +
2593 +/* Determine which reserve is eligible based on the current time.
2594 + */
2595 +static npsf_reserve_t* get_current_reserve(cpu_entry_t *entry)
2596 +{
2597 + lt_t reservation_end;
2598 + lt_t offset = delta_from_slot_begin(litmus_clock());
2599 + return get_reserve_for_offset(entry, offset, &reservation_end);
2600 +}
2601 +
2602 +/* This is used to ensure that we are "always" late, i.e., to make
2603 + * sure that the timer jitter is always positive. This should
2604 + * only trigger in KVM (or in real machines with bad TSC drift after
2605 + * an IPI).
2606 + *
2607 + * ATM proper tracing for this event is done in reserve_switch_tick().
2608 + */
2609 +static noinline ktime_t catchup_time(lt_t from, lt_t target)
2610 +{
2611 + while(lt_before(from, target)) {
2612 + from = litmus_clock();
2613 +
2614 + mb();
2615 + cpu_relax();
2616 + }
2617 +
2618 + return ns_to_ktime(from);
2619 +}
2620 +
2621 +
2622 +/* compute the next ABSOLUTE timer value */
2623 +static lt_t get_next_reserve_switch_time(void)
2624 +{
2625 + cpu_entry_t *entry = local_entry;
2626 + lt_t now = litmus_clock();
2627 + lt_t slot_start = slot_begin(now);
2628 + lt_t offset = now - slot_start;
2629 + lt_t next_time;
2630 + npsf_reserve_t* reserve;
2631 +
2632 + /* compute the absolute litmus time of the next reserve switch */
2633 + reserve = get_reserve_for_offset(entry, offset, &next_time);
2634 + /* get_reserve_for_offset returns a relative start time; let's make it
2635 + absolute */
2636 + next_time += slot_start;
2637 +
2638 + /* Let's see if we need to skip the next timer. */
2639 + reserve = local_next_reserve(reserve, entry);
2640 + /* if the next reserve is a continuing reserve
2641 + * (i.e., if it belongs to a migrating server),
2642 + * then we skip the timer event because we will
2643 + * receive an IPI from the previous processor instead. */
2644 + if (reserve->server->first_reserve != reserve) {
2645 + /* it is indeed not the first reserve */
2646 + next_time += reserve->budget;
2647 + }
2648 +
2649 + return next_time;
2650 +}
2651 +
2652 +/* This is the callback for reserve-switching interrupts.
2653 + * The timer is reprogrammed to expire at the beginning of every logical
2654 + * reserve (i.e., a continuing reserve may be split among different CPUs
2655 + * but is a _single_ logical reserve). get_next_reserve_switch_time()
2656 + * will return the right next_expire time.
2657 + */
2658 +static enum hrtimer_restart reserve_switch_tick(struct hrtimer *timer)
2659 +{
2660 + unsigned long flags;
2661 + cpu_entry_t *entry;
2662 + /* we are using CLOCK_MONOTONIC */
2663 + ktime_t now = ktime_get();
2664 + ktime_t delta;
2665 + int late;
2666 +
2667 + entry = container_of(timer, cpu_entry_t, timer);
2668 + raw_spin_lock_irqsave(&entry->cpu_res_lock, flags);
2669 +
2670 + /* jitter wrt virtual time */
2671 + delta = ktime_sub(now, ns_to_ktime(entry->should_expire));
2672 + late = (ktime_to_ns(delta) >= 0) ? 1 : 0;
2673 +
2674 +#ifdef NPSF_VERBOSE
2675 + if (entry->cpu_reserve && atomic_read(&all_servers_added))
2676 + TRACE("(npsf_id: %d) tick starts at %Ld, "
2677 + "now - should_expire: %Ld\n",
2678 + entry->cpu_reserve->npsf_id,
2679 + ktime_to_ns(now), ktime_to_ns(delta));
2680 +#endif
2681 + /* If the timer fires earlier than the should_expire time,
2682 + * we delay the switch until we are synchronized with the
2683 + * switch boundary. Otherwise the next reserve would execute
2684 + * longer than it should (wrong).
2685 + */
2686 + if (!late) {
2687 + TRACE("+++ Timer fired early, waiting...\n");
2688 + now = catchup_time(ktime_to_ns(now), entry->should_expire);
2689 +
2690 + delta = ktime_sub(now, ns_to_ktime(entry->should_expire));
2691 + TRACE("+++ done, tick restarts at %Ld, "
2692 + "now - should_expire: %Ld\n",
2693 + ktime_to_ns(now), ktime_to_ns(delta));
2694 + }
2695 +
2696 + BUG_ON(!atomic_read(&all_servers_added));
2697 + BUG_ON(no_reserves(entry));
2698 +
2699 + /* Compute the next time that we need to be notified. */
2700 + entry->should_expire = get_next_reserve_switch_time();
2701 +
2702 + /* kindly ask the Penguin to let us know... */
2703 + hrtimer_set_expires(timer, ns_to_ktime(entry->should_expire));
2704 +
2705 + /* set resched flag to reschedule local cpu */
2706 + set_need_resched();
2707 +
2708 + raw_spin_unlock_irqrestore(&entry->cpu_res_lock, flags);
2709 +#ifdef NPSF_VERBOSE
2710 + if (atomic_read(&all_servers_added))
2711 + TRACE("(npsf_id: %d) tick ends at %Ld, should_expire: %llu\n",
2712 + entry->cpu_reserve->npsf_id, ktime_to_ns(ktime_get()),
2713 + entry->should_expire);
2714 +#endif
2715 +
2716 + return HRTIMER_RESTART;
2717 +}
2718 +
2719 +static void npsf_scheduler_tick(struct task_struct *t)
2720 +{
2721 + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
2722 + set_tsk_need_resched(t);
2723 + TRACE("npsf_tick: %d is preemptable "
2724 + " => FORCE_RESCHED\n", t->pid);
2725 + }
2726 +}
2727 +
2728 +/* Assumption: caller holds srv lock and prev belongs to
2729 + * the currently-scheduled reservation.
2730 + */
2731 +static void npsf_schedule_server(struct task_struct* prev,
2732 + cpu_entry_t *entry)
2733 +{
2734 + npsf_server_t* srv = entry->cpu_reserve->server;
2735 +
2736 + int out_of_time, sleep, exists, blocks;
2737 +
2738 + exists = is_realtime(prev);
2739 + blocks = exists && !is_running(prev);
2740 + out_of_time = exists &&
2741 + budget_enforced(prev) &&
2742 + budget_exhausted(prev);
2743 + sleep = exists && get_rt_flags(prev) == RT_F_SLEEP;
2744 +
2745 + if (exists)
2746 + TRACE_TASK(prev, "(npsf_id %d) blocks:%d "
2747 + "out_of_time:%d sleep:%d state:%d sig:%d\n",
2748 + task_npsfid(prev),
2749 + blocks, out_of_time, sleep,
2750 + prev->state,
2751 + signal_pending(prev));
2752 +
2753 + /* Any task that is preemptable and either exhausts its
2754 + * execution budget or wants to sleep completes. We may have
2755 + * to reschedule after this.
2756 + */
2757 + if ((out_of_time || sleep) && !blocks) {
2758 + job_completion(prev, !sleep);
2759 +
2760 + if (srv->highest_prio != prev) {
2761 + BUG_ON(!is_queued(prev));
2762 + remove(&srv->dom, prev);
2763 + }
2764 +
2765 + requeue(prev, &srv->dom);
2766 +
2767 + if (srv->highest_prio == prev)
2768 + srv->highest_prio = __take_ready(&srv->dom);
2769 + }
2770 +
2771 + BUG_ON(blocks && prev == srv->highest_prio);
2772 +// BUG_ON(!srv->highest_prio && jobs_pending(&srv->dom));
2773 +}
2774 +
2775 +static void npsf_notify_next_cpu(npsf_reserve_t *npsf_prev)
2776 +{
2777 + npsf_server_t *srv;
2778 +
2779 + if (unlikely(npsf_prev->next_npsf != npsf_prev)) {
2780 + /* This reserve is actually shared. Let's update its 'owner'
2781 + * and notify the next CPU. */
2782 + srv = npsf_prev->server;
2783 + raw_spin_lock(&srv->lock);
2784 + srv->curr_reserve = npsf_prev->next_npsf;
2785 + if (srv->first_reserve != srv->curr_reserve ||
2786 + srv->first_cpu_wants_ipi) {
2787 + /* send an IPI to notify next CPU in chain */
2788 + srv->first_cpu_wants_ipi = 0;
2789 + TRACE("sending IPI\n");
2790 + preempt(srv->curr_reserve);
2791 + }
2792 + raw_spin_unlock(&srv->lock);
2793 + }
2794 +}
2795 +
2796 +static struct task_struct* npsf_schedule(struct task_struct * prev)
2797 +{
2798 + npsf_reserve_t *npsf_prev, *npsf_next;
2799 + npsf_server_t *srv_prev, *srv_next;
2800 + cpu_entry_t *entry = local_entry;
2801 + struct task_struct *next;
2802 +
2803 + int reserve_switch;
2804 +
2805 + /* servers not ready yet, yield to linux */
2806 + if (!atomic_read(&all_servers_added))
2807 + return NULL;
2808 +
2809 +#ifdef NPSF_VERBOSE
2810 + TRACE_TASK(prev, "schedule\n");
2811 +#endif
2812 + raw_spin_lock(&entry->cpu_res_lock);
2813 +
2814 + BUG_ON(no_reserves(entry));
2815 +
2816 + /* step 1: what are we currently serving? */
2817 + npsf_prev = entry->cpu_reserve;
2818 + srv_prev = npsf_prev->server;
2819 +
2820 + /* step 2: what SHOULD we be currently serving? */
2821 + npsf_next = get_current_reserve(entry);
2822 + srv_next = npsf_next->server;
2823 +
2824 + /* TODO second measuring point for IPI receiving
2825 + * if (!srv_next->measure_wait_IPI) --- the remote reset
2826 + * trace_time_end.
2827 + */
2828 + raw_spin_lock(&srv_prev->lock);
2829 +
2830 +
2831 + /* step 3: update prev server */
2832 + if (is_realtime(prev) && task_npsfid(prev) == entry->cpu_reserve->npsf_id)
2833 + npsf_schedule_server(prev, entry);
2834 + else if (is_realtime(prev))
2835 + TRACE_TASK(prev, "npsf_id %d != cpu_reserve npsf_id %d\n",
2836 + task_npsfid(prev), entry->cpu_reserve->npsf_id);
2837 +
2838 + /* step 4: determine if we need to switch to another reserve */
2839 + reserve_switch = npsf_prev != npsf_next;
2840 +
2841 + if (!reserve_switch) {
2842 + /* easy case: just enact what the server scheduler decided */
2843 + next = srv_prev->highest_prio;
2844 +
2845 + /* Unlock AFTER observing highest_prio to avoid races with
2846 + * remote rescheduling activity. */
2847 + raw_spin_unlock(&srv_prev->lock);
2848 + } else {
2849 + /* In this case we have a reserve switch. We are done with the
2850 + * previous server, so release its lock. */
2851 + TRACE("switch reserve npsf_id %d -> npsf_id %d\n",
2852 + npsf_prev->npsf_id, npsf_next->npsf_id);
2853 + npsf_prev->is_currently_scheduled = 0;
2854 + raw_spin_unlock(&srv_prev->lock);
2855 +
2856 + /* Move on to the next server. */
2857 +
2858 + raw_spin_lock(&srv_next->lock);
2859 + npsf_next->is_currently_scheduled = 1;
2860 +
2861 + /* make sure we are owner of a server (if it is shared) */
2862 + if (unlikely(srv_next->curr_reserve != npsf_next)) {
2863 + /* We raced with the previous owner. Let's schedule
2864 + * the previous reserve for now. The previous owner
2865 + * will send us an IPI when the server has been pushed
2866 + * to us.
2867 + */
2868 + TRACE("(npsf_id %d) raced with previous server owner\n",
2869 + npsf_next->npsf_id);
2870 +
2871 + /* check if we are the first CPU, in which case we need
2872 + * to request a notification explicitly */
2873 + if (srv_next->first_reserve == npsf_next)
2874 + srv_next->first_cpu_wants_ipi = 1;
2875 +
2876 + npsf_next->is_currently_scheduled = 0;
2877 + raw_spin_unlock(&srv_next->lock);
2878 +
2879 + /* just keep the previous reserve one more time */
2880 + raw_spin_lock(&srv_prev->lock);
2881 +
2882 + npsf_prev->is_currently_scheduled = 1;
2883 + /* Note that there is no race condition here.
2884 + * Since curr_reserve did not yet point to this reserve,
2885 + * no processor can have observed the flag in npsf_next.
2886 + * A processor might have observed the flag being zero
2887 + * in npsf_prev and decided not to send an IPI, which
2888 + * doesn't matter since we are going to reschedule
2889 + * below anyway. */
2890 +
2891 + next = srv_prev->highest_prio;
2892 +
2893 + raw_spin_unlock(&srv_prev->lock);
2894 +
2895 + /* TODO first measuring point for '0'-switching time
2896 + * remote is not ready yet and will send us an IPI
2897 + * when it's done.
2898 + * local:
2899 + * srv_next->measure_wait_IPI = 1;
2900 + * remote before sending IPI:
2901 + * if (srv_next->measure_wait_IPI) reset;
2902 + */
2903 + } else {
2904 + /* invariant: srv->highest_prio is always the
2905 + * highest-priority job in the server, and it is always
2906 + * runnable. Any update to the server must maintain
2907 + * this invariant. */
2908 + next = srv_next->highest_prio;
2909 +
2910 + entry->cpu_reserve = npsf_next;
2911 + raw_spin_unlock(&srv_next->lock);
2912 +
2913 + /* send an IPI (if necessary) */
2914 + npsf_notify_next_cpu(npsf_prev);
2915 + }
2916 +
2917 + }
2918 +
2919 + if (next) {
2920 + TRACE_TASK(next, "(npsf_id %d) scheduled at %llu\n",
2921 + task_npsfid(next), litmus_clock());
2922 + set_rt_flags(next, RT_F_RUNNING);
2923 + /* The TASK_RUNNING flag is set by the Penguin _way_ after
2924 + * activating a task. This doesn't matter much to Linux as
2925 + * the rq lock will prevent any changes, but it matters to
2926 + * us. A remote cpu waking up this task may requeue it before
2927 + * it is runnable and send an IPI here; we then schedule that
2928 + * task (still "not-runnable"), and the running flag is set
2929 + * only just before next actually starts executing.
2930 + */
2931 + if (!is_running(next))
2932 + TRACE_TASK(next, "BAD: !TASK_RUNNING\n");
2933 + } else {
2934 + /* FIXME npsf_id is wrong if reserve switch but "switching back"
2935 + * if we race */
2936 + TRACE("(npsf_id %d) becoming idle at %llu\n",
2937 + reserve_switch ? npsf_next->npsf_id : npsf_prev->npsf_id,
2938 + litmus_clock());
2939 + }
2940 +
2941 + raw_spin_unlock(&entry->cpu_res_lock);
2942 +
2943 + return next;
2944 +}
2945 +
2946 +/* Prepare a task for running in RT mode
2947 + *
2948 + * We can only be sure that the cpu is the right one (admission rejects
2949 + * tasks released on a cpu that doesn't host the right npsf_id),
2950 + * but we _cannot_ be sure that:
2951 + * 1) the found npsf is the reserve currently running on this cpu.
2952 + * 2) the current reserve (the one in charge of scheduling) is not
2953 + * running on a different cpu.
2954 + */
2955 +static void npsf_task_new(struct task_struct * t, int on_rq, int running)
2956 +{
2957 + npsf_reserve_t *npsf;
2958 + npsf_server_t *srv;
2959 + cpu_entry_t *entry = task_entry(t);
2960 + rt_domain_t *edf;
2961 + unsigned long flags;
2962 +
2963 + BUG_ON(no_reserves(entry));
2964 +
2965 + /* search the proper npsf_server where to add the new task */
2966 + list_for_each_entry(npsf, &entry->npsf_reserves, node) {
2967 + if (npsf->npsf_id == task_npsfid(t))
2968 + break;
2969 + }
2970 +
2971 +
2972 + srv = npsf->server;
2973 +
2974 + /* The task should be running in the queue, otherwise signal
2975 + * code will try to wake it up with fatal consequences.
2976 + */
2977 + raw_spin_lock_irqsave(&entry->cpu_res_lock, flags);
2978 + raw_spin_lock(&srv->lock);
2979 +
2980 + edf = domain_edf(npsf);
2981 + tsk_rt(t)->domain = edf;
2982 +
2983 + TRACE_TASK(t, "task_new: P%d, task_npsfid %d, "
2984 + "npsf->npsf_id %d, entry->cpu %d\n",
2985 + t->rt_param.task_params.cpu, task_npsfid(t),
2986 + npsf->npsf_id, entry->cpu);
2987 +
2988 + /* setup job parameters */
2989 + release_at(t, litmus_clock());
2990 +
2991 + /* There are four basic scenarios that could happen:
2992 + * 1) the server is on another cpu and scheduled;
2993 + * 2) the server is on another cpu and not scheduled;
2994 + * 3) the server is on this cpu and scheduled; and
2995 + * 4) the server is on this cpu and not scheduled.
2996 + *
2997 + * Whatever scenario we're in, it cannot change while we are
2998 + * holding the server lock.
2999 + *
3000 + * If the new task does not have a high priority, then
3001 + * we can just queue it and be done.
3002 + *
3003 + * In theory, the requeue() and reschedule_server() code
3004 + * take care of all that.
3005 + */
3006 +
3007 + requeue(t, edf);
3008 + /* reschedule will cause a remote preemption, if required */
3009 + npsf_reschedule_server(srv);
3010 + /* always preempt to make sure we don't
3011 + * use the stack if it needs to migrate */
3012 + set_tsk_need_resched(t);
3013 +
3014 + raw_spin_unlock(&srv->lock);
3015 + raw_spin_unlock_irqrestore(&entry->cpu_res_lock, flags);
3016 +}
3017 +
3018 +static void npsf_task_wake_up(struct task_struct *t)
3019 +{
3020 + rt_domain_t *edf;
3021 + npsf_server_t* srv;
3022 + unsigned long flags;
3023 + lt_t now;
3024 +
3025 + BUG_ON(!is_realtime(t));
3026 +
3027 + edf = tsk_rt(t)->domain;
3028 + srv = server_from_dom(edf);
3029 +
3030 + raw_spin_lock_irqsave(&srv->lock, flags);
3031 +
3032 + BUG_ON(is_queued(t));
3033 +
3034 + now = litmus_clock();
3035 + /* FIXME: this should be a configurable policy... */
3036 + if (is_tardy(t, now)) {
3037 + /* new sporadic release */
3038 + release_at(t, now);
3039 + sched_trace_task_release(t);
3040 + }
3041 +
3042 + /* Only add to ready queue if it is not the
3043 + * currently-scheduled task.
3044 + */
3045 + if (srv->highest_prio != t) {
3046 + requeue(t, edf);
3047 + npsf_reschedule_server(srv);
3048 + }
3049 +#ifdef NPSF_VERBOSE
3050 + else
3051 + TRACE_TASK(t, "wake_up, is curr_sched, not requeued\n");
3052 +#endif
3053 +
3054 + raw_spin_unlock_irqrestore(&srv->lock, flags);
3055 +
3056 + TRACE_TASK(t, "wake up done\n");
3057 +}
3058 +
3059 +static void remove_from_server(struct task_struct *t, npsf_server_t* srv)
3060 +{
3061 + if (srv->highest_prio == t) {
3062 + TRACE_TASK(t, "remove from server: is highest-prio task\n");
3063 + srv->highest_prio = NULL;
3064 + npsf_reschedule_server(srv);
3065 + } else if (is_queued(t)) {
3066 + TRACE_TASK(t, "remove from server: removed from queue\n");
3067 + remove(&srv->dom, t);
3068 + }
3069 +#ifdef NPSF_VERBOSE
3070 + else
3071 + TRACE_TASK(t, "WARN: where is this task?\n");
3072 +#endif
3073 +}
3074 +
3075 +static void npsf_task_block(struct task_struct *t)
3076 +{
3077 + rt_domain_t *edf;
3078 + npsf_server_t* srv;
3079 + unsigned long flags;
3080 +
3081 + TRACE_TASK(t, "(npsf_id %d) block at %llu, state=%d\n",
3082 + task_npsfid(t), litmus_clock(), t->state);
3083 +
3084 + BUG_ON(!is_realtime(t));
3085 +
3086 + edf = tsk_rt(t)->domain;
3087 + srv = server_from_dom(edf);
3088 +
3089 + raw_spin_lock_irqsave(&srv->lock, flags);
3090 +
3091 + remove_from_server(t, srv);
3092 +
3093 + raw_spin_unlock_irqrestore(&srv->lock, flags);
3094 +}
3095 +
3096 +static void npsf_task_exit(struct task_struct * t)
3097 +{
3098 + rt_domain_t *edf;
3099 + npsf_server_t* srv;
3100 + unsigned long flags;
3101 +
3102 + BUG_ON(!is_realtime(t));
3103 +
3104 + edf = tsk_rt(t)->domain;
3105 + srv = server_from_dom(edf);
3106 +
3107 + raw_spin_lock_irqsave(&srv->lock, flags);
3108 +
3109 + remove_from_server(t, srv);
3110 +
3111 + raw_spin_unlock_irqrestore(&srv->lock, flags);
3112 +
3113 + TRACE_TASK(t, "RIP, now reschedule\n");
3114 +}
3115 +
3116 +static long npsf_admit_task(struct task_struct* tsk)
3117 +{
3118 + npsf_reserve_t *npsf;
3119 + cpu_entry_t *entry = task_entry(tsk);
3120 + int id_ok = 0;
3121 +
3122 + if (!atomic_read(&all_servers_added)) {
3123 + printk(KERN_DEBUG "not all servers added\n");
3124 + return -ENODEV;
3125 + }
3126 +
3127 + /* check to be on the right cpu and on the right server */
3128 + if (task_cpu(tsk) != tsk->rt_param.task_params.cpu) {
3129 + printk(KERN_DEBUG "wrong CPU(%d, %d, %d) for npsf_id %d\n",
3130 + task_cpu(tsk), tsk->rt_param.task_params.cpu,
3131 + entry->cpu, task_npsfid(tsk));
3132 + return -EINVAL;
3133 + }
3134 +
3135 + /* 1) this cpu should have the proper npsf_id in the list
3136 + * 2) the rt_domain for the proper npsf_id is not null
3137 + */
3138 + list_for_each_entry(npsf, &entry->npsf_reserves, node) {
3139 + if (npsf->npsf_id == task_npsfid(tsk)) {
3140 + id_ok = 1;
3141 + break;
3142 + }
3143 + }
3144 + if (!id_ok)
3145 + printk(KERN_DEBUG "wrong npsf_id (%d) for entry %d\n",
3146 + task_npsfid(tsk), entry->cpu);
3147 +
3148 + return id_ok ? 0 : -EINVAL;
3149 +}
3150 +
3151 +/* in litmus.c */
3152 +extern atomic_t rt_task_count;
3153 +
3154 +/* initialization status control */
3155 +static int reserves_allocated = 0;
3156 +
3157 +#ifdef NPSF_VERBOSE
3158 +static void print_reserve(cpu_entry_t *cpu)
3159 +{
3160 + npsf_reserve_t *tmp;
3161 +
3162 + printk(KERN_INFO "NPS-F: reserves on CPU %d:\n", cpu->cpu);
3163 + list_for_each_entry(tmp, &cpu->npsf_reserves, node) {
3164 + BUG_ON(!tmp->server);
3165 + BUG_ON(!&(tmp->server->dom));
3166 + BUG_ON(tmp->server->highest_prio);
3167 + printk(KERN_INFO "%d: %d us\n", tmp->npsf_id,
3168 + (int)(tmp->budget / 1000));
3169 + }
3170 +}
3171 +#endif
3172 +/*
3173 + * do_add_reserve: add a reserve(cpu, id, budget)
3174 + *
3175 + * Callback for syscall add_server(); it allows to add the reserve "id"
3176 + * to the CPU "cpu". "budget" is the length of the reserve for the
3177 + * notional processor (server) id on the cpu cpu.
3178 + */
3179 +static long do_add_reserve(npsf_reserve_t **new, cpu_entry_t *cpu,
3180 + npsf_server_t *the_dom, int npsf_id, lt_t budget)
3181 +{
3182 + unsigned long flags;
3183 +
3184 + /* The npsf_ids for each cpu must be given in increasing order;
3185 + * adding the same notional processor twice on the same cpu makes
3186 + * no sense. last_seen_npsf_id is reset upon plugin insertion.
3187 + */
3188 + if (cpu->last_seen_npsf_id >= npsf_id)
3189 + return -EINVAL;
3190 +
3191 + /* don't allow server changes if there are tasks in the system */
3192 + if (atomic_read(&rt_task_count))
3193 + return -EACCES;
3194 +
3195 + if ((*new = kmalloc(sizeof(npsf_reserve_t), GFP_ATOMIC)) == NULL)
3196 + return -ENOMEM;
3197 +
3198 + (*new)->server = the_dom;
3199 + (*new)->npsf_id = npsf_id;
3200 + (*new)->budget = budget;
3201 + (*new)->cpu = cpu;
3202 +
3203 + npsf_printk("Add npsf_id %d on P%d with budget %llu\n", (*new)->npsf_id,
3204 + (*new)->cpu->cpu, (*new)->budget);
3205 +
3206 + raw_spin_lock_irqsave(&cpu->cpu_res_lock, flags);
3207 +
3208 + list_add_tail(&(*new)->node, &cpu->npsf_reserves);
3209 + cpu->last_seen_npsf_id = npsf_id;
3210 + cpu->cpu_reserve = list_first_entry(&cpu->npsf_reserves, npsf_reserve_t, node);
3211 +
3212 + raw_spin_unlock_irqrestore(&cpu->cpu_res_lock, flags);
3213 +
3214 + return 0;
3215 +}
3216 +
3217 +static void kickoff_timers(void)
3218 +{
3219 + int cpu;
3220 + cpu_entry_t *entry;
3221 + lt_t kickoff;
3222 +
3223 + kickoff = slot_begin(litmus_clock() + npsf_slot_length * 2);
3224 +
3225 + for_each_online_cpu(cpu) {
3226 + entry = &per_cpu(npsf_cpu_entries, cpu);
3227 + hrtimer_start_on(cpu, &entry->info, &entry->timer,
3228 + ns_to_ktime(kickoff),
3229 + HRTIMER_MODE_ABS_PINNED);
3230 + entry->should_expire = kickoff;
3231 + }
3232 + atomic_set(&timers_activated, 1);
3233 +}
3234 +
3235 +/* We offer the library a budgets-array interface (so that we go through the
3236 + * syscall path only once) and internally loop over do_add_reserve();
3237 + * see the usage sketch after this function.
3238 + * last == 1 means that the user is adding the last server; after that
3239 + * insertion the plugin is fully set up. (FIXME: this should be
3240 + * done in a better way, but I doubt this plugin will ever go
3241 + * to the master branch.)
3242 + */
3243 +asmlinkage long sys_add_server(int __user *__id,
3244 + struct npsf_budgets __user *__budgets, int last)
3245 +{
3246 + int id, i;
3247 + int ret = -EFAULT;
3248 + struct npsf_budgets *budgets;
3249 + cpu_entry_t *entry;
3250 + npsf_server_t *npsfserver;
3251 + npsf_reserve_t *npsf_reserve_array[NR_CPUS];
3252 + npsf_reserve_t *first_reserve;
3253 +
3254 + if (_online_cpus != num_online_cpus())
3255 + return ret;
3256 +
3257 + if (copy_from_user(&id, __id, sizeof(id)))
3258 + return ret;
3259 +
3260 + budgets = kmalloc(_online_cpus * sizeof(struct npsf_budgets),
3261 + GFP_ATOMIC);
3262 +
3263 + for (i = 0; i < _online_cpus; i++) {
3264 + budgets[i].cpu = NO_CPU;
3265 + budgets[i].budget = 0;
3266 + }
3267 +
3268 + if (copy_from_user(budgets, __budgets,
3269 + sizeof(struct npsf_budgets) * _online_cpus))
3270 + goto err;
3271 +
3272 + /* initialize the npsf_server_t for this npsf_server series */
3273 + npsfserver = kmalloc(sizeof(npsf_server_t), GFP_ATOMIC);
3274 + if (!npsfserver) {
3275 + ret = -ENOMEM;
3276 + goto err;
3277 + }
3278 + edf_domain_init(&npsfserver->dom, NULL, npsf_release_jobs);
3279 + npsfserver->highest_prio = NULL;
3280 +
3281 + /* initialize all npsf_reserve_t for this server */
3282 + for (i = 0; i < _online_cpus && budgets[i].cpu != NO_CPU; i++) {
3283 + entry = &per_cpu(npsf_cpu_entries, budgets[i].cpu);
3284 + if ((ret = do_add_reserve(&npsf_reserve_array[i], entry,
3285 + npsfserver,
3286 + id, budgets[i].budget)) < 0)
3287 + goto err;
3288 + }
3289 + /* set the current reserve to the first (and possibly unique)
3290 + * slice for this npsf_id */
3291 + npsfserver->curr_reserve = npsf_reserve_array[0];
3292 + npsfserver->first_reserve = npsf_reserve_array[0];
3293 + npsfserver->first_cpu_wants_ipi = 0;
3294 + for (i = 0; i < _online_cpus && budgets[i].cpu != NO_CPU; i++) {
3295 +
3296 + if (i == 0 && budgets[i+1].cpu == NO_CPU) {
3297 + /* Fixed reserve always has itself as next */
3298 + npsf_reserve_array[i]->next_npsf = npsf_reserve_array[i];
3299 + } else if (((i+1) < _online_cpus) &&
3300 + (i > 0 && budgets[i+1].cpu == NO_CPU)) {
3301 + /* Last reserve in the chain has the first reserve as next */
3302 + npsf_reserve_array[i]->next_npsf = npsf_reserve_array[0];
3303 + } else {
3304 + /* Normal continuing reserve */
3305 + npsf_reserve_array[i]->next_npsf = npsf_reserve_array[i+1];
3306 + }
3307 + }
3308 +#ifdef NPSF_VERBOSE
3309 + for (i = 0; i < _online_cpus && budgets[i].cpu != NO_CPU; i++) {
3310 + entry = &per_cpu(npsf_cpu_entries, budgets[i].cpu);
3311 + print_reserve(entry);
3312 + }
3313 +#endif
3314 +
3315 + if (last) {
3316 + /* force the first slot switching by setting the
3317 + * current_reserve to the last server for each cpu.
3318 + *
3319 + * FIXME: don't assume there exists at least one reserve per CPU
3320 + */
3321 + for_each_online_cpu(i) {
3322 + entry = &per_cpu(npsf_cpu_entries, i);
3323 + first_reserve = list_entry(entry->npsf_reserves.next,
3324 + npsf_reserve_t, node);
3325 +
3326 + first_reserve->server->curr_reserve = first_reserve;
3327 + entry->cpu_reserve = first_reserve;
3328 + npsf_printk("npsf_id %d is the current reserve "
3329 + "and server on CPU %d\n",
3330 + first_reserve->npsf_id, entry->cpu);
3331 +
3332 + }
3333 +
3334 + kickoff_timers();
3335 +
3336 + /* real plugin enable */
3337 + atomic_set(&all_servers_added, 1);
3338 + mb();
3339 + }
3340 +
3341 + /* at least one server was initialized and may need deletion */
3342 + reserves_allocated = 1;
3343 +err:
3344 + kfree(budgets);
3345 + return ret;
3346 +}
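+/* Usage sketch (illustrative only; the wrapper call, the budgets and the
+ * assumption of four online CPUs below are hypothetical, not part of
+ * this patch). The budgets array must have one entry per online CPU;
+ * unused entries are terminated with cpu == NO_CPU:
+ *
+ *	struct npsf_budgets b[4] = {
+ *		{ .cpu = 0,      .budget = 2000000 },
+ *		{ .cpu = 1,      .budget = 1000000 },
+ *		{ .cpu = NO_CPU, .budget = 0 },
+ *		{ .cpu = NO_CPU, .budget = 0 },
+ *	};
+ *	int id = 3;
+ *	add_server(&id, b, is_last_server);
+ *
+ * This creates one server (npsf_id 3) that migrates between CPU 0 and
+ * CPU 1, with per-slot budgets of 2 ms and 1 ms respectively.
+ */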
3347 +
3348 +
3349 +/* Cancel the reserve_switch_tick() hrtimers and wait for all callbacks
3350 + * to complete. This function is triggered by writing 0 as npsf_slot_length.
3351 + */
3352 +void npsf_hrtimers_cleanup(void)
3353 +{
3354 + int cpu;
3355 + cpu_entry_t *entry;
3356 + int redo;
3357 +
3358 + if (!atomic_read(&timers_activated))
3359 + return;
3360 +
3361 + atomic_set(&timers_activated, 0);
3362 +
3363 + /* prevent the firing of the timer on this cpu */
3364 + do {
3365 + redo = 0;
3366 + for_each_online_cpu(cpu) {
3367 + entry = &per_cpu(npsf_cpu_entries, cpu);
3368 +
3369 + /* if callback active, skip it for now and redo later */
3370 + if (hrtimer_try_to_cancel(&entry->timer) == -1) {
3371 + redo = 1;
3372 +#ifdef NPSF_VERBOSE
3373 + printk(KERN_INFO "(P%d) hrtimer on P%d was "
3374 + "active, try to delete again\n",
3375 + get_cpu(), cpu);
3376 + put_cpu();
3377 +#endif
3378 + }
3379 + }
3380 + } while (redo);
3381 +
3382 + printk(KERN_INFO "npsf hrtimers deleted\n");
3383 +}
3384 +
3385 +static void cleanup_npsf(void)
3386 +{
3387 + int cpu;
3388 + cpu_entry_t *entry;
3389 + struct list_head *nd, *next;
3390 + npsf_reserve_t *tmp, *tmp_save;
3391 +
3392 + for_each_online_cpu(cpu) {
3393 + entry = &per_cpu(npsf_cpu_entries, cpu);
3394 +
3395 + /* FIXME probably not needed as we should be the only cpu
3396 + * doing the removal */
3397 + raw_spin_lock(&entry->cpu_res_lock);
3398 +
3399 + list_for_each_safe(nd, next, &entry->npsf_reserves) {
3400 + tmp = list_entry(nd, npsf_reserve_t, node);
3401 + npsf_printk("Del. (id, cpu):(%d, %d)\n",
3402 + tmp->npsf_id,
3403 + tmp->cpu->cpu);
3404 + if (tmp->server) {
3405 + npsf_printk("Del. reserves for npsf_id %d\n",
3406 + tmp->npsf_id);
3407 + tmp_save = tmp;
3408 + while (tmp_save->next_npsf &&
3409 + tmp_save->next_npsf != tmp) {
3410 + tmp_save = tmp_save->next_npsf;
3411 + tmp_save->server = NULL;
3412 + }
3413 + npsf_printk("Freeing server 0x%p\n", tmp->server);
3414 + kfree(tmp->server);
3415 + }
3416 + npsf_printk("Freeing npsf_reserve_t 0x%p\n", tmp);
3417 + kfree(tmp);
3418 + }
3419 + list_del(&entry->npsf_reserves);
3420 + raw_spin_unlock(&entry->cpu_res_lock);
3421 + }
3422 +}
3423 +
3424 +/* prevent plugin deactivation if timers are still active */
3425 +static long npsf_deactivate_plugin(void)
3426 +{
3427 + return (atomic_read(&timers_activated)) ? -1 : 0;
3428 +}
3429 +
3430 +static long npsf_activate_plugin(void)
3431 +{
3432 + int cpu;
3433 + cpu_entry_t *entry;
3434 + ktime_t now = ktime_get();
3435 +
3436 + /* prevent plugin switching if timers are active */
3437 + if (atomic_read(&timers_activated))
3438 + return -1;
3439 +
3440 + atomic_set(&all_servers_added, 0);
3441 +
3442 + /* de-allocate old servers (if any) */
3443 + if (reserves_allocated)
3444 + cleanup_npsf();
3445 +
3446 + _online_cpus = num_online_cpus();
3447 +
3448 + for_each_online_cpu(cpu) {
3449 + entry = &per_cpu(npsf_cpu_entries, cpu);
3450 +
3451 + raw_spin_lock_init(&entry->cpu_res_lock);
3452 +
3453 + entry->cpu_reserve = NULL;
3454 + INIT_LIST_HEAD(&entry->npsf_reserves);
3455 +
3456 + entry->cpu = cpu;
3457 + hrtimer_init(&entry->timer, CLOCK_MONOTONIC,
3458 + HRTIMER_MODE_ABS_PINNED);
3459 +
3460 + /* initialize (reinitialize) pull timers */
3461 + hrtimer_start_on_info_init(&entry->info);
3462 +
3463 + entry->timer.function = reserve_switch_tick;
3464 + entry->last_seen_npsf_id = -1;
3465 + }
3466 +
3467 + printk(KERN_INFO "NPS-F activated: slot length = %lld ns\n",
3468 + npsf_slot_length);
3469 +
3470 + /* time starts now! */
3471 + time_origin = (lt_t) ktime_to_ns(now);
3472 + TRACE("Time_origin = %llu\n", time_origin);
3473 + return 0;
3474 +}
3475 +
3476 +/* Plugin object */
3477 +static struct sched_plugin npsf_plugin __cacheline_aligned_in_smp = {
3478 + .plugin_name = "NPS-F",
3479 +
3480 + .tick = npsf_scheduler_tick,
3481 + .task_new = npsf_task_new,
3482 + .complete_job = complete_job,
3483 + .task_exit = npsf_task_exit,
3484 + .schedule = npsf_schedule,
3485 + .task_wake_up = npsf_task_wake_up,
3486 + .task_block = npsf_task_block,
3487 + .admit_task = npsf_admit_task,
3488 + .activate_plugin = npsf_activate_plugin,
3489 + .deactivate_plugin = npsf_deactivate_plugin,
3490 +};
3491 +
3492 +static int __init init_npsf(void)
3493 +{
3494 + return register_sched_plugin(&npsf_plugin);
3495 +}
3496 +
3497 +static void __exit exit_npsf(void)
3498 +{
3499 + if (atomic_read(&timers_activated)) {
3500 + atomic_set(&timers_activated, 0);
3501 + return;
3502 + }
3503 +
3504 + if (reserves_allocated)
3505 + cleanup_npsf();
3506 +}
3507 +
3508 +module_init(init_npsf);
3509 +module_exit(exit_npsf);
3510 +
3511 diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
3512 index 3543b7baff53..3036df9b12e3 100644
3513 --- a/litmus/sched_plugin.c
3514 +++ b/litmus/sched_plugin.c
3515 @@ -178,6 +178,12 @@ struct sched_plugin linux_sched_plugin = {
3516 */
3517 int cluster_cache_index = 2;
3518
3519 +/*
3520 + * Slot length (in ns) for NPS-F semi-partitioned plugin.
3521 + * This value can be changed at "runtime" through proc file.
3522 + */
3523 +lt_t npsf_slot_length = 5 * NSEC_PER_MSEC;
3524 +
3525 /*
3526 * The reference to current plugin that is used to schedule tasks within
3527 * the system. It stores references to actual function implementations