Attachment 'gpusync_rtss13_liblitmus.patch'
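Most of the non-GPU churn in the patch is a port of the example programs (base_task.c, base_mt_task.c, rt_launch.c, rtspin.c) from hand-rolled NS_PER_MS arithmetic and memset()-based parameter setup to the init_rt_task_param()/ms2ns() interface, plus cluster-aware migration calls. As a rough orientation aid, here is a minimal sketch of the setup pattern the patch converts callers to, modeled on the converted examples in the diff below; become_rt_task() is a hypothetical helper, and locking, clustering, budget signals, and most error checking are omitted:

    #include "litmus.h"   /* LITMUS^RT user-space API used throughout the patch */

    static int become_rt_task(void)
    {
        struct rt_task param;

        init_rt_task_param(&param);           /* replaces memset(&param, 0, sizeof(param)) */
        param.exec_cost = ms2ns(10);          /* ms2ns() replaces manual NS_PER_MS math */
        param.period = ms2ns(100);
        param.relative_deadline = ms2ns(100);
        param.budget_policy = NO_ENFORCEMENT;

        if (set_rt_task_param(gettid(), &param) < 0)
            return -1;
        init_litmus();
        return task_mode(LITMUS_RT_TASK);     /* 0 on success: now a real-time task */
    }

The full patch follows.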


   1 From 8cc1caa8190c0dcb95d690d43f73d3fea867d377 Mon Sep 17 00:00:00 2001
   2 From: Glenn Elliott <gelliott@cs.unc.edu>
   3 Date: Sun, 19 May 2013 23:35:28 -0400
   4 Subject: [PATCH] squash
   5 
   6 ---
   7  Makefile            |  127 ++-
   8  bin/base_mt_task.c  |   38 +-
   9  bin/base_task.c     |   38 +-
  10  bin/null_call.c     |    4 +-
  11  bin/release_ts.c    |   37 +-
  12  bin/rt_launch.c     |   62 +-
  13  bin/rtspin.c        |  132 ++-
  14  bin/uncache.c       |  381 ++++++++
  15  gpu/aux_threads.c   |  313 ++++++
  16  gpu/budget.cpp      |  379 ++++++++
  17  gpu/dgl.c           |  282 ++++++
  18  gpu/gpuspin.cu      | 2705 +++++++++++++++++++++++++++++++++++++++++++++++++++
  19  gpu/ikglptest.c     |  653 +++++++++++++
  20  gpu/locktest.c      |  206 ++++
  21  gpu/nested.c        |  262 +++++
  22  gpu/normal_task.c   |   90 ++
  23  include/common.h    |    7 +
  24  include/litmus.h    |  322 +++++-
  25  include/migration.h |   24 +
  26  include/tests.h     |    7 +-
  27  src/kernel_iface.c  |   17 +-
  28  src/litmus.c        |  270 ++++-
  29  src/migration.c     |  217 +++++
  30  src/signal.c        |  109 +++
  31  src/syscalls.c      |   82 +-
  32  src/task.c          |   24 +-
  33  tests/core_api.c    |    9 +-
  34  tests/fdso.c        |   10 +-
  35  tests/locks.c       |   12 +-
  36  tests/nesting.c     |  468 +++++++++
  37  tests/pcp.c         |  224 ++++-
  38  tests/sched.c       |   15 +-
  39  32 files changed, 7264 insertions(+), 262 deletions(-)
  40  create mode 100644 bin/uncache.c
  41  create mode 100644 gpu/aux_threads.c
  42  create mode 100644 gpu/budget.cpp
  43  create mode 100644 gpu/dgl.c
  44  create mode 100644 gpu/gpuspin.cu
  45  create mode 100644 gpu/ikglptest.c
  46  create mode 100644 gpu/locktest.c
  47  create mode 100644 gpu/nested.c
  48  create mode 100644 gpu/normal_task.c
  49  create mode 100644 include/migration.h
  50  create mode 100644 src/migration.c
  51  create mode 100644 src/signal.c
  52  create mode 100644 tests/nesting.c
  53 
  54 diff --git a/Makefile b/Makefile
  55 index 8195752..e877ca4 100644
  56 --- a/Makefile
  57 +++ b/Makefile
  58 @@ -14,13 +14,29 @@ ARCH ?= ${host-arch}
  59  # LITMUS_KERNEL -- where to find the litmus kernel?
  60  LITMUS_KERNEL ?= ../litmus-rt
  61  
  62 +# NUMA Support. Comment out to disable. Requires libnuma dev files.
  63 +#
  64 +# Enabling this option will ensure all memory resides on NUMA nodes
  65 +# that overlap clusters/partitions specified by a call to be_migrate*().
  66 +NUMA_SUPPORT = dummyval
  67  
  68  # ##############################################################################
  69  # Internal configuration.
  70  
  71  # compiler flags
  72 -flags-debug    = -Wall -Werror -g -Wdeclaration-after-statement
  73 +flags-debug    = -O2 -Wall -Werror -g -Wdeclaration-after-statement
  74 +#flags-debug    = -Wall -Werror -g -Wdeclaration-after-statement
  75 +flags-debug-cpp    = -O2 -Wall -Werror -g
  76 +#flags-debug-cpp    = -Wall -Werror -g
  77  flags-api      = -D_XOPEN_SOURCE=600 -D_GNU_SOURCE
  78 +flags-misc     = -fasynchronous-unwind-tables -fnon-call-exceptions
  79 +
  80 +flags-cu-debug = -g -G -Xcompiler -Wall -Xcompiler -Werror
  81 +flags-cu-optim = -O2 -Xcompiler -march=native
  82 +#flags-cu-optim = -Xcompiler -march=native
  83 +flags-cu-nvcc = --use_fast_math -gencode arch=compute_20,code=sm_20 -gencode arch=compute_30,code=sm_30
  84 +flags-cu-misc  = -Xcompiler -fasynchronous-unwind-tables -Xcompiler -fnon-call-exceptions -Xcompiler -malign-double -Xcompiler -pthread
  85 +flags-cu-x86_64 = -m64
  86  
  87  # architecture-specific flags
  88  flags-i386     = -m32
  89 @@ -48,12 +64,27 @@ LIBLITMUS ?= .
  90  headers = -I${LIBLITMUS}/include -I${LIBLITMUS}/arch/${include-${ARCH}}/include
  91  
  92  # combine options
  93 -CPPFLAGS = ${flags-api} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
  94 -CFLAGS   = ${flags-debug}
  95 +CPPFLAGS = ${flags-api} ${flags-debug-cpp} ${flags-misc} ${flags-${ARCH}} -DARCH=${ARCH} ${headers}
  96 +CUFLAGS  = ${flags-api} ${flags-cu-debug} ${flags-cu-optim} ${flags-cu-nvcc} ${flags-cu-misc} -DARCH=${ARCH} ${headers}
  97 +CFLAGS   = ${flags-debug} ${flags-misc}
  98  LDFLAGS  = ${flags-${ARCH}}
  99  
 100 +ifdef NUMA_SUPPORT
 101 +CFLAGS += -DLITMUS_NUMA_SUPPORT
 102 +CPPFLAGS += -DLITMUS_NUMA_SUPPORT
 103 +CUFLAGS += -DLITMUS_NUMA_SUPPORT
 104 +endif
 105 +
 106  # how to link against liblitmus
 107  liblitmus-flags = -L${LIBLITMUS} -llitmus
 108 +ifdef NUMA_SUPPORT
 109 +liblitmus-flags += -lnuma
 110 +endif
 111 +
 112 +# how to link cuda
 113 +cuda-flags-i386 = -L/usr/local/cuda/lib
 114 +cuda-flags-x86_64 = -L/usr/local/cuda/lib64
 115 +cuda-flags = ${cuda-flags-${ARCH}} -lcudart -lcuda
 116  
 117  # Force gcc instead of cc, but let the user specify a more specific version if
 118  # desired.
 119 @@ -61,17 +92,28 @@ ifeq (${CC},cc)
 120  CC = gcc
 121  endif
 122  
 123 +#ifeq (${CPP},cpp)
 124 +CPP = g++
 125 +#endif
 126 +
 127 +CU = nvcc
 128 +
 129  # incorporate cross-compiler (if any)
 130  CC  := ${CROSS_COMPILE}${CC}
 131 +CPP  := ${CROSS_COMPILE}${CPP}
 132  LD  := ${CROSS_COMPILE}${LD}
 133  AR  := ${CROSS_COMPILE}${AR}
 134 +CU  := ${CROSS_COMPILE}${CU}
 135  
 136  # ##############################################################################
 137  # Targets
 138  
 139 -all     = lib ${rt-apps}
 140 +all     = lib ${rt-apps} ${rt-cppapps} ${rt-cuapps}
 141  rt-apps = cycles base_task rt_launch rtspin release_ts measure_syscall \
 142 -	  base_mt_task runtests
 143 +	  base_mt_task uncache runtests \
 144 +	  nested locktest ikglptest dgl aux_threads normal_task
 145 +rt-cppapps = budget
 146 +rt-cuapps = gpuspin
 147  
 148  .PHONY: all lib clean dump-config TAGS tags cscope help
 149  
 150 @@ -86,10 +128,14 @@ inc/config.makefile: Makefile
 151  	@printf "%-15s= %-20s\n" \
 152  		ARCH ${ARCH} \
 153  		CFLAGS '${CFLAGS}' \
 154 +		CPPFLAGS '${CPPFLAGS}' \
 155 +		CUFLAGS '${CUFLAGS}' \
 156  		LDFLAGS '${LDFLAGS}' \
 157  		LDLIBS '${liblitmus-flags}' \
 158  		CPPFLAGS '${CPPFLAGS}' \
 159  		CC '${shell which ${CC}}' \
 160 +		CPP '${shell which ${CPP}}' \
 161 +		CU '${shell which ${CU}}' \
 162  		LD '${shell which ${LD}}' \
 163  		AR '${shell which ${AR}}' \
 164  	> $@
 165 @@ -103,10 +149,12 @@ dump-config:
 166  		headers "${headers}" \
 167  		"kernel headers" "${imported-headers}" \
 168  		CFLAGS "${CFLAGS}" \
 169 -		LDFLAGS "${LDFLAGS}" \
 170  		CPPFLAGS "${CPPFLAGS}" \
 171 +		CUFLAGS "${CUFLAGS}" \
 172 +		LDFLAGS "${LDFLAGS}" \
 173  		CC "${CC}" \
 174  		CPP "${CPP}" \
 175 +		CU "${CU}" \
 176  		LD "${LD}" \
 177  		AR "${AR}" \
 178  		obj-all "${obj-all}"
 179 @@ -115,7 +163,7 @@ help:
 180  	@cat INSTALL
 181  
 182  clean:
 183 -	rm -f ${rt-apps}
 184 +	rm -f ${rt-apps} ${rt-cppapps} ${rt-cuapps}
 185  	rm -f *.o *.d *.a test_catalog.inc
 186  	rm -f ${imported-headers}
 187  	rm -f inc/config.makefile
 188 @@ -156,6 +204,8 @@ arch/${include-${ARCH}}/include/asm/%.h: \
 189  litmus-headers = \
 190  	include/litmus/rt_param.h \
 191  	include/litmus/fpmath.h \
 192 +	include/litmus/binheap.h \
 193 +	include/litmus/signal.h \
 194  	include/litmus/unistd_32.h \
 195  	include/litmus/unistd_64.h
 196  
 197 @@ -201,7 +251,7 @@ tests/runner.c: test_catalog.inc
 198  # Tools that link with liblitmus
 199  
 200  # these source files are found in bin/
 201 -vpath %.c bin/
 202 +vpath %.c bin/ gpu/
 203  
 204  obj-cycles = cycles.o
 205  
 206 @@ -210,16 +260,49 @@ obj-base_task = base_task.o
 207  obj-base_mt_task = base_mt_task.o
 208  ldf-base_mt_task = -pthread
 209  
 210 +obj-aux_threads = aux_threads.o
 211 +ldf-aux_threads = -pthread
 212 +
 213  obj-rt_launch = rt_launch.o common.o
 214  
 215  obj-rtspin = rtspin.o common.o
 216  lib-rtspin = -lrt
 217  
 218 +obj-nested = nested.o common.o
 219 +lib-nested = -lrt -pthread
 220 +
 221 +obj-locktest = locktest.o common.o
 222 +lib-locktest = -lrt -pthread
 223 +
 224 +obj-ikglptest = ikglptest.o common.o
 225 +lib-ikglptest = -lrt -pthread -lm
 226 +
 227 +obj-normal_task = normal_task.o common.o
 228 +lib-normal_task = -lrt -pthread -lm
 229 +
 230 +obj-dgl = dgl.o common.o
 231 +lib-dgl = -lrt -pthread
 232 +
 233 +obj-uncache = uncache.o
 234 +lib-uncache = -lrt
 235 +
 236  obj-release_ts = release_ts.o
 237  
 238  obj-measure_syscall = null_call.o
 239  lib-measure_syscall = -lm
 240  
 241 +
 242 +vpath %.cpp gpu/
 243 +
 244 +objcpp-budget = budget.o common.o
 245 +lib-budget = -lrt -lm -pthread
 246 +
 247 +
 248 +vpath %.cu gpu/
 249 +
 250 +objcu-gpuspin = gpuspin.o common.o
 251 +lib-gpuspin = -lblitz -lrt -lm -lpthread -lboost_filesystem -lboost_system
 252 +
 253  # ##############################################################################
 254  # Build everything that depends on liblitmus.
 255  
 256 @@ -227,12 +310,22 @@ lib-measure_syscall = -lm
 257  ${rt-apps}: $${obj-$$@} liblitmus.a
 258  	$(CC) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@}
 259  
 260 +${rt-cppapps}: $${objcpp-$$@} liblitmus.a
 261 +	$(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${lib-$@}
 262 +
 263 +${rt-cuapps}: $${objcu-$$@} liblitmus.a
 264 +	$(CPP) -o $@ $(LDFLAGS) ${ldf-$@} $(filter-out liblitmus.a,$+) $(LOADLIBS) $(LDLIBS) ${liblitmus-flags} ${cuda-flags} ${lib-$@}
 265 +
 266  # ##############################################################################
 267  # Dependency resolution.
 268  
 269 -vpath %.c bin/ src/ tests/
 270 +vpath %.c bin/ src/ gpu/ tests/
 271 +vpath %.cpp gpu/
 272 +vpath %.cu gpu/
 273  
 274  obj-all = ${sort ${foreach target,${all},${obj-${target}}}}
 275 +obj-all += ${sort ${foreach target,${all},${objcpp-${target}}}}
 276 +obj-all += ${sort ${foreach target,${all},${objcu-${target}}}}
 277  
 278  # rule to generate dependency files
 279  %.d: %.c ${imported-headers}
 280 @@ -241,6 +334,22 @@ obj-all = ${sort ${foreach target,${all},${obj-${target}}}}
 281  		sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
 282  		rm -f $@.$$$$
 283  
 284 +%.d: %.cpp ${imported-headers}
 285 +	@set -e; rm -f $@; \
 286 +		$(CPP) -MM $(CPPFLAGS) $< > $@.$$$$; \
 287 +		sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
 288 +		rm -f $@.$$$$
 289 +
 290 +%.d: %.cu ${imported-headers}
 291 +	@set -e; rm -f $@; \
 292 +		$(CU) --generate-dependencies $(CUFLAGS) $< > $@.$$$$; \
 293 +		sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
 294 +		rm -f $@.$$$$
 295 +
 296 +# teach make how to compile .cu files
 297 +%.o: %.cu
 298 +	$(CU) --compile $(CUFLAGS) $(OUTPUT_OPTION) $<
 299 +
 300  ifeq ($(MAKECMDGOALS),)
 301  MAKECMDGOALS += all
 302  endif
 303 diff --git a/bin/base_mt_task.c b/bin/base_mt_task.c
 304 index 8090cc3..1406b20 100644
 305 --- a/bin/base_mt_task.c
 306 +++ b/bin/base_mt_task.c
 307 @@ -1,4 +1,4 @@
 308 -/* based_mt_task.c -- A basic multi-threaded real-time task skeleton. 
 309 +/* based_mt_task.c -- A basic multi-threaded real-time task skeleton.
 310   *
 311   * This (by itself useless) task demos how to setup a multi-threaded LITMUS^RT
 312   * real-time task. Familiarity with the single threaded example (base_task.c)
 313 @@ -26,12 +26,10 @@
 314  #define RELATIVE_DEADLINE 100
 315  #define EXEC_COST         10
 316  
 317 -#define NS_PER_MS         1e6
 318 -
 319  /* Let's create 10 threads in the example, 
 320   * for a total utilization of 1.
 321   */
 322 -#define NUM_THREADS      10 
 323 +#define NUM_THREADS      10
 324  
 325  /* The information passed to each thread. Could be anything. */
 326  struct thread_context {
 327 @@ -43,7 +41,7 @@ struct thread_context {
 328   */
 329  void* rt_thread(void *tcontext);
 330  
 331 -/* Declare the periodically invoked job. 
 332 +/* Declare the periodically invoked job.
 333   * Returns 1 -> task should exit.
 334   *         0 -> task should continue.
 335   */
 336 @@ -62,7 +60,7 @@ int job(void);
 337  	} while (0)
 338  
 339  
 340 -/* Basic setup is the same as in the single-threaded example. However, 
 341 +/* Basic setup is the same as in the single-threaded example. However,
 342   * we do some thread initiliazation first before invoking the job.
 343   */
 344  int main(int argc, char** argv)
 345 @@ -71,7 +69,7 @@ int main(int argc, char** argv)
 346  	struct thread_context ctx[NUM_THREADS];
 347  	pthread_t             task[NUM_THREADS];
 348  
 349 -	/* The task is in background mode upon startup. */		
 350 +	/* The task is in background mode upon startup. */
 351  
 352  
 353  	/*****
 354 @@ -79,7 +77,7 @@ int main(int argc, char** argv)
 355  	 */
 356  
 357  
 358 -       
 359 +
 360  	/*****
 361  	 * 2) Work environment (e.g., global data structures, file data, etc.) would
 362  	 *    be setup here.
 363 @@ -94,7 +92,7 @@ int main(int argc, char** argv)
 364  	init_litmus();
 365  
 366  
 367 -	/***** 
 368 +	/*****
 369  	 * 4) Launch threads.
 370  	 */
 371  	for (i = 0; i < NUM_THREADS; i++) {
 372 @@ -102,15 +100,15 @@ int main(int argc, char** argv)
 373  		pthread_create(task + i, NULL, rt_thread, (void *) (ctx + i));
 374  	}
 375  
 376 -	
 377 +
 378  	/*****
 379  	 * 5) Wait for RT threads to terminate.
 380  	 */
 381  	for (i = 0; i < NUM_THREADS; i++)
 382  		pthread_join(task[i], NULL);
 383 -	
 384  
 385 -	/***** 
 386 +
 387 +	/*****
 388  	 * 6) Clean up, maybe print results and stats, and exit.
 389  	 */
 390  	return 0;
 391 @@ -129,10 +127,10 @@ void* rt_thread(void *tcontext)
 392  	struct rt_task param;
 393  
 394  	/* Set up task parameters */
 395 -	memset(&param, 0, sizeof(param));
 396 -	param.exec_cost = EXEC_COST * NS_PER_MS;
 397 -	param.period = PERIOD * NS_PER_MS;
 398 -	param.relative_deadline = RELATIVE_DEADLINE * NS_PER_MS;
 399 +	init_rt_task_param(&param);
 400 +	param.exec_cost = ms2ns(EXEC_COST);
 401 +	param.period = ms2ns(PERIOD);
 402 +	param.relative_deadline = ms2ns(RELATIVE_DEADLINE);
 403  
 404  	/* What to do in the case of budget overruns? */
 405  	param.budget_policy = NO_ENFORCEMENT;
 406 @@ -166,7 +164,7 @@ void* rt_thread(void *tcontext)
 407  	 */
 408  	CALL( task_mode(LITMUS_RT_TASK) );
 409  
 410 -	/* The task is now executing as a real-time task if the call didn't fail. 
 411 +	/* The task is now executing as a real-time task if the call didn't fail.
 412  	 */
 413  
 414  
 415 @@ -178,11 +176,11 @@ void* rt_thread(void *tcontext)
 416  		/* Wait until the next job is released. */
 417  		sleep_next_period();
 418  		/* Invoke job. */
 419 -		do_exit = job();		
 420 +		do_exit = job();
 421  	} while (!do_exit);
 422  
 423  
 424 -	
 425 +
 426  	/*****
 427  	 * 4) Transition to background mode.
 428  	 */
 429 @@ -194,7 +192,7 @@ void* rt_thread(void *tcontext)
 430  
 431  
 432  
 433 -int job(void) 
 434 +int job(void)
 435  {
 436  	/* Do real-time calculation. */
 437  
 438 diff --git a/bin/base_task.c b/bin/base_task.c
 439 index df0c5a2..0274c89 100644
 440 --- a/bin/base_task.c
 441 +++ b/bin/base_task.c
 442 @@ -1,6 +1,6 @@
 443 -/* based_task.c -- A basic real-time task skeleton. 
 444 +/* based_task.c -- A basic real-time task skeleton.
 445   *
 446 - * This (by itself useless) task demos how to setup a 
 447 + * This (by itself useless) task demos how to setup a
 448   * single-threaded LITMUS^RT real-time task.
 449   */
 450  
 451 @@ -20,7 +20,7 @@
 452   */
 453  #include "litmus.h"
 454  
 455 -/* Next, we define period and execution cost to be constant. 
 456 +/* Next, we define period and execution cost to be constant.
 457   * These are only constants for convenience in this example, they can be
 458   * determined at run time, e.g., from command line parameters.
 459   *
 460 @@ -30,8 +30,6 @@
 461  #define RELATIVE_DEADLINE 100
 462  #define EXEC_COST         10
 463  
 464 -#define NS_PER_MS         1e6
 465 -
 466  /* Catch errors.
 467   */
 468  #define CALL( exp ) do { \
 469 @@ -44,13 +42,13 @@
 470  	} while (0)
 471  
 472  
 473 -/* Declare the periodically invoked job. 
 474 +/* Declare the periodically invoked job.
 475   * Returns 1 -> task should exit.
 476   *         0 -> task should continue.
 477   */
 478  int job(void);
 479  
 480 -/* typically, main() does a couple of things: 
 481 +/* typically, main() does a couple of things:
 482   * 	1) parse command line parameters, etc.
 483   *	2) Setup work environment.
 484   *	3) Setup real-time parameters.
 485 @@ -60,7 +58,7 @@ int job(void);
 486   *	7) Clean up and exit.
 487   *
 488   * The following main() function provides the basic skeleton of a single-threaded
 489 - * LITMUS^RT real-time task. In a real program, all the return values should be 
 490 + * LITMUS^RT real-time task. In a real program, all the return values should be
 491   * checked for errors.
 492   */
 493  int main(int argc, char** argv)
 494 @@ -69,10 +67,10 @@ int main(int argc, char** argv)
 495  	struct rt_task param;
 496  
 497  	/* Setup task parameters */
 498 -	memset(&param, 0, sizeof(param));
 499 -	param.exec_cost = EXEC_COST * NS_PER_MS;
 500 -	param.period = PERIOD * NS_PER_MS;
 501 -	param.relative_deadline = RELATIVE_DEADLINE * NS_PER_MS;
 502 +	init_rt_task_param(&param);
 503 +	param.exec_cost = ms2ns(EXEC_COST);
 504 +	param.period = ms2ns(PERIOD);
 505 +	param.relative_deadline = ms2ns(RELATIVE_DEADLINE);
 506  
 507  	/* What to do in the case of budget overruns? */
 508  	param.budget_policy = NO_ENFORCEMENT;
 509 @@ -100,9 +98,9 @@ int main(int argc, char** argv)
 510  
 511  
 512  	/*****
 513 -	 * 3) Setup real-time parameters. 
 514 -	 *    In this example, we create a sporadic task that does not specify a 
 515 -	 *    target partition (and thus is intended to run under global scheduling). 
 516 +	 * 3) Setup real-time parameters.
 517 +	 *    In this example, we create a sporadic task that does not specify a
 518 +	 *    target partition (and thus is intended to run under global scheduling).
 519  	 *    If this were to execute under a partitioned scheduler, it would be assigned
 520  	 *    to the first partition (since partitioning is performed offline).
 521  	 */
 522 @@ -124,7 +122,7 @@ int main(int argc, char** argv)
 523  	 */
 524  	CALL( task_mode(LITMUS_RT_TASK) );
 525  
 526 -	/* The task is now executing as a real-time task if the call didn't fail. 
 527 +	/* The task is now executing as a real-time task if the call didn't fail.
 528  	 */
 529  
 530  
 531 @@ -136,11 +134,11 @@ int main(int argc, char** argv)
 532  		/* Wait until the next job is released. */
 533  		sleep_next_period();
 534  		/* Invoke job. */
 535 -		do_exit = job();		
 536 +		do_exit = job();
 537  	} while (!do_exit);
 538  
 539  
 540 -	
 541 +
 542  	/*****
 543  	 * 6) Transition to background mode.
 544  	 */
 545 @@ -148,14 +146,14 @@ int main(int argc, char** argv)
 546  
 547  
 548  
 549 -	/***** 
 550 +	/*****
 551  	 * 7) Clean up, maybe print results and stats, and exit.
 552  	 */
 553  	return 0;
 554  }
 555  
 556  
 557 -int job(void) 
 558 +int job(void)
 559  {
 560  	/* Do real-time calculation. */
 561  
 562 diff --git a/bin/null_call.c b/bin/null_call.c
 563 index d714e77..bab8e73 100644
 564 --- a/bin/null_call.c
 565 +++ b/bin/null_call.c
 566 @@ -16,7 +16,7 @@ static void time_null_call(void)
 567  	t2 = get_cycles();
 568  	if (ret != 0)
 569  		perror("null_call");
 570 -	printf("%10" CYCLES_FMT ", " 
 571 +	printf("%10" CYCLES_FMT ", "
 572  	       "%10" CYCLES_FMT ", "
 573  	       "%10" CYCLES_FMT ", "
 574  	       "%10" CYCLES_FMT ", "
 575 @@ -38,7 +38,7 @@ int main(int argc, char **argv)
 576  {
 577  	double delay;
 578  	struct timespec sleep_time;
 579 -	
 580 +
 581  	if (argc == 2) {
 582  		delay = atof(argv[1]);
 583  		sleep_time = sec2timespec(delay);
 584 diff --git a/bin/release_ts.c b/bin/release_ts.c
 585 index 7752097..6a74710 100644
 586 --- a/bin/release_ts.c
 587 +++ b/bin/release_ts.c
 588 @@ -10,7 +10,6 @@
 589  #include "internal.h"
 590  
 591  #define OPTSTR "d:wf:"
 592 -#define NS_PER_MS 1000000
 593  
 594  #define LITMUS_STATS_FILE "/proc/litmus/stats"
 595  
 596 @@ -31,54 +30,34 @@ void usage(char *error) {
 597  void wait_until_ready(int expected)
 598  {
 599  	int ready = 0, all = 0;
 600 -	char buf[100];
 601  	int loops = 0;
 602 -	ssize_t len;
 603 -	
 604  
 605  	do {
 606  		if (loops++ > 0)
 607  			sleep(1);
 608 -		len = read_file(LITMUS_STATS_FILE, buf, sizeof(buf) - 1);
 609 -		if (len < 0) {
 610 -			fprintf(stderr,
 611 -				"(EE) Error while reading '%s': %m.\n"
 612 -				"(EE) Ignoring -w option.\n",
 613 -				LITMUS_STATS_FILE);
 614 -			break;
 615 -		} else {
 616 -			len = sscanf(buf,
 617 -				     "real-time tasks   = %d\n"
 618 -				     "ready for release = %d\n",
 619 -				     &all, &ready);
 620 -			if (len != 2) {
 621 -				fprintf(stderr, 
 622 -					"(EE) Could not parse '%s'.\n"
 623 -					"(EE) Ignoring -w option.\n",
 624 -					LITMUS_STATS_FILE);
 625 -				break;
 626 -			}
 627 -		}
 628 -	} while (expected > ready || ready < all);
 629 +		if (!read_litmus_stats(&ready, &all))
 630 +			perror("read_litmus_stats");
 631 +	} while (expected > ready || (!expected && ready < all));
 632  }
 633  
 634  int main(int argc, char** argv)
 635  {
 636  	int released;
 637 -	lt_t delay = ms2lt(1000);
 638 +	lt_t delay = ms2ns(1000);
 639  	int wait = 0;
 640  	int expected = 0;
 641  	int opt;
 642 -      
 643 +
 644  	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 645  		switch (opt) {
 646  		case 'd':
 647 -			delay = ms2lt(atoi(optarg));
 648 +			delay = ms2ns(atoi(optarg));
 649  			break;
 650  		case 'w':
 651  			wait = 1;
 652  			break;
 653  		case 'f':
 654 +			wait = 1;
 655  			expected = atoi(optarg);
 656  			break;
 657  		case ':':
 658 @@ -99,7 +78,7 @@ int main(int argc, char** argv)
 659  		perror("release task system");
 660  		exit(1);
 661  	}
 662 -	
 663 +
 664  	printf("Released %d real-time tasks.\n", released);
 665  
 666  	return 0;
 667 diff --git a/bin/rt_launch.c b/bin/rt_launch.c
 668 index 3863031..805e20b 100644
 669 --- a/bin/rt_launch.c
 670 +++ b/bin/rt_launch.c
 671 @@ -29,10 +29,11 @@ int launch(void *task_info_p) {
 672  }
 673  
 674  void usage(char *error) {
 675 -	fprintf(stderr, "%s\nUsage: rt_launch [-w][-v][-p cpu][-c hrt | srt | be] wcet period program [arg1 arg2 ...]\n"
 676 +	fprintf(stderr, "%s\nUsage: rt_launch [-w][-v][-p partition/cluster [-z cluster size]][-q prio][-c hrt | srt | be] wcet period program [arg1 arg2 ...]\n"
 677  			"\t-w\tSynchronous release\n"
 678  			"\t-v\tVerbose\n"
 679 -			"\t-p\tcpu (or initial cpu)\n"
 680 +			"\t-p\tpartition or cluster\n"
 681 +			"\t-z\tsize of cluster (default = 1 for partitioned)\n"
 682  			"\t-c\tClass\n"
 683  			"\twcet, period in ms\n"
 684  			"\tprogram to be launched\n",
 685 @@ -41,20 +42,24 @@ void usage(char *error) {
 686  }
 687  
 688  
 689 -#define OPTSTR "p:c:vw"
 690 +#define OPTSTR "p:z:c:vwq:t"
 691  
 692 -int main(int argc, char** argv) 
 693 +int main(int argc, char** argv)
 694  {
 695  	int ret;
 696  	lt_t wcet;
 697  	lt_t period;
 698  	int migrate = 0;
 699 -	int cpu = 0;
 700 +	int cluster = 0;
 701 +	int cluster_size = 1;
 702  	int opt;
 703  	int verbose = 0;
 704  	int wait = 0;
 705  	startup_info_t info;
 706 -	task_class_t class = RT_CLASS_HARD;
 707 +	task_class_t cls = RT_CLASS_HARD;
 708 +	unsigned int priority = LITMUS_LOWEST_PRIORITY;
 709 +	budget_policy_t budget_pol = QUANTUM_ENFORCEMENT;
 710 +	struct rt_task param;
 711  
 712  	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
 713  		switch (opt) {
 714 @@ -65,15 +70,26 @@ int main(int argc, char** argv)
 715  			verbose = 1;
 716  			break;
 717  		case 'p':
 718 -			cpu = atoi(optarg);
 719 +			cluster = atoi(optarg);
 720  			migrate = 1;
 721  			break;
 722 +		case 'z':
 723 +			cluster_size = atoi(optarg);
 724 +			break;
 725 +		case 'q':
 726 +			priority = atoi(optarg);
 727 +			if (!litmus_is_valid_fixed_prio(priority))
 728 +				usage("Invalid priority.");
 729 +			break;
 730  		case 'c':
 731 -			class = str2class(optarg);
 732 -			if (class == -1)
 733 +			cls = str2class(optarg);
 734 +			if (cls == -1)
 735  				usage("Unknown task class.");
 736  			break;
 737 -
 738 +		case 't':
 739 +			/* use an hrtimer for budget enforcement */
 740 +			budget_pol = PRECISE_ENFORCEMENT;
 741 +			break;
 742  		case ':':
 743  			usage("Argument missing.");
 744  			break;
 745 @@ -87,9 +103,9 @@ int main(int argc, char** argv)
 746  	signal(SIGUSR1, SIG_IGN);
 747  
 748  	if (argc - optind < 3)
 749 -		usage("Arguments missing.");       
 750 -	wcet   = ms2lt(atoi(argv[optind + 0]));
 751 -	period = ms2lt(atoi(argv[optind + 1]));
 752 +		usage("Arguments missing.");
 753 +	wcet   = ms2ns(atoi(argv[optind + 0]));
 754 +	period = ms2ns(atoi(argv[optind + 1]));
 755  	if (wcet <= 0)
 756  	usage("The worst-case execution time must be a "
 757  	      "positive number.");
 758 @@ -103,17 +119,27 @@ int main(int argc, char** argv)
 759  	info.argv      = argv + optind + 2;
 760  	info.wait      = wait;
 761  	if (migrate) {
 762 -		ret = be_migrate_to(cpu);
 763 +		ret = be_migrate_to_cluster(cluster, cluster_size);
 764  		if (ret < 0)
 765 -			bail_out("could not migrate to target partition");
 766 +			bail_out("could not migrate to target partition or cluster");
 767  	}
 768 -	ret = __create_rt_task(launch, &info, cpu, wcet, period, class);
 769  
 770 -	
 771 +	init_rt_task_param(&param);
 772 +	param.exec_cost = wcet;
 773 +	param.period = period;
 774 +	param.priority = priority;
 775 +	param.cls = cls;
 776 +	param.budget_policy = budget_pol;
 777 +
 778 +	if (migrate)
 779 +		param.cpu = cluster_to_first_cpu(cluster, cluster_size);
 780 +
 781 +	ret = create_rt_task(launch, &info, &param);
 782 +
 783  	if (ret < 0)
 784  		bail_out("could not create rt child process");
 785  	else if (verbose)
 786  		printf("%d\n", ret);
 787  
 788 -	return 0;	
 789 +	return 0;
 790  }
 791 diff --git a/bin/rtspin.c b/bin/rtspin.c
 792 index f0a477d..4a1d994 100644
 793 --- a/bin/rtspin.c
 794 +++ b/bin/rtspin.c
 795 @@ -4,6 +4,7 @@
 796  #include <stdlib.h>
 797  #include <unistd.h>
 798  #include <time.h>
 799 +#include <string.h>
 800  #include <assert.h>
 801  
 802  
 803 @@ -20,9 +21,12 @@ static void usage(char *error) {
 804  		"	rt_spin [COMMON-OPTS] -f FILE [-o COLUMN] WCET PERIOD\n"
 805  		"	rt_spin -l\n"
 806  		"\n"
 807 -		"COMMON-OPTS = [-w] [-p PARTITION] [-c CLASS] [-s SCALE]\n"
 808 +		"COMMON-OPTS = [-w] [-s SCALE]\n"
 809 +		"              [-p PARTITION/CLUSTER [-z CLUSTER SIZE]] [-c CLASS]\n"
 810 +		"              [-X LOCKING-PROTOCOL] [-L CRITICAL SECTION LENGTH] [-Q RESOURCE-ID]"
 811  		"\n"
 812 -		"WCET and PERIOD are milliseconds, DURATION is seconds.\n");
 813 +		"WCET and PERIOD are milliseconds, DURATION is seconds.\n"
 814 +		"CRITICAL SECTION LENGTH is in milliseconds.\n");
 815  	exit(EXIT_FAILURE);
 816  }
 817  
 818 @@ -67,7 +71,7 @@ static void get_exec_times(const char *file, const int column,
 819  		bail_out("rewinding file failed");
 820  
 821  	/* allocate space for exec times */
 822 -	*exec_times = calloc(*num_jobs, sizeof(*exec_times));
 823 +	*exec_times = (double*)calloc(*num_jobs, sizeof(*exec_times));
 824  	if (!*exec_times)
 825  		bail_out("couldn't allocate memory");
 826  
 827 @@ -77,7 +81,7 @@ static void get_exec_times(const char *file, const int column,
 828  
 829  		for (cur_col = 1; cur_col < column; ++cur_col) {
 830  			/* discard input until we get to the column we want */
 831 -			fscanf(fstream, "%*s,");
 832 +			int unused __attribute__ ((unused)) = fscanf(fstream, "%*s,");
 833  		}
 834  
 835  		/* get the desired exec. time */
 836 @@ -150,19 +154,37 @@ static void debug_delay_loop(void)
 837  	}
 838  }
 839  
 840 -static int job(double exec_time, double program_end)
 841 +static int job(double exec_time, double program_end, int lock_od, double cs_length)
 842  {
 843 +	double chunk1, chunk2;
 844 +
 845  	if (wctime() > program_end)
 846  		return 0;
 847  	else {
 848 -		loop_for(exec_time, program_end + 1);
 849 +		if (lock_od >= 0) {
 850 +			/* simulate critical section somewhere in the middle */
 851 +			chunk1 = drand48() * (exec_time - cs_length);
 852 +			chunk2 = exec_time - cs_length - chunk1;
 853 +
 854 +			/* non-critical section */
 855 +			loop_for(chunk1, program_end + 1);
 856 +
 857 +			/* critical section */
 858 +			litmus_lock(lock_od);
 859 +			loop_for(cs_length, program_end + 1);
 860 +			litmus_unlock(lock_od);
 861 +
 862 +			/* non-critical section */
 863 +			loop_for(chunk2, program_end + 2);
 864 +		} else {
 865 +			loop_for(exec_time, program_end + 1);
 866 +		}
 867  		sleep_next_period();
 868  		return 1;
 869  	}
 870  }
 871  
 872 -#define OPTSTR "p:c:wlveo:f:s:q:"
 873 -
 874 +#define OPTSTR "p:z:c:wlveio:f:s:q:X:L:Q:"
 875  int main(int argc, char** argv)
 876  {
 877  	int ret;
 878 @@ -171,18 +193,28 @@ int main(int argc, char** argv)
 879  	double wcet_ms, period_ms;
 880  	unsigned int priority = LITMUS_LOWEST_PRIORITY;
 881  	int migrate = 0;
 882 -	int cpu = 0;
 883 +	int cluster = 0;
 884 +	int cluster_size = 1;
 885  	int opt;
 886  	int wait = 0;
 887  	int test_loop = 0;
 888  	int column = 1;
 889  	const char *file = NULL;
 890  	int want_enforcement = 0;
 891 -	double duration = 0, start;
 892 +	int want_signals = 0;
 893 +	double duration = 0, start = 0;
 894  	double *exec_times = NULL;
 895  	double scale = 1.0;
 896 -	task_class_t class = RT_CLASS_HARD;
 897 -	int cur_job, num_jobs;
 898 +	task_class_t cls = RT_CLASS_HARD;
 899 +	int cur_job = 0, num_jobs = 0;
 900 +	struct rt_task param;
 901 +
 902 +	/* locking */
 903 +	int lock_od = -1;
 904 +	int resource_id = 0;
 905 +	const char *lock_namespace = "./rtspin-locks";
 906 +	int protocol = -1;
 907 +	double cs_length = 1; /* millisecond */
 908  
 909  	progname = argv[0];
 910  
 911 @@ -192,22 +224,28 @@ int main(int argc, char** argv)
 912  			wait = 1;
 913  			break;
 914  		case 'p':
 915 -			cpu = atoi(optarg);
 916 +			cluster = atoi(optarg);
 917  			migrate = 1;
 918  			break;
 919 +		case 'z':
 920 +			cluster_size = atoi(optarg);
 921 +			break;
 922  		case 'q':
 923  			priority = atoi(optarg);
 924  			if (!litmus_is_valid_fixed_prio(priority))
 925  				usage("Invalid priority.");
 926  			break;
 927  		case 'c':
 928 -			class = str2class(optarg);
 929 -			if (class == -1)
 930 +			cls = str2class(optarg);
 931 +			if (cls == -1)
 932  				usage("Unknown task class.");
 933  			break;
 934  		case 'e':
 935  			want_enforcement = 1;
 936  			break;
 937 +		case 'i':
 938 +			want_signals = 1;
 939 +			break;
 940  		case 'l':
 941  			test_loop = 1;
 942  			break;
 943 @@ -220,6 +258,21 @@ int main(int argc, char** argv)
 944  		case 's':
 945  			scale = atof(optarg);
 946  			break;
 947 +		case 'X':
 948 +			protocol = lock_protocol_for_name(optarg);
 949 +			if (protocol < 0)
 950 +				usage("Unknown locking protocol specified.");
 951 +			break;
 952 +		case 'L':
 953 +			cs_length = atof(optarg);
 954 +			if (cs_length <= 0)
 955 +				usage("Invalid critical section length.");
 956 +			break;
 957 +		case 'Q':
 958 +			resource_id = atoi(optarg);
 959 +			if (resource_id <= 0 && strcmp(optarg, "0"))
 960 +				usage("Invalid resource ID.");
 961 +			break;
 962  		case ':':
 963  			usage("Argument missing.");
 964  			break;
 965 @@ -235,6 +288,8 @@ int main(int argc, char** argv)
 966  		return 0;
 967  	}
 968  
 969 +	srand(getpid());
 970 +
 971  	if (file) {
 972  		get_exec_times(file, column, &num_jobs, &exec_times);
 973  
 974 @@ -257,8 +312,8 @@ int main(int argc, char** argv)
 975  	wcet_ms   = atof(argv[optind + 0]);
 976  	period_ms = atof(argv[optind + 1]);
 977  
 978 -	wcet   = wcet_ms * __NS_PER_MS;
 979 -	period = period_ms * __NS_PER_MS;
 980 +	wcet   = ms2ns(wcet_ms);
 981 +	period = ms2ns(period_ms);
 982  	if (wcet <= 0)
 983  		usage("The worst-case execution time must be a "
 984  				"positive number.");
 985 @@ -275,24 +330,47 @@ int main(int argc, char** argv)
 986  		duration += period_ms * 0.001 * (num_jobs - 1);
 987  
 988  	if (migrate) {
 989 -		ret = be_migrate_to(cpu);
 990 +		ret = be_migrate_to_cluster(cluster, cluster_size);
 991  		if (ret < 0)
 992 -			bail_out("could not migrate to target partition");
 993 +			bail_out("could not migrate to target partition or cluster.");
 994  	}
 995  
 996 -	ret = sporadic_task_ns(wcet, period, 0, cpu, priority, class,
 997 -			       want_enforcement ? PRECISE_ENFORCEMENT
 998 -			                        : NO_ENFORCEMENT,
 999 -			       migrate);
1000 +	init_rt_task_param(&param);
1001 +	param.exec_cost = wcet;
1002 +	param.period = period;
1003 +	param.priority = priority;
1004 +	param.cls = cls;
1005 +	param.budget_policy = (want_enforcement) ?
1006 +			PRECISE_ENFORCEMENT : NO_ENFORCEMENT;
1007 +	param.budget_signal_policy = (want_enforcement && want_signals) ?
1008 +			PRECISE_SIGNALS : NO_SIGNALS;
1009 +				
1010 +	if (migrate)
1011 +		param.cpu = cluster_to_first_cpu(cluster, cluster_size);
1012 +	ret = set_rt_task_param(gettid(), &param);
1013  	if (ret < 0)
1014  		bail_out("could not setup rt task params");
1015  
1016  	init_litmus();
1017  
1018 +	if (want_signals) {
1019 +		/* bind default longjmp signal handler to SIG_BUDGET. */
1020 +		activate_litmus_signals(SIG_BUDGET_MASK, longjmp_on_litmus_signal);
1021 +	}
1022 +
1023  	ret = task_mode(LITMUS_RT_TASK);
1024  	if (ret != 0)
1025  		bail_out("could not become RT task");
1026  
1027 +	if (protocol >= 0) {
1028 +		/* open reference to semaphore */
1029 +		lock_od = litmus_open_lock(protocol, resource_id, lock_namespace, &cluster);
1030 +		if (lock_od < 0) {
1031 +			perror("litmus_open_lock");
1032 +			usage("Could not open lock.");
1033 +		}
1034 +	}
1035 +
1036  	if (wait) {
1037  		ret = wait_for_ts_release();
1038  		if (ret != 0)
1039 @@ -306,11 +384,13 @@ int main(int argc, char** argv)
1040  		for (cur_job = 0; cur_job < num_jobs; ++cur_job) {
1041  			/* convert job's length to seconds */
1042  			job(exec_times[cur_job] * 0.001 * scale,
1043 -					start + duration);
1044 +			    start + duration,
1045 +			    lock_od, cs_length * 0.001);
1046  		}
1047  	} else {
1048 -		/* conver to seconds and scale */
1049 -		while (job(wcet_ms * 0.001 * scale, start + duration));
1050 +		/* convert to seconds and scale */
1051 +		while (job(wcet_ms * 0.001 * scale, start + duration,
1052 +			   lock_od, cs_length * 0.001));
1053  	}
1054  
1055  	ret = task_mode(BACKGROUND_TASK);
1056 diff --git a/bin/uncache.c b/bin/uncache.c
1057 new file mode 100644
1058 index 0000000..b6f6913
1059 --- /dev/null
1060 +++ b/bin/uncache.c
1061 @@ -0,0 +1,381 @@
1062 +#include <stdio.h>
1063 +#include <stdlib.h>
1064 +#include <unistd.h>
1065 +#include <time.h>
1066 +#include <sched.h>
1067 +#include <assert.h>
1068 +#include <string.h>
1069 +#include <stdint.h>
1070 +#include <sys/fcntl.h>
1071 +#include <sys/mman.h>
1072 +
1073 +/* Test tool for validating Litmus's uncache device.     */
 1074 +/* Tool also gathers basic cache vs. sysmem statistics.  */
 1075 +/* Compile with '-O2' for significantly greater margins  */
1076 +/* in performance between cache and sysmem:              */
1077 +/* (Intel Xeon X5650)                                    */
1078 +/*    -g -> uncache is 30x slower                        */
1079 +/*    -O2 -> uncache is >100x slower                     */
1080 +
1081 +int PAGE_SIZE;
1082 +#define NR_PAGES 16
1083 +
1084 +#define UNCACHE_DEV "/dev/litmus/uncache"
1085 +
1086 +/* volatile forces a read from memory (or cache) on every reference. Note
1087 +   that volatile does not keep data out of the cache! */
1088 +typedef volatile char* pbuf_t;
1089 +
1090 +/* hit the first byte in each page.
1091 +   addr must be page aligned. */
1092 +inline int linear_write(pbuf_t addr, int size, char val)
1093 +{
1094 +	pbuf_t end = addr + size;
1095 +	pbuf_t step;
1096 +	int nr_pages = (unsigned long)(end - addr)/PAGE_SIZE;
1097 +	int times = nr_pages * PAGE_SIZE;
1098 +	int i;
1099 +
1100 +	for (i = 0; i < times; ++i)
1101 +		for(step = addr; step < end; step += PAGE_SIZE)
1102 +			*step = val;
1103 +	return 0;
1104 +}
1105 +inline int linear_read(pbuf_t addr, int size, char val)
1106 +{
1107 +	pbuf_t end = addr + size;
1108 +	pbuf_t step;
1109 +	int nr_pages = (unsigned long)(end - addr)/PAGE_SIZE;
1110 +	int times = nr_pages * PAGE_SIZE;
1111 +	int i;
1112 +
1113 +	for (i = 0; i < times; ++i)
1114 +		for(step = addr; step < end; step += PAGE_SIZE) {
1115 +			if (*step != val)
1116 +				return -1;
1117 +		}
1118 +	return 0;
1119 +}
1120 +
1121 +/* write to *data nr times. */
1122 +inline int hammer_write(pbuf_t data, char val, int nr)
1123 +{
1124 +	int i;
1125 +	for (i = 0; i < nr; ++i)
1126 +		*data = val;
1127 +	return 0;
1128 +}
1129 +
1130 +/* read from *data nr times. */
1131 +inline int hammer_read(pbuf_t data, char val, int nr)
1132 +{
1133 +	int i;
1134 +	for (i = 0; i < nr; ++i) {
1135 +		if (*data != val)
1136 +			return -1;
1137 +	}
1138 +	return 0;
1139 +}
1140 +
1141 +inline int test(pbuf_t data, int size, int trials)
1142 +{
1143 +	int HAMMER_TIME = 10000;  /* can't cache this! */
1144 +	char VAL = 0x55;
1145 +	int t;
1146 +	for(t = 0; t < trials; ++t) {
1147 +
1148 +#if 0
1149 +		if (linear_write(data, size, VAL) != 0) {
1150 +			printf("failed linear_write()\n");
1151 +			return -1;
1152 +		}
1153 +		if (linear_read(data, size, VAL) != 0) {
1154 +			printf("failed linear_read()\n");
1155 +			return -1;
1156 +		}
1157 +#endif
1158 +
1159 +		/* hammer at the first byte in the array */
1160 +		if (hammer_write(data, VAL, HAMMER_TIME) != 0) {
1161 +			printf("failed hammer_write()\n");
1162 +			return -1;
1163 +		}
1164 +		if (hammer_read(data, VAL, HAMMER_TIME) != 0) {
1165 +			printf("failed hammer_read()\n");
1166 +			return -1;
1167 +		}
1168 +	}
1169 +	return 0;
1170 +}
1171 +
1172 +inline void timespec_normalize(struct timespec* ts, time_t sec, int64_t nsec)
1173 +{
1174 +	while(nsec > 1000000000LL) {
1175 +		asm("" : "+rm"(nsec));
1176 +		nsec -= 1000000000LL;
1177 +		++sec;
1178 +	}
1179 +	while(nsec < 0) {
1180 +		asm("" : "+rm"(nsec));
1181 +		nsec += 1000000000LL;
1182 +		--sec;
1183 +	}
1184 +
1185 +	ts->tv_sec = sec;
1186 +	ts->tv_nsec = nsec;
1187 +}
1188 +
1189 +inline struct timespec timespec_sub(struct timespec lhs, struct timespec rhs)
1190 +{
1191 +	struct timespec delta;
1192 +	timespec_normalize(&delta, lhs.tv_sec - rhs.tv_sec, lhs.tv_nsec - rhs.tv_nsec);
1193 +	return delta;
1194 +}
1195 +
1196 +inline struct timespec timespec_add(struct timespec lhs, struct timespec rhs)
1197 +{
1198 +	struct timespec delta;
1199 +	timespec_normalize(&delta, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec);
1200 +	return delta;
1201 +}
1202 +
1203 +inline int64_t timespec_to_us(struct timespec ts)
1204 +{
1205 +	int64_t t;
1206 +	t = ts.tv_sec * 1000000LL;
1207 +	t += ts.tv_nsec / 1000LL;
1208 +	return t;
1209 +}
1210 +
1211 +/* hammers away at the first byte in each mmaped page and
1212 +   times how long it took. */
1213 +int do_data(int do_uncache, int64_t* time)
1214 +{
1215 +	int size;
1216 +	int prot = PROT_READ | PROT_WRITE;
1217 +	int flags = MAP_PRIVATE;
1218 +
1219 +	pbuf_t data;
1220 +
1221 +	struct sched_param fifo_params;
1222 +
1223 +	struct timespec start, end;
1224 +	int64_t elapsed;
1225 +	int trials = 1000;
1226 +
1227 +	printf("Running data access test.\n");
1228 +
1229 +	mlockall(MCL_CURRENT | MCL_FUTURE);
1230 +
1231 +	memset(&fifo_params, 0, sizeof(fifo_params));
1232 +	fifo_params.sched_priority = sched_get_priority_max(SCHED_FIFO);
1233 +
1234 +	size = PAGE_SIZE*NR_PAGES;
1235 +
1236 +	printf("Allocating %d %s pages.\n", NR_PAGES, (do_uncache) ?
1237 +					"uncacheable" : "cacheable");
1238 +	if (do_uncache) {
1239 +		int fd = open(UNCACHE_DEV, O_RDWR);
1240 +		data = mmap(NULL, size, prot, flags, fd, 0);
1241 +		close(fd);
1242 +	}
1243 +	else {
1244 +		/* Accessed data will probably fit in L1, so this will go VERY fast.
1245 +		   Code should also have little-to-no pipeline stalls. */
1246 +		flags |= MAP_ANONYMOUS;
1247 +		data = mmap(NULL, size, prot, flags, -1, 0);
1248 +	}
1249 +	if (data == MAP_FAILED) {
1250 +		printf("Failed to alloc data! "
1251 +			   "Are you running Litmus? "
1252 +			   "Is Litmus broken?\n");
1253 +		return -1;
1254 +	}
1255 +	else {
1256 +		printf("Data allocated at %p.\n", data);
1257 +	}
1258 +
1259 +	printf("Beginning tests...\n");
1260 +	if (sched_setscheduler(getpid(), SCHED_FIFO, &fifo_params)) {
1261 +		printf("(Could not become SCHED_FIFO task.) Are you running as root?\n");
1262 +	}
1263 +
1264 +	/* observations suggest that no warmup phase is needed. */
1265 +	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
1266 +	if (test(data, size, trials) != 0) {
1267 +		printf("Test failed!\n");
1268 +		munmap((char*)data, size);
1269 +		return -1;
1270 +	}
1271 +	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
1272 +	elapsed = timespec_to_us(timespec_sub(end, start));
1273 +	printf("%s Time: %ldus\n", (do_uncache) ?
1274 +					"Uncache" : "Cache", elapsed);
1275 +
1276 +	munmap((char*)data, size);
1277 +
1278 +	if(time)
1279 +		*time = elapsed;
1280 +
1281 +	return 0;
1282 +}
1283 +
1284 +/* compares runtime of cached vs. uncached */
1285 +int do_data_compare()
1286 +{
1287 +	const double thresh = 1.3;
1288 +	int ret = 0;
1289 +	double ratio;
1290 +	int64_t cache_time = 0, uncache_time = 0;
1291 +
1292 +	printf("Timing cached pages...\n");
1293 +	ret = do_data(0, &cache_time);
1294 +	if (ret != 0)
1295 +		goto out;
1296 +
1297 +	printf("Timing uncached pages...\n");
1298 +	ret = do_data(1, &uncache_time);
1299 +	if (ret != 0)
1300 +		goto out;
1301 +
1302 +	ratio = (double)uncache_time/(double)cache_time;
1303 +	printf("Uncached/Cached Ratio: %f\n", ratio);
1304 +
1305 +	if (ratio < thresh) {
1306 +		printf("Ratio is unexpectedly small (< %f)! "
1307 +				" Uncache broken? Are you on kvm?\n", thresh);
1308 +		ret = -1;
1309 +	}
1310 +
1311 +out:
1312 +	return ret;
1313 +}
1314 +
1315 +/* tries to max out uncache allocations.
1316 +   under normal conditions (non-mlock),
1317 +   pages should spill into swap. uncache
1318 +   pages are not locked in memory. */
1319 +int do_max_alloc(void)
1320 +{
1321 +	int fd;
1322 +	int good = 1;
1323 +	int count = 0;
1324 +	uint64_t mmap_size = PAGE_SIZE; /* start at one page per mmap */
1325 +
1326 +	/* half of default limit on ubuntu. (see /proc/sys/vm/max_map_count) */
1327 +	int max_mmaps = 32765;
1328 +	volatile char** maps = calloc(max_mmaps, sizeof(pbuf_t));
1329 +
1330 +	if (!maps) {
1331 +		printf("failed to alloc pointers for pages\n");
1332 +		return -1;
1333 +	}
1334 +
 1335 +	printf("Testing max amount of uncache data. System may get wonky (OOM Killer)!\n");
1336 +
1337 +	fd = open(UNCACHE_DEV, O_RDWR);
1338 +	do {
1339 +		int i;
1340 +		int nr_pages = mmap_size/PAGE_SIZE;
1341 +		printf("Testing mmaps of %d pages.\n", nr_pages);
1342 +
1343 +		count = 0;
1344 +		for (i = 0; (i < max_mmaps) && good; ++i) {
1345 +			pbuf_t data = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_POPULATE, fd, 0);
1346 +
1347 +			if (data != MAP_FAILED) {
1348 +				maps[i] = data;
1349 +				++count;
1350 +			}
1351 +			else {
1352 +				perror(NULL);
1353 +				good = 0;
1354 +			}
1355 +		}
1356 +		for (i = 0; i < count; ++i) {
1357 +			if (maps[i])
1358 +				munmap((char*)(maps[i]), mmap_size);
1359 +		}
1360 +		memset(maps, 0, sizeof(maps[0])*max_mmaps);
1361 +
1362 +		mmap_size *= 2; /* let's do it again with bigger allocations */
1363 +	}while(good);
1364 +
1365 +	free(maps);
1366 +	close(fd);
1367 +
1368 +	printf("Maxed out allocs with %d mmaps of %lu pages in size.\n",
1369 +		count, mmap_size/PAGE_SIZE);
1370 +
1371 +	return 0;
1372 +}
1373 +
1374 +typedef enum
1375 +{
1376 +	UNCACHE,
1377 +	CACHE,
1378 +	COMPARE,
1379 +	MAX_ALLOC
1380 +} test_t;
1381 +
1382 +#define OPTSTR "ucxa"
1383 +int main(int argc, char** argv)
1384 +{
1385 +	int ret;
1386 +	test_t test = UNCACHE;
1387 +	int opt;
1388 +	PAGE_SIZE = sysconf(_SC_PAGE_SIZE);
1389 +
1390 +	while((opt = getopt(argc, argv, OPTSTR)) != -1) {
1391 +		switch(opt) {
1392 +			case 'c':
1393 +				test = CACHE;
1394 +				break;
1395 +			case 'u':
1396 +				test = UNCACHE;
1397 +				break;
1398 +			case 'x':
1399 +				test = COMPARE;
1400 +				break;
1401 +			case 'a':
1402 +				test = MAX_ALLOC;
1403 +				break;
1404 +			case ':':
1405 +				printf("missing option\n");
1406 +				exit(-1);
1407 +			case '?':
1408 +			default:
1409 +				printf("bad argument\n");
1410 +				exit(-1);
1411 +		}
1412 +	}
1413 +
1414 +
1415 +	printf("Page Size: %d\n", PAGE_SIZE);
1416 +
1417 +	switch(test)
1418 +	{
1419 +	case CACHE:
1420 +		ret = do_data(0, NULL);
1421 +		break;
1422 +	case UNCACHE:
1423 +		ret = do_data(1, NULL);
1424 +		break;
1425 +	case COMPARE:
1426 +		ret = do_data_compare();
1427 +		break;
1428 +	case MAX_ALLOC:
1429 +		ret = do_max_alloc();
1430 +		break;
1431 +	default:
1432 +		printf("invalid test\n");
1433 +		ret = -1;
1434 +		break;
1435 +	}
1436 +
1437 +	if (ret != 0) {
1438 +		printf("Test failed.\n");
1439 +	}
1440 +
1441 +	return ret;
1442 +}
1443 diff --git a/gpu/aux_threads.c b/gpu/aux_threads.c
1444 new file mode 100644
1445 index 0000000..1711c40
1446 --- /dev/null
1447 +++ b/gpu/aux_threads.c
1448 @@ -0,0 +1,313 @@
 1449 +/* aux_threads.c -- A basic multi-threaded real-time task skeleton.
1450 + *
1451 + * This (by itself useless) task demos how to setup a multi-threaded LITMUS^RT
1452 + * real-time task. Familiarity with the single threaded example (base_task.c)
1453 + * is assumed.
1454 + *
1455 + * Currently, liblitmus still lacks automated support for real-time
1456 + * tasks, but internaly it is thread-safe, and thus can be used together
1457 + * with pthreads.
1458 + */
1459 +
1460 +#include <stdio.h>
1461 +#include <stdlib.h>
1462 +#include <unistd.h>
1463 +
1464 +#include <fcntl.h>
1465 +#include <sys/stat.h>
1466 +#include <sys/time.h>
1467 +#include <sys/resource.h>
1468 +
1469 +/* Include gettid() */
1470 +#include <sys/types.h>
1471 +
1472 +/* Include threading support. */
1473 +#include <pthread.h>
1474 +
1475 +/* Include the LITMUS^RT API.*/
1476 +#include "litmus.h"
1477 +
1478 +//#define PERIOD		500
1479 +#define PERIOD		 10
1480 +//#define EXEC_COST	 10
1481 +#define EXEC_COST 1
1482 +
1483 +int NUM_AUX_THREADS = 2;
1484 +
1485 +#define LITMUS_STATS_FILE "/proc/litmus/stats"
1486 +
1487 +/* The information passed to each thread. Could be anything. */
1488 +struct thread_context {
1489 +	int id;
1490 +	struct timeval total_time;
1491 +};
1492 +
1493 +/* The real-time thread program. Doesn't have to be the same for
1494 + * all threads. Here, we only have one that will invoke job().
1495 + */
1496 +void* rt_thread(void *tcontext);
1497 +void* aux_thread(void *tcontext);
1498 +
1499 +/* Declare the periodically invoked job.
1500 + * Returns 1 -> task should exit.
1501 + *         0 -> task should continue.
1502 + */
1503 +int job(void);
1504 +
1505 +
1506 +/* Catch errors.
1507 + */
1508 +#define CALL( exp ) do { \
1509 +		int ret; \
1510 +		ret = exp; \
1511 +		if (ret != 0) \
1512 +			fprintf(stderr, "%s failed: %m\n", #exp);\
1513 +		else \
1514 +			fprintf(stderr, "%s ok.\n", #exp); \
1515 +	} while (0)
1516 +
1517 +int gRun = 1;
1518 +
1519 +pthread_mutex_t gMutex = PTHREAD_MUTEX_INITIALIZER;
1520 +pthread_barrier_t gBar;
1521 +
1522 +#define OPTSTR "t:fcb"
1523 +
1524 +int main(int argc, char** argv)
1525 +{
1526 +	int i;
1527 +	struct thread_context *ctx;
1528 +	pthread_t *task;
1529 +
1530 +	int opt;
1531 +	int before = 0;
1532 +	int aux_flags = 0;
1533 +	int do_future = 0;
1534 +
1535 +	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1536 +		switch(opt)
1537 +		{
1538 +		case 't':
1539 +			NUM_AUX_THREADS = atoi(optarg);
1540 +			printf("%d aux threads\n", NUM_AUX_THREADS);
1541 +			break;
1542 +		case 'f':
1543 +			aux_flags |= AUX_FUTURE;
1544 +			do_future = 1;
1545 +			break;
1546 +		case 'c':
1547 +			aux_flags |= AUX_CURRENT;
1548 +			break;
1549 +		case 'b':
1550 +			before = 1;
1551 +			printf("Will become real-time before spawning aux threads.\n");
1552 +			break;
1553 +		}
1554 +	}
1555 +
1556 +	if (aux_flags == 0) {
1557 +		printf("Must specify -c (AUX_CURRENT) and/or -f (AUX_FUTURE) for aux tasks.\n");
1558 +		return -1;
1559 +	}
1560 +
1561 +	ctx = calloc(NUM_AUX_THREADS, sizeof(struct thread_context));
1562 +	task = calloc(NUM_AUX_THREADS, sizeof(pthread_t));
1563 +
1564 +	//lt_t delay = ms2lt(1000);
1565 +
1566 +	/*****
1567 +	 * 3) Initialize LITMUS^RT.
1568 +	 *    Task parameters will be specified per thread.
1569 +	 */
1570 +	init_litmus();
1571 +
1572 +	{
1573 +		pthread_barrierattr_t battr;
1574 +		pthread_barrierattr_init(&battr);
1575 +		pthread_barrier_init(&gBar, &battr, (NUM_AUX_THREADS)+1);
1576 +	}
1577 +
1578 +	if(before)
1579 +	{
1580 +		CALL( init_rt_thread() );
1581 +		CALL( sporadic_partitioned(EXEC_COST, PERIOD, 0) );
1582 +		CALL( task_mode(LITMUS_RT_TASK) );
1583 +	}
1584 +
1585 +
1586 +	if(do_future && before)
1587 +	{
1588 +		CALL( enable_aux_rt_tasks(aux_flags) );
1589 +	}
1590 +
1591 +//	printf("Red Leader is now real-time!\n");
1592 +
1593 +	for (i = 0; i < NUM_AUX_THREADS; i++) {
1594 +		ctx[i].id = i;
1595 +		pthread_create(task + i, NULL, aux_thread, (void *) (ctx + i));
1596 +	}
1597 +
1598 +//	pthread_barrier_wait(&gBar);
1599 +
1600 +//	sleep(1);
1601 +
1602 +	if(!before)
1603 +	{
1604 +		CALL( init_rt_thread() );
1605 +		CALL( sporadic_global(EXEC_COST, PERIOD) );
1606 +		CALL( task_mode(LITMUS_RT_TASK) );
1607 +	}
1608 +
1609 +	// secondary call *should* be harmless
1610 +	CALL( enable_aux_rt_tasks(aux_flags) );
1611 +
1612 +	{
1613 +	int last = time(0);
1614 +//	struct timespec sleeptime = {0, 1000}; // 1 microsecond
1615 +//	for(i = 0; i < 24000; ++i) {
1616 +	for(i = 0; i < 2000; ++i) {
1617 +		sleep_next_period();
1618 +//		printf("RED LEADER!\n");
1619 +
1620 +//		nanosleep(&sleeptime, NULL);
1621 +
1622 +		pthread_mutex_lock(&gMutex);
1623 +
1624 +		if((i%(10000/PERIOD)) == 0) {
1625 +			int now = time(0);
 1626 +			printf("heartbeat %d: %d\n", i, now - last);
1627 +			last = now;
1628 +		}
1629 +
1630 +		pthread_mutex_unlock(&gMutex);
1631 +	}
1632 +	}
1633 +
1634 +	CALL( disable_aux_rt_tasks(aux_flags) );
1635 +	gRun = 0;
1636 +
1637 +	CALL( task_mode(BACKGROUND_TASK) );
1638 +
1639 +	/*****
1640 +	 * 5) Wait for RT threads to terminate.
1641 +	 */
1642 +	for (i = 0; i < NUM_AUX_THREADS; i++) {
1643 +		if (task[i] != 0) {
1644 +			float time;
1645 +			pthread_join(task[i], NULL);
1646 +			time = ctx[i].total_time.tv_sec + ctx[i].total_time.tv_usec / (float)(1e6);
1647 +			printf("child %d: %fs\n", i, time);
1648 +		}
1649 +	}
1650 +
1651 +
1652 +	/*****
1653 +	 * 6) Clean up, maybe print results and stats, and exit.
1654 +	 */
1655 +	return 0;
1656 +}
1657 +
1658 +
1659 +
1660 +/* A real-time thread is very similar to the main function of a single-threaded
 1661 + * real-time app. Notice that init_rt_thread() is called to initialize per-thread
 1662 + * data structures of the LITMUS^RT user space library.
1663 + */
1664 +void* aux_thread(void *tcontext)
1665 +{
1666 +	struct thread_context *ctx = (struct thread_context *) tcontext;
1667 +	int count = 0;
1668 +
1669 +//	pthread_barrier_wait(&gBar);
1670 +
1671 +	while(gRun)
1672 +	{
1673 +		if(count++ % 100000 == 0) {
1674 +			pthread_mutex_lock(&gMutex);
1675 +			pthread_mutex_unlock(&gMutex);
1676 +		}
1677 +	}
1678 +
1679 +	{
1680 +	struct rusage use;
1681 +	long int sec;
1682 +
1683 +	getrusage(RUSAGE_THREAD, &use);
1684 +
1685 +	ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
1686 +	sec = ctx->total_time.tv_usec / (long int)(1e6);
1687 +	ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
1688 +	ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
1689 +	}
1690 +
1691 +	return ctx;
1692 +}
1693 +
1694 +
1695 +/* A real-time thread is very similar to the main function of a single-threaded
 1696 + * real-time app. Notice that init_rt_thread() is called to initialize per-thread
 1697 + * data structures of the LITMUS^RT user space library.
1698 + */
1699 +void* rt_thread(void *tcontext)
1700 +{
1701 +	struct thread_context *ctx = (struct thread_context *) tcontext;
1702 +
1703 +	/* Make presence visible. */
1704 +	printf("RT Thread %d active.\n", ctx->id);
1705 +
1706 +	/*****
1707 +	 * 1) Initialize real-time settings.
1708 +	 */
1709 +	CALL( init_rt_thread() );
1710 +	CALL( sporadic_global(EXEC_COST, PERIOD + ctx->id * 10) );
1711 +
1712 +
1713 +	/*****
1714 +	 * 2) Transition to real-time mode.
1715 +	 */
1716 +	CALL( task_mode(LITMUS_RT_TASK) );
1717 +
1718 +
1719 +
1720 +	wait_for_ts_release();
1721 +
1722 +	/* The task is now executing as a real-time task if the call didn't fail.
1723 +	 */
1724 +
1725 +
1726 +
1727 +	/*****
1728 +	 * 3) Invoke real-time jobs.
1729 +	 */
1730 +	while(gRun) {
1731 +		/* Wait until the next job is released. */
1732 +		sleep_next_period();
1733 +		printf("%d: task.\n", ctx->id);
1734 +	}
1735 +
1736 +	/*****
1737 +	 * 4) Transition to background mode.
1738 +	 */
1739 +	CALL( task_mode(BACKGROUND_TASK) );
1740 +
1741 +	{
1742 +	struct rusage use;
1743 +	long int sec;
1744 +
1745 +	getrusage(RUSAGE_THREAD, &use);
1746 +	ctx->total_time.tv_usec = use.ru_utime.tv_usec + use.ru_stime.tv_usec;
1747 +	sec = ctx->total_time.tv_usec / (long int)(1e6);
1748 +	ctx->total_time.tv_usec = ctx->total_time.tv_usec % (long int)(1e6);
1749 +	ctx->total_time.tv_sec = use.ru_utime.tv_sec + use.ru_stime.tv_sec + sec;
1750 +	}
1751 +
1752 +	return ctx;
1753 +}
1754 +
1755 +int job(void)
1756 +{
1757 +	/* Do real-time calculation. */
1758 +
1759 +	/* Don't exit. */
1760 +	return 0;
1761 +}
1762 diff --git a/gpu/budget.cpp b/gpu/budget.cpp
1763 new file mode 100644
1764 index 0000000..e08daf7
1765 --- /dev/null
1766 +++ b/gpu/budget.cpp
1767 @@ -0,0 +1,379 @@
1768 +#include <stdio.h>
1769 +#include <stdlib.h>
1770 +#include <stdint.h>
1771 +#include <math.h>
1772 +#include <unistd.h>
1773 +#include <assert.h>
1774 +#include <errno.h>
1775 +#include <sys/types.h>
1776 +#include <sys/stat.h>
1777 +#include <fcntl.h>
1778 +
1779 +/* Include gettid() */
1780 +#include <sys/types.h>
1781 +
1782 +/* Include threading support. */
1783 +#include <pthread.h>
1784 +
1785 +/* Include the LITMUS^RT API.*/
1786 +#include "litmus.h"
1787 +
1788 +#define NUMS 4096
1789 +static int nums[NUMS];
1790 +
1791 +inline static lt_t cputime_ns(void)
1792 +{
1793 +	struct timespec ts;
1794 +	lt_t time;
1795 +	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
1796 +
 1797 +	// safe, as long as sizeof(lt_t) >= 8
1798 +	time = s2ns(ts.tv_sec) + ts.tv_nsec;
1799 +
1800 +	return time;
1801 +}
1802 +
1803 +inline static lt_t wtime_ns(void)
1804 +{
1805 +	struct timespec ts;
1806 +	lt_t time;
1807 +	clock_gettime(CLOCK_MONOTONIC, &ts);
1808 +
 1809 +	// safe, as long as sizeof(lt_t) >= 8
1810 +	time = s2ns(ts.tv_sec) + ts.tv_nsec;
1811 +
1812 +	return time;
1813 +}
1814 +
1815 +static int loop_once(void)
1816 +{
1817 +	int i, j = 0;
1818 +	for (i = 0; i < NUMS; ++i)
1819 +		j += nums[i]++;
1820 +	return j;
1821 +}
1822 +
1823 +int loop_for(lt_t time)
1824 +{
1825 +	lt_t end, now;
1826 +	lt_t last_loop = 0, loop_start;
1827 +	int dummy = 0;
1828 +
1829 +	last_loop = 0;
1830 +
1831 +	now = cputime_ns();
1832 +	end = now + time;
1833 +
1834 +	/* '+ last_loop' attempts to avoid overrun */
1835 +	while (now + last_loop < end) {
1836 +		loop_start = now;
1837 +		dummy += loop_once();
1838 +		now = cputime_ns();
1839 +		last_loop = now - loop_start;
1840 +	}
1841 +
1842 +	return dummy;
1843 +}
1844 +
1845 +int OVERRUN = 0;
1846 +int SIGNALS = 0;
1847 +int BLOCK_SIGNALS_ON_SLEEP = 0;
1848 +int OVERRUN_RATE = 1; /* default: every job overruns */
1849 +
1850 +int CXS_OVERRUN = 0;
1851 +int NUM_LOCKS = 1;
1852 +int NUM_REPLICAS = 1;
1853 +int NAMESPACE = 0;
1854 +int *LOCKS = NULL;
1855 +int IKGLP_LOCK = 0;
1856 +int USE_DGLS = 0;
1857 +int NEST_IN_IKGLP = 0;
1858 +
1859 +int WAIT = 0;
1860 +
1861 +enum eLockType
1862 +{
1863 +	FIFO,
1864 +	PRIOQ,
1865 +	IKGLP
1866 +};
1867 +
1868 +eLockType LOCK_TYPE = FIFO;
1869 +
1870 +int OVERRUN_BY_SLEEP = 0;
1871 +
1872 +int NUM_JOBS = 0;
1873 +int NUM_COMPLETED_JOBS = 0;
1874 +int NUM_OVERRUNS = 0;
1875 +
1876 +lt_t overrun_extra = 0;
1877 +
1878 +int job(lt_t exec_ns, lt_t budget_ns)
1879 +{
1880 +	++NUM_JOBS;
1881 +
1882 +	try{
1883 +		lt_t approx_remaining = budget_ns;
1884 +		lt_t now = cputime_ns();
1885 +		loop_for(lt_t(exec_ns * 0.9)); /* fudge it a bit to account for overheads */
1886 +
1887 +		if (OVERRUN) {
1888 +			// do we want to overrun this job?
1889 +			if ((NUM_JOBS % OVERRUN_RATE) == 0) {
1890 +				approx_remaining -= (cputime_ns() - now);
1891 +
1892 +				if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
1893 +					block_litmus_signals(SIG_BUDGET);
1894 +
1895 +				if(CXS_OVERRUN) {
1896 +					if (NEST_IN_IKGLP)
1897 +						litmus_lock(IKGLP_LOCK);
1898 +					if (USE_DGLS)
1899 +						litmus_dgl_lock(LOCKS, NUM_LOCKS);
1900 +					else
1901 +						for(int i = 0; i < NUM_LOCKS; ++i)
1902 +							litmus_lock(LOCKS[i]);
1903 +				}
1904 +
1905 +				// intentionally overrun via suspension
1906 +				if (OVERRUN_BY_SLEEP)
1907 +					lt_sleep(approx_remaining + overrun_extra);
1908 +				else
1909 +					loop_for((approx_remaining + overrun_extra) * 0.9);
1910 +
1911 +				if(CXS_OVERRUN) {
1912 +					if (USE_DGLS)
1913 +						litmus_dgl_unlock(LOCKS, NUM_LOCKS);
1914 +					else
1915 +						for(int i = NUM_LOCKS-1; i >= 0; --i)
1916 +							litmus_unlock(LOCKS[i]);
1917 +					if (NEST_IN_IKGLP)
1918 +						litmus_unlock(IKGLP_LOCK);
1919 +				}
1920 +
1921 +				if (SIGNALS && BLOCK_SIGNALS_ON_SLEEP)
1922 +					unblock_litmus_signals(SIG_BUDGET);
1923 +			}
1924 +		}
1925 +		++NUM_COMPLETED_JOBS;
1926 +	}
1927 +	catch (const litmus::sigbudget& e) {
1928 +		++NUM_OVERRUNS;
1929 +	}
1930 +
1931 +	sleep_next_period();
1932 +	return 1;
1933 +}
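+/* With -S, budget overruns arrive as SIG_BUDGET signals that are turned into
+ * litmus::sigbudget exceptions (see activate_litmus_signals() in main()); the
+ * try/catch above counts them as overruns.  Without -S, the kernel enforces
+ * the budget directly via PRECISE_ENFORCEMENT. */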
1934 +
1935 +#define OPTSTR "SbosOvzalwqixdn:r:p:"
1936 +
1937 +int main(int argc, char** argv)
1938 +{
1939 +	int ret;
1940 +
1941 +	srand(getpid());
1942 +
1943 +	lt_t e_ns = ms2ns(2);
1944 +	lt_t p_ns = ms2ns(50) + rand()%200;
1945 +	lt_t budget_ns = p_ns/2;
1946 +	lt_t duration = s2ns(60);
1947 +	lt_t terminate_time;
1948 +	unsigned int first_job, last_job;
1949 +	int opt;
1950 +	struct rt_task param;
1951 +	budget_drain_policy_t drain_policy = DRAIN_SIMPLE;
1952 +	int compute_overrun_rate = 0;
1953 +	int once = 1;
1954 +
1955 +	bool migrate = false;
1956 +	int partition = 0;
1957 +	int partition_sz = 1;
1958 +
1959 +	while ((opt = getopt(argc, argv, OPTSTR)) != -1) {
1960 +		switch(opt) {
1961 +		case 'p':
1962 +			migrate = true;
1963 +			partition = atoi(optarg);
1964 +			break;
1965 +		case 'S':
1966 +			SIGNALS = 1;
1967 +			break;
1968 +		case 'b':
1969 +			BLOCK_SIGNALS_ON_SLEEP = 1;
1970 +			break;
1971 +		case 's':
1972 +			OVERRUN_BY_SLEEP = 1;
1973 +			break;
1974 +		case 'o':
1975 +			OVERRUN = 1;
1976 +			overrun_extra = budget_ns/2;
1977 +			break;
1978 +		case 'O':
1979 +			OVERRUN = 1;
1980 +			overrun_extra = 4*p_ns;
1981 +			break;
1982 +		case 'a':
1983 +			/* select an overrun rate such that a task should have caught
1984 +			 * up with the backlog caused by one overrun before the next
1985 +			 * overrun occurs.
1986 +			 */
1987 +			compute_overrun_rate = 1;
1988 +			break;
1989 +		case 'v':
1990 +			drain_policy = DRAIN_SOBLIV;
1991 +			break;
1992 +		case 'z':
1993 +			drain_policy = DRAIN_SIMPLE_IO;
1994 +			break;
1995 +		case 'l':
1996 +			CXS_OVERRUN = 1;
1997 +			NAMESPACE = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
1998 +			break;
1999 +		case 'q':
2000 +			LOCK_TYPE = PRIOQ;
2001 +			break;
2002 +		case 'i':
2003 +			LOCK_TYPE = IKGLP;
2004 +			break;
2005 +		case 'x':
2006 +			NEST_IN_IKGLP = 1;
2007 +			break;
2008 +		case 'w':
2009 +			WAIT = 1;
2010 +			break;
2011 +		case 'd':
2012 +			USE_DGLS = 1;
2013 +			break;
2014 +		case 'n':
2015 +			NUM_LOCKS = atoi(optarg);
2016 +			break;
2017 +		case 'r':
2018 +			NUM_REPLICAS = atoi(optarg);
2019 +			break;
2020 +		case ':':
2021 +			printf("missing argument\n");
2022 +			assert(false);
2023 +			break;
2024 +		default:
2025 +			printf("unknown option\n");
2026 +			assert(false);
2027 +			break;
2028 +		}
2029 +	}
2030 +
2031 +	assert(!BLOCK_SIGNALS_ON_SLEEP || (BLOCK_SIGNALS_ON_SLEEP && SIGNALS));
2032 +	assert(!CXS_OVERRUN || (CXS_OVERRUN && WAIT));
2033 +	assert(LOCK_TYPE != IKGLP || NUM_LOCKS == 1);
2034 +	assert(LOCK_TYPE != IKGLP || (LOCK_TYPE == IKGLP && !NEST_IN_IKGLP));
2035 +	assert(NUM_LOCKS > 0);
2036 +	if (LOCK_TYPE == IKGLP || NEST_IN_IKGLP)
2037 +		assert(NUM_REPLICAS >= 1);
2038 +
2039 +	LOCKS = new int[NUM_LOCKS];
2040 +
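+	/* Example: with -a -o (overrun_extra = budget_ns/2), backlog =
+	 * ceil(1.5) = 2, giving OVERRUN_RATE = 4 (6 with -l); with -a -O
+	 * (overrun_extra = 4*p_ns = 8*budget_ns), backlog = 9, giving
+	 * OVERRUN_RATE = 11 (20 with -l). */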
2041 +	if (compute_overrun_rate) {
2042 +		int backlog = (int)ceil((overrun_extra + budget_ns)/(double)budget_ns);
2043 +		if (!CXS_OVERRUN)
2044 +			OVERRUN_RATE = backlog + 2; /* some padding */
2045 +		else
2046 +			OVERRUN_RATE = 2*backlog + 2; /* overrun less frequently for testing */
2047 +	}
2048 +
2049 +	init_rt_task_param(&param);
2050 +	param.exec_cost = budget_ns;
2051 +	param.period = p_ns;
2052 +	param.release_policy = PERIODIC;
2053 +	param.drain_policy = drain_policy;
2054 +	if (!SIGNALS)
2055 +		param.budget_policy = PRECISE_ENFORCEMENT;
2056 +	else
2057 +		param.budget_signal_policy = PRECISE_SIGNALS;
2058 +	if (migrate)
2059 +		param.cpu = cluster_to_first_cpu(partition, partition_sz);
2060 +
2061 +	// set up affinity and init litmus
2062 +	if (migrate) {
2063 +		ret = be_migrate_to_cluster(partition, partition_sz);
2064 +		assert(!ret);
2065 +	}
2066 +	init_litmus();
2067 +
2068 +	ret = set_rt_task_param(gettid(), &param);
2069 +	assert(ret == 0);
2070 +
2071 +	if (CXS_OVERRUN) {
2072 +		int i;
2073 +		for(i = 0; i < NUM_LOCKS; ++i) {
2074 +			int lock = -1;
2075 +			switch(LOCK_TYPE)
2076 +			{
2077 +				case FIFO:
2078 +					lock = open_fifo_sem(NAMESPACE, i);
2079 +					break;
2080 +				case PRIOQ:
2081 +					lock = open_prioq_sem(NAMESPACE, i);
2082 +					break;
2083 +				case IKGLP:
2084 +					lock = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
2085 +					break;
2086 +			}
2087 +			if (lock < 0) {
2088 +				perror("open_sem");
2089 +				exit(-1);
2090 +			}
2091 +			LOCKS[i] = lock;
2092 +		}
2093 +
2094 +		if (NEST_IN_IKGLP) {
2095 +			IKGLP_LOCK = open_ikglp_sem(NAMESPACE, i, NUM_REPLICAS);
2096 +			if (IKGLP_LOCK < 0) {
2097 +				perror("open_sem");
2098 +				exit(-1);
2099 +			}
2100 +		}
2101 +	}
2102 +
2103 +	if (WAIT) {
2104 +		ret = wait_for_ts_release();
2105 +		if (ret < 0)
2106 +			perror("wait_for_ts_release");
2107 +	}
2108 +
2109 +	ret = task_mode(LITMUS_RT_TASK);
2110 +	assert(ret == 0);
2111 +
2112 +	sleep_next_period();
2113 +
2114 +	ret = get_job_no(&first_job);
2115 +	assert(ret == 0);
2116 +
2117 +	terminate_time = duration + wtime_ns();
2118 +
2119 +	while (wtime_ns() < terminate_time) {
2120 +		try{
2121 +			if(once) {
2122 +				activate_litmus_signals(SIG_BUDGET, litmus::throw_on_litmus_signal);
2123 +				once = 0;
2124 +			}
2125 +			job(e_ns, budget_ns);
2126 +		}
2127 +		catch(const litmus::sigbudget &e) {
2128 +			/* drop silently */
2129 +		}
2130 +	}
2131 +
2132 +	ret = get_job_no(&last_job);
2133 +	assert(ret == 0);
2134 +
2135 +	ret = task_mode(BACKGROUND_TASK);
2136 +	assert(ret == 0);
2137 +
2138 +	printf("# Kernel Jobs: %d\n", last_job - first_job + 1);
2139 +	printf("# User Started Jobs: %d\n", NUM_JOBS);
2140 +	printf("# User Jobs Completed: %d\n", NUM_COMPLETED_JOBS);
2141 +	printf("# Overruns: %d\n", NUM_OVERRUNS);
2142 +
2143 +	delete[] LOCKS;
2144 +
2145 +	return 0;
2146 +}
2147 diff --git a/gpu/dgl.c b/gpu/dgl.c
2148 new file mode 100644
2149 index 0000000..c40fec6
2150 --- /dev/null
2151 +++ b/gpu/dgl.c
2152 @@ -0,0 +1,282 @@
2153 +#include <stdio.h>
2154 +#include <stdlib.h>
2155 +#include <stdint.h>
2156 +#include <unistd.h>
2157 +#include <assert.h>
2158 +#include <errno.h>
2159 +#include <sys/types.h>
2160 +#include <sys/stat.h>
2161 +#include <fcntl.h>
2162 +
2163 +/* Include gettid() */
2164 +#include <sys/types.h>
2165 +
2166 +/* Include threading support. */
2167 +#include <pthread.h>
2168 +
2169 +/* Include the LITMUS^RT API.*/
2170 +#include "litmus.h"
2171 +
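+/* dgl: each real-time thread repeatedly locks a random contiguous group of
+ * NEST_DEPTH FIFO (or PRIOQ) semaphores as a single dynamic group lock,
+ * optionally while holding an IKGLP replica, runs a short job, and then
+ * releases everything. */
+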
2172 +#define xfprintf( ... ) do { \
2173 +if(!SILENT) { fprintf( __VA_ARGS__ ) ; } \
2174 +} while (0)
2175 +
2176 +
2177 +/* Catch errors.
2178 + */
2179 +#define CALL( exp ) do { \
2180 +		int ret; \
2181 +		ret = exp; \
2182 +		if (ret != 0) \
2183 +			xfprintf(stderr, "%s failed: %m\n", #exp);\
2184 +		else \
2185 +			xfprintf(stderr, "%s ok.\n", #exp); \
2186 +	} while (0)
2187 +
2188 +#define TH_CALL( exp ) do { \
2189 +		int ret; \
2190 +		ret = exp; \
2191 +		if (ret != 0) \
2192 +			xfprintf(stderr, "[%d] %s failed: %m\n", ctx->id, #exp); \
2193 +		else \
2194 +			xfprintf(stderr, "[%d] %s ok.\n", ctx->id, #exp); \
2195 +	} while (0)
2196 +
2197 +#define TH_SAFE_CALL( exp ) do { \
2198 +		int ret; \
2199 +		xfprintf(stderr, "[%d] calling %s...\n", ctx->id, #exp); \
2200 +		ret = exp; \
2201 +		if (ret != 0) \
2202 +			xfprintf(stderr, "\t...[%d] %s failed: %m\n", ctx->id, #exp); \
2203 +		else \
2204 +			xfprintf(stderr, "\t...[%d] %s ok.\n", ctx->id, #exp); \
2205 +	} while (0)
2206 +
2207 +
2208 +
2209 +
2210 +
2211 +/* these are only default values */
2212 +int NUM_THREADS=3;
2213 +int NUM_SEMS=1;
2214 +unsigned int NUM_REPLICAS=0;
2215 +int NEST_DEPTH=1;
2216 +
2217 +int SILENT = 0;
2218 +
2219 +int SLEEP_BETWEEN_JOBS = 1;
2220 +int USE_PRIOQ = 0;
2221 +
2222 +#define MAX_SEMS 1000
2223 +#define MAX_NEST_DEPTH 10
2224 +
2225 +
2226 +// 1000 = 1us
2227 +#define EXEC_COST 	 1000*1
2228 +#define PERIOD		1000*10
2229 +
2230 +/* The information passed to each thread. Could be anything. */
2231 +struct thread_context {
2232 +	int id;
2233 +	int fd;
2234 +	int ikglp;
2235 +	int od[MAX_SEMS];
2236 +	int count;
2237 +	unsigned int rand;
2238 +};
2239 +
2240 +void* rt_thread(void* _ctx);
2241 +int nested_job(struct thread_context* ctx, int *count, int *next);
2242 +int job(struct thread_context*);
2243 +
2244 +#define OPTSTR "t:k:s:d:fqX"
2245 +
2246 +int main(int argc, char** argv)
2247 +{
2248 +	int i;
2249 +	struct thread_context* ctx;
2250 +	pthread_t*	     task;
2251 +	int fd;
2252 +
2253 +	int opt;
2254 +	while((opt = getopt(argc, argv, OPTSTR)) != -1) {
2255 +		switch(opt) {
2256 +			case 't':
2257 +				NUM_THREADS = atoi(optarg);
2258 +				break;
2259 +			case 'k':
2260 +				NUM_REPLICAS = atoi(optarg);
2261 +				assert(NUM_REPLICAS > 0);
2262 +				break;
2263 +			case 's':
2264 +				NUM_SEMS = atoi(optarg);
2265 +				assert(NUM_SEMS >= 0 && NUM_SEMS <= MAX_SEMS);
2266 +				break;
2267 +			case 'd':
2268 +				NEST_DEPTH = atoi(optarg);
2269 +				assert(NEST_DEPTH >= 1 && NEST_DEPTH <= MAX_NEST_DEPTH);
2270 +				break;
2271 +			case 'f':
2272 +				SLEEP_BETWEEN_JOBS = 0;
2273 +				break;
2274 +			case 'q':
2275 +				USE_PRIOQ = 1;
2276 +				break;
2277 +			case 'X':
2278 +				SILENT = 1;
2279 +				break;
2280 +			default:
2281 +				fprintf(stderr, "Unknown option: %c\n", opt);
2282 +				exit(-1);
2283 +				break;
2284 +		}
2285 +	}
2286 +
2287 +	ctx = (struct thread_context*) calloc(NUM_THREADS, sizeof(struct thread_context));
2288 +	task = (pthread_t*) calloc(NUM_THREADS, sizeof(pthread_t));
2289 +
2290 +	srand(0); /* something repeatable for now */
2291 +
2292 +	fd = open("semaphores", O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
2293 +
2294 +	CALL( init_litmus() );
2295 +
2296 +	for (i = 0; i < NUM_THREADS; i++) {
2297 +		ctx[i].id = i;
2298 +		ctx[i].fd = fd;
2299 +		ctx[i].rand = rand();
2300 +		CALL( pthread_create(task + i, NULL, rt_thread, ctx + i) );
2301 +	}
2302 +
2303 +
2304 +	for (i = 0; i < NUM_THREADS; i++)
2305 +		pthread_join(task[i], NULL);
2306 +
2307 +
2308 +	return 0;
2309 +}
2310 +
2311 +void* rt_thread(void* _ctx)
2312 +{
2313 +	int i;
2314 +	int do_exit = 0;
2315 +	struct rt_task param;
2316 +
2317 +	struct thread_context *ctx = (struct thread_context*)_ctx;
2318 +
2319 +	init_rt_task_param(&param);
2320 +	param.exec_cost = EXEC_COST;
2321 +	param.period = PERIOD + 10*ctx->id; /* Vary period a little bit. */
2322 +	param.cls = RT_CLASS_SOFT;
2323 +
2324 +	TH_CALL( init_rt_thread() );
2325 +	TH_CALL( set_rt_task_param(gettid(), &param) );
2326 +
2327 +	if (NUM_REPLICAS) {
2328 +		ctx->ikglp = open_ikglp_sem(ctx->fd, 0, NUM_REPLICAS);
2329 +		if(ctx->ikglp < 0)
2330 +			perror("open_ikglp_sem");
2331 +		else
2332 +			xfprintf(stdout, "ikglp od = %d\n", ctx->ikglp);
2333 +	}
2334 +
2335 +
2336 +	for (i = 0; i < NUM_SEMS; i++) {
2337 +		if(!USE_PRIOQ) {
2338 +			ctx->od[i] = open_fifo_sem(ctx->fd, i+1);
2339 +			if(ctx->od[i] < 0)
2340 +				perror("open_fifo_sem");
2341 +			else
2342 +				xfprintf(stdout, "fifo[%d] od = %d\n", i, ctx->od[i]);
2343 +		}
2344 +		else {
2345 +			ctx->od[i] = open_prioq_sem(ctx->fd, i+1);
2346 +			if(ctx->od[i] < 0)
2347 +				perror("open_prioq_sem");
2348 +			else
2349 +				xfprintf(stdout, "prioq[%d] od = %d\n", i, ctx->od[i]);
2350 +		}
2351 +	}
2352 +
2353 +	TH_CALL( task_mode(LITMUS_RT_TASK) );
2354 +
2355 +
2356 +	xfprintf(stdout, "[%d] Waiting for TS release.\n", ctx->id);
2357 +	wait_for_ts_release();
2358 +	ctx->count = 0;
2359 +
2360 +	do {
2361 +		int replica = -1;
2362 +		int first = (int)(NUM_SEMS * (rand_r(&(ctx->rand)) / (RAND_MAX + 1.0)));
2363 +		int last = (first + NEST_DEPTH - 1 >= NUM_SEMS) ? NUM_SEMS - 1 : first + NEST_DEPTH - 1;
2364 +		int dgl_size = last - first + 1;
2365 +		int dgl[dgl_size];
2366 +
2367 +		// construct the DGL
2368 +		for(i = first; i <= last; ++i) {
2369 +			dgl[i-first] = ctx->od[i];
2370 +		}
2371 +
2372 +
2373 +		if(NUM_REPLICAS) {
2374 +			replica = litmus_lock(ctx->ikglp);
2375 +			xfprintf(stdout, "[%d] got ikglp replica %d.\n", ctx->id, replica);
2376 +		}
2377 +
2378 +
2379 +		litmus_dgl_lock(dgl, dgl_size);
2380 +		xfprintf(stdout, "[%d] acquired dgl.\n", ctx->id);
2381 +
2382 +		do_exit = job(ctx);
2383 +
2384 +		xfprintf(stdout, "[%d] should yield dgl: %d.\n", ctx->id, litmus_dgl_should_yield_lock(dgl, dgl_size));
2385 +
2386 +		xfprintf(stdout, "[%d] unlocking dgl.\n", ctx->id);
2387 +		litmus_dgl_unlock(dgl, dgl_size);
2388 +
2389 +		if(NUM_REPLICAS) {
2390 +			xfprintf(stdout, "[%d]: freeing ikglp replica %d.\n", ctx->id, replica);
2391 +			litmus_unlock(ctx->ikglp);
2392 +		}
2393 +
2394 +		if(SLEEP_BETWEEN_JOBS && !do_exit) {
2395 +			sleep_next_period();
2396 +		}
2397 +	} while(!do_exit);
2398 +
2399 +	/*****
2400 +	 * 4) Transition to background mode.
2401 +	 */
2402 +	TH_CALL( task_mode(BACKGROUND_TASK) );
2403 +
2404 +
2405 +	return NULL;
2406 +}
2407 +
2408 +void dirty_kb(int kb)
2409 +{
2410 +	int32_t one_kb[256];
2411 +	int32_t sum = 0;
2412 +	int32_t i;
2413 +
2414 +	for (i = 0; i < 256; i++)
2415 +		sum += one_kb[i];
2416 +	kb--;
2417 +	/* prevent tail recursion */
2418 +	if (kb)
2419 +		dirty_kb(kb);
2420 +	for (i = 0; i < 256; i++)
2421 +		sum += one_kb[i];
2422 +}
2423 +
2424 +int job(struct thread_context* ctx)
2425 +{
2426 +	/* Do real-time calculation. */
2427 +	dirty_kb(8);
2428 +
2429 +	/* Signal exit after a fixed number of jobs per task. */
2430 +	//return ctx->count++ > 100;
2431 +	//return ctx->count++ > 12000;
2432 +	//return ctx->count++ > 120000;
2433 +	return ctx->count++ > 50000;  // controls number of jobs per task
2434 +}
2435 diff --git a/gpu/gpuspin.cu b/gpu/gpuspin.cu
2436 new file mode 100644
2437 index 0000000..c42dea9
2438 --- /dev/null
2439 +++ b/gpu/gpuspin.cu
2440 @@ -0,0 +1,2705 @@
2441 +#include <sys/time.h>
2442 +
2443 +#include <stdio.h>
2444 +#include <stdlib.h>
2445 +#include <unistd.h>
2446 +#include <time.h>
2447 +#include <string.h>
2448 +#include <assert.h>
2449 +#include <execinfo.h>
2450 +
2451 +#include <exception>
2452 +
2453 +#include <boost/interprocess/managed_shared_memory.hpp>
2454 +#include <boost/interprocess/sync/interprocess_mutex.hpp>
2455 +#include <boost/filesystem.hpp>
2456 +
2457 +#include <random/normal.h>
2458 +
2459 +#include <cuda.h>
2460 +#include <cuda_runtime.h>
2461 +
2462 +#include "litmus.h"
2463 +#include "common.h"
2464 +
2465 +using namespace std;
2466 +using namespace boost::interprocess;
2467 +using namespace ranlib;
2468 +
2469 +#define ms2s(ms)  ((ms)*0.001)
2470 +
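+/* Marker values passed to inject_action() to bracket critical sections in the
+ * trace (e.g., the CE_* codes bracket copy-engine transfers in chunkMemcpy()
+ * below). */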
2471 +const unsigned int TOKEN_START = 100;
2472 +const unsigned int TOKEN_END = 101;
2473 +
2474 +const unsigned int EE_START = 200;
2475 +const unsigned int EE_END = 201;
2476 +
2477 +const unsigned int CE_SEND_START = 300;
2478 +const unsigned int CE_SEND_END = 301;
2479 +
2480 +const unsigned int CE_RECV_START = 400;
2481 +const unsigned int CE_RECV_END = 401;
2482 +
2483 +bool SILENT = true;
2484 +//bool SILENT = false;
2485 +inline int xprintf(const char *format, ...)
2486 +{
2487 +	int ret = 0;
2488 +	if (!SILENT) {
2489 +		va_list args;
2490 +		va_start(args, format);
2491 +		ret = vprintf(format, args);
2492 +		va_end(args);
2493 +	}
2494 +	return ret;
2495 +}
2496 +
2497 +const char *lock_namespace = "./.gpuspin-locks";
2498 +const size_t PAGE_SIZE = sysconf(_SC_PAGESIZE);
2499 +
2500 +const int NR_GPUS = 8;
2501 +
2502 +bool WANT_SIGNALS = false;
2503 +inline void gpuspin_block_litmus_signals(unsigned long mask)
2504 +{
2505 +	if (WANT_SIGNALS)
2506 +		block_litmus_signals(mask);
2507 +}
2508 +
2509 +inline void gpuspin_unblock_litmus_signals(unsigned long mask)
2510 +{
2511 +	if (WANT_SIGNALS)
2512 +		unblock_litmus_signals(mask);
2513 +}
2514 +
2515 +bool GPU_USING = false;
2516 +bool ENABLE_AFFINITY = false;
2517 +bool RELAX_FIFO_MAX_LEN = false;
2518 +bool ENABLE_CHUNKING = false;
2519 +bool MIGRATE_VIA_SYSMEM = false;
2520 +
2521 +bool YIELD_LOCKS = false;
2522 +
2523 +enum eEngineLockTypes
2524 +{
2525 +	FIFO,
2526 +	PRIOQ
2527 +};
2528 +
2529 +eEngineLockTypes ENGINE_LOCK_TYPE = FIFO;
2530 +
2531 +int GPU_PARTITION = 0;
2532 +int GPU_PARTITION_SIZE = 0;
2533 +int CPU_PARTITION_SIZE = 0;
2534 +
2535 +int RHO = 2;
2536 +
2537 +int NUM_COPY_ENGINES = 2;
2538 +
2539 +
2540 +__attribute__((unused)) static size_t kbToB(size_t kb) { return kb * 1024; }
2541 +__attribute__((unused)) static size_t mbToB(size_t mb) { return kbToB(mb * 1024); }
2542 +
2543 +/* in bytes */
2544 +size_t SEND_SIZE = 0;
2545 +size_t RECV_SIZE = 0;
2546 +size_t STATE_SIZE = 0;
2547 +size_t CHUNK_SIZE = 0;
2548 +
2549 +int TOKEN_LOCK = -1;
2550 +
2551 +bool USE_ENGINE_LOCKS = false;
2552 +bool USE_DYNAMIC_GROUP_LOCKS = false;
2553 +int EE_LOCKS[NR_GPUS];
2554 +int CE_SEND_LOCKS[NR_GPUS];
2555 +int CE_RECV_LOCKS[NR_GPUS];
2556 +int CE_MIGR_SEND_LOCKS[NR_GPUS];
2557 +int CE_MIGR_RECV_LOCKS[NR_GPUS];
2558 +bool RESERVED_MIGR_COPY_ENGINE = false;  // only checked if NUM_COPY_ENGINES == 2
2559 +
2560 +//bool ENABLE_RT_AUX_THREADS = false;
2561 +bool ENABLE_RT_AUX_THREADS = true;
2562 +
2563 +enum eGpuSyncMode
2564 +{
2565 +	IKGLP_MODE,
2566 +	IKGLP_WC_MODE, /* work-conserving IKGLP. no GPU is left idle, but breaks optimality */
2567 +	KFMLP_MODE,
2568 +	RGEM_MODE,
2569 +};
2570 +
2571 +eGpuSyncMode GPU_SYNC_MODE = IKGLP_MODE;
2572 +
2573 +enum eCudaSyncMode
2574 +{
2575 +	BLOCKING,
2576 +	SPIN
2577 +};
2578 +
2579 +eCudaSyncMode CUDA_SYNC_MODE = BLOCKING;
2580 +
2581 +
2582 +int CUR_DEVICE = -1;
2583 +int LAST_DEVICE = -1;
2584 +
2585 +cudaStream_t STREAMS[NR_GPUS];
2586 +cudaEvent_t EVENTS[NR_GPUS];
2587 +int GPU_HZ[NR_GPUS];
2588 +int NUM_SM[NR_GPUS];
2589 +int WARP_SIZE[NR_GPUS];
2590 +int ELEM_PER_THREAD[NR_GPUS];
2591 +
2592 +enum eScheduler
2593 +{
2594 +	LITMUS,
2595 +	LINUX,
2596 +	RT_LINUX
2597 +};
2598 +
2599 +struct Args
2600 +{
2601 +	bool wait;
2602 +	bool migrate;
2603 +	int cluster;
2604 +	int cluster_size;
2605 +	bool gpu_using;
2606 +	int gpu_partition;
2607 +	int gpu_partition_size;
2608 +	int rho;
2609 +	int num_ce;
2610 +	bool reserve_migr_ce;
2611 +	bool use_engine_locks;
2612 +	eEngineLockTypes engine_lock_type;
2613 +	bool yield_locks;
2614 +	bool use_dgls;
2615 +	eGpuSyncMode gpusync_mode;
2616 +	bool enable_affinity;
2617 +	int relax_fifo_len;
2618 +	eCudaSyncMode sync_mode;
2619 +	size_t send_size;
2620 +	size_t recv_size;
2621 +	size_t state_size;
2622 +	bool enable_chunking;
2623 +	size_t chunk_size;
2624 +	bool use_sysmem_migration;
2625 +	int num_kernels;
2626 +
2627 +	double wcet_ms;
2628 +	double gpu_wcet_ms;
2629 +	double period_ms;
2630 +
2631 +	double budget_ms;
2632 +
2633 +	double stddev;
2634 +
2635 +	eScheduler scheduler;
2636 +
2637 +	unsigned int priority;
2638 +
2639 +	task_class_t cls;
2640 +
2641 +	bool want_enforcement;
2642 +	bool want_signals;
2643 +	budget_drain_policy_t drain_policy;
2644 +
2645 +	int column;
2646 +
2647 +	int num_gpu_tasks;
2648 +	int num_tasks;
2649 +
2650 +	double scale;
2651 +
2652 +	double duration;
2653 +
2654 +	bool is_aberrant;
2655 +	double aberrant_prob;
2656 +	double aberrant_factor;
2657 +};
2658 +
2659 +
2660 +
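+/* Per-GPU state lives in fixed-size arrays indexed by device number; the
+ * cur_*() macros below select the entry for the GPU this task currently
+ * holds (CUR_DEVICE). */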
2661 +#define DEFINE_PER_GPU(type, var) type var[NR_GPUS]
2662 +#define per_gpu(var, idx) (var[(idx)])
2663 +#define this_gpu(var) (var[(CUR_DEVICE)])
2664 +#define cur_stream() (this_gpu(STREAMS))
2665 +#define cur_event() (this_gpu(EVENTS))
2666 +#define cur_gpu() (CUR_DEVICE)
2667 +#define last_gpu() (LAST_DEVICE)
2668 +#define cur_ee() (EE_LOCKS[CUR_DEVICE])
2669 +#define cur_send() (CE_SEND_LOCKS[CUR_DEVICE])
2670 +#define cur_recv() (CE_RECV_LOCKS[CUR_DEVICE])
2671 +#define cur_migr_send() (CE_MIGR_SEND_LOCKS[CUR_DEVICE])
2672 +#define cur_migr_recv() (CE_MIGR_RECV_LOCKS[CUR_DEVICE])
2673 +#define cur_hz() (GPU_HZ[CUR_DEVICE])
2674 +#define cur_sms() (NUM_SM[CUR_DEVICE])
2675 +#define cur_warp_size() (WARP_SIZE[CUR_DEVICE])
2676 +#define cur_elem_per_thread() (ELEM_PER_THREAD[CUR_DEVICE])
2677 +#define num_online_gpus() (NUM_GPUS)
2678 +
2679 +static bool useEngineLocks()
2680 +{
2681 +	return(USE_ENGINE_LOCKS);
2682 +}
2683 +
2684 +//#define VANILLA_LINUX
2685 +
2686 +bool TRACE_MIGRATIONS = false;
2687 +#ifndef VANILLA_LINUX
2688 +#define trace_migration(to, from)					do { inject_gpu_migration((to), (from)); } while(0)
2689 +#define trace_release(arrival, deadline, jobno)		do { inject_release((arrival), (deadline), (jobno)); } while(0)
2690 +#define trace_completion(jobno)						do { inject_completion((jobno)); } while(0)
2691 +#define trace_name()								do { inject_name(); } while(0)
2692 +#define trace_param()								do { inject_param(); } while(0)
2693 +#else
2694 +#define set_rt_task_param(x, y)						(0)
2695 +#define trace_migration(to, from)
2696 +#define trace_release(arrival, deadline, jobno)
2697 +#define trace_completion(jobno)
2698 +#define trace_name()
2699 +#define trace_param()
2700 +#endif
2701 +
2702 +struct ce_lock_state
2703 +{
2704 +	int locks[2];
2705 +	size_t num_locks;
2706 +	size_t budget_remaining;
2707 +	bool locked;
2708 +
2709 +	ce_lock_state(int device_a, enum cudaMemcpyKind kind, size_t size, int device_b = -1, bool migration = false) {
2710 +		num_locks = (device_a != -1) + (device_b != -1);
2711 +
2712 +		if(device_a != -1) {
2713 +			if (!migration)
2714 +				locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
2715 +				CE_SEND_LOCKS[device_a] : CE_RECV_LOCKS[device_a];
2716 +			else
2717 +				locks[0] = (kind == cudaMemcpyHostToDevice || (kind == cudaMemcpyDeviceToDevice && device_b == -1)) ?
2718 +				CE_MIGR_SEND_LOCKS[device_a] : CE_MIGR_RECV_LOCKS[device_a];
2719 +		}
2720 +
2721 +		if(device_b != -1) {
2722 +			assert(kind == cudaMemcpyDeviceToDevice);
2723 +
2724 +			if (!migration)
2725 +				locks[1] = CE_RECV_LOCKS[device_b];
2726 +			else
2727 +				locks[1] = CE_MIGR_RECV_LOCKS[device_b];
2728 +
2729 +			if(locks[1] < locks[0]) {
2730 +				// enforce a total order on lock acquisition to avoid deadlock between concurrent copies
2731 +				int temp = locks[1];
2732 +				locks[1] = locks[0];
2733 +				locks[0] = temp;
2734 +			}
2735 +		}
2736 +		else {
2737 +			locks[1] = -1;
2738 +		}
2739 +
2740 +		if(!ENABLE_CHUNKING)
2741 +			budget_remaining = size;
2742 +		else
2743 +			budget_remaining = CHUNK_SIZE;
2744 +	}
2745 +
2746 +	void crash(void) {
2747 +		void *array[50];
2748 +		int size, i;
2749 +		char **messages;
2750 +
2751 +		size = backtrace(array, 50);
2752 +		messages = backtrace_symbols(array, size);
2753 +
2754 +		fprintf(stderr, "%d: TRIED TO GRAB SAME LOCK TWICE! Lock = %d\n", getpid(), locks[0]);
2755 +		for (i = 1; i < size && messages != NULL; ++i)
2756 +		{
2757 +			fprintf(stderr, "%d: [bt]: (%d) %s\n", getpid(), i, messages[i]);
2758 +		}
2759 +		free(messages);
2760 +
2761 +		assert(false);
2762 +	}
2763 +
2764 +
2765 +	void lock() {
2766 +		if(locks[0] == locks[1]) crash();
2767 +
2768 +		if (num_locks == 1) {
2769 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2770 +			litmus_lock(locks[0]);
2771 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2772 +		}
2773 +		else if(USE_DYNAMIC_GROUP_LOCKS) {
2774 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2775 +			litmus_dgl_lock(locks, num_locks);
2776 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2777 +		}
2778 +		else
2779 +		{
2780 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2781 +			for(int l = 0; l < num_locks; ++l)
2782 +			{
2783 +				litmus_lock(locks[l]);
2784 +			}
2785 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2786 +		}
2787 +		locked = true;
2788 +	}
2789 +
2790 +	void unlock() {
2791 +		if(locks[0] == locks[1]) crash();
2792 +
2793 +		if (num_locks == 1) {
2794 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2795 +			litmus_unlock(locks[0]);
2796 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2797 +		}
2798 +		else if(USE_DYNAMIC_GROUP_LOCKS) {
2799 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2800 +			litmus_dgl_unlock(locks, num_locks);
2801 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2802 +		}
2803 +		else
2804 +		{
2805 +			gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2806 +			// reverse order
2807 +			for(int l = num_locks - 1; l >= 0; --l)
2808 +			{
2809 +				litmus_unlock(locks[l]);
2810 +			}
2811 +			gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2812 +		}
2813 +		locked = false;
2814 +	}
2815 +
2816 +	bool should_yield() {
2817 +		int yield = 1; // assume we should yield
2818 +		if (YIELD_LOCKS) {
2819 +			if(locks[0] == locks[1]) crash();
2820 +			if (num_locks == 1)
2821 +				yield = litmus_should_yield_lock(locks[0]);
2822 +			else if(USE_DYNAMIC_GROUP_LOCKS)
2823 +				yield = litmus_dgl_should_yield_lock(locks, num_locks);
2824 +			else
2825 +				for(int l = num_locks - 1; l >= 0; --l)  // reverse order
2826 +					yield |= litmus_should_yield_lock(locks[l]);
2827 +		}
2828 +		return (yield);
2829 +	}
2830 +
2831 +	void refresh() {
2832 +		budget_remaining = CHUNK_SIZE;
2833 +	}
2834 +
2835 +	bool budgetIsAvailable(size_t tosend) {
2836 +		return(tosend <= budget_remaining);
2837 +	}
2838 +
2839 +	void decreaseBudget(size_t spent) {
2840 +		budget_remaining -= spent;
2841 +	}
2842 +};
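+/* Rough usage (see chunkMemcpy() below): build a ce_lock_state for the copy
+ * engine lock(s) involved, lock(), copy in CHUNK_SIZE pieces, and whenever the
+ * per-hold budget runs out and should_yield() reports waiters, drop and
+ * re-acquire the engine lock so other tasks can interleave their chunks. */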
2843 +
2844 +// precondition: if do_locking == true, locks in state are held.
2845 +static cudaError_t __chunkMemcpy(void* a_dst, const void* a_src, size_t count,
2846 +								 enum cudaMemcpyKind kind,
2847 +								 ce_lock_state* state)
2848 +{
2849 +    cudaError_t ret = cudaSuccess;
2850 +    int remaining = count;
2851 +
2852 +    char* dst = (char*)a_dst;
2853 +    const char* src = (const char*)a_src;
2854 +
2855 +	// disable chunking, if needed, by setting chunk_size equal to the
2856 +	// amount of data to be copied.
2857 +	int chunk_size = (ENABLE_CHUNKING) ? CHUNK_SIZE : count;
2858 +	int i = 0;
2859 +
2860 +    while(remaining != 0)
2861 +    {
2862 +        int bytesToCopy = std::min(remaining, chunk_size);
2863 +
2864 +		if (state && state->locked) {
2865 +			// we have to unlock/re-lock the copy engine to refresh our budget unless
2866 +			// we still have budget available.
2867 +			if (!state->budgetIsAvailable(bytesToCopy)) {
2868 +				// optimization - don't unlock if no one else needs the engine
2869 +				if (state->should_yield()) {
2870 +					gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS); 
2871 +					cudaEventSynchronize(cur_event());
2872 +					ret = cudaGetLastError();
2873 +					if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice)
2874 +						inject_action(CE_RECV_END);
2875 +					if (kind == cudaMemcpyHostToDevice)
2876 +						inject_action(CE_SEND_END);
2877 +					gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2878 +
2879 +					state->unlock();
2880 +					if(ret != cudaSuccess)
2881 +						break;
2882 +				}
2883 +				// we can only run out of
2884 +				// budget if chunking is enabled.
2885 +				// we presume that init budget would
2886 +				// be set to cover entire memcpy
2887 +				// if chunking were disabled.
2888 +				state->refresh();
2889 +			}
2890 +		}
2891 +
2892 +		if(state && !state->locked) {
2893 +			state->lock();
2894 +			if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice)
2895 +				inject_action(CE_RECV_START);
2896 +			if (kind == cudaMemcpyHostToDevice)
2897 +				inject_action(CE_SEND_START);
2898 +		}
2899 +
2900 +        //ret = cudaMemcpy(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind);
2901 +		gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2902 +		cudaMemcpyAsync(dst+i*chunk_size, src+i*chunk_size, bytesToCopy, kind, cur_stream());
2903 +		cudaEventRecord(cur_event(), cur_stream());
2904 +		gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2905 +
2906 +		if(state)
2907 +			state->decreaseBudget(bytesToCopy);
2908 +
2909 +        ++i;
2910 +        remaining -= bytesToCopy;
2911 +    }
2912 +    return ret;
2913 +}
2914 +
2915 +static cudaError_t chunkMemcpy(void* a_dst, const void* a_src, size_t count,
2916 +							   enum cudaMemcpyKind kind,
2917 +							   int device_a = -1,  // device_a == -1 disables locking
2918 +							   bool do_locking = true,
2919 +							   int device_b = -1,
2920 +							   bool migration = false)
2921 +{
2922 +	cudaError_t ret;
2923 +	if(!do_locking || device_a == -1) {
2924 +		ret = __chunkMemcpy(a_dst, a_src, count, kind, NULL);
2925 +		gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2926 +		cudaEventSynchronize(cur_event());
2927 +		if(ret == cudaSuccess)
2928 +			ret = cudaGetLastError();
2929 +		gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2930 +	}
2931 +	else {
2932 +		ce_lock_state state(device_a, kind, count, device_b, migration);
2933 +		state.lock();
2934 +
2935 +		if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice)
2936 +			inject_action(CE_RECV_START);
2937 +		if (kind == cudaMemcpyHostToDevice)
2938 +			inject_action(CE_SEND_START);
2939 +
2940 +		ret = __chunkMemcpy(a_dst, a_src, count, kind, &state);
2941 +		gpuspin_block_litmus_signals(ALL_LITMUS_SIG_MASKS);
2942 +		cudaEventSynchronize(cur_event());
2943 +		//		cudaStreamSynchronize(cur_stream());
2944 +		if(ret == cudaSuccess)
2945 +			ret = cudaGetLastError();
2946 +
2947 +		if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice)
2948 +			inject_action(CE_RECV_END);
2949 +		if (kind == cudaMemcpyHostToDevice)
2950 +			inject_action(CE_SEND_END);
2951 +		gpuspin_unblock_litmus_signals(ALL_LITMUS_SIG_MASKS);
2952 +
2953 +		state.unlock();
2954 +	}
2955 +	return ret;
2956 +}
2957 +
2958 +int LITMUS_LOCK_FD = 0;
2959 +
2960 +int EXP_OFFSET = 0;
2961 +
2962 +void allocate_locks_litmus(void)
2963 +{
2964 +	stringstream ss;
2965 +	ss<<lock_namespace<<"-"<<EXP_OFFSET;
2966 +
2967 +	// allocate k-FMLP lock
2968 +	//LITMUS_LOCK_FD = open(lock_namespace, O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
2969 +	LITMUS_LOCK_FD = open(ss.str().c_str(), O_RDONLY | O_CREAT, S_IRUSR | S_IWUSR);
2970 +	int *fd = &LITMUS_LOCK_FD;
2971 +
2972 +	int base_name = GPU_PARTITION * 100 + EXP_OFFSET * 200;
2973 +	++EXP_OFFSET;
2974 +
2975 +	if (GPU_SYNC_MODE == IKGLP_MODE) {
2976 +		/* Standard (optimal) IKGLP */
2977 +		TOKEN_LOCK = open_gpusync_token_lock(*fd,
2978 +						base_name,  /* name */
2979 +						GPU_PARTITION_SIZE,
2980 +						GPU_PARTITION*GPU_PARTITION_SIZE,
2981 +						RHO,
2982 +						IKGLP_M_IN_FIFOS,
2983 +						(!RELAX_FIFO_MAX_LEN) ?
2984 +						IKGLP_OPTIMAL_FIFO_LEN :
2985 +						IKGLP_UNLIMITED_FIFO_LEN,
2986 +						ENABLE_AFFINITY);
2987 +	}
2988 +	else if (GPU_SYNC_MODE == KFMLP_MODE) {
2989 +		/* KFMLP. FIFO queues only for tokens. */
2990 +		TOKEN_LOCK = open_gpusync_token_lock(*fd,
2991 +						base_name,  /* name */
2992 +						GPU_PARTITION_SIZE,
2993 +						GPU_PARTITION*GPU_PARTITION_SIZE,
2994 +						RHO,
2995 +						IKGLP_UNLIMITED_IN_FIFOS,
2996 +						IKGLP_UNLIMITED_FIFO_LEN,
2997 +						ENABLE_AFFINITY);
2998 +	}
2999 +	else if (GPU_SYNC_MODE == RGEM_MODE) {
3000 +		/* RGEM-like token allocation. Shared priority queue for all tokens. */
3001 +		TOKEN_LOCK = open_gpusync_token_lock(*fd,
3002 +						base_name,  /* name */
3003 +						GPU_PARTITION_SIZE,
3004 +						GPU_PARTITION*GPU_PARTITION_SIZE,
3005 +						RHO,
3006 +						RHO*GPU_PARTITION_SIZE,
3007 +						1,
3008 +						ENABLE_AFFINITY);
3009 +	}
3010 +	else if (GPU_SYNC_MODE == IKGLP_WC_MODE) {
3011 +		/* Non-optimal IKGLP that never lets a replica idle if there are pending
3012 +		 * token requests. */
3013 +		int max_simult_run = std::max(CPU_PARTITION_SIZE, RHO*GPU_PARTITION_SIZE);
3014 +		int max_fifo_len = (int)ceil((float)max_simult_run / (RHO*GPU_PARTITION_SIZE));
3015 +		TOKEN_LOCK = open_gpusync_token_lock(*fd,
3016 +						base_name,  /* name */
3017 +						GPU_PARTITION_SIZE,
3018 +						GPU_PARTITION*GPU_PARTITION_SIZE,
3019 +						RHO,
3020 +						max_simult_run,
3021 +						(!RELAX_FIFO_MAX_LEN) ?
3022 +							max_fifo_len :
3023 +							IKGLP_UNLIMITED_FIFO_LEN,
3024 +						ENABLE_AFFINITY);
3025 +	}
3026 +	else {
3027 +		fprintf(stderr, "Invalid GPUSync mode specified\n");
3028 +		TOKEN_LOCK = -1;
3029 +	}
3030 +
3031 +	if(TOKEN_LOCK < 0)
3032 +		perror("open_token_sem");
3033 +
3034 +	if(USE_ENGINE_LOCKS)
3035 +	{
3036 +		assert(NUM_COPY_ENGINES == 1 || NUM_COPY_ENGINES == 2);
3037 +		assert((NUM_COPY_ENGINES == 1 && !RESERVED_MIGR_COPY_ENGINE) || NUM_COPY_ENGINES == 2);
3038 +
3039 +		// allocate the engine locks.
3040 +		for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
3041 +		{
3042 +			int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
3043 +			int ee_name = (i+1)*10 + base_name;
3044 +			int ce_0_name = (i+1)*10 + base_name + 1;
3045 +			int ce_1_name = (i+1)*10 + base_name + 2;
3046 +			int ee_lock = -1, ce_0_lock = -1, ce_1_lock = -1;
3047 +
3048 +			open_sem_t openEngineLock = (ENGINE_LOCK_TYPE == FIFO) ?
3049 +				open_fifo_sem : open_prioq_sem;
3050 +
3051 +			ee_lock = openEngineLock(*fd, ee_name);
3052 +			if (ee_lock < 0)
3053 +				perror("open_*_sem (engine lock)");
3054 +
3055 +			ce_0_lock = openEngineLock(*fd, ce_0_name);
3056 +			if (ce_0_lock < 0)
3057 +				perror("open_*_sem (engine lock)");
3058 +
3059 +			if (NUM_COPY_ENGINES == 2)
3060 +			{
3061 +				ce_1_lock = openEngineLock(*fd, ce_1_name);
3062 +				if (ce_1_lock < 0)
3063 +					perror("open_*_sem (engine lock)");
3064 +			}
3065 +
3066 +			EE_LOCKS[idx] = ee_lock;
3067 +
3068 +			if (NUM_COPY_ENGINES == 1)
3069 +			{
3070 +				// share locks
3071 +				CE_SEND_LOCKS[idx] = ce_0_lock;
3072 +				CE_RECV_LOCKS[idx] = ce_0_lock;
3073 +				CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
3074 +				CE_MIGR_RECV_LOCKS[idx] = ce_0_lock;
3075 +			}
3076 +			else
3077 +			{
3078 +				assert(NUM_COPY_ENGINES == 2);
3079 +
3080 +				if (RESERVED_MIGR_COPY_ENGINE) {
3081 +					// copy engine dedicated to migration operations
3082 +					CE_SEND_LOCKS[idx] = ce_0_lock;
3083 +					CE_RECV_LOCKS[idx] = ce_0_lock;
3084 +					CE_MIGR_SEND_LOCKS[idx] = ce_1_lock;
3085 +					CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
3086 +				}
3087 +				else {
3088 +					// migration transmissions treated as regular data
3089 +					CE_SEND_LOCKS[idx] = ce_0_lock;
3090 +					CE_RECV_LOCKS[idx] = ce_1_lock;
3091 +					CE_MIGR_SEND_LOCKS[idx] = ce_0_lock;
3092 +					CE_MIGR_RECV_LOCKS[idx] = ce_1_lock;
3093 +				}
3094 +			}
3095 +		}
3096 +	}
3097 +}
3098 +
3099 +void deallocate_locks_litmus(void)
3100 +{
3101 +	for (int i = 0; i < GPU_PARTITION_SIZE; ++i)
3102 +	{
3103 +		int idx = GPU_PARTITION*GPU_PARTITION_SIZE + i;
3104 +
3105 +		od_close(EE_LOCKS[idx]);
3106 +		if (NUM_COPY_ENGINES == 1)
3107 +		{
3108 +			od_close(CE_SEND_LOCKS[idx]);
3109 +		}
3110 +		else
3111 +		{
3112 +			if (RESERVED_MIGR_COPY_ENGINE) {
3113 +				od_close(CE_SEND_LOCKS[idx]);
3114 +				od_close(CE_MIGR_SEND_LOCKS[idx]);
3115 +			}
3116