/*
 * Copyright (c) 2010, 2011, 2012, ETH Zurich.
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached LICENSE file.
 * If you do not find this file, copies can be found by writing to:
 * ETH Zurich D-INFK, CAB F.78, Universitaetstr. 6, CH-8092 Zurich,
 * Attn: Systems Group.
 */

#include <barrelfish/barrelfish.h>
#include <barrelfish/spawn_client.h>

#include <flounder/flounder_txqueue.h>
#include <spawndomain/spawndomain.h>

#include <xeon_phi/xeon_phi.h>
#include <xeon_phi/xeon_phi_client.h>
#include <xeon_phi/xeon_phi_domain.h>

#include <bomp_internal.h>
#include <xomp/xomp.h>

#include <if/xomp_defs.h>

#include <xomp_debug.h>

/// number of virtual threads per local worker (the worker thread plus its vthreads)
#define XOMP_VTHREADS (XOMP_VTHREAD_COUNT + 1)

/**
 * \brief worker state enumeration.
 *
 * Describes the possible states a worker can be in.
 */
typedef enum xomp_worker_state
{
    XOMP_WORKER_ST_INVALID  = 0,    ///< this worker has not been initialized
    XOMP_WORKER_ST_FAILURE  = 1,    ///< an error occurred during an operation
    XOMP_WORKER_ST_SPAWNING = 2,    ///< worker is being spawned
    XOMP_WORKER_ST_SPAWNED  = 3,    ///< worker is spawned and connected to master
    XOMP_WORKER_ST_READY    = 4,    ///< worker is ready to service requests
    XOMP_WORKER_ST_BUSY     = 5     ///< worker is busy servicing requests
} xomp_worker_st_t;

/**
 * \brief worker type enumeration
 *
 * Describes the possible worker types, i.e. where the worker domain runs.
 */
typedef enum xomp_worker_type
{
    XOMP_WORKER_TYPE_INVALID = 0,  ///< invalid worker type (not initialized)
    XOMP_WORKER_TYPE_LOCAL   = 1,  ///< worker runs local to master
    XOMP_WORKER_TYPE_REMOTE  = 2   ///< worker runs remote to master
} xomp_worker_type_t;

/**
 * \brief XOMP worker
 */
struct xomp_worker
{
    xomp_wid_t id;                  ///< worker ID
    xomp_worker_type_t type;        ///< worker type
    xomp_worker_st_t state;         ///< worker state
    xphi_dom_id_t domainid;         ///< domain ID of the worker

    struct xomp_binding *binding;   ///< control channel binding
    struct tx_queue txq;            ///< Flounder TX queue

    errval_t err;                   ///< error number in case an error occurred
    uint8_t add_mem_st;             ///< state flag while adding a frame

    struct capref msgframe;         ///< messaging frame + TLS for the worker
    lpaddr_t msgbase;               ///< physical base of the messaging frame
    void *msgbuf;                   ///< where the messaging frame is mapped

    void *tls;                      ///< pointer to the thread local storage

#if XOMP_BENCH_ENABLED
    cycles_t start;                 ///< start time of the operation
    uint32_t index;                 ///< index of this worker (benchmark slot)
#endif
};

/**
 * \brief XOMP master
 */
struct xomp_master
{
    uint32_t numworker;                 ///< total number of workers spawned
    struct {
        uint32_t num;                   ///< number of local workers
        struct xomp_worker *workers;    ///< array of local workers
        uint32_t next;                  ///< next local worker to "allocate"
    } local;
    struct {
        uint32_t num;                   ///< number of remote workers
        struct xomp_worker *workers;    ///< array of remote workers
        uint32_t next;                  ///< next remote worker to "allocate"
    } remote;
};

/**
 * \brief Message state for the TX queue
 */
struct xomp_msg_st
{
    struct txq_msg_st common;       ///< common msg state
    /* union of arguments */
    union {
        struct {
            uint64_t fn;
            uint64_t arg;
            uint64_t id;
            uint64_t flags;
        } do_work;
        struct {
            struct capref frame;
            uint64_t vaddr;
            uint8_t type;
        } add_mem;
    } args;
};

/// initialized flag
static uint8_t xomp_master_initialized = 0x0;

/// XOMP master
static struct xomp_master xmaster;

/// exported service iref (for local workers)
static iref_t svc_iref;

/// number of present Xeon Phis
static uint8_t num_phi = 0;

/// where the workers are spawned (local, mixed or remote)
static xomp_wloc_t worker_loc = XOMP_WORKER_LOC_MIXED;

/// stride for core allocation (in case of hyperthreads)
static coreid_t core_stride;

/// arguments to supply to the local spawned workers
static struct xomp_spawn spawn_args_local;

/// arguments to supply to the remote spawned workers
static struct xomp_spawn spawn_args_remote;

/// buffer for the worker id argument
static char worker_id_buf[26];

/// buffer for the iref argument
static char iref_buf[19];
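
/*
 * Note on buffer sizes: "--wid=%016" PRIx64 expands to 6 + 16 characters
 * (22 plus the terminating NUL), and "--iref=0x%08x" to 9 + 8 characters
 * (17 plus NUL), so the 26 and 19 byte buffers above are sufficient.
 */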

#if XOMP_BENCH_ENABLED

#include <bench/bench.h>

static bench_ctl_t **xomp_bench_mem_add;
static bench_ctl_t **xomp_bench_do_work;
static bench_ctl_t **xomp_bench_spawn;

#endif

#if XOMP_BENCH_MASTER_EN
static cycles_t local_timer;
static cycles_t remote_timer;
#endif

/**
 * \brief enters the barrier when a worker has finished its work; this function
 *        is called on the main thread (master domain)
 *
 * \param barrier   The barrier to enter
 */
static inline void xbomp_barrier_enter_no_wait(struct bomp_barrier *barrier)
{
    if (__sync_fetch_and_add(&barrier->counter, 1) == (barrier->max - 1)) {
        barrier->counter = 0;
        barrier->cycle = !barrier->cycle;
    }
}
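
/*
 * The function above is the non-blocking half of a sense-reversing barrier:
 * the last arrival resets the counter and flips the cycle flag, which
 * presumably releases any threads spinning on the cycle in the blocking
 * variant, while the master itself never waits.
 */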

#define XOMP_LOCAL_THREADS_MAX 10

/*
 * ----------------------------------------------------------------------------
 * Helper functions
 * ----------------------------------------------------------------------------
 */
static inline uint32_t xomp_master_get_local_threads(uint32_t nthreads)
{
    switch (worker_loc) {
        case XOMP_WORKER_LOC_LOCAL:
            return nthreads - 1;
        case XOMP_WORKER_LOC_MIXED:
#if XOMP_LOCAL_THREADS_MAX
            if (nthreads > XOMP_LOCAL_THREADS_MAX) {
                return XOMP_LOCAL_THREADS_MAX - 1;
            } else {
                return nthreads - (num_phi * ((nthreads) / (1 + num_phi))) - 1;
            }
#else
            return nthreads - (num_phi * ((nthreads) / (1 + num_phi))) - 1;
#endif
        case XOMP_WORKER_LOC_REMOTE:
            return 0;
        default:
            USER_PANIC("unknown worker location!");
    }
    USER_PANIC("unknown worker location!");
    return 0;
}

static inline uint32_t xomp_master_get_remote_threads(uint32_t nthreads)
{
    switch (worker_loc) {
        case XOMP_WORKER_LOC_LOCAL:
            return 0;
        case XOMP_WORKER_LOC_MIXED:
#if XOMP_LOCAL_THREADS_MAX
            if (nthreads > XOMP_LOCAL_THREADS_MAX) {
                return nthreads - XOMP_LOCAL_THREADS_MAX;
            } else {
                return ((nthreads) / (1 + num_phi)) * num_phi;
            }
#else
            return ((nthreads) / (1 + num_phi)) * num_phi;
#endif
        case XOMP_WORKER_LOC_REMOTE:
            return nthreads - 1;
        default:
            USER_PANIC("unknown worker location!");
    }

    return 0;
}
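
/*
 * Worked example for the split above (illustrative values): with nthreads = 8,
 * num_phi = 2 and XOMP_WORKER_LOC_MIXED, 8 <= XOMP_LOCAL_THREADS_MAX, so
 * remote = (8 / 3) * 2 = 4 and local = 8 - 4 - 1 = 3; together with the
 * master thread this accounts for all 8 threads.
 */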

/*
 * ----------------------------------------------------------------------------
 * XOMP channel send handlers
 * ----------------------------------------------------------------------------
 */

static errval_t do_work_tx(struct txq_msg_st *msg_st)
{
    struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;

    return xomp_do_work__tx(msg_st->queue->binding, TXQCONT(msg_st),
                            st->args.do_work.fn, st->args.do_work.arg,
                            st->args.do_work.id, st->args.do_work.flags);
}

static errval_t gw_req_memory_call_tx(struct txq_msg_st *msg_st)
{
    struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;

    return xomp_gw_req_memory_call__tx(msg_st->queue->binding, TXQCONT(msg_st),
                                       st->args.add_mem.vaddr,
                                       st->args.add_mem.type);
}

static errval_t add_memory_call_tx(struct txq_msg_st *msg_st)
{
    struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;

    return xomp_add_memory_call__tx(msg_st->queue->binding, TXQCONT(msg_st),
                                    st->args.add_mem.frame,
                                    st->args.add_mem.vaddr, st->args.add_mem.type);
}

/*
 * ----------------------------------------------------------------------------
 * XOMP channel receive handlers
 * ----------------------------------------------------------------------------
 */

static void gw_req_memory_response_rx(struct xomp_binding *b,
                                      errval_t msgerr)
{
    XMP_DEBUG("gw_req_memory_response_rx: %s\n", err_getstring(msgerr));

    struct xomp_worker *worker = b->st;

    worker->err = msgerr;
    worker->add_mem_st = 0x2;

#if XOMP_BENCH_ENABLED
    cycles_t timer = bench_tsc();
    if (xomp_bench_mem_add) {
        timer = bench_time_diff(worker->start, timer);
        bench_ctl_add_run(xomp_bench_mem_add[1], &timer);
        bench_ctl_add_run(xomp_bench_mem_add[2 + worker->index], &timer);
    }
#endif

#if XOMP_BENCH_MASTER_EN
    cycles_t duration = bench_tsc() - worker->start;
    remote_timer += duration;
    debug_printf("remote worker %016lx: add memory took %lu cycles, %lu ms\n",
                 worker->id, duration, bench_tsc_to_ms(duration));
#endif
}

static void add_memory_response_rx(struct xomp_binding *b,
                                   errval_t msgerr)
{
    XMP_DEBUG("add_memory_response_rx: %s\n", err_getstring(msgerr));

    struct xomp_worker *worker = b->st;

    worker->err = msgerr;
    worker->add_mem_st = 0x2;

#if XOMP_BENCH_ENABLED
    cycles_t timer = bench_tsc();
    if (xomp_bench_mem_add) {
        timer = bench_time_diff(worker->start, timer);
        bench_ctl_add_run(xomp_bench_mem_add[0], &timer);
        bench_ctl_add_run(xomp_bench_mem_add[2 + worker->index], &timer);
    }
#endif

#if XOMP_BENCH_MASTER_EN
    cycles_t duration = bench_tsc() - worker->start;
    local_timer += duration;
    debug_printf("local worker %016lx: add memory took %lu cycles, %lu ms\n",
                 worker->id, duration, bench_tsc_to_ms(duration));
#endif
}

static inline void done_msg_common(struct xomp_binding *b,
                                   uint64_t tid,
                                   errval_t msgerr)
{
    struct xomp_task *task = (struct xomp_task *) tid;

    struct xomp_worker *worker = b->st;
    if (err_is_fail(msgerr)) {
        worker->state = XOMP_WORKER_ST_FAILURE;
    } else {
        worker->state = XOMP_WORKER_ST_READY;
    }

#if XOMP_BENCH_ENABLED
    cycles_t timer = bench_tsc();
    if (xomp_bench_do_work) {
        timer = bench_time_diff(worker->start, timer);
        if (worker->type == XOMP_WORKER_TYPE_LOCAL) {
            bench_ctl_add_run(xomp_bench_do_work[0], &timer);
        } else if (worker->type == XOMP_WORKER_TYPE_REMOTE) {
            bench_ctl_add_run(xomp_bench_do_work[1], &timer);
        }
        bench_ctl_add_run(xomp_bench_do_work[2 + worker->index], &timer);
    }
#endif

#if XOMP_BENCH_MASTER_EN
    cycles_t duration = bench_tsc() - worker->start;
    debug_printf("generic worker %u, %lu cycles, %lu ms\n",
                 (uint16_t) worker->id, duration, bench_tsc_to_ms(duration));
#endif

    xbomp_barrier_enter_no_wait(task->barrier);

    /* if the last worker returns, free up the task data structure */
    task->done++;
    if (task->done == task->total_threads) {
        free(task);
    }
}

static void done_with_arg_rx(struct xomp_binding *b,
                             uint64_t tid,
                             uint64_t arg,
                             errval_t msgerr)
{
    XMP_DEBUG("done_with_arg_rx: arg:%lx, id:%lx\n", arg, tid);

    done_msg_common(b, tid, msgerr);

    /* XXX: do something with the argument */
}

static void done_notify_rx(struct xomp_binding *b,
                           uint64_t tid,
                           errval_t msgerr)
{
    XMP_DEBUG("done_notify_rx: id:%lx\n", tid);

    done_msg_common(b, tid, msgerr);
}

static struct xomp_rx_vtbl rx_vtbl = {
    .gw_req_memory_response = gw_req_memory_response_rx,
    .add_memory_response = add_memory_response_rx,
    .done_notify = done_notify_rx,
    .done_with_arg = done_with_arg_rx
};

/*
 * ----------------------------------------------------------------------------
 * XOMP channel connect handler
 * ----------------------------------------------------------------------------
 */

static errval_t xomp_svc_connect_cb(void *st,
                                    struct xomp_binding *xb)
{
    struct xomp_worker *worker = xmaster.local.workers + xmaster.local.next++;

    XMI_DEBUG("xomp_svc_connect_cb:%lx connected: %p\n", worker->id, worker);

    xb->rx_vtbl = rx_vtbl;
    xb->st = worker;

    txq_init(&worker->txq, xb, xb->waitset, (txq_register_fn_t) xb->register_send,
             sizeof(struct xomp_msg_st));

    worker->binding = xb;
    worker->state = XOMP_WORKER_ST_SPAWNED;

    return SYS_ERR_OK;
}

static void xomp_svc_export_cb(void *st,
                               errval_t err,
                               iref_t iref)
{
    XMI_DEBUG("Service exported @ iref:%"PRIuIREF", %s\n", iref, err_getstring(err));

    svc_iref = iref;
}

/**
 * \brief XOMP channel connect callback called by the Flounder backend
 *
 * \param st    Supplied worker state
 * \param err   outcome of the connect attempt
 * \param xb    XOMP Flounder binding
 */
static void worker_connect_cb(void *st,
                              errval_t err,
                              struct xomp_binding *xb)
{
    struct xomp_worker *worker = st;

    XMI_DEBUG("worker:%lx connected: %s\n", worker->id, err_getstring(err));

    if (err_is_fail(err)) {
        worker->state = XOMP_WORKER_ST_FAILURE;
        return;
    }

    xb->rx_vtbl = rx_vtbl;
    xb->st = worker;

    txq_init(&worker->txq, xb, xb->waitset, (txq_register_fn_t) xb->register_send,
             sizeof(struct xomp_msg_st));

    worker->binding = xb;
    worker->state = XOMP_WORKER_ST_SPAWNED;
}

/*
 * ============================================================================
 * Public Interface
 * ============================================================================
 */

/**
 * \brief initializes the Xeon Phi OpenMP (XOMP) master library
 *
 * \param args struct containing the master initialization values
 *
 * \returns SYS_ERR_OK on success
 *          errval on failure
 */
errval_t xomp_master_init(struct xomp_args *args)
{
    errval_t err;

    if (xomp_master_initialized) {
        XMI_DEBUG("WARNING: XOMP master already initialized\n");
        return SYS_ERR_OK;
    }

    if (args->type == XOMP_ARG_TYPE_WORKER) {
        return -1;  // TODO: ERRNO
    }

#if XOMP_BENCH_MASTER_EN
    bench_init();
#endif

    if (args->core_stride != 0) {
        core_stride = args->core_stride;
    } else {
        core_stride = BOMP_DEFAULT_CORE_STRIDE;
    }

    if (args->type == XOMP_ARG_TYPE_UNIFORM) {
        num_phi = args->args.uniform.nphi;
        worker_loc = args->args.uniform.worker_loc;
    } else {
        num_phi = args->args.distinct.nphi;
        worker_loc = args->args.distinct.worker_loc;
    }

    XMI_DEBUG("Initializing XOMP master with nthreads:%u, nphi:%u\n",
              args->args.uniform.nthreads, args->args.uniform.nphi);

    /* exporting the interface for local workers */
    err = xomp_export(NULL, xomp_svc_export_cb, xomp_svc_connect_cb,
                      get_default_waitset(), IDC_EXPORT_FLAGS_DEFAULT);
    if (err_is_fail(err)) {
        return err;
    }

    while (svc_iref == 0) {
        err = event_dispatch(get_default_waitset());
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "event dispatch\n");
        }
    }

    char **argv = NULL;

    if (args->type == XOMP_ARG_TYPE_UNIFORM) {
        spawn_args_local.argc = args->args.uniform.argc;
        spawn_args_remote.argc = args->args.uniform.argc;

        err = xomp_master_build_path(&spawn_args_local.path, &spawn_args_remote.path);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "could not build the path");
        }
        argv = args->args.uniform.argv;
    } else {
        spawn_args_local.argc = args->args.distinct.local.argc;
        spawn_args_local.path = args->args.distinct.local.path;
        spawn_args_remote.path = args->args.distinct.remote.path;
        argv = args->args.distinct.local.argv;
    }

    /* argc original arguments, three appended arguments plus NULL terminator */
    spawn_args_local.argv = calloc(spawn_args_local.argc + 4, sizeof(char *));
    if (spawn_args_local.argv == NULL) {
        return LIB_ERR_MALLOC_FAIL;
    }

    for (uint8_t i = 0; i < spawn_args_local.argc; ++i) {
        spawn_args_local.argv[i] = argv[i];
    }

    spawn_args_local.argv[spawn_args_local.argc++] = XOMP_WORKER_ARG;
    spawn_args_local.argv[spawn_args_local.argc++] = worker_id_buf;
    spawn_args_local.argv[spawn_args_local.argc++] = iref_buf;
    spawn_args_local.argv[spawn_args_local.argc] = NULL;

    snprintf(iref_buf, sizeof(iref_buf), "--iref=0x%08x", svc_iref);

    /* remote initialization */

    if (args->type == XOMP_ARG_TYPE_DISTINCT) {
        argv = args->args.distinct.remote.argv;
        spawn_args_remote.argc = args->args.distinct.remote.argc;
    }

    /* argc original arguments, two appended arguments plus NULL terminator */
    spawn_args_remote.argv = calloc(spawn_args_remote.argc + 3, sizeof(char *));
    if (spawn_args_remote.argv == NULL) {
        free(spawn_args_local.argv);
        return LIB_ERR_MALLOC_FAIL;
    }

    for (uint8_t i = 0; i < spawn_args_remote.argc; ++i) {
        spawn_args_remote.argv[i] = argv[i];
    }

    spawn_args_remote.argv[spawn_args_remote.argc++] = XOMP_WORKER_ARG;
    spawn_args_remote.argv[spawn_args_remote.argc++] = worker_id_buf;
    spawn_args_remote.argv[spawn_args_remote.argc] = NULL;

    xomp_master_initialized = 0x1;

    return SYS_ERR_OK;
}
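
/*
 * Usage sketch (hypothetical caller; the field names follow the accesses made
 * above, the values are purely illustrative):
 *
 *   errval_t err;
 *   struct xomp_args args = {
 *       .type = XOMP_ARG_TYPE_UNIFORM,
 *       .core_stride = 0,             // 0 selects BOMP_DEFAULT_CORE_STRIDE
 *   };
 *   args.args.uniform.nthreads = 8;
 *   args.args.uniform.nphi = 2;
 *   args.args.uniform.worker_loc = XOMP_WORKER_LOC_MIXED;
 *   args.args.uniform.argc = argc;
 *   args.args.uniform.argv = argv;
 *
 *   err = xomp_master_init(&args);
 *   if (err_is_ok(err)) {
 *       err = xomp_master_spawn_workers(args.args.uniform.nthreads);
 *   }
 */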

/**
 * \brief Spawns the worker threads on the host and the Xeon Phis
 *
 * \param nworkers    Total number of workers, including the master
 *
 * \returns SYS_ERR_OK on success
 *          errval on failure
 */
errval_t xomp_master_spawn_workers(uint32_t nworkers)
{
    errval_t err;

    if (!xomp_master_initialized) {
        return XOMP_ERR_MASTER_NOT_INIT;
    }

    xmaster.numworker = nworkers;

    struct xomp_worker *workers = calloc(nworkers, sizeof(struct xomp_worker));
    if (workers == NULL) {
        return LIB_ERR_MALLOC_FAIL;
    }

    uint32_t remote_threads = xomp_master_get_remote_threads(nworkers);
    uint32_t local_threads = xomp_master_get_local_threads(nworkers);

    xmaster.local.next = 0;
    xmaster.remote.next = 0;
    xmaster.local.num = local_threads;
    xmaster.remote.num = remote_threads;
    xmaster.local.workers = workers;

    if (remote_threads > 0) {
        err = spawn_symval_cache_init(0);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "domain not spawned with appropriate flags\n");
            return err;
        }
    }

    if (num_phi > 0) {
        xmaster.remote.workers = workers + local_threads;
    }

    XMI_DEBUG("spawning %u workers: local:%u, remote: %ux%u\n", nworkers - 1,
              local_threads, num_phi,
              (num_phi != 0 ? remote_threads / num_phi : remote_threads));

    assert((remote_threads + local_threads + 1) == nworkers);

    xphi_id_t xid = 0;
    coreid_t core = disp_get_core_id() + core_stride;

#if XOMP_BENCH_MASTER_EN
    cycles_t spawn_timer;
    cycles_t remote_spawn_timer = 0;
    cycles_t remote_connect_timer = 0;
    cycles_t local_spawn_timer = 0;
    cycles_t local_connect_timer = 0;
#endif

    for (uint32_t i = 0; i < remote_threads + local_threads; ++i) {
#ifdef __k1om__
        if (xid == disp_xeon_phi_id()) {
            xid = (xid + 1) % num_phi;
        }
#endif
        if (i == local_threads) {
            core = XOMP_REMOTE_COREID_START;
        }

        struct xomp_worker *worker = workers + i;

#if XOMP_BENCH_ENABLED
        worker->index = i;
        worker->start = bench_tsc();
#endif

#ifndef __k1om__
        /*
         * XXX: we have to set the RAM affinity in order to have a higher chance
         *      that the node is found on the Xeon Phi. It may be split up
         *      otherwise.
         */
        uint64_t min_base, max_limit;
        ram_get_affinity(&min_base, &max_limit);
        ram_set_affinity(XOMP_RAM_MIN_BASE, XOMP_RAM_MAX_LIMIT);
#endif

        if (i < local_threads) {
            err = frame_alloc(&worker->msgframe, XOMP_TLS_SIZE, NULL);
        } else {
            err = frame_alloc(&worker->msgframe, XOMP_FRAME_SIZE, NULL);
        }

#ifndef __k1om__
        ram_set_affinity(min_base, max_limit);
#endif

        if (err_is_fail(err)) {
            /* TODO: cleanup */
            worker->state = XOMP_WORKER_ST_FAILURE;
            return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
        }

        struct frame_identity id;
        err = frame_identify(worker->msgframe, &id);
        if (err_is_fail(err)) {
            /* TODO: cleanup */
            return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
        }

        /* TODO: build a good id */
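        /*
         * Provisional ID layout, as assembled below:
         *   [63:48]  domain ID of the master
         *   [47:32]  core the worker is spawned on
         *   [31:24]  Xeon Phi ID, or 0xFF for a local worker
         *   [23: 0]  worker index + 1; XOMP_WID_GATEWAY_FLAG is OR-ed in
         *            later for the first worker spawned on each card
         */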
        worker->id = ((uint64_t) disp_get_domain_id()) << 48 | ((uint64_t)core) << 32;
        if (i < local_threads) {
            worker->id |= ((uint64_t)0xFF) << 24;
        } else {
            worker->id |= ((uint64_t)xid) << 24;
        }
        worker->id |= i+1;

        worker->msgbase = id.base;
        worker->state = XOMP_WORKER_ST_SPAWNING;

        err = vspace_map_one_frame(&worker->msgbuf, id.bytes,
                                   worker->msgframe, NULL, NULL);
        if (err_is_fail(err)) {
            /* TODO: cleanup */
            return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
        }

        XMI_DEBUG("messaging frame mapped: [%016lx] @ [%016lx]\n",
                  worker->msgbase, (lvaddr_t) worker->msgbuf);

        if (i < local_threads) {
            snprintf(worker_id_buf, sizeof(worker_id_buf), "--wid=%016"PRIx64,
                     worker->id);
            /*
             * TODO: set a gateway domain for each NUMA node as it is done with
             *       the Xeon Phi
             */
            worker->tls = worker->msgbuf;

            XMI_DEBUG("spawning {%s} on host, core:%u\n", spawn_args_local.path,
                      core);
#if XOMP_BENCH_MASTER_EN
            spawn_timer = bench_tsc();
#endif

            domainid_t did;
            err = spawn_program_with_caps(core, spawn_args_local.path,
                                          spawn_args_local.argv, NULL, NULL_CAP,
                                          worker->msgframe, SPAWN_FLAGS_OMP,
                                          &did);
#if XOMP_BENCH_MASTER_EN
            local_spawn_timer += bench_tsc() - spawn_timer;
            spawn_timer = bench_tsc();
#endif
            worker->domainid = did;
            worker->type = XOMP_WORKER_TYPE_LOCAL;
            if (err_is_fail(err)) {
                /* TODO: cleanup */
                return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
            }

            core += core_stride;
        } else {
            /*
             * we give the first worker domain on each card the gateway flag so
             * it initializes the gateway service while the others connect to it
             */
            if (core == XOMP_REMOTE_COREID_START) {
                worker->id |= XOMP_WID_GATEWAY_FLAG;
            }

            snprintf(worker_id_buf, sizeof(worker_id_buf), "--wid=%016"PRIx64,
                     worker->id);

            worker->tls = ((uint8_t *) worker->msgbuf) + XOMP_MSG_FRAME_SIZE;

            struct xomp_frameinfo fi = {
                .sendbase = worker->msgbase + XOMP_MSG_CHAN_SIZE,
                .inbuf = worker->msgbuf,
                .inbufsize = XOMP_MSG_CHAN_SIZE,
                .outbuf = ((uint8_t *) worker->msgbuf) + XOMP_MSG_CHAN_SIZE,
                .outbufsize = XOMP_MSG_CHAN_SIZE
            };

            err = xomp_accept(&fi, worker, worker_connect_cb,
                              get_default_waitset(), IDC_EXPORT_FLAGS_DEFAULT);
            if (err_is_fail(err)) {
                /* TODO: cleanup */
                return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
            }

            XMI_DEBUG("spawning {%s} on xid:%u, core:%u\n",
                      spawn_args_remote.path, xid, core);
#if XOMP_BENCH_MASTER_EN
            spawn_timer = bench_tsc();
#endif
            err = xeon_phi_client_spawn(xid, core, spawn_args_remote.path,
                                        spawn_args_remote.argv, worker->msgframe,
                                        SPAWN_FLAGS_OMP, &worker->domainid);
#if XOMP_BENCH_MASTER_EN
            remote_spawn_timer += bench_tsc() - spawn_timer;
            spawn_timer = bench_tsc();
#endif
            if (err_is_fail(err)) {
                /* TODO: cleanup */
                return err_push(err, XOMP_ERR_SPAWN_WORKER_FAILED);
            }
            worker->type = XOMP_WORKER_TYPE_REMOTE;
            xid++;
        }

        XMI_DEBUG("waiting for client %p to connect...\n", worker);

        while (worker->state == XOMP_WORKER_ST_SPAWNING) {
            err = event_dispatch(get_default_waitset());
            if (err_is_fail(err)) {
                USER_PANIC_ERR(err, "event dispatch\n");
            }
        }
#if XOMP_BENCH_MASTER_EN
        if (worker->type == XOMP_WORKER_TYPE_REMOTE) {
            remote_connect_timer += bench_tsc() - spawn_timer;
        } else {
            local_connect_timer += bench_tsc() - spawn_timer;
        }
#endif

        if (worker->state == XOMP_WORKER_ST_FAILURE) {
            return XOMP_ERR_SPAWN_WORKER_FAILED;
        }

#if XOMP_BENCH_ENABLED
        cycles_t timer = bench_tsc();
        if (xomp_bench_spawn) {
            timer = bench_time_diff(worker->start, timer);
            if (i < local_threads) {
                bench_ctl_add_run(xomp_bench_spawn[0], &timer);
            } else {
                bench_ctl_add_run(xomp_bench_spawn[1], &timer);
            }
        }
#endif

        worker->state = XOMP_WORKER_ST_READY;

        if (i >= local_threads) {
            if (xid == num_phi) {
                xid = 0;
                core++;  // no stride on the Xeon Phi
            }
        }
    }

#if XOMP_BENCH_MASTER_EN
    remote_spawn_timer /= (remote_threads ? remote_threads : 1);
    local_spawn_timer /= (local_threads ? local_threads : 1);
    remote_connect_timer /= (remote_threads ? remote_threads : 1);
    local_connect_timer /= (local_threads ? local_threads : 1);
    debug_printf("Avg spawn time remote: %lu cycles, %lu ms\n",
                 remote_spawn_timer, bench_tsc_to_ms(remote_spawn_timer));
    debug_printf("Avg spawn time local: %lu cycles, %lu ms\n",
                 local_spawn_timer, bench_tsc_to_ms(local_spawn_timer));
    debug_printf("Avg connect time remote: %lu cycles, %lu ms\n",
                 remote_connect_timer, bench_tsc_to_ms(remote_connect_timer));
    debug_printf("Avg connect time local: %lu cycles, %lu ms\n",
                 local_connect_timer, bench_tsc_to_ms(local_connect_timer));
#endif

    xmaster.local.next = 0;
    xmaster.remote.next = 0;

    return SYS_ERR_OK;
}

/**
 * \brief Adds a memory region to be used for work
 *
 * \param frame Frame to be shared
 * \param info  information about the frame, i.e. the virtual address to map it at
 * \param type  Type of the frame
 *
 * \returns SYS_ERR_OK on success
 *          errval on error
 */
errval_t xomp_master_add_memory(struct capref frame,
                                uint64_t info,
                                xomp_frame_type_t type)
{
    errval_t err;

    if (!xomp_master_initialized) {
        return XOMP_ERR_MASTER_NOT_INIT;
    }

#if XOMP_BENCH_MASTER_EN
    remote_timer = 0;
    local_timer = 0;
#endif

    struct xomp_worker *worker;

    XMI_DEBUG("adding memory of type %u @ info: %016lx\n", type, info);

    /*
     * We add the memory to the worker domains with the Xeon Phi gateway
     * domains first. This is expected to take the longest time (potential
     * replication and going through the Xeon Phi drivers).
     *
     * For the subsequent worker domains, we just send the messages
     * asynchronously.
     */
    for (uint32_t i = 0; i < xmaster.remote.num; ++i) {
        worker = &xmaster.remote.workers[i];
#if XOMP_BENCH_ENABLED
        worker->start = bench_tsc();
#endif
        if (worker->id & XOMP_WID_GATEWAY_FLAG) {
            xphi_id_t xid = xeon_phi_domain_get_xid(worker->domainid);
            err = xeon_phi_client_chan_open(xid, worker->domainid, info, frame,
                                            type);
            if (err_is_fail(err)) {
                worker->state = XOMP_WORKER_ST_FAILURE;
                /*
                 * XXX: if the gateway domain fails, the entire node is not
                 *      operational.
                 */
                return err;
            }
#if XOMP_BENCH_ENABLED
            if (xomp_bench_mem_add) {
                cycles_t timer = bench_tsc();
                timer = bench_time_diff(worker->start, timer);
                bench_ctl_add_run(xomp_bench_mem_add[1], &timer);
                bench_ctl_add_run(xomp_bench_mem_add[2 + worker->index], &timer);
            }
#endif
#if XOMP_BENCH_MASTER_EN
            cycles_t duration = bench_tsc() - worker->start;
            debug_printf("remote worker %lx: chan open took %lu cycles, %lu ms\n",
                         worker->id, duration, bench_tsc_to_ms(duration));
            remote_timer += duration;
#endif
        } else {
            assert(worker->add_mem_st == 0x0);

            worker->add_mem_st = 0x1;

            struct txq_msg_st *msg_st = txq_msg_st_alloc(&worker->txq);
            if (msg_st == NULL) {
                return LIB_ERR_MALLOC_FAIL;
            }

            msg_st->send = gw_req_memory_call_tx;
            msg_st->cleanup = NULL;

            struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;
            st->args.add_mem.vaddr = info;
            st->args.add_mem.type = type;

            txq_send(msg_st);
        }
    }

    /* send the memory caps to the local workers directly */
    for (uint32_t i = 0; i < xmaster.local.num; ++i) {
        worker = &xmaster.local.workers[i];
#if XOMP_BENCH_ENABLED
        worker->start = bench_tsc();
#endif
        assert(worker->type == XOMP_WORKER_TYPE_LOCAL);
        assert(worker->add_mem_st == 0x0);

        worker->add_mem_st = 0x1;

        struct txq_msg_st *msg_st = txq_msg_st_alloc(&worker->txq);
        if (msg_st == NULL) {
            return LIB_ERR_MALLOC_FAIL;
        }

        msg_st->send = add_memory_call_tx;
        msg_st->cleanup = NULL;

        struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;
        st->args.add_mem.frame = frame;
        st->args.add_mem.vaddr = info;

        // XXX: overriding replication on the host for now
        if (type == XOMP_FRAME_TYPE_REPL_RW) {
            st->args.add_mem.type = XOMP_FRAME_TYPE_SHARED_RW;
        } else {
            st->args.add_mem.type = type;
        }

        txq_send(msg_st);
    }

    /* wait for the replies */

    for (uint32_t i = 0; i < xmaster.remote.num; ++i) {
        worker = &xmaster.remote.workers[i];
        if (worker->id & XOMP_WID_GATEWAY_FLAG) {
            continue;
        }
        while (worker->add_mem_st == 0x1) {
            err = event_dispatch(get_default_waitset());
            if (err_is_fail(err)) {
                USER_PANIC_ERR(err, "event dispatch\n");
            }
        }
        if (err_is_fail(worker->err)) {
            worker->state = XOMP_WORKER_ST_FAILURE;
            return worker->err;
        }
        worker->add_mem_st = 0x0;
    }

    for (uint32_t i = 0; i < xmaster.local.num; ++i) {
        worker = &xmaster.local.workers[i];
        assert(worker->type == XOMP_WORKER_TYPE_LOCAL);

        while (worker->add_mem_st == 0x1) {
            err = event_dispatch(get_default_waitset());
            if (err_is_fail(err)) {
                USER_PANIC_ERR(err, "event dispatch\n");
            }
        }
        if (err_is_fail(worker->err)) {
            worker->state = XOMP_WORKER_ST_FAILURE;
            return worker->err;
        }
        worker->add_mem_st = 0x0;
    }

#if XOMP_BENCH_MASTER_EN
    remote_timer /= (xmaster.remote.num ? xmaster.remote.num : 1);
    local_timer /= (xmaster.local.num ? xmaster.local.num : 1);

    debug_printf("Avg mem add time remote: %lu cycles, %lu ms\n",
                 remote_timer, bench_tsc_to_ms(remote_timer));
    debug_printf("Avg mem add time local: %lu cycles, %lu ms\n",
                 local_timer, bench_tsc_to_ms(local_timer));
#endif

    return SYS_ERR_OK;
}
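
/*
 * Usage sketch (hypothetical; SIZE and vaddr are illustrative): share a frame
 * with all workers so they can map it at the same virtual address as the
 * master.
 *
 *   struct capref frame;
 *   err = frame_alloc(&frame, SIZE, NULL);
 *   // ... map the frame locally at vaddr and fill it ...
 *   err = xomp_master_add_memory(frame, vaddr, XOMP_FRAME_TYPE_SHARED_RW);
 */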

/**
 * \brief builds the worker binary paths based on the master's own binary name
 *
 * \param local  pointer where to store the local path
 * \param remote pointer where to store the remote path
 *
 * \returns SYS_ERR_OK on success
 */
errval_t xomp_master_build_path(char **local,
                                char **remote)
{
    size_t length, size = 0;

    size += snprintf(NULL, 0, "/x86_64/sbin/%s", disp_name()) + 1;
    size += snprintf(NULL, 0, "/k1om/sbin/%s", disp_name()) + 1;

    char *path = malloc(size);
    if (path == NULL) {
        return LIB_ERR_MALLOC_FAIL;
    }

    length = snprintf(path, size, "/x86_64/sbin/%s", disp_name());
    path[length] = '\0';
    size -= (++length);

    if (local) {
        *local = path;
    }

    path += length;
    length = snprintf(path, size, "/k1om/sbin/%s", disp_name());
    path[length] = '\0';

    if (remote) {
        *remote = path;
    }

    return SYS_ERR_OK;
}
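
/*
 * For example, if disp_name() returns "xomp_app" (illustrative), both strings
 * are carved out of one allocation: the local path becomes
 * "/x86_64/sbin/xomp_app" and the remote path "/k1om/sbin/xomp_app".
 */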

/**
 * \brief executes some work on each worker domain
 *
 * \param task information about the task
 *
 * \returns SYS_ERR_OK on success
 *          errval on error
 */
errval_t xomp_master_do_work(struct xomp_task *task)
{
    errval_t err;

    if (!xomp_master_initialized) {
        return XOMP_ERR_MASTER_NOT_INIT;
    }

#ifndef __k1om__
    struct waitset *ws = get_default_waitset();
#endif

    uint64_t fn = 0;

#if XOMP_BENCH_MASTER_EN
    remote_timer = 0;
    local_timer = 0;
#endif

    uint32_t remote_threads = xomp_master_get_remote_threads(task->total_threads);
    uint32_t local_threads = xomp_master_get_local_threads(task->total_threads);

    XMP_DEBUG("Executing task with %u workers [host:%u, xphi:%ux%u]\n",
              task->total_threads, local_threads + 1, num_phi, remote_threads);

    assert(local_threads <= xmaster.local.num);
    assert(remote_threads <= xmaster.remote.num);
    assert((local_threads + remote_threads + 1) == task->total_threads);

    uint32_t fn_idx;
    char *fn_name;

    if (remote_threads > 0) {
        /*
         * do the address translation for the remote workers
         */
        err = spawn_symval_lookup_addr((genvaddr_t) task->fn, &fn_idx, &fn_name);
        if (err_is_fail(err)) {
            USER_PANIC_ERR(err, "looking up address\n");
            return err;
        }
    }

    /* overwrite the global num threads counter */
    g_bomp_state->num_threads += ((local_threads) * (XOMP_VTHREAD_COUNT));

    uint32_t threadid = 1;

    for (uint32_t i = 1; i < task->total_threads; ++i) {
        struct xomp_worker *worker = NULL;

        if (i <= local_threads) {
            worker = &xmaster.local.workers[xmaster.local.next++];
            assert(worker->type == XOMP_WORKER_TYPE_LOCAL);

            if (xmaster.local.next == xmaster.local.num) {
                xmaster.local.next = 0;
            }

            XMP_DEBUG("local worker id:%lx\n", worker->id);

            fn = (uint64_t) task->fn;
        } else {
            worker = &xmaster.remote.workers[xmaster.remote.next++];
            assert(worker->type == XOMP_WORKER_TYPE_REMOTE);
            assert(fn_idx != 0);

            if (xmaster.remote.next == xmaster.remote.num) {
                xmaster.remote.next = 0;
            }
            // build the function address based on the flag and the index
            fn = (uint64_t) fn_idx | XOMP_FN_INDEX_FLAG;

            XMP_DEBUG("remote worker id: %016lx, function %s @ index %u\n",
                      worker->id, fn_name, fn_idx);
        }

#if XOMP_BENCH_ENABLED
        worker->start = bench_tsc();
#endif

        if (worker->state != XOMP_WORKER_ST_READY) {
            return XOMP_ERR_WORKER_STATE;
        }
        assert(worker->state == XOMP_WORKER_ST_READY);
        worker->state = XOMP_WORKER_ST_BUSY;

        struct bomp_work *work = worker->tls;

        work->fn = task->fn;

        work->barrier = NULL;
        work->thread_id = threadid;
        work->num_threads = g_bomp_state->num_threads;

        if (i <= local_threads) {
            work->num_vtreads = XOMP_VTHREADS;
            threadid += XOMP_VTHREADS;
        } else {
            work->num_vtreads = 1;
            threadid++;
        }

        /* XXX: hack, we do not know how big the data section is, so copy at
         * least 64 bytes and continue until a zero word is found */
        if (task->arg) {
            uint64_t *src = task->arg;
            uint64_t *dst = (uint64_t *) (work + 1);
            uint32_t bytes = 0;
            while (*src != 0 || bytes < 64) {
                *dst++ = *src++;
                bytes += 8;
            }
        }

        struct txq_msg_st *msg_st = txq_msg_st_alloc(&worker->txq);
        if (msg_st == NULL) {
            if (i == 1) {
                free(task);
            }
            return LIB_ERR_MALLOC_FAIL;
        }

        msg_st->send = do_work_tx;
        msg_st->cleanup = NULL;

        struct xomp_msg_st *st = (struct xomp_msg_st *) msg_st;
        st->args.do_work.arg = (uint64_t) work->data;
        st->args.do_work.fn = fn;
        st->args.do_work.id = (uint64_t) task;
        st->args.do_work.flags = 0;

        txq_send(msg_st);

#ifndef __k1om__
        do {
            err = event_dispatch_non_block(ws);
        } while (err_is_ok(err));
#endif
    }

    return SYS_ERR_OK;
}

/**
 * \brief tells the gateway domains to update their local replicas
 *
 * \param frame      capability of the shared frame
 * \param offset     offset into the capability to copy
 * \param length     number of bytes to copy
 * \param node       which node to send the copy request to
 * \param direction  UPDATE or WRITE BACK
 *
 * \returns SYS_ERR_OK on success,
 *          errval on failure
 */
errval_t xomp_master_copy_memory(struct capref frame,
                                 size_t offset,
                                 size_t length,
                                 uint16_t node,
                                 xomp_master_copy_t direction)
{
    assert(!"NYI");
    return SYS_ERR_OK;
}

#if XOMP_BENCH_ENABLED
/**
 * \brief enables basic benchmarking facilities
 *
 * \param runs      the number of runs of the experiment
 * \param nthreads  the number of worker threads to track
 * \param flags     flags selecting which benchmarks to enable
 *
 * \returns SYS_ERR_OK on success
 */
errval_t xomp_master_bench_enable(size_t runs,
                                  size_t nthreads,
                                  uint8_t flags)
{
    bench_init();

    bench_ctl_t **mem = NULL;

    if (!flags) {
        return -1;  // TODO: ERRNO
    }

    mem = calloc(2 + 2 * (2 + nthreads), sizeof(bench_ctl_t *));
    if (mem == NULL) {
        return LIB_ERR_MALLOC_FAIL;
    }

    if (flags & XOMP_MASTER_BENCH_SPAWN) {
        xomp_bench_spawn = mem;
        xomp_bench_spawn[0] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads);
        xomp_bench_spawn[1] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads);
        mem += 2;
    }

    if (flags & XOMP_MASTER_BENCH_DO_WORK) {
        xomp_bench_do_work = mem;
        xomp_bench_do_work[0] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads * runs);
        xomp_bench_do_work[1] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads * runs);
        for (uint32_t i = 0; i < nthreads; ++i) {
            xomp_bench_do_work[i + 2] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, runs);
        }
        mem += (2 + nthreads);
    }

    if (flags & XOMP_MASTER_BENCH_MEM_ADD) {
        xomp_bench_mem_add = mem;
        xomp_bench_mem_add[0] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads * runs);
        xomp_bench_mem_add[1] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, nthreads * runs);
        for (uint32_t i = 0; i < nthreads; ++i) {
            xomp_bench_mem_add[i + 2] = bench_ctl_init(BENCH_MODE_FIXEDRUNS, 1, runs);
        }
    }

    return SYS_ERR_OK;
}

/**
 * \brief prints the results of the enabled benchmarks
 */
void xomp_master_bench_print_results(void)
{
    cycles_t tsc_per_us = bench_tsc_per_us();
    if (xomp_bench_spawn) {
        bench_ctl_dump_analysis(xomp_bench_spawn[0], 0, "SPAWN LOCAL", tsc_per_us);
        bench_ctl_dump_analysis(xomp_bench_spawn[1], 0, "SPAWN REMOTE", tsc_per_us);
    }

    uint32_t nthreads = xmaster.local.num + xmaster.remote.num;

    char buf[20];

    if (xomp_bench_do_work) {
        bench_ctl_dump_analysis(xomp_bench_do_work[0], 0, "WORK LOCAL", tsc_per_us);
        bench_ctl_dump_analysis(xomp_bench_do_work[1], 0, "WORK REMOTE", tsc_per_us);
        for (uint32_t i = 0; i < nthreads; ++i) {
            snprintf(buf, sizeof(buf), "work w.%u", i + 1);
            bench_ctl_dump_analysis(xomp_bench_do_work[2 + i], 0, buf, tsc_per_us);
        }
    }

    if (xomp_bench_mem_add) {
        bench_ctl_dump_analysis(xomp_bench_mem_add[0], 0, "MEM ADD LOCAL", tsc_per_us);
        bench_ctl_dump_analysis(xomp_bench_mem_add[1], 0, "MEM ADD REMOTE", tsc_per_us);
        for (uint32_t i = 0; i < nthreads; ++i) {
            snprintf(buf, sizeof(buf), "memadd w.%u", i + 1);
            bench_ctl_dump_analysis(xomp_bench_mem_add[2 + i], 0, buf, tsc_per_us);
        }
    }
}
#endif
1358 #endif