Implement kill() and exit() in the process manager.
author Razvan Damachi <razvan.damachi@gmail.com>
Wed, 21 Jun 2017 17:11:41 +0000 (19:11 +0200)
committer Simon Gerber <simon.gerber@inf.ethz.ch>
Thu, 31 Aug 2017 14:35:08 +0000 (16:35 +0200)
On the server side, both calls are handled similarly: the process manager sends
a kill request to all spawnds running a dispatcher for the victim domain. On the
client side, they are different calls.

The general-purpose domain exit protocol now attempts to use the proc mgmt API
exit call (in lib/barrelfish/init.c). If this fails, the protocol will fall
back to exiting via a direct spawn_exit() call, like before. Exiting via the
process manager is expected to fail when the domain was not spawned through
the process manager in the first place, as is the case with the special
domains spawned by the monitor on the bootstrap core, as well as the monitors
themselves.

Signed-off-by: Razvan Damachi <razvan.damachi@gmail.com>

errors/errno.fugu
if/proc_mgmt.if
include/barrelfish/proc_mgmt_client.h
lib/barrelfish/init.c
lib/barrelfish/proc_mgmt_client.c
usr/proc_mgmt/domain.c
usr/proc_mgmt/domain.h
usr/proc_mgmt/pending_clients.h
usr/proc_mgmt/service.c
usr/spawnd/ps.c

index 84d3e87..89ba6be 100755 (executable)
@@ -686,6 +686,7 @@ errors proc_mgmt PROC_MGMT_ERR_ {
     failure DOMAIN_TABLE_FIND    "Failed to find requested domain in domain table",
     failure DOMAIN_NOT_RUNNING   "Domain is not currently running",
     failure ALREADY_SPANNED      "Domain has already been spanned to the given core",
+    failure KILL                 "Failed to kill requested domain",
 };
 
 // errors from ELF library
index 3309307..4697c54 100644 (file)
@@ -38,6 +38,9 @@ interface proc_mgmt "Process management service" {
   // Kill a domain for which the caller has a domain cap.
   rpc kill(in cap domain_cap, out errval err);
 
+  // Let the process manager know the caller has finished execution.
+  message exit(cap domain_cap, uint8 status);
+
   /*
   // Span the caller to a new core.
   rpc span(in cap domainid_cap,
index ad489c2..adfb4f3 100644 (file)
@@ -44,6 +44,7 @@ errval_t proc_mgmt_spawn_program_with_caps(coreid_t core_id, const char *path,
                                  struct capref *ret_domain_cap);
 errval_t proc_mgmt_span(coreid_t core_id);
 errval_t proc_mgmt_kill(struct capref domain_cap);
+errval_t proc_mgmt_exit(uint8_t status);
 
 __END_DECLS
 
index 6221a2f..949fcdd 100644 (file)
@@ -25,6 +25,7 @@
 #include <barrelfish/morecore.h>
 #include <barrelfish/monitor_client.h>
 #include <barrelfish/nameservice_client.h>
+#include <barrelfish/proc_mgmt_client.h>
 #include <barrelfish/spawn_client.h>
 #include <barrelfish/systime.h>
 #include <barrelfish_kpi/domain_params.h>
@@ -68,9 +69,14 @@ void libc_exit(int status)
 
         // XXX: Leak all other domain allocations
     } else {
-        err = spawn_exit(status);
-        if(err_is_fail(err)) {
-            DEBUG_ERR(err, "spawn_exit");
+        err = proc_mgmt_exit(status);
+        if (err_is_fail(err)) {
+            // Maybe we have not been spawned through the process manager, but
+            // through spawnd directly (we're some bootstrap domain).
+            err = spawn_exit(status);
+            if (err_is_fail(err)) {
+                DEBUG_ERR(err, "spawn_exit");
+            }
         }
     }
 
index 130c868..775fcc0 100644 (file)
@@ -487,3 +487,24 @@ errval_t proc_mgmt_kill(struct capref domain_cap)
 
     return msgerr;
 }
+
+/**
+ * \brief Inform the process manager about exiting execution.
+ */
+errval_t proc_mgmt_exit(uint8_t status )
+{
+    errval_t err = proc_mgmt_bind_client();
+    if (err_is_fail(err)) {
+        return err;
+    }
+
+    struct proc_mgmt_binding *b = get_proc_mgmt_binding();
+    assert(b != NULL);
+
+    err = b->tx_vtbl.exit(b, NOP_CONT, cap_domainid, status);
+    if (err_is_fail(err)) {
+        return err;
+    }
+
+    return SYS_ERR_OK;
+}
index 8516128..9a0d89d 100644 (file)
 
 #include <barrelfish/barrelfish.h>
 #include <collections/hash_table.h>
+#include <if/spawn_defs.h>
 
 #include "domain.h"
+#include "spawnd_state.h"
 
 #define HASH_INDEX_BUCKETS 6151
 static collections_hash_table* domain_table = NULL;
@@ -29,7 +31,8 @@ errval_t domain_new(struct capref domain_cap, struct domain_entry **ret_entry)
 
     entry->domain_cap = domain_cap;
     entry->status = DOMAIN_STATUS_NIL;
-    entry->spawnds = NULL;
+    memset(entry->spawnds, 0, sizeof(entry->spawnds));
+    entry->num_spawnds_running = 0;
     entry->waiters = NULL;
 
     if (domain_table == NULL) {
@@ -73,21 +76,17 @@ errval_t domain_get_by_cap(struct capref domain_cap,
     return SYS_ERR_OK;
 }
 
-void domain_run_on_spawnd(struct domain_entry *entry,
-                          struct spawnd_state *spawnd)
+void domain_run_on_core(struct domain_entry *entry, coreid_t core_id)
 {
     assert(entry != NULL);
-    assert(spawnd != NULL);
+    assert(core_id < MAX_COREID);
     assert(entry->status == DOMAIN_STATUS_NIL ||
            entry->status == DOMAIN_STATUS_RUNNING);
 
     entry->status = DOMAIN_STATUS_RUNNING;
 
-    struct domain_spawnd_state *st = (struct domain_spawnd_state*) malloc(
-            sizeof(struct domain_spawnd_state));
-    st->spawnd_state = spawnd;
-    st->next = entry->spawnds;
-    entry->spawnds = st;
+    entry->spawnds[core_id] = spawnd_state_get(core_id);
+    ++entry->num_spawnds_running;
 }
 
 errval_t domain_spawn(struct capref domain_cap, coreid_t core_id)
@@ -101,7 +100,7 @@ errval_t domain_spawn(struct capref domain_cap, coreid_t core_id)
         return err;
     }
 
-    domain_run_on_spawnd(entry, spawnd_state_get(core_id));
+    domain_run_on_core(entry, core_id);
 
     return SYS_ERR_OK;
 }
@@ -119,14 +118,10 @@ errval_t domain_can_span(struct capref domain_cap, coreid_t core_id)
         return PROC_MGMT_ERR_DOMAIN_NOT_RUNNING;
     }
 
-    struct domain_spawnd_state *st = entry->spawnds;
-    while (st != NULL) {
-        if (st->spawnd_state->core_id == core_id) {
-            // TODO(razvan): Maybe we want to allow the same domain to span
-            // multiple dispatcher onto the same core?
-            return PROC_MGMT_ERR_ALREADY_SPANNED;
-        }
-        st = st->next;
+    if (entry->spawnds[core_id] != NULL) {
+        // TODO(razvan): Maybe we want to allow the same domain to span multiple
+        // dispatchers onto the same core?
+        return PROC_MGMT_ERR_ALREADY_SPANNED;
     }
 
     return SYS_ERR_OK;
@@ -141,19 +136,7 @@ errval_t domain_span(struct capref domain_cap, coreid_t core_id)
     }
     assert(entry != NULL);
 
-    domain_run_on_spawnd(entry, spawnd_state_get(core_id));
+    domain_run_on_core(entry, core_id);
 
     return SYS_ERR_OK;
 }
-
-void domain_send_stop(struct domain_entry *entry)
-{
-    assert(entry != NULL);
-
-    struct domain_spawnd_state *st = entry->spawnds;
-    while (st != NULL) {
-        debug_printf("Simulating STOP message to spawnd at binding %p\n",
-                     st->spawnd_state->b);
-        st = st->next;
-    }
-}
index a338eda..017b588 100644 (file)
@@ -17,6 +17,8 @@
 
 #include "spawnd_state.h"
 
+#define EXIT_STATUS_KILLED 9
+
 enum domain_status {
     DOMAIN_STATUS_NIL,
     DOMAIN_STATUS_RUNNING,
@@ -30,28 +32,31 @@ struct domain_waiter {
     struct domain_waiter *next;
 };
 
-struct domain_spawnd_state {
-    struct spawnd_state *spawnd_state;
-    struct domain_spawnd_state *next;
-};
-
 struct domain_entry {
-    struct capref domain_cap;              // Unique domain ID cap.
-    enum domain_status status;             // Current domain state.
-    struct domain_spawnd_state *spawnds;   // Spawnds running this domain.
-    struct domain_waiter *waiters;         // Clients waiting after this domain.
+    struct capref domain_cap;   // Unique domain ID cap.
+    enum domain_status status;  // Current domain state.
+
+    struct spawnd_state *spawnds[MAX_COREID];  // Spawnds running this domain.
+    coreid_t num_spawnds_running;
+
+    struct domain_waiter *waiters;  // Clients waiting after this domain.
+
+    uint8_t exit_status;
 };
 
 errval_t domain_new(struct capref domain_cap, struct domain_entry **ret_entry);
 errval_t domain_get_by_cap(struct capref domain_cap,
                            struct domain_entry **ret_entry);
-void domain_run_on_spawnd(struct domain_entry *entry,
-                          struct spawnd_state *spawnd);
+void domain_run_on_core(struct domain_entry *entry, coreid_t core_id);
 
 errval_t domain_spawn(struct capref domain_cap, coreid_t core_id);
 errval_t domain_can_span(struct capref domain_cap, coreid_t core_id);
 errval_t domain_span(struct capref domain_cap, coreid_t core_id);
-void domain_send_stop(struct domain_entry *entry);
+static inline void domain_stop_pending(struct domain_entry *entry)
+{
+    assert(entry != NULL);
+    entry->status = DOMAIN_STATUS_STOP_PEND;
+}
 // TODO(razvan): domain_exists, domain_remove etc.
 
 #endif  // PROC_MGMT_DOMAIN_H
\ No newline at end of file
index 32261e4..94d239e 100644 (file)
@@ -22,6 +22,7 @@ enum ClientType {
        ClientType_SpawnWithCaps,
        ClientType_Span,
        ClientType_Kill,
+       ClientType_Exit,
        ClientType_Wait
        // TODO(razvan): Others?
 };
index 91027c1..4aaca3e 100644 (file)
@@ -67,12 +67,13 @@ static void spawn_reply_handler(struct spawn_binding *b,
         // followed by a local error in the process manager (see below). If that
         // is the case, then we won't have a client, as it has already been
         // released.
-        debug_printf("Unable to retrieve pending client based on domain cap "
-                     "returned by spawnd");
+        DEBUG_ERR(err, "failed to retrieve pending client based on domain cap "
+                  "returned by spawnd");
         return;
     }
 
-    errval_t resp_err;
+    errval_t resp_err = SYS_ERR_OK;
+    struct domain_entry *entry;
     switch (cl->type) {
         case ClientType_Spawn:
             err = spawn_err;
@@ -100,8 +101,75 @@ static void spawn_reply_handler(struct spawn_binding *b,
             resp_err = cl->b->tx_vtbl.span_response(cl->b, NOP_CONT, err);
             break;
 
+        case ClientType_Kill:
+                if (err_is_fail(spawn_err)) {
+                    // Looks like some spawnd was unable to successfully kill
+                    // its dispatcher for this domain. Not much the process
+                    // manager can do about it; return the error to the client.
+                    resp_err = cl->b->tx_vtbl.kill_response(cl->b, NOP_CONT,
+                                                            err);
+                    break;
+                }
+
+                err = domain_get_by_cap(domain_cap, &entry);
+                if (err_is_fail(err)) {
+                    DEBUG_ERR(err, "failed to retrieve domain by domain_cap "
+                              "returned by spawnd after kill");
+                    break;
+                }
+
+                assert(entry->num_spawnds_running > 0);
+                assert(entry->status != DOMAIN_STATUS_STOPPED);
+
+                --entry->num_spawnds_running;
+                if (entry->num_spawnds_running == 0) {
+                    entry->status = DOMAIN_STATUS_STOPPED;
+                    entry->exit_status = EXIT_STATUS_KILLED;  // TODO(razvan): Is this desirable?
+
+                    resp_err = cl->b->tx_vtbl.kill_response(cl->b, NOP_CONT,
+                                                            err);
+
+                    // At this point, the domain exists in state STOPPED for
+                    // history reasons.
+                    // TODO(razvan): This is where we will inform waiters.
+                    break;
+                }
+
+            break;
+
+        case ClientType_Exit:
+            if (err_is_fail(spawn_err)) {
+                // Looks like some spawnd was unable to successfully kill
+                // its dispatcher for this domain. Not much the process
+                // manager can do about it. Furthermore, this was an exit call,
+                // so there's no client to reply back to.
+                break;
+            }
+
+            err = domain_get_by_cap(domain_cap, &entry);
+            if (err_is_fail(err)) {
+                DEBUG_ERR(err, "failed to retrieve domain by domain_cap "
+                          "returned by spawnd after kill");
+                break;
+            }
+
+            assert(entry->num_spawnds_running > 0);
+            assert(entry->status != DOMAIN_STATUS_STOPPED);
+
+            --entry->num_spawnds_running;
+            if (entry->num_spawnds_running == 0) {
+                entry->status = DOMAIN_STATUS_STOPPED;
+
+                // At this point, the domain exists in state STOPPED for
+                // history reasons.
+                // TODO(razvan): This is where we will inform waiters.
+                break;
+            }
+
+        break;
+
         default:
-            // TODO(razvan): Handle the other cases, e.g. kill.
+            // TODO(razvan): Handle the other cases, e.g. wait.
             debug_printf("Unknown client type %u\n", cl->type);
             return;
     }
@@ -282,13 +350,62 @@ respond_with_err:
     }
 }
 
-static void kill_handler(struct proc_mgmt_binding *b, struct capref domain_cap)
+static errval_t kill_handler_common(struct proc_mgmt_binding *b,
+                                    struct capref domain_cap,
+                                    enum ClientType type,
+                                    uint8_t exit_status)
 {
+    errval_t err = pending_clients_add(domain_cap, b, type, MAX_COREID);
+    if (err_is_fail(err)) {
+        return err;
+    }
+
     struct domain_entry *entry;
-    errval_t err = domain_get_by_cap(domain_cap, &entry);
-    if (err_is_ok(err)) {
-        domain_send_stop(entry);
+    err = domain_get_by_cap(domain_cap, &entry);
+    if (err_is_fail(err)) {
+        return err;
+    }
+
+    entry->exit_status = exit_status;
+    domain_stop_pending(entry);
+
+    for (coreid_t i = 0; i < MAX_COREID; ++i) {
+        if (entry->spawnds[i] == NULL) {
+            continue;
+        }
+
+        struct spawn_binding *spb = entry->spawnds[i]->b;
+        errval_t req_err = spb->tx_vtbl.kill_request(spb, NOP_CONT, domain_cap);
+        if (err_is_fail(req_err)) {
+            DEBUG_ERR(req_err, "failed to send kill_request to spawnd %u\n", i);
+        }
+    }
+
+    return SYS_ERR_OK;
+}
+
+static void kill_handler(struct proc_mgmt_binding *b, struct capref domain_cap)
+{
+    errval_t err = kill_handler_common(b, domain_cap, ClientType_Kill,
+                                       EXIT_STATUS_KILLED);
+    if (err_is_fail(err)) {
+        errval_t resp_err = b->tx_vtbl.kill_response(b, NOP_CONT, err);
+        if (err_is_fail(resp_err)) {
+            DEBUG_ERR(resp_err, "failed to send kill_response");
+        }
+    }
+}
+
+static void exit_handler(struct proc_mgmt_binding *b, struct capref domain_cap,
+                         uint8_t exit_status)
+{
+    errval_t err = kill_handler_common(b, domain_cap, ClientType_Exit,
+                                       exit_status);
+    if (err_is_fail(err)) {
+        DEBUG_ERR(err, "processing exit_handler for requesting domain, exit "
+                  "code %u", exit_status);
     }
+    // Error or not, there's no client to reply to anymore.
 }
 
 static struct proc_mgmt_rx_vtbl monitor_vtbl = {
@@ -296,7 +413,8 @@ static struct proc_mgmt_rx_vtbl monitor_vtbl = {
     .spawn_call           = spawn_handler,
     .spawn_with_caps_call = spawn_with_caps_handler,
     .span_call            = span_handler,
-    .kill_call            = kill_handler
+    .kill_call            = kill_handler,
+    .exit                 = exit_handler
 };
 
 static struct proc_mgmt_rx_vtbl non_monitor_vtbl = {
@@ -304,7 +422,8 @@ static struct proc_mgmt_rx_vtbl non_monitor_vtbl = {
     .spawn_call           = spawn_handler,
     .spawn_with_caps_call = spawn_with_caps_handler,
     .span_call            = span_handler,
-    .kill_call            = kill_handler
+    .kill_call            = kill_handler,
+    .exit                 = exit_handler
 };
 
 static errval_t alloc_ep_for_monitor(struct capref *ep)
index 50542f7..500fd6d 100644 (file)
@@ -18,7 +18,7 @@
 #include "ps.h"
 
 #define HASH_INDEX_BUCKETS 6151
-static collections_hash_table* domain_table = NULL;
+static collections_hash_table* ps_table = NULL;
 
 static struct ps_entry *entries[MAX_DOMAINS];
 
@@ -61,10 +61,10 @@ errval_t ps_hash_domain(struct ps_entry *entry, struct capref domain_cap)
 {
     entry->domain_cap = domain_cap;
 
-    if (domain_table == NULL) {
-        collections_hash_create_with_buckets(&domain_table, HASH_INDEX_BUCKETS,
+    if (ps_table == NULL) {
+        collections_hash_create_with_buckets(&ps_table, HASH_INDEX_BUCKETS,
                                              NULL);
-        if (domain_table == NULL) {
+        if (ps_table == NULL) {
             return SPAWN_ERR_CREATE_DOMAIN_TABLE;
         }
     }
@@ -75,7 +75,7 @@ errval_t ps_hash_domain(struct ps_entry *entry, struct capref domain_cap)
         return err;
     }
 
-    collections_hash_insert(domain_table, key, entry);
+    collections_hash_insert(ps_table, key, entry);
 
     return SYS_ERR_OK;
 }
@@ -91,13 +91,13 @@ errval_t ps_release_domain(struct capref domain_cap,
         return err;
     }
 
-    void *table_entry = collections_hash_find(domain_table, key);
+    void *table_entry = collections_hash_find(ps_table, key);
     if (table_entry == NULL) {
         return SPAWN_ERR_DOMAIN_TABLE_FIND;
     }
     *ret_entry = (struct ps_entry*) table_entry;
 
-    collections_hash_delete(domain_table, key);
+    collections_hash_delete(ps_table, key);
 
     return SYS_ERR_OK;
 }