/* $Id$ * * The Data Control Manager for Moira. * * Copyright (C) 1987-1998 by the Massachusetts Institute of Technology. * For copying and distribution information, see the file * . */ #include #include #include #include #include "update.h" #include #include #include #include #include #include #include #include #include EXEC SQL INCLUDE sqlca; void sqlglm(char *, unsigned int *, unsigned int *); RCSID("$Header$"); int generate_service(char *name, int force); void do_hosts(char *service); int dcm_send_file(char *service, int type, char *host, char *target); int dcm_execute(char *service, char *host, char *script); void dbmserr(void); #define SQL_NO_MATCH 1403 #define SOFT_FAIL(x) (((x) == MR_NO_MEM) || ((x) == MR_CANT_CONNECT) || ((x) == MR_CCONFIG) || ((x) == MR_DEADLOCK) || ((x) == MR_BUSY) || ((x) == MR_ABORT)) /* argument parsing macro */ #define argis(a, b) (!strcmp(*arg + 1, a) || !strcmp(*arg + 1, b)) char whobuf[256], *whoami = whobuf, *db = "moira"; enum { UNIQUE, DISTRIBUTED, REPLICATED }; int main(int argc, char **argv) { int i, force = 0; EXEC SQL BEGIN DECLARE SECTION; char buf[SERVERS_NAME_SIZE], *name; int enable; EXEC SQL END DECLARE SECTION; struct save_queue *sq; int status, srvcnt = 0; char **arg = argv, *services[BUFSIZ]; if (strchr(argv[0], '/')) strcpy(whoami, strrchr(argv[0], '/') + 1); else strcpy(whoami, argv[0]); umask(7); setvbuf(stderr, NULL, _IOLBF, BUFSIZ); setvbuf(stdout, NULL, _IOLBF, BUFSIZ); initialize_sms_error_table(); initialize_krb_error_table(); while (++arg - argv < argc) { if (**arg == '-') { if (argis("f", "force")) force++; else { com_err(whoami, 0, "Usage: %s [-f] servicename", argv[0]); exit(1); } } else /* Doesn't begin with a dash, is a service name. * Build an array of them we can iterate through later. */ { services[srvcnt] = malloc(SERVERS_NAME_SIZE); if (!services[srvcnt]) { com_err(whoami, 0, "Out of memory!"); exit(1); } strncpy(services[srvcnt], *arg, SERVERS_NAME_SIZE); srvcnt++; } } /* Iterate through services specified on the command line, if any. */ if (srvcnt > 0) { for (i = 0; i < srvcnt; i++) { if (generate_service(services[i], force)) { do_hosts(services[i]); free(services[i]); } } exit(0); } /* if DCM is not enabled, exit after logging */ if (!access(NODCMFILE, F_OK)) { printf("/etc/nodcm exists -- exiting\n"); exit(1); } EXEC SQL WHENEVER SQLERROR DO dbmserr(); EXEC SQL CONNECT :db IDENTIFIED BY :db; EXEC SQL SELECT value INTO :enable FROM numvalues WHERE name = 'dcm_enable'; if (enable == 0) { printf("dcm_enable not set -- exiting\n"); exit(1); } /* fetch list of services */ EXEC SQL DECLARE csr_svc CURSOR FOR SELECT LOWER(name) FROM servers WHERE enable = 1 AND harderror = 0 AND update_int > 0; EXEC SQL OPEN csr_svc; sq = sq_create(); while (1) { EXEC SQL FETCH csr_svc INTO :buf; if (sqlca.sqlcode) break; sq_save_data(sq, strdup(strtrim(buf))); } EXEC SQL CLOSE csr_svc; /* we will repeatedly open and close the db since it seems to get upset if you keep it open across a fork */ EXEC SQL COMMIT RELEASE; /* Now run through list */ while (sq_get_data(sq, &name)) { if (generate_service(name, force)) { switch (fork()) { case -1: com_err(whoami, errno, "forking for service %s -- exiting", name); exit(1); case 0: sprintf(strchr(whoami, '\0'), " (%s:%ld)", name, (long)getpid()); do_hosts(name); com_err(whoami, 0, "exiting"); exit(0); default: break; } } } com_err(whoami, 0, "All files generated. Waiting for children to exit"); while (waitpid(0, &status, 0) > 0) ; com_err(whoami, 0, "exiting"); exit(0); } int generate_service(char *name, int force) { EXEC SQL BEGIN DECLARE SECTION; int interval, dfcheck, status, inprogress; time_t now; const char *errmsg; EXEC SQL END DECLARE SECTION; char dfgen_prog[MAXPATHLEN], dfgen_cmd[2 * MAXPATHLEN]; struct sigaction action, prevaction; int waits; EXEC SQL CONNECT :db IDENTIFIED BY :db; EXEC SQL SELECT update_int, dfcheck, inprogress INTO :interval, :dfcheck, :inprogress FROM servers WHERE name = UPPER(:name); if (sqlca.sqlcode == SQL_NO_MATCH) { com_err(whoami, 0, "No such service `%s'", name); EXEC SQL COMMIT RELEASE; return 0; } /* Someone might try to run a DCM from the command line while the * regular one is running, which will bypass the "interval" test. * Check inprogress to make sure they don't stomp on themselves. * * Note that there is still a race condition here, and this doesn't * absolutely prevent 2 DCMs from stepping on one another, but it * does reduce the window of vulnerability greatly. */ if (inprogress == 1) { com_err(whoami, 0, "DCM for service `%s' already in progress", name); EXEC SQL COMMIT RELEASE; return 0; } time(&now); if ((interval * 60 + dfcheck < now) || force) { sprintf(dfgen_prog, "%s/%s.gen", BIN_DIR, name); if (access(dfgen_prog, F_OK) != 0) { com_err(whoami, 0, "prog %s doesn't exist", dfgen_prog); EXEC SQL COMMIT RELEASE; return 0; } sprintf(dfgen_cmd, "exec %s %s/%s.out", dfgen_prog, DCM_DIR, name); com_err(whoami, 0, "running %s", dfgen_prog); EXEC SQL WHENEVER SQLERROR GOTO gen_cleanup; EXEC SQL UPDATE servers SET inprogress = 1 WHERE name = UPPER(:name); EXEC SQL COMMIT; action.sa_flags = 0; sigemptyset(&action.sa_mask); action.sa_handler = SIG_DFL; sigaction(SIGCHLD, &action, &prevaction); waits = system(dfgen_cmd); sigaction(SIGCHLD, &prevaction, NULL); if (WIFSIGNALED(waits)) { status = MR_COREDUMP; com_err(whoami, status, " %s exited on signal %d", dfgen_prog, WTERMSIG(waits)); } else if (WEXITSTATUS(waits)) { /* extract the process's exit value */ status = WEXITSTATUS(waits) + ERROR_TABLE_BASE_sms; if (status != MR_NO_CHANGE) com_err(whoami, status, "in %s", dfgen_prog); } else status = MR_SUCCESS; if (status == MR_SUCCESS) { EXEC SQL UPDATE servers SET dfgen = :now, dfcheck = :now, inprogress = 0 WHERE name = UPPER(:name); EXEC SQL COMMIT RELEASE; return 1; } else if (status == MR_NO_CHANGE) { EXEC SQL UPDATE servers SET dfcheck = :now, inprogress = 0 WHERE name = UPPER(:name); } else if (SOFT_FAIL(status)) { errmsg = error_message(status); EXEC SQL UPDATE servers SET errmsg = :errmsg, inprogress = 0 WHERE name = UPPER(:name); } else /* HARD_FAIL(status) */ { errmsg = error_message(status); EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg, inprogress = 0 WHERE name = UPPER(:name); critical_alert("DCM", "DCM building config files for %s: %s", name, errmsg); } } else { com_err(whoami, 0, "DCM for service `%s' has run too recently.", name); com_err(whoami, 0, "Use the -force flag to force a DCM."); } EXEC SQL COMMIT RELEASE; return 0; gen_cleanup: EXEC SQL WHENEVER SQLERROR DO dbmserr(); EXEC SQL UPDATE servers SET inprogress = 0, harderror = MR_INTERNAL, errmsg = 'DBMS Internal Error' WHERE name = UPPER(:name); dbmserr(); } void do_hosts(char *service) { EXEC SQL BEGIN DECLARE SECTION; char server_type[SERVERS_TYPE_SIZE], host[MACHINE_NAME_SIZE], *name; char target[SERVERS_TARGET_FILE_SIZE], script[SERVERS_SCRIPT_SIZE]; const char *errmsg; int status = 0, dfgen, type, mid, inprogress; time_t now; EXEC SQL END DECLARE SECTION; struct save_queue *sq; time(&now); mr_init(); EXEC SQL CONNECT :db IDENTIFIED BY :db; EXEC SQL SELECT dfgen, type, target_file, script, inprogress INTO :dfgen, :server_type, :target, :script, :inprogress FROM servers WHERE name = UPPER(:service); if (!strncmp(strtrim(server_type), "REPLICAT", 8)) type = REPLICATED; else if (!strncmp(server_type, "DISTRIB", 8)) type = DISTRIBUTED; else type = UNIQUE; strtrim(target); strtrim(script); /* Rudimentary locking. Doesn't eliminate the possibility of 2 DCMs * stepping on one another, but makes it harder. */ if (inprogress == 1) { com_err(whoami, 0, "DCM for service `%s' already in progress", name); EXEC SQL COMMIT RELEASE; return; } EXEC SQL DECLARE csr_hst1 CURSOR FOR SELECT m.name, m.mach_id FROM machine m, serverhosts sh WHERE sh.service = UPPER(:service) AND sh.enable = 1 AND sh.hosterror = 0 AND sh.lts < :dfgen AND sh.mach_id = m.mach_id; EXEC SQL OPEN csr_hst1; sq = sq_create(); while (1) { EXEC SQL FETCH csr_hst1 INTO :host, mid; if (sqlca.sqlcode == SQL_NO_MATCH) break; sq_save_data(sq, strdup(strtrim(host))); sq_save_data(sq, (void *)mid); } EXEC SQL CLOSE csr_hst1; EXEC SQL WHENEVER SQLERROR GOTO host_cleanup; while (sq_get_data(sq, &name)) { sq_get_data(sq, &mid); EXEC SQL SELECT inprogress INTO :inprogress FROM serverhosts WHERE service = UPPER(:service) AND mach_id = :mid; /* Check if someone got here before we did. * There's still a race condition here, but it's a small one. */ if (inprogress == 1) { com_err(whoami, 0, "DCM for service `%s' to host `%s' already in progress", service, name); EXEC SQL COMMIT RELEASE; return; } com_err(whoami, 0, "sending %s data to %s", service, name); EXEC SQL UPDATE serverhosts SET inprogress = 1 WHERE service = UPPER(:service) AND mach_id = :mid; EXEC SQL COMMIT; status = dcm_send_file(service, type, name, target); if (status) { errmsg = error_message(status); EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg, inprogress = 0, success = 0, ltt = :now WHERE service = UPPER(:service) AND mach_id = :mid; if (!SOFT_FAIL(status)) { EXEC SQL UPDATE serverhosts SET hosterror = :status WHERE service = UPPER(:service) AND mach_id = :mid; critical_alert("DCM", "DCM updating %s:%s: %s", service, name, errmsg); } EXEC SQL COMMIT; if (type == REPLICATED) break; } } sq_destroy(sq); if (status == MR_SUCCESS || type != REPLICATED) { EXEC SQL DECLARE csr_hst2 CURSOR FOR SELECT m.name, m.mach_id FROM machine m, serverhosts sh WHERE sh.service = UPPER(:service) AND sh.inprogress = 1 AND sh.enable = 1 AND sh.hosterror = 0 AND sh.mach_id = m.mach_id; EXEC SQL OPEN csr_hst2; sq = sq_create(); while (1) { EXEC SQL FETCH csr_hst2 INTO :host, :mid; if (sqlca.sqlcode == SQL_NO_MATCH) break; sq_save_data(sq, strdup(strtrim(host))); sq_save_data(sq, (void *)mid); } EXEC SQL CLOSE csr_hst2; while (sq_get_data(sq, &name)) { sq_get_data(sq, &mid); com_err(whoami, 0, "executing instructions on %s", name); status = dcm_execute(service, name, script); if (status) { errmsg = error_message(status); EXEC SQL UPDATE serverhosts SET hosterrmsg = :errmsg, inprogress = 0, success = 0, ltt = :now WHERE service = UPPER(:service) AND mach_id = :mid; if (!SOFT_FAIL(status)) { EXEC SQL UPDATE serverhosts SET hosterror = :status WHERE service = UPPER(:service) AND mach_id = :mid; critical_alert("DCM", "DCM updating %s:%s: %s", service, name, errmsg); } if (type == REPLICATED) break; } else { EXEC SQL UPDATE serverhosts SET inprogress = 0, ltt = :now, lts = :now, success = 1 WHERE service = UPPER(:service) AND mach_id = :mid; } EXEC SQL COMMIT; } EXEC SQL CLOSE csr_hst2; } if (type == REPLICATED) { /* Clear inprogress flag on any hosts we started but didn't * finish. */ EXEC SQL UPDATE serverhosts SET inprogress = 0 WHERE service = UPPER(:service); } EXEC SQL WHENEVER SQLERROR DO dbmserr(); if (status && !SOFT_FAIL(status) && type == REPLICATED) { EXEC SQL UPDATE servers SET harderror = :status, errmsg = :errmsg WHERE name = UPPER(:service); } EXEC SQL COMMIT RELEASE; return; host_cleanup: EXEC SQL UPDATE serverhosts SET inprogress = 0, success = 0, ltt = :now, hosterror = MR_INTERNAL, hosterrmsg = 'DBMS Internal Error' WHERE service = UPPER(:service) AND mach_id = :mid; if (type == REPLICATED) { EXEC SQL UPDATE servers SET harderror = MR_INTERNAL, errmsg = 'DBMS Internal Error' WHERE name = UPPER(:service); } } int dcm_send_file(char *service, int type, char *host, char *target) { char data[MAXPATHLEN]; int code, conn; conn = mr_connect_internal(host, "moira_update"); if (!conn) { com_err(whoami, errno, "can't connect to %s", host); return MR_CANT_CONNECT; } code = mr_send_krb5_auth(conn, host); if (code) code = mr_send_auth(conn, host); if (code) { com_err(whoami, code, "authenticating to %s", host); goto done; } if (type == DISTRIBUTED) sprintf(data, "%s/%s/%s", DCM_DIR, service, host); else sprintf(data, "%s/%s.out", DCM_DIR, service); code = mr_send_file(conn, data, target, 0); if (code) com_err(whoami, code, "sending data to %s", host); done: mr_send_quit(conn); close(conn); return code; } int dcm_execute(char *service, char *host, char *script) { char inst[MAXPATHLEN]; int code, conn; conn = mr_connect_internal(host, "moira_update"); if (!conn) { com_err(whoami, errno, "can't connect to %s", host); return MR_CANT_CONNECT; } code = mr_send_krb5_auth(conn, host); if (code) code = mr_send_auth(conn, host); if (code) { com_err(whoami, code, "authenticating to %s", host); goto done; } sprintf(inst, "/tmp/moira-update.XXXXXX"); mktemp(inst); code = mr_send_file(conn, script, inst, 0); if (code) { com_err(whoami, code, "sending instructions to %s", host); goto done; } code = mr_execute(conn, inst); if (code) com_err(whoami, code, "executing instructions on %s", host); done: mr_send_quit(conn); close(conn); return code; } void dbmserr(void) { EXEC SQL BEGIN DECLARE SECTION; char err_msg[256]; EXEC SQL END DECLARE SECTION; int bufsize = 256, msglength = 0; sqlglm(err_msg, &bufsize, &msglength); err_msg[msglength] = '\0'; com_err(whoami, 0, "Encountered SQL error:\n%s", err_msg); com_err(whoami, 0, "exiting"); exit(1); }