]> andersk Git - moira.git/blob - dcm/dcm.c
wait a minute after deadlock before trying again to update state in DB
[moira.git] / dcm / dcm.c
1 /*
2  * The Data Control Manager for SMS.
3  *
4  * Copyright 1987, 1988 by the Massachusetts Institute of Technology.
5  * For copying and distribution information, see the file
6  * "mit-copyright.h".
7  *
8  * $Source$
9  * $Author$
10  * $Header$
11  */
12
#ifndef lint
/* RCS identification string, omitted when the file is run through lint. */
static char rcsid_dcm_c[] = "$Header$";
#endif /* lint */
16
17 #include <stdio.h>
18 #include <update.h>
19 #include <sys/file.h>
20 #include <sys/time.h>
21 #include <sys/wait.h>
22 #include <ctype.h>
23 #include <sms.h>
24 #include <sms_app.h>
25 #include "dcm.h"
26 #include "mit-copyright.h"
27
28 extern char *ctime();
29 extern char *getenv();
30 extern int log_flags;
31 extern char *error_message();
32 char *itoa();
33 int gqval();
34 long time();
35
36
37 #define DEADLOCK_WAIT   (3 * 60)        /* number of seconds to wait after
38                                            a deadlock before trying again. */
39
40 /* declared global so that we can get the current time from different places. */
41 struct timeval tv;
42
43
44 main(argc, argv)
45 int argc;
46 char *argv[];
47 {
48         int i;
49         char **arg = argv;
50         char *qargv[3];
51         int status;
52
53         whoami = argv[0];
54         dbg = atoi(getenv("DEBUG"));
55         umask(UMASK);
56         log_flags = 0;
57         setlinebuf(stderr);
58         setlinebuf(stdout);
59         
60         while(++arg - argv < argc) {
61             if (**arg == '-')
62                 switch((*arg)[1]) {
63                 case 'd':
64                     dbg =  atoi((*arg)[2]? *arg+2: *++arg);
65                     break;
66                 }
67         }
68         set_com_err_hook(dcm_com_err_hook);
69         
70         /* if /etc/nodcm exists, punt quietly. */
71         if (!access("/etc/nodcm", F_OK)) {
72                 exit(1);
73         } 
74
75         if (status = sms_connect("")) {
76             com_err(whoami, status, " on sms_connect");
77             leave("connect failed");
78         }
79
80         if (status = sms_auth("dcm")) {
81             com_err(whoami, status, " on \"authenticate\"");
82             leave("auth failed");
83         }
84
85         /* if DCM is not enabled, exit after logging */
86         qargv[0] = "dcm_enable";
87         if (status = sms_query("get_value", 1, qargv, gqval, &i)) {
88             com_err(whoami, status, " check dcm_enable");
89             leave("query failed");
90         }
91         if (i == 0) {
92             errno = 0;
93             leave("dcm_enable not set");
94         } 
95
96         /* do it! */
97         do_services();
98         errno = 0;
99         leave("");
100 }
101
102
103 /* Used by the get_value query when checking for dcm_enable. */
104
105 gqval(argc, argv, hint)
106 int argc;
107 char **argv;
108 int *hint;
109 {
110     *hint = atoi(argv[0]);
111     return(UPCALL_STOP);
112 }
113
114
115 /* Used by qualified_get_server to make a list of servers to check */
116
117 qgetsv(argc, argv, sq)
118 int argc;
119 char **argv;
120 struct save_queue *sq;
121 {
122     sq_save_data(sq, strsave(argv[0]));
123     return(UPCALL_CONT);
124 }
125
126
127 /* Used by get_server_info to record all of the returned information */
128
129 getsvinfo(argc, argv, sserv)
130 int argc;
131 char **argv;
132 struct service *sserv;
133 {
134     sserv->service = strsave(argv[0]);
135     sserv->interval = atoi(argv[1]);
136     sserv->target = strsave(argv[2]);
137     sserv->script = strsave(argv[3]);
138     sserv->dfgen = atoi(argv[4]);
139     sserv->dfcheck = atoi(argv[5]);
140     sserv->type = strsave(argv[6]);
141     sserv->enable = atoi(argv[7]);
142     sserv->inprogress = atoi(argv[8]);
143     sserv->harderror = atoi(argv[9]);
144     sserv->errmsg = strsave(argv[10]);
145     return(UPCALL_STOP);
146 }
147
148
149 /* Scan the services and process any that need it. */
150
151 do_services()
152 {
153     char *qargv[6];
154     struct save_queue *sq, *sq_create();
155     char *service, dfgen_prog[64], dfgen_cmd[128];
156     struct service svc;
157     int status, lock_fd, ex;
158     struct timezone tz;
159     register char *p;
160     union wait waits;
161
162     if (dbg & DBG_VERBOSE)
163         com_err(whoami, 0, "starting pass over services");
164
165     qargv[0] = "true";
166     qargv[1] = "dontcare";
167     qargv[2] = "false";
168     sq = sq_create();
169     if (status = sms_query("qualified_get_server", 3, qargv, qgetsv, sq)) {
170         com_err(whoami, status, " getting services");
171         leave("query failed");
172     }
173     while (sq_get_data(sq, &service)) {
174         for (p = service; *p; p++)
175           if (isupper(*p))
176             *p = tolower(*p);
177         com_err(whoami, 0, "checking %s...", service);
178         qargv[0] = service;
179         sprintf(dfgen_prog, "%s/bin/%s.gen", SMS_DIR, service);
180         if (!file_exists(dfgen_prog)) {
181             com_err(whoami, 0, "prog %s doesn't exist\n", dfgen_prog);
182             free(service);
183             continue;
184         }
185         sprintf(dfgen_cmd, "exec %s %s/dcm/%s.out",
186                 dfgen_prog, SMS_DIR, service);
187         gettimeofday(&tv, &tz);
188         if (status = sms_query("get_server_info", 1, qargv, getsvinfo, &svc)) {
189             com_err(whoami, status, " getting service %s info, skipping to next service", service);
190             continue;
191         }
192         svc.service = strsave(service);
193         qargv[0] = strsave(service);
194         qargv[1] = itoa(svc.dfgen);
195         qargv[2] = itoa(svc.dfcheck);
196         qargv[3] = strsave("0");
197         qargv[4] = itoa(svc.harderror);
198         qargv[5] = strsave(svc.errmsg);
199         if (svc.interval != 0) {
200             if (svc.interval * 60 + svc.dfcheck < tv.tv_sec) {
201                 lock_fd = maybe_lock_update(SMS_DIR, "@db@", service, 1);
202                 if (lock_fd < 0)
203                   goto free_service;
204                 free(qargv[3]);
205                 free(qargv[4]);
206                 free(qargv[5]);
207                 qargv[3] = strsave("1");
208                 qargv[4] = strsave("0");
209                 qargv[5] = strsave("");
210                 status = sms_query("set_server_internal_flags", 6, qargv,
211                                    scream, NULL);
212                 if (status != SMS_SUCCESS) {
213                     com_err(whoami, status, " setting server state");
214                     goto free_service;
215                 }
216             
217                 com_err(whoami, status, " running %s", dfgen_prog);
218                 waits.w_status = system(dfgen_cmd);
219                 if (waits.w_termsig) {
220                     status = SMS_TAR_FAIL;
221                     com_err(whoami, status, " %s exited on signal %d",
222                             dfgen_prog, waits.w_termsig);
223                 } else if (waits.w_retcode) {
224                     /* extract the process's exit value */
225                     status = waits.w_retcode + sms_err_base;
226                     com_err(whoami, status, " %s exited", dfgen_prog);
227                 }
228                 if (SOFT_FAIL(status)) {
229                     free(qargv[5]);
230                     qargv[5] = strsave(error_message(status));
231                 } else if (status == SMS_NO_CHANGE) {
232                     free(qargv[2]);
233                     qargv[2] = itoa(tv.tv_sec);
234                     svc.dfcheck = tv.tv_sec;
235                 } else if (status == SMS_SUCCESS) {
236                     free(qargv[1]);
237                     free(qargv[2]);
238                     qargv[1] = itoa(tv.tv_sec);
239                     qargv[2] = strsave(qargv[1]);
240                     svc.dfcheck = svc.dfgen = tv.tv_sec;
241                 } else { /* HARD_FAIL(status) */
242                     free(qargv[2]);
243                     free(qargv[4]);
244                     free(qargv[5]);
245                     qargv[2] = itoa(tv.tv_sec); 
246                     svc.dfcheck = tv.tv_sec;
247                     qargv[4] = itoa(status);
248                     qargv[5] = strsave(error_message(status));
249                     critical_alert("DCM","DCM building config files for %s: %s",
250                                   service, qargv[5]);
251                 }
252             free_service:
253                 free(qargv[3]);
254                 qargv[3] = strsave("0");
255                 status = sms_query("set_server_internal_flags", 6, qargv,
256                                    scream, NULL);
257                 if (status) {
258                     com_err(whoami, status,
259                             " setting service state, sleeping");
260                     sleep(DEADLOCK_WAIT);
261                     status = sms_query("set_server_internal_flags", 6, qargv,
262                                        scream, NULL);
263                     if (status)
264                       com_err(whoami, status, " setting service state again");
265                 }
266                 close(lock_fd);
267                 free(qargv[0]);
268                 free(qargv[1]);
269                 free(qargv[2]);
270                 free(qargv[3]);
271                 free(qargv[4]);
272                 free(qargv[5]);
273             }
274             if (!strcmp(svc.type, "REPLICAT"))
275               ex = 1;
276             else
277               ex = 0;
278             lock_fd = maybe_lock_update(SMS_DIR, "@db@", service, ex);
279             if (lock_fd >= 0) {
280                 do_hosts(&svc);
281                 close(lock_fd);
282             }
283         }
284         free(svc.service);
285         free(svc.target);
286         free(svc.script);
287         free(svc.type);
288         free(svc.errmsg);
289         free(service);
290     }
291     sq_destroy(sq);
292 }
293
294
295 /* Used by qualified_get_server_host to make a list of hosts to check */
296
297 qgethost(argc, argv, sq)
298 int argc;
299 char **argv;
300 struct save_queue *sq;
301 {
302     sq_save_data(sq, strsave(argv[1]));
303     return(UPCALL_CONT);
304 }
305
306
307 /* Used by get_server_host_info to store all of the info about a host */
308
309 gethostinfo(argc, argv, shost)
310 int argc;
311 char **argv;
312 struct svrhost *shost;
313 {
314     shost->service = strsave(argv[0]);
315     shost->machine = strsave(argv[1]);
316     shost->enable = atoi(argv[2]);
317     shost->override = atoi(argv[3]);
318     shost->success = atoi(argv[4]);
319     shost->inprogress = atoi(argv[5]);
320     shost->hosterror = atoi(argv[6]);
321     shost->errmsg = strsave(argv[7]);
322     shost->lasttry = atoi(argv[8]);
323     shost->lastsuccess = atoi(argv[9]);
324     shost->value1 = atoi(argv[10]);
325     shost->value2 = atoi(argv[11]);
326     shost->value3 = strsave(argv[12]);
327     return(UPCALL_STOP);
328 }
329
330
331 /* Scans all of the hosts for a particular service, and processes them. */
332
333 do_hosts(svc)
334 struct service *svc;
335 {
336     char *argv[9], *machine;
337     int status, lock_fd;
338     struct save_queue *sq;
339     struct svrhost shost;
340
341     sq = sq_create();
342     argv[0] = svc->service;
343     argv[1] = "TRUE";
344     argv[2] = argv[3] = argv[4] = "DONTCARE";
345     argv[5] = "FALSE";
346     status = sms_query("qualified_get_server_host", 6, argv, qgethost, sq);
347     if (status == SMS_NO_MATCH) {
348         return;
349     } else if (status) {
350         com_err(whoami, status, " getting server_hosts for  %s", svc->service);
351         return;
352     }
353     while (sq_get_data(sq, &machine)) {
354         if (dbg & DBG_TRACE)
355           com_err(whoami, 0, "checking %s...", machine);
356         argv[1] = machine;
357         status = sms_query("get_server_host_info", 2, argv,gethostinfo, &shost);
358         if (status) {
359             com_err(whoami,status, " getting server_host_info for %s", machine);
360             goto free_mach;
361         }
362         if (!shost.enable || shost.hosterror ||
363             (shost.success && !shost.override &&
364              shost.lastsuccess >= svc->dfgen)) {
365             if (dbg & DBG_TRACE)
366               com_err(whoami, 0, "not updating %s:%s", svc->service, machine);
367             goto free_mach;
368         }
369
370         lock_fd = maybe_lock_update(SMS_DIR, machine, svc->service, 1);
371         if (lock_fd < 0)
372           goto free_mach;
373         argv[0] = svc->service;
374         argv[1] = machine;
375         argv[2] = argv[3] = argv[5] = "0";
376         argv[4] = "1";
377         argv[6] = strsave("");
378         argv[7] = itoa(tv.tv_sec);
379         argv[8] = itoa(shost.lastsuccess);
380         status = sms_query("set_server_host_internal", 9, argv,scream,NULL);
381         if (status != SMS_SUCCESS) {
382             com_err(whoami,status," while setting internal state for %s:%s",
383                     svc->service, machine);
384             goto free_mach;
385         }
386         status = sms_update_server(svc->service, machine, svc->target,
387                                    svc->script);
388         if (status == SMS_SUCCESS) {
389             argv[2] = "0";
390             argv[3] = "1";
391             free(argv[8]);
392             argv[8] = itoa(tv.tv_sec);
393         } else if (SOFT_FAIL(status)) {
394             free(argv[6]);
395             argv[6] = strsave(error_message(status));
396         } else { /* HARD_FAIL */
397             argv[2] = itoa(shost.override);
398             argv[5] = itoa(status);
399             free(argv[6]);
400             argv[6] = strsave(error_message(status));
401             critical_alert("DCM", "DCM updating %s:%s: %s",
402                            machine, svc->service, argv[6]);
403             if (!strcmp(svc->type, "REPLICAT")) {
404                 char *qargv[6];
405
406                 svc->harderror = status;
407                 svc->errmsg = strsave(argv[6]);
408                 qargv[0] = strsave(svc->service);
409                 qargv[1] = itoa(svc->dfgen);
410                 qargv[2] = itoa(svc->dfcheck);
411                 qargv[3] = strsave("0");
412                 qargv[4] = itoa(svc->harderror);
413                 qargv[5] = strsave(svc->errmsg);
414                 status = sms_query("set_server_internal_flags",
415                                    6, qargv, scream, NULL);
416                 if (status) {
417                     com_err(whoami, status,
418                             " setting service state, sleeping");
419                     sleep(DEADLOCK_WAIT);
420                     status = sms_query("set_server_internal_flags",
421                                        6, qargv, scream, NULL);
422                     if (status)
423                       com_err(whoami, status, " setting service state again");
424                 }
425                 free(qargv[0]);
426                 free(qargv[1]);
427                 free(qargv[2]);
428                 free(qargv[3]);
429                 free(qargv[4]);
430                 free(qargv[5]);
431                 close(lock_fd);
432                 free(argv[2]);
433                 argv[4] = "0";
434                 free(argv[5]);
435                 status = sms_query("set_server_host_internal",
436                                    9, argv,scream,NULL);
437                 if (status) {
438                     com_err(whoami, status,
439                             " setting host state, sleeping");
440                     sleep(DEADLOCK_WAIT);
441                     status = sms_query("set_server_host_internal",
442                                        9, argv,scream,NULL);
443                     if (status)
444                       com_err(whoami, status, " setting host state again");
445                 }
446                 return(-1);
447             }
448             free(argv[2]);
449             free(argv[5]);
450         }
451         argv[4] = "0";
452         close(lock_fd);
453         status = sms_query("set_server_host_internal", 9, argv,scream,NULL);
454         if (status) {
455             com_err(whoami, status, " setting host state, sleeping");
456             sleep(DEADLOCK_WAIT);
457             status = sms_query("set_server_host_internal", 9, argv,scream,NULL);
458             if (status)
459               com_err(whoami, status, " setting host state again");
460         }
461     free_mach:
462         free(machine);
463         close(lock_fd);
464     }
465     return(0);
466 }
This page took 0.082431 seconds and 5 git commands to generate.