"Fossies" - the Fresh Open Source Software Archive 
Member "freeha-1.0/freehad.c" (23 Nov 2006, 27563 Bytes) of package /linux/privat/old/freeha-1.0.tar.gz:
As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
1 /* actual workhorse demon for "FreeHA".
2 *
3 * This program should handle all of the following:
4 *
5 * -> send status 'heartbeats' to all configured networks
6 *
7 * -> listen for 'heartbeats' from other nodes
8 *
9 * -> call 'starthasrv' if it has been determined that this node
10 * should become "active"
11 *
12 * -> call 'monitorhasrv' periodically to see if there are errors
13 *
14 * -> call 'alerthasrv' if an error has been detected.
15 *
16 * -> call 'stophasrv' if an error has been detected.
17 * (It is up to the sysadmin to make sure 'stophasrv' gets
18 * called when the machine shuts down 'normally')
19 *
20 * -> stay in 'errored' state once entered, until killed
21 *
22 */
23
/* SCCS-style "what" identification string; usage() prints it as the
 * code release.  It points at a string literal, so it is const-qualified.
 */
static const char *versionstring="@(#) freehad.c 1.36@(#)";
25
26 #include <stdio.h>
27 #include <unistd.h>
28 #include <sys/param.h> /* for MAXPATHLEN */
29 #include <stdlib.h>
30 #include <errno.h>
31 #include <time.h>
32
33 #if defined(__svr4__) || defined(__linux__)
34 # include <string.h>
35 # define bcopy(src,dest,sz) memcpy(dest,src,sz)
36 # define bzero(dest,sz) memset(dest,0,sz)
37 #else
38 # include <strings.h>
39 #endif /* __svr4__ */
40
41 #include <ctype.h>
42
43 #include <sys/types.h>
44 #include <sys/socket.h>
45 #include <netinet/in.h>
46 #include <arpa/inet.h>
47
48 #ifdef USE_SIGNAL
49 #include <signal.h>
50 #endif
51
52 #ifdef USE_SYSLOG
53 #include <syslog.h>
54 #endif
55
56 #include <sys/wait.h> /* should be right on solaris, linux, AND freebsd */
57
58 #include <sys/stat.h>
59 #include <fcntl.h>
60 #include <sys/ioctl.h>
61
62 #ifndef FIONBIO
63 #include <sys/filio.h> /* WHY do I need this specifically, when it is
64 included in sys/ioctl.h ? ? ? */
65 #endif
66
67 #include "freehad.h"
68
69 /**********************************************************************/
70 /**********************************************************************/
71
72 /* for the heartbeat addrs/nets */
73 struct in_addr ipaddr[3]; /* for listening */
74 struct in_addr netaddr[3];/* for sending (broadcast)*/
75
76 int listensock[3], sendsock[3];
77
78 int freeha_port=0xf33;
79
80 char state_file[1024]; /* path to 'state file', that we write */
81 int state_fd;
82
83 struct statelist{
84 struct freeha_msg msg;
85 struct statelist *next,*prev;
86 };
87
88 /* list of every system we know about */
89 struct statelist *host_states;
90
91 char myhostname[100];
92 char scriptdir[MAXPATHLEN];
93
94 int mystate=INITIAL; /* the 'HA status' of this node */
95 int am_main=0; /* Am I the main node? dont think sooo... */
96
97 int num_nodes=2; /* number of machines in cluster. used for
98 * 'quorum' purposes.
99 * (avoiding split-brain, if more than 2 nodes)
100 */
101 int quorum_count; /* filled in by main() */
102
103 int heartbeatsec=1; /* not used currently */
104 int monitorsec=1; /* seconds delay between monitor/hb loops */
105 int timeoutsec=121; /* seconds after which a node with no heartbeat is
106 * assumed 'Missing In Action' (ie 'dead')
107 */
108
109 int need_to_stop=0; /* flag for main loop to call stop_services() */
110 int need_to_start=0; /* flag for main loop to call start_services() */
111 int need_to_quit=0; /* If received SIGINT, quit cleanly */
112 int have_error=0;
113
114 /**********************************************************************/
115
116 /* I want to keep this SIMPLE, by using getopt()
117 * Unfortunately, getopt does not support more complex arg strings, but
118 * only single-letter options.
119 * Hence why the options are a leetle bit unwieldy
120 */
121 void usage(){
122 puts("Usage:");
123 puts(" (-a|-b|-c) {IPaddr} IP addr 1,2 or 3, for src addr");
124 puts(" (-A|-B|-C) {addr} BROADCAST addr 1,2 or 3 for dest broadcast");
125 puts(" -h help (This text)");
126 puts(" -l {statefile} file to write ongoing state to");
127 puts(" -n {nodes} number of nodes in cluster (default 2)");
128 puts(" -p {port} UDP port for heartbeat(default 0xfee)");
129 puts(" -s directory script directory (default /opt/freeha/bin)");
130 puts(" -M {sec} number of seconds between monitorings(defalt 1)");
131 puts(" -T {timeoutsec} number of seconds to timeout a node(dead)");
132 printf(" (default=%d)\n",timeoutsec);
133 puts("");
134 puts("Note that -M specifies 'delay beween', not 'do every'");
135 puts("");
136 printf("Code Release: %s\n",versionstring);
137 }
138
139
#ifdef USE_SIGNAL
/* Signal handler: only records what was requested in global flags.
 *
 *   SIGINT, SIGTERM : request quit AND stop (a quit implies a stop)
 *   SIGHUP          : request stop
 *   SIGUSR1         : forced (re)start; clears the error flag first
 *   SIGUSR2         : clears the error flag only, and moves an ERRORED
 *                     node back to STANDBY
 *
 * Any other signal number is ignored.
 */
void sighandler(int sigarg){
	if(sigarg==SIGINT || sigarg==SIGTERM){
		need_to_quit=1;
		need_to_stop=1;
		return;
	}
	if(sigarg==SIGHUP){
		need_to_stop=1;
		return;
	}
	if(sigarg==SIGUSR1){
		/* This is a FORCED (re)-start. So we clear error flags */
		have_error=0;
		need_to_start=1;
		return;
	}
	if(sigarg==SIGUSR2){
		/* Just clear the error flag. Dont actually DO anything */
		have_error=0;
		if(mystate==ERRORED){
			mystate=STANDBY;
		}
	}
}
#endif /* USE_SIGNAL */
164
165 void init_state_file(char *statefile_name)
166 {
167 if (statefile_name != NULL){
168 strncpy(state_file,statefile_name,1024);
169 } else {
170 if(access("/var/run", X_OK|W_OK)==0) {
171 strcpy(state_file,"/var/run/freeha");
172 } else if(access("/var/freeha", X_OK|W_OK)==0){
173 strcpy(state_file,"/var/freeha/state");
174 } else {
175 fprintf(stderr,"ERROR: Do not know where to create state file\n");
176 fprintf(stderr,"/var/run and /var/freeha are not accessible\n");
177 fprintf(stderr,"Use -l to specify location of state file\n");
178
179 exit(1);
180 }
181 }
182
183 /* this is just to ensure that the creat() call wont fail with EEXIST.
184 * we dont care if there isnt an old statefile there already
185 */
186 unlink(state_file);
187
188 state_fd=creat(state_file, S_IRWXU|S_IRGRP|S_IROTH);
189 if(state_fd <0) {
190 perror("could not create state file");
191 exit(1);
192 }
193 }
194
/* called as subroutine for opensockets()
 *
 * Create a UDP socket, enable SO_REUSEADDR and SO_BROADCAST on it, bind
 * it to *addr_ptr and switch it to non-blocking mode.
 *
 * addr_ptr: fully filled-in address/port to bind to.
 * Returns the socket descriptor, or -1 on failure.  On a bind failure
 * errno is left holding the bind() error, so the caller may perror().
 */
int opensock(struct sockaddr_in *addr_ptr)
{
	int enable=1;
	int newsock=socket(AF_INET,SOCK_DGRAM,0);
	if(newsock<0){
		perror("could not create new socket?? !!");
		return -1;
	}

	/* SO_REUSEADDR only influences bind(), so it has to be set before
	 * binding.  Setting it afterwards has no effect.
	 */
	if(setsockopt(newsock, SOL_SOCKET,SO_REUSEADDR,
		      &enable, sizeof(enable)) <0)
	{
		perror("warning: could not set SO_REUSEADDR");
	}

	/* Okay, only half the sockets we make, need this option set.
	 * But no harm in setting it, as far as I can see!
	 */
	if(setsockopt(newsock, SOL_SOCKET,SO_BROADCAST,
		      &enable, sizeof(enable)) <0)
	{
		perror("warning: could not set SO_BROADCAST");
	}

	if(bind(newsock, (struct sockaddr*)addr_ptr,
		sizeof(struct sockaddr_in)) <0)
	{
		int bind_errno=errno;

		printf("Error binding %s:%d\n",inet_ntoa(addr_ptr->sin_addr),
		       (int)ntohs(addr_ptr->sin_port));
		errno=bind_errno;
		perror("bind");
		close(newsock);		/* do not leak the descriptor */
		errno=bind_errno;	/* keep the real reason for the caller */
		return -1;
	}

	/* I would think that NONBLOCK would work.
	 * Except that it doesnt seem to work for solaris. GRRR.
	 * So thats why FIONBIO is here also
	 */
#ifdef FIONBIO
	ioctl(newsock,FIONBIO, &enable);
#else
	fcntl(newsock,F_SETFL, O_NONBLOCK);
#endif

	return newsock;
}
233
234
235 /* Open the sending sockets, and the listening sockets */
236 void opensockets()
237 {
238 struct sockaddr_in addr;
239 int loop;
240 int addrcount=0;
241
242 bzero(&addr, sizeof(struct sockaddr_in));
243
244 addr.sin_family=AF_INET;
245 addr.sin_port=htons(freeha_port);
246
247
248 for(loop=0; loop<3; loop++) {
249 addr.sin_addr.s_addr = ipaddr[loop].s_addr;
250 if(addr.sin_addr.s_addr == INADDR_ANY) continue;
251
252 printf("Trying to bind for send %s:%d\n",
253 inet_ntoa(addr.sin_addr),
254 freeha_port);
255
256 sendsock[loop]=opensock(&addr);
257 if(sendsock[loop] >=0){
258 addrcount++;
259 }
260
261 addr.sin_addr.s_addr = netaddr[loop].s_addr;
262 if(addr.sin_addr.s_addr == INADDR_ANY){
263 netaddr[loop].s_addr = ipaddr[loop].s_addr;
264 /* well durnit. try to GUESS at broadcast,
265 * since unspecified.
266 * assume they want class C broadcast.
267 */
268 netaddr[loop].s_addr = htonl(netaddr[loop].s_addr);
269 /* We KNOW it is in MSB format now */
270 netaddr[loop].s_addr |= 0x000000ff;
271
272 netaddr[loop].s_addr = ntohl(netaddr[loop].s_addr);
273 addr.sin_addr.s_addr = netaddr[loop].s_addr;
274
275 printf("Have adjusted broadcast to %s\n",
276 inet_ntoa(addr.sin_addr));
277 }
278
279 printf("Trying to bind LISTEN for %s:%d\n",
280 inet_ntoa(addr.sin_addr),
281 freeha_port);
282 listensock[loop]=opensock(&addr);
283 if(listensock[loop] <0){
284 perror("ERROR trying to bind listen socket");
285 fprintf(stderr,"failed for %s:%d\n",
286 inet_ntoa(addr.sin_addr),
287 freeha_port);
288 exit(1);
289 }
290
291 }
292
293 if(addrcount==0){
294 fprintf(stderr,"ERROR: could not bind any addresses\n");
295 exit(1);
296 }
297
298 }
299
300
301 /* This wrapper exists to call the external alerthasrv script (if it exists)
302 * when we change our internal state.
303 * It first changes the global state variable, 'mystate'
304 * It then calls the external script, with:
305 * alerthasrv [statenum] [statename]
306 */
307
308 void changestate(int newstate)
309 {
310 static int oldstate=UNKNOWNSTATE;
311 char *statestring;
312 char runstring[50];
313
314 if(newstate==oldstate){
315 return;
316 }
317 oldstate=newstate;
318 mystate=newstate;
319
320 /* Using switch statement instead of array, for error checking */
321 switch(newstate){
322 case UNKNOWNSTATE: statestring="UNKNOWN_STATE"; break;
323 case INITIAL: statestring="INITIAL"; break;
324 case RUNNING: statestring="RUNNING"; break;
325 case STOPPING: statestring="STOPPING"; break;
326 case STARTING: statestring="STARTING"; break;
327 case STANDBY: statestring="STANDBY"; break;
328 case ERRORED: statestring="ERRORED"; break;
329 case TIMEDOUT: statestring="TIMEDOUT"; break;
330 default: statestring="INTERNAL_ERROR"; break;
331 }
332 sprintf(runstring,"%s/alerthasrv %d %s",scriptdir, newstate,
333 statestring);
334 system(runstring);
335 /* technically this could fail, but... oh well */
336
337
338 }
339
340 /* Called only by storemsg().
341 * We have a state message from a machine we dont recognize.
342 * In this case, we automatically add that host's state into our list of
343 * host states
344 */
345 void addhost(struct freeha_msg* msg)
346 {
347 struct statelist *newentry,*oldentry;
348 int compare=0;
349
350 newentry=(struct statelist*)malloc(sizeof(struct statelist));
351 if(newentry==NULL){
352 changestate(ERRORED);
353 fprintf(stderr,"ERROR: addhost out of memory!\n");
354 return;
355 }
356
357 newentry->next=newentry->prev=NULL;
358
359 bcopy(msg, &newentry->msg, sizeof(struct freeha_msg));
360 if(host_states==NULL){
361 host_states=newentry;
362 return;
363 }
364 oldentry=host_states;
365
366 /*We need to alphabetize insert! So compare...*/
367 do {
368 compare=strcasecmp(oldentry->msg.srchost,
369 newentry->msg.srchost);
370
371 if(compare >0){
372 newentry->next=oldentry;
373 newentry->prev=oldentry->prev;
374 if(oldentry->prev==NULL){
375 host_states=newentry;
376 } else {
377 oldentry->prev->next=newentry;
378 }
379 oldentry->prev=newentry;
380 return;
381 }
382 if(oldentry->next==NULL){
383 break;
384 } else {
385 oldentry=oldentry->next;
386 }
387 } while(1);
388
389 /* oh well. slap on the end, then. */
390 oldentry->next=newentry;
391 newentry->prev=oldentry;
392
393 }
394
395 /* Once we have received a status (heartbeat) message from another system,
396 * store it in our state table.
397 * If system is not already present, then add it.
398 */
399 void storemsg(struct freeha_msg* msg)
400 {
401 int version;
402 struct statelist *hoststate=host_states;
403 version=ntohl(msg->version);
404 if(version != HA_VERSION) {
405 printf("ERROR: storemsg got msg version=%d; expecting %d\n",
406 version, HA_VERSION);
407 return;
408 }
409
410 msg->timestamp=time(NULL);
411
412 while(hoststate !=NULL){
413 if(strncmp(hoststate->msg.srchost, msg->srchost, HA_HOSTLEN)==0){
414 bcopy(msg, &hoststate->msg, sizeof(struct freeha_msg));
415 return;
416 }
417 hoststate=hoststate->next;
418 }
419 /* HOST not found. so auto-add to our state list. */
420 addhost(msg);
421 }
422
423 /* dump the states of all hosts we know about.
424 * we dump this to the state file. So the state file should ideally be
425 * on a tmpfs filesystem.
426 * /var/run usually fulfills this purpose well
427 */
428 void dump_states()
429 {
430 char hoststate[50], *statestring;
431 struct statelist *stateptr=host_states;
432 int state;
433
434 /* 'rewind' the file */
435 lseek(state_fd, 0, SEEK_SET);
436
437 #ifdef DEBUG
438 puts("Writing to state file:");
439 #endif
440 while(stateptr !=NULL){
441 state=ntohl(stateptr->msg.status);
442 switch(state){
443 case INITIAL:
444 statestring="Initial";
445 break;
446 case RUNNING:
447 statestring="Running";
448 break;
449 case STOPPING:
450 statestring="Stopping";
451 break;
452 case STARTING:
453 statestring="Starting"; /* starting services */
454 break;
455 case STANDBY:
456 statestring="Standby";
457 break;
458 case ERRORED:
459 statestring="ERRORED";
460 break;
461 case TIMEDOUT:
462 statestring="TIMEOUT";
463 break;
464 default:
465 statestring="[invalid state]";
466 break;
467 }
468 sprintf(hoststate,"%.32s [%s] %s",
469 stateptr->msg.srchost, statestring,
470 ctime(&(stateptr->msg.timestamp)) );
471
472 /* remember, ctime adds \n */
473
474 write(state_fd,hoststate,strlen(hoststate));
475 #ifdef DEBUG
476 printf(" %s",hoststate);
477 #endif
478
479 stateptr=stateptr->next;
480 }
481 }
482
483 /* used to see if we have enough active nodes in cluster to
484 * constitute "quorum". We want to avoid mini-clusters occuring
485 * in our cluster machines.
486 * Mainly used on initial demon startup only.
487 */
488 int count_activenodes()
489 {
490 struct statelist *stateptr=host_states;
491 int activecount=0;
492
493
494 while(stateptr !=NULL){
495 int state=ntohl(stateptr->msg.status);
496 switch(state){
497 case RUNNING:
498 case STANDBY:
499 case INITIAL:
500 case STOPPING:
501 case STARTING:
502 case ERRORED:
503 activecount+=1;
504 }
505 stateptr=stateptr->next;
506 }
507 return activecount;
508 }
509
510 /* Determine if services are either running, or in process
511 * of shutting down, SOMEWHERE in the cluster
512 * return 1 if yes, 0 if no.
513 */
514 int check_running()
515 {
516 struct statelist *stateptr=host_states;
517 int services_up=0;
518
519 while(stateptr !=NULL){
520 int state=ntohl(stateptr->msg.status);
521 switch(state){
522 case RUNNING:
523 case STARTING:
524 case STOPPING:
525 services_up=1;
526 }
527 stateptr=stateptr->next;
528 }
529
530 return services_up;
531 }
532
533 void sendheartbeat(){
534 struct freeha_msg message;
535 int msglen=sizeof(message);
536 int loop;
537 struct sockaddr_in addr;
538
539 message.version=htonl(HA_VERSION);
540 message.status=htonl(mystate);
541 message.timestamp=0;
542 strcpy(message.srchost, myhostname);
543
544 addr.sin_family=AF_INET;
545 addr.sin_port=htons(freeha_port);
546
547 for(loop=0; loop<3; loop++) {
548 if(sendsock[loop]==-1) continue;
549
550 addr.sin_addr.s_addr = netaddr[loop].s_addr;
551 if(sendto(sendsock[loop], &message, msglen,
552 NULL, (struct sockaddr*)&addr,
553 sizeof(struct sockaddr_in)) != msglen)
554 {
555 perror("heartbeat send failed");
556 }
557 }
558 }
559
560
561 /* return 1 on successfull read, 0 on none, -1 on error */
562 int readheartbeat(int socket)
563 {
564 struct freeha_msg incoming;
565 int readstat;
566
567 if(socket<0) return 0;
568
569 /* Note that this would get very unhappy if we somehow read less
570 * than a full heatbeat.
571 * At some point, will use NREADY, etc.
572 */
573
574 readstat=recvfrom(socket, &incoming,sizeof(incoming),
575 NULL, NULL,0);
576
577 if(readstat < (int)sizeof(incoming)){
578 switch(errno){
579 #ifdef EWOULDBLOCK
580 case EWOULDBLOCK:
581 #else
582 default:
583 #endif
584 return 0;
585 }
586 perror("readheartbeat: Error reading socket!");
587 return -1;
588 }
589
590 storemsg(&incoming);
591 return 1;
592 }
593
594 /* Kinda ugly, but...
595 * basically, read in all network traffic for port that is queued,
596 * for each interface/addr that we know about.
597 * (Just in case we have fallen behind in our processing.
598 * Which is quite likely, with long monitor scripts)
599 */
600 void readheartbeats()
601 {
602 while(readheartbeat(listensock[0])==1);
603 while(readheartbeat(listensock[1])==1);
604 while(readheartbeat(listensock[2])==1);
605 }
606
607
608 /* This is the wrapper around the 'starthasrv' script.
609 * It calls the script, checks the status of it, and
610 * then sets mystate to either RUNNING, or ERRORED, as appropriate.
611 */
612 void start_services()
613 {
614 char startscript[MAXPATHLEN+20];
615 int startstate;
616
617 need_to_start=0;
618 if(mystate == RUNNING){
619 puts("ERROR: start_services called when already RUNNING");
620 return;
621 }
622
623
624 #ifdef USE_SYSLOG
625 syslog(LOG_NOTICE,"starting services");
626 #else
627 puts("DEBUG: start_services() calling start script");
628 #endif
629
630 sprintf(startscript,"%s/%s",scriptdir, "starthasrv");
631 startstate=system(startscript);
632 if(startstate==0) {
633 changestate(RUNNING);
634 #ifdef USE_SYSLOG
635 syslog(LOG_NOTICE,"starthasrv returned successfully");
636 #else
637 puts("DEBUG: start_services() returning successfully");
638 #endif
639 return;
640 }
641
642
643 #ifdef USE_SYSLOG
644 syslog(LOG_ERR,"Error hit from starthasrv script");
645 #else
646 fprintf(stderr,"ERROR: start_services (): starthasrv failed!(err %d)\n",
647 startstate);
648 #endif
649 changestate(ERRORED);
650
651 }
652
653 /* stop_services: called by main loop, as a result of
654 * either the monitor script, or indirectly by signal handler
655 * if we get a HUP or INT signal
656 *
657 * - changes state to STOPPING
658 * - sends heartbeat
659 * - calls stophasrv script
660 * - changes state to STANDBY
661 * - sends heartbeat
662 */
663 void stop_services()
664 {
665 char stopscript[MAXPATHLEN+20];
666
667 switch(mystate){
668 case RUNNING:
669 case STOPPING:
670 case INITIAL:
671 case ERRORED:
672 break;
673 default:
674 puts("ERROR: stop_services called in unexpected state");
675 return;
676 }
677
678 need_to_start=0; /* stop takes precedence over start */
679 need_to_stop=0;
680
681 #ifdef USE_SYSLOG
682 syslog(LOG_NOTICE,"stopping services");
683 #endif
684 changestate(STOPPING);
685 sendheartbeat();
686 dump_states();
687
688 sprintf(stopscript,"%s/%s",scriptdir, "stophasrv");
689 system(stopscript);
690
691 }
692
693 /* Run monitor script.
694 * This monitors the state of actively running services.
695 * Change mystate to 'ERRORED', if monitor fails.
696 * On ERRORED, will also set need_to_stop=1, and have_error=1
697 *
698 * Called specially, on first startup (startup=1), to determine if
699 * demon has been started on a node where services are already running.
700 * In this case, will either set mystate to RUNNING, or do nothing.
701 */
702 void monitor_services(int firsttime)
703 {
704 char monitorscript[MAXPATHLEN+20];
705 int mon_status;
706
707 sprintf(monitorscript,"%s/%s",scriptdir, "monitorhasrv");
708
709
710 /* only run monitor, if we have reason to do so! */
711 if(firsttime==0){
712 switch(mystate) {
713 case RUNNING:
714 break;
715 default:
716 return;
717 }
718 }
719
720 mon_status=system(monitorscript);
721 mon_status = WEXITSTATUS(mon_status);
722
723
724 switch(mon_status){
725 case 0: /* Services are running OK */
726 if(firsttime==1){
727 #ifdef USE_SYSLOG
728 syslog(LOG_NOTICE,"services detected as already running");
729 #else
730 puts("DEBUG: services already running");
731 #endif
732 changestate(RUNNING);
733 return;
734 }
735 break;
736
737 default:
738
739 if(firsttime==0){
740 #ifdef USE_SYSLOG
741 syslog(LOG_ERR,"monitorhasrv detected error");
742 #else
743 printf("monitor status=%d: ERRORED!\n",mon_status);
744 #endif
745 changestate(ERRORED);
746 have_error=1;
747 } else {
748 printf("Services not running.");
749 printf("Will Run stop_services for cleanup\n");
750 }
751
752 /* Sneaky hack: if firsttime started, and services
753 * are not running... flag that we should run
754 * stop_services, to clean up anything left behind
755 */
756 need_to_stop=1;
757 }
758
759 }
760
761
762 /* Run through list of systems, and see if we should timeout
763 * any of them.
764 */
765 void do_timeouts()
766 {
767 struct statelist *stateptr=host_states;
768 time_t timenow=time(NULL);
769 int timeoutstate=htonl(TIMEDOUT);
770
771 while(stateptr !=NULL){
772 /* Skip my own line */
773 if(strncmp(stateptr->msg.srchost, myhostname, HA_HOSTLEN)==0){
774 stateptr=stateptr->next;
775 continue;
776 }
777 if(stateptr->msg.status==timeoutstate){
778 /* already in timeout state */
779 stateptr=stateptr->next;
780 continue;
781 }
782
783 if((timenow - stateptr->msg.timestamp) > timeoutsec){
784 #ifdef USE_SYSLOG
785 syslog(LOG_ERR,"timeout for %s",stateptr->msg.srchost);
786 #else
787
788 printf("DEBUG: timeout for system %s\n",
789 stateptr->msg.srchost);
790 #endif
791 stateptr->msg.status=timeoutstate;
792 }
793 stateptr=stateptr->next;
794 }
795
796 }
797
798 /* Determine who should be 'main' system.
799 * If me, then flag 'need_to_start' so that main loop
800 * will trigger service_start
801 *
802 * Will only flag need_to_start, if
803 * - All nodes, or at minimum a 'quorum' (50%+1) are heard from
804 * AND
805 * - we are alphabetically first in the list of machines that arent screwed up
806 * AND
807 * - we are in 'STANDBY' state
808 * AND
809 * - there is no other machine in state RUNNING or STOPPING or STARTING
810 *
811 */
812 void check_main()
813 {
814 struct statelist *stateptr=host_states;
815
816 if(am_main){
817 return;
818 }
819 if(mystate!=STANDBY){
820 return;
821 }
822
823 /* Check for if we're in the majority half of the cluster,
824 * in case of partial network failure.
825 * 2-node cluster is special case. We dont meetnormal quorum count,
826 * of 50%+1.
827 * But on the other hand, we still have to try to start services,
828 * or whats the point of having a cluster in the first place!
829 */
830 if(num_nodes>2){
831 if(count_activenodes() <quorum_count){
832 return;
833 }
834 }
835 if(check_running()==1){
836 /* cant do anything, if another node is already
837 * running services!
838 */
839 return;
840 }
841
842 /* Now find first active, clean node, and if it is me,
843 * flag service start.
844 * Right now, that boils down to "first node in STANDBY state".
845 */
846 while(stateptr !=NULL){
847 int state=ntohl(stateptr->msg.status);
848 switch(state){
849 case INITIAL:
850 /* wait for other node to finish coming up */
851 return;
852
853 case STANDBY:
854 /* This is "first active node". Is it me? */
855 if(strcasecmp(stateptr->msg.srchost, myhostname)==0){
856 need_to_start=1;
857 }
858 return;
859 }
860 stateptr=stateptr->next;
861 }
862
863 }
864
865
866 int main(int argc, char *argv[])
867 {
868 int optc;
869 char *statefile=NULL,*nameparse;
870 FILE *pipefp;
871
872 bzero(ipaddr,sizeof(struct in_addr) *3);
873 bzero(netaddr,sizeof(struct in_addr) *3);
874
875 listensock[0]=-1;
876 listensock[1]=-1;
877 listensock[2]=-1;
878 sendsock[0]=-1;
879 sendsock[1]=-1;
880 sendsock[2]=-1;
881
882 pipefp=popen("uname -n","r");
883 fgets(myhostname,99,pipefp);
884 myhostname[99]='\0';
885 nameparse=&myhostname[0];
886 /* okay, "_" isnt legal, or didnt used to be. but people use it */
887 while(isalnum((int)*nameparse) ||
888 *nameparse=='-'|| *nameparse=='_'){
889 nameparse++;
890 }
891 *nameparse='\0';
892
893 printf("DEBUG: myhostname is '%s'\n",myhostname);
894 pclose(pipefp); /* you might need fclose() instead */
895
896
897 /* Need to:
898 - parse options
899 - bind UDP port
900 - broadcast our status
901 - listen for other machines' statuses
902 and then all the other stuff of course
903 */
904
905 strcpy(scriptdir,BINDIR);
906
907 while((optc=getopt(argc,argv,"a:b:c:A:B:C:l:mn:p:s:H:M:T:")) != -1){
908 switch(optc){
909 case 'a':
910 inet_aton(optarg, &ipaddr[0]);
911 break;
912 case 'b':
913 inet_aton(optarg, &ipaddr[1]);
914 break;
915 case 'c':
916 inet_aton(optarg, &ipaddr[2]);
917 break;
918
919 case 's':
920 strncpy(scriptdir,optarg,MAXPATHLEN-2);
921 printf("DEBUG: script dir now set to %s\n",optarg);
922 break;
923
924 case 'l':
925 statefile=optarg;
926 printf("DEBUG: statefile now set to %s\n",statefile);
927 break;
928
929 case 'm':
930 am_main=1;
931 puts("DEBUG: I think I am the main node");
932 puts(" I will start services in a few seconds.");
933 break;
934
935 case 'n':
936 num_nodes=atoi(optarg);
937 printf("DEBUG: number of nodes set to %d\n",num_nodes);
938 break;
939
940 case 'p':
941 freeha_port=atoi(optarg);
942 printf("DEBUG: port set to %d\n",freeha_port);
943 break;
944
945 case 'A':
946 inet_aton(optarg, &netaddr[0]);
947 break;
948 case 'B':
949 inet_aton(optarg, &netaddr[1]);
950 break;
951 case 'C':
952 inet_aton(optarg, &netaddr[2]);
953 break;
954 case 'M':
955 monitorsec=atoi(optarg);
956 printf("DEBUG: monitor delay set to %d\n",
957 monitorsec);
958 break;
959 case 'T':
960 timeoutsec=atoi(optarg);
961 printf("DEBUG: timeout seconds set to %d\n",
962 timeoutsec);
963 break;
964
965 default:
966 usage();
967 exit(1);
968 }
969 }
970
971 /* determine min number of nodes to have cluster happy */
972 quorum_count=(num_nodes/2) + 1;
973
974 { /* check to see we have valid script directory */
975 char scriptfile[MAXPATHLEN];
976 sprintf(scriptfile,"%s/%s",scriptdir,"starthasrv");
977 if(access(scriptfile,X_OK)!=0){
978 fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
979 exit(1);
980 }
981 sprintf(scriptfile,"%s/%s",scriptdir,"stophasrv");
982 if(access(scriptfile,X_OK)!=0){
983 fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
984 exit(1);
985 }
986 sprintf(scriptfile,"%s/%s",scriptdir,"monitorhasrv");
987 if(access(scriptfile,X_OK)!=0){
988 fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
989 exit(1);
990 }
991 }
992
993 printf("addr1=%s\n", inet_ntoa(ipaddr[0]));
994 printf("net1=%s\n", inet_ntoa(netaddr[0]));
995
996 init_state_file(statefile);
997 opensockets();
998
999 #ifdef USE_SIGNAL
1000 # ifdef __svr4__
1001 sigset(SIGUSR1,sighandler); /* start services */
1002 sigset(SIGUSR2,sighandler); /* start services */
1003 sigset(SIGHUP,sighandler); /* stop services */
1004 sigset(SIGINT,sighandler); /* stop services and QUIT */
1005 sigset(SIGTERM,sighandler); /* stop services and QUIT */
1006 # else
1007 signal(SIGUSR1,sighandler); /* start services */
1008 signal(SIGUSR2,sighandler); /* start services */
1009 signal(SIGHUP,sighandler); /* stop services */
1010 signal(SIGINT,sighandler); /* stop services and QUIT */
1011 signal(SIGTERM,sighandler); /* stop services and QUIT */
1012 # endif /* __svr4__*/
1013 #endif /* USE_SIGNAL */
1014
1015 #ifdef USE_SYSLOG
1016 openlog("FreeHA", LOG_CONS|LOG_PID, LOG_LOCAL1);
1017 syslog(LOG_NOTICE, "demon starting");
1018 #else
1019 puts("demon starting");
1020 #endif
1021
1022 /* In theory, on initial start, should loop for a few seconds
1023 * to figure out who gets to be top dog.
1024 * THEN start up services, if I am main node.
1025 * THEN go into main polling loop
1026 * Or then again.. just force startup script to tell us that
1027 * We are normally main node.
1028 * Probably that is safer.
1029 */
1030
1031 if(am_main){
1032 #ifdef USE_SYSLOG
1033 syslog(LOG_WARNING, "demon starting in FORCE-MAIN mode!!!");
1034 #else
1035 puts("DEBUG: called in force-main mode");
1036 puts(" broadcasting primary role");
1037 #endif
1038 changestate(STARTING);
1039 /* send a quickie msg right away, to try to inform
1040 * other nodes that may be coming up, "IM DOING IT!"
1041 * We get called if -m flag was added at startup.
1042 * Note that we dont do any cross-checks.
1043 * This makes using -m flag VERY DANGEROUS!!!
1044 */
1045 sendheartbeat();
1046 dump_states();
1047 sleep(1);
1048 sendheartbeat();
1049 start_services();
1050 } else {
1051 /* Seeding cluster state tables for other nodes.
1052 * Try for 200 seconds to see if we have all nodes
1053 * communicating.
1054 * If we see all nodes, continue immediately.
1055 */
1056 int loop=200;
1057
1058 lseek(state_fd, 0, SEEK_SET);
1059 write(state_fd,"Waiting for quorum of nodes to come online\n",43);
1060
1061 while(loop-- >0){
1062 sendheartbeat();
1063 readheartbeats();
1064 if(count_activenodes() == num_nodes){
1065 break;
1066 }
1067 if(need_to_quit){
1068 exit(0);
1069 }
1070 sleep(1);
1071 }
1072
1073 if(count_activenodes() < quorum_count){
1074 #ifdef USE_SYSLOG
1075 syslog(LOG_ERR,"Cannot meet quorum node count. Quitting.");
1076 #else
1077 fprintf(stderr,"ERROR: min nodes in cluster is %d\n",
1078 quorum_count);
1079 fprintf(stderr," Only have %d nodes. Cannot continue.\n",
1080 count_activenodes());
1081 #endif
1082 exit(1);
1083 }
1084
1085 monitor_services(1); /* are services already running? */
1086 if(need_to_stop){
1087 /* clean up, if neccessary */
1088 stop_services();
1089 changestate(STANDBY);
1090 } /* else should already be mystate==RUNNING*/
1091 }
1092
1093
1094 /**** MAIN LOOP ****/
1095 while(1){
1096 sendheartbeat();
1097 readheartbeats();
1098
1099 do_timeouts();
1100 dump_states();
1101 monitor_services(0);
1102
1103 if(need_to_stop){
1104 /* If need_to_stop flag set, call stop_services()
1105 * even if error detected.
1106 * Perhaps only one service of multiple has failed.
1107 * Need to cleanly shut down remaining services.
1108 */
1109 if((mystate==RUNNING) || (mystate == ERRORED)){
1110 changestate(STOPPING);
1111 sendheartbeat();
1112 dump_states();
1113 stop_services();
1114 }
1115 if(need_to_quit){
1116 #ifdef USE_SYSLOG
1117 syslog(LOG_NOTICE,"demon stopping");
1118 #else
1119 puts("DEBUG: caught SIGINT. Quitting cleanly");
1120 #endif
1121 exit(0);
1122 /* do NOT send heartbeat with STANDBY state,
1123 * so that it is clear during the timeout
1124 * that this machine is going offline
1125 */
1126 }
1127
1128 if(have_error==0){
1129 changestate(STANDBY);
1130 } else {
1131 changestate(ERRORED);
1132 }
1133 sendheartbeat();
1134
1135 } else {
1136 /* who 'should' be main node? me?*/
1137 check_main();
1138 if(need_to_start){
1139 changestate(STARTING);
1140 sendheartbeat();
1141 dump_states();
1142 start_services();
1143 }
1144 }
1145
1146 sleep(monitorsec);
1147 }
1148 /* End of MAIN LOOP */
1149
1150 }