"Fossies" - the Fresh Open Source Software Archive

Member "freeha-1.0/freehad.c" (23 Nov 2006, 27563 Bytes) of package /linux/privat/old/freeha-1.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /* actual workhorse demon for "FreeHA".
    2  *
    3  * This program should handle all of the following:
    4  *
    5  * -> send status 'heartbeats' to all configured networks
    6  *
    7  * -> listen for 'heartbeats' from other nodes
    8  *
    9  * -> call 'starthasrv' if it has been determined that this node
   10  *    should become "active"
   11  *
   12  * -> call 'monitorhasrv'  periodically to see if there are errors
   13  *
   14  * -> call 'alerthasrv' if an error has been detected.
   15  *
   16  * -> call 'stophasrv' if an error has been detected.
   17  *     (It is up to the sysadmin to make sure 'stophasrv' gets
   18  *       called when the machine shuts down 'normally')
   19  *
   20  * -> stay in 'errored' state once entered, until killed
   21  *
   22  */
   23 
   24 static char *versionstring="@(#) freehad.c 1.36@(#)";
   25 
#include <stdio.h>
#include <unistd.h>
#include <sys/param.h> /* for MAXPATHLEN */
#include <stdlib.h>
#include <errno.h>
#include <time.h>

#if defined(__svr4__) || defined(__linux__)
# include <string.h>
# include <strings.h> /* strcasecmp/strncasecmp; must come before the macros below */
# define bcopy(src,dest,sz) memcpy(dest,src,sz)
# define bzero(dest,sz) memset(dest,0,sz)
#else
# include <strings.h>
#endif /* __svr4__ */

#include <ctype.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifdef USE_SIGNAL
#include <signal.h>
#endif

#ifdef USE_SYSLOG
#include <syslog.h>
#endif

#include <sys/wait.h> /* should be right on solaris, linux, AND freebsd */

#include <sys/stat.h>
#include <fcntl.h>
#include <sys/ioctl.h>

#ifndef FIONBIO
#include <sys/filio.h> /* WHY do I need this specifically, when it is
                          included in sys/ioctl.h ? ? ? */
#endif

#include "freehad.h"
   68 
   69 /**********************************************************************/
   70 /**********************************************************************/
   71 
   72 /* for the heartbeat addrs/nets */
   73 struct in_addr ipaddr[3]; /* for listening */
   74 struct in_addr netaddr[3];/* for sending (broadcast)*/
   75 
   76 int listensock[3], sendsock[3];
   77 
   78 int freeha_port=0xf33;
   79 
   80 char state_file[1024]; /* path to 'state file', that we write */
   81 int state_fd;
   82 
   83 struct statelist{
   84     struct freeha_msg msg;
   85     struct statelist *next,*prev;
   86 };
   87 
   88 /* list of every system we know about */
   89 struct statelist *host_states;
   90 
   91 char myhostname[100];
   92 char scriptdir[MAXPATHLEN];
   93 
   94 int mystate=INITIAL;    /* the 'HA status' of this node */
   95 int am_main=0;      /* Am I the main node? dont think sooo... */
   96 
   97 int num_nodes=2;    /* number of machines in cluster. used for
   98              * 'quorum' purposes.
   99              * (avoiding split-brain, if more than 2 nodes)
  100              */
  101 int quorum_count;   /* filled in by main() */
  102 
  103 int heartbeatsec=1; /* not used currently */
  104 int monitorsec=1;   /* seconds delay between monitor/hb loops */
  105 int timeoutsec=121; /* seconds after which a node with no heartbeat is
  106                          * assumed 'Missing In Action' (ie 'dead')
  107              */
  108 
  109 int need_to_stop=0; /* flag for main loop to call stop_services() */
  110 int need_to_start=0;    /* flag for main loop to call start_services() */
  111 int need_to_quit=0; /* If received SIGINT, quit cleanly */
  112 int have_error=0;
  113 
  114 /**********************************************************************/
  115 
  116 /* I want to keep this SIMPLE, by using getopt()
  117  * Unfortunately, getopt does not support more complex arg strings, but
  118  * only single-letter options.
  119  * Hence why the options are a leetle bit unwieldy
  120  */
  121 void usage(){
  122     puts("Usage:");
  123     puts(" (-a|-b|-c) {IPaddr}   IP addr 1,2 or 3, for src addr");
  124     puts(" (-A|-B|-C) {addr}     BROADCAST addr 1,2 or 3 for dest broadcast");
  125     puts(" -h                    help (This text)");
  126     puts(" -l {statefile}        file to write ongoing state to");
  127     puts(" -n {nodes}            number of nodes in cluster (default 2)");
  128     puts(" -p {port}             UDP port for heartbeat(default 0xfee)");
  129     puts(" -s directory          script directory (default /opt/freeha/bin)");
  130     puts(" -M {sec}              number of seconds between monitorings(defalt 1)");
  131     puts(" -T {timeoutsec}       number of seconds to timeout a node(dead)");
  132     printf("                          (default=%d)\n",timeoutsec);
  133     puts("");
  134     puts("Note that -M specifies 'delay beween', not 'do every'");
  135     puts("");
  136     printf("Code Release: %s\n",versionstring);
  137 }
  138 
  139 
  140 #ifdef USE_SIGNAL
  141 void sighandler(int sigarg){
  142     switch(sigarg){
  143         case SIGINT:
  144         case SIGTERM:
  145         need_to_quit=1;
  146         case SIGHUP:
  147         need_to_stop=1;
  148         break;
  149         case SIGUSR1:
  150         /* This is a FORCED (re)-start. So we clear error flags */
  151         have_error=0;
  152         need_to_start=1;
  153         break;
  154         case SIGUSR2:
  155         /* Just clear the error flag. Dont actually DO anything */
  156         have_error=0;
  157         if(mystate==ERRORED){
  158             mystate=STANDBY;
  159         }
  160         break;
  161     }
  162 }
  163 #endif /* USE_SIGNAL */
  164 
  165 void init_state_file(char *statefile_name)
  166 {
  167     if (statefile_name != NULL){
  168         strncpy(state_file,statefile_name,1024);
  169     } else {
  170         if(access("/var/run", X_OK|W_OK)==0) {
  171             strcpy(state_file,"/var/run/freeha");
  172         } else if(access("/var/freeha", X_OK|W_OK)==0){
  173             strcpy(state_file,"/var/freeha/state");
  174         } else {
  175             fprintf(stderr,"ERROR: Do not know where to create state file\n");
  176             fprintf(stderr,"/var/run and /var/freeha are not accessible\n");
  177             fprintf(stderr,"Use -l to specify location of state file\n");
  178         
  179             exit(1);
  180         }
  181     }
  182 
  183     /* this is just to ensure that the creat() call wont fail with EEXIST.
  184      * we dont care if there isnt an old statefile there already
  185      */
  186     unlink(state_file);
  187 
  188     state_fd=creat(state_file, S_IRWXU|S_IRGRP|S_IROTH);
  189     if(state_fd <0) {
  190         perror("could not create state file");
  191         exit(1);
  192     }
  193 }
  194 
  195 /* called as subroutine for opensockets() */
  196 int opensock(struct sockaddr_in *addr_ptr)
  197 {
  198     int reuse_opt=1;
  199     int newsock=socket(AF_INET,SOCK_DGRAM,0);
  200     if(newsock<0){
  201         perror("could not create new socket?? !!");
  202         return -1;
  203     }
  204     if(bind(newsock, (struct sockaddr*)addr_ptr,
  205             sizeof(struct sockaddr_in)) <0)
  206     {
  207         printf("Error binding %s:%d\n",inet_ntoa(addr_ptr->sin_addr),
  208                 freeha_port);
  209         return -1;
  210     }
  211     setsockopt(newsock, SOL_SOCKET,SO_REUSEADDR,
  212            &reuse_opt, sizeof(reuse_opt));
  213 
  214     /* Okay, only half the sockets we make, need this option set.
  215      * But no harm in setting it, as far as I can see!
  216      */
  217     setsockopt(newsock, SOL_SOCKET,SO_BROADCAST,
  218            &reuse_opt, sizeof(reuse_opt));
  219 
  220 /* I would think that NONBLOCK would work.
  221  * Except that it doesnt seem to work for solaris. GRRR.
  222  * So thats why FIONBIO is here also
  223  */
  224 #ifdef FIONBIO
  225     ioctl(newsock,FIONBIO, &reuse_opt);
  226 #else
  227     fcntl(newsock,F_SETFL, O_NONBLOCK);
  228 #endif
  229 
  230     return newsock;
  231 
  232 }
  233 
  234 
  235 /* Open the sending sockets, and the listening sockets */
  236 void opensockets()
  237 {
  238     struct sockaddr_in addr;
  239     int loop;
  240     int addrcount=0;
  241 
  242     bzero(&addr, sizeof(struct sockaddr_in));
  243 
  244     addr.sin_family=AF_INET;
  245     addr.sin_port=htons(freeha_port);
  246 
  247 
  248     for(loop=0; loop<3; loop++) {
  249         addr.sin_addr.s_addr = ipaddr[loop].s_addr;
  250         if(addr.sin_addr.s_addr == INADDR_ANY)  continue;
  251 
  252         printf("Trying to bind for send %s:%d\n",
  253                      inet_ntoa(addr.sin_addr),
  254                      freeha_port);
  255 
  256         sendsock[loop]=opensock(&addr);
  257         if(sendsock[loop] >=0){
  258             addrcount++;
  259         }
  260 
  261         addr.sin_addr.s_addr = netaddr[loop].s_addr;
  262         if(addr.sin_addr.s_addr == INADDR_ANY){
  263             netaddr[loop].s_addr = ipaddr[loop].s_addr;
  264             /* well durnit. try to GUESS at broadcast,
  265              * since unspecified.
  266              * assume they want class C broadcast.
  267              */
  268             netaddr[loop].s_addr = htonl(netaddr[loop].s_addr);
  269             /* We KNOW it is in MSB format now */
  270             netaddr[loop].s_addr |= 0x000000ff;
  271 
  272             netaddr[loop].s_addr = ntohl(netaddr[loop].s_addr);
  273             addr.sin_addr.s_addr = netaddr[loop].s_addr;
  274 
  275             printf("Have adjusted broadcast to %s\n",
  276                     inet_ntoa(addr.sin_addr));
  277         }
  278 
  279         printf("Trying to bind LISTEN for %s:%d\n",
  280                      inet_ntoa(addr.sin_addr),
  281                      freeha_port);
  282         listensock[loop]=opensock(&addr);
  283         if(listensock[loop] <0){
  284             perror("ERROR trying to bind listen socket");
  285             fprintf(stderr,"failed for %s:%d\n",
  286                      inet_ntoa(addr.sin_addr),
  287                      freeha_port);
  288             exit(1);
  289         }
  290 
  291     }
  292 
  293     if(addrcount==0){
  294         fprintf(stderr,"ERROR: could not bind any addresses\n");
  295         exit(1);
  296     }
  297     
  298 }
  299 
  300 
  301 /* This wrapper exists to call the external alerthasrv script (if it exists)
  302  * when we change our internal state.
  303  * It first changes the global state variable, 'mystate'
  304  * It then calls the external script, with:
  305  *   alerthasrv [statenum] [statename]
  306  */
  307  
  308 void changestate(int newstate)
  309 {
  310     static int oldstate=UNKNOWNSTATE;
  311     char *statestring;
  312     char runstring[50];
  313 
  314     if(newstate==oldstate){
  315         return;
  316     }
  317     oldstate=newstate;
  318     mystate=newstate;
  319 
  320     /* Using switch statement instead of array, for error checking */
  321     switch(newstate){
  322         case UNKNOWNSTATE: statestring="UNKNOWN_STATE"; break;
  323         case INITIAL: statestring="INITIAL"; break;
  324         case RUNNING: statestring="RUNNING"; break;
  325         case STOPPING: statestring="STOPPING"; break;
  326         case STARTING: statestring="STARTING"; break;
  327         case STANDBY: statestring="STANDBY"; break;
  328         case ERRORED: statestring="ERRORED"; break;
  329         case TIMEDOUT: statestring="TIMEDOUT"; break;
  330         default: statestring="INTERNAL_ERROR"; break;
  331     }
  332     sprintf(runstring,"%s/alerthasrv %d %s",scriptdir, newstate,
  333             statestring);
  334     system(runstring);
  335     /* technically this could fail, but... oh well */
  336     
  337     
  338 }
  339 
  340 /* Called only by storemsg().
  341  * We have a state message from a machine we dont recognize.
  342  * In this case, we automatically add that host's state into our list of
  343  * host states
  344  */
  345 void addhost(struct freeha_msg* msg)
  346 {
  347     struct statelist *newentry,*oldentry;
  348     int compare=0;
  349 
  350     newentry=(struct statelist*)malloc(sizeof(struct statelist));
  351     if(newentry==NULL){
  352         changestate(ERRORED);
  353         fprintf(stderr,"ERROR: addhost out of memory!\n");
  354         return;
  355     }
  356 
  357     newentry->next=newentry->prev=NULL;
  358 
  359     bcopy(msg, &newentry->msg, sizeof(struct freeha_msg));
  360     if(host_states==NULL){
  361         host_states=newentry;
  362         return;
  363     }
  364     oldentry=host_states;
  365 
  366     /*We need to alphabetize insert! So compare...*/
  367     do {
  368         compare=strcasecmp(oldentry->msg.srchost,
  369                 newentry->msg.srchost);
  370                
  371         if(compare >0){
  372             newentry->next=oldentry;
  373             newentry->prev=oldentry->prev;
  374             if(oldentry->prev==NULL){
  375                 host_states=newentry;
  376             } else {
  377                 oldentry->prev->next=newentry;
  378             }
  379             oldentry->prev=newentry;
  380             return;
  381         }
  382         if(oldentry->next==NULL){
  383             break;
  384         }  else {
  385             oldentry=oldentry->next;
  386         }
  387     } while(1);
  388 
  389     /* oh well. slap on the end, then. */
  390     oldentry->next=newentry;
  391     newentry->prev=oldentry;
  392 
  393 }
  394 
  395 /* Once we have received a status (heartbeat) message from another system,
  396  * store it in our state table.
  397  * If system is not already present, then add it.
  398  */
  399 void storemsg(struct freeha_msg* msg)
  400 {
  401     int version;
  402     struct statelist *hoststate=host_states;
  403     version=ntohl(msg->version);
  404     if(version != HA_VERSION) {
  405         printf("ERROR: storemsg got msg version=%d; expecting %d\n",
  406                version, HA_VERSION);
  407         return;
  408     }
  409 
  410     msg->timestamp=time(NULL);
  411 
  412     while(hoststate !=NULL){
  413         if(strncmp(hoststate->msg.srchost, msg->srchost, HA_HOSTLEN)==0){
  414             bcopy(msg, &hoststate->msg, sizeof(struct freeha_msg));
  415             return;
  416         }
  417         hoststate=hoststate->next;
  418     }
  419     /* HOST not found. so auto-add to our state list. */
  420     addhost(msg);
  421 }
  422 
  423 /* dump the states of all hosts we know about.
  424  * we dump this to the state file. So the state file should ideally be
  425  * on a tmpfs filesystem.
  426  * /var/run usually fulfills this purpose well
  427  */
  428 void dump_states()
  429 {
  430     char hoststate[50], *statestring;
  431     struct statelist *stateptr=host_states;
  432     int state;
  433 
  434     /* 'rewind' the file */
  435     lseek(state_fd, 0, SEEK_SET);
  436 
  437 #ifdef DEBUG
  438     puts("Writing to state file:");
  439 #endif
  440     while(stateptr !=NULL){
  441         state=ntohl(stateptr->msg.status);
  442         switch(state){
  443             case INITIAL:
  444             statestring="Initial";
  445             break;
  446             case RUNNING:
  447             statestring="Running";
  448             break;
  449             case STOPPING:
  450             statestring="Stopping";
  451             break;
  452             case STARTING:
  453             statestring="Starting"; /* starting services */
  454             break;
  455             case STANDBY:
  456             statestring="Standby";
  457             break;
  458             case ERRORED:
  459             statestring="ERRORED";
  460             break;
  461             case TIMEDOUT:
  462             statestring="TIMEOUT";
  463             break;
  464             default:
  465             statestring="[invalid state]";
  466             break;
  467         }
  468         sprintf(hoststate,"%.32s [%s] %s",
  469                 stateptr->msg.srchost, statestring,
  470             ctime(&(stateptr->msg.timestamp)) );
  471 
  472         /* remember, ctime adds \n */
  473 
  474         write(state_fd,hoststate,strlen(hoststate));
  475 #ifdef DEBUG
  476         printf(" %s",hoststate);
  477 #endif
  478 
  479         stateptr=stateptr->next;
  480     }
  481 }
  482 
  483 /* used to see if we have enough active nodes in cluster to
  484  * constitute "quorum". We want to avoid mini-clusters occuring
  485  * in our cluster machines.
  486  * Mainly used on initial demon startup only.
  487  */
  488 int count_activenodes()
  489 {
  490     struct statelist *stateptr=host_states;
  491     int activecount=0;
  492 
  493 
  494     while(stateptr !=NULL){
  495         int state=ntohl(stateptr->msg.status);
  496         switch(state){
  497             case RUNNING:
  498             case STANDBY:
  499             case INITIAL:
  500             case STOPPING:
  501             case STARTING:
  502             case ERRORED:
  503             activecount+=1;
  504         }
  505         stateptr=stateptr->next;
  506     }
  507     return activecount;
  508 }
  509 
  510 /* Determine if services are either running, or in process
  511  * of shutting down, SOMEWHERE in the cluster
  512  * return 1 if yes, 0 if no.
  513  */
  514 int check_running()
  515 {
  516     struct statelist *stateptr=host_states;
  517     int services_up=0;
  518 
  519     while(stateptr !=NULL){
  520         int state=ntohl(stateptr->msg.status);
  521         switch(state){
  522             case RUNNING:
  523             case STARTING:
  524             case STOPPING:
  525             services_up=1;
  526         }
  527         stateptr=stateptr->next;
  528     }
  529 
  530     return services_up;
  531 }
  532 
  533 void sendheartbeat(){
  534     struct freeha_msg message;
  535     int msglen=sizeof(message);
  536     int loop;
  537     struct sockaddr_in addr;
  538 
  539     message.version=htonl(HA_VERSION);
  540     message.status=htonl(mystate);
  541     message.timestamp=0;
  542     strcpy(message.srchost, myhostname);
  543 
  544     addr.sin_family=AF_INET;
  545     addr.sin_port=htons(freeha_port);
  546 
  547     for(loop=0; loop<3; loop++) {
  548         if(sendsock[loop]==-1) continue;
  549 
  550         addr.sin_addr.s_addr = netaddr[loop].s_addr;
  551         if(sendto(sendsock[loop], &message, msglen,
  552                NULL, (struct sockaddr*)&addr,
  553                sizeof(struct sockaddr_in)) != msglen)
  554         {
  555             perror("heartbeat send failed");
  556         }
  557     }
  558 }
  559 
  560 
  561 /* return 1 on successfull read, 0 on none, -1 on error */
  562 int readheartbeat(int socket)
  563 {
  564     struct freeha_msg incoming;
  565     int readstat;
  566 
  567     if(socket<0) return 0;
  568     
  569     /* Note that this would get very unhappy if we somehow read less
  570      * than a full heatbeat.
  571      * At some point, will use NREADY, etc.
  572      */
  573 
  574     readstat=recvfrom(socket, &incoming,sizeof(incoming),
  575              NULL, NULL,0);
  576 
  577     if(readstat < (int)sizeof(incoming)){
  578         switch(errno){
  579 #ifdef EWOULDBLOCK
  580             case EWOULDBLOCK:
  581 #else
  582             default:
  583 #endif
  584             return 0;
  585         }
  586         perror("readheartbeat: Error reading socket!");
  587         return -1;
  588     }
  589 
  590     storemsg(&incoming);
  591     return 1;
  592 }
  593 
  594 /* Kinda ugly, but...
  595  * basically, read in all network traffic for port that is queued,
  596  * for each interface/addr that we know about.
  597  * (Just in case we have fallen behind in our processing.
  598  *  Which is quite likely, with long monitor scripts)
  599  */
  600 void readheartbeats()
  601 {
  602     while(readheartbeat(listensock[0])==1);
  603     while(readheartbeat(listensock[1])==1);
  604     while(readheartbeat(listensock[2])==1);
  605 }
  606 
  607 
  608 /* This is the wrapper around the 'starthasrv' script.
  609  * It calls the script, checks the status of it, and
  610  * then sets mystate to either RUNNING, or ERRORED, as appropriate.
  611  */
  612 void start_services()
  613 {
  614     char startscript[MAXPATHLEN+20];
  615     int startstate;
  616 
  617     need_to_start=0;
  618     if(mystate == RUNNING){
  619         puts("ERROR: start_services called when already RUNNING");
  620         return;
  621     }
  622 
  623 
  624 #ifdef USE_SYSLOG
  625     syslog(LOG_NOTICE,"starting services");
  626 #else
  627     puts("DEBUG: start_services() calling start script");
  628 #endif
  629 
  630     sprintf(startscript,"%s/%s",scriptdir, "starthasrv");
  631     startstate=system(startscript);
  632     if(startstate==0) {
  633         changestate(RUNNING);
  634 #ifdef USE_SYSLOG
  635         syslog(LOG_NOTICE,"starthasrv returned successfully");
  636 #else
  637         puts("DEBUG: start_services() returning successfully");
  638 #endif
  639         return;
  640     }
  641 
  642 
  643 #ifdef USE_SYSLOG
  644     syslog(LOG_ERR,"Error hit from starthasrv script");
  645 #else
  646     fprintf(stderr,"ERROR: start_services (): starthasrv failed!(err %d)\n",
  647              startstate);
  648 #endif
  649     changestate(ERRORED);
  650 
  651 }
  652 
  653 /* stop_services: called by main loop, as a result of
  654  * either the monitor script, or indirectly by signal handler
  655  * if we get a HUP or INT signal
  656  * 
  657  *  - changes state to STOPPING
  658  *  - sends heartbeat
  659  *  - calls stophasrv script
  660  *  - changes state to STANDBY
  661  *  - sends heartbeat
  662  */
  663 void stop_services()
  664 {
  665     char stopscript[MAXPATHLEN+20];
  666 
  667     switch(mystate){
  668         case RUNNING:
  669         case STOPPING:
  670         case INITIAL:
  671         case ERRORED:
  672             break;
  673         default:
  674             puts("ERROR: stop_services called in unexpected state");
  675             return;
  676     }
  677 
  678     need_to_start=0; /* stop takes precedence over start */
  679     need_to_stop=0;
  680 
  681 #ifdef USE_SYSLOG
  682     syslog(LOG_NOTICE,"stopping services");
  683 #endif
  684     changestate(STOPPING);
  685     sendheartbeat();
  686     dump_states();
  687 
  688     sprintf(stopscript,"%s/%s",scriptdir, "stophasrv");
  689     system(stopscript);
  690 
  691 }
  692 
  693 /* Run monitor script.
  694  * This monitors the state of actively running services.
  695  * Change mystate to 'ERRORED', if monitor fails.
  696  * On ERRORED, will also set need_to_stop=1, and have_error=1
  697  *
  698  * Called specially, on first startup (startup=1), to determine if
  699  * demon has been started on a node where services are already running.
  700  * In this case, will either set mystate to RUNNING, or do nothing.
  701  */
  702 void monitor_services(int firsttime)
  703 {
  704     char monitorscript[MAXPATHLEN+20];
  705     int mon_status;
  706 
  707     sprintf(monitorscript,"%s/%s",scriptdir, "monitorhasrv");
  708 
  709 
  710     /* only run monitor, if we have reason to do so! */
  711     if(firsttime==0){
  712         switch(mystate) {
  713             case RUNNING:
  714                 break;
  715             default:
  716                 return;
  717         }
  718     }
  719 
  720     mon_status=system(monitorscript);
  721     mon_status = WEXITSTATUS(mon_status);
  722 
  723 
  724     switch(mon_status){
  725         case 0: /* Services are running OK */
  726             if(firsttime==1){
  727 #ifdef USE_SYSLOG
  728                 syslog(LOG_NOTICE,"services detected as already running");
  729 #else
  730                 puts("DEBUG: services already running");
  731 #endif
  732                 changestate(RUNNING);
  733                 return;
  734             }
  735             break;
  736 
  737         default:
  738 
  739             if(firsttime==0){
  740 #ifdef USE_SYSLOG
  741                 syslog(LOG_ERR,"monitorhasrv detected error");
  742 #else
  743                 printf("monitor status=%d: ERRORED!\n",mon_status);
  744 #endif
  745                 changestate(ERRORED);
  746                 have_error=1;
  747             } else {
  748                 printf("Services not running.");
  749                 printf("Will Run stop_services for cleanup\n");
  750             }
  751 
  752             /* Sneaky hack: if firsttime started, and services
  753              * are not running... flag that we should run
  754              * stop_services, to clean up anything left behind
  755              */
  756             need_to_stop=1;
  757     }
  758     
  759 }
  760 
  761 
  762 /* Run through list of systems, and see if we should timeout
  763  * any of them.
  764  */
  765 void do_timeouts()
  766 {
  767     struct statelist *stateptr=host_states;
  768     time_t timenow=time(NULL);
  769     int timeoutstate=htonl(TIMEDOUT);
  770 
  771     while(stateptr !=NULL){
  772         /* Skip my own line */
  773         if(strncmp(stateptr->msg.srchost, myhostname, HA_HOSTLEN)==0){
  774             stateptr=stateptr->next;
  775             continue;
  776         }
  777         if(stateptr->msg.status==timeoutstate){
  778             /* already in timeout state */
  779             stateptr=stateptr->next;
  780             continue;
  781         }
  782 
  783         if((timenow - stateptr->msg.timestamp) > timeoutsec){
  784 #ifdef USE_SYSLOG
  785             syslog(LOG_ERR,"timeout for %s",stateptr->msg.srchost);
  786 #else
  787 
  788             printf("DEBUG: timeout for system %s\n",
  789                    stateptr->msg.srchost);
  790 #endif
  791             stateptr->msg.status=timeoutstate;
  792         }
  793         stateptr=stateptr->next;
  794     }
  795     
  796 }
  797 
  798 /* Determine who should be 'main' system.
  799  * If me, then flag 'need_to_start' so that main loop
  800  * will trigger service_start
  801  *
  802  * Will only flag need_to_start, if
  803  *  -  All nodes, or at minimum a 'quorum' (50%+1) are heard from
  804  * AND
  805  *  -  we are alphabetically first in the list of machines that arent screwed up
  806  * AND
  807  *  -  we are in 'STANDBY' state
  808  * AND
  809  *  - there is no other machine in state RUNNING or STOPPING or STARTING
  810  *
  811  */
  812 void check_main()
  813 {
  814     struct statelist *stateptr=host_states;
  815 
  816     if(am_main){
  817         return;
  818     }
  819     if(mystate!=STANDBY){
  820         return;
  821     }
  822 
  823     /* Check for if we're in the majority half of the cluster,
  824      * in case of partial network failure.
  825      * 2-node cluster is special case. We dont meetnormal quorum count,
  826      *   of 50%+1.
  827      * But on the other hand, we still have to try to start services,
  828      * or whats the point of having a cluster in the first place!
  829      */
  830     if(num_nodes>2){
  831         if(count_activenodes() <quorum_count){
  832             return;
  833         }
  834     }
  835     if(check_running()==1){
  836         /* cant do anything, if another node is already
  837          * running services!
  838          */
  839         return;
  840     }
  841 
  842     /* Now find first active, clean node, and if it is me,
  843      * flag service start.
  844      * Right now, that boils down to "first node in STANDBY state".
  845      */
  846     while(stateptr !=NULL){
  847         int state=ntohl(stateptr->msg.status);
  848         switch(state){
  849             case INITIAL:
  850             /* wait for other node to finish coming up */
  851             return;
  852 
  853             case STANDBY:
  854             /* This is "first active node". Is it me? */
  855             if(strcasecmp(stateptr->msg.srchost, myhostname)==0){
  856                 need_to_start=1;
  857             }
  858             return;
  859         }
  860         stateptr=stateptr->next;
  861     }
  862 
  863 }
  864 
  865 
  866 int main(int argc, char *argv[])
  867 {
  868     int optc;
  869     char *statefile=NULL,*nameparse;
  870     FILE *pipefp;
  871 
  872     bzero(ipaddr,sizeof(struct in_addr) *3);
  873     bzero(netaddr,sizeof(struct in_addr) *3);
  874 
  875     listensock[0]=-1;
  876     listensock[1]=-1;
  877     listensock[2]=-1;
  878     sendsock[0]=-1;
  879     sendsock[1]=-1;
  880     sendsock[2]=-1;
  881 
  882     pipefp=popen("uname -n","r");
  883     fgets(myhostname,99,pipefp);
  884     myhostname[99]='\0';
  885     nameparse=&myhostname[0];
  886     /* okay, "_" isnt legal, or didnt used to be. but people use it */
  887     while(isalnum((int)*nameparse) ||
  888           *nameparse=='-'|| *nameparse=='_'){
  889         nameparse++;
  890     }
  891     *nameparse='\0';
  892 
  893     printf("DEBUG: myhostname is '%s'\n",myhostname);
  894     pclose(pipefp); /* you might need fclose() instead */
  895 
  896 
  897     /* Need to:
  898         - parse options
  899         - bind UDP port
  900         - broadcast our status
  901         - listen for other machines' statuses
  902            and then all the other stuff of course
  903      */
  904 
  905     strcpy(scriptdir,BINDIR);
  906 
  907     while((optc=getopt(argc,argv,"a:b:c:A:B:C:l:mn:p:s:H:M:T:")) != -1){
  908         switch(optc){
  909         case 'a':
  910             inet_aton(optarg, &ipaddr[0]);
  911             break;
  912         case 'b':
  913             inet_aton(optarg, &ipaddr[1]);
  914             break;
  915         case 'c':
  916             inet_aton(optarg, &ipaddr[2]);
  917             break;
  918 
  919         case 's':
  920             strncpy(scriptdir,optarg,MAXPATHLEN-2);
  921             printf("DEBUG: script dir now set to %s\n",optarg);
  922             break;
  923 
  924         case 'l':
  925             statefile=optarg;
  926             printf("DEBUG: statefile now set to %s\n",statefile);
  927             break;
  928 
  929         case 'm':
  930             am_main=1;
  931             puts("DEBUG: I think I am the main node");
  932             puts("   I will start services in a few seconds.");
  933             break;
  934 
  935         case 'n':
  936             num_nodes=atoi(optarg);
  937             printf("DEBUG: number of nodes set to %d\n",num_nodes);
  938             break;
  939 
  940         case 'p':
  941             freeha_port=atoi(optarg);
  942             printf("DEBUG: port set to %d\n",freeha_port);
  943             break;
  944 
  945         case 'A':
  946             inet_aton(optarg, &netaddr[0]);
  947             break;
  948         case 'B':
  949             inet_aton(optarg, &netaddr[1]);
  950             break;
  951         case 'C':
  952             inet_aton(optarg, &netaddr[2]);
  953             break;
  954         case 'M':
  955             monitorsec=atoi(optarg);
  956             printf("DEBUG: monitor delay set to %d\n",
  957                     monitorsec);
  958             break;
  959         case 'T':
  960             timeoutsec=atoi(optarg);
  961             printf("DEBUG: timeout seconds set to %d\n",
  962                     timeoutsec);
  963             break;
  964 
  965         default:
  966             usage();
  967             exit(1);
  968         }
  969     }
  970 
  971     /* determine min number of nodes to have cluster happy */
  972     quorum_count=(num_nodes/2) + 1;
  973 
  974     { /* check to see we have valid script directory */
  975         char scriptfile[MAXPATHLEN];
  976         sprintf(scriptfile,"%s/%s",scriptdir,"starthasrv");
  977         if(access(scriptfile,X_OK)!=0){
  978             fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
  979             exit(1);
  980         }
  981         sprintf(scriptfile,"%s/%s",scriptdir,"stophasrv");
  982         if(access(scriptfile,X_OK)!=0){
  983             fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
  984             exit(1);
  985         }
  986         sprintf(scriptfile,"%s/%s",scriptdir,"monitorhasrv");
  987         if(access(scriptfile,X_OK)!=0){
  988             fprintf(stderr,"ERROR: cannot access %s\n",scriptfile);
  989             exit(1);
  990         }
  991     }
  992     
  993     printf("addr1=%s\n", inet_ntoa(ipaddr[0]));
  994     printf("net1=%s\n", inet_ntoa(netaddr[0]));
  995 
  996     init_state_file(statefile);
  997     opensockets();
  998 
  999 #ifdef USE_SIGNAL
 1000 # ifdef __svr4__
 1001     sigset(SIGUSR1,sighandler); /* start services */
 1002     sigset(SIGUSR2,sighandler); /* start services */
 1003     sigset(SIGHUP,sighandler); /* stop services */
 1004     sigset(SIGINT,sighandler); /* stop services and QUIT */
 1005     sigset(SIGTERM,sighandler); /* stop services and QUIT */
 1006 # else
 1007     signal(SIGUSR1,sighandler); /* start services */
 1008     signal(SIGUSR2,sighandler); /* start services */
 1009     signal(SIGHUP,sighandler); /* stop services */
 1010     signal(SIGINT,sighandler); /* stop services and QUIT */
 1011     signal(SIGTERM,sighandler); /* stop services and QUIT */
 1012 # endif /* __svr4__*/
 1013 #endif /* USE_SIGNAL */
 1014 
 1015 #ifdef USE_SYSLOG
 1016     openlog("FreeHA", LOG_CONS|LOG_PID, LOG_LOCAL1);
 1017     syslog(LOG_NOTICE, "demon starting");
 1018 #else
 1019     puts("demon starting");
 1020 #endif
 1021 
 1022     /* In theory, on initial start, should loop for a few seconds
 1023      * to figure out who gets to be top dog.
 1024      * THEN start up services, if I am main node.
 1025      * THEN go into main polling loop
 1026      * Alternatively, just have the startup script tell us (via -m)
 1027      * that we are normally the main node.
 1028      * That is probably safer.
 1029      */
 1030 
 1031     if(am_main){
 1032 #ifdef USE_SYSLOG
 1033         syslog(LOG_WARNING, "demon starting in FORCE-MAIN mode!!!");
 1034 #else
 1035         puts("DEBUG: called in force-main mode");
 1036         puts("  broadcasting primary role");
 1037 #endif
 1038         changestate(STARTING);
 1039         /* send a quickie msg right away, to try to inform
 1040          * other nodes that may be coming up, "IM DOING IT!"
 1041          * We get called if -m flag was added at startup.
 1042          * Note that we don't do any cross-checks.
 1043          * This makes using -m flag VERY DANGEROUS!!!
 1044          */
 1045         sendheartbeat();
 1046         dump_states();
 1047         sleep(1);
 1048         sendheartbeat();
 1049         start_services();
 1050     } else {
 1051         /* Seeding cluster state tables for other nodes.
 1052          * Try for 200 seconds to see if we have all nodes
 1053          * communicating.
 1054          * If we see all nodes, continue immediately.
 1055          */
 1056         int loop=200;
 1057 
 1058         lseek(state_fd, 0, SEEK_SET);
 1059         write(state_fd,"Waiting for quorum of nodes to come online\n",43);
 1060 
 1061         while(loop-- >0){
 1062             sendheartbeat();
 1063             readheartbeats();
 1064             if(count_activenodes() == num_nodes){
 1065                 break;
 1066             }
 1067             if(need_to_quit){
 1068                 exit(0);
 1069             }
 1070             sleep(1);
 1071         }
 1072 
 1073         if(count_activenodes() < quorum_count){
 1074 #ifdef USE_SYSLOG
 1075             syslog(LOG_ERR,"Cannot meet quorum node count. Quitting.");
 1076 #else
 1077             fprintf(stderr,"ERROR: min nodes in cluster is %d\n",
 1078                 quorum_count);
 1079             fprintf(stderr,"  Only have %d nodes. Cannot continue.\n",
 1080                 count_activenodes());
 1081 #endif
 1082             exit(1);
 1083         }
 1084 
 1085         monitor_services(1); /* are services already running? */
 1086         if(need_to_stop){
 1087             /* clean up, if necessary */
 1088             stop_services();
 1089             changestate(STANDBY);
 1090         } /* else should already be mystate==RUNNING*/
 1091     }
 1092 
 1093 
 1094     /****  MAIN LOOP ****/
 1095     while(1){
 1096         sendheartbeat();
 1097         readheartbeats();
 1098 
 1099         do_timeouts();
 1100         dump_states();
 1101         monitor_services(0);
 1102 
 1103         if(need_to_stop){
 1104             /* If need_to_stop flag set, call stop_services()
 1105              *   even if error detected.
 1106              * Perhaps only one service of multiple has failed.
 1107              * Need to cleanly shut down remaining services.
 1108              */
 1109             if((mystate==RUNNING) || (mystate == ERRORED)){
 1110                 changestate(STOPPING);
 1111                 sendheartbeat();
 1112                 dump_states();
 1113                 stop_services();
 1114             }
 1115             if(need_to_quit){
 1116 #ifdef USE_SYSLOG
 1117                 syslog(LOG_NOTICE,"demon stopping");
 1118 #else
 1119                 puts("DEBUG: caught SIGINT. Quitting cleanly");
 1120 #endif
 1121                 exit(0);
 1122                 /* do NOT send heartbeat with STANDBY state,
 1123                  * so that it is clear during the timeout
 1124                  * that this machine is going offline
 1125                  */
 1126             }
 1127 
 1128             if(have_error==0){
 1129                 changestate(STANDBY);
 1130             } else {
 1131                 changestate(ERRORED);
 1132             }
 1133             sendheartbeat();
 1134 
 1135         } else {
 1136             /* who 'should' be main node? me?*/
 1137             check_main(); 
 1138             if(need_to_start){
 1139                 changestate(STARTING);
 1140                 sendheartbeat();
 1141                 dump_states();
 1142                 start_services();
 1143             }
 1144         }
 1145 
 1146         sleep(monitorsec);
 1147     }
 1148     /* End of MAIN LOOP */
 1149     
 1150 }