"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.4/src/refs.c" (20 Nov 2019, 27602 Bytes) of package /linux/misc/tin-2.4.4.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "refs.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.3_vs_2.4.4.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : refs.c
    4  *  Author    : Jason Faultless <jason@altarstone.com>
    5  *  Created   : 1996-05-09
    6  *  Updated   : 2017-03-28
    7  *  Notes     : Caching of message ids / References based threading
    8  *  Credits   : Richard Hodson <richard@macgyver.tele2.co.uk>
    9  *              hash_msgid, free_msgid
   10  *
   11  * Copyright (c) 1996-2020 Jason Faultless <jason@altarstone.com>
   12  * All rights reserved.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  *
   18  * 1. Redistributions of source code must retain the above copyright notice,
   19  *    this list of conditions and the following disclaimer.
   20  *
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  *
   25  * 3. Neither the name of the copyright holder nor the names of its
   26  *    contributors may be used to endorse or promote products derived from
   27  *    this software without specific prior written permission.
   28  *
   29  * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   30  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   32  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
   33  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   34  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   35  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   36  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   37  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   38  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   39  * POSSIBILITY OF SUCH DAMAGE.
   40  */
   41 
   42 
   43 #ifndef TIN_H
   44 #   include "tin.h"
   45 #endif /* !TIN_H */
   46 
   47 #define MAX_REFS    100         /* Limit recursion depth */
   48 #define REF_SEP " \t"           /* Separator chars in ref headers */
   49 
   50 #ifdef DEBUG
   51 #   define DEBUG_PRINT(x)   if (dbgfd != NULL) fprintf x
   52 static FILE *dbgfd;
   53 #endif /* DEBUG */
   54 
   55 /*
   56  * local prototypes
   57  */
   58 static char *_get_references(struct t_msgid *refptr, int depth);
   59 static struct t_msgid *add_msgid(int key, const char *msgid, struct t_msgid *newparent);
   60 static struct t_msgid *find_next(struct t_msgid *ptr);
   61 static struct t_msgid *parse_references(char *r);
   62 static t_bool valid_msgid(char *msgid);
   63 static unsigned int hash_msgid(const char *key);
   64 static void add_to_parent(struct t_msgid *ptr);
   65 static void build_thread(struct t_msgid *ptr);
   66 static void rearrange_siblings(void);
   67 #ifdef DEBUG
   68     static void dump_msgid_thread(struct t_msgid *ptr, int level);
   69     static void dump_msgid_threads(void);
   70 #endif /* DEBUG */
   71 #if 0
   72     static void dump_msgids(void);
   73     static void dump_thread(FILE *fp, struct t_msgid *msgid, int level);
   74 #endif /* 0 */
   75 
   76 /*
   77  * The msgids are all hashed into a big array, with overspill
   78  */
   79 static struct t_msgid *msgids[MSGID_HASH_SIZE] = {0};
   80 
   81 /*
   82  * This part of the code deals with the caching and retrieval
   83  * of Message-ID and References headers
   84  *
   85  * Rationale:
   86  *    Even though the Message-ID is unique to an article, the References
   87  *    field contains msgids from elsewhere in the group. As the expiry
   88  *    period increases, so does the redundancy of data.
   89  *    At the time of writing, comp.os.ms-windows.advocacy held ~850
   90  *    articles. The references fields contained 192k of text, of which
   91  *    169k was saved using the new caching.
   92  *
   93  *    When threading on Refs, a much better view of the original thread
   94  *    can be built up using this data, and threading is much faster
   95  *    because all the article relationships are automatically available
   96  *    to us.
   97  *
   98  * NB: We don't cache msgids from the filter file.
   99  */
  100 
  101 /*
  102  * Hash a message id. A msgid is of the form <unique@sitename>
  103  * (But badly broken message id's do occur)
  104  * We hash on the unique portion which should have good randomness in
  105  * the lower 5 bits. Propagate the random bits up with a shift, and
  106  * mix in the new bits with an exclusive or.
  107  *
  108  * This should generate about 5+strlen(string_start) bits of randomness.
  109  * MSGID_HASH_SIZE is a prime of order 2^11
  110  */
  111 static unsigned int
  112 hash_msgid(
  113     const char *key)
  114 {
  115     unsigned int hash = 0;
  116 
  117     while (*key && *key != '@') {
  118         hash = (hash << 1) ^ *key;
  119         ++key;
  120     }
  121 
  122     hash %= MSGID_HASH_SIZE;
  123 
  124     return hash;
  125 }
  126 
  127 
  128 /*
  129  * Thread us into our parents' list of children.
  130  */
  131 static void
  132 add_to_parent(
  133     struct t_msgid *ptr)
  134 {
  135     struct t_msgid *p;
  136 
  137     if (!ptr->parent)
  138         return;
  139 
  140     /*
  141      * Trivial case - if we are the first child (followup)
  142      */
  143     if (ptr->parent->child == NULL) {
  144         ptr->parent->child = ptr;
  145         return;
  146     }
  147 
  148     /*
  149      * Add this followup to the sibling chain of our parent.
  150      * We add at the end of the chain, rearrange_siblings() will
  151      * sort it later.
  152      */
  153     for (p = ptr->parent->child; p->sibling != NULL; p = p->sibling)
  154         ;
  155 
  156 /*  ptr->sibling is already NULL */
  157     p->sibling = ptr;
  158 }
  159 
  160 
  161 /*
  162  * Checks if Message-ID has valid format
  163  * Returns TRUE if it does, FALSE if it does not
  164  *
  165  * TODO: combine with post.c:damaged_id()
  166  */
  167 static t_bool
  168 valid_msgid(
  169     char *msgid)
  170 {
  171     size_t mlen = 0;
  172     t_bool at_present = FALSE;
  173 
  174     str_trim(msgid);
  175     if (!msgid || *msgid != '<')
  176         return FALSE;
  177 
  178     while (isascii((unsigned char) *msgid) && isgraph((unsigned char) *msgid) && !iscntrl((unsigned char) *msgid) && *msgid != '>') {
  179         if (*msgid == '@')
  180             at_present = TRUE;
  181         mlen++;
  182         msgid++;
  183     }
  184 
  185     if (!at_present || (*msgid != '>') || mlen <= 2 /* || mlen > 250 */|| *(msgid + 1))
  186         return FALSE;
  187 
  188     return TRUE;
  189 }
  190 
  191 
  192 /*
  193  * Adds or updates a message id in the cache.
  194  * We return a ptr to the msgid, whether located or newly created.
  195  *
  196  * . If the message id is new, add it to the cache, creating parent, child
  197  *   & sibling ptrs if a parent is given.
  198  *
  199  * . If the message id is a duplicate, then:
  200  *     a) If no parent or the same parent, is given, no action is needed.
  201  *
  202  *     b) If a parent is specified and the current parent is NULL, then
  203  *        add in the new parent and create child/sibling ptrs.
  204  *        Because we add Message-ID headers first, we don't have to worry
  205  *        about bogus Reference headers messing things up.
  206  *
  207  *     c) If a conflicting parent is given:
  208  *
  209  *        If (key == REF_REF) ignore the error - probably the refs
  210  *        headers are broken or have been truncated.
  211  *
  212  *        Otherwise we have a genuine problem, two articles in one group
  213  *        with identical Message-IDs. This is indicative of a broken
  214  *        overview database.
  215  */
  216 static struct t_msgid *
  217 add_msgid(
  218     int key,
  219     const char *msgid,
  220     struct t_msgid *newparent)
  221 {
  222     struct t_msgid *ptr;
  223     struct t_msgid *i;
  224     unsigned int h;
  225 
  226     if (!msgid) {
  227         error_message(2, "add_msgid: NULL msgid\n");
  228         free_all_arrays();
  229         giveup();
  230     }
  231 
  232     h = hash_msgid(msgid + 1);              /* Don't hash the initial '<' */
  233 
  234 #ifdef DEBUG
  235     if (debug & DEBUG_REFS)
  236         DEBUG_PRINT((dbgfd, "---------------- Add %s %s with parent %s\n", (key == MSGID_REF) ? "MSG" : "REF", msgid, (newparent == NULL) ? _("unchanged") : newparent->txt));
  237 #endif /* DEBUG */
  238 
  239     /*
  240      * Look for this message id in the cache.
  241      * Broken software will sometimes damage the case of a message-id.
  242      */
  243     for (i = msgids[h]; i != NULL; i = i->next) {
  244         if (strcasecmp(i->txt, msgid) != 0)             /* No match yet */
  245             continue;
  246 
  247         /*
  248          * CASE 1a - No parent specified, do nothing
  249          */
  250         if (newparent == NULL) {
  251 #ifdef DEBUG
  252             if (debug & DEBUG_REFS)
  253                 DEBUG_PRINT((dbgfd, "nop: %s No parent specified\n", i->txt));
  254 #endif /* DEBUG */
  255             return i;
  256         }
  257 
  258         /*
  259          * CASE 1b - Parent not changed, do nothing
  260          */
  261         if (newparent == i->parent) {
  262 #ifdef DEBUG
  263             if (debug & DEBUG_REFS)
  264                 DEBUG_PRINT((dbgfd, "dup: %s -> %s (no change)\n", i->txt, i->parent ? i->parent->txt : "NULL"));
  265 #endif /* DEBUG */
  266             return i;
  267         }
  268 
  269         /*
  270          * CASE2 - A parent has been given where there was none before.
  271          *         Change parent from null -> not-null & update ptrs
  272          */
  273         if (i->parent == NULL) {
  274             /*
  275              * Detect & ignore circular reference paths by looking for the
  276              * new parent in this thread
  277              */
  278             for (ptr = newparent; ptr != NULL; ptr = ptr->parent) {
  279                 if (ptr == i) {
  280 #ifdef DEBUG
  281                     if (debug & DEBUG_REFS)
  282                         DEBUG_PRINT((dbgfd, "Avoiding circular reference! (%s)\n", (key == MSGID_REF) ? "MSG" : "REF"));
  283 #endif /* DEBUG */
  284                     return i;
  285                 }
  286             }
  287 
  288             i->parent = newparent;
  289             add_to_parent(i);
  290 #ifdef DEBUG
  291             if (debug & DEBUG_REFS)
  292                 DEBUG_PRINT((dbgfd, "set: %s -> %s\n", i->txt, newparent ? newparent->txt : _("None")));
  293 #endif /* DEBUG */
  294             return i;
  295         }
  296 
  297         /*
  298          * CASE 3 - A new parent has been given that conflicts with the
  299          *          current one. This is caused by
  300          * 1) A duplicate Message-ID in the spool (very bad !)
  301          * 2) corrupt References header
  302          * All we can do is ignore the error
  303          */
  304         if (i->parent != newparent) {
  305 #ifdef DEBUG
  306             if (debug & DEBUG_REFS)
  307                 DEBUG_PRINT((dbgfd, "Warning: (%s) Ignoring %s -> %s (already %s)\n",
  308                     (key == MSGID_REF) ? "MSG" : "REF", i->txt,
  309                     newparent ? newparent->txt : "None", i->parent->txt));
  310 #endif /* DEBUG */
  311 
  312             return i;
  313         }
  314 
  315         error_message(2, "Error: Impossible combination of conditions!\n");
  316         return i;
  317     }
  318 
  319 #ifdef DEBUG
  320     if (debug & DEBUG_REFS)
  321         DEBUG_PRINT((dbgfd, "new: %s -> %s\n", msgid, (newparent) ? newparent->txt : "None"));
  322 #endif /* DEBUG */
  323 
  324     /*
  325      * This is a new node, so build a structure for it
  326      */
  327     ptr = my_malloc(sizeof(struct t_msgid) + strlen(msgid));
  328 
  329     strcpy(ptr->txt, msgid);
  330     ptr->parent = newparent;
  331     ptr->child = ptr->sibling = NULL;
  332     ptr->article = (key == MSGID_REF ? top_art : ART_UNAVAILABLE);
  333 
  334     add_to_parent(ptr);
  335 
  336     /*
  337      * Insert at head of list for speed.
  338      */
  339     ptr->next = msgids[h];
  340     msgids[h] = ptr;
  341 
  342     return ptr;
  343 }
  344 
  345 
  346 /*
  347  * Find a Message-ID in the cache. Return ptr to this node, or NULL if
  348  * not found.
  349  */
  350 struct t_msgid *
  351 find_msgid(
  352     const char *msgid)
  353 {
  354     unsigned int h;
  355     struct t_msgid *i;
  356 
  357     h = hash_msgid(msgid + 1);              /* Don't hash the initial '<' */
  358 
  359     /*
  360      * Look for this message id in the cache.
  361      * Broken software will sometimes damage the case of a message-id.
  362      */
  363     for (i = msgids[h]; i != NULL; i = i->next) {
  364         if (strcasecmp(i->txt, msgid) == 0)             /* Found it */
  365             return i;
  366     }
  367 
  368     return NULL;
  369 }
  370 
  371 
  372 /*
  373  * Take a raw line of references data and return a ptr to a linked list of
  374  * msgids, starting with the most recent entry. (Thus the list is reversed)
  375  * Following the parent ptrs leads us back to the start of the thread.
  376  *
  377  * We iterate through the refs, adding each to the msgid cache, with
  378  * the previous ref as the parent.
  379  * The space saving vs. storing the refs as a single string is significant.
  380  */
  381 static struct t_msgid *
  382 parse_references(
  383     char *r)
  384 {
  385     char *ptr;
  386     struct t_msgid *parent, *current;
  387 
  388     if (!r)
  389         return NULL;
  390 
  391 #ifdef DEBUG
  392     if (debug & DEBUG_REFS)
  393         DEBUG_PRINT((dbgfd, "parse_references: %s\n", r));
  394 #endif /* DEBUG */
  395 
  396     /*
  397      * Break the refs down, using REF_SEP as delimiters
  398      */
  399     if ((ptr = strtok(r, REF_SEP)) == NULL)
  400         return NULL;
  401 
  402     /*
  403      * As per RFC 5536, a leading comment is allowed -> skip unknown
  404      * token until we find a valid msgid
  405      *
  406      * TODO: parse these tokens to be sure it is a comment and not
  407      *       a damaged header
  408      */
  409     if (!valid_msgid(ptr)) {
  410         while ((ptr = strtok(NULL, REF_SEP)) != NULL && !valid_msgid(ptr))
  411             ;
  412     }
  413 
  414     if (ptr == NULL)
  415         return NULL;
  416 
  417     /*
  418      * By definition, the head of the thread has no parent
  419      */
  420     parent = NULL;
  421 
  422     current = add_msgid(REF_REF, ptr, parent);
  423 
  424     while ((ptr = strtok(NULL, REF_SEP)) != NULL) {
  425         if (valid_msgid(ptr)) {
  426             parent = current;
  427             current = add_msgid(REF_REF, ptr, parent);
  428         }
  429     }
  430 
  431     return current;
  432 }
  433 
  434 
  435 static void
  436 rearrange_siblings(
  437     void)
  438 {
  439     int i;
  440     struct t_msgid *current, *p1, *p2;
  441 
  442     for_each_art(i) {
  443         current = arts[i].refptr;
  444 
  445         if (!current)
  446             continue;
  447 
  448         for (; current->sibling == NULL && current->parent != NULL; current = current->parent)
  449             ;
  450 
  451         if (current->sibling != NULL) {
  452             for (p1 = current; p1->article == ART_UNAVAILABLE && p1->child != NULL; p1 = p1->child)
  453                 ;
  454 
  455             for (p2 = current->sibling; p2->article == ART_UNAVAILABLE && p2->child != NULL; p2 = p2->child)
  456                 ;
  457 
  458             if (p1->article != ART_UNAVAILABLE && p2->article != ART_UNAVAILABLE && p1->article > p2->article) {
  459                 if (current->parent->child == current) {
  460                     /*
  461                      * current is the first followup
  462                      *  adjust parent->child pointer
  463                      */
  464                     current->parent->child = current->sibling;
  465                     p1 = current->parent->child;
  466                 } else {
  467                     /*
  468                      * current is not the first followup
  469                      *  find the sibling above current
  470                      *  adjust the sibling pointer there
  471                      */
  472                     for (p1 = current->parent->child; p1->sibling != current; p1 = p1->sibling)
  473                         ;
  474 
  475                     p1->sibling = current->sibling;
  476                     p1 = p1->sibling;
  477                 }
  478                 /*
  479                  * swap current <-> sibling
  480                  */
  481                 current->sibling = p1->sibling;
  482                 p1->sibling = current;
  483             }
  484         }
  485     }
  486 }
  487 
  488 
  489 /*
  490  * Reconstruct the References: field from the parent pointers
  491  * NB: In deep threads this can lead to a very long line. If you want to use
  492  *     this function to build a Reference: line for posting be aware that the
  493  *     server might refuse long lines -- short it accordingly!
  494  */
  495 static char *
  496 _get_references(
  497     struct t_msgid *refptr,
  498     int depth)
  499 {
  500     char *refs;
  501     static size_t len;      /* Accumulated size */
  502     static size_t pos;      /* current insertion position */
  503 
  504     if (depth == 1)
  505         len = 0;
  506 
  507     len += strlen(refptr->txt) + 1; /* msgid + space */
  508     if (refptr->parent == NULL || depth > MAX_REFS) {
  509 
  510 #ifdef DEBUG
  511         if (debug & DEBUG_REFS) {
  512             if (depth > MAX_REFS)
  513                 error_message(2, "Warning: Too many refs near to %s. Truncated\n", refptr->txt);
  514         }
  515 #endif /* DEBUG */
  516         refs = my_malloc(len + 1);  /* total length + nullbyte */
  517         pos = 0;
  518     } else
  519         refs = _get_references(refptr->parent, depth + 1);
  520 
  521     sprintf(refs + pos, "%s ", refptr->txt);
  522     pos = strlen(refs);
  523 
  524     return refs;
  525 }
  526 
  527 
  528 /*
  529  * A wrapper to the above, null terminate the string
  530  */
  531 char *
  532 get_references(
  533     struct t_msgid *refptr)
  534 {
  535     char *refs;
  536     size_t len;
  537 
  538     if (refptr == NULL)
  539         return NULL;
  540 
  541     refs = _get_references(refptr, 1);
  542     len = strlen(refs);
  543     refs[len - 1] = '\0';
  544 
  545     return refs;
  546 }
  547 
  548 
  549 /*
  550  * Clear the entire msgid cache, freeing up all chains. This is
  551  * normally only needed when entering a new group
  552  */
  553 void
  554 free_msgids(
  555     void)
  556 {
  557     int i;
  558     struct t_msgid *ptr, *next, **msgptr;
  559 
  560     msgptr = msgids;                /* first list */
  561 
  562     for (i = MSGID_HASH_SIZE - 1; i >= 0; i--) {    /* count down is faster */
  563         ptr = *msgptr;
  564         *msgptr++ = NULL;           /* declare list empty */
  565 
  566         while (ptr != NULL) {   /* for each node in the list */
  567 
  568             next = ptr->next;       /* grab ptr before we free node */
  569             free(ptr);          /* release node */
  570 
  571             ptr = next;         /* hop down the chain */
  572         }
  573     }
  574 }
  575 
  576 
  577 #if 0   /* But please don't remove it */
  578 /*
  579  * Function to dump an ASCII tree map of a thread rooted at msgid.
  580  * Output goes to fp, level is the current depth of the tree.
  581  */
  582 static void
  583 dump_thread(
  584     FILE *fp,
  585     struct t_msgid *msgid,
  586     int level)
  587 {
  588     char buff[120];     /* This is _probably_ enough */
  589     char *ptr = buff;
  590     int i, len;
  591 
  592     /*
  593      * Dump the current article
  594      */
  595     sprintf(ptr, "%3d %*s", msgid->article, 2*level, "  ");
  596 
  597     len = strlen(ptr);
  598     i = cCOLS - len - 20;
  599 
  600     if (msgid->article >= 0)
  601         sprintf(ptr + len, "%-*.*s   %-17.17s", i, i, arts[msgid->article].subject, (arts[msgid->article].name) ? arts[msgid->article].name : arts[msgid->article].from);
  602     else
  603         sprintf(ptr + len, "%-*.*s", i, i, _("[- Unavailable -]"));
  604 
  605     fprintf(fp, "%s\n", ptr);
  606 
  607     if (msgid->child != NULL)
  608         dump_thread(fp, msgid->child, level + 1);
  609 
  610     if (msgid->sibling != NULL)
  611         dump_thread(fp, msgid->sibling, level);
  612 
  613     return;
  614 }
  615 
  616 
  617 static void
  618 dump_msgids(
  619     void)
  620 {
  621     int i;
  622     struct t_msgid *ptr;
  623 
  624     my_fprintf(stderr, "Dumping...\n");
  625 
  626     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  627         if (msgids[i] != NULL) {
  628             my_fprintf(stderr, "node %d", i);
  629             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next)
  630                 my_fprintf(stderr, " -> %s", ptr->txt);
  631 
  632             my_fprintf(stderr, "\n");
  633 
  634         }
  635     }
  636 }
  637 #endif /* 0 */
  638 
  639 
  640 /*
  641  * The rest of this code deals with reference threading
  642  *
  643  * Legend:
  644  *
  645  * . When a new thread is started, the root message will have no
  646  *   References: field
  647  *
  648  * . When a followup is posted, the message-id that was referred to
  649  *   will be appended to the References: field. If no References:
  650  *   field exists, a new one will be created, containing the single
  651  *   message-id
  652  *
  653  * . The References: field should not be truncated, though in practice
  654  *   this will happen, often in badly broken ways.
  655  *
  656  * This is simplistic, so check out RFC 1036 & son of RFC 1036 for full
  657  * details from the posting point of view.
  658  *
  659  * We attempt to maintain 3 pointers in each message-id to handle threading
  660  * on References:
  661  *
  662  * 1) parent  - the article that the current one was in reply to
  663  *              An article with no References: has no parent, therefore
  664  *              it is the root of a thread.
  665  *
  666  * 2) sibling - the next reply in sequence to parent.
  667  *
  668  * 3) child   - the first reply to the current article.
  669  *
  670  * These pointers are automatically set up when we read in the
  671  * headers for a group.
  672  *
  673  * It remains for us to fill in the .thread and .prev ptrs in
  674  * each article that exists in the spool, using the intelligence of
  675  * the reference tree to locate the 'next' article in the thread.
  676  *
  677  */
  678 /*
  679  * Clear out all the article fields from the msgid hash prior to a
  680  * rethread.
  681  */
  682 void
  683 clear_art_ptrs(
  684     void)
  685 {
  686     int i;
  687     struct t_msgid *ptr;
  688 
  689     for (i = MSGID_HASH_SIZE - 1; i >= 0; i--) {
  690         for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next)
  691             ptr->article = ART_UNAVAILABLE;
  692     }
  693 }
  694 
  695 
  696 #ifdef DEBUG
  697 /*
  698  * Dump out all the threads from the msgid point of view, show the
  699  * related article index in arts[] where possible
  700  * A thread is defined as a starting article with no parent
  701  */
  702 static void
  703 dump_msgid_thread(
  704     struct t_msgid *ptr,
  705     int level)
  706 {
  707     fprintf(dbgfd, "%*s %s (%d)\n", level * 3, "   ", ptr->txt, ptr->article);
  708 
  709     if (ptr->child != NULL)
  710         dump_msgid_thread(ptr->child, level + 1);
  711 
  712     if (ptr->sibling != NULL)
  713         dump_msgid_thread(ptr->sibling, level);
  714 
  715     return;
  716 }
  717 
  718 
  719 static void
  720 dump_msgid_threads(
  721     void)
  722 {
  723     int i;
  724     struct t_msgid *ptr;
  725 
  726     fprintf(dbgfd, "Dump started.\n\n");
  727 
  728     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  729         if (msgids[i] != NULL) {
  730             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next) {
  731                 if (ptr->parent == NULL) {
  732                     dump_msgid_thread(ptr, 1);
  733                     fprintf(dbgfd, "\n");
  734                 }
  735             }
  736         }
  737     }
  738     fprintf(dbgfd, "Dump complete.\n\n");
  739 }
  740 #endif /* DEBUG */
  741 
  742 
  743 /*
  744  * Find the next message in the thread.
  745  * We descend children before siblings, and only return articles that
  746  * exist in arts[] or NULL if we are truly at the end of a thread.
  747  * If there are no more down pointers, backtrack to find a sibling
  748  * to continue the thread, we note this with the 'bottom' flag.
  749  *
  750  * A Message-ID will not be included in a thread if
  751  *  It doesn't point to an article OR
  752  *     (it's already threaded/expired OR it has been autokilled)
  753  */
  754 #define SKIP_ART(ptr)   \
  755     (ptr && (ptr->article == ART_UNAVAILABLE || \
  756         (arts[ptr->article].thread != ART_UNTHREADED || \
  757             (tinrc.kill_level == KILL_NOTHREAD && arts[ptr->article].killed))))
  758 
  759 static struct t_msgid *
  760 find_next(
  761     struct t_msgid *ptr)
  762 {
  763     static t_bool bottom = FALSE;
  764 
  765     /*
  766      * Keep going while we haven't bottomed out and we haven't
  767      * got something in arts[]
  768      */
  769     while (ptr != NULL) {
  770 
  771         /*
  772          * Children first, unless bottom is set
  773          */
  774         if (!bottom && ptr->child != NULL) {
  775             ptr = ptr->child;
  776 
  777             /*
  778              * If article not present, keep going
  779              */
  780             if (SKIP_ART(ptr))
  781                 continue;
  782             else
  783                 break;
  784         }
  785 
  786         if (ptr->sibling != NULL) {
  787             bottom = FALSE;
  788 
  789             ptr = ptr->sibling;
  790 
  791             /*
  792              * If article not present, keep going
  793              */
  794             if (SKIP_ART(ptr))
  795                 continue;
  796             else
  797                 break;
  798         }
  799 
  800         /*
  801          * No more child or sibling to follow, backtrack up to
  802          * a sibling if we can find one
  803          */
  804         if (ptr->child == NULL && ptr->sibling == NULL) {
  805             while (ptr != NULL && ptr->sibling == NULL)
  806                 ptr = ptr->parent;
  807 
  808             /*
  809              * We've backtracked up to the parent with a suitable sibling
  810              * go round once again to move to this sibling
  811              */
  812             if (ptr)
  813                 bottom = TRUE;
  814             else
  815                 break;      /* Nothing found, exit with NULL */
  816         }
  817     }
  818 
  819     return ptr;
  820 }
  821 
  822 
  823 /*
  824  * Run the .thread and .prev pointers through the members of this
  825  * thread.
  826  */
  827 static void
  828 build_thread(
  829     struct t_msgid *ptr)
  830 {
  831     struct t_msgid *newptr;
  832 
  833     /*
  834      * If the root article is gone/expired/killed, find the first valid one
  835      */
  836     if (SKIP_ART(ptr))
  837         ptr = find_next(ptr);
  838 
  839     /*
  840      * Keep working through the thread, updating the ptrs as we go
  841      */
  842     while ((newptr = find_next(ptr)) != NULL) {
  843         arts[newptr->article].prev = ptr->article;
  844         arts[ptr->article].thread = newptr->article;
  845         ptr = newptr;
  846     }
  847 }
  848 
  849 
  850 /*
  851  * Run a new set of threads through the base articles, using the
  852  * parent / child / sibling / article pointers in the msgid hash.
  853  */
  854 void
  855 thread_by_reference(
  856     void)
  857 {
  858     int i;
  859     struct t_msgid *ptr;
  860 
  861 #ifdef DEBUG
  862     if (debug & DEBUG_REFS) {
  863         char file[PATH_LEN];
  864 
  865         joinpath(file, sizeof(file), TMPDIR, "REFS.info");
  866         if ((dbgfd = fopen(file, "w")) != NULL)
  867             dump_msgid_threads();
  868     }
  869 #endif /* DEBUG */
  870 
  871     /*
  872      * Build threads starting from root msgids (ie without parent)
  873      */
  874     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  875         if (msgids[i] != NULL) {
  876             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next) {
  877                 if (ptr->parent == NULL)
  878                     build_thread(ptr);
  879             }
  880         }
  881     }
  882 
  883 #ifdef DEBUG
  884     if ((debug & DEBUG_REFS) && dbgfd != NULL) {
  885         DEBUG_PRINT((dbgfd, "Full dump of threading info...\n"));
  886         DEBUG_PRINT((dbgfd, "%3s %3s %3s %3s : %3s %3s\n", "#", "Par", "Sib", "Chd", "In", "Thd"));
  887 
  888         for_each_art(i) {
  889             DEBUG_PRINT((dbgfd, "%3d %3d %3d %3d : %3d %3d : %.50s %s\n", i,
  890                 (arts[i].refptr->parent) ? arts[i].refptr->parent->article : -2,
  891                 (arts[i].refptr->sibling) ? arts[i].refptr->sibling->article : -2,
  892                 (arts[i].refptr->child) ? arts[i].refptr->child->article : -2,
  893                 arts[i].prev, arts[i].thread, arts[i].refptr->txt, arts[i].subject));
  894         }
  895 
  896         fclose(dbgfd);
  897     }
  898 #endif /* DEBUG */
  899 }
  900 
  901 
  902 /*
  903  * Do the equivalent of subject threading, but only on the thread base
  904  * messages.
  905  * This should help thread together mistakenly multiply posted articles,
  906  * articles which were posted to a group rather than as followups, those
  907  * with missing ref headers etc.
  908  * We add joined threads onto the end of the .thread chain of the previous
  909  * thread. arts[] is already sorted, so the sorting of these will be
  910  * correct.
  911  */
  912 void
  913 collate_subjects(
  914     void)
  915 {
  916     int i, j, art;
  917     struct t_hashnode *h;
  918 
  919     /*
  920      * Run through the root messages of each thread. We have to traverse
  921      * using arts[] and not msgids[] to preserve the sorting.
  922      */
  923     for_each_art(i) {
  924         /*
  925          * Ignore already threaded and expired arts
  926          */
  927         if (arts[i].prev >= 0 || IGNORE_ART(i))
  928             continue;
  929 
  930         /*
  931          * Get the contents of the magic marker in the hashnode
  932          */
  933         h = (struct t_hashnode *) (arts[i].subject - sizeof(int) - sizeof(void *)); /* FIXME: cast increases required alignment of target type */
  934         j = h->aptr;
  935 
  936         if (j != -1 && j < i) {
  937             /*
  938              * Modified form of the subject threading - the only difference
  939              * is that we have to add later threads onto the end of the
  940              * previous thread
  941              */
  942 #if 1
  943             if (arts[i].subject == arts[j].subject)
  944 #else
  945             /* see also art.c:thread_by_subject() */
  946             if ((arts[i].subject == arts[j].subject) || (arts[i].archive && arts[j].archive && (arts[i].archive->name == arts[j].archive->name)))
  947 #endif /* 1 */
  948             {
  949                 /* DEBUG_PRINT((dbgfd, "RES: %d is now previous, at end of %d\n", i, j)); */
  950                 for (art = j; arts[art].thread >= 0; art = arts[art].thread)
  951                     ;
  952 
  953                 arts[art].thread = i;
  954                 arts[i].prev = art;
  955             }
  956         }
  957 
  958         /*
  959          * Update the magic marker with the highest numbered mesg in
  960          * arts[] that has been used in this thread so far
  961          */
  962         h->aptr = i;
  963     }
  964 }
  965 
  966 
  967 /*
  968  * Builds the reference tree:
  969  *
  970  * 1) Sort the article base. This will ensure that articles and their
  971  *    siblings are inserted in the correct order.
  972  * 2) Add each Message-ID header and its direct reference ('reliable info')
  973  *    to the cache. Son of RFC 1036 mandates that if References headers must
  974  *    be trimmed, then at least the (1st three and) last reference should be
  975  *    maintained.
  976  * 3) Add rest of References header to the cache. This information is less
  977  *    reliable than the info added in 2) and is only used to fill in any
  978  *    gaps in the reference tree - no information is superseded.
  979  * 4) free() up the msgid and refs headers once cached
  980  */
  981 void
  982 build_references(
  983     struct t_group *group)
  984 {
  985     char *s;
  986     int i;
  987     struct t_article *art;
  988     struct t_msgid *refs;
  989 
  990     /*
  991      * The articles are currently unsorted, and are as they were put by setup_hard_base()
  992      */
  993     if (group->attribute->sort_article_type != SORT_ARTICLES_BY_NOTHING)
  994         sort_arts(group->attribute->sort_article_type);
  995 
  996 #ifdef DEBUG
  997     if (debug & DEBUG_REFS) {
  998         char file[PATH_LEN];
  999 
 1000         joinpath(file, sizeof(file), TMPDIR, "REFS.dump");
 1001         if ((dbgfd = fopen(file, "w")) != NULL) {
 1002 #   ifdef HAVE_SETVBUF
 1003             SETVBUF(dbgfd, NULL, _IONBF, 0);
 1004 #   endif /* HAVE_SETVBUF */
 1005             DEBUG_PRINT((dbgfd, "MSGID phase\n"));
 1006         }
 1007     }
 1008 #endif /* DEBUG */
 1009 
 1010     /*
 1011      * Add the Message-ID headers to the cache, using the last Reference
 1012      * as the parent
 1013      */
 1014     for_each_art(i) {
 1015         art = &arts[i];
 1016 
 1017         art->refptr = add_msgid(MSGID_REF, art->msgid, NULL); /* preset art->refptr */
 1018         if (art->refs) {
 1019             strip_line(art->refs);
 1020 
 1021             /*
 1022              * Add the last ref, and then trim it to save wasting time adding
 1023              * it again later
 1024              * Check for circular references to current article
 1025              *
 1026              * TODO: do this in a single pass
 1027              */
 1028             if ((s = strrchr(art->refs, '<')) != NULL) {
 1029                 char *ptr;
 1030 
 1031                 /*
 1032                  * A comment can occur after another REF_SEP, remove it
 1033                  *
 1034                  * TODO: parse it to be sure it is a comment
 1035                  */
 1036                 if ((ptr = strpbrk(s, REF_SEP)) != NULL)
 1037                     *ptr = '\0';
 1038 
 1039                 if (valid_msgid(s) && !strcmp(art->msgid, s)) {
 1040                     /*
 1041                      * Remove circular reference to current article
 1042                      */
 1043 #ifdef DEBUG
 1044                     if (debug & DEBUG_REFS)
 1045                         DEBUG_PRINT((dbgfd, "removing circular reference to: %s\n", s));
 1046 #endif /* DEBUG */
 1047                     *s = '\0';
 1048                 }
 1049                 if (valid_msgid(art->msgid) && valid_msgid(s))
 1050                     art->refptr = add_msgid(MSGID_REF, art->msgid, add_msgid(REF_REF, s, NULL));
 1051                 *s = '\0';
 1052             } else
 1053                 FreeAndNull(art->refs);
 1054         }
 1055 
 1056         /*
 1057          * set art->refptr->article - rearrange_siblings() needs this
 1058          */
 1059         if (art->refptr != NULL)
 1060             art->refptr->article = i;
 1061 
 1062         FreeAndNull(art->msgid);    /* Now cached - discard this */
 1063     }
 1064 
 1065 #ifdef DEBUG
 1066     if (debug & DEBUG_REFS)
 1067         DEBUG_PRINT((dbgfd, "REFS phase\n"));
 1068 #endif /* DEBUG */
 1069     /*
 1070      * Add the References data to the cache
 1071      */
 1072     for_each_art(i) {
 1073         if (!arts[i].refs)                      /* No refs - skip */
 1074             continue;
 1075 
 1076         art = &arts[i];
 1077 
 1078         /*
 1079          * Add the remaining references as parent to the last ref we added
 1080          * earlier. skip call to add_msgid when NULL pointer
 1081          */
 1082 
 1083         refs = parse_references(art->refs);
 1084 
 1085         if (art->refptr && art->refptr->parent && valid_msgid(art->refptr->parent->txt))
 1086             add_msgid(REF_REF, art->refptr->parent->txt, refs);
 1087 
 1088         FreeAndNull(art->refs);
 1089     }
 1090 
 1091 #ifdef DEBUG
 1092     if ((debug & DEBUG_REFS) && dbgfd != NULL)
 1093         fclose(dbgfd);
 1094 #endif /* DEBUG */
 1095 
 1096     /*
 1097      * all msgids are cached now
 1098      * change order of siblings if needed
 1099      */
 1100     rearrange_siblings();
 1101 }