"Fossies" - the Fresh Open Source Software Archive

Member "tin-2.4.1/src/refs.c" (12 Oct 2016, 27501 Bytes) of archive /linux/misc/tin-2.4.1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "refs.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 2.4.0_vs_2.4.1.

    1 /*
    2  *  Project   : tin - a Usenet reader
    3  *  Module    : refs.c
    4  *  Author    : Jason Faultless <jason@altarstone.com>
    5  *  Created   : 1996-05-09
    6  *  Updated   : 2016-05-02
    7  *  Notes     : Caching of message ids / References based threading
    8  *  Credits   : Richard Hodson <richard@macgyver.tele2.co.uk>
    9  *              hash_msgid, free_msgid
   10  *
   11  * Copyright (c) 1996-2017 Jason Faultless <jason@altarstone.com>
   12  * All rights reserved.
   13  *
   14  * Redistribution and use in source and binary forms, with or without
   15  * modification, are permitted provided that the following conditions
   16  * are met:
   17  * 1. Redistributions of source code must retain the above copyright
   18  *    notice, this list of conditions and the following disclaimer.
   19  * 2. Redistributions in binary form must reproduce the above copyright
   20  *    notice, this list of conditions and the following disclaimer in the
   21  *    documentation and/or other materials provided with the distribution.
   22  * 3. The name of the author may not be used to endorse or promote
   23  *    products derived from this software without specific prior written
   24  *    permission.
   25  *
   26  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
   27  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   28  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
   30  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
   32  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   33  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   34  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   35  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   36  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   37  */
   38 
   39 
   40 #ifndef TIN_H
   41 #   include "tin.h"
   42 #endif /* !TIN_H */
   43 
   44 #define MAX_REFS    100         /* Limit recursion depth */
   45 #define REF_SEP " \t"           /* Separator chars in ref headers */
   46 
   47 #ifdef DEBUG
   48 #   define DEBUG_PRINT(x)   if (dbgfd != NULL) fprintf x
   49 static FILE *dbgfd;
   50 #endif /* DEBUG */
   51 
   52 /*
   53  * local prototypes
   54  */
   55 static char *_get_references(struct t_msgid *refptr, int depth);
   56 static struct t_msgid *add_msgid(int key, const char *msgid, struct t_msgid *newparent);
   57 static struct t_msgid *find_next(struct t_msgid *ptr);
   58 static struct t_msgid *parse_references(char *r);
   59 static t_bool valid_msgid(char *msgid);
   60 static unsigned int hash_msgid(const char *key);
   61 static void add_to_parent(struct t_msgid *ptr);
   62 static void build_thread(struct t_msgid *ptr);
   63 static void rearrange_siblings(void);
   64 #ifdef DEBUG
   65     static void dump_msgid_thread(struct t_msgid *ptr, int level);
   66     static void dump_msgid_threads(void);
   67 #endif /* DEBUG */
   68 #if 0
   69     static void dump_msgids(void);
   70     static void dump_thread(FILE *fp, struct t_msgid *msgid, int level);
   71 #endif /* 0 */
   72 
   73 /*
   74  * The msgids are all hashed into a big array, with overspill
   75  */
   76 static struct t_msgid *msgids[MSGID_HASH_SIZE] = {0};
   77 
   78 /*
   79  * This part of the code deals with the caching and retrieval
   80  * of Message-ID and References headers
   81  *
   82  * Rationale:
   83  *    Even though the Message-ID is unique to an article, the References
   84  *    field contains msgids from elsewhere in the group. As the expiry
   85  *    period increases, so does the redundancy of data.
   86  *    At the time of writing, comp.os.ms-windows.advocacy held ~850
   87  *    articles. The references fields contained 192k of text, of which
   88  *    169k was saved using the new caching.
   89  *
   90  *    When threading on Refs, a much better view of the original thread
   91  *    can be built up using this data, and threading is much faster
   92  *    because all the article relationships are automatically available
   93  *    to us.
   94  *
   95  * NB: We don't cache msgids from the filter file.
   96  */
   97 
   98 /*
   99  * Hash a message id. A msgid is of the form <unique@sitename>
  100  * (But badly broken message id's do occur)
  101  * We hash on the unique portion which should have good randomness in
  102  * the lower 5 bits. Propagate the random bits up with a shift, and
  103  * mix in the new bits with an exclusive or.
  104  *
  105  * This should generate about 5+strlen(string_start) bits of randomness.
  106  * MSGID_HASH_SIZE is a prime of order 2^11
  107  */
  108 static unsigned int
  109 hash_msgid(
  110     const char *key)
  111 {
  112     unsigned int hash = 0;
  113 
  114     while (*key && *key != '@') {
  115         hash = (hash << 1) ^ *key;
  116         ++key;
  117     }
  118 
  119     hash %= MSGID_HASH_SIZE;
  120 
  121     return hash;
  122 }
  123 
  124 
  125 /*
  126  * Thread us into our parents' list of children.
  127  */
  128 static void
  129 add_to_parent(
  130     struct t_msgid *ptr)
  131 {
  132     struct t_msgid *p;
  133 
  134     if (!ptr->parent)
  135         return;
  136 
  137     /*
  138      * Trivial case - if we are the first child (followup)
  139      */
  140     if (ptr->parent->child == NULL) {
  141         ptr->parent->child = ptr;
  142         return;
  143     }
  144 
  145     /*
  146      * Add this followup to the sibling chain of our parent.
  147      * We add at the end of the chain, rearrange_siblings() will
  148      * sort it later.
  149      */
  150     for (p = ptr->parent->child; p->sibling != NULL; p = p->sibling)
  151         ;
  152 
  153 /*  ptr->sibling is already NULL */
  154     p->sibling = ptr;
  155 }
  156 
  157 
  158 /*
  159  * Checks if Message-ID has valid format
  160  * Returns TRUE if it does, FALSE if it does not
  161  *
  162  * TODO: combine with post.c:damaged_id()
  163  */
  164 static t_bool
  165 valid_msgid(
  166     char *msgid)
  167 {
  168     size_t mlen = 0;
  169     t_bool at_present = FALSE;
  170 
  171     str_trim(msgid);
  172     if (!msgid || *msgid != '<')
  173         return FALSE;
  174 
  175     while (isascii((unsigned char) *msgid) && isgraph((unsigned char) *msgid) && !iscntrl((unsigned char) *msgid) && *msgid != '>') {
  176         if (*msgid == '@')
  177             at_present = TRUE;
  178         mlen++;
  179         msgid++;
  180     }
  181 
  182     if (!at_present || (*msgid != '>') || mlen <= 2 || *(msgid + 1))
  183         return FALSE;
  184 
  185     return TRUE;
  186 }
  187 
  188 
  189 /*
  190  * Adds or updates a message id in the cache.
  191  * We return a ptr to the msgid, whether located or newly created.
  192  *
  193  * . If the message id is new, add it to the cache, creating parent, child
  194  *   & sibling ptrs if a parent is given.
  195  *
  196  * . If the message id is a duplicate, then:
  197  *     a) If no parent or the same parent, is given, no action is needed.
  198  *
  199  *     b) If a parent is specified and the current parent is NULL, then
  200  *        add in the new parent and create child/sibling ptrs.
  201  *        Because we add Message-ID headers first, we don't have to worry
  202  *        about bogus Reference headers messing things up.
  203  *
  204  *     c) If a conflicting parent is given:
  205  *
  206  *        If (key == REF_REF) ignore the error - probably the refs
  207  *        headers are broken or have been truncated.
  208  *
  209  *        Otherwise we have a genuine problem, two articles in one group
  210  *        with identical Message-IDs. This is indicative of a broken
  211  *        overview database.
  212  */
  213 static struct t_msgid *
  214 add_msgid(
  215     int key,
  216     const char *msgid,
  217     struct t_msgid *newparent)
  218 {
  219     struct t_msgid *ptr;
  220     struct t_msgid *i;
  221     unsigned int h;
  222 
  223     if (!msgid) {
  224         error_message(2, "add_msgid: NULL msgid\n");
  225         free_all_arrays();
  226         giveup();
  227     }
  228 
  229     h = hash_msgid(msgid + 1);              /* Don't hash the initial '<' */
  230 
  231 #ifdef DEBUG
  232     if (debug & DEBUG_REFS)
  233         DEBUG_PRINT((dbgfd, "---------------- Add %s %s with parent %s\n", (key == MSGID_REF) ? "MSG" : "REF", msgid, (newparent == NULL) ? _("unchanged") : newparent->txt));
  234 #endif /* DEBUG */
  235 
  236     /*
  237      * Look for this message id in the cache.
  238      * Broken software will sometimes damage the case of a message-id.
  239      */
  240     for (i = msgids[h]; i != NULL; i = i->next) {
  241         if (strcasecmp(i->txt, msgid) != 0)             /* No match yet */
  242             continue;
  243 
  244         /*
  245          * CASE 1a - No parent specified, do nothing
  246          */
  247         if (newparent == NULL) {
  248 #ifdef DEBUG
  249             if (debug & DEBUG_REFS)
  250                 DEBUG_PRINT((dbgfd, "nop: %s No parent specified\n", i->txt));
  251 #endif /* DEBUG */
  252             return i;
  253         }
  254 
  255         /*
  256          * CASE 1b - Parent not changed, do nothing
  257          */
  258         if (newparent == i->parent) {
  259 #ifdef DEBUG
  260             if (debug & DEBUG_REFS)
  261                 DEBUG_PRINT((dbgfd, "dup: %s -> %s (no change)\n", i->txt, i->parent ? i->parent->txt : "NULL"));
  262 #endif /* DEBUG */
  263             return i;
  264         }
  265 
  266         /*
  267          * CASE2 - A parent has been given where there was none before.
  268          *         Change parent from null -> not-null & update ptrs
  269          */
  270         if (i->parent == NULL) {
  271             /*
  272              * Detect & ignore circular reference paths by looking for the
  273              * new parent in this thread
  274              */
  275             for (ptr = newparent; ptr != NULL; ptr = ptr->parent) {
  276                 if (ptr == i) {
  277 #ifdef DEBUG
  278                     if (debug & DEBUG_REFS)
  279                         DEBUG_PRINT((dbgfd, "Avoiding circular reference! (%s)\n", (key == MSGID_REF) ? "MSG" : "REF"));
  280 #endif /* DEBUG */
  281                     return i;
  282                 }
  283             }
  284 
  285             i->parent = newparent;
  286             add_to_parent(i);
  287 #ifdef DEBUG
  288             if (debug & DEBUG_REFS)
  289                 DEBUG_PRINT((dbgfd, "set: %s -> %s\n", i->txt, newparent ? newparent->txt : _("None")));
  290 #endif /* DEBUG */
  291             return i;
  292         }
  293 
  294         /*
  295          * CASE 3 - A new parent has been given that conflicts with the
  296          *          current one. This is caused by
  297          * 1) A duplicate Message-ID in the spool (very bad !)
  298          * 2) corrupt References header
  299          * All we can do is ignore the error
  300          */
  301         if (i->parent != newparent) {
  302 #ifdef DEBUG
  303             if (debug & DEBUG_REFS)
  304                 DEBUG_PRINT((dbgfd, "Warning: (%s) Ignoring %s -> %s (already %s)\n",
  305                     (key == MSGID_REF) ? "MSG" : "REF", i->txt,
  306                     newparent ? newparent->txt : "None", i->parent->txt));
  307 #endif /* DEBUG */
  308 
  309             return i;
  310         }
  311 
  312         error_message(2, "Error: Impossible combination of conditions!\n");
  313         return i;
  314     }
  315 
  316 #ifdef DEBUG
  317     if (debug & DEBUG_REFS)
  318         DEBUG_PRINT((dbgfd, "new: %s -> %s\n", msgid, (newparent) ? newparent->txt : "None"));
  319 #endif /* DEBUG */
  320 
  321     /*
  322      * This is a new node, so build a structure for it
  323      */
  324     ptr = my_malloc(sizeof(struct t_msgid) + strlen(msgid));
  325 
  326     strcpy(ptr->txt, msgid);
  327     ptr->parent = newparent;
  328     ptr->child = ptr->sibling = NULL;
  329     ptr->article = (key == MSGID_REF ? top_art : ART_UNAVAILABLE);
  330 
  331     add_to_parent(ptr);
  332 
  333     /*
  334      * Insert at head of list for speed.
  335      */
  336     ptr->next = msgids[h];
  337     msgids[h] = ptr;
  338 
  339     return ptr;
  340 }
  341 
  342 
  343 /*
  344  * Find a Message-ID in the cache. Return ptr to this node, or NULL if
  345  * not found.
  346  */
  347 struct t_msgid *
  348 find_msgid(
  349     const char *msgid)
  350 {
  351     unsigned int h;
  352     struct t_msgid *i;
  353 
  354     h = hash_msgid(msgid + 1);              /* Don't hash the initial '<' */
  355 
  356     /*
  357      * Look for this message id in the cache.
  358      * Broken software will sometimes damage the case of a message-id.
  359      */
  360     for (i = msgids[h]; i != NULL; i = i->next) {
  361         if (strcasecmp(i->txt, msgid) == 0)             /* Found it */
  362             return i;
  363     }
  364 
  365     return NULL;
  366 }
  367 
  368 
  369 /*
  370  * Take a raw line of references data and return a ptr to a linked list of
  371  * msgids, starting with the most recent entry. (Thus the list is reversed)
  372  * Following the parent ptrs leads us back to the start of the thread.
  373  *
  374  * We iterate through the refs, adding each to the msgid cache, with
  375  * the previous ref as the parent.
  376  * The space saving vs. storing the refs as a single string is significant.
  377  */
  378 static struct t_msgid *
  379 parse_references(
  380     char *r)
  381 {
  382     char *ptr;
  383     struct t_msgid *parent, *current;
  384 
  385     if (!r)
  386         return NULL;
  387 
  388 #ifdef DEBUG
  389     if (debug & DEBUG_REFS)
  390         DEBUG_PRINT((dbgfd, "parse_references: %s\n", r));
  391 #endif /* DEBUG */
  392 
  393     /*
  394      * Break the refs down, using REF_SEP as delimiters
  395      */
  396     if ((ptr = strtok(r, REF_SEP)) == NULL)
  397         return NULL;
  398 
  399     /*
  400      * As per RFC 5536, a leading comment is allowed -> skip unknown
  401      * token until we find a valid msgid
  402      *
  403      * TODO: parse these tokens to be sure it is a comment and not
  404      *       a damaged header
  405      */
  406     if (!valid_msgid(ptr)) {
  407         while ((ptr = strtok(NULL, REF_SEP)) != NULL && !valid_msgid(ptr))
  408             ;
  409     }
  410 
  411     if (ptr == NULL)
  412         return NULL;
  413 
  414     /*
  415      * By definition, the head of the thread has no parent
  416      */
  417     parent = NULL;
  418 
  419     current = add_msgid(REF_REF, ptr, parent);
  420 
  421     while ((ptr = strtok(NULL, REF_SEP)) != NULL) {
  422         if (valid_msgid(ptr)) {
  423             parent = current;
  424             current = add_msgid(REF_REF, ptr, parent);
  425         }
  426     }
  427 
  428     return current;
  429 }
  430 
  431 
  432 static void
  433 rearrange_siblings(
  434     void)
  435 {
  436     int i;
  437     struct t_msgid *current, *p1, *p2;
  438 
  439     for_each_art(i) {
  440         current = arts[i].refptr;
  441 
  442         if (!current)
  443             continue;
  444 
  445         for (; current->sibling == NULL && current->parent != NULL; current = current->parent)
  446             ;
  447 
  448         if (current->sibling != NULL) {
  449             for (p1 = current; p1->article == ART_UNAVAILABLE && p1->child != NULL; p1 = p1->child)
  450                 ;
  451 
  452             for (p2 = current->sibling; p2->article == ART_UNAVAILABLE && p2->child != NULL; p2 = p2->child)
  453                 ;
  454 
  455             if (p1->article != ART_UNAVAILABLE && p2->article != ART_UNAVAILABLE && p1->article > p2->article) {
  456                 if (current->parent->child == current) {
  457                     /*
  458                      * current is the first followup
  459                      *  adjust parent->child pointer
  460                      */
  461                     current->parent->child = current->sibling;
  462                     p1 = current->parent->child;
  463                 } else {
  464                     /*
  465                      * current is not the first followup
  466                      *  find the sibling above current
  467                      *  adjust the sibling pointer there
  468                      */
  469                     for (p1 = current->parent->child; p1->sibling != current; p1 = p1->sibling)
  470                         ;
  471 
  472                     p1->sibling = current->sibling;
  473                     p1 = p1->sibling;
  474                 }
  475                 /*
  476                  * swap current <-> sibling
  477                  */
  478                 current->sibling = p1->sibling;
  479                 p1->sibling = current;
  480             }
  481         }
  482     }
  483 }
  484 
  485 
  486 /*
  487  * Reconstruct the References: field from the parent pointers
  488  * NB: In deep threads this can lead to a very long line. If you want to use
  489  *     this function to build a Reference: line for posting be aware that the
  490  *     server might refuse long lines -- short it accordingly!
  491  */
  492 static char *
  493 _get_references(
  494     struct t_msgid *refptr,
  495     int depth)
  496 {
  497     char *refs;
  498     static size_t len;      /* Accumulated size */
  499     static size_t pos;      /* current insertion position */
  500 
  501     if (depth == 1)
  502         len = 0;
  503 
  504     len += strlen(refptr->txt) + 1; /* msgid + space */
  505     if (refptr->parent == NULL || depth > MAX_REFS) {
  506 
  507 #ifdef DEBUG
  508         if (debug & DEBUG_REFS) {
  509             if (depth > MAX_REFS)
  510                 error_message(2, "Warning: Too many refs near to %s. Truncated\n", refptr->txt);
  511         }
  512 #endif /* DEBUG */
  513         refs = my_malloc(len + 1);  /* total length + nullbyte */
  514         pos = 0;
  515     } else
  516         refs = _get_references(refptr->parent, depth + 1);
  517 
  518     sprintf(refs + pos, "%s ", refptr->txt);
  519     pos = strlen(refs);
  520 
  521     return refs;
  522 }
  523 
  524 
  525 /*
  526  * A wrapper to the above, null terminate the string
  527  */
  528 char *
  529 get_references(
  530     struct t_msgid *refptr)
  531 {
  532     char *refs;
  533     size_t len;
  534 
  535     if (refptr == NULL)
  536         return NULL;
  537 
  538     refs = _get_references(refptr, 1);
  539     len = strlen(refs);
  540     refs[len - 1] = '\0';
  541 
  542     return refs;
  543 }
  544 
  545 
  546 /*
  547  * Clear the entire msgid cache, freeing up all chains. This is
  548  * normally only needed when entering a new group
  549  */
  550 void
  551 free_msgids(
  552     void)
  553 {
  554     int i;
  555     struct t_msgid *ptr, *next, **msgptr;
  556 
  557     msgptr = msgids;                /* first list */
  558 
  559     for (i = MSGID_HASH_SIZE - 1; i >= 0; i--) {    /* count down is faster */
  560         ptr = *msgptr;
  561         *msgptr++ = NULL;           /* declare list empty */
  562 
  563         while (ptr != NULL) {   /* for each node in the list */
  564 
  565             next = ptr->next;       /* grab ptr before we free node */
  566             free(ptr);          /* release node */
  567 
  568             ptr = next;         /* hop down the chain */
  569         }
  570     }
  571 }
  572 
  573 
  574 #if 0   /* But please don't remove it */
  575 /*
  576  * Function to dump an ASCII tree map of a thread rooted at msgid.
  577  * Output goes to fp, level is the current depth of the tree.
  578  */
  579 static void
  580 dump_thread(
  581     FILE *fp,
  582     struct t_msgid *msgid,
  583     int level)
  584 {
  585     char buff[120];     /* This is _probably_ enough */
  586     char *ptr = buff;
  587     int i, len;
  588 
  589     /*
  590      * Dump the current article
  591      */
  592     sprintf(ptr, "%3d %*s", msgid->article, 2*level, "  ");
  593 
  594     len = strlen(ptr);
  595     i = cCOLS - len - 20;
  596 
  597     if (msgid->article >= 0)
  598         sprintf(ptr + len, "%-*.*s   %-17.17s", i, i, arts[msgid->article].subject, (arts[msgid->article].name) ? arts[msgid->article].name : arts[msgid->article].from);
  599     else
  600         sprintf(ptr + len, "%-*.*s", i, i, _("[- Unavailable -]"));
  601 
  602     fprintf(fp, "%s\n", ptr);
  603 
  604     if (msgid->child != NULL)
  605         dump_thread(fp, msgid->child, level + 1);
  606 
  607     if (msgid->sibling != NULL)
  608         dump_thread(fp, msgid->sibling, level);
  609 
  610     return;
  611 }
  612 
  613 
  614 static void
  615 dump_msgids(
  616     void)
  617 {
  618     int i;
  619     struct t_msgid *ptr;
  620 
  621     my_fprintf(stderr, "Dumping...\n");
  622 
  623     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  624         if (msgids[i] != NULL) {
  625             my_fprintf(stderr, "node %d", i);
  626             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next)
  627                 my_fprintf(stderr, " -> %s", ptr->txt);
  628 
  629             my_fprintf(stderr, "\n");
  630 
  631         }
  632     }
  633 }
  634 #endif /* 0 */
  635 
  636 
  637 /*
  638  * The rest of this code deals with reference threading
  639  *
  640  * Legend:
  641  *
  642  * . When a new thread is started, the root message will have no
  643  *   References: field
  644  *
  645  * . When a followup is posted, the message-id that was referred to
  646  *   will be appended to the References: field. If no References:
  647  *   field exists, a new one will be created, containing the single
  648  *   message-id
  649  *
  650  * . The References: field should not be truncated, though in practice
  651  *   this will happen, often in badly broken ways.
  652  *
  653  * This is simplistic, so check out RFC 1036 & son of RFC 1036 for full
  654  * details from the posting point of view.
  655  *
  656  * We attempt to maintain 3 pointers in each message-id to handle threading
  657  * on References:
  658  *
  659  * 1) parent  - the article that the current one was in reply to
  660  *              An article with no References: has no parent, therefore
  661  *              it is the root of a thread.
  662  *
  663  * 2) sibling - the next reply in sequence to parent.
  664  *
  665  * 3) child   - the first reply to the current article.
  666  *
  667  * These pointers are automatically set up when we read in the
  668  * headers for a group.
  669  *
  670  * It remains for us to fill in the .thread and .prev ptrs in
  671  * each article that exists in the spool, using the intelligence of
  672  * the reference tree to locate the 'next' article in the thread.
  673  *
  674  */
  675 /*
  676  * Clear out all the article fields from the msgid hash prior to a
  677  * rethread.
  678  */
  679 void
  680 clear_art_ptrs(
  681     void)
  682 {
  683     int i;
  684     struct t_msgid *ptr;
  685 
  686     for (i = MSGID_HASH_SIZE - 1; i >= 0; i--) {
  687         for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next)
  688             ptr->article = ART_UNAVAILABLE;
  689     }
  690 }
  691 
  692 
  693 #ifdef DEBUG
  694 /*
  695  * Dump out all the threads from the msgid point of view, show the
  696  * related article index in arts[] where possible
  697  * A thread is defined as a starting article with no parent
  698  */
  699 static void
  700 dump_msgid_thread(
  701     struct t_msgid *ptr,
  702     int level)
  703 {
  704     fprintf(dbgfd, "%*s %s (%d)\n", level * 3, "   ", ptr->txt, ptr->article);
  705 
  706     if (ptr->child != NULL)
  707         dump_msgid_thread(ptr->child, level + 1);
  708 
  709     if (ptr->sibling != NULL)
  710         dump_msgid_thread(ptr->sibling, level);
  711 
  712     return;
  713 }
  714 
  715 
  716 static void
  717 dump_msgid_threads(
  718     void)
  719 {
  720     int i;
  721     struct t_msgid *ptr;
  722 
  723     fprintf(dbgfd, "Dump started.\n\n");
  724 
  725     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  726         if (msgids[i] != NULL) {
  727             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next) {
  728                 if (ptr->parent == NULL) {
  729                     dump_msgid_thread(ptr, 1);
  730                     fprintf(dbgfd, "\n");
  731                 }
  732             }
  733         }
  734     }
  735     fprintf(dbgfd, "Dump complete.\n\n");
  736 }
  737 #endif /* DEBUG */
  738 
  739 
  740 /*
  741  * Find the next message in the thread.
  742  * We descend children before siblings, and only return articles that
  743  * exist in arts[] or NULL if we are truly at the end of a thread.
  744  * If there are no more down pointers, backtrack to find a sibling
  745  * to continue the thread, we note this with the 'bottom' flag.
  746  *
  747  * A Message-ID will not be included in a thread if
  748  *  It doesn't point to an article OR
  749  *     (it's already threaded/expired OR it has been autokilled)
  750  */
  751 #define SKIP_ART(ptr)   \
  752     (ptr && (ptr->article == ART_UNAVAILABLE || \
  753         (arts[ptr->article].thread != ART_UNTHREADED || \
  754             (tinrc.kill_level == KILL_NOTHREAD && arts[ptr->article].killed))))
  755 
  756 static struct t_msgid *
  757 find_next(
  758     struct t_msgid *ptr)
  759 {
  760     static t_bool bottom = FALSE;
  761 
  762     /*
  763      * Keep going while we haven't bottomed out and we haven't
  764      * got something in arts[]
  765      */
  766     while (ptr != NULL) {
  767 
  768         /*
  769          * Children first, unless bottom is set
  770          */
  771         if (!bottom && ptr->child != NULL) {
  772             ptr = ptr->child;
  773 
  774             /*
  775              * If article not present, keep going
  776              */
  777             if (SKIP_ART(ptr))
  778                 continue;
  779             else
  780                 break;
  781         }
  782 
  783         if (ptr->sibling != NULL) {
  784             bottom = FALSE;
  785 
  786             ptr = ptr->sibling;
  787 
  788             /*
  789              * If article not present, keep going
  790              */
  791             if (SKIP_ART(ptr))
  792                 continue;
  793             else
  794                 break;
  795         }
  796 
  797         /*
  798          * No more child or sibling to follow, backtrack up to
  799          * a sibling if we can find one
  800          */
  801         if (ptr->child == NULL && ptr->sibling == NULL) {
  802             while (ptr != NULL && ptr->sibling == NULL)
  803                 ptr = ptr->parent;
  804 
  805             /*
  806              * We've backtracked up to the parent with a suitable sibling
  807              * go round once again to move to this sibling
  808              */
  809             if (ptr)
  810                 bottom = TRUE;
  811             else
  812                 break;      /* Nothing found, exit with NULL */
  813         }
  814     }
  815 
  816     return ptr;
  817 }
  818 
  819 
  820 /*
  821  * Run the .thread and .prev pointers through the members of this
  822  * thread.
  823  */
  824 static void
  825 build_thread(
  826     struct t_msgid *ptr)
  827 {
  828     struct t_msgid *newptr;
  829 
  830     /*
  831      * If the root article is gone/expired/killed, find the first valid one
  832      */
  833     if (SKIP_ART(ptr))
  834         ptr = find_next(ptr);
  835 
  836     /*
  837      * Keep working through the thread, updating the ptrs as we go
  838      */
  839     while ((newptr = find_next(ptr)) != NULL) {
  840         arts[newptr->article].prev = ptr->article;
  841         arts[ptr->article].thread = newptr->article;
  842         ptr = newptr;
  843     }
  844 }
  845 
  846 
  847 /*
  848  * Run a new set of threads through the base articles, using the
  849  * parent / child / sibling / article pointers in the msgid hash.
  850  */
  851 void
  852 thread_by_reference(
  853     void)
  854 {
  855     int i;
  856     struct t_msgid *ptr;
  857 
  858 #ifdef DEBUG
  859     if (debug & DEBUG_REFS) {
  860         char file[PATH_LEN];
  861 
  862         joinpath(file, sizeof(file), TMPDIR, "REFS.info");
  863         if ((dbgfd = fopen(file, "w")) != NULL)
  864             dump_msgid_threads();
  865     }
  866 #endif /* DEBUG */
  867 
  868     /*
  869      * Build threads starting from root msgids (ie without parent)
  870      */
  871     for (i = 0; i < MSGID_HASH_SIZE; i++) {
  872         if (msgids[i] != NULL) {
  873             for (ptr = msgids[i]; ptr != NULL; ptr = ptr->next) {
  874                 if (ptr->parent == NULL)
  875                     build_thread(ptr);
  876             }
  877         }
  878     }
  879 
  880 #ifdef DEBUG
  881     if ((debug & DEBUG_REFS) && dbgfd != NULL) {
  882         DEBUG_PRINT((dbgfd, "Full dump of threading info...\n"));
  883         DEBUG_PRINT((dbgfd, "%3s %3s %3s %3s : %3s %3s\n", "#", "Par", "Sib", "Chd", "In", "Thd"));
  884 
  885         for_each_art(i) {
  886             DEBUG_PRINT((dbgfd, "%3d %3d %3d %3d : %3d %3d : %.50s %s\n", i,
  887                 (arts[i].refptr->parent) ? arts[i].refptr->parent->article : -2,
  888                 (arts[i].refptr->sibling) ? arts[i].refptr->sibling->article : -2,
  889                 (arts[i].refptr->child) ? arts[i].refptr->child->article : -2,
  890                 arts[i].prev, arts[i].thread, arts[i].refptr->txt, arts[i].subject));
  891         }
  892 
  893         fclose(dbgfd);
  894     }
  895 #endif /* DEBUG */
  896 
  897     return;
  898 }
  899 
  900 
  901 /*
  902  * Do the equivalent of subject threading, but only on the thread base
  903  * messages.
  904  * This should help thread together mistakenly multiply posted articles,
  905  * articles which were posted to a group rather than as followups, those
  906  * with missing ref headers etc.
  907  * We add joined threads onto the end of the .thread chain of the previous
  908  * thread. arts[] is already sorted, so the sorting of these will be
  909  * correct.
  910  */
  911 void
  912 collate_subjects(
  913     void)
  914 {
  915     int i, j, art;
  916     struct t_hashnode *h;
  917 
  918     /*
  919      * Run through the root messages of each thread. We have to traverse
  920      * using arts[] and not msgids[] to preserve the sorting.
  921      */
  922     for_each_art(i) {
  923         /*
  924          * Ignore already threaded and expired arts
  925          */
  926         if (arts[i].prev >= 0 || IGNORE_ART(i))
  927             continue;
  928 
  929         /*
  930          * Get the contents of the magic marker in the hashnode
  931          */
  932         h = (struct t_hashnode *) (arts[i].subject - sizeof(int) - sizeof(void *)); /* FIXME: cast increases required alignment of target type */
  933         j = h->aptr;
  934 
  935         if (j != -1 && j < i) {
  936             /*
  937              * Modified form of the subject threading - the only difference
  938              * is that we have to add later threads onto the end of the
  939              * previous thread
  940              */
  941 #if 1
  942             if (arts[i].subject == arts[j].subject)
  943 #else
  944             /* see also art.c:thread_by_subject() */
  945             if ((arts[i].subject == arts[j].subject) || (arts[i].archive && arts[j].archive && (arts[i].archive->name == arts[j].archive->name)))
  946 #endif /* 1 */
  947             {
  948                 /* DEBUG_PRINT((dbgfd, "RES: %d is now previous, at end of %d\n", i, j)); */
  949                 for (art = j; arts[art].thread >= 0; art = arts[art].thread)
  950                     ;
  951 
  952                 arts[art].thread = i;
  953                 arts[i].prev = art;
  954             }
  955         }
  956 
  957         /*
  958          * Update the magic marker with the highest numbered mesg in
  959          * arts[] that has been used in this thread so far
  960          */
  961         h->aptr = i;
  962     }
  963 
  964     return;
  965 }
  966 
  967 
  968 /*
  969  * Builds the reference tree:
  970  *
  971  * 1) Sort the article base. This will ensure that articles and their
  972  *    siblings are inserted in the correct order.
  973  * 2) Add each Message-ID header and its direct reference ('reliable info')
  974  *    to the cache. Son of RFC 1036 mandates that if References headers must
  975  *    be trimmed, then at least the (1st three and) last reference should be
  976  *    maintained.
  977  * 3) Add rest of References header to the cache. This information is less
  978  *    reliable than the info added in 2) and is only used to fill in any
  979  *    gaps in the reference tree - no information is superseded.
  980  * 4) free() up the msgid and refs headers once cached
  981  */
  982 void
  983 build_references(
  984     struct t_group *group)
  985 {
  986     char *s;
  987     int i;
  988     struct t_article *art;
  989     struct t_msgid *refs;
  990 
  991     /*
  992      * The articles are currently unsorted, and are as they were put by setup_hard_base()
  993      */
  994     if (group->attribute->sort_article_type != SORT_ARTICLES_BY_NOTHING)
  995         sort_arts(group->attribute->sort_article_type);
  996 
  997 #ifdef DEBUG
  998     if (debug & DEBUG_REFS) {
  999         char file[PATH_LEN];
 1000 
 1001         joinpath(file, sizeof(file), TMPDIR, "REFS.dump");
 1002         if ((dbgfd = fopen(file, "w")) != NULL) {
 1003 #   ifdef HAVE_SETVBUF
 1004             SETVBUF(dbgfd, NULL, _IONBF, 0);
 1005 #   endif /* HAVE_SETVBUF */
 1006             DEBUG_PRINT((dbgfd, "MSGID phase\n"));
 1007         }
 1008     }
 1009 #endif /* DEBUG */
 1010 
 1011     /*
 1012      * Add the Message-ID headers to the cache, using the last Reference
 1013      * as the parent
 1014      */
 1015     for_each_art(i) {
 1016         art = &arts[i];
 1017 
 1018         art->refptr = add_msgid(MSGID_REF, art->msgid, NULL); /* preset art->refptr */
 1019         if (art->refs) {
 1020             strip_line(art->refs);
 1021 
 1022             /*
 1023              * Add the last ref, and then trim it to save wasting time adding
 1024              * it again later
 1025              * Check for circular references to current article
 1026              *
 1027              * TODO: do this in a single pass
 1028              */
 1029             if ((s = strrchr(art->refs, '<')) != NULL) {
 1030                 char *ptr;
 1031 
 1032                 /*
 1033                  * A comment can occur after another REF_SEP, remove it
 1034                  *
 1035                  * TODO: parse it to be sure it is a comment
 1036                  */
 1037                 if ((ptr = strpbrk(s, REF_SEP)) != NULL)
 1038                     *ptr = '\0';
 1039 
 1040                 if (valid_msgid(s) && !strcmp(art->msgid, s)) {
 1041                     /*
 1042                      * Remove circular reference to current article
 1043                      */
 1044 #ifdef DEBUG
 1045                     if (debug & DEBUG_REFS)
 1046                         DEBUG_PRINT((dbgfd, "removing circular reference to: %s\n", s));
 1047 #endif /* DEBUG */
 1048                     *s = '\0';
 1049                 }
 1050                 if (valid_msgid(art->msgid) && valid_msgid(s))
 1051                     art->refptr = add_msgid(MSGID_REF, art->msgid, add_msgid(REF_REF, s, NULL));
 1052                 *s = '\0';
 1053             } else
 1054                 FreeAndNull(art->refs);
 1055         }
 1056 
 1057         /*
 1058          * set art->refptr->article - rearrange_siblings() needs this
 1059          */
 1060         if (art->refptr != NULL)
 1061             art->refptr->article = i;
 1062 
 1063         FreeAndNull(art->msgid);    /* Now cached - discard this */
 1064     }
 1065 
 1066 #ifdef DEBUG
 1067     if (debug & DEBUG_REFS)
 1068         DEBUG_PRINT((dbgfd, "REFS phase\n"));
 1069 #endif /* DEBUG */
 1070     /*
 1071      * Add the References data to the cache
 1072      */
 1073     for_each_art(i) {
 1074         if (!arts[i].refs)                      /* No refs - skip */
 1075             continue;
 1076 
 1077         art = &arts[i];
 1078 
 1079         /*
 1080          * Add the remaining references as parent to the last ref we added
 1081          * earlier. skip call to add_msgid when NULL pointer
 1082          */
 1083 
 1084         refs = parse_references(art->refs);
 1085 
 1086         if (art->refptr && art->refptr->parent && valid_msgid(art->refptr->parent->txt))
 1087             add_msgid(REF_REF, art->refptr->parent->txt, refs);
 1088 
 1089         FreeAndNull(art->refs);
 1090     }
 1091 
 1092 #ifdef DEBUG
 1093     if ((debug & DEBUG_REFS) && dbgfd != NULL)
 1094         fclose(dbgfd);
 1095 #endif /* DEBUG */
 1096 
 1097     /*
 1098      * all msgids are cached now
 1099      * change order of siblings if needed
 1100      */
 1101     rearrange_siblings();
 1102 }