"Fossies" - the Fresh Open Source Software Archive

Member "citadel/modules/fulltext/serv_fulltext.c" (5 Jun 2021, 11850 Bytes) of package /linux/www/citadel.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "serv_fulltext.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 9.01_vs_902.

    1 /*
    2  * This module handles fulltext indexing of the message base.
    3  * Copyright (c) 2005-2021 by the citadel.org team
    4  *
    5  * This program is open source software; you can redistribute it and/or
    6  * modify it under the terms of the GNU General Public License as published
    7  * by the Free Software Foundation; either version 3 of the License, or
    8  * (at your option) any later version.
    9  *
   10  * This program is distributed in the hope that it will be useful,
   11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13  * GNU General Public License for more details.
   14  *
   15  * You should have received a copy of the GNU General Public License
   16  * along with this program; if not, write to the Free Software
   17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
   18  */
   19 
   20 #include "sysdep.h"
   21 #include <stdlib.h>
   22 #include <unistd.h>
   23 #include <stdio.h>
   24 #include <fcntl.h>
   25 #include <signal.h>
   26 #include <pwd.h>
   27 #include <errno.h>
   28 #include <sys/types.h>
   29 #include <time.h>
   30 #include <sys/wait.h>
   31 #include <string.h>
   32 #include <limits.h>
   33 #include <libcitadel.h>
   34 #include "citadel.h"
   35 #include "server.h"
   36 #include "citserver.h"
   37 #include "support.h"
   38 #include "config.h"
   39 #include "database.h"
   40 #include "msgbase.h"
   41 #include "control.h"
   42 #include "serv_fulltext.h"
   43 #include "ft_wordbreaker.h"
   44 #include "threads.h"
   45 #include "context.h"
   46 
   47 #include "ctdl_module.h"
   48 
   49 long ft_newhighest = 0L;
   50 long *ft_newmsgs = NULL;
   51 int ft_num_msgs = 0;
   52 int ft_num_alloc = 0;
   53 
   54 int ftc_num_msgs[65536];
   55 long *ftc_msgs[65536];
   56 
   57 
   58 /*
   59  * Compare function
   60  */
   61 int longcmp(const void *rec1, const void *rec2) {
   62     long i1, i2;
   63 
   64     i1 = *(const long *)rec1;
   65     i2 = *(const long *)rec2;
   66 
   67     if (i1 > i2) return(1);
   68     if (i1 < i2) return(-1);
   69     return(0);
   70 }
   71 
   72 
   73 /*
   74  * Flush our index cache out to disk.
   75  */
   76 void ft_flush_cache(void) {
   77     int i;
   78     time_t last_update = 0;
   79 
   80     for (i=0; i<65536; ++i) {
   81         if ((time(NULL) - last_update) >= 10) {
   82             syslog(LOG_INFO,
   83                 "fulltext: flushing index cache to disk (%d%% complete)",
   84                 (i * 100 / 65536)
   85             );
   86             last_update = time(NULL);
   87         }
   88         if (ftc_msgs[i] != NULL) {
   89             cdb_store(CDB_FULLTEXT, &i, sizeof(int), ftc_msgs[i],
   90                 (ftc_num_msgs[i] * sizeof(long)));
   91             ftc_num_msgs[i] = 0;
   92             free(ftc_msgs[i]);
   93             ftc_msgs[i] = NULL;
   94         }
   95     }
   96     syslog(LOG_INFO, "fulltext: flushed index cache to disk (100%% complete)");
   97 }
   98 
   99 
  100 /*
  101  * Index or de-index a message.  (op == 1 to index, 0 to de-index)
  102  */
  103 void ft_index_message(long msgnum, int op) {
  104     int num_tokens = 0;
  105     int *tokens = NULL;
  106     int i, j;
  107     struct cdbdata *cdb_bucket;
  108     StrBuf *msgtext;
  109     char *txt;
  110     int tok;
  111     struct CtdlMessage *msg = NULL;
  112 
  113     msg = CtdlFetchMessage(msgnum, 1);
  114     if (msg == NULL) {
  115         syslog(LOG_ERR, "fulltext: ft_index_message() could not load msg %ld", msgnum);
  116         return;
  117     }
  118 
  119     if (!CM_IsEmpty(msg, eSuppressIdx)) {
  120         syslog(LOG_DEBUG, "fulltext: ft_index_message() excluded msg %ld", msgnum);
  121         CM_Free(msg);
  122         return;
  123     }
  124 
  125     syslog(LOG_DEBUG, "fulltext: ft_index_message() %s msg %ld", (op ? "adding" : "removing") , msgnum);
  126 
  127     /* Output the message as text before indexing it, so we don't end up
  128      * indexing a bunch of encoded base64, etc.
  129      */
  130     CC->redirect_buffer = NewStrBufPlain(NULL, SIZ);
  131     CtdlOutputPreLoadedMsg(msg, MT_CITADEL, HEADERS_ALL, 0, 1, 0);
  132     CM_Free(msg);
  133     msgtext = CC->redirect_buffer;
  134     CC->redirect_buffer = NULL;
  135     if (msgtext != NULL) {
  136         syslog(LOG_DEBUG, "fulltext: wordbreaking message %ld (%d bytes)", msgnum, StrLength(msgtext));
  137     }
  138     txt = SmashStrBuf(&msgtext);
  139     wordbreaker(txt, &num_tokens, &tokens);
  140     free(txt);
  141 
  142     syslog(LOG_DEBUG, "fulltext: indexing message %ld [%d tokens]", msgnum, num_tokens);
  143     if (num_tokens > 0) {
  144         for (i=0; i<num_tokens; ++i) {
  145 
  146             /* Add the message to the relevant token bucket */
  147 
  148             /* search for tokens[i] */
  149             tok = tokens[i];
  150 
  151             if ( (tok >= 0) && (tok <= 65535) ) {
  152                 /* fetch the bucket, Liza */
  153                 if (ftc_msgs[tok] == NULL) {
  154                     cdb_bucket = cdb_fetch(CDB_FULLTEXT, &tok, sizeof(int));
  155                     if (cdb_bucket != NULL) {
  156                         ftc_num_msgs[tok] = cdb_bucket->len / sizeof(long);
  157                         ftc_msgs[tok] = (long *)cdb_bucket->ptr;
  158                         cdb_bucket->ptr = NULL;
  159                         cdb_free(cdb_bucket);
  160                     }
  161                     else {
  162                         ftc_num_msgs[tok] = 0;
  163                         ftc_msgs[tok] = malloc(sizeof(long));
  164                     }
  165                 }
  166     
  167     
  168                 if (op == 1) {  /* add to index */
  169                     ++ftc_num_msgs[tok];
  170                     ftc_msgs[tok] = realloc(ftc_msgs[tok],
  171                                 ftc_num_msgs[tok]*sizeof(long));
  172                     ftc_msgs[tok][ftc_num_msgs[tok] - 1] = msgnum;
  173                 }
  174     
  175                 if (op == 0) {  /* remove from index */
  176                     if (ftc_num_msgs[tok] >= 1) {
  177                         for (j=0; j<ftc_num_msgs[tok]; ++j) {
  178                             if (ftc_msgs[tok][j] == msgnum) {
  179                                 memmove(&ftc_msgs[tok][j], &ftc_msgs[tok][j+1], ((ftc_num_msgs[tok] - j - 1)*sizeof(long)));
  180                                 --ftc_num_msgs[tok];
  181                                 --j;
  182                             }
  183                         }
  184                     }
  185                 }
  186             }
  187             else {
  188                 syslog(LOG_ALERT, "fulltext: invalid token %d !!", tok);
  189             }
  190         }
  191 
  192         free(tokens);
  193     }
  194 }
  195 
  196 
  197 /*
  198  * Add a message to the list of those to be indexed.
  199  */
  200 void ft_index_msg(long msgnum, void *userdata) {
  201 
  202     if ((msgnum > CtdlGetConfigLong("MMfulltext")) && (msgnum <= ft_newhighest)) {
  203         ++ft_num_msgs;
  204         if (ft_num_msgs > ft_num_alloc) {
  205             ft_num_alloc += 1024;
  206             ft_newmsgs = realloc(ft_newmsgs, (ft_num_alloc * sizeof(long)));
  207         }
  208         ft_newmsgs[ft_num_msgs - 1] = msgnum;
  209     }
  210 
  211 }
  212 
  213 
  214 /*
  215  * Scan a room for messages to index.
  216  */
  217 void ft_index_room(struct ctdlroom *qrbuf, void *data)
  218 {
  219     if (server_shutting_down)
  220         return;
  221         
  222     CtdlGetRoom(&CC->room, qrbuf->QRname);
  223     CtdlForEachMessage(MSGS_ALL, 0L, NULL, NULL, NULL, ft_index_msg, NULL);
  224 }
  225 
  226 
  227 /*
  228  * Begin the fulltext indexing process.
  229  */
  230 void do_fulltext_indexing(void) {
  231     int i;
  232     static time_t last_progress = 0L;
  233     static int is_running = 0;
  234     if (is_running) return;         /* Concurrency check - only one can run */
  235     is_running = 1;
  236 
  237     /*
  238      * Don't do this if the site doesn't have it enabled.
  239      */
  240     if (!CtdlGetConfigInt("c_enable_fulltext")) {
  241         return;
  242     }
  243 
  244     /*
  245      * If we've switched wordbreaker modules, burn the index and start over.
  246      */
  247     begin_critical_section(S_CONTROL);
  248     if (CtdlGetConfigInt("MM_fulltext_wordbreaker") != FT_WORDBREAKER_ID) {
  249         syslog(LOG_DEBUG, "fulltext: wb ver on disk = %d, code ver = %d",
  250             CtdlGetConfigInt("MM_fulltext_wordbreaker"), FT_WORDBREAKER_ID
  251         );
  252         syslog(LOG_INFO, "fulltext: (re)initializing index");
  253         cdb_trunc(CDB_FULLTEXT);
  254         CtdlSetConfigLong("MMfulltext", 0);
  255     }
  256     end_critical_section(S_CONTROL);
  257 
  258     /*
  259      * Silently return if our fulltext index is up to date with new messages.
  260      */
  261     if ((CtdlGetConfigLong("MMfulltext") >= CtdlGetConfigLong("MMhighest"))) {
  262         return;     /* nothing to do! */
  263     }
  264 
  265     /*
  266      * Now go through each room and find messages to index.
  267      */
  268     ft_newhighest = CtdlGetConfigLong("MMhighest");
  269     CtdlForEachRoom(ft_index_room, NULL);   /* load all msg pointers */
  270 
  271     if (ft_num_msgs > 0) {
  272         qsort(ft_newmsgs, ft_num_msgs, sizeof(long), longcmp);
  273         for (i=0; i<(ft_num_msgs-1); ++i) { /* purge dups */
  274             if (ft_newmsgs[i] == ft_newmsgs[i+1]) {
  275                 memmove(&ft_newmsgs[i], &ft_newmsgs[i+1],
  276                     ((ft_num_msgs - i - 1)*sizeof(long)));
  277                 --ft_num_msgs;
  278                 --i;
  279             }
  280         }
  281 
  282         /* Here it is ... do each message! */
  283         for (i=0; i<ft_num_msgs; ++i) {
  284             if (time(NULL) != last_progress) {
  285                 syslog(LOG_DEBUG,
  286                     "fulltext: indexed %d of %d messages (%d%%)",
  287                         i, ft_num_msgs,
  288                         ((i*100) / ft_num_msgs)
  289                 );
  290                 last_progress = time(NULL);
  291             }
  292             ft_index_message(ft_newmsgs[i], 1);
  293 
  294             /* Check to see if we need to quit early */
  295             if (server_shutting_down) {
  296                 syslog(LOG_DEBUG, "fulltext: indexer quitting early");
  297                 ft_newhighest = ft_newmsgs[i];
  298                 break;
  299             }
  300 
  301             /* Check to see if we have to maybe flush to disk */
  302             if (i >= FT_MAX_CACHE) {
  303                 syslog(LOG_DEBUG, "fulltext: time to flush.");
  304                 ft_newhighest = ft_newmsgs[i];
  305                 break;
  306             }
  307 
  308         }
  309 
  310         free(ft_newmsgs);
  311         ft_num_msgs = 0;
  312         ft_num_alloc = 0;
  313         ft_newmsgs = NULL;
  314     }
  315 
  316     if (server_shutting_down) {
  317         is_running = 0;
  318         return;
  319     }
  320     
  321     /* Save our place so we don't have to do this again */
  322     ft_flush_cache();
  323     begin_critical_section(S_CONTROL);
  324     CtdlSetConfigLong("MMfulltext", ft_newhighest);
  325     CtdlSetConfigInt("MM_fulltext_wordbreaker", FT_WORDBREAKER_ID);
  326     end_critical_section(S_CONTROL);
  327 
  328     syslog(LOG_DEBUG, "fulltext: indexing finished");
  329     is_running = 0;
  330     return;
  331 }
  332 
  333 
  334 /*
  335  * API call to perform searches.
  336  * (This one does the "all of these words" search.)
  337  * Caller is responsible for freeing the message list.
  338  */
  339 void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
  340     int num_tokens = 0;
  341     int *tokens = NULL;
  342     int i, j;
  343     struct cdbdata *cdb_bucket;
  344     int num_all_msgs = 0;
  345     long *all_msgs = NULL;
  346     int num_ret_msgs = 0;
  347     int num_ret_alloc = 0;
  348     long *ret_msgs = NULL;
  349     int tok;
  350 
  351     wordbreaker(search_string, &num_tokens, &tokens);
  352     if (num_tokens > 0) {
  353         for (i=0; i<num_tokens; ++i) {
  354 
  355             /* search for tokens[i] */
  356             tok = tokens[i];
  357 
  358             /* fetch the bucket, Liza */
  359             if (ftc_msgs[tok] == NULL) {
  360                 cdb_bucket = cdb_fetch(CDB_FULLTEXT, &tok, sizeof(int));
  361                 if (cdb_bucket != NULL) {
  362                     ftc_num_msgs[tok] = cdb_bucket->len / sizeof(long);
  363                     ftc_msgs[tok] = (long *)cdb_bucket->ptr;
  364                     cdb_bucket->ptr = NULL;
  365                     cdb_free(cdb_bucket);
  366                 }
  367                 else {
  368                     ftc_num_msgs[tok] = 0;
  369                     ftc_msgs[tok] = malloc(sizeof(long));
  370                 }
  371             }
  372 
  373             num_all_msgs += ftc_num_msgs[tok];
  374             if (num_all_msgs > 0) {
  375                 all_msgs = realloc(all_msgs, num_all_msgs*sizeof(long) );
  376                 memcpy(&all_msgs[num_all_msgs-ftc_num_msgs[tok]],
  377                     ftc_msgs[tok], ftc_num_msgs[tok]*sizeof(long) );
  378             }
  379 
  380         }
  381         free(tokens);
  382         if (all_msgs != NULL) {
  383             qsort(all_msgs, num_all_msgs, sizeof(long), longcmp);
  384 
  385             /*
  386              * At this point, if a message appears num_tokens times in the
  387              * list, then it contains all of the search tokens.
  388              */
  389             if (num_all_msgs >= num_tokens)
  390                 for (j=0; j<(num_all_msgs-num_tokens+1); ++j) {
  391                     if (all_msgs[j] == all_msgs[j+num_tokens-1]) {
  392                         
  393                         ++num_ret_msgs;
  394                         if (num_ret_msgs > num_ret_alloc) {
  395                             num_ret_alloc += 64;
  396                             ret_msgs = realloc(ret_msgs,
  397                                        (num_ret_alloc*sizeof(long)) );
  398                         }
  399                         ret_msgs[num_ret_msgs - 1] = all_msgs[j];
  400                         
  401                     }
  402                 }
  403             free(all_msgs);
  404         }
  405     }
  406 
  407     *fts_num_msgs = num_ret_msgs;
  408     *fts_msgs = ret_msgs;
  409 }
  410 
  411 
  412 /*
  413  * This search command is for diagnostic purposes and may be removed or replaced.
  414  */
  415 void cmd_srch(char *argbuf) {
  416     int num_msgs = 0;
  417     long *msgs = NULL;
  418     int i;
  419     char search_string[256];
  420 
  421     if (CtdlAccessCheck(ac_logged_in)) return;
  422 
  423     if (!CtdlGetConfigInt("c_enable_fulltext")) {
  424         cprintf("%d Full text index is not enabled on this server.\n",
  425             ERROR + CMD_NOT_SUPPORTED);
  426         return;
  427     }
  428 
  429     extract_token(search_string, argbuf, 0, '|', sizeof search_string);
  430     ft_search(&num_msgs, &msgs, search_string);
  431 
  432     cprintf("%d %d msgs match all search words:\n",
  433         LISTING_FOLLOWS, num_msgs);
  434     if (num_msgs > 0) {
  435         for (i=0; i<num_msgs; ++i) {
  436             cprintf("%ld\n", msgs[i]);
  437         }
  438     }
  439     if (msgs != NULL) free(msgs);
  440     cprintf("000\n");
  441 }
  442 
  443 
  444 /*
  445  * Zero out our index cache.
  446  */
  447 void initialize_ft_cache(void) {
  448     memset(ftc_num_msgs, 0, (65536 * sizeof(int)));
  449     memset(ftc_msgs, 0, (65536 * sizeof(long *)));
  450 }
  451 
  452 
  453 void ft_delete_remove(char *room, long msgnum)
  454 {
  455     if (room) return;
  456     
  457     /* Remove from fulltext index */
  458     if (CtdlGetConfigInt("c_enable_fulltext")) {
  459         ft_index_message(msgnum, 0);
  460     }
  461 }
  462 
  463 
  464 /*****************************************************************************/
  465 
  466 CTDL_MODULE_INIT(fulltext)
  467 {
  468     if (!threading)
  469     {
  470         initialize_ft_cache();
  471         CtdlRegisterProtoHook(cmd_srch, "SRCH", "Full text search");
  472         CtdlRegisterDeleteHook(ft_delete_remove);
  473         CtdlRegisterSearchFuncHook(ft_search, "fulltext");
  474         CtdlRegisterSessionHook(do_fulltext_indexing, EVT_TIMER, PRIO_CLEANUP + 300);
  475     }
  476     /* return our module name for the log */
  477     return "fulltext";
  478 }