citadel
About: Citadel is an advanced messaging and collaboration system for groupware and BBS applications (preferred OS: Linux).
  Fossies Dox: citadel.tar.gz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

Loading...
Searching...
No Matches
ft_wordbreaker.c
Go to the documentation of this file.
1/*
2 * Default wordbreaker module for full text indexing.
3 *
4 * Copyright (c) 2005-2017 by the citadel.org team
5 *
6 * This program is open source software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 3.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 */
14
15#include "../../sysdep.h"
16#include <stdlib.h>
17#include <unistd.h>
18#include <stdio.h>
19#include <fcntl.h>
20#include <signal.h>
21#include <pwd.h>
22#include <errno.h>
23#include <sys/types.h>
24#include <time.h>
25#include <sys/wait.h>
26#include <ctype.h>
27#include <string.h>
28#include <limits.h>
29#include <libcitadel.h>
30#include "../../citadel.h"
31#include "../../server.h"
32#include "../../sysdep_decls.h"
33#include "../../citserver.h"
34#include "../../support.h"
35#include "../../config.h"
36#include "../../database.h"
37#include "../../msgbase.h"
38#include "../../control.h"
39#include "ft_wordbreaker.h"
40#include "crc16.h"
41#include "../../ctdl_module.h"
42
43/*
44 * Noise words are not included in search indices.
45 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
46 * must also be changed, so that the index is rebuilt.
47 */
48static char *noise_words[] = {
49 "about",
50 "after",
51 "also",
52 "another",
53 "because",
54 "been",
55 "before",
56 "being",
57 "between",
58 "both",
59 "came",
60 "come",
61 "could",
62 "each",
63 "from",
64 "have",
65 "here",
66 "himself",
67 "into",
68 "like",
69 "make",
70 "many",
71 "might",
72 "more",
73 "most",
74 "much",
75 "must",
76 "never",
77 "only",
78 "other",
79 "over",
80 "said",
81 "same",
82 "should",
83 "since",
84 "some",
85 "still",
86 "such",
87 "take",
88 "than",
89 "that",
90 "their",
91 "them",
92 "then",
93 "there",
94 "these",
95 "they",
96 "this",
97 "those",
98 "through",
99 "under",
100 "very",
101 "well",
102 "were",
103 "what",
104 "where",
105 "which",
106 "while",
107 "with",
108 "would",
109 "your"
110};
111#define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
112
113
114/*
115 * Compare function
116 */
117int intcmp(const void *rec1, const void *rec2) {
118 int i1, i2;
119
120 i1 = *(const int *)rec1;
121 i2 = *(const int *)rec2;
122
123 if (i1 > i2) return(1);
124 if (i1 < i2) return(-1);
125 return(0);
126}
127
128
129void wordbreaker(const char *text, int *num_tokens, int **tokens) {
130
131 int wb_num_tokens = 0;
132 int wb_num_alloc = 0;
133 int *wb_tokens = NULL;
134
135 const char *ptr;
136 const char *word_start;
137 const char *word_end;
138 char ch;
139 int word_len;
140 char word[256];
141 int i;
142 int word_crc;
143
144 if (text == NULL) { /* no NULL text please */
145 *num_tokens = 0;
146 *tokens = NULL;
147 return;
148 }
149
150 if (text[0] == 0) { /* no empty text either */
151 *num_tokens = 0;
152 *tokens = NULL;
153 return;
154 }
155
156 ptr = text;
157 word_start = NULL;
158 while (*ptr) {
159 ch = *ptr;
160 if (isalnum(ch)) {
161 if (!word_start) {
162 word_start = ptr;
163 }
164 }
165 ++ptr;
166 ch = *ptr;
167 if ( (!isalnum(ch)) && (word_start) ) {
168 word_end = ptr;
169
170 /* extract the word */
171 word_len = word_end - word_start;
172 if (word_len >= sizeof word) {
173 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
174 safestrncpy(word, word_start, sizeof word);
175 word[(sizeof word) - 1] = 0;
176 }
177 else {
178 safestrncpy(word, word_start, word_len+1);
179 word[word_len] = 0;
180 }
181 word_start = NULL;
182
183 /* are we ok with the length? */
184 if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
185 for (i=0; i<word_len; ++i) {
186 word[i] = tolower(word[i]);
187 }
188 /* disqualify noise words */
189 for (i=0; i<NUM_NOISE; ++i) {
190 if (!strcmp(word, noise_words[i])) {
191 word_len = 0;
192 break;
193 }
194 }
195
196 if (word_len == 0)
197 continue;
198
199 word_crc = (int) CalcCRC16Bytes(word_len, word);
200
201 ++wb_num_tokens;
202 if (wb_num_tokens > wb_num_alloc) {
203 wb_num_alloc += 512;
204 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
205 }
206 wb_tokens[wb_num_tokens - 1] = word_crc;
207 }
208 }
209 }
210
211 /* sort and purge dups */
212 if (wb_num_tokens > 1) {
213 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
214 for (i=0; i<(wb_num_tokens-1); ++i) {
215 if (wb_tokens[i] == wb_tokens[i+1]) {
216 memmove(&wb_tokens[i], &wb_tokens[i+1],
217 ((wb_num_tokens - i - 1)*sizeof(int)));
218 --wb_num_tokens;
219 --i;
220 }
221 }
222 }
223
224 *num_tokens = wb_num_tokens;
225 *tokens = wb_tokens;
226}
227
CRC16 CalcCRC16Bytes(unsigned int count, char *buffer)
Definition: crc16.c:138
void wordbreaker(const char *text, int *num_tokens, int **tokens)
int intcmp(const void *rec1, const void *rec2)
#define NUM_NOISE
static char * noise_words[]
#define WB_MAX
#define WB_MIN