"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "scriptindex.cc" between
xapian-omega-1.4.19.tar.xz and xapian-omega-1.4.20.tar.xz

About: Xapian Omega is an application built on Xapian, consisting of indexers and a CGI search frontend.

scriptindex.cc  (xapian-omega-1.4.19.tar.xz):scriptindex.cc  (xapian-omega-1.4.20.tar.xz)
/** @file /** @file
* @brief index arbitrary data as described by an index script * @brief index arbitrary data as described by an index script
*/ */
/* Copyright 1999,2000,2001 BrightStation PLC /* Copyright 1999,2000,2001 BrightStation PLC
* Copyright 2001 Sam Liddicott * Copyright 2001 Sam Liddicott
* Copyright 2001,2002 Ananova Ltd * Copyright 2001,2002 Ananova Ltd
* Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts * Copyright 2002-2022 Olly Betts
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as * modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the * published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version. * License, or (at your option) any later version.
* *
* This program is distributed in the hope that it will be useful, * This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details. * GNU General Public License for more details.
skipping to change at line 73 skipping to change at line 73
using namespace std; using namespace std;
#define PROG_NAME "scriptindex" #define PROG_NAME "scriptindex"
#define PROG_DESC "index arbitrary data as described by an index script" #define PROG_DESC "index arbitrary data as described by an index script"
static bool verbose; static bool verbose;
static int addcount; static int addcount;
static int repcount; static int repcount;
static int delcount; static int delcount;
static int skipcount;
/** What to do if there's a UNIQUE action but a record doesn't use it.
*/
static enum {
UNIQUE_ERROR,
UNIQUE_WARN_NEW,
UNIQUE_NEW,
UNIQUE_WARN_SKIP,
UNIQUE_SKIP
} unique_missing = UNIQUE_WARN_NEW;
/// Track if UNIQUE action is unused in the current record.
static bool unique_unused;
/// Track if the current record is being skipping.
static bool skipping_record = false;
static inline bool static inline bool
prefix_needs_colon(const string & prefix, unsigned ch) prefix_needs_colon(const string & prefix, unsigned ch)
{ {
if (!C_isupper(ch) && ch != ':') return false; if (!C_isupper(ch) && ch != ':') return false;
string::size_type len = prefix.length(); string::size_type len = prefix.length();
return (len > 1 && prefix[len - 1] != ':'); return (len > 1 && prefix[len - 1] != ':');
} }
const char * action_names[] = { const char * action_names[] = {
skipping to change at line 114 skipping to change at line 131
"truncate", "truncate",
"unhtml", "unhtml",
"unique", "unique",
"value", "value",
"valuenumeric", "valuenumeric",
"valuepacked", "valuepacked",
"weight" "weight"
}; };
// For debugging: // For debugging:
#define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl #define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")\n"
class Action { class Action {
public: public:
typedef enum { typedef enum {
// Actions used internally: // Actions used internally:
BAD, BAD,
NEW, NEW,
// Actual actions: // Actual actions:
BOOLEAN, BOOLEAN,
DATE, DATE,
skipping to change at line 216 skipping to change at line 233
auto j = s.find_first_of(chars, i); auto j = s.find_first_of(chars, i);
if (!output.empty()) output += ' '; if (!output.empty()) output += ' ';
output.append(s, i, j - i); output.append(s, i, j - i);
i = j; i = j;
} }
s = std::move(output); s = std::move(output);
} }
enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE }; enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };
static unsigned error_count = 0;
static void static void
report_location(enum diag_type type, report_location(enum diag_type type,
const string& filename, const string& filename,
size_t line = 0, size_t line = 0,
size_t pos = string::npos) size_t pos = string::npos)
{ {
cerr << filename; cerr << filename;
if (line != 0) { if (line != 0) {
cerr << ':' << line; cerr << ':' << line;
} if (pos != string::npos) {
if (pos != string::npos) { // The first column is numbered 1.
// The first column is numbered 1. cerr << ':' << pos + 1;
cerr << ':' << pos + 1; }
} }
switch (type) { switch (type) {
case DIAG_ERROR: case DIAG_ERROR:
cerr << ": error: "; cerr << ": error: ";
++error_count;
break; break;
case DIAG_WARN: case DIAG_WARN:
cerr << ": warning: "; cerr << ": warning: ";
break; break;
case DIAG_NOTE: case DIAG_NOTE:
cerr << ": note: "; cerr << ": note: ";
break; break;
} }
} }
static void static void
report_useless_action(const string &file, size_t line, size_t pos, report_useless_action(const string &file, size_t line, size_t pos,
const string &action) const string &action)
{ {
report_location(DIAG_WARN, file, line, pos); report_location(DIAG_WARN, file, line, pos);
cerr << "Index action '" << action << "' has no effect" << endl; cerr << "Index action '" << action << "' has no effect\n";
static bool given_left_to_right_warning = false; static bool given_left_to_right_warning = false;
if (!given_left_to_right_warning) { if (!given_left_to_right_warning) {
given_left_to_right_warning = true; given_left_to_right_warning = true;
report_location(DIAG_NOTE, file, line, pos); report_location(DIAG_NOTE, file, line, pos);
cerr << "Actions are executed from left to right" << endl; cerr << "Actions are executed from left to right\n";
} }
} }
static bool index_spec_uses_unique = false;
static map<string, vector<Action>> index_spec; static map<string, vector<Action>> index_spec;
// Like std::getline() but handle \r\n line endings too.
static istream&
getline_portable(istream& stream, string& line)
{
istream& result = getline(stream, line);
// Trim multiple \r characters, since that seems the best way to handle
// that case.
line.resize(line.find_last_not_of('\r') + 1);
return result;
}
static void static void
parse_index_script(const string &filename) parse_index_script(const string &filename)
{ {
ifstream script(filename.c_str()); ifstream script(filename.c_str());
if (!script.is_open()) { if (!script.is_open()) {
report_location(DIAG_ERROR, filename); report_location(DIAG_ERROR, filename);
cerr << strerror(errno) << endl; cerr << strerror(errno) << '\n';
exit(1); exit(1);
} }
string line; string line;
size_t line_no = 0; size_t line_no = 0;
// Line number where we saw a `unique` action, or -1 if we haven't. // Line number where we saw a `unique` action, or 0 if we haven't.
int unique_line_no = -1; int unique_line_no = 0;
// Offset into line unique_line_no where the `unique` action was.
size_t unique_pos = 0;
while (getline(script, line)) { while (getline(script, line)) {
++line_no; ++line_no;
vector<string> fields; vector<string> fields;
vector<Action> actions; vector<Action> actions;
string::const_iterator i, j; string::const_iterator i, j;
const string &s = line; const string &s = line;
i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); }); i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
if (i == s.end() || *i == '#') { if (i == s.end() || *i == '#') {
// Blank line or comment. // Blank line or comment.
continue; continue;
} }
while (true) { while (true) {
if (!C_isalnum(*i)) { if (!C_isalnum(*i)) {
report_location(DIAG_ERROR, filename, line_no, i - s.begin()); report_location(DIAG_ERROR, filename, line_no, i - s.begin());
cerr << "field name must start with alphanumeric" << endl; cerr << "field name must start with alphanumeric\n";
exit(1);
} }
j = find_if(i, s.end(), j = find_if(i + 1, s.end(),
[](char ch) { return !C_isalnum(ch) && ch != '_'; }); [](char ch) { return !C_isalnum(ch) && ch != '_'; });
fields.push_back(string(i, j)); fields.push_back(string(i, j));
i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); }); i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
if (i == s.end()) break; if (i == s.end()) break;
if (*i == ':') { if (*i == ':') {
++i; ++i;
i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); }); i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
break; break;
} }
if (i == j) { if (i == j) {
report_location(DIAG_ERROR, filename, line_no, i - s.begin()); report_location(DIAG_ERROR, filename, line_no, i - s.begin());
cerr << "bad character '" << *i << "' in fieldname" << endl; cerr << "bad character '" << *i << "' in field name\n";
exit(1); ++i;
i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
if (i == s.end()) break;
} }
} }
Xapian::termcount weight = 1; Xapian::termcount weight = 1;
size_t useless_weight_pos = string::npos; size_t useless_weight_pos = string::npos;
map<string, Action::type> boolmap; map<string, Action::type> boolmap;
j = i; j = i;
while (j != s.end()) { while (j != s.end()) {
size_t action_pos = j - s.begin(); size_t action_pos = j - s.begin();
i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); }); i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
string action(s, j - s.begin(), i - j); string action(s, j - s.begin(), i - j);
skipping to change at line 411 skipping to change at line 447
} else if (action == "trim") { } else if (action == "trim") {
code = Action::TRIM; code = Action::TRIM;
max_args = 1; max_args = 1;
} }
break; break;
case 'u': case 'u':
if (action == "unhtml") { if (action == "unhtml") {
code = Action::UNHTML; code = Action::UNHTML;
} else if (action == "unique") { } else if (action == "unique") {
code = Action::UNIQUE; code = Action::UNIQUE;
min_args = max_args = 1; min_args = 1;
max_args = 2;
} }
break; break;
case 'v': case 'v':
if (action == "value") { if (action == "value") {
code = Action::VALUE; code = Action::VALUE;
min_args = max_args = 1; min_args = max_args = 1;
takes_integer_argument = true; takes_integer_argument = true;
} else if (action == "valuenumeric") { } else if (action == "valuenumeric") {
code = Action::VALUENUMERIC; code = Action::VALUENUMERIC;
min_args = max_args = 1; min_args = max_args = 1;
skipping to change at line 433 skipping to change at line 470
} else if (action == "valuepacked") { } else if (action == "valuepacked") {
code = Action::VALUEPACKED; code = Action::VALUEPACKED;
min_args = max_args = 1; min_args = max_args = 1;
takes_integer_argument = true; takes_integer_argument = true;
} }
break; break;
case 'w': case 'w':
if (action == "weight") { if (action == "weight") {
code = Action::WEIGHT; code = Action::WEIGHT;
min_args = max_args = 1; min_args = max_args = 1;
takes_integer_argument = true; // Don't set takes_integer_argument since we parse
// it with parse_unsigned() and issue an error there
// - setting takes_integer_argument would give a
// double error for arguments with a decimal point.
} }
break; break;
} }
} }
if (code == Action::BAD) { if (code == Action::BAD) {
report_location(DIAG_ERROR, filename, line_no, action_pos); report_location(DIAG_ERROR, filename, line_no, action_pos);
cerr << "Unknown index action '" << action << "'" << endl; cerr << "Unknown index action '" << action << "'\n";
exit(1);
} }
auto i_after_action = i; auto i_after_action = i;
i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); }); i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
if (i != s.end() && *i == '=') { if (i != s.end() && *i == '=') {
if (i != i_after_action) { if (i != i_after_action) {
report_location(DIAG_WARN, filename, line_no, report_location(DIAG_WARN, filename, line_no,
i_after_action - s.begin()); i_after_action - s.begin());
cerr << "putting spaces between the action and '=' is " cerr << "putting spaces between the action and '=' is "
"deprecated." << endl; "deprecated\n";
} }
if (max_args == 0) { if (max_args == 0) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i - s.begin()); i - s.begin());
cerr << "Index action '" << action cerr << "Index action '" << action
<< "' doesn't take an argument" << endl; << "' doesn't take an argument\n";
exit(1);
} }
++i; ++i;
j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); }); j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
if (i != j) { if (i != j) {
report_location(DIAG_WARN, filename, line_no, report_location(DIAG_WARN, filename, line_no,
i - s.begin()); i - s.begin());
cerr << "putting spaces between '=' and the argument is " cerr << "putting spaces between '=' and the argument is "
"deprecated." << endl; "deprecated\n";
} }
vector<string> vals; vector<string> vals;
while (true) { while (true) {
if (j != s.end() && *j == '"') { if (j != s.end() && *j == '"') {
// Quoted argument. // Quoted argument.
++j; ++j;
string arg; string arg;
while (true) { while (true) {
i = find_if(j, s.end(), i = find_if(j, s.end(),
[](char ch) { [](char ch) {
return ch == '"' || ch == '\\'; return ch == '"' || ch == '\\';
}); });
if (i == s.end()) { if (i == s.end()) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
s.size()); s.size());
cerr << "No closing quote" << endl; cerr << "No closing quote\n";
exit(1); break;
} }
arg.append(j, i); arg.append(j, i);
if (*i++ == '"') if (*i++ == '"')
break; break;
// Escape sequence. // Escape sequence.
if (i == s.end()) { if (i == s.end()) {
bad_escaping: bad_escaping:
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i - s.begin()); i - s.begin());
cerr << "Bad escaping in quoted action argument" cerr << "Bad escaping in quoted action "
<< endl; "argument\n";
exit(1); break;
} }
char ch = *i; char ch = *i;
switch (ch) { switch (ch) {
case '\\': case '\\':
case '"': case '"':
break; break;
case '0': case '0':
ch = '\0'; ch = '\0';
break; break;
skipping to change at line 523 skipping to change at line 561
case 'r': case 'r':
ch = '\r'; ch = '\r';
break; break;
case 't': case 't':
ch = '\t'; ch = '\t';
break; break;
case 'x': { case 'x': {
if (++i == s.end()) if (++i == s.end())
goto bad_escaping; goto bad_escaping;
char ch1 = *i; char ch1 = *i;
if (!C_isxdigit(ch1)) {
bad_hex_digit:
report_location(DIAG_ERROR, filename,
line_no, i - s.begin());
cerr << "Bad hex digit in escaping\n";
--i;
break;
}
if (++i == s.end()) if (++i == s.end())
goto bad_escaping; goto bad_escaping;
char ch2 = *i; char ch2 = *i;
if (!C_isxdigit(ch1) || if (!C_isxdigit(ch2)) {
!C_isxdigit(ch2)) goto bad_hex_digit;
goto bad_escaping; }
ch = hex_digit(ch1) << 4 | ch = hex_digit(ch1) << 4 |
hex_digit(ch2); hex_digit(ch2);
break; break;
} }
default: default:
goto bad_escaping; report_location(DIAG_ERROR, filename,
line_no, i - s.begin());
cerr << "Bad escape sequence '\\" << ch
<< "'\n";
break;
} }
arg += ch; arg += ch;
j = i + 1; j = i + 1;
} }
vals.emplace_back(std::move(arg)); vals.emplace_back(std::move(arg));
if (i == s.end() || C_isspace(*i)) break; if (i == s.end() || C_isspace(*i)) break;
if (*i != ',') { if (*i == ',') {
++i;
} else {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i - s.begin()); i - s.begin());
cerr << "Unexpected character '" << *i cerr << "Unexpected character '" << *i
<< "' after closing quote" << endl; << "' after closing quote\n";
exit(1); do {
++i;
} while (i != s.end() && *i != ',' && !C_isspace(*i))
;
if (*i != ',') break;
++i;
} }
++i;
} else if (max_args > 1) { } else if (max_args > 1) {
// Unquoted argument, split on comma. // Unquoted argument, split on comma.
i = find_if(j, s.end(), i = find_if(j, s.end(),
[](char ch) { [](char ch) {
return C_isspace(ch) || ch == ','; return C_isspace(ch) || ch == ',';
}); });
vals.emplace_back(j, i); vals.emplace_back(j, i);
if (*i != ',') break; if (*i != ',') break;
++i; ++i;
} else { } else {
skipping to change at line 570 skipping to change at line 625
i = find_if(j, s.end(), i = find_if(j, s.end(),
[](char ch) { return C_isspace(ch); }); [](char ch) { return C_isspace(ch); });
vals.emplace_back(j, i); vals.emplace_back(j, i);
break; break;
} }
j = i; j = i;
if (vals.size() == max_args) { if (vals.size() == max_args) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i - s.begin()); i - s.begin());
cerr << "Index action '" << action cerr << "Index action '" << action << "' takes at most "
<< "' takes at most " << max_args << " arguments" << max_args << " arguments\n";
<< endl;
exit(1);
} }
} }
if (vals.size() < min_args) { if (vals.size() < min_args) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i - s.begin()); i - s.begin());
if (min_args == max_args) { if (min_args == max_args) {
cerr << "Index action '" << action cerr << "Index action '" << action << "' requires "
<< "' requires " << min_args << " arguments" << min_args << " arguments\n";
<< endl; } else {
exit(1); cerr << "Index action '" << action << "' requires "
"at least " << min_args << " arguments\n";
} }
cerr << "Index action '" << action // Allow action handling code to assume there are min_args
<< "' requires at least " << min_args << " arguments" // arguments.
<< endl; vals.resize(min_args);
exit(1);
} }
string val; string val;
if (!vals.empty()) { if (!vals.empty()) {
val = vals.front(); val = vals.front();
} }
if (takes_integer_argument) { if (takes_integer_argument) {
auto dot = val.find('.'); auto dot = val.find('.');
if (dot != string::npos) { if (dot != string::npos) {
report_location(DIAG_WARN, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
j - s.begin() + dot); j - s.begin() + dot);
cerr << "Index action '" << action cerr << "Index action '" << action
<< "' takes an integer argument" << endl; << "' takes an integer argument\n";
} }
} }
switch (code) { switch (code) {
case Action::DATE: case Action::DATE:
if (val != "unix" && if (val != "unix" &&
val != "unixutc" && val != "unixutc" &&
val != "yyyymmdd") { val != "yyyymmdd") {
report_location(DIAG_ERROR, filename, line_no); report_location(DIAG_ERROR, filename, line_no,
cerr << "Invalid parameter '" << val << "' for " j - s.begin());
"action 'date'" << endl; cerr << "Invalid parameter '" << val
exit(1); << "' for action 'date'\n";
} }
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
break; break;
case Action::INDEX: case Action::INDEX:
case Action::INDEXNOPOS: case Action::INDEXNOPOS:
actions.emplace_back(code, action_pos, val, weight); actions.emplace_back(code, action_pos, val, weight);
useless_weight_pos = string::npos; useless_weight_pos = string::npos;
break; break;
case Action::WEIGHT: case Action::WEIGHT:
// We don't push an Action for WEIGHT - instead we // We don't push an Action for WEIGHT - instead we
// store it ready to use in the INDEX and INDEXNOPOS // store it ready to use in the INDEX and INDEXNOPOS
// Actions. // Actions.
weight = atoi(val.c_str()); if (!parse_unsigned(val.c_str(), weight)) {
report_location(DIAG_ERROR, filename, line_no,
j - s.begin());
cerr << "Index action 'weight' takes a "
"non-negative integer argument\n";
weight = 0;
}
if (useless_weight_pos != string::npos) { if (useless_weight_pos != string::npos) {
report_useless_action(filename, line_no, report_useless_action(filename, line_no,
useless_weight_pos, action); useless_weight_pos, action);
} }
useless_weight_pos = action_pos; useless_weight_pos = action_pos;
break; break;
case Action::PARSEDATE: { case Action::PARSEDATE: {
if (val.find("%Z") != val.npos) { auto bad_code = val.find("%Z");
report_location(DIAG_ERROR, filename, line_no); if (bad_code != val.npos) {
cerr << "Parsing timezone names with %Z is not supported" << endl; report_location(DIAG_ERROR, filename, line_no,
j - s.begin() + bad_code);
exit(1); cerr << "Parsing timezone names with %Z is not "
"supported\n";
} }
#ifndef HAVE_STRUCT_TM_TM_GMTOFF #ifndef HAVE_STRUCT_TM_TM_GMTOFF
if (val.find("%z") != val.npos) { bad_code = val.find("%z");
report_location(DIAG_ERROR, filename, line_no); if (bad_code != val.npos) {
cerr << "Parsing timezone offsets with %z is not supported on " report_location(DIAG_ERROR, filename, line_no,
j - s.begin() + bad_code);
"this platform" << endl; cerr << "Parsing timezone offsets with %z is not "
exit(1); "supported on this platform\n";
} }
#endif #endif
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
break; break;
} }
case Action::SPLIT: { case Action::SPLIT: {
if (val.empty()) { if (val.empty()) {
report_location(DIAG_ERROR, filename, line_no); report_location(DIAG_ERROR, filename, line_no,
cerr << "Split delimiter can't be empty" << endl; j - s.begin());
exit(1); cerr << "Split delimiter can't be empty\n";
} }
int operation = Action::SPLIT_NONE; int operation = Action::SPLIT_NONE;
if (vals.size() >= 2) { if (vals.size() >= 2) {
if (vals[1] == "dedup") { if (vals[1] == "dedup") {
operation = Action::SPLIT_DEDUP; operation = Action::SPLIT_DEDUP;
} else if (vals[1] == "sort") { } else if (vals[1] == "sort") {
operation = Action::SPLIT_SORT; operation = Action::SPLIT_SORT;
} else if (vals[1] == "none") { } else if (vals[1] == "none") {
operation = Action::SPLIT_NONE; operation = Action::SPLIT_NONE;
} else if (vals[1] == "prefixes") { } else if (vals[1] == "prefixes") {
operation = Action::SPLIT_PREFIXES; operation = Action::SPLIT_PREFIXES;
} else { } else {
report_location(DIAG_ERROR, filename, line_no); // FIXME: Column should be for where the `op`
// parameter starts, which this isn't if the
// value is quoted, contains escape sequences,
// etc.
report_location(DIAG_ERROR, filename, line_no,
i - s.begin() - vals[1].size());
cerr << "Bad split operation '" << vals[1] cerr << "Bad split operation '" << vals[1]
<< "'" << endl; << "'\n";
exit(1);
} }
} }
actions.emplace_back(code, action_pos, val, operation); actions.emplace_back(code, action_pos, val, operation);
break; break;
} }
case Action::TRUNCATE: case Action::TRUNCATE:
if (!actions.empty() && if (!actions.empty() &&
actions.back().get_action() == Action::LOAD) { actions.back().get_action() == Action::LOAD) {
/* Turn "load truncate=n" into "load" with /* Turn "load truncate=n" into "load" with
* num_arg n, so that we don't needlessly * num_arg n, so that we don't needlessly
* allocate memory and read data we're just * allocate memory and read data we're just
* going to ignore. * going to ignore.
*/ */
actions.pop_back(); actions.pop_back();
code = Action::LOAD; code = Action::LOAD;
} }
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
break; break;
case Action::UNIQUE: case Action::UNIQUE:
if (unique_line_no >= 0) { if (unique_line_no) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
action_pos); action_pos);
cerr << "Index action 'unique' used more than once" cerr << "Index action 'unique' used more than "
<< endl; "once\n";
report_location(DIAG_NOTE, filename, report_location(DIAG_NOTE, filename,
unique_line_no); unique_line_no, unique_pos);
cerr << "Previously used here" << endl; cerr << "Previously used here\n";
exit(1);
} }
unique_line_no = line_no; unique_line_no = line_no;
unique_pos = action_pos;
if (boolmap.find(val) == boolmap.end()) if (boolmap.find(val) == boolmap.end())
boolmap[val] = Action::UNIQUE; boolmap[val] = Action::UNIQUE;
if (vals.size() >= 2) {
if (vals[1] == "missing=error") {
unique_missing = UNIQUE_ERROR;
} else if (vals[1] == "missing=new") {
unique_missing = UNIQUE_NEW;
} else if (vals[1] == "missing=warn+new") {
unique_missing = UNIQUE_WARN_NEW;
} else if (vals[1] == "missing=skip") {
unique_missing = UNIQUE_SKIP;
} else if (vals[1] == "missing=warn+skip") {
unique_missing = UNIQUE_WARN_SKIP;
} else {
report_location(DIAG_ERROR, filename, line_no);
cerr << "Bad unique parameter '" << vals[1]
<< "'\n";
}
}
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
break; break;
case Action::GAP: { case Action::GAP: {
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
auto& obj = actions.back(); auto& obj = actions.back();
auto gap_size = obj.get_num_arg(); auto gap_size = obj.get_num_arg();
if (gap_size <= 0) { if (gap_size <= 0) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
obj.get_pos() + 3 + 1); obj.get_pos() + 3 + 1);
cerr << "Index action 'gap' takes a strictly " cerr << "Index action 'gap' takes a strictly "
"positive integer argument" << endl; "positive integer argument\n";
exit(1);
} }
break; break;
} }
case Action::HASH: { case Action::HASH: {
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
auto& obj = actions.back(); auto& obj = actions.back();
auto max_length = obj.get_num_arg(); auto max_length = obj.get_num_arg();
if (max_length < 6) { if (max_length < 6) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
obj.get_pos() + 4 + 1); obj.get_pos() + 4 + 1);
cerr << "Index action 'hash' takes an integer " cerr << "Index action 'hash' takes an integer "
"argument which must be at least 6" << endl; "argument which must be at least 6\n";
exit(1);
} }
break; break;
} }
case Action::LTRIM: case Action::LTRIM:
case Action::RTRIM: case Action::RTRIM:
case Action::SQUASH: case Action::SQUASH:
case Action::TRIM: case Action::TRIM:
for (unsigned char ch : val) { for (unsigned char ch : val) {
if (ch >= 0x80) { if (ch >= 0x80) {
auto column = actions.back().get_pos() + auto column = actions.back().get_pos() +
strlen(action_names[code]) + 1; strlen(action_names[code]) + 1;
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
column); column);
cerr << "Index action '" << action_names[code] cerr << "Index action '" << action_names[code]
<< "' only support ASCII characters " << "' only support ASCII characters "
"currently\n"; "currently\n";
exit(1);
} }
} }
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
break; break;
case Action::BOOLEAN: case Action::BOOLEAN:
boolmap[val] = Action::BOOLEAN; boolmap[val] = Action::BOOLEAN;
/* FALLTHRU */ /* FALLTHRU */
default: default:
actions.emplace_back(code, action_pos, val); actions.emplace_back(code, action_pos, val);
} }
i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); }); i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
} else { } else {
if (min_args > 0) { if (min_args > 0) {
report_location(DIAG_ERROR, filename, line_no, report_location(DIAG_ERROR, filename, line_no,
i_after_action - s.begin()); i_after_action - s.begin());
if (min_args == max_args) { if (min_args == max_args) {
cerr << "Index action '" << action << "' requires " cerr << "Index action '" << action << "' requires "
<< min_args << " arguments" << endl; << min_args << " arguments\n";
exit(1); } else {
cerr << "Index action '" << action << "' requires "
"at least " << min_args << " arguments\n";
} }
cerr << "Index action '" << action << "' requires at least "
<< min_args << " arguments" << endl;
exit(1);
} }
switch (code) { switch (code) {
case Action::INDEX: case Action::INDEX:
case Action::INDEXNOPOS: case Action::INDEXNOPOS:
useless_weight_pos = string::npos; useless_weight_pos = string::npos;
actions.emplace_back(code, action_pos, "", weight); actions.emplace_back(code, action_pos, "", weight);
break; break;
case Action::GAP: case Action::GAP:
actions.emplace_back(code, action_pos, "", 100); actions.emplace_back(code, action_pos, "", 100);
break; break;
skipping to change at line 832 skipping to change at line 911
break; break;
default: default:
break; break;
} }
if (done) break; if (done) break;
} }
map<string, Action::type>::const_iterator boolpfx; map<string, Action::type>::const_iterator boolpfx;
for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) { for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
if (boolpfx->second == Action::UNIQUE) { if (boolpfx->second == Action::UNIQUE) {
report_location(DIAG_WARN, filename, line_no); report_location(DIAG_WARN, filename, unique_line_no,
unique_pos);
cerr << "Index action 'unique=" << boolpfx->first cerr << "Index action 'unique=" << boolpfx->first
<< "' without 'boolean=" << boolpfx->first << "'" << endl; << "' without 'boolean=" << boolpfx->first << "'\n";
static bool given_doesnt_imply_boolean_warning = false; static bool given_doesnt_imply_boolean_warning = false;
if (!given_doesnt_imply_boolean_warning) { if (!given_doesnt_imply_boolean_warning) {
given_doesnt_imply_boolean_warning = true; given_doesnt_imply_boolean_warning = true;
report_location(DIAG_NOTE, filename, line_no); report_location(DIAG_NOTE, filename, unique_line_no,
cerr << "'unique' doesn't implicitly add a boolean term" unique_pos);
<< endl; cerr << "'unique' doesn't implicitly add a boolean term\n";
} }
} }
} }
vector<string>::const_iterator field; vector<string>::const_iterator field;
for (field = fields.begin(); field != fields.end(); ++field) { for (field = fields.begin(); field != fields.end(); ++field) {
vector<Action> &v = index_spec[*field]; vector<Action> &v = index_spec[*field];
if (v.empty()) { if (v.empty()) {
if (fields.size() == 1) { if (fields.size() == 1) {
// Optimise common case where there's only one fieldname // Optimise common case where there's only one fieldname
skipping to change at line 865 skipping to change at line 945
} }
} else { } else {
v.emplace_back(Action::NEW, string::npos); v.emplace_back(Action::NEW, string::npos);
v.insert(v.end(), actions.begin(), actions.end()); v.insert(v.end(), actions.begin(), actions.end());
} }
} }
} }
if (index_spec.empty()) { if (index_spec.empty()) {
report_location(DIAG_ERROR, filename, line_no); report_location(DIAG_ERROR, filename, line_no);
cerr << "No rules found in index script" << endl; cerr << "No rules found in index script\n";
}
if (error_count) {
exit(1); exit(1);
} }
index_spec_uses_unique = (unique_line_no > 0);
} }
static bool static bool
run_actions(vector<Action>::const_iterator action_it, run_actions(vector<Action>::const_iterator action_it,
vector<Action>::const_iterator action_end, vector<Action>::const_iterator action_end,
Xapian::WritableDatabase& database, Xapian::WritableDatabase& database,
Xapian::TermGenerator& indexer, Xapian::TermGenerator& indexer,
const string& old_value, const string& old_value,
bool& this_field_is_content, Xapian::Document& doc, bool& this_field_is_content, Xapian::Document& doc,
map<string, list<string>>& fields, map<string, list<string>>& fields,
skipping to change at line 940 skipping to change at line 1025
case Action::HASH: { case Action::HASH: {
unsigned int max_length = action.get_num_arg(); unsigned int max_length = action.get_num_arg();
if (value.length() > max_length) if (value.length() > max_length)
value = hash_long_term(value, max_length); value = hash_long_term(value, max_length);
break; break;
} }
case Action::HEXTOBIN: { case Action::HEXTOBIN: {
size_t len = value.length(); size_t len = value.length();
if (len & 1) { if (len & 1) {
report_location(DIAG_ERROR, fname, line_no); report_location(DIAG_ERROR, fname, line_no);
cerr << "hextobin: input must have even length" cerr << "hextobin: input must have even length\n";
<< endl; exit(1);
} else { }
string output;
output.reserve(len / 2); string output;
for (size_t j = 0; j < len; j += 2) { output.reserve(len / 2);
char a = value[j]; for (size_t j = 0; j < len; j += 2) {
char b = value[j + 1]; char a = value[j];
if (!C_isxdigit(a) || !C_isxdigit(b)) { char b = value[j + 1];
report_location(DIAG_ERROR, fname, line_no); if (!C_isxdigit(a) || !C_isxdigit(b)) {
cerr << "hextobin: input must be all hex " report_location(DIAG_ERROR, fname, line_no);
"digits" << endl; cerr << "hextobin: input must be all hex digits\n";
goto badhex; exit(1);
}
char r = (hex_digit(a) << 4) | hex_digit(b);
output.push_back(r);
} }
value = std::move(output); char r = (hex_digit(a) << 4) | hex_digit(b);
output.push_back(r);
} }
badhex: value = std::move(output);
break; break;
} }
case Action::LOWER: case Action::LOWER:
value = Xapian::Unicode::tolower(value); value = Xapian::Unicode::tolower(value);
break; break;
case Action::LTRIM: case Action::LTRIM:
ltrim(value, action.get_string_arg()); ltrim(value, action.get_string_arg());
break; break;
case Action::RTRIM: case Action::RTRIM:
rtrim(value, action.get_string_arg()); rtrim(value, action.get_string_arg());
skipping to change at line 982 skipping to change at line 1065
rtrim(value, action.get_string_arg()); rtrim(value, action.get_string_arg());
ltrim(value, action.get_string_arg()); ltrim(value, action.get_string_arg());
break; break;
case Action::SQUASH: case Action::SQUASH:
squash(value, action.get_string_arg()); squash(value, action.get_string_arg());
break; break;
case Action::LOAD: { case Action::LOAD: {
// If there's no input, just issue a warning. // If there's no input, just issue a warning.
if (value.empty()) { if (value.empty()) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "Empty filename in LOAD action" << endl; cerr << "Empty filename in LOAD action\n";
break; break;
} }
bool truncated = false; bool truncated = false;
string filename = std::move(value); string filename = std::move(value);
// FIXME: Use NOATIME if we own the file or are root. // FIXME: Use NOATIME if we own the file or are root.
if (!load_file(filename, action.get_num_arg(), NOCACHE, if (!load_file(filename, action.get_num_arg(), NOCACHE,
value, truncated)) { value, truncated)) {
report_location(DIAG_ERROR, fname, line_no); report_location(DIAG_ERROR, fname, line_no);
cerr << "Couldn't load file '" << filename << "': " cerr << "Couldn't load file '" << filename << "': "
<< strerror(errno) << endl; << strerror(errno) << '\n';
value.resize(0); exit(1);
break;
} }
if (!truncated) break; if (!truncated) break;
} }
/* FALLTHRU */ /* FALLTHRU */
case Action::TRUNCATE: case Action::TRUNCATE:
utf8_truncate(value, action.get_num_arg()); utf8_truncate(value, action.get_num_arg());
break; break;
case Action::SPELL: case Action::SPELL:
indexer.set_flags(indexer.FLAG_SPELLING); indexer.set_flags(indexer.FLAG_SPELLING);
break; break;
skipping to change at line 1142 skipping to change at line 1224
p.reset(); p.reset();
p.parse_html(value, newcharset, true); p.parse_html(value, newcharset, true);
} }
if (p.indexing_allowed) if (p.indexing_allowed)
value = p.dump; value = p.dump;
else else
value = ""; value = "";
break; break;
} }
case Action::UNIQUE: { case Action::UNIQUE: {
// If there's no text, just issue a warning. unique_unused = false;
if (value.empty()) { if (value.empty()) {
report_location(DIAG_WARN, fname, line_no); enum diag_type diag = DIAG_WARN;
cerr << "Ignoring UNIQUE action on empty text" switch (unique_missing) {
<< endl; case UNIQUE_ERROR:
diag = DIAG_ERROR;
/* FALLTHRU */
case UNIQUE_WARN_NEW:
case UNIQUE_WARN_SKIP:
report_location(diag, fname, line_no);
cerr << "UNIQUE action on empty text\n";
default:
break;
}
switch (unique_missing) {
case UNIQUE_ERROR:
exit(1);
case UNIQUE_SKIP:
case UNIQUE_WARN_SKIP:
skipping_record = true;
break;
case UNIQUE_NEW:
case UNIQUE_WARN_NEW:
break;
}
break; break;
} }
// Ensure that the value of this field is unique. // Ensure that the value of this field is unique.
// If a record already exists with the same value, // If a record already exists with the same value,
// it will be replaced with the new record. // it will be replaced with the new record.
// Unique fields aren't considered content - if // Unique fields aren't considered content - if
// there are no other fields in the document, the // there are no other fields in the document, the
// document is to be deleted. // document is to be deleted.
skipping to change at line 1181 skipping to change at line 1284
if (!value.empty()) if (!value.empty())
doc.add_value(action.get_num_arg(), value); doc.add_value(action.get_num_arg(), value);
break; break;
case Action::VALUENUMERIC: { case Action::VALUENUMERIC: {
if (value.empty()) break; if (value.empty()) break;
char * end; char * end;
double dbl = strtod(value.c_str(), &end); double dbl = strtod(value.c_str(), &end);
if (*end) { if (*end) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "Trailing characters in VALUENUMERIC: '" cerr << "Trailing characters in VALUENUMERIC: '"
<< value << "'" << endl; << value << "'\n";
} }
doc.add_value(action.get_num_arg(), doc.add_value(action.get_num_arg(),
Xapian::sortable_serialise(dbl)); Xapian::sortable_serialise(dbl));
break; break;
} }
case Action::VALUEPACKED: { case Action::VALUEPACKED: {
uint32_t word = 0; uint32_t word = 0;
if (value.empty() || !C_isdigit(value[0])) { if (value.empty() || !C_isdigit(value[0])) {
// strtoul() accepts leading whitespace and negated // strtoul() accepts leading whitespace and negated
// values, neither of which we want to allow. // values, neither of which we want to allow.
skipping to change at line 1206 skipping to change at line 1309
word = strtoul(value.c_str(), &q, 10); word = strtoul(value.c_str(), &q, 10);
if (!errno && *q != '\0') { if (!errno && *q != '\0') {
// Trailing characters after converted value. // Trailing characters after converted value.
errno = EINVAL; errno = EINVAL;
} }
} }
if (errno) { if (errno) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "valuepacked \"" << value << "\" "; cerr << "valuepacked \"" << value << "\" ";
if (errno == ERANGE) { if (errno == ERANGE) {
cerr << "out of range"; cerr << "out of range\n";
} else { } else {
cerr << "not an unsigned integer"; cerr << "not an unsigned integer\n";
} }
cerr << endl;
} }
int valueslot = action.get_num_arg(); int valueslot = action.get_num_arg();
doc.add_value(valueslot, int_to_binary_string(word)); doc.add_value(valueslot, int_to_binary_string(word));
break; break;
} }
case Action::DATE: { case Action::DATE: {
// Do nothing for empty input. // Do nothing for empty input.
if (value.empty()) break; if (value.empty()) break;
const string & type = action.get_string_arg(); const string & type = action.get_string_arg();
string yyyymmdd; string yyyymmdd;
if (type == "unix") { if (type == "unix") {
time_t t; time_t t;
if (!parse_signed(value.c_str(), t)) { if (!parse_signed(value.c_str(), t)) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "Date value (in secs) for action DATE " cerr << "Date value (in secs) for action DATE "
"must be an integer - ignoring" << endl; "must be an integer - ignoring\n";
break; break;
} }
struct tm *tm = localtime(&t); struct tm *tm = localtime(&t);
int y = tm->tm_year + 1900; int y = tm->tm_year + 1900;
int m = tm->tm_mon + 1; int m = tm->tm_mon + 1;
yyyymmdd = date_to_string(y, m, tm->tm_mday); yyyymmdd = date_to_string(y, m, tm->tm_mday);
} else if (type == "unixutc") { } else if (type == "unixutc") {
time_t t; time_t t;
if (!parse_signed(value.c_str(), t)) { if (!parse_signed(value.c_str(), t)) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "Date value (in secs) for action DATE " cerr << "Date value (in secs) for action DATE "
"must be an integer - ignoring" << endl; "must be an integer - ignoring\n";
break; break;
} }
struct tm *tm = gmtime(&t); struct tm *tm = gmtime(&t);
int y = tm->tm_year + 1900; int y = tm->tm_year + 1900;
int m = tm->tm_mon + 1; int m = tm->tm_mon + 1;
yyyymmdd = date_to_string(y, m, tm->tm_mday); yyyymmdd = date_to_string(y, m, tm->tm_mday);
} else if (type == "yyyymmdd") { } else if (type == "yyyymmdd") {
if (value.length() != 8) { if (value.length() != 8) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "date=yyyymmdd expects an 8 character value " cerr << "date=yyyymmdd expects an 8 character value "
"- ignoring" << endl; "- ignoring\n";
break; break;
} }
yyyymmdd = value; yyyymmdd = value;
} }
// Date (YYYYMMDD) // Date (YYYYMMDD)
doc.add_boolean_term("D" + yyyymmdd); doc.add_boolean_term("D" + yyyymmdd);
yyyymmdd.resize(6); yyyymmdd.resize(6);
// Month (YYYYMM) // Month (YYYYMM)
doc.add_boolean_term("M" + yyyymmdd); doc.add_boolean_term("M" + yyyymmdd);
skipping to change at line 1274 skipping to change at line 1376
break; break;
} }
case Action::PARSEDATE: { case Action::PARSEDATE: {
string dateformat = action.get_string_arg(); string dateformat = action.get_string_arg();
struct tm tm; struct tm tm;
memset(&tm, 0, sizeof(tm)); memset(&tm, 0, sizeof(tm));
auto ret = strptime(value.c_str(), dateformat.c_str(), &tm); auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
if (ret == NULL) { if (ret == NULL) {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "\"" << value << "\" doesn't match format " cerr << "\"" << value << "\" doesn't match format "
"\"" << dateformat << '\"' << endl; "\"" << dateformat << '\"' << '\n';
break; break;
} }
if (*ret != '\0') { if (*ret != '\0') {
report_location(DIAG_WARN, fname, line_no); report_location(DIAG_WARN, fname, line_no);
cerr << "\"" << value << "\" not fully matched by " cerr << "\"" << value << "\" not fully matched by "
"format \"" << dateformat << "\" " "format \"" << dateformat << "\" "
"(\"" << ret << "\" left over) but " "(\"" << ret << "\" left over) but "
"indexing anyway" << endl; "indexing anyway\n";
} }
#ifdef HAVE_STRUCT_TM_TM_GMTOFF #ifdef HAVE_STRUCT_TM_TM_GMTOFF
auto gmtoff = tm.tm_gmtoff; auto gmtoff = tm.tm_gmtoff;
#endif #endif
auto secs_since_epoch = timegm(&tm); auto secs_since_epoch = timegm(&tm);
#ifdef HAVE_STRUCT_TM_TM_GMTOFF #ifdef HAVE_STRUCT_TM_TM_GMTOFF
secs_since_epoch -= gmtoff; secs_since_epoch -= gmtoff;
#endif #endif
value = str(secs_since_epoch); value = str(secs_since_epoch);
break; break;
skipping to change at line 1310 skipping to change at line 1412
} }
return true; return true;
} }
static void static void
index_file(const char *fname, istream &stream, index_file(const char *fname, istream &stream,
Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer) Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
{ {
string line; string line;
size_t line_no = 0; size_t line_no = 0;
while (!stream.eof() && getline(stream, line)) { while (!stream.eof() && getline_portable(stream, line)) {
++line_no; ++line_no;
// Allow blank lines before the first record and multiple blank lines
// between records.
if (line.empty()) continue;
Xapian::Document doc; Xapian::Document doc;
indexer.set_document(doc); indexer.set_document(doc);
Xapian::docid docid = 0; Xapian::docid docid = 0;
map<string, list<string>> fields; map<string, list<string>> fields;
bool seen_content = false; bool seen_content = false;
skipping_record = false;
unique_unused = index_spec_uses_unique;
while (!line.empty()) { while (!line.empty()) {
// Cope with files from MS Windows (\r\n end of lines).
// Trim multiple \r characters, since that seems the best way
// to handle that case.
string::size_type last = line.find_last_not_of('\r');
if (last == string::npos) break;
line.resize(last + 1);
string::size_type eq = line.find('='); string::size_type eq = line.find('=');
if (eq == string::npos && !line.empty()) { if (eq == string::npos && !line.empty()) {
report_location(DIAG_ERROR, fname, line_no, line.size()); report_location(DIAG_ERROR, fname, line_no);
cerr << "expected = somewhere in this line" << endl; cerr << "Expected = somewhere in this line\n";
// FIXME: die or what? exit(1);
} }
string field(line, 0, eq); string field(line, 0, eq);
string value(line, eq + 1, string::npos); string value(line, eq + 1, string::npos);
while (getline(stream, line)) { line.clear();
while (getline_portable(stream, line)) {
++line_no; ++line_no;
if (line.empty() || line[0] != '=') break; if (line.empty() || line[0] != '=') break;
// Cope with files from MS Windows (\r\n end of lines). // Replace the '=' with a '\n'.
// Trim multiple \r characters, since that seems the best way
// to handle that case.
last = line.find_last_not_of('\r');
// line[0] == '=', so last != string::npos.
// Replace the '=' with a '\n' so we don't have to use substr.
line[0] = '\n'; line[0] = '\n';
line.resize(last + 1);
value += line; value += line;
} }
if (skipping_record) continue;
// Default to not indexing spellings. // Default to not indexing spellings.
indexer.set_flags(Xapian::TermGenerator::flags(0)); indexer.set_flags(Xapian::TermGenerator::flags(0));
bool this_field_is_content = true; bool this_field_is_content = true;
const vector<Action>& v = index_spec[field]; const vector<Action>& v = index_spec[field];
run_actions(v.begin(), v.end(), run_actions(v.begin(), v.end(),
database, indexer, value, database, indexer, value,
this_field_is_content, doc, fields, this_field_is_content, doc, fields,
field, fname, line_no, field, fname, line_no,
docid); docid);
if (this_field_is_content) seen_content = true; if (this_field_is_content) seen_content = true;
if (stream.eof()) break;
} }
// If we haven't seen any fields (other than unique identifiers) if (unique_unused) {
// the document is to be deleted. enum diag_type diag = DIAG_WARN;
if (!seen_content) { switch (unique_missing) {
case UNIQUE_ERROR:
diag = DIAG_ERROR;
/* FALLTHRU */
case UNIQUE_WARN_NEW:
case UNIQUE_WARN_SKIP:
report_location(diag, fname, line_no);
cerr << "UNIQUE action unused in this record\n";
default:
break;
}
switch (unique_missing) {
case UNIQUE_ERROR:
exit(1);
case UNIQUE_SKIP:
case UNIQUE_WARN_SKIP:
skipping_record = true;
break;
case UNIQUE_NEW:
case UNIQUE_WARN_NEW:
break;
}
}
if (skipping_record) {
++skipcount;
} else if (!seen_content) {
// We haven't seen any fields (other than unique identifiers)
// so the document is to be deleted.
if (docid) { if (docid) {
database.delete_document(docid); database.delete_document(docid);
if (verbose) cout << "Del: " << docid << endl; if (verbose) cout << "Del: " << docid << '\n';
++delcount; ++delcount;
} }
} else { } else {
string data; string data;
for (auto&& i : fields) { for (auto&& i : fields) {
for (auto&& field_val : i.second) { for (auto&& field_val : i.second) {
data += i.first; data += i.first;
data += '='; data += '=';
data += field_val; data += field_val;
data += '\n'; data += '\n';
} }
} }
// Put the data in the document // Put the data in the document
doc.set_data(data); doc.set_data(data);
// Add the document to the database // Add the document to the database
if (docid) { if (docid) {
database.replace_document(docid, doc); database.replace_document(docid, doc);
if (verbose) cout << "Replace: " << docid << endl; if (verbose) cout << "Replace: " << docid << '\n';
++repcount; ++repcount;
} else { } else {
docid = database.add_document(doc); docid = database.add_document(doc);
if (verbose) cout << "Add: " << docid << endl; if (verbose) cout << "Add: " << docid << '\n';
++addcount; ++addcount;
} }
} }
} }
// Commit after each file to make sure all changes from that file make it // Commit after each file to make sure all changes from that file make it
// in. // in.
if (verbose) cout << "Committing: " << endl; if (verbose) cout << "Committing\n";
database.commit(); database.commit();
} }
static void static void
show_help(int exit_code) show_help(int exit_code)
{ {
cout << PROG_NAME " - " PROG_DESC "\n" cout << PROG_NAME " - " PROG_DESC "\n"
"Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n" "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
"\n" "\n"
"Creates or updates a Xapian database with the data from the input files listed\ n" "Creates or updates a Xapian database with the data from the input files listed\ n"
skipping to change at line 1471 skipping to change at line 1596
break; break;
case 'v': case 'v':
verbose = true; verbose = true;
break; break;
case 's': case 's':
try { try {
stemmer = Xapian::Stem(optarg); stemmer = Xapian::Stem(optarg);
} catch (const Xapian::InvalidArgumentError &) { } catch (const Xapian::InvalidArgumentError &) {
cerr << "Unknown stemming language '" << optarg << "'.\n"; cerr << "Unknown stemming language '" << optarg << "'.\n";
cerr << "Available language names are: " cerr << "Available language names are: "
<< Xapian::Stem::get_available_languages() << endl; << Xapian::Stem::get_available_languages() << '\n';
return 1; return 1;
} }
break; break;
} }
} }
argv += optind; argv += optind;
argc -= optind; argc -= optind;
if (argc < 2) { if (argc < 2) {
show_help(1); show_help(1);
skipping to change at line 1499 skipping to change at line 1624
Xapian::WritableDatabase database(argv[0], flags); Xapian::WritableDatabase database(argv[0], flags);
Xapian::TermGenerator indexer; Xapian::TermGenerator indexer;
indexer.set_stemmer(stemmer); indexer.set_stemmer(stemmer);
// Set the database for spellings to be added to by the "spell" action. // Set the database for spellings to be added to by the "spell" action.
indexer.set_database(database); indexer.set_database(database);
addcount = 0; addcount = 0;
repcount = 0; repcount = 0;
delcount = 0; delcount = 0;
skipcount = 0;
if (argc == 2) { if (argc == 2) {
// Read from stdin. // Read from stdin.
index_file("<stdin>", cin, database, indexer); index_file("<stdin>", cin, database, indexer);
} else { } else {
// Read file(s) listed on the command line. // Read file(s) listed on the command line.
for (int i = 2; i < argc; ++i) { for (int i = 2; i < argc; ++i) {
ifstream stream(argv[i]); ifstream stream(argv[i]);
if (stream) { if (stream) {
index_file(argv[i], stream, database, indexer); index_file(argv[i], stream, database, indexer);
} else { } else {
cerr << "Can't open file " << argv[i] << endl; cerr << "Can't open file " << argv[i] << '\n';
} }
} }
} }
cout << "records (added, replaced, deleted) = (" << addcount << ", " cout << "records (added, replaced, deleted, skipped) = ("
<< repcount << ", " << delcount << ")" << endl; << addcount << ", "
<< repcount << ", "
<< delcount << ", "
<< skipcount << ")\n";
} catch (const Xapian::Error &error) { } catch (const Xapian::Error &error) {
cerr << "Exception: " << error.get_description() << endl; cerr << "Exception: " << error.get_description() << '\n';
exit(1); exit(1);
} catch (const std::bad_alloc &) { } catch (const std::bad_alloc &) {
cerr << "Exception: std::bad_alloc" << endl; cerr << "Exception: std::bad_alloc\n";
exit(1); exit(1);
} catch (...) { } catch (...) {
cerr << "Unknown Exception" << endl; cerr << "Unknown Exception\n";
exit(1); exit(1);
} }
 End of changes. 94 change blocks. 
164 lines changed or deleted 292 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)