"Fossies" - the Fresh Open Source Software Archive

Member "recoll-1.26.3/query/recollq.cpp" (28 Nov 2019, 13816 Bytes) of package /linux/privat/recoll-1.26.3.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "recollq.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 1.26.1_vs_1.26.3.

    1 /* Copyright (C) 2006 J.F.Dockes
    2  *   This program is free software; you can redistribute it and/or modify
    3  *   it under the terms of the GNU General Public License as published by
    4  *   the Free Software Foundation; either version 2 of the License, or
    5  *   (at your option) any later version.
    6  *
    7  *   This program is distributed in the hope that it will be useful,
    8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
    9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   10  *   GNU General Public License for more details.
   11  *
   12  *   You should have received a copy of the GNU General Public License
   13  *   along with this program; if not, write to the
   14  *   Free Software Foundation, Inc.,
   15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
   16  */
   17 // Takes a query and runs it: no GUI, results go to stdout
   18 
   19 #include <stdio.h>
   20 #include <stdlib.h>
   21 #include <errno.h>
   22 #include <string.h>
   23 #include <limits.h>
   24 
   25 #include <iostream>
   26 #include <list>
   27 #include <string>
   28 
   29 #include "rcldb.h"
   30 #include "rclquery.h"
   31 #include "rclconfig.h"
   32 #include "pathut.h"
   33 #include "rclinit.h"
   34 #include "log.h"
   35 #include "wasatorcl.h"
   36 #include "internfile.h"
   37 #include "wipedir.h"
   38 #include "transcode.h"
   39 #include "textsplit.h"
   40 #include "smallut.h"
   41 #include "chrono.h"
   42 #include "base64.h"
   43 
   44 using namespace std;
   45 
   46 bool dump_contents(RclConfig *rclconfig, Rcl::Doc& idoc)
   47 {
   48     FileInterner interner(idoc, rclconfig, FileInterner::FIF_forPreview);
   49     Rcl::Doc fdoc;
   50     string ipath = idoc.ipath;
   51     if (interner.internfile(fdoc, ipath)) {
   52     cout << fdoc.text << endl;
   53     } else {
   54     cout << "Cant turn to text:" << idoc.url << " | " << idoc.ipath << endl;
   55     }
   56     return true;
   57 }
   58 
   59 void output_fields(vector<string> fields, Rcl::Doc& doc,
   60            Rcl::Query& query, Rcl::Db& rcldb, bool printnames)
   61 {
   62     if (fields.empty()) {
   63         map<string,string>::const_iterator it;
   64         for (const auto& entry : doc.meta) {
   65             fields.push_back(entry.first);
   66         }
   67     }
   68     for (vector<string>::const_iterator it = fields.begin();
   69      it != fields.end(); it++) {
   70     string out;
   71     if (!it->compare("abstract")) {
   72         string abstract;
   73         query.makeDocAbstract(doc, abstract);
   74         base64_encode(abstract, out);
   75         } else if (!it->compare("xdocid")) {
   76             char cdocid[30];
   77             sprintf(cdocid, "%lu", (unsigned long)doc.xdocid);
   78             base64_encode(cdocid, out);
   79     } else {
   80         base64_encode(doc.meta[*it], out);
   81     }
   82         // Before printnames existed, recollq printed a single blank for empty
   83         // fields. This is a problem when printing names and using strtok, but
   84         // have to keep the old behaviour when printnames is not set.
   85         if (!(out.empty() && printnames)) {
   86             if (printnames)
   87                 cout << *it << " ";
   88             cout << out << " ";
   89         }
   90     }
   91     cout << endl;
   92 }
   93 
   94 static char *thisprog;
   95 static char usage [] =
   96 " -P: Show the date span for all the documents present in the index.\n"
   97 " [-o|-a|-f] [-q] <query string>\n"
   98 " Runs a recoll query and displays result lines. \n"
   99 "  Default: will interpret the argument(s) as a xesam query string.\n"
  100 "  Query elements: \n"
  101 "   * Implicit AND, exclusion, field spec:  t1 -t2 title:t3\n"
  102 "   * OR has priority: t1 OR t2 t3 OR t4 means (t1 OR t2) AND (t3 OR t4)\n"
  103 "   * Phrase: \"t1 t2\" (needs additional quoting on cmd line)\n"
  104 " -o Emulate the GUI simple search in ANY TERM mode.\n"
  105 " -a Emulate the GUI simple search in ALL TERMS mode.\n"
  106 " -f Emulate the GUI simple search in filename mode.\n"
  107 " -q is just ignored (compatibility with the recoll GUI command line).\n"
  108 "Common options:\n"
  109 " -c <configdir> : specify config directory, overriding $RECOLL_CONFDIR.\n"
  110 " -C : collapse duplicates\n"            
  111 " -d also dump file contents.\n"
  112 " -n [first-]<cnt> define the result slice. The default value for [first]\n"
  113 "    is 0. Without the option, the default max count is 2000.\n"
  114 "    Use n=0 for no limit.\n"
  115 " -b : basic. Just output urls, no mime types or titles.\n"
  116 " -Q : no result lines, just the processed query and result count.\n"
  117 " -m : dump the whole document meta[] array for each result.\n"
  118 " -A : output the document abstracts.\n"
  119 " -S fld : sort by field <fld>.\n"
  120 "   -D : sort descending.\n"
  121 " -s stemlang : set stemming language to use (must exist in index...).\n"
  122 "    Use -s \"\" to turn off stem expansion.\n"
  123 " -T <synonyms file>: use the parameter (Thesaurus) for word expansion.\n"
  124 " -i <dbdir> : additional index, several can be given.\n"
  125 " -e use url encoding (%xx) for urls.\n"
  126 " -E use exact result count instead of lower bound estimate"
  127 " -F <field name list> : output exactly these fields for each result.\n"
  128 "    The field values are encoded in base64, output in one line and \n"
  129 "    separated by one space character. This is the recommended format \n"
  130 "    for use by other programs. Use a normal query with option -m to \n"
  131 "    see the field names. Use -F '' to output all fields, but you probably\n"
  132 "    also want option -N in this case.\n"
  133 "  -N : with -F, print the (plain text) field names before the field values.\n"
  134 ;
  135 static void
  136 Usage(void)
  137 {
  138     cerr << thisprog <<  ": usage:" << endl << usage;
  139     exit(1);
  140 }
  141 
  142 // BEWARE COMPATIBILITY WITH recoll OPTIONS letters
  143 static int     op_flags;
  144 
  145 #define OPT_A     0x1
  146 // GUI: -a same
  147 #define OPT_a     0x2
  148 #define OPT_b     0x4
  149 #define OPT_C     0x8
  150 // GUI: -c same
  151 #define OPT_c     0x10 
  152 #define OPT_D     0x20 
  153 #define OPT_d     0x40 
  154 #define OPT_e     0x80 
  155 #define OPT_F     0x100
  156 // GUI: -f same
  157 #define OPT_f     0x200
  158 // GUI uses -h for help. us: usage
  159 #define OPT_i     0x400
  160 // GUI uses -L to set language of messages
  161 // GUI: -l same
  162 #define OPT_l     0x800
  163 #define OPT_m     0x1000
  164 #define OPT_N     0x2000
  165 #define OPT_n     0x4000
  166 // GUI: -o same
  167 #define OPT_o     0x8000
  168 #define OPT_P     0x10000
  169 #define OPT_Q     0x20000
  170 // GUI: -q same
  171 #define OPT_q     0x40000
  172 #define OPT_S     0x80000
  173 #define OPT_s     0x100000
  174 #define OPT_T     0x200000
  175 // GUI: -t use command line, us: ignored
  176 #define OPT_t     0x400000
  177 // GUI uses -v : show version. Us: usage
  178 // GUI uses -w : open minimized
  179 #define OPT_E     0x800000
  180 
  181 int recollq(RclConfig **cfp, int argc, char **argv)
  182 {
  183     string a_config;
  184     string sortfield;
  185     string stemlang("english");
  186     list<string> extra_dbs;
  187     string sf;
  188     vector<string> fields;
  189     string syngroupsfn;
  190     
  191     int firstres = 0;
  192     int maxcount = 2000;
  193     thisprog = argv[0];
  194     argc--; argv++;
  195 
  196     while (argc > 0 && **argv == '-') {
  197         (*argv)++;
  198         if (!(**argv))
  199             /* Cas du "adb - core" */
  200             Usage();
  201         while (**argv)
  202             switch (*(*argv)++) {
  203         case '-': 
  204         // -- : end of options
  205         if (*(*argv) != 0)
  206             Usage();
  207         goto endopts;
  208             case 'A':   op_flags |= OPT_A; break;
  209             case 'a':   op_flags |= OPT_a; break;
  210             case 'b':   op_flags |= OPT_b; break;
  211             case 'C':   op_flags |= OPT_C; break;
  212         case 'c':   op_flags |= OPT_c; if (argc < 2)  Usage();
  213         a_config = *(++argv);
  214         argc--; goto b1;
  215             case 'd':   op_flags |= OPT_d; break;
  216             case 'D':   op_flags |= OPT_D; break;
  217             case 'E':   op_flags |= OPT_E; break;
  218             case 'e':   op_flags |= OPT_e; break;
  219             case 'f':   op_flags |= OPT_f; break;
  220         case 'F':   op_flags |= OPT_F; if (argc < 2)  Usage();
  221         sf = *(++argv);
  222         argc--; goto b1;
  223         case 'i':   op_flags |= OPT_i; if (argc < 2)  Usage();
  224         extra_dbs.push_back(*(++argv));
  225         argc--; goto b1;
  226             case 'l':   op_flags |= OPT_l; break;
  227             case 'm':   op_flags |= OPT_m; break;
  228             case 'N':   op_flags |= OPT_N; break;
  229         case 'n':   op_flags |= OPT_n; if (argc < 2)  Usage();
  230         {
  231         string rescnt = *(++argv);
  232         string::size_type dash = rescnt.find("-");
  233         if (dash != string::npos) {
  234             firstres = atoi(rescnt.substr(0, dash).c_str());
  235             if (dash < rescnt.size()-1) {
  236             maxcount = atoi(rescnt.substr(dash+1).c_str());
  237             }
  238         } else {
  239             maxcount = atoi(rescnt.c_str());
  240         }
  241         if (maxcount <= 0) maxcount = INT_MAX;
  242         }
  243         argc--; goto b1;
  244             case 'o':   op_flags |= OPT_o; break;
  245             case 'P':   op_flags |= OPT_P; break;
  246             case 'q':   op_flags |= OPT_q; break;
  247             case 'Q':   op_flags |= OPT_Q; break;
  248         case 'S':   op_flags |= OPT_S; if (argc < 2)  Usage();
  249         sortfield = *(++argv);
  250         argc--; goto b1;
  251         case 's':   op_flags |= OPT_s; if (argc < 2)  Usage();
  252         stemlang = *(++argv);
  253         argc--; goto b1;
  254             case 't':   op_flags |= OPT_t; break;
  255         case 'T':   op_flags |= OPT_T; if (argc < 2)  Usage();
  256         syngroupsfn = *(++argv);
  257         argc--; goto b1;
  258             default: Usage();   break;
  259             }
  260     b1: argc--; argv++;
  261     }
  262 endopts:
  263 
  264     string reason;
  265     *cfp = recollinit(0, 0, 0, reason, &a_config);
  266     RclConfig *rclconfig = *cfp;
  267     if (!rclconfig || !rclconfig->ok()) {
  268     fprintf(stderr, "Recoll init failed: %s\n", reason.c_str());
  269     exit(1);
  270     }
  271 
  272     if (argc < 1 && !(op_flags & OPT_P)) {
  273     Usage();
  274     }
  275     if (op_flags & OPT_F) {
  276     if (op_flags & (OPT_b|OPT_d|OPT_b|OPT_Q|OPT_m|OPT_A))
  277         Usage();
  278     stringToStrings(sf, fields);
  279     }
  280     Rcl::Db rcldb(rclconfig);
  281     if (!extra_dbs.empty()) {
  282         for (list<string>::iterator it = extra_dbs.begin();
  283              it != extra_dbs.end(); it++) {
  284             if (!rcldb.addQueryDb(*it)) {
  285                 cerr << "Can't add index: " << *it << endl;
  286                 exit(1);
  287             }
  288         }
  289     }
  290     if (!syngroupsfn.empty()) {
  291         if (!rcldb.setSynGroupsFile(syngroupsfn)) {
  292             cerr << "Can't use synonyms file: " << syngroupsfn << endl;
  293             exit(1);
  294         }
  295     }
  296     
  297     if (!rcldb.open(Rcl::Db::DbRO)) {
  298     cerr << "Cant open database in " << rclconfig->getDbDir() << 
  299         " reason: " << rcldb.getReason() << endl;
  300     exit(1);
  301     }
  302 
  303     if (op_flags & OPT_P) {
  304         int minyear, maxyear;
  305         if (!rcldb.maxYearSpan(&minyear, &maxyear)) {
  306             cerr << "maxYearSpan failed: " << rcldb.getReason() << endl;
  307             exit(1);
  308         } else {
  309             cout << "Min year " << minyear << " Max year " << maxyear << endl;
  310             exit(0);
  311         }
  312     }
  313 
  314     if (argc < 1) {
  315     Usage();
  316     }
  317     string qs = *argv++;argc--;
  318     while (argc > 0) {
  319     qs += string(" ") + *argv++;argc--;
  320     }
  321 
  322     {
  323     string uq;
  324     string charset = rclconfig->getDefCharset(true);
  325     int ercnt;
  326     if (!transcode(qs, uq, charset, "UTF-8", &ercnt)) {
  327         fprintf(stderr, "Can't convert command line args to utf-8\n");
  328         exit(1);
  329     } else if (ercnt) {
  330         fprintf(stderr, "%d errors while converting arguments from %s "
  331             "to utf-8\n", ercnt, charset.c_str());
  332     }
  333     qs = uq;
  334     }
  335 
  336     Rcl::SearchData *sd = 0;
  337 
  338     if (op_flags & (OPT_a|OPT_o|OPT_f)) {
  339     sd = new Rcl::SearchData(Rcl::SCLT_OR, stemlang);
  340     Rcl::SearchDataClause *clp = 0;
  341     if (op_flags & OPT_f) {
  342         clp = new Rcl::SearchDataClauseFilename(qs);
  343     } else {
  344         clp = new Rcl::SearchDataClauseSimple((op_flags & OPT_o)?
  345                                                   Rcl::SCLT_OR : Rcl::SCLT_AND, 
  346                                                   qs);
  347     }
  348     if (sd)
  349         sd->addClause(clp);
  350     } else {
  351     sd = wasaStringToRcl(rclconfig, stemlang, qs, reason);
  352     }
  353 
  354     if (!sd) {
  355     cerr << "Query string interpretation failed: " << reason << endl;
  356     return 1;
  357     }
  358 
  359     std::shared_ptr<Rcl::SearchData> rq(sd);
  360     Rcl::Query query(&rcldb);
  361     if (op_flags & OPT_C) {
  362         query.setCollapseDuplicates(true);
  363     }
  364     if (op_flags & OPT_S) {
  365     query.setSortBy(sortfield, (op_flags & OPT_D) ? false : true);
  366     }
  367     Chrono chron;
  368     if (!query.setQuery(rq)) {
  369     cerr << "Query setup failed: " << query.getReason() << endl;
  370     return(1);
  371     }
  372     int cnt;
  373     if (op_flags & OPT_E) {
  374         cnt = query.getResCnt(-1, true);
  375     } else {
  376         cnt = query.getResCnt();
  377     }
  378     if (!(op_flags & OPT_b)) {
  379     cout << "Recoll query: " << rq->getDescription() << endl;
  380     if (firstres == 0) {
  381         if (cnt <= maxcount)
  382         cout << cnt << " results" << endl;
  383         else
  384         cout << cnt << " results (printing  " << maxcount << " max):" 
  385              << endl;
  386     } else {
  387         cout << "Printing at most " << cnt - (firstres+maxcount) <<
  388         " results from first " << firstres << endl;
  389     }
  390     }
  391     if (op_flags & OPT_Q)
  392     cout << "Query setup took " << chron.millis() << " mS" << endl;
  393 
  394     if (op_flags & OPT_Q)
  395     return(0);
  396 
  397     for (int i = firstres; i < firstres + maxcount; i++) {
  398     Rcl::Doc doc;
  399     if (!query.getDoc(i, doc))
  400         break;
  401 
  402     if (op_flags & OPT_F) {
  403         output_fields(fields, doc, query, rcldb, op_flags & OPT_N);
  404         continue;
  405     }
  406 
  407     if (op_flags & OPT_e) 
  408         doc.url = url_encode(doc.url);
  409 
  410     if (op_flags & OPT_b) {
  411         cout << doc.url << endl;
  412     } else {
  413         string titleorfn = doc.meta[Rcl::Doc::keytt];
  414         if (titleorfn.empty())
  415         titleorfn = doc.meta[Rcl::Doc::keyfn];
  416         if (titleorfn.empty()) {
  417                 string url;
  418                 printableUrl(rclconfig->getDefCharset(), doc.url, url);
  419                 titleorfn = path_getsimple(url);
  420             }
  421 
  422         char cpc[20];
  423         sprintf(cpc, "%d", doc.pc);
  424         cout 
  425         << doc.mimetype << "\t"
  426         << "[" << doc.url << "]" << "\t" 
  427         << "[" << titleorfn << "]" << "\t"
  428         << doc.fbytes << "\tbytes" << "\t"
  429         <<  endl;
  430         if (op_flags & OPT_m) {
  431         for (const auto ent : doc.meta) {
  432             cout << ent.first << " = " << ent.second << endl;
  433         }
  434         }
  435             if (op_flags & OPT_A) {
  436                 string abstract;
  437                 if (query.makeDocAbstract(doc, abstract)) {
  438                     cout << "ABSTRACT" << endl;
  439                     cout << abstract << endl;
  440                     cout << "/ABSTRACT" << endl;
  441                 }
  442             }
  443         }
  444         if (op_flags & OPT_d) {
  445             dump_contents(rclconfig, doc);
  446         }   
  447     }
  448 
  449     return 0;
  450 }
  451