"Fossies" - the Fresh Open Source Software Archive

Member "xapian-core-1.4.14/tests/api_opsynonym.cc" (23 Nov 2019, 14624 Bytes) of package /linux/www/xapian-core-1.4.14.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 /** @file api_opsynonym.cc
    2  * @brief tests of OP_SYNONYM and OP_MAX.
    3  */
    4 /* Copyright 2009,2011,2014 Olly Betts
    5  * Copyright 2007,2008,2009 Lemur Consulting Ltd
    6  *
    7  * This program is free software; you can redistribute it and/or
    8  * modify it under the terms of the GNU General Public License as
    9  * published by the Free Software Foundation; either version 2 of the
   10  * License, or (at your option) any later version.
   11  *
   12  * This program is distributed in the hope that it will be useful,
   13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   15  * GNU General Public License for more details.
   16  *
   17  * You should have received a copy of the GNU General Public License
   18  * along with this program; if not, write to the Free Software
   19  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
   20  * USA
   21  */
   22 
   23 #include <config.h>
   24 
   25 #include "api_opsynonym.h"
   26 
   27 #include <map>
   28 #include <set>
   29 #include <vector>
   30 
   31 #include <xapian.h>
   32 
   33 #include "backendmanager.h"
   34 #include "testsuite.h"
   35 #include "testutils.h"
   36 
   37 #include "apitest.h"
   38 
   39 using namespace std;
   40 
   41 // #######################################################################
   42 // # Tests start here
   43 
   44 struct synonym1_data_type {
   45     // How many results should have the same weight when combined with
   46     // OP_SYNONYM instead of OP_OR.
   47     int sameweight_count;
   48     // How many results should have a different weight when combined with
   49     // OP_SYNONYM instead of OP_OR.
   50     int diffweight_count;
   51     // How many subqueries.
   52     unsigned n_subqs;
   53     // The subqueries (use NOQ for unused ones).
   54     Xapian::Query subqs[4];
   55 };
   56 
   57 #define NOQ Xapian::Query::MatchNothing
   58 static const synonym1_data_type synonym1_data[] = {
   59     {
   60     // Single term - all 33 results should be same weight.
   61     33, 0, 1,
   62     { Xapian::Query("date"), NOQ, NOQ, NOQ }
   63     },
   64     {
   65     // Two terms, which co-occur in some documents.
   66     //
   67     // All 34 results should be different.
   68     0, 34, 2,
   69     { Xapian::Query("sky"), Xapian::Query("date"), NOQ, NOQ }
   70     },
   71     {
   72     // Two terms which are entirely disjoint, and where the maximum weight
   73     // doesn't occur in the first or second match.
   74     //
   75     // All 18 results should be different.
   76     0, 18, 2,
   77     { Xapian::Query("gutenberg"), Xapian::Query("blockhead"), NOQ, NOQ }
   78     },
   79     {
   80     // All 34 results should be different.
   81     0, 34, 2,
   82     {
   83         Xapian::Query("date"),
   84         Xapian::Query(Xapian::Query::OP_OR,
   85               Xapian::Query("sky"),
   86               Xapian::Query("glove")),
   87         NOQ, NOQ
   88     }
   89     },
   90     {
   91     // All 34 results should be different.
   92     0, 34, 2,
   93     {
   94         Xapian::Query("date"),
   95         Xapian::Query(Xapian::Query::OP_OR,
   96               Xapian::Query("sky"),
   97               Xapian::Query("date")),
   98         NOQ, NOQ
   99     }
  100     },
  101     {
  102     // All 34 results should be different.
  103     0, 34, 2,
  104     {
  105         Xapian::Query("date"),
  106         Xapian::Query(Xapian::Query::OP_AND_MAYBE,
  107               Xapian::Query("sky"),
  108               Xapian::Query("date")),
  109         NOQ, NOQ
  110     }
  111     },
  112     {
  113     // All 34 results should be different.
  114     0, 34, 2,
  115     {
  116         Xapian::Query("date"),
  117         Xapian::Query(Xapian::Query::OP_AND_NOT,
  118               Xapian::Query("sky"),
  119               Xapian::Query("date")),
  120         NOQ, NOQ
  121     }
  122     },
  123     {
  124     // The AND only matches 1 document, so the estimated termfreq for the
  125     // whole synonym works out as 33 (due to rounding), which is the same
  126     // as the termfreq for "date".  Therefore most of the weights are the
  127     // same as just for the pure "date" search, and the only document which
  128     // gets a different weight is the one also matched by "sky" (because it
  129     // has a wdf boost).
  130     32, 1, 2,
  131     {
  132         Xapian::Query("date"),
  133         Xapian::Query(Xapian::Query::OP_AND,
  134               Xapian::Query("sky"),
  135               Xapian::Query("date")),
  136         NOQ, NOQ
  137     }
  138     },
  139     {
  140     // All 34 results should be different.
  141     0, 34, 2,
  142     {
  143         Xapian::Query("date"),
  144         Xapian::Query(Xapian::Query::OP_XOR,
  145               Xapian::Query("sky"),
  146               Xapian::Query("date")),
  147         NOQ, NOQ
  148     }
  149     },
  150     {
  151     // When the top-level operator is OR, the synonym part has an estimated
  152     // termfreq of 35.  When the top-level operator is SYNONYM, the whole
  153     // query has an estimated termfreq of 66, which is rather bogus, but
  154     // that's the current situation here (1.2 did better as it flattened
  155     // this into a single OP_SYNONYM operator and then merged the two
  156     // "date" terms to one with wqf=2.  We've decided we shouldn't do such
  157     // merging from 1.3.x on (merging to sum the scale_factors is fine, but
  158     // we don't do that yet - FIXME).
  159     //
  160     // Anyway, this means that currently the weights are different for all
  161     // matches.
  162     0, 34, 2,
  163     {
  164         Xapian::Query("date"),
  165         Xapian::Query(Xapian::Query::OP_SYNONYM,
  166               Xapian::Query("sky"),
  167               Xapian::Query("date")),
  168         NOQ, NOQ
  169     }
  170     },
  171     {
  172     // All 35 results should be different.
  173     0, 35, 4,
  174     {
  175         Xapian::Query("sky"),
  176         Xapian::Query("date"),
  177         Xapian::Query("stein"),
  178         Xapian::Query("ally")
  179     }
  180     },
  181     {
  182     // The estimated term frequency for the synoynm is 2 (because the
  183     // estimate for the phrase is 0), which is the same as the term
  184     // frequency of "attitud".  Thus, the synonym gets the same weight as
  185     // "attitud", so documents with only "attitud" (but not the phrase) in
  186     // them get the same wdf, and have the same total weight.  There turns
  187     // out to be exactly one such document.
  188     1, 3, 2,
  189     {
  190         Xapian::Query("attitud"),
  191         Xapian::Query(Xapian::Query::OP_PHRASE,
  192               Xapian::Query("german"),
  193               Xapian::Query("adventur")),
  194         NOQ, NOQ
  195     }
  196     },
  197     {
  198     // All 54 results should be different.
  199     0, 54, 2,
  200     {
  201         Xapian::Query("attitud"),
  202         Xapian::Query(Xapian::Query::OP_OR,
  203               Xapian::Query("german"),
  204               Xapian::Query(Xapian::Query::OP_SYNONYM,
  205                     Xapian::Query("sky"),
  206                     Xapian::Query("date"))),
  207         NOQ, NOQ
  208     }
  209     }
  210 };
  211 
  212 // Check a synonym search
  213 DEFINE_TESTCASE(synonym1, backend) {
  214     Xapian::Database db(get_database("etext"));
  215 
  216     TEST_REL(db.get_doclength_upper_bound(), >, 0);
  217 
  218     const Xapian::doccount lots = 214;
  219 
  220     for (size_t subqgroup = 0;
  221      subqgroup != sizeof(synonym1_data) / sizeof(synonym1_data[0]);
  222      ++subqgroup) {
  223     const synonym1_data_type & data = synonym1_data[subqgroup];
  224     const Xapian::Query * qlist = data.subqs;
  225     const Xapian::Query * qlist_end = qlist + data.n_subqs;
  226 
  227     // Run two queries, one joining the subqueries with OR and one joining
  228     // them with SYNONYM.
  229     Xapian::Enquire enquire(db);
  230 
  231     // Do the search with OP_OR, getting all the results.
  232     Xapian::Query orquery(Xapian::Query::OP_OR, qlist, qlist_end);
  233     enquire.set_query(orquery);
  234     Xapian::MSet ormset = enquire.get_mset(0, lots);
  235 
  236     // Do the search with OP_SYNONYM, getting all the results.
  237     Xapian::Query synquery(Xapian::Query::OP_SYNONYM, qlist, qlist_end);
  238     enquire.set_query(synquery);
  239     Xapian::MSet synmset = enquire.get_mset(0, lots);
  240 
  241     tout << "Comparing " << orquery << " with " << synquery << '\n';
  242 
  243     // Check that the queries return some results.
  244     TEST_NOT_EQUAL(synmset.size(), 0);
  245     // Check that the queries return the same number of results.
  246     TEST_EQUAL(synmset.size(), ormset.size());
  247     map<Xapian::docid, double> values_or;
  248     map<Xapian::docid, double> values_synonym;
  249     for (Xapian::doccount i = 0; i < synmset.size(); ++i) {
  250         values_or[*ormset[i]] = ormset[i].get_weight();
  251         values_synonym[*synmset[i]] = synmset[i].get_weight();
  252     }
  253     TEST_EQUAL(values_or.size(), values_synonym.size());
  254 
  255     /* Check that the most of the weights for items in the "or" mset are
  256      * different from those in the "synonym" mset. */
  257     int same_weight = 0;
  258     int different_weight = 0;
  259     for (map<Xapian::docid, double>::const_iterator
  260          j = values_or.begin(); j != values_or.end(); ++j) {
  261         Xapian::docid did = j->first;
  262         // Check that all the results in the or tree make it to the synonym
  263         // tree.
  264         TEST(values_synonym.find(did) != values_synonym.end());
  265         if (values_or[did] == values_synonym[did]) {
  266         ++same_weight;
  267         } else {
  268         ++different_weight;
  269         }
  270     }
  271 
  272     TEST_EQUAL(different_weight, data.diffweight_count);
  273     TEST_EQUAL(same_weight, data.sameweight_count);
  274 
  275     // Do the search with synonym, but just get the top result.
  276     // (Regression test - the OR subquery in the synonym postlist tree used
  277     // to shortcut incorrectly, and return the wrong result here).
  278     Xapian::MSet mset_top = enquire.get_mset(0, 1);
  279     TEST_EQUAL(mset_top.size(), 1);
  280     TEST(mset_range_is_same(mset_top, 0, synmset, 0, 1));
  281     }
  282     return true;
  283 }
  284 
  285 // Regression test - test a synonym search with a MultiAndPostlist.
  286 DEFINE_TESTCASE(synonym2, backend) {
  287     Xapian::Query query;
  288     vector<Xapian::Query> subqueries;
  289     subqueries.push_back(Xapian::Query("file"));
  290     subqueries.push_back(Xapian::Query("the"));
  291     subqueries.push_back(Xapian::Query("next"));
  292     subqueries.push_back(Xapian::Query("reader"));
  293     query = Xapian::Query(Xapian::Query::OP_AND, subqueries.begin(), subqueries.end());
  294     subqueries.clear();
  295     subqueries.push_back(query);
  296     subqueries.push_back(Xapian::Query("gutenberg"));
  297     query = Xapian::Query(Xapian::Query::OP_SYNONYM, subqueries.begin(), subqueries.end());
  298 
  299     tout << query << '\n';
  300 
  301     Xapian::Database db(get_database("etext"));
  302     Xapian::Enquire enquire(db);
  303     enquire.set_query(query);
  304     Xapian::MSet mset = enquire.get_mset(0, 10);
  305     tout << mset << '\n';
  306 
  307     // Regression test that OP_SCALE_WEIGHT works with OP_SYNONYM
  308     double maxposs = mset.get_max_possible();
  309     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 10.0);
  310     enquire.set_query(query);
  311     mset = enquire.get_mset(0, 10);
  312     double maxposs2 = mset.get_max_possible();
  313 
  314     TEST_EQUAL_DOUBLE(maxposs * 10.0, maxposs2);
  315 
  316     return true;
  317 }
  318 
  319 static void
  320 check_msets_contain_same_docs(const Xapian::MSet & mset1,
  321                   const Xapian::MSet & mset2)
  322 {
  323     TEST_EQUAL(mset1.size(), mset2.size());
  324 
  325     set<Xapian::docid> docids;
  326     for (Xapian::doccount i = 0; i < mset1.size(); ++i) {
  327     docids.insert(*mset1[i]);
  328     }
  329 
  330     // Check that all the results in mset1 are in mset2.
  331     for (Xapian::doccount j = 0; j < mset2.size(); ++j) {
  332     // Check that we can erase each entry from mset2 element.  Since mset1
  333     // and mset2 are the same size this means we can be sure that there
  334     // were no repeated docids in either (it would be a bug if there were).
  335     TEST(docids.erase(*mset2[j]));
  336     }
  337 }
  338 
  339 // Test a synonym search which has had its weight scaled to 0.
  340 DEFINE_TESTCASE(synonym3, backend) {
  341     Xapian::Query query = Xapian::Query(Xapian::Query::OP_SYNONYM,
  342                     Xapian::Query("sky"),
  343                     Xapian::Query("date"));
  344 
  345     Xapian::Database db(get_database("etext"));
  346     Xapian::Enquire enquire(db);
  347     enquire.set_query(query);
  348     Xapian::MSet mset_orig = enquire.get_mset(0, db.get_doccount());
  349 
  350     tout << query << '\n';
  351     tout << mset_orig << '\n';
  352 
  353     // Test that OP_SCALE_WEIGHT with a factor of 0.0 works with OP_SYNONYM
  354     // (this has a special codepath to avoid doing the synonym calculation).
  355     query = Xapian::Query(Xapian::Query::OP_SCALE_WEIGHT, query, 0.0);
  356     enquire.set_query(query);
  357     Xapian::MSet mset_zero = enquire.get_mset(0, db.get_doccount());
  358 
  359     tout << query << '\n';
  360     tout << mset_zero << '\n';
  361 
  362     // Check that the queries return some results.
  363     TEST_NOT_EQUAL(mset_zero.size(), 0);
  364     // Check that the queries return the same document IDs, and the zero
  365     // one has zero weight.
  366     check_msets_contain_same_docs(mset_orig, mset_zero);
  367     for (Xapian::doccount i = 0; i < mset_orig.size(); ++i) {
  368     TEST_NOT_EQUAL(mset_orig[i].get_weight(), 0.0);
  369     TEST_EQUAL(mset_zero[i].get_weight(), 0.0);
  370     }
  371 
  372     return true;
  373 }
  374 
  375 // Test synonym searches combined with various operators.
  376 DEFINE_TESTCASE(synonym4, backend) {
  377     Xapian::Database db(get_database("etext"));
  378     Xapian::Enquire enquire(db);
  379     Xapian::Query syn_query = Xapian::Query(Xapian::Query::OP_SYNONYM,
  380                         Xapian::Query("gutenberg"),
  381                         Xapian::Query("blockhead"));
  382     Xapian::Query or_query = Xapian::Query(Xapian::Query::OP_OR,
  383                        Xapian::Query("gutenberg"),
  384                        Xapian::Query("blockhead"));
  385     Xapian::Query date_query = Xapian::Query("date");
  386 
  387     // Check some queries.
  388     static const Xapian::Query::op operators[] = {
  389     Xapian::Query::OP_AND_MAYBE,
  390     Xapian::Query::OP_AND_NOT,
  391     Xapian::Query::OP_AND,
  392     Xapian::Query::OP_XOR,
  393     Xapian::Query::OP_OR,
  394     Xapian::Query::OP_SYNONYM
  395     };
  396     const Xapian::Query::op * end;
  397     end = operators + sizeof(operators) / sizeof(operators[0]);
  398     for (const Xapian::Query::op * i = operators; i != end; ++i) {
  399     tout.str(string());
  400     Xapian::Query query1(*i, syn_query, date_query);
  401     Xapian::Query query2(*i, or_query, date_query);
  402 
  403     enquire.set_query(query1);
  404     tout << "query1:" << query1 << '\n';
  405     Xapian::MSet mset1 = enquire.get_mset(0, db.get_doccount());
  406     tout << "mset1:" << mset1 << '\n';
  407     enquire.set_query(query2);
  408     tout << "query2:" << query2 << '\n';
  409     Xapian::MSet mset2 = enquire.get_mset(0, db.get_doccount());
  410     tout << "mset2:" << mset2 << '\n';
  411 
  412     TEST_NOT_EQUAL(mset1.size(), 0);
  413     if (*i != Xapian::Query::OP_XOR) {
  414         TEST_EQUAL(mset1[0].get_percent(), 100);
  415     } else {
  416         TEST(mset1[0].get_percent() != 100);
  417     }
  418     check_msets_contain_same_docs(mset1, mset2);
  419     }
  420 
  421     return true;
  422 }
  423 
  424 DEFINE_TESTCASE(opmax1, backend) {
  425     Xapian::Database db(get_database("etext"));
  426     Xapian::Enquire enq(db);
  427     Xapian::Query q1("king");
  428     Xapian::Query q2("friedrich");
  429     Xapian::Query qmax(Xapian::Query::OP_MAX, q1, q2);
  430     enq.set_query(q1);
  431     Xapian::MSet mset1 = enq.get_mset(0, db.get_doccount());
  432     enq.set_query(q2);
  433     Xapian::MSet mset2 = enq.get_mset(0, db.get_doccount());
  434     enq.set_query(qmax);
  435     Xapian::MSet msetmax = enq.get_mset(0, db.get_doccount());
  436 
  437     // Check that the weights in msetmax are the maximum of the weights in
  438     // mset1 and mset2 for each docid.
  439     map<Xapian::docid, double> expected_weights;
  440     Xapian::MSetIterator i;
  441     for (i = mset1.begin(); i != mset1.end(); ++i) {
  442     expected_weights[*i] = i.get_weight();
  443     }
  444     for (i = mset2.begin(); i != mset2.end(); ++i) {
  445     map<Xapian::docid, double>::iterator j;
  446     j = expected_weights.find(*i);
  447     if (j != expected_weights.end()) {
  448         j->second = max(j->second, i.get_weight());
  449     } else {
  450         expected_weights[*i] = i.get_weight();
  451     }
  452     }
  453 
  454     for (i = msetmax.begin(); i != msetmax.end(); ++i) {
  455     map<Xapian::docid, double>::iterator j;
  456     j = expected_weights.find(*i);
  457     TEST(j != expected_weights.end());
  458     TEST_EQUAL_DOUBLE(j->second, i.get_weight());
  459     expected_weights.erase(j);
  460     tout << expected_weights.size() << endl;
  461     }
  462 
  463     // Any document in mset1 or mset2 should also be in msetmax.
  464     TEST_EQUAL(expected_weights.size(), 0);
  465 
  466     return true;
  467 }