"Fossies" - the Fresh Open Source Software Archive

Member "recoll-1.26.3/utils/hldata.h" (24 Nov 2019, 5159 Bytes) of package /linux/privat/recoll-1.26.3.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "hldata.h", see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 1.25.23_vs_1.26.0.

    1 /* Copyright (C) 2017-2019 J.F.Dockes
    2  *
    3  * License: GPL 2.1
    4  *
    5  * This program is free software; you can redistribute it and/or modify
    6  * it under the terms of the GNU General Public License as published by
    7  * the Free Software Foundation; either version 2.1 of the License, or
    8  * (at your option) any later version.
    9  *
   10  * This program is distributed in the hope that it will be useful,
   11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   13  * GNU Lesser General Public License for more details.
   14  *
   15  * You should have received a copy of the GNU Lesser General Public License
   16  * along with this program; if not, write to the
   17  * Free Software Foundation, Inc.,
   18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
   19  */
   20 #ifndef _hldata_h_included_
   21 #define _hldata_h_included_
   22 
   23 #include <vector>
   24 #include <string>
   25 #include <set>
   26 #include <unordered_map>
   27 
   28 /** Store data about user search terms and their expansions. This is used
   29  * mostly for highlighting result text and walking the matches, generating 
   30  * spelling suggestions.
   31  */
   32 struct HighlightData {
   33     /** The user terms, excluding those with wildcards. This list is
   34      * intended for orthographic suggestions so the terms are always
   35      * lowercased, unaccented or not depending on the type of index 
   36      * (as the spelling dictionary is generated from the index terms).
   37      */
   38     std::set<std::string> uterms;
   39 
   40     /** The db query terms linked to the uterms entry they were expanded from. 
   41      * This is used for aggregating term stats when generating snippets (for 
   42      * choosing the best terms, allocating slots, etc. )
   43      */
   44     std::unordered_map<std::string, std::string> terms;
   45 
   46     /** The original user terms-or-groups. This is for display
   47      * purposes: ie when creating a menu to look for a specific
   48      * matched group inside a preview window. We want to show the
   49      * user-entered data in the menu, not some transformation, so
   50      * these are always raw, diacritics and case preserved.
   51      */
   52     std::vector<std::vector<std::string> > ugroups;
   53 
   54     /** Processed/expanded terms and groups. Used for looking for
   55      * regions to highlight. A group can be a PHRASE or NEAR entry
   56      * Terms are just groups with 1 entry. All
   57      * terms are transformed to be compatible with index content
   58      * (unaccented and lowercased as needed depending on
   59      * configuration), and the list may include values
   60      * expanded from the original terms by stem or wildcard expansion.
   61      */
   62     struct TermGroup {
   63         // We'd use an union but no can do
   64         std::string term;
   65         std::vector<std::vector<std::string> > orgroups;
   66         int slack{0};
   67 
   68         /* Index into ugroups. As a user term or group may generate
   69          * many processed/expanded terms or groups, this is how we
   70          * relate an expansion to its source (used, e.g. for
   71          * generating anchors for walking search matches in the
   72          * preview window). */
   73         size_t grpsugidx{0};
   74         enum TGK {TGK_TERM, TGK_NEAR, TGK_PHRASE};
   75         TGK kind{TGK_TERM};
   76     };
   77     std::vector<TermGroup> index_term_groups;
   78 
   79     void clear() {
   80     uterms.clear();
   81     ugroups.clear();
   82     index_term_groups.clear();
   83     }
   84     void append(const HighlightData&);
   85 
   86     // Print (debug)
   87     std::string toString() const;
   88 };
   89 
   90 /* The following is used by plaintorich.cpp for finding zones to
   91    highlight and by rclabsfromtext.cpp to choose fragments for the
   92    abstract */
   93 
   94 struct GroupMatchEntry {
   95     // Start/End byte offsets in the document text
   96     std::pair<int, int> offs;
   97     // Index of the search group this comes from: this is to relate a 
   98     // match to the original user input.
   99     size_t grpidx;
  100     GroupMatchEntry(int sta, int sto, size_t idx) 
  101         : offs(sta, sto), grpidx(idx) {
  102     }
  103 };
  104 
  105 // Find NEAR or PHRASE matches for one group of terms.
  106 //
  107 // @param hldata User query expansion descriptor (see above). We only use
  108 //      the index_term_groups entry
  109 //
  110 // @param grpidx Index in hldata.index_term_groups for the group we
  111 //     process. This is used by us to get the terms, group type
  112 //     (phrase/near) and slacks. We also set it in the output
  113 //     GroupMatchEntry structures to allow the caller to link a match
  114 //     with a specific user input (e.g. for walking the match in the
  115 //     GUI preview)
  116 //
  117 // @param inplists Position lists for the the group terms. This is the
  118 //     data used to look for matches.
  119 //
  120 // @param gpostobytes Translation of term position to start/end byte
  121 //     offsets. This is used to translate term positions to byte
  122 //     positions in the output, for ease of use by caller.
  123 //
  124 // @param[out] tboffs Found matches. Each match has a begin and end
  125 //     byte offset and an index linking to the origin data in the
  126 //     HighlightData structure.
  127 extern bool matchGroup(
  128     const HighlightData& hldata,
  129     unsigned int grpidx,
  130     const std::unordered_map<std::string, std::vector<int>>& inplists,
  131     const std::unordered_map<int, std::pair<int,int>>& gpostobytes,
  132     std::vector<GroupMatchEntry>& tboffs
  133     );
  134 
  135 #endif /* _hldata_h_included_ */