"Fossies" - the Fresh Open Source Software Archive

Member "koha-19.11.15/Koha/SearchEngine/Elasticsearch/Search.pm" (23 Feb 2021, 18465 Bytes) of package /linux/misc/koha-19.11.15.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "Search.pm" see the Fossies "Dox" file reference documentation.

    1 package Koha::SearchEngine::Elasticsearch::Search;
    2 
    3 # Copyright 2014 Catalyst IT
    4 #
    5 # This file is part of Koha.
    6 #
    7 # Koha is free software; you can redistribute it and/or modify it under the
    8 # terms of the GNU General Public License as published by the Free Software
    9 # Foundation; either version 3 of the License, or (at your option) any later
   10 # version.
   11 #
   12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
   13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
   14 # A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
   15 #
   16 # You should have received a copy of the GNU General Public License along
   17 # with Koha; if not, write to the Free Software Foundation, Inc.,
   18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
   19 
   20 =head1 NAME
   21 
   22 Koha::SearchEngine::Elasticsearch::Search - search functions for Elasticsearch
   23 
   24 =head1 SYNOPSIS
   25 
   26     my $searcher =
   27       Koha::SearchEngine::Elasticsearch::Search->new( { index => $index } );
   28     my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new(
   29         { index => $index } );
   30     my $query = $builder->build_query('perl');
   31     my $results = $searcher->search($query);
   32     print "There were " . $results->total . " results.\n";
   33     $results->each(sub {
   34         push @hits, $_[0];
   35     });
   36 
   37 =head1 METHODS
   38 
   39 =cut
   40 
   41 use Modern::Perl;
   42 
   43 use base qw(Koha::SearchEngine::Elasticsearch);
   44 use C4::Context;
   45 use C4::AuthoritiesMarc;
   46 use Koha::ItemTypes;
   47 use Koha::AuthorisedValues;
   48 use Koha::SearchEngine::QueryBuilder;
   49 use Koha::SearchEngine::Search;
   50 use Koha::Exceptions::Elasticsearch;
   51 use MARC::Record;
   52 use Catmandu::Store::ElasticSearch;
   53 use MARC::File::XML;
   54 use Data::Dumper; #TODO remove
   55 use Carp qw(cluck);
   56 use MIME::Base64;
   57 
   # Generate the `store` accessor on this package. count() and
   # max_result_window() use it to hold the Catmandu store they build on
   # first use. mk_accessors itself is inherited (presumably Class::Accessor
   # style, via the base class -- confirm there).
   __PACKAGE__->mk_accessors('store');
   59 
   60 =head2 search
   61 
   62     my $results = $searcher->search($query, $page, $count, %options);
   63 
   64 Run a search using the query. It'll return C<$count> results, starting at page
   65 C<$page> (C<$page> counts from 1; anything less than that, or C<undef>, becomes 1.)
   66 C<$count> is also the number of entries on a page.
   67 
   68 C<%options> is a hash containing extra options:
   69 
   70 =over 4
   71 
   72 =item offset
   73 
   74 If provided, this overrides the C<$page> value, and specifies the record as
   75 an offset (i.e. the number of the record to start with), rather than a page.
   76 
   77 =back
   78 
   79 Returns the response hashref produced by the Elasticsearch client's search call.
   80 
   81 =cut
   82 
   # Run $query against the index configured for this searcher and return the
   # response of the Elasticsearch client's search() call.
   #
   # $query is modified in place: its `size` and `from` keys are overwritten.
   # $count is the page size (20 when undefined). The first record comes from
   # $options{offset} whenever that key exists; otherwise it is derived from
   # the 1-based $page, where undef or a non-positive page means the first page.
   # A client failure is re-thrown after passing through $self->process_error().
   sub search {
       my ( $self, $query, $page, $count, %options ) = @_;

       my $es_params = $self->get_elasticsearch_params();

       # 20 results per page unless the caller asked for something else.
       my $page_size = $count // 20;
       $query->{size} = $page_size;

       # Elasticsearch wants the index of the first record, not a page number.
       my $first_record;
       if ( exists $options{offset} ) {
           $first_record = $options{offset};
       }
       else {
           my $zero_based_page;
           if ( !defined($page) || $page <= 0 ) {
               $zero_based_page = 0;
           }
           else {
               $zero_based_page = $page - 1;
           }
           $first_record = $zero_based_page * $page_size;
       }
       $query->{from} = $first_record;

       my $client   = $self->get_elasticsearch();
       my $response = eval {
           $client->search(
               index => $es_params->{index_name},
               body  => $query,
           );
       };
       die $self->process_error($@) if $@;

       return $response;
   }
  108 
  109 =head2 count
  110 
  111     my $count = $searcher->count($query);
  112 
  113 This mimics a search request, but just gets the result count instead. That's
  114 faster than pulling all the data in, usually.
  115 
  116 =cut
  117 
  # Return the number of hits for $query, or 0 when the search reports a
  # false total.
  #
  # The Catmandu store is created on first use (with trace_calls disabled)
  # and kept in the `store` accessor. $query is a hashref that is flattened
  # into the argument list of the bag's search().
  sub count {
      my ( $self, $query ) = @_;

      my $es_params = $self->get_elasticsearch_params();
      if ( !$self->store ) {
          my $new_store = Catmandu::Store::ElasticSearch->new(
              %{$es_params},
              trace_calls => 0,
          );
          $self->store($new_store);
      }

      my $hits = $self->store->bag->search( %{$query} );
      return $hits->total() || 0;
  }
  130 
  131 =head2 search_compat
  132 
  133     my ( $error, $results, $facets ) = $search->search_compat(
  134         $query,            $simple_query, \@sort_by,       \@servers,
  135         $results_per_page, $offset,       undef,           $item_types,
  136         $query_type,       $scan
  137       )
  138 
  139 A search interface somewhat compatible with L<C4::Search->getRecords>. Anything
  140 that is returned in the query created by build_query_compat will probably
  141 get ignored here, along with some other things (like C<@servers>.)
  142 
  143 =cut
  144 
  # getRecords-style front end to search().
  #
  # With a true $scan the call is handed to _aggregation_scan() and its
  # result is returned unchanged. Otherwise the search is run from $offset
  # (undef or negative becomes 0) and the return value is the list
  # ( undef, \%result, $facets ), where
  # $result{biblioserver} = { hits => <total>, RECORDS => \@records }.
  # $simple_query, $sort_by, $servers, $branches, $item_types and
  # $query_type are accepted but not used here.
  sub search_compat {
      my (
          $self,    $query,            $simple_query,
          $sort_by, $servers,          $results_per_page,
          $offset,  $branches,         $item_types,
          $query_type, $scan
      ) = @_;

      return $self->_aggregation_scan( $query, $results_per_page, $offset )
        if $scan;

      $offset = 0 if !defined $offset or $offset < 0;

      my $response =
        $self->search( $query, undef, $results_per_page, offset => $offset );
      my $hit_info = $response->{'hits'};

      # opac-search expects every record at its absolute position in the
      # result set, so the array is filled starting at index $offset.
      my @marc_records;
      my $slot = $offset;
      for my $hit ( @{ $hit_info->{'hits'} } ) {
          $marc_records[$slot] =
            $self->decode_record_from_result( $hit->{'_source'} );
          $slot++;
      }

      # Consumers expect results keyed by server name; only the default
      # "biblioserver" is provided.
      my %by_server = (
          biblioserver => {
              hits    => $hit_info->{'total'},
              RECORDS => \@marc_records,
          },
      );
      return ( undef, \%by_server,
          $self->_convert_facets( $response->{aggregations} ) );
  }
  180 
  181 =head2 search_auth_compat
  182 
  183     my ( $results, $total ) =
  184       $searcher->search_auth_compat( $query, $offset, $count, $skipmetadata, %options );
  185 
  186 This has a similar calling convention to L<search>, however it returns its
  187 results in a form the same as L<C4::AuthoritiesMarc::SearchAuthorities>.
  188 
  189 =cut
  190 
  # Authority search returning ( \@rows, $total_hits ).
  #
  # $offset is 1-based (undef or anything below 1 means the first record)
  # and $count is the page size handed to search(). Every row carries
  # `authid`, taken from the Elasticsearch document id. Unless $skipmetadata
  # is true, each row also gets `authtype`, `reported_tag`, `summary` and
  # `used`, which costs one AuthType lookup and one biblio count per hit.
  sub search_auth_compat {
      my ( $self, $query, $offset, $count, $skipmetadata, %options ) = @_;

      # Authority search uses a 1-based offset; search() wants it 0-based.
      $offset = 1 if !defined $offset || $offset <= 0;
      $options{offset} = $offset - 1;

      my $schema   = Koha::Database->new()->schema();
      my $response = $self->search( $query, undef, $count, %options );

      my $bib_searcher =
        Koha::SearchEngine::Elasticsearch::Search->new( { index => 'biblios' } );

      my $hit_info = $response->{'hits'};
      my @rows;
      for my $hit ( @{ $hit_info->{'hits'} } ) {
          my $source = $hit->{'_source'};

          # Links are built from the authid, so use the id the record is
          # stored under rather than the 001, which can hold other data.
          my $authid = $hit->{_id};
          my %row    = ( authid => $authid );

          if ( !$skipmetadata ) {
              # TODO put all this info into the record at index time so it
              # does not have to be worked out for every hit.
              my $authtypecode = $source->{authtype};
              my $authtype_rs  = $schema->resultset('AuthType')
                ->search( { authtypecode => $authtypecode } );

              # FIXME this assumes the lookup finds a row. The original
              # Zebra code could also prefer a caller-supplied authtypecode
              # over the stored one; that is undocumented and not done here.
              my $authtype   = $authtype_rs->single;
              my $report_tag = $authtype ? $authtype->auth_tag_to_report : "";

              my $marc      = $self->decode_record_from_result($source);
              my $mainentry = $marc->field($report_tag);

              # Stays undef when the record has no main-entry field.
              my $reported_tag;
              if ($mainentry) {
                  for my $subfield ( $mainentry->subfields() ) {
                      my ( $code, $value ) = @{$subfield};
                      $reported_tag .= '$' . $code . $value;
                  }
              }

              $row{authtype} =
                $authtype ? $authtype->authtypetext : $authtypecode;
              $row{reported_tag} = $reported_tag;

              # Reimplementing BuildSummary is out of scope here.
              $row{summary} =
                C4::AuthoritiesMarc::BuildSummary( $marc, $row{authid},
                  $authtypecode );
              $row{used} = $self->count_auth_use( $bib_searcher, $authid );
          }

          push @rows, \%row;
      }

      return ( \@rows, $hit_info->{'total'} );
  }
  253 
  254 =head2 count_auth_use
  255 
  256     my $count = $auth_searcher->count_auth_use($bib_searcher, $authid);
  257 
  258 This runs a search to determine the number of records that reference the
  259 specified authid. C<$bib_searcher> must be something compatible with
  260 elasticsearch, as the query is built in this function.
  261 
  262 =cut
  263 
  # Count the records that reference authority $authid.
  #
  # Builds a bool/filter term query on the `koha-auth-number` field and
  # returns whatever $bib_searcher->count() reports for it.
  sub count_auth_use {
      my ( $self, $bib_searcher, $authid ) = @_;

      my %auth_filter = ( term => { 'koha-auth-number' => $authid } );
      my $es_query    = { query => { bool => { filter => \%auth_filter } } };

      return $bib_searcher->count($es_query);
  }
  277 
  278 =head2 simple_search_compat
  279 
  280     my ( $error, $marcresults, $total_hits ) =
  281       $searcher->simple_search_compat( $query, $offset, $max_results, %options );
  282 
  283 This is a simpler interface to the searching, intended to be similar enough to
  284 L<C4::Search::SimpleSearch>.
  285 
  286 Arguments:
  287 
  288 =over 4
  289 
  290 =item C<$query>
  291 
  292 A thing to search for. It could be a simple string, or something constructed
  293 with the appropriate QueryBuilder module.
  294 
  295 =item C<$offset>
  296 
  297 How many results to skip from the start of the results.
  298 
  299 =item C<$max_results>
  300 
  301 The max number of results to return. The default is 100 (because unlimited
  302 is a pretty terrible thing to do.)
  303 
  304 =item C<%options>
  305 
  306 These options are unused by Elasticsearch
  307 
  308 =back
  309 
  310 Returns:
  311 
  312 =over 4
  313 
  314 =item C<$error>
  315 
  316 if something went wrong, this'll contain some kind of error
  317 message.
  318 
  319 =item C<$marcresults>
  320 
  321 an arrayref of MARC::Records (note that this is different from the
  322 L<C4::Search> version which will return plain XML, but too bad.)
  323 
  324 =item C<$total_hits>
  325 
  326 the total number of results that this search could have returned.
  327 
  328 =back
  329 
  330 =cut
  331 
  # SimpleSearch-style front end to search().
  #
  # Returns ( $error, \@records, $total_hits ). A false $query gives
  # ( 'No query entered', undef, undef ). A plain-string $query is first run
  # through the query builder; a reference is used as given. $offset
  # defaults to 0 (also when negative) and $max_results to 100.
  sub simple_search_compat {
      my ( $self, $query, $offset, $max_results ) = @_;

      if ( !$query ) {
          return ( 'No query entered', undef, undef );
      }

      $offset = 0 if !defined $offset || $offset < 0;
      my $limit = $max_results // 100;

      if ( !ref $query ) {
          # Push plain strings through the query builder to sanitise them.
          my $builder =
            Koha::SearchEngine::QueryBuilder->new( { index => $self->index } );
          ( undef, $query ) = $builder->build_query_compat( undef, [$query] );
      }

      my $response = $self->search( $query, undef, $limit, offset => $offset );
      my $hit_info = $response->{'hits'};

      my @marc_records;
      for my $hit ( @{ $hit_info->{'hits'} } ) {
          push @marc_records,
            $self->decode_record_from_result( $hit->{'_source'} );
      }

      return ( undef, \@marc_records, $hit_info->{'total'} );
  }
  355 
  356 =head2 extract_biblionumber
  357 
  358     my $biblionumber = $searcher->extract_biblionumber( $searchresult );
  359 
  360 $searchresult comes from simple_search_compat.
  361 
  362 Returns the biblionumber from the search result record.
  363 
  364 =cut
  365 
  # Method wrapper around Koha::SearchEngine::Search::extract_biblionumber().
  # The invocant is dropped; only the search result record is passed on, and
  # the helper's return value is returned unchanged.
  sub extract_biblionumber {
      my $self = shift;
      my ($record) = @_;

      my $extractor = \&Koha::SearchEngine::Search::extract_biblionumber;
      return $extractor->($record);
  }
  370 
  371 =head2 decode_record_from_result
  372     my $marc_record = $self->decode_record_from_result($result);
  373 
  374 Extracts marc data from Elasticsearch result and decodes to MARC::Record object
  375 
  376 =cut
  377 
  # Turn the `_source` hashref of one Elasticsearch hit into a MARC record.
  #
  # $result->{marc_format} selects the decoder:
  #   base64ISO2709 - marc_data is base64-encoded ISO 2709
  #   MARCXML       - marc_data is MARCXML, parsed as UTF-8 in the
  #                   configured marcflavour
  #   ARRAY         - marc_data_array is handed to $self->_array_to_marc()
  #
  # Throws Koha::Exceptions::Elasticsearch when marc_format is missing or is
  # none of the above. The two cases get different messages so that an
  # unexpected format is not reported as a missing one.
  sub decode_record_from_result {
      my ( $self, $result ) = @_;

      # Default to '' so a missing format reaches the exception below
      # without triggering "uninitialized value" warnings in the comparisons.
      my $format = $result->{marc_format} // '';

      if ( $format eq 'base64ISO2709' ) {
          return MARC::Record->new_from_usmarc(
              decode_base64( $result->{marc_data} ) );
      }
      elsif ( $format eq 'MARCXML' ) {
          return MARC::Record->new_from_xml( $result->{marc_data}, 'UTF-8',
              uc( C4::Context->preference('marcflavour') ) );
      }
      elsif ( $format eq 'ARRAY' ) {
          return $self->_array_to_marc( $result->{marc_data_array} );
      }

      my $message =
        $format eq ''
        ? "Missing marc_format field in Elasticsearch result"
        : "Unsupported marc_format '$format' in Elasticsearch result";
      Koha::Exceptions::Elasticsearch->throw($message);
  }
  395 
  396 =head2 max_result_window
  397 
  398 Returns the maximum number of results that can be fetched
  399 
  400 This directly requests Elasticsearch for the setting index.max_result_window (or
  401 the default value for this setting in case it is not set)
  402 
  403 =cut
  404 
  # Return index.max_result_window for this searcher's index.
  #
  # The index settings are fetched with defaults included and in flat form;
  # an explicitly configured value wins, otherwise the reported default is
  # returned. The Catmandu store is created on first use.
  sub max_result_window {
      my ($self) = @_;

      if ( !$self->store ) {
          my $es_params = $self->get_elasticsearch_params;
          $self->store( Catmandu::Store::ElasticSearch->new( %{$es_params} ) );
      }

      my $store      = $self->store;
      my $index_name = $store->index_name;
      my $settings   = $store->es->indices->get_settings(
          index  => $index_name,
          params => { include_defaults => 'true', flat_settings => 'true' },
      );

      my $for_index = $settings->{$index_name};
      return $for_index->{settings}->{'index.max_result_window'}
        // $for_index->{defaults}->{'index.max_result_window'};
  }
  423 
  424 =head2 _convert_facets
  425 
  426     my $koha_facets = _convert_facets($es_facets);
  427 
  428 Converts elasticsearch facets types to the form that Koha expects.
  429 It expects the ES facet name to match the Koha type, for example C<itype>,
  430 C<au>, C<su-to>, etc.
  431 
  432 =cut
  433 
  # Convert the `aggregations` part of an Elasticsearch response into the
  # facet structure the Koha templates expect.
  #
  # $es maps an aggregation name (the ES field name, e.g. `itype`, `author`)
  # to a hash with a `buckets` array of { key, doc_count }. Returns nothing
  # when $es is false; otherwise an arrayref of facet hashes sorted by their
  # configured order. Only facetable fields that have a facet_order are
  # reported, each is cut to the FacetMaxCount system preference, and a facet
  # with no remaining terms is left out. $exp_facet is accepted for interface
  # compatibility but is not used.
  sub _convert_facets {
      my ( $self, $es, $exp_facet ) = @_;

      return if !$es;

      # These correspond to the ES field names, as opposed to the CCL names
      # that Zebra uses.
      my %label = (
          author         => 'Authors',
          itype          => 'ItemTypes',
          location       => 'Location',
          'su-geo'       => 'Places',
          'title-series' => 'Series',
          subject        => 'Topics',
          ccode          => 'CollectionCodes',
          holdingbranch  => 'HoldingLibrary',
          homebranch     => 'HomeLibrary',
          ln             => 'Language',
      );

      my %type_to_label;
      for my $field ( Koha::SearchEngine::Elasticsearch->get_facetable_fields ) {
          next unless defined $field->facet_order;
          $type_to_label{ $field->name } = {
              order => $field->facet_order,
              label => $label{ $field->name },
          };
      }

      # Some facets must display a description instead of the stored code.
      my @itypes    = Koha::ItemTypes->search;
      my @libraries = Koha::Libraries->search;
      my @locations = Koha::AuthorisedValues->search( { category => 'LOC' } );
      my $opac      = C4::Context->interface() eq 'opac';

      my $library_names =
        { map { ( $_->branchcode() => $_->branchname() ) } @libraries };
      my %special = (
          itype    => { map { ( $_->itemtype() => $_->description() ) } @itypes },
          location => {
              map {
                  ( $_->authorised_value()
                        => ( $opac ? ( $_->lib_opac || $_->lib ) : $_->lib ) )
              } @locations
          },
          holdingbranch => $library_names,
          homebranch    => $library_names,
      );

      # The cap is the same for every facet, so read the preference once
      # instead of once per aggregation. An unset preference behaves like 0
      # (no terms), as it did before, minus the warnings.
      my $max_count = C4::Context->preference('FacetMaxCount') // 0;

      my @facets;
      for my $type ( keys %{$es} ) {
          next if !exists $type_to_label{$type};

          # An aggregation without buckets yields no terms rather than dying.
          my @buckets = @{ $es->{$type}{buckets} // [] };
          my $limit   = $max_count > @buckets ? scalar @buckets : $max_count;

          # Facetable fields without an entry in %label keep the bare
          # "type_label_" key they always produced.
          my $label_name = $type_to_label{$type}{label} // '';
          my $facet      = {
              type_id                  => $type . '_id',
              "type_label_$label_name" => 1,
              type_link_value          => $type,
              order                    => $type_to_label{$type}{order},
          };

          # Keep only the first $limit buckets.
          for my $term ( @buckets[ 0 .. $limit - 1 ] ) {
              my $value   = $term->{key};
              my $count   = $term->{doc_count};
              my $display =
                exists $special{$type}
                ? ( $special{$type}->{$value} // $value )
                : $value;

              push @{ $facet->{facets} }, {
                  facet_count       => $count,
                  facet_link_value  => $value,
                  facet_title_value => $value . " ($count)",
                  # TODO either truncate this, or make the template do it.
                  facet_label_value => $display,
                  type_link_value   => $type,
              };
          }

          push @facets, $facet if exists $facet->{facets};
      }

      return [ sort { $a->{order} <=> $b->{order} } @facets ];
  }
  514 
  515 =head2 _aggregation_scan
  516 
  517     my $result = $self->_aggregation_scan($query, 10, 0);
  518 
  519 Perform an aggregation request for scan purposes.
  520 
  521 =cut
  522 
  # Run the first aggregation in $query as an index "scan" and return the
  # requested page of its buckets as pseudo MARC records.
  #
  # Returns ( undef, \%result, undef ) with
  # $result{biblioserver} = { hits => <bucket count>, RECORDS => \@records }.
  # Each record is an ISO 2709 string placed at its absolute position
  # (starting at index $offset) that holds the bucket's document count and
  # its term:
  #   - MARC21:  100$a (count) and 245$a (term)
  #   - UNIMARC: 200$f (count) and 200$a (term)
  #
  # $offset defaults to 0 (also when negative) and $results_per_page to 20,
  # the same default search() uses. When $query carries no aggregation the
  # result has zero hits and an empty record list. $query is modified: the
  # terms size of the scanned aggregation is set to 1000.
  sub _aggregation_scan {
      my ( $self, $query, $results_per_page, $offset ) = @_;

      my ($field) = keys %{ $query->{aggregations} // {} };
      if ( !defined $field ) {
          # This must be a list, not a hashref: assigning a hashref to a
          # hash produced a result without any `biblioserver` key.
          my %empty = ( biblioserver => { hits => 0, RECORDS => [] } );
          return ( undef, \%empty, undef );
      }

      # search_compat() passes the offset through unchecked for scans. A
      # negative value would index the buckets from the end and make the
      # assignment into @records die.
      $offset = 0 if !defined $offset || $offset < 0;
      $results_per_page //= 20;

      $query->{aggregations}{$field}{terms}{size} = 1000;

      # Only the aggregation matters, so ask for zero hits.
      my $results = $self->search( $query, 1, 0 );

      my $buckets = $results->{aggregations}{$field}{buckets} // [];
      my $count   = scalar @{$buckets};

      my $last = $offset + $results_per_page - 1;
      $last = $count - 1 if $last > $count - 1;

      # The flavour is the same for every bucket, so look it up once.
      my $is_unimarc = C4::Context->preference('marcflavour') eq 'UNIMARC';
      my ( $count_tag, $count_code ) = $is_unimarc ? ( 200, 'f' ) : ( 100, 'a' );
      my ( $term_tag,  $term_code )  = $is_unimarc ? ( 200, 'a' ) : ( 245, 'a' );

      # opac-search expects results at their absolute position in the
      # array, according to $offset.
      my @records;
      for my $index ( $offset .. $last ) {
          my $bucket = $buckets->[$index];

          my $marc = MARC::Record->new;
          $marc->encoding('UTF-8');
          $marc->append_fields(
              MARC::Field->new(
                  $count_tag, ' ', ' ', $count_code => $bucket->{doc_count}
              )
          );
          $marc->append_fields(
              MARC::Field->new(
                  $term_tag, ' ', ' ', $term_code => $bucket->{key}
              )
          );
          $records[$index] = $marc->as_usmarc();
      }

      # Consumers expect results keyed by server name; only the default
      # "biblioserver" is provided.
      my %result = (
          biblioserver => { hits => $count, RECORDS => \@records },
      );
      return ( undef, \%result, undef );
  }
  577 
  578 1;