"Fossies" - the Fresh Open Source Software Archive

Member "install-tl-20200916/tlpkg/tlperl/site/lib/HTML/PullParser.pm" (5 Apr 2016, 5681 Bytes) of package /windows/misc/install-tl.zip:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 package HTML::PullParser;
    2 
    3 require HTML::Parser;
    4 @ISA=qw(HTML::Parser);
    5 $VERSION = "3.57";
    6 
    7 use strict;
    8 use Carp ();
    9 
   # Constructor.  Takes the class name followed by key/value options.
   #
   # Recognised here:
   #   start, end, text, declaration, comment, process, default
   #       - argspec for the corresponding event; at least one is required
   #   file  - a file name, or something read() can be called on
   #   doc   - the document itself, plain or as a scalar reference
   # Exactly one of 'file' and 'doc' must be given.  Every other option is
   # handed on to the superclass constructor together with api_version => 3.
   #
   # Croaks on inconsistent options.  Returns the new object, or nothing
   # when a file name was given and IO::File could not open it.
   sub new
   {
       my $class  = shift;
       my %config = @_;

       # Pull the per-event argspecs out of the option list.
       my %spec_for;
       foreach my $event_name (qw(start end text declaration comment process default)) {
           my $spec = delete $config{$event_name};
           $spec_for{$event_name} = $spec if defined $spec;
       }
       %spec_for
           or Carp::croak("Info not collected for any events");

       # Work out where the document comes from.
       my $file_source = delete $config{file};
       my $doc_source  = delete $config{doc};
       my $has_file    = defined($file_source);
       my $has_doc     = defined($doc_source);
       if ($has_file && $has_doc) {
           Carp::croak("Can't parse from both 'doc' and 'file' at the same time");
       }
       if (!$has_file && !$has_doc) {
           Carp::croak("No 'doc' or 'file' given to parse from");
       }

       # Build the underlying parser object from the remaining options.
       $config{api_version} = 3;
       my $self = $class->SUPER::new(%config);

       # All requested events are registered against one shared array,
       # which get_token() later drains.
       my $queue = [];
       $self->{pullparser_accum} = $queue;
       for my $event_name (keys %spec_for) {
           $self->SUPER::handler($event_name => $queue, $spec_for{$event_name});
       }

       # In-memory document: remember a reference to it and the read offset.
       if ($has_doc) {
           $self->{pullparser_str_ref} = ref($doc_source) ? $doc_source : \$doc_source;
           $self->{pullparser_str_pos} = 0;
           return $self;
       }

       # A plain scalar that is not a glob is treated as a file name.
       my $is_file_name = !ref($file_source) && ref(\$file_source) ne "GLOB";
       if ($is_file_name) {
           require IO::File;
           $file_source = IO::File->new($file_source, "r") || return;
       }

       $self->{pullparser_file} = $file_source;
       return $self;
   }
   54 
   55 
   # Always croaks: handlers cannot be set on an HTML::PullParser through
   # this method.  The constructor registers its handlers by calling
   # SUPER::handler directly, which bypasses this override.
   sub handler
   {
       my $complaint = "Can't set handlers for HTML::PullParser";
       Carp::croak($complaint);
   }
   60 
   61 
   # Return the next queued token, feeding the parser in chunks of up to 512
   # characters from the file handle or the in-memory document until a token
   # is available or the input is exhausted.  Returns undef once the queue is
   # empty and end of input has been signalled.
   sub get_token
   {
       my ($self) = @_;
       my $chunk_size = 512;

       until (@{$self->{pullparser_accum}} || $self->{pullparser_eof}) {
           my $fh = $self->{pullparser_file};
           if ($fh) {
               # File source: hand the next chunk to the parser.
               my $data;
               my $got = read($fh, $data, $chunk_size);
               if ($got) {
                   $self->parse($data);
               }
               else {
                   # Nothing more could be read: finish parsing and drop the
                   # handle.
                   $self->eof;
                   $self->{pullparser_eof}++;
                   delete $self->{pullparser_file};
               }
               next;
           }

           # Neither a file nor a document left although end of input was
           # never flagged.
           my $doc_ref = $self->{pullparser_str_ref};
           die unless $doc_ref;

           # String source: parse the next slice and advance the offset.
           my $start = $self->{pullparser_str_pos};
           my $piece = substr($$doc_ref, $start, $chunk_size);
           $self->parse($piece);
           my $next = $start + length($piece);
           if ($next >= length($$doc_ref)) {
               # Whole document consumed: finish parsing and forget it.
               $self->eof;
               $self->{pullparser_eof}++;
               delete $self->{pullparser_str_ref};
               delete $self->{pullparser_str_pos};
           }
           else {
               $self->{pullparser_str_pos} = $next;
           }
       }

       return shift @{$self->{pullparser_accum}};
   }
   99 
  100 
  # Put the given tokens back at the front of the queue, in the order given,
  # so that get_token() hands them out again.  Returns the parser object.
  sub unget_token
  {
      my ($self, @tokens) = @_;
      unshift @{$self->{pullparser_accum}}, @tokens;
      return $self;
  }
  107 
  108 1;
  109 
  110 
  111 __END__
  112 
  113 =head1 NAME
  114 
  115 HTML::PullParser - Alternative HTML::Parser interface
  116 
  117 =head1 SYNOPSIS
  118 
  119  use HTML::PullParser;
  120 
  121  $p = HTML::PullParser->new(file => "index.html",
  122                             start => 'event, tagname, @attr',
  123                             end   => 'event, tagname',
  124                             ignore_elements => [qw(script style)],
  125                            ) || die "Can't open: $!";
  126  while (my $token = $p->get_token) {
  127      #...do something with $token
  128  }
  129 
  130 =head1 DESCRIPTION
  131 
  132 The HTML::PullParser is an alternative interface to the HTML::Parser class.
  133 It basically turns the HTML::Parser inside out.  You associate a file
  134 (or any IO::Handle object or string) with the parser at construction time and
  135 then repeatedly call $parser->get_token to obtain the tags and text
  136 found in the parsed document.
  137 
  138 The following methods are provided:
  139 
  140 =over 4
  141 
  142 =item $p = HTML::PullParser->new( file => $file, %options )
  143 
  144 =item $p = HTML::PullParser->new( doc => \$doc, %options )
  145 
  146 A C<HTML::PullParser> can be made to parse from either a file or a
  147 literal document based on whether the C<file> or C<doc> option is
  148 passed to the parser's constructor.
  149 
  150 The C<file> passed in can either be a file name or a file handle
  151 object.  If a file name is passed, and it can't be opened for reading,
  152 then the constructor will return an undefined value and $!  will tell
  153 you why it failed.  Otherwise the argument is taken to be some object
  154 that the C<HTML::PullParser> can read() from when it needs more data.
  155 The stream will be read() until EOF, but not closed.
  156 
  157 A C<doc> can be passed plain or as a reference
  158 to a scalar.  If a reference is passed then the value of this scalar
  159 should not be changed before all tokens have been extracted.
  160 
  161 Next the information to be returned for the different token types must
  162 be set up.  This is done by simply associating an argspec (as defined
  163 in L<HTML::Parser>) with the events you have an interest in.  For
  164 instance, if you want C<start> tokens to be reported as the string
  165 C<'S'> followed by the tagname and the attributes, you might pass a
  166 C<start> option like this:
  167 
  168    $p = HTML::PullParser->new(
  169           doc   => $document_to_parse,
  170           start => '"S", tagname, @attr',
  171           end   => '"E", tagname',
  172         );
  173 
  174 Finally, other C<HTML::Parser> options, such as C<ignore_tags> and
  175 C<unbroken_text>, can be passed in.  Note that you should not use the
  176 I<event>_h options to set up parser handlers.  That would confuse the
  177 inner logic of C<HTML::PullParser>.
  178 
  179 =item $token = $p->get_token
  180 
  181 This method will return the next I<token> found in the HTML document,
  182 or C<undef> at the end of the document.  The token is returned as an
  183 array reference.  The contents of this array match the argspec set up
  184 during C<HTML::PullParser> construction.
  185 
  186 =item $p->unget_token( @tokens )
  187 
  188 If you find out you have read too many tokens you can push them back,
  189 so that they are returned again the next time $p->get_token is called.
  190 
  191 =back
  192 
  193 =head1 EXAMPLES
  194 
  195 The 'eg/hform' script shows how we might parse the form section of
  196 HTML documents using HTML::PullParser.
  197 
  198 =head1 SEE ALSO
  199 
  200 L<HTML::Parser>, L<HTML::TokeParser>
  201 
  202 =head1 COPYRIGHT
  203 
  204 Copyright 1998-2001 Gisle Aas.
  205 
  206 This library is free software; you can redistribute it and/or
  207 modify it under the same terms as Perl itself.
  208 
  209 =cut