"Fossies" - the Fresh Open Source Software Archive

Member "Perl-Tidy-20200110/lib/Perl/Tidy/Tokenizer.pm" (7 Jan 2020, 288463 Bytes) of package /linux/misc/Perl-Tidy-20200110.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "Tokenizer.pm" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 20191203_vs_20200110.

    1 ########################################################################
    2 #
    3 # the Perl::Tidy::Tokenizer package is essentially a filter which
    4 # reads lines of perl source code from a source object and provides
    5 # corresponding tokenized lines through its get_line() method.  Lines
    6 # flow from the source_object to the caller like this:
    7 #
    8 # source_object --> LineBuffer_object --> Tokenizer -->  calling routine
    9 #   get_line()         get_line()           get_line()     line_of_tokens
   10 #
   11 # The source object can be any object with a get_line() method which
   12 # supplies one line (a character string) perl call.
   13 # The LineBuffer object is created by the Tokenizer.
   14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
   15 # containing one tokenized line for each call to its get_line() method.
   16 #
   17 # WARNING: This is not a real class yet.  Only one tokenizer my be used.
   18 #
   19 ########################################################################
   20 
   21 package Perl::Tidy::Tokenizer;
   22 use strict;
   23 use warnings;
   24 our $VERSION = '20200110';
   25 
   26 use Perl::Tidy::LineBuffer;
   27 
   28 BEGIN {
   29 
   30     # Caution: these debug flags produce a lot of output
   31     # They should all be 0 except when debugging small scripts
   32 
   33     use constant TOKENIZER_DEBUG_FLAG_EXPECT   => 0;
   34     use constant TOKENIZER_DEBUG_FLAG_NSCAN    => 0;
   35     use constant TOKENIZER_DEBUG_FLAG_QUOTE    => 0;
   36     use constant TOKENIZER_DEBUG_FLAG_SCAN_ID  => 0;
   37     use constant TOKENIZER_DEBUG_FLAG_TOKENIZE => 0;
   38 
   39     my $debug_warning = sub {
   40         print STDOUT "TOKENIZER_DEBUGGING with key $_[0]\n";
   41     };
   42 
   43     TOKENIZER_DEBUG_FLAG_EXPECT   && $debug_warning->('EXPECT');
   44     TOKENIZER_DEBUG_FLAG_NSCAN    && $debug_warning->('NSCAN');
   45     TOKENIZER_DEBUG_FLAG_QUOTE    && $debug_warning->('QUOTE');
   46     TOKENIZER_DEBUG_FLAG_SCAN_ID  && $debug_warning->('SCAN_ID');
   47     TOKENIZER_DEBUG_FLAG_TOKENIZE && $debug_warning->('TOKENIZE');
   48 
   49 }
   50 
   51 use Carp;
   52 
   53 # PACKAGE VARIABLES for processing an entire FILE.
   54 use vars qw{
   55   $tokenizer_self
   56 
   57   $last_nonblank_token
   58   $last_nonblank_type
   59   $last_nonblank_block_type
   60   $statement_type
   61   $in_attribute_list
   62   $current_package
   63   $context
   64 
   65   %is_constant
   66   %is_user_function
   67   %user_function_prototype
   68   %is_block_function
   69   %is_block_list_function
   70   %saw_function_definition
   71 
   72   $brace_depth
   73   $paren_depth
   74   $square_bracket_depth
   75 
   76   @current_depth
   77   @total_depth
   78   $total_depth
   79   @nesting_sequence_number
   80   @current_sequence_number
   81   @paren_type
   82   @paren_semicolon_count
   83   @paren_structural_type
   84   @brace_type
   85   @brace_structural_type
   86   @brace_context
   87   @brace_package
   88   @square_bracket_type
   89   @square_bracket_structural_type
   90   @depth_array
   91   @nested_ternary_flag
   92   @nested_statement_type
   93   @starting_line_of_current_depth
   94 };
   95 
   96 # GLOBAL CONSTANTS for routines in this package
   97 use vars qw{
   98   %is_indirect_object_taker
   99   %is_block_operator
  100   %expecting_operator_token
  101   %expecting_operator_types
  102   %expecting_term_types
  103   %expecting_term_token
  104   %is_digraph
  105   %is_file_test_operator
  106   %is_trigraph
  107   %is_tetragraph
  108   %is_valid_token_type
  109   %is_keyword
  110   %is_code_block_token
  111   %really_want_term
  112   @opening_brace_names
  113   @closing_brace_names
  114   %is_keyword_taking_list
  115   %is_keyword_taking_optional_args
  116   %is_q_qq_qw_qx_qr_s_y_tr_m
  117   %is_sub
  118   %is_package
  119 };
  120 
  121 # possible values of operator_expected()
  122 use constant TERM     => -1;
  123 use constant UNKNOWN  => 0;
  124 use constant OPERATOR => 1;
  125 
  126 # possible values of context
  127 use constant SCALAR_CONTEXT  => -1;
  128 use constant UNKNOWN_CONTEXT => 0;
  129 use constant LIST_CONTEXT    => 1;
  130 
  131 # Maximum number of little messages; probably need not be changed.
  132 use constant MAX_NAG_MESSAGES => 6;
  133 
  134 {
  135 
  136     # methods to count instances
  137     my $_count = 0;
  138     sub get_count        { return $_count; }
  139     sub _increment_count { return ++$_count }
  140     sub _decrement_count { return --$_count }
  141 }
  142 
  143 sub DESTROY {
  144     my $self = shift;
  145     $self->_decrement_count();
  146     return;
  147 }
  148 
  149 sub check_options {
  150 
  151     # Check Tokenizer parameters
  152     my $rOpts = shift;
  153 
  154     %is_sub = ();
  155     $is_sub{'sub'} = 1;
  156 
  157     # Install any aliases to 'sub'
  158     if ( $rOpts->{'sub-alias-list'} ) {
  159 
  160         # Note that any 'sub-alias-list' has been preprocessed to
  161         # be a trimmed, space-separated list which includes 'sub'
  162         # for example, it might be 'sub method fun'
  163         my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
  164         foreach my $word (@sub_alias_list) {
  165             $is_sub{$word} = 1;
  166         }
  167     }
  168     return;
  169 }
  170 
  171 sub new {
  172 
  173     my ( $class, @args ) = @_;
  174 
  175     # Note: 'tabs' and 'indent_columns' are temporary and should be
  176     # removed asap
  177     my %defaults = (
  178         source_object        => undef,
  179         debugger_object      => undef,
  180         diagnostics_object   => undef,
  181         logger_object        => undef,
  182         starting_level       => undef,
  183         indent_columns       => 4,
  184         tabsize              => 8,
  185         look_for_hash_bang   => 0,
  186         trim_qw              => 1,
  187         look_for_autoloader  => 1,
  188         look_for_selfloader  => 1,
  189         starting_line_number => 1,
  190         extended_syntax      => 0,
  191     );
  192     my %args = ( %defaults, @args );
  193 
  194     # we are given an object with a get_line() method to supply source lines
  195     my $source_object = $args{source_object};
  196 
  197     # we create another object with a get_line() and peek_ahead() method
  198     my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
  199 
  200     # Tokenizer state data is as follows:
  201     # _rhere_target_list    reference to list of here-doc targets
  202     # _here_doc_target      the target string for a here document
  203     # _here_quote_character the type of here-doc quoting (" ' ` or none)
  204     #                       to determine if interpolation is done
  205     # _quote_target         character we seek if chasing a quote
  206     # _line_start_quote     line where we started looking for a long quote
  207     # _in_here_doc          flag indicating if we are in a here-doc
  208     # _in_pod               flag set if we are in pod documentation
  209     # _in_error             flag set if we saw severe error (binary in script)
  210     # _in_data              flag set if we are in __DATA__ section
  211     # _in_end               flag set if we are in __END__ section
  212     # _in_format            flag set if we are in a format description
  213     # _in_attribute_list    flag telling if we are looking for attributes
  214     # _in_quote             flag telling if we are chasing a quote
  215     # _starting_level       indentation level of first line
  216     # _line_buffer_object   object with get_line() method to supply source code
  217     # _diagnostics_object   place to write debugging information
  218     # _unexpected_error_count  error count used to limit output
  219     # _lower_case_labels_at  line numbers where lower case labels seen
  220     # _hit_bug           program bug detected
  221     $tokenizer_self = {
  222         _rhere_target_list                  => [],
  223         _in_here_doc                        => 0,
  224         _here_doc_target                    => "",
  225         _here_quote_character               => "",
  226         _in_data                            => 0,
  227         _in_end                             => 0,
  228         _in_format                          => 0,
  229         _in_error                           => 0,
  230         _in_pod                             => 0,
  231         _in_attribute_list                  => 0,
  232         _in_quote                           => 0,
  233         _quote_target                       => "",
  234         _line_start_quote                   => -1,
  235         _starting_level                     => $args{starting_level},
  236         _know_starting_level                => defined( $args{starting_level} ),
  237         _tabsize                            => $args{tabsize},
  238         _indent_columns                     => $args{indent_columns},
  239         _look_for_hash_bang                 => $args{look_for_hash_bang},
  240         _trim_qw                            => $args{trim_qw},
  241         _continuation_indentation           => $args{continuation_indentation},
  242         _outdent_labels                     => $args{outdent_labels},
  243         _last_line_number                   => $args{starting_line_number} - 1,
  244         _saw_perl_dash_P                    => 0,
  245         _saw_perl_dash_w                    => 0,
  246         _saw_use_strict                     => 0,
  247         _saw_v_string                       => 0,
  248         _hit_bug                            => 0,
  249         _look_for_autoloader                => $args{look_for_autoloader},
  250         _look_for_selfloader                => $args{look_for_selfloader},
  251         _saw_autoloader                     => 0,
  252         _saw_selfloader                     => 0,
  253         _saw_hash_bang                      => 0,
  254         _saw_end                            => 0,
  255         _saw_data                           => 0,
  256         _saw_negative_indentation           => 0,
  257         _started_tokenizing                 => 0,
  258         _line_buffer_object                 => $line_buffer_object,
  259         _debugger_object                    => $args{debugger_object},
  260         _diagnostics_object                 => $args{diagnostics_object},
  261         _logger_object                      => $args{logger_object},
  262         _unexpected_error_count             => 0,
  263         _started_looking_for_here_target_at => 0,
  264         _nearly_matched_here_target_at      => undef,
  265         _line_text                          => "",
  266         _rlower_case_labels_at              => undef,
  267         _extended_syntax                    => $args{extended_syntax},
  268     };
  269 
  270     prepare_for_a_new_file();
  271     find_starting_indentation_level();
  272 
  273     bless $tokenizer_self, $class;
  274 
  275     # This is not a full class yet, so die if an attempt is made to
  276     # create more than one object.
  277 
  278     if ( _increment_count() > 1 ) {
  279         confess
  280 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
  281     }
  282 
  283     return $tokenizer_self;
  284 
  285 }
  286 
  287 # interface to Perl::Tidy::Logger routines
  288 sub warning {
  289     my $msg           = shift;
  290     my $logger_object = $tokenizer_self->{_logger_object};
  291     if ($logger_object) {
  292         $logger_object->warning($msg);
  293     }
  294     return;
  295 }
  296 
  297 sub complain {
  298     my $msg           = shift;
  299     my $logger_object = $tokenizer_self->{_logger_object};
  300     if ($logger_object) {
  301         $logger_object->complain($msg);
  302     }
  303     return;
  304 }
  305 
  306 sub write_logfile_entry {
  307     my $msg           = shift;
  308     my $logger_object = $tokenizer_self->{_logger_object};
  309     if ($logger_object) {
  310         $logger_object->write_logfile_entry($msg);
  311     }
  312     return;
  313 }
  314 
  315 sub interrupt_logfile {
  316     my $logger_object = $tokenizer_self->{_logger_object};
  317     if ($logger_object) {
  318         $logger_object->interrupt_logfile();
  319     }
  320     return;
  321 }
  322 
  323 sub resume_logfile {
  324     my $logger_object = $tokenizer_self->{_logger_object};
  325     if ($logger_object) {
  326         $logger_object->resume_logfile();
  327     }
  328     return;
  329 }
  330 
  331 sub increment_brace_error {
  332     my $logger_object = $tokenizer_self->{_logger_object};
  333     if ($logger_object) {
  334         $logger_object->increment_brace_error();
  335     }
  336     return;
  337 }
  338 
  339 sub report_definite_bug {
  340     $tokenizer_self->{_hit_bug} = 1;
  341     my $logger_object = $tokenizer_self->{_logger_object};
  342     if ($logger_object) {
  343         $logger_object->report_definite_bug();
  344     }
  345     return;
  346 }
  347 
  348 sub brace_warning {
  349     my $msg           = shift;
  350     my $logger_object = $tokenizer_self->{_logger_object};
  351     if ($logger_object) {
  352         $logger_object->brace_warning($msg);
  353     }
  354     return;
  355 }
  356 
  357 sub get_saw_brace_error {
  358     my $logger_object = $tokenizer_self->{_logger_object};
  359     if ($logger_object) {
  360         return $logger_object->get_saw_brace_error();
  361     }
  362     else {
  363         return 0;
  364     }
  365 }
  366 
  367 sub get_unexpected_error_count {
  368     my ($self) = shift;
  369     return $self->{_unexpected_error_count};
  370 }
  371 
  372 # interface to Perl::Tidy::Diagnostics routines
  373 sub write_diagnostics {
  374     my $msg = shift;
  375     if ( $tokenizer_self->{_diagnostics_object} ) {
  376         $tokenizer_self->{_diagnostics_object}->write_diagnostics($msg);
  377     }
  378     return;
  379 }
  380 
  381 sub report_tokenization_errors {
  382 
  383     my $self         = shift;
  384     my $severe_error = $self->{_in_error};
  385 
  386     my $level = get_indentation_level();
  387     if ( $level != $tokenizer_self->{_starting_level} ) {
  388         warning("final indentation level: $level\n");
  389     }
  390 
  391     check_final_nesting_depths();
  392 
  393     if ( $tokenizer_self->{_look_for_hash_bang}
  394         && !$tokenizer_self->{_saw_hash_bang} )
  395     {
  396         warning(
  397             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
  398     }
  399 
  400     if ( $tokenizer_self->{_in_format} ) {
  401         warning("hit EOF while in format description\n");
  402     }
  403 
  404     if ( $tokenizer_self->{_in_pod} ) {
  405 
  406         # Just write log entry if this is after __END__ or __DATA__
  407         # because this happens to often, and it is not likely to be
  408         # a parsing error.
  409         if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
  410             write_logfile_entry(
  411 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
  412             );
  413         }
  414 
  415         else {
  416             complain(
  417 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
  418             );
  419         }
  420 
  421     }
  422 
  423     if ( $tokenizer_self->{_in_here_doc} ) {
  424         $severe_error = 1;
  425         my $here_doc_target = $tokenizer_self->{_here_doc_target};
  426         my $started_looking_for_here_target_at =
  427           $tokenizer_self->{_started_looking_for_here_target_at};
  428         if ($here_doc_target) {
  429             warning(
  430 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
  431             );
  432         }
  433         else {
  434             warning(
  435 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
  436             );
  437         }
  438         my $nearly_matched_here_target_at =
  439           $tokenizer_self->{_nearly_matched_here_target_at};
  440         if ($nearly_matched_here_target_at) {
  441             warning(
  442 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
  443             );
  444         }
  445     }
  446 
  447     if ( $tokenizer_self->{_in_quote} ) {
  448         $severe_error = 1;
  449         my $line_start_quote = $tokenizer_self->{_line_start_quote};
  450         my $quote_target     = $tokenizer_self->{_quote_target};
  451         my $what =
  452           ( $tokenizer_self->{_in_attribute_list} )
  453           ? "attribute list"
  454           : "quote/pattern";
  455         warning(
  456 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
  457         );
  458     }
  459 
  460     if ( $tokenizer_self->{_hit_bug} ) {
  461         $severe_error = 1;
  462     }
  463 
  464     my $logger_object = $tokenizer_self->{_logger_object};
  465 
  466 # TODO: eventually may want to activate this to cause file to be output verbatim
  467     if (0) {
  468 
  469         # Set the severe error for a fairly high warning count because
  470         # some of the warnings do not harm formatting, such as duplicate
  471         # sub names.
  472         my $warning_count = $logger_object->{_warning_count};
  473         if ( $warning_count > 50 ) {
  474             $severe_error = 1;
  475         }
  476 
  477         # Brace errors are significant, so set the severe error flag at
  478         # a low number.
  479         my $saw_brace_error = $logger_object->{_saw_brace_error};
  480         if ( $saw_brace_error > 2 ) {
  481             $severe_error = 1;
  482         }
  483     }
  484 
  485     unless ( $tokenizer_self->{_saw_perl_dash_w} ) {
  486         if ( $] < 5.006 ) {
  487             write_logfile_entry("Suggest including '-w parameter'\n");
  488         }
  489         else {
  490             write_logfile_entry("Suggest including 'use warnings;'\n");
  491         }
  492     }
  493 
  494     if ( $tokenizer_self->{_saw_perl_dash_P} ) {
  495         write_logfile_entry("Use of -P parameter for defines is discouraged\n");
  496     }
  497 
  498     unless ( $tokenizer_self->{_saw_use_strict} ) {
  499         write_logfile_entry("Suggest including 'use strict;'\n");
  500     }
  501 
  502     # it is suggested that labels have at least one upper case character
  503     # for legibility and to avoid code breakage as new keywords are introduced
  504     if ( $tokenizer_self->{_rlower_case_labels_at} ) {
  505         my @lower_case_labels_at =
  506           @{ $tokenizer_self->{_rlower_case_labels_at} };
  507         write_logfile_entry(
  508             "Suggest using upper case characters in label(s)\n");
  509         local $" = ')(';
  510         write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
  511     }
  512     return $severe_error;
  513 }
  514 
  515 sub report_v_string {
  516 
  517     # warn if this version can't handle v-strings
  518     my $tok = shift;
  519     unless ( $tokenizer_self->{_saw_v_string} ) {
  520         $tokenizer_self->{_saw_v_string} = $tokenizer_self->{_last_line_number};
  521     }
  522     if ( $] < 5.006 ) {
  523         warning(
  524 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
  525         );
  526     }
  527     return;
  528 }
  529 
  530 sub get_input_line_number {
  531     return $tokenizer_self->{_last_line_number};
  532 }
  533 
  534 # returns the next tokenized line
  535 sub get_line {
  536 
  537     my $self = shift;
  538 
  539     # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
  540     # $square_bracket_depth, $paren_depth
  541 
  542     my $input_line = $tokenizer_self->{_line_buffer_object}->get_line();
  543     $tokenizer_self->{_line_text} = $input_line;
  544 
  545     return unless ($input_line);
  546 
  547     my $input_line_number = ++$tokenizer_self->{_last_line_number};
  548 
  549     # Find and remove what characters terminate this line, including any
  550     # control r
  551     my $input_line_separator = "";
  552     if ( chomp($input_line) ) { $input_line_separator = $/ }
  553 
  554     # TODO: what other characters should be included here?
  555     if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
  556         $input_line_separator = $2 . $input_line_separator;
  557     }
  558 
  559     # for backwards compatibility we keep the line text terminated with
  560     # a newline character
  561     $input_line .= "\n";
  562     $tokenizer_self->{_line_text} = $input_line;    # update
  563 
  564     # create a data structure describing this line which will be
  565     # returned to the caller.
  566 
  567     # _line_type codes are:
  568     #   SYSTEM         - system-specific code before hash-bang line
  569     #   CODE           - line of perl code (including comments)
  570     #   POD_START      - line starting pod, such as '=head'
  571     #   POD            - pod documentation text
  572     #   POD_END        - last line of pod section, '=cut'
  573     #   HERE           - text of here-document
  574     #   HERE_END       - last line of here-doc (target word)
  575     #   FORMAT         - format section
  576     #   FORMAT_END     - last line of format section, '.'
  577     #   DATA_START     - __DATA__ line
  578     #   DATA           - unidentified text following __DATA__
  579     #   END_START      - __END__ line
  580     #   END            - unidentified text following __END__
  581     #   ERROR          - we are in big trouble, probably not a perl script
  582 
  583     # Other variables:
  584     #   _curly_brace_depth     - depth of curly braces at start of line
  585     #   _square_bracket_depth  - depth of square brackets at start of line
  586     #   _paren_depth           - depth of parens at start of line
  587     #   _starting_in_quote     - this line continues a multi-line quote
  588     #                            (so don't trim leading blanks!)
  589     #   _ending_in_quote       - this line ends in a multi-line quote
  590     #                            (so don't trim trailing blanks!)
  591     my $line_of_tokens = {
  592         _line_type                 => 'EOF',
  593         _line_text                 => $input_line,
  594         _line_number               => $input_line_number,
  595         _rtoken_type               => undef,
  596         _rtokens                   => undef,
  597         _rlevels                   => undef,
  598         _rslevels                  => undef,
  599         _rblock_type               => undef,
  600         _rcontainer_type           => undef,
  601         _rcontainer_environment    => undef,
  602         _rtype_sequence            => undef,
  603         _rnesting_tokens           => undef,
  604         _rci_levels                => undef,
  605         _rnesting_blocks           => undef,
  606         _guessed_indentation_level => 0,
  607         _starting_in_quote    => 0,                    # to be set by subroutine
  608         _ending_in_quote      => 0,
  609         _curly_brace_depth    => $brace_depth,
  610         _square_bracket_depth => $square_bracket_depth,
  611         _paren_depth          => $paren_depth,
  612         _quote_character      => '',
  613     };
  614 
  615     # must print line unchanged if we are in a here document
  616     if ( $tokenizer_self->{_in_here_doc} ) {
  617 
  618         $line_of_tokens->{_line_type} = 'HERE';
  619         my $here_doc_target      = $tokenizer_self->{_here_doc_target};
  620         my $here_quote_character = $tokenizer_self->{_here_quote_character};
  621         my $candidate_target     = $input_line;
  622         chomp $candidate_target;
  623 
  624         # Handle <<~ targets, which are indicated here by a leading space on
  625         # the here quote character
  626         if ( $here_quote_character =~ /^\s/ ) {
  627             $candidate_target =~ s/^\s*//;
  628         }
  629         if ( $candidate_target eq $here_doc_target ) {
  630             $tokenizer_self->{_nearly_matched_here_target_at} = undef;
  631             $line_of_tokens->{_line_type}                     = 'HERE_END';
  632             write_logfile_entry("Exiting HERE document $here_doc_target\n");
  633 
  634             my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
  635             if ( @{$rhere_target_list} ) {  # there can be multiple here targets
  636                 ( $here_doc_target, $here_quote_character ) =
  637                   @{ shift @{$rhere_target_list} };
  638                 $tokenizer_self->{_here_doc_target} = $here_doc_target;
  639                 $tokenizer_self->{_here_quote_character} =
  640                   $here_quote_character;
  641                 write_logfile_entry(
  642                     "Entering HERE document $here_doc_target\n");
  643                 $tokenizer_self->{_nearly_matched_here_target_at} = undef;
  644                 $tokenizer_self->{_started_looking_for_here_target_at} =
  645                   $input_line_number;
  646             }
  647             else {
  648                 $tokenizer_self->{_in_here_doc}          = 0;
  649                 $tokenizer_self->{_here_doc_target}      = "";
  650                 $tokenizer_self->{_here_quote_character} = "";
  651             }
  652         }
  653 
  654         # check for error of extra whitespace
  655         # note for PERL6: leading whitespace is allowed
  656         else {
  657             $candidate_target =~ s/\s*$//;
  658             $candidate_target =~ s/^\s*//;
  659             if ( $candidate_target eq $here_doc_target ) {
  660                 $tokenizer_self->{_nearly_matched_here_target_at} =
  661                   $input_line_number;
  662             }
  663         }
  664         return $line_of_tokens;
  665     }
  666 
  667     # must print line unchanged if we are in a format section
  668     elsif ( $tokenizer_self->{_in_format} ) {
  669 
  670         if ( $input_line =~ /^\.[\s#]*$/ ) {
  671             write_logfile_entry("Exiting format section\n");
  672             $tokenizer_self->{_in_format} = 0;
  673             $line_of_tokens->{_line_type} = 'FORMAT_END';
  674         }
  675         else {
  676             $line_of_tokens->{_line_type} = 'FORMAT';
  677         }
  678         return $line_of_tokens;
  679     }
  680 
  681     # must print line unchanged if we are in pod documentation
  682     elsif ( $tokenizer_self->{_in_pod} ) {
  683 
  684         $line_of_tokens->{_line_type} = 'POD';
  685         if ( $input_line =~ /^=cut/ ) {
  686             $line_of_tokens->{_line_type} = 'POD_END';
  687             write_logfile_entry("Exiting POD section\n");
  688             $tokenizer_self->{_in_pod} = 0;
  689         }
  690         if ( $input_line =~ /^\#\!.*perl\b/ ) {
  691             warning(
  692                 "Hash-bang in pod can cause older versions of perl to fail! \n"
  693             );
  694         }
  695 
  696         return $line_of_tokens;
  697     }
  698 
  699     # must print line unchanged if we have seen a severe error (i.e., we
  700     # are seeing illegal tokens and cannot continue.  Syntax errors do
  701     # not pass this route).  Calling routine can decide what to do, but
  702     # the default can be to just pass all lines as if they were after __END__
  703     elsif ( $tokenizer_self->{_in_error} ) {
  704         $line_of_tokens->{_line_type} = 'ERROR';
  705         return $line_of_tokens;
  706     }
  707 
  708     # print line unchanged if we are __DATA__ section
  709     elsif ( $tokenizer_self->{_in_data} ) {
  710 
  711         # ...but look for POD
  712         # Note that the _in_data and _in_end flags remain set
  713         # so that we return to that state after seeing the
  714         # end of a pod section
  715         if ( $input_line =~ /^=(?!cut)/ ) {
  716             $line_of_tokens->{_line_type} = 'POD_START';
  717             write_logfile_entry("Entering POD section\n");
  718             $tokenizer_self->{_in_pod} = 1;
  719             return $line_of_tokens;
  720         }
  721         else {
  722             $line_of_tokens->{_line_type} = 'DATA';
  723             return $line_of_tokens;
  724         }
  725     }
  726 
  727     # print line unchanged if we are in __END__ section
  728     elsif ( $tokenizer_self->{_in_end} ) {
  729 
  730         # ...but look for POD
  731         # Note that the _in_data and _in_end flags remain set
  732         # so that we return to that state after seeing the
  733         # end of a pod section
  734         if ( $input_line =~ /^=(?!cut)/ ) {
  735             $line_of_tokens->{_line_type} = 'POD_START';
  736             write_logfile_entry("Entering POD section\n");
  737             $tokenizer_self->{_in_pod} = 1;
  738             return $line_of_tokens;
  739         }
  740         else {
  741             $line_of_tokens->{_line_type} = 'END';
  742             return $line_of_tokens;
  743         }
  744     }
  745 
  746     # check for a hash-bang line if we haven't seen one
  747     if ( !$tokenizer_self->{_saw_hash_bang} ) {
  748         if ( $input_line =~ /^\#\!.*perl\b/ ) {
  749             $tokenizer_self->{_saw_hash_bang} = $input_line_number;
  750 
  751             # check for -w and -P flags
  752             if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
  753                 $tokenizer_self->{_saw_perl_dash_P} = 1;
  754             }
  755 
  756             if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
  757                 $tokenizer_self->{_saw_perl_dash_w} = 1;
  758             }
  759 
  760             if (
  761                 ( $input_line_number > 1 )
  762 
  763                 # leave any hash bang in a BEGIN block alone
  764                 # i.e. see 'debugger-duck_type.t'
  765                 && !(
  766                        $last_nonblank_block_type
  767                     && $last_nonblank_block_type eq 'BEGIN'
  768                 )
  769                 && ( !$tokenizer_self->{_look_for_hash_bang} )
  770               )
  771             {
  772 
  773                 # this is helpful for VMS systems; we may have accidentally
  774                 # tokenized some DCL commands
  775                 if ( $tokenizer_self->{_started_tokenizing} ) {
  776                     warning(
  777 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
  778                     );
  779                 }
  780                 else {
  781                     complain("Useless hash-bang after line 1\n");
  782                 }
  783             }
  784 
  785             # Report the leading hash-bang as a system line
  786             # This will prevent -dac from deleting it
  787             else {
  788                 $line_of_tokens->{_line_type} = 'SYSTEM';
  789                 return $line_of_tokens;
  790             }
  791         }
  792     }
  793 
  794     # wait for a hash-bang before parsing if the user invoked us with -x
  795     if ( $tokenizer_self->{_look_for_hash_bang}
  796         && !$tokenizer_self->{_saw_hash_bang} )
  797     {
  798         $line_of_tokens->{_line_type} = 'SYSTEM';
  799         return $line_of_tokens;
  800     }
  801 
  802     # a first line of the form ': #' will be marked as SYSTEM
  803     # since lines of this form may be used by tcsh
  804     if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
  805         $line_of_tokens->{_line_type} = 'SYSTEM';
  806         return $line_of_tokens;
  807     }
  808 
  809     # now we know that it is ok to tokenize the line...
  810     # the line tokenizer will modify any of these private variables:
  811     #        _rhere_target_list
  812     #        _in_data
  813     #        _in_end
  814     #        _in_format
  815     #        _in_error
  816     #        _in_pod
  817     #        _in_quote
  818     my $ending_in_quote_last = $tokenizer_self->{_in_quote};
  819     tokenize_this_line($line_of_tokens);
  820 
  821     # Now finish defining the return structure and return it
  822     $line_of_tokens->{_ending_in_quote} = $tokenizer_self->{_in_quote};
  823 
  824     # handle severe error (binary data in script)
  825     if ( $tokenizer_self->{_in_error} ) {
  826         $tokenizer_self->{_in_quote} = 0;    # to avoid any more messages
  827         warning("Giving up after error\n");
  828         $line_of_tokens->{_line_type} = 'ERROR';
  829         reset_indentation_level(0);          # avoid error messages
  830         return $line_of_tokens;
  831     }
  832 
  833     # handle start of pod documentation
  834     if ( $tokenizer_self->{_in_pod} ) {
  835 
  836         # This gets tricky..above a __DATA__ or __END__ section, perl
  837         # accepts '=cut' as the start of pod section. But afterwards,
  838         # only pod utilities see it and they may ignore an =cut without
  839         # leading =head.  In any case, this isn't good.
  840         if ( $input_line =~ /^=cut\b/ ) {
  841             if ( $tokenizer_self->{_saw_data} || $tokenizer_self->{_saw_end} ) {
  842                 complain("=cut while not in pod ignored\n");
  843                 $tokenizer_self->{_in_pod}    = 0;
  844                 $line_of_tokens->{_line_type} = 'POD_END';
  845             }
  846             else {
  847                 $line_of_tokens->{_line_type} = 'POD_START';
  848                 complain(
  849 "=cut starts a pod section .. this can fool pod utilities.\n"
  850                 );
  851                 write_logfile_entry("Entering POD section\n");
  852             }
  853         }
  854 
  855         else {
  856             $line_of_tokens->{_line_type} = 'POD_START';
  857             write_logfile_entry("Entering POD section\n");
  858         }
  859 
  860         return $line_of_tokens;
  861     }
  862 
  863     # update indentation levels for log messages
  864     if ( $input_line !~ /^\s*$/ ) {
  865         my $rlevels = $line_of_tokens->{_rlevels};
  866         $line_of_tokens->{_guessed_indentation_level} =
  867           guess_old_indentation_level($input_line);
  868     }
  869 
  870     # see if this line contains here doc targets
  871     my $rhere_target_list = $tokenizer_self->{_rhere_target_list};
  872     if ( @{$rhere_target_list} ) {
  873 
  874         my ( $here_doc_target, $here_quote_character ) =
  875           @{ shift @{$rhere_target_list} };
  876         $tokenizer_self->{_in_here_doc}          = 1;
  877         $tokenizer_self->{_here_doc_target}      = $here_doc_target;
  878         $tokenizer_self->{_here_quote_character} = $here_quote_character;
  879         write_logfile_entry("Entering HERE document $here_doc_target\n");
  880         $tokenizer_self->{_started_looking_for_here_target_at} =
  881           $input_line_number;
  882     }
  883 
  884     # NOTE: __END__ and __DATA__ statements are written unformatted
  885     # because they can theoretically contain additional characters
  886     # which are not tokenized (and cannot be read with <DATA> either!).
  887     if ( $tokenizer_self->{_in_data} ) {
  888         $line_of_tokens->{_line_type} = 'DATA_START';
  889         write_logfile_entry("Starting __DATA__ section\n");
  890         $tokenizer_self->{_saw_data} = 1;
  891 
  892         # keep parsing after __DATA__ if use SelfLoader was seen
  893         if ( $tokenizer_self->{_saw_selfloader} ) {
  894             $tokenizer_self->{_in_data} = 0;
  895             write_logfile_entry(
  896                 "SelfLoader seen, continuing; -nlsl deactivates\n");
  897         }
  898 
  899         return $line_of_tokens;
  900     }
  901 
  902     elsif ( $tokenizer_self->{_in_end} ) {
  903         $line_of_tokens->{_line_type} = 'END_START';
  904         write_logfile_entry("Starting __END__ section\n");
  905         $tokenizer_self->{_saw_end} = 1;
  906 
  907         # keep parsing after __END__ if use AutoLoader was seen
  908         if ( $tokenizer_self->{_saw_autoloader} ) {
  909             $tokenizer_self->{_in_end} = 0;
  910             write_logfile_entry(
  911                 "AutoLoader seen, continuing; -nlal deactivates\n");
  912         }
  913         return $line_of_tokens;
  914     }
  915 
  916     # now, finally, we know that this line is type 'CODE'
  917     $line_of_tokens->{_line_type} = 'CODE';
  918 
  919     # remember if we have seen any real code
  920     if (  !$tokenizer_self->{_started_tokenizing}
  921         && $input_line !~ /^\s*$/
  922         && $input_line !~ /^\s*#/ )
  923     {
  924         $tokenizer_self->{_started_tokenizing} = 1;
  925     }
  926 
  927     if ( $tokenizer_self->{_debugger_object} ) {
  928         $tokenizer_self->{_debugger_object}->write_debug_entry($line_of_tokens);
  929     }
  930 
  931     # Note: if keyword 'format' occurs in this line code, it is still CODE
  932     # (keyword 'format' need not start a line)
  933     if ( $tokenizer_self->{_in_format} ) {
  934         write_logfile_entry("Entering format section\n");
  935     }
  936 
  937     if ( $tokenizer_self->{_in_quote}
  938         and ( $tokenizer_self->{_line_start_quote} < 0 ) )
  939     {
  940 
  941         #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
  942         if (
  943             ( my $quote_target = $tokenizer_self->{_quote_target} ) !~ /^\s*$/ )
  944         {
  945             $tokenizer_self->{_line_start_quote} = $input_line_number;
  946             write_logfile_entry(
  947                 "Start multi-line quote or pattern ending in $quote_target\n");
  948         }
  949     }
  950     elsif ( ( $tokenizer_self->{_line_start_quote} >= 0 )
  951         && !$tokenizer_self->{_in_quote} )
  952     {
  953         $tokenizer_self->{_line_start_quote} = -1;
  954         write_logfile_entry("End of multi-line quote or pattern\n");
  955     }
  956 
  957     # we are returning a line of CODE
  958     return $line_of_tokens;
  959 }
  960 
  961 sub find_starting_indentation_level {
  962 
  963     # We need to find the indentation level of the first line of the
  964     # script being formatted.  Often it will be zero for an entire file,
  965     # but if we are formatting a local block of code (within an editor for
  966     # example) it may not be zero.  The user may specify this with the
  967     # -sil=n parameter but normally doesn't so we have to guess.
  968     #
  969     # USES GLOBAL VARIABLES: $tokenizer_self
  970     my $starting_level = 0;
  971 
  972     # use value if given as parameter
  973     if ( $tokenizer_self->{_know_starting_level} ) {
  974         $starting_level = $tokenizer_self->{_starting_level};
  975     }
  976 
  977     # if we know there is a hash_bang line, the level must be zero
  978     elsif ( $tokenizer_self->{_look_for_hash_bang} ) {
  979         $tokenizer_self->{_know_starting_level} = 1;
  980     }
  981 
  982     # otherwise figure it out from the input file
  983     else {
  984         my $line;
  985         my $i = 0;
  986 
  987         # keep looking at lines until we find a hash bang or piece of code
  988         my $msg = "";
  989         while ( $line =
  990             $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
  991         {
  992 
  993             # if first line is #! then assume starting level is zero
  994             if ( $i == 1 && $line =~ /^\#\!/ ) {
  995                 $starting_level = 0;
  996                 last;
  997             }
  998             next if ( $line =~ /^\s*#/ );    # skip past comments
  999             next if ( $line =~ /^\s*$/ );    # skip past blank lines
 1000             $starting_level = guess_old_indentation_level($line);
 1001             last;
 1002         }
 1003         $msg = "Line $i implies starting-indentation-level = $starting_level\n";
 1004         write_logfile_entry("$msg");
 1005     }
 1006     $tokenizer_self->{_starting_level} = $starting_level;
 1007     reset_indentation_level($starting_level);
 1008     return;
 1009 }
 1010 
 1011 sub guess_old_indentation_level {
 1012     my ($line) = @_;
 1013 
 1014     # Guess the indentation level of an input line.
 1015     #
 1016     # For the first line of code this result will define the starting
 1017     # indentation level.  It will mainly be non-zero when perltidy is applied
 1018     # within an editor to a local block of code.
 1019     #
 1020     # This is an impossible task in general because we can't know what tabs
 1021     # meant for the old script and how many spaces were used for one
 1022     # indentation level in the given input script.  For example it may have
 1023     # been previously formatted with -i=7 -et=3.  But we can at least try to
 1024     # make sure that perltidy guesses correctly if it is applied repeatedly to
 1025     # a block of code within an editor, so that the block stays at the same
 1026     # level when perltidy is applied repeatedly.
 1027     #
 1028     # USES GLOBAL VARIABLES: $tokenizer_self
 1029     my $level = 0;
 1030 
 1031     # find leading tabs, spaces, and any statement label
 1032     my $spaces = 0;
 1033     if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
 1034 
 1035         # If there are leading tabs, we use the tab scheme for this run, if
 1036         # any, so that the code will remain stable when editing.
 1037         if ($1) { $spaces += length($1) * $tokenizer_self->{_tabsize} }
 1038 
 1039         if ($2) { $spaces += length($2) }
 1040 
 1041         # correct for outdented labels
 1042         if ( $3 && $tokenizer_self->{'_outdent_labels'} ) {
 1043             $spaces += $tokenizer_self->{_continuation_indentation};
 1044         }
 1045     }
 1046 
 1047     # compute indentation using the value of -i for this run.
 1048     # If -i=0 is used for this run (which is possible) it doesn't matter
 1049     # what we do here but we'll guess that the old run used 4 spaces per level.
 1050     my $indent_columns = $tokenizer_self->{_indent_columns};
 1051     $indent_columns = 4 if ( !$indent_columns );
 1052     $level          = int( $spaces / $indent_columns );
 1053     return ($level);
 1054 }
 1055 
 1056 # This is a currently unused debug routine
 1057 sub dump_functions {
 1058 
 1059     my $fh = *STDOUT;
 1060     foreach my $pkg ( keys %is_user_function ) {
 1061         print $fh "\nnon-constant subs in package $pkg\n";
 1062 
 1063         foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
 1064             my $msg = "";
 1065             if ( $is_block_list_function{$pkg}{$sub} ) {
 1066                 $msg = 'block_list';
 1067             }
 1068 
 1069             if ( $is_block_function{$pkg}{$sub} ) {
 1070                 $msg = 'block';
 1071             }
 1072             print $fh "$sub $msg\n";
 1073         }
 1074     }
 1075 
 1076     foreach my $pkg ( keys %is_constant ) {
 1077         print $fh "\nconstants and constant subs in package $pkg\n";
 1078 
 1079         foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
 1080             print $fh "$sub\n";
 1081         }
 1082     }
 1083     return;
 1084 }
 1085 
 1086 sub ones_count {
 1087 
 1088     # count number of 1's in a string of 1's and 0's
 1089     # example: ones_count("010101010101") gives 6
 1090     my $str = shift;
 1091     return $str =~ tr/1/0/;
 1092 }
 1093 
 1094 sub prepare_for_a_new_file {
 1095 
 1096     # previous tokens needed to determine what to expect next
 1097     $last_nonblank_token      = ';';    # the only possible starting state which
 1098     $last_nonblank_type       = ';';    # will make a leading brace a code block
 1099     $last_nonblank_block_type = '';
 1100 
 1101     # scalars for remembering statement types across multiple lines
 1102     $statement_type    = '';            # '' or 'use' or 'sub..' or 'case..'
 1103     $in_attribute_list = 0;
 1104 
 1105     # scalars for remembering where we are in the file
 1106     $current_package = "main";
 1107     $context         = UNKNOWN_CONTEXT;
 1108 
 1109     # hashes used to remember function information
 1110     %is_constant             = ();      # user-defined constants
 1111     %is_user_function        = ();      # user-defined functions
 1112     %user_function_prototype = ();      # their prototypes
 1113     %is_block_function       = ();
 1114     %is_block_list_function  = ();
 1115     %saw_function_definition = ();
 1116 
 1117     # variables used to track depths of various containers
 1118     # and report nesting errors
 1119     $paren_depth                         = 0;
 1120     $brace_depth                         = 0;
 1121     $square_bracket_depth                = 0;
 1122     @current_depth                       = (0) x scalar @closing_brace_names;
 1123     $total_depth                         = 0;
 1124     @total_depth                         = ();
 1125     @nesting_sequence_number             = ( 0 .. @closing_brace_names - 1 );
 1126     @current_sequence_number             = ();
 1127     $paren_type[$paren_depth]            = '';
 1128     $paren_semicolon_count[$paren_depth] = 0;
 1129     $paren_structural_type[$brace_depth] = '';
 1130     $brace_type[$brace_depth] = ';';    # identify opening brace as code block
 1131     $brace_structural_type[$brace_depth]                   = '';
 1132     $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
 1133     $brace_package[$paren_depth]                           = $current_package;
 1134     $square_bracket_type[$square_bracket_depth]            = '';
 1135     $square_bracket_structural_type[$square_bracket_depth] = '';
 1136 
 1137     initialize_tokenizer_state();
 1138     return;
 1139 }
 1140 
 1141 {                                       # begin tokenize_this_line
 1142 
 1143     use constant BRACE          => 0;
 1144     use constant SQUARE_BRACKET => 1;
 1145     use constant PAREN          => 2;
 1146     use constant QUESTION_COLON => 3;
 1147 
 1148     # TV1: scalars for processing one LINE.
 1149     # Re-initialized on each entry to sub tokenize_this_line.
 1150     my (
 1151         $block_type,        $container_type,    $expecting,
 1152         $i,                 $i_tok,             $input_line,
 1153         $input_line_number, $last_nonblank_i,   $max_token_index,
 1154         $next_tok,          $next_type,         $peeked_ahead,
 1155         $prototype,         $rhere_target_list, $rtoken_map,
 1156         $rtoken_type,       $rtokens,           $tok,
 1157         $type,              $type_sequence,     $indent_flag,
 1158     );
 1159 
 1160     # TV2: refs to ARRAYS for processing one LINE
 1161     # Re-initialized on each call.
 1162     my $routput_token_list     = [];    # stack of output token indexes
 1163     my $routput_token_type     = [];    # token types
 1164     my $routput_block_type     = [];    # types of code block
 1165     my $routput_container_type = [];    # paren types, such as if, elsif, ..
 1166     my $routput_type_sequence  = [];    # nesting sequential number
 1167     my $routput_indent_flag    = [];    #
 1168 
 1169     # TV3: SCALARS for quote variables.  These are initialized with a
 1170     # subroutine call and continually updated as lines are processed.
 1171     my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
 1172         $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
 1173 
 1174     # TV4: SCALARS for multi-line identifiers and
 1175     # statements. These are initialized with a subroutine call
 1176     # and continually updated as lines are processed.
 1177     my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
 1178 
 1179     # TV5: SCALARS for tracking indentation level.
 1180     # Initialized once and continually updated as lines are
 1181     # processed.
 1182     my (
 1183         $nesting_token_string,      $nesting_type_string,
 1184         $nesting_block_string,      $nesting_block_flag,
 1185         $nesting_list_string,       $nesting_list_flag,
 1186         $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1187         $in_statement_continuation, $level_in_tokenizer,
 1188         $slevel_in_tokenizer,       $rslevel_stack,
 1189     );
 1190 
 1191     # TV6: SCALARS for remembering several previous
 1192     # tokens. Initialized once and continually updated as
 1193     # lines are processed.
 1194     my (
 1195         $last_nonblank_container_type,     $last_nonblank_type_sequence,
 1196         $last_last_nonblank_token,         $last_last_nonblank_type,
 1197         $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
 1198         $last_last_nonblank_type_sequence, $last_nonblank_prototype,
 1199     );
 1200 
 1201     # ----------------------------------------------------------------
 1202     # beginning of tokenizer variable access and manipulation routines
 1203     # ----------------------------------------------------------------
 1204 
 1205     sub initialize_tokenizer_state {
 1206 
 1207         # TV1: initialized on each call
 1208         # TV2: initialized on each call
 1209         # TV3:
 1210         $in_quote                = 0;
 1211         $quote_type              = 'Q';
 1212         $quote_character         = "";
 1213         $quote_pos               = 0;
 1214         $quote_depth             = 0;
 1215         $quoted_string_1         = "";
 1216         $quoted_string_2         = "";
 1217         $allowed_quote_modifiers = "";
 1218 
 1219         # TV4:
 1220         $id_scan_state     = '';
 1221         $identifier        = '';
 1222         $want_paren        = "";
 1223         $indented_if_level = 0;
 1224 
 1225         # TV5:
 1226         $nesting_token_string             = "";
 1227         $nesting_type_string              = "";
 1228         $nesting_block_string             = '1';    # initially in a block
 1229         $nesting_block_flag               = 1;
 1230         $nesting_list_string              = '0';    # initially not in a list
 1231         $nesting_list_flag                = 0;      # initially not in a list
 1232         $ci_string_in_tokenizer           = "";
 1233         $continuation_string_in_tokenizer = "0";
 1234         $in_statement_continuation        = 0;
 1235         $level_in_tokenizer               = 0;
 1236         $slevel_in_tokenizer              = 0;
 1237         $rslevel_stack                    = [];
 1238 
 1239         # TV6:
 1240         $last_nonblank_container_type      = '';
 1241         $last_nonblank_type_sequence       = '';
 1242         $last_last_nonblank_token          = ';';
 1243         $last_last_nonblank_type           = ';';
 1244         $last_last_nonblank_block_type     = '';
 1245         $last_last_nonblank_container_type = '';
 1246         $last_last_nonblank_type_sequence  = '';
 1247         $last_nonblank_prototype           = "";
 1248         return;
 1249     }
 1250 
 1251     sub save_tokenizer_state {
 1252 
 1253         my $rTV1 = [
 1254             $block_type,        $container_type,    $expecting,
 1255             $i,                 $i_tok,             $input_line,
 1256             $input_line_number, $last_nonblank_i,   $max_token_index,
 1257             $next_tok,          $next_type,         $peeked_ahead,
 1258             $prototype,         $rhere_target_list, $rtoken_map,
 1259             $rtoken_type,       $rtokens,           $tok,
 1260             $type,              $type_sequence,     $indent_flag,
 1261         ];
 1262 
 1263         my $rTV2 = [
 1264             $routput_token_list,    $routput_token_type,
 1265             $routput_block_type,    $routput_container_type,
 1266             $routput_type_sequence, $routput_indent_flag,
 1267         ];
 1268 
 1269         my $rTV3 = [
 1270             $in_quote,        $quote_type,
 1271             $quote_character, $quote_pos,
 1272             $quote_depth,     $quoted_string_1,
 1273             $quoted_string_2, $allowed_quote_modifiers,
 1274         ];
 1275 
 1276         my $rTV4 =
 1277           [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
 1278 
 1279         my $rTV5 = [
 1280             $nesting_token_string,      $nesting_type_string,
 1281             $nesting_block_string,      $nesting_block_flag,
 1282             $nesting_list_string,       $nesting_list_flag,
 1283             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1284             $in_statement_continuation, $level_in_tokenizer,
 1285             $slevel_in_tokenizer,       $rslevel_stack,
 1286         ];
 1287 
 1288         my $rTV6 = [
 1289             $last_nonblank_container_type,
 1290             $last_nonblank_type_sequence,
 1291             $last_last_nonblank_token,
 1292             $last_last_nonblank_type,
 1293             $last_last_nonblank_block_type,
 1294             $last_last_nonblank_container_type,
 1295             $last_last_nonblank_type_sequence,
 1296             $last_nonblank_prototype,
 1297         ];
 1298         return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
 1299     }
 1300 
 1301     sub restore_tokenizer_state {
 1302         my ($rstate) = @_;
 1303         my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
 1304         (
 1305             $block_type,        $container_type,    $expecting,
 1306             $i,                 $i_tok,             $input_line,
 1307             $input_line_number, $last_nonblank_i,   $max_token_index,
 1308             $next_tok,          $next_type,         $peeked_ahead,
 1309             $prototype,         $rhere_target_list, $rtoken_map,
 1310             $rtoken_type,       $rtokens,           $tok,
 1311             $type,              $type_sequence,     $indent_flag,
 1312         ) = @{$rTV1};
 1313 
 1314         (
 1315             $routput_token_list,    $routput_token_type,
 1316             $routput_block_type,    $routput_container_type,
 1317             $routput_type_sequence, $routput_type_sequence,
 1318         ) = @{$rTV2};
 1319 
 1320         (
 1321             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
 1322             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
 1323         ) = @{$rTV3};
 1324 
 1325         ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
 1326           @{$rTV4};
 1327 
 1328         (
 1329             $nesting_token_string,      $nesting_type_string,
 1330             $nesting_block_string,      $nesting_block_flag,
 1331             $nesting_list_string,       $nesting_list_flag,
 1332             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1333             $in_statement_continuation, $level_in_tokenizer,
 1334             $slevel_in_tokenizer,       $rslevel_stack,
 1335         ) = @{$rTV5};
 1336 
 1337         (
 1338             $last_nonblank_container_type,
 1339             $last_nonblank_type_sequence,
 1340             $last_last_nonblank_token,
 1341             $last_last_nonblank_type,
 1342             $last_last_nonblank_block_type,
 1343             $last_last_nonblank_container_type,
 1344             $last_last_nonblank_type_sequence,
 1345             $last_nonblank_prototype,
 1346         ) = @{$rTV6};
 1347         return;
 1348     }
 1349 
 1350     sub get_indentation_level {
 1351 
 1352         # patch to avoid reporting error if indented if is not terminated
 1353         if ($indented_if_level) { return $level_in_tokenizer - 1 }
 1354         return $level_in_tokenizer;
 1355     }
 1356 
 1357     sub reset_indentation_level {
 1358         $level_in_tokenizer = $slevel_in_tokenizer = shift;
 1359         push @{$rslevel_stack}, $slevel_in_tokenizer;
 1360         return;
 1361     }
 1362 
 1363     sub peeked_ahead {
 1364         my $flag = shift;
 1365         $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
 1366         return $peeked_ahead;
 1367     }
 1368 
 1369     # ------------------------------------------------------------
 1370     # end of tokenizer variable access and manipulation routines
 1371     # ------------------------------------------------------------
 1372 
 1373     # ------------------------------------------------------------
 1374     # beginning of various scanner interface routines
 1375     # ------------------------------------------------------------
 1376     sub scan_replacement_text {
 1377 
 1378         # check for here-docs in replacement text invoked by
 1379         # a substitution operator with executable modifier 'e'.
 1380         #
 1381         # given:
 1382         #  $replacement_text
 1383         # return:
 1384         #  $rht = reference to any here-doc targets
 1385         my ($replacement_text) = @_;
 1386 
 1387         # quick check
 1388         return unless ( $replacement_text =~ /<</ );
 1389 
 1390         write_logfile_entry("scanning replacement text for here-doc targets\n");
 1391 
 1392         # save the logger object for error messages
 1393         my $logger_object = $tokenizer_self->{_logger_object};
 1394 
 1395         # localize all package variables
 1396         local (
 1397             $tokenizer_self,                 $last_nonblank_token,
 1398             $last_nonblank_type,             $last_nonblank_block_type,
 1399             $statement_type,                 $in_attribute_list,
 1400             $current_package,                $context,
 1401             %is_constant,                    %is_user_function,
 1402             %user_function_prototype,        %is_block_function,
 1403             %is_block_list_function,         %saw_function_definition,
 1404             $brace_depth,                    $paren_depth,
 1405             $square_bracket_depth,           @current_depth,
 1406             @total_depth,                    $total_depth,
 1407             @nesting_sequence_number,        @current_sequence_number,
 1408             @paren_type,                     @paren_semicolon_count,
 1409             @paren_structural_type,          @brace_type,
 1410             @brace_structural_type,          @brace_context,
 1411             @brace_package,                  @square_bracket_type,
 1412             @square_bracket_structural_type, @depth_array,
 1413             @starting_line_of_current_depth, @nested_ternary_flag,
 1414             @nested_statement_type,
 1415         );
 1416 
 1417         # save all lexical variables
 1418         my $rstate = save_tokenizer_state();
 1419         _decrement_count();    # avoid error check for multiple tokenizers
 1420 
 1421         # make a new tokenizer
 1422         my $rOpts = {};
 1423         my $rpending_logfile_message;
 1424         my $source_object =
 1425           Perl::Tidy::LineSource->new( \$replacement_text, $rOpts,
 1426             $rpending_logfile_message );
 1427         my $tokenizer = Perl::Tidy::Tokenizer->new(
 1428             source_object        => $source_object,
 1429             logger_object        => $logger_object,
 1430             starting_line_number => $input_line_number,
 1431         );
 1432 
 1433         # scan the replacement text
 1434         1 while ( $tokenizer->get_line() );
 1435 
 1436         # remove any here doc targets
 1437         my $rht = undef;
 1438         if ( $tokenizer_self->{_in_here_doc} ) {
 1439             $rht = [];
 1440             push @{$rht},
 1441               [
 1442                 $tokenizer_self->{_here_doc_target},
 1443                 $tokenizer_self->{_here_quote_character}
 1444               ];
 1445             if ( $tokenizer_self->{_rhere_target_list} ) {
 1446                 push @{$rht}, @{ $tokenizer_self->{_rhere_target_list} };
 1447                 $tokenizer_self->{_rhere_target_list} = undef;
 1448             }
 1449             $tokenizer_self->{_in_here_doc} = undef;
 1450         }
 1451 
 1452         # now its safe to report errors
 1453         my $severe_error = $tokenizer->report_tokenization_errors();
 1454 
 1455         # TODO: Could propagate a severe error up
 1456 
 1457         # restore all tokenizer lexical variables
 1458         restore_tokenizer_state($rstate);
 1459 
 1460         # return the here doc targets
 1461         return $rht;
 1462     }
 1463 
 1464     sub scan_bare_identifier {
 1465         ( $i, $tok, $type, $prototype ) =
 1466           scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
 1467             $rtoken_map, $max_token_index );
 1468         return;
 1469     }
 1470 
 1471     sub scan_identifier {
 1472         ( $i, $tok, $type, $id_scan_state, $identifier ) =
 1473           scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
 1474             $max_token_index, $expecting, $paren_type[$paren_depth] );
 1475         return;
 1476     }
 1477 
 1478     sub scan_id {
 1479         ( $i, $tok, $type, $id_scan_state ) =
 1480           scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
 1481             $id_scan_state, $max_token_index );
 1482         return;
 1483     }
 1484 
 1485     sub scan_number {
 1486         my $number;
 1487         ( $i, $type, $number ) =
 1488           scan_number_do( $input_line, $i, $rtoken_map, $type,
 1489             $max_token_index );
 1490         return $number;
 1491     }
 1492 
 1493     # a sub to warn if token found where term expected
 1494     sub error_if_expecting_TERM {
 1495         if ( $expecting == TERM ) {
 1496             if ( $really_want_term{$last_nonblank_type} ) {
 1497                 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
 1498                     $rtoken_map, $rtoken_type, $input_line );
 1499                 return 1;
 1500             }
 1501         }
 1502         return;
 1503     }
 1504 
 1505     # a sub to warn if token found where operator expected
 1506     sub error_if_expecting_OPERATOR {
 1507         my $thing = shift;
 1508         if ( $expecting == OPERATOR ) {
 1509             if ( !defined($thing) ) { $thing = $tok }
 1510             report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
 1511                 $rtoken_map, $rtoken_type, $input_line );
 1512             if ( $i_tok == 0 ) {
 1513                 interrupt_logfile();
 1514                 warning("Missing ';' above?\n");
 1515                 resume_logfile();
 1516             }
 1517             return 1;
 1518         }
 1519         return;
 1520     }
 1521 
 1522     # ------------------------------------------------------------
 1523     # end scanner interfaces
 1524     # ------------------------------------------------------------
 1525 
 1526     my %is_for_foreach;
 1527     @_ = qw(for foreach);
 1528     @is_for_foreach{@_} = (1) x scalar(@_);
 1529 
 1530     my %is_my_our;
 1531     @_ = qw(my our);
 1532     @is_my_our{@_} = (1) x scalar(@_);
 1533 
 1534     # These keywords may introduce blocks after parenthesized expressions,
 1535     # in the form:
 1536     # keyword ( .... ) { BLOCK }
 1537     # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
 1538     my %is_blocktype_with_paren;
 1539     @_ =
 1540       qw(if elsif unless while until for foreach switch case given when catch);
 1541     @is_blocktype_with_paren{@_} = (1) x scalar(@_);
 1542 
 1543     # ------------------------------------------------------------
 1544     # begin hash of code for handling most token types
 1545     # ------------------------------------------------------------
 1546     my $tokenization_code = {
 1547 
 1548         # no special code for these types yet, but syntax checks
 1549         # could be added
 1550 
 1551 ##      '!'   => undef,
 1552 ##      '!='  => undef,
 1553 ##      '!~'  => undef,
 1554 ##      '%='  => undef,
 1555 ##      '&&=' => undef,
 1556 ##      '&='  => undef,
 1557 ##      '+='  => undef,
 1558 ##      '-='  => undef,
 1559 ##      '..'  => undef,
 1560 ##      '..'  => undef,
 1561 ##      '...' => undef,
 1562 ##      '.='  => undef,
 1563 ##      '<<=' => undef,
 1564 ##      '<='  => undef,
 1565 ##      '<=>' => undef,
 1566 ##      '<>'  => undef,
 1567 ##      '='   => undef,
 1568 ##      '=='  => undef,
 1569 ##      '=~'  => undef,
 1570 ##      '>='  => undef,
 1571 ##      '>>'  => undef,
 1572 ##      '>>=' => undef,
 1573 ##      '\\'  => undef,
 1574 ##      '^='  => undef,
 1575 ##      '|='  => undef,
 1576 ##      '||=' => undef,
 1577 ##      '//=' => undef,
 1578 ##      '~'   => undef,
 1579 ##      '~~'  => undef,
 1580 ##      '!~~'  => undef,
 1581 
 1582         '>' => sub {
 1583             error_if_expecting_TERM()
 1584               if ( $expecting == TERM );
 1585         },
 1586         '|' => sub {
 1587             error_if_expecting_TERM()
 1588               if ( $expecting == TERM );
 1589         },
 1590         '$' => sub {
 1591 
 1592             # start looking for a scalar
 1593             error_if_expecting_OPERATOR("Scalar")
 1594               if ( $expecting == OPERATOR );
 1595             scan_identifier();
 1596 
 1597             if ( $identifier eq '$^W' ) {
 1598                 $tokenizer_self->{_saw_perl_dash_w} = 1;
 1599             }
 1600 
 1601             # Check for identifier in indirect object slot
 1602             # (vorboard.pl, sort.t).  Something like:
 1603             #   /^(print|printf|sort|exec|system)$/
 1604             if (
 1605                 $is_indirect_object_taker{$last_nonblank_token}
 1606 
 1607                 || ( ( $last_nonblank_token eq '(' )
 1608                     && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
 1609                 || ( $last_nonblank_type =~ /^[Uw]$/ )    # possible object
 1610               )
 1611             {
 1612                 $type = 'Z';
 1613             }
 1614         },
 1615         '(' => sub {
 1616 
 1617             ++$paren_depth;
 1618             $paren_semicolon_count[$paren_depth] = 0;
 1619             if ($want_paren) {
 1620                 $container_type = $want_paren;
 1621                 $want_paren     = "";
 1622             }
 1623             elsif ( $statement_type =~ /^sub\b/ ) {
 1624                 $container_type = $statement_type;
 1625             }
 1626             else {
 1627                 $container_type = $last_nonblank_token;
 1628 
 1629                 # We can check for a syntax error here of unexpected '(',
 1630                 # but this is going to get messy...
 1631                 if (
 1632                     $expecting == OPERATOR
 1633 
 1634                     # be sure this is not a method call of the form
 1635                     # &method(...), $method->(..), &{method}(...),
 1636                     # $ref[2](list) is ok & short for $ref[2]->(list)
 1637                     # NOTE: at present, braces in something like &{ xxx }
 1638                     # are not marked as a block, we might have a method call
 1639                     && $last_nonblank_token !~ /^([\]\}\&]|\-\>)/
 1640 
 1641                   )
 1642                 {
 1643 
 1644                     # ref: camel 3 p 703.
 1645                     if ( $last_last_nonblank_token eq 'do' ) {
 1646                         complain(
 1647 "do SUBROUTINE is deprecated; consider & or -> notation\n"
 1648                         );
 1649                     }
 1650                     else {
 1651 
 1652                         # if this is an empty list, (), then it is not an
 1653                         # error; for example, we might have a constant pi and
 1654                         # invoke it with pi() or just pi;
 1655                         my ( $next_nonblank_token, $i_next ) =
 1656                           find_next_nonblank_token( $i, $rtokens,
 1657                             $max_token_index );
 1658                         if ( $next_nonblank_token ne ')' ) {
 1659                             my $hint;
 1660                             error_if_expecting_OPERATOR('(');
 1661 
 1662                             if ( $last_nonblank_type eq 'C' ) {
 1663                                 $hint =
 1664                                   "$last_nonblank_token has a void prototype\n";
 1665                             }
 1666                             elsif ( $last_nonblank_type eq 'i' ) {
 1667                                 if (   $i_tok > 0
 1668                                     && $last_nonblank_token =~ /^\$/ )
 1669                                 {
 1670                                     $hint =
 1671 "Do you mean '$last_nonblank_token->(' ?\n";
 1672                                 }
 1673                             }
 1674                             if ($hint) {
 1675                                 interrupt_logfile();
 1676                                 warning($hint);
 1677                                 resume_logfile();
 1678                             }
 1679                         } ## end if ( $next_nonblank_token...
 1680                     } ## end else [ if ( $last_last_nonblank_token...
 1681                 } ## end if ( $expecting == OPERATOR...
 1682             }
 1683             $paren_type[$paren_depth] = $container_type;
 1684             ( $type_sequence, $indent_flag ) =
 1685               increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
 1686 
 1687             # propagate types down through nested parens
 1688             # for example: the second paren in 'if ((' would be structural
 1689             # since the first is.
 1690 
 1691             if ( $last_nonblank_token eq '(' ) {
 1692                 $type = $last_nonblank_type;
 1693             }
 1694 
 1695             #     We exclude parens as structural after a ',' because it
 1696             #     causes subtle problems with continuation indentation for
 1697             #     something like this, where the first 'or' will not get
 1698             #     indented.
 1699             #
 1700             #         assert(
 1701             #             __LINE__,
 1702             #             ( not defined $check )
 1703             #               or ref $check
 1704             #               or $check eq "new"
 1705             #               or $check eq "old",
 1706             #         );
 1707             #
 1708             #     Likewise, we exclude parens where a statement can start
 1709             #     because of problems with continuation indentation, like
 1710             #     these:
 1711             #
 1712             #         ($firstline =~ /^#\!.*perl/)
 1713             #         and (print $File::Find::name, "\n")
 1714             #           and (return 1);
 1715             #
 1716             #         (ref($usage_fref) =~ /CODE/)
 1717             #         ? &$usage_fref
 1718             #           : (&blast_usage, &blast_params, &blast_general_params);
 1719 
 1720             else {
 1721                 $type = '{';
 1722             }
 1723 
 1724             if ( $last_nonblank_type eq ')' ) {
 1725                 warning(
 1726                     "Syntax error? found token '$last_nonblank_type' then '('\n"
 1727                 );
 1728             }
 1729             $paren_structural_type[$paren_depth] = $type;
 1730 
 1731         },
 1732         ')' => sub {
 1733             ( $type_sequence, $indent_flag ) =
 1734               decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
 1735 
 1736             if ( $paren_structural_type[$paren_depth] eq '{' ) {
 1737                 $type = '}';
 1738             }
 1739 
 1740             $container_type = $paren_type[$paren_depth];
 1741 
 1742             # restore statement type as 'sub' at closing paren of a signature
 1743             # so that a subsequent ':' is identified as an attribute
 1744             if ( $container_type =~ /^sub\b/ ) {
 1745                 $statement_type = $container_type;
 1746             }
 1747 
 1748             #    /^(for|foreach)$/
 1749             if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
 1750                 my $num_sc = $paren_semicolon_count[$paren_depth];
 1751                 if ( $num_sc > 0 && $num_sc != 2 ) {
 1752                     warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
 1753                 }
 1754             }
 1755 
 1756             if ( $paren_depth > 0 ) { $paren_depth-- }
 1757         },
 1758         ',' => sub {
 1759             if ( $last_nonblank_type eq ',' ) {
 1760                 complain("Repeated ','s \n");
 1761             }
 1762 
 1763             # patch for operator_expected: note if we are in the list (use.t)
 1764             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
 1765 ##                FIXME: need to move this elsewhere, perhaps check after a '('
 1766 ##                elsif ($last_nonblank_token eq '(') {
 1767 ##                    warning("Leading ','s illegal in some versions of perl\n");
 1768 ##                }
 1769         },
 1770         ';' => sub {
 1771             $context        = UNKNOWN_CONTEXT;
 1772             $statement_type = '';
 1773             $want_paren     = "";
 1774 
 1775             #    /^(for|foreach)$/
 1776             if ( $is_for_foreach{ $paren_type[$paren_depth] } )
 1777             {    # mark ; in for loop
 1778 
 1779                 # Be careful: we do not want a semicolon such as the
 1780                 # following to be included:
 1781                 #
 1782                 #    for (sort {strcoll($a,$b);} keys %investments) {
 1783 
 1784                 if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
 1785                     && $square_bracket_depth ==
 1786                     $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
 1787                 {
 1788 
 1789                     $type = 'f';
 1790                     $paren_semicolon_count[$paren_depth]++;
 1791                 }
 1792             }
 1793 
 1794         },
 1795         '"' => sub {
 1796             error_if_expecting_OPERATOR("String")
 1797               if ( $expecting == OPERATOR );
 1798             $in_quote                = 1;
 1799             $type                    = 'Q';
 1800             $allowed_quote_modifiers = "";
 1801         },
 1802         "'" => sub {
 1803             error_if_expecting_OPERATOR("String")
 1804               if ( $expecting == OPERATOR );
 1805             $in_quote                = 1;
 1806             $type                    = 'Q';
 1807             $allowed_quote_modifiers = "";
 1808         },
 1809         '`' => sub {
 1810             error_if_expecting_OPERATOR("String")
 1811               if ( $expecting == OPERATOR );
 1812             $in_quote                = 1;
 1813             $type                    = 'Q';
 1814             $allowed_quote_modifiers = "";
 1815         },
 1816         '/' => sub {
 1817             my $is_pattern;
 1818 
 1819             # a pattern cannot follow certain keywords which take optional
 1820             # arguments, like 'shift' and 'pop'. See also '?'.
 1821             if (   $last_nonblank_type eq 'k'
 1822                 && $is_keyword_taking_optional_args{$last_nonblank_token} )
 1823             {
 1824                 $is_pattern = 0;
 1825             }
 1826             elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
 1827                 my $msg;
 1828                 ( $is_pattern, $msg ) =
 1829                   guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
 1830                     $max_token_index );
 1831 
 1832                 if ($msg) {
 1833                     write_diagnostics("DIVIDE:$msg\n");
 1834                     write_logfile_entry($msg);
 1835                 }
 1836             }
 1837             else { $is_pattern = ( $expecting == TERM ) }
 1838 
 1839             if ($is_pattern) {
 1840                 $in_quote                = 1;
 1841                 $type                    = 'Q';
 1842                 $allowed_quote_modifiers = '[msixpodualngc]';
 1843             }
 1844             else {    # not a pattern; check for a /= token
 1845 
 1846                 if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
 1847                     $i++;
 1848                     $tok  = '/=';
 1849                     $type = $tok;
 1850                 }
 1851 
 1852               #DEBUG - collecting info on what tokens follow a divide
 1853               # for development of guessing algorithm
 1854               #if ( numerator_expected( $i, $rtokens, $max_token_index ) < 0 ) {
 1855               #    #write_diagnostics( "DIVIDE? $input_line\n" );
 1856               #}
 1857             }
 1858         },
 1859         '{' => sub {
 1860 
 1861             # if we just saw a ')', we will label this block with
 1862             # its type.  We need to do this to allow sub
 1863             # code_block_type to determine if this brace starts a
 1864             # code block or anonymous hash.  (The type of a paren
 1865             # pair is the preceding token, such as 'if', 'else',
 1866             # etc).
 1867             $container_type = "";
 1868 
 1869             # ATTRS: for a '{' following an attribute list, reset
 1870             # things to look like we just saw the sub name
 1871             if ( $statement_type =~ /^sub/ ) {
 1872                 $last_nonblank_token = $statement_type;
 1873                 $last_nonblank_type  = 'i';
 1874                 $statement_type      = "";
 1875             }
 1876 
 1877             # patch for SWITCH/CASE: hide these keywords from an immediately
 1878             # following opening brace
 1879             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
 1880                 && $statement_type eq $last_nonblank_token )
 1881             {
 1882                 $last_nonblank_token = ";";
 1883             }
 1884 
 1885             elsif ( $last_nonblank_token eq ')' ) {
 1886                 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
 1887 
 1888                 # defensive move in case of a nesting error (pbug.t)
 1889                 # in which this ')' had no previous '('
 1890                 # this nesting error will have been caught
 1891                 if ( !defined($last_nonblank_token) ) {
 1892                     $last_nonblank_token = 'if';
 1893                 }
 1894 
 1895                 # check for syntax error here;
 1896                 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
 1897                     if ( $tokenizer_self->{'_extended_syntax'} ) {
 1898 
 1899                         # we append a trailing () to mark this as an unknown
 1900                         # block type.  This allows perltidy to format some
 1901                         # common extensions of perl syntax.
 1902                         # This is used by sub code_block_type
 1903                         $last_nonblank_token .= '()';
 1904                     }
 1905                     else {
 1906                         my $list =
 1907                           join( ' ', sort keys %is_blocktype_with_paren );
 1908                         warning(
 1909 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
 1910                         );
 1911                     }
 1912                 }
 1913             }
 1914 
 1915             # patch for paren-less for/foreach glitch, part 2.
 1916             # see note below under 'qw'
 1917             elsif ($last_nonblank_token eq 'qw'
 1918                 && $is_for_foreach{$want_paren} )
 1919             {
 1920                 $last_nonblank_token = $want_paren;
 1921                 if ( $last_last_nonblank_token eq $want_paren ) {
 1922                     warning(
 1923 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
 1924                     );
 1925 
 1926                 }
 1927                 $want_paren = "";
 1928             }
 1929 
 1930             # now identify which of the three possible types of
 1931             # curly braces we have: hash index container, anonymous
 1932             # hash reference, or code block.
 1933 
 1934             # non-structural (hash index) curly brace pair
 1935             # get marked 'L' and 'R'
 1936             if ( is_non_structural_brace() ) {
 1937                 $type = 'L';
 1938 
 1939                 # patch for SWITCH/CASE:
 1940                 # allow paren-less identifier after 'when'
 1941                 # if the brace is preceded by a space
 1942                 if (   $statement_type eq 'when'
 1943                     && $last_nonblank_type eq 'i'
 1944                     && $last_last_nonblank_type eq 'k'
 1945                     && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
 1946                 {
 1947                     $type       = '{';
 1948                     $block_type = $statement_type;
 1949                 }
 1950             }
 1951 
 1952             # code and anonymous hash have the same type, '{', but are
 1953             # distinguished by 'block_type',
 1954             # which will be blank for an anonymous hash
 1955             else {
 1956 
 1957                 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
 1958                     $max_token_index );
 1959 
 1960                 # patch to promote bareword type to function taking block
 1961                 if (   $block_type
 1962                     && $last_nonblank_type eq 'w'
 1963                     && $last_nonblank_i >= 0 )
 1964                 {
 1965                     if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
 1966                         $routput_token_type->[$last_nonblank_i] = 'G';
 1967                     }
 1968                 }
 1969 
 1970                 # patch for SWITCH/CASE: if we find a stray opening block brace
 1971                 # where we might accept a 'case' or 'when' block, then take it
 1972                 if (   $statement_type eq 'case'
 1973                     || $statement_type eq 'when' )
 1974                 {
 1975                     if ( !$block_type || $block_type eq '}' ) {
 1976                         $block_type = $statement_type;
 1977                     }
 1978                 }
 1979             }
 1980 
 1981             $brace_type[ ++$brace_depth ]        = $block_type;
 1982             $brace_package[$brace_depth]         = $current_package;
 1983             $brace_structural_type[$brace_depth] = $type;
 1984             $brace_context[$brace_depth]         = $context;
 1985             ( $type_sequence, $indent_flag ) =
 1986               increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
 1987         },
 1988         '}' => sub {
 1989             $block_type = $brace_type[$brace_depth];
 1990             if ($block_type) { $statement_type = '' }
 1991             if ( defined( $brace_package[$brace_depth] ) ) {
 1992                 $current_package = $brace_package[$brace_depth];
 1993             }
 1994 
 1995             # can happen on brace error (caught elsewhere)
 1996             else {
 1997             }
 1998             ( $type_sequence, $indent_flag ) =
 1999               decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
 2000 
 2001             if ( $brace_structural_type[$brace_depth] eq 'L' ) {
 2002                 $type = 'R';
 2003             }
 2004 
 2005             # propagate type information for 'do' and 'eval' blocks, and also
 2006             # for smartmatch operator.  This is necessary to enable us to know
 2007             # if an operator or term is expected next.
 2008             if ( $is_block_operator{$block_type} ) {
 2009                 $tok = $block_type;
 2010             }
 2011 
 2012             $context = $brace_context[$brace_depth];
 2013             if ( $brace_depth > 0 ) { $brace_depth--; }
 2014         },
 2015         '&' => sub {    # maybe sub call? start looking
 2016 
 2017             # We have to check for sub call unless we are sure we
 2018             # are expecting an operator.  This example from s2p
 2019             # got mistaken as a q operator in an early version:
 2020             #   print BODY &q(<<'EOT');
 2021             if ( $expecting != OPERATOR ) {
 2022 
 2023                 # But only look for a sub call if we are expecting a term or
 2024                 # if there is no existing space after the &.
 2025                 # For example we probably don't want & as sub call here:
 2026                 #    Fcntl::S_IRUSR & $mode;
 2027                 if ( $expecting == TERM || $next_type ne 'b' ) {
 2028                     scan_identifier();
 2029                 }
 2030             }
 2031             else {
 2032             }
 2033         },
 2034         '<' => sub {    # angle operator or less than?
 2035 
 2036             if ( $expecting != OPERATOR ) {
 2037                 ( $i, $type ) =
 2038                   find_angle_operator_termination( $input_line, $i, $rtoken_map,
 2039                     $expecting, $max_token_index );
 2040 
 2041                 if ( $type eq '<' && $expecting == TERM ) {
 2042                     error_if_expecting_TERM();
 2043                     interrupt_logfile();
 2044                     warning("Unterminated <> operator?\n");
 2045                     resume_logfile();
 2046                 }
 2047             }
 2048             else {
 2049             }
 2050         },
 2051         '?' => sub {    # ?: conditional or starting pattern?
 2052 
 2053             my $is_pattern;
 2054 
 2055             # Patch for rt #126965
 2056             # a pattern cannot follow certain keywords which take optional
 2057             # arguments, like 'shift' and 'pop'. See also '/'.
 2058             if (   $last_nonblank_type eq 'k'
 2059                 && $is_keyword_taking_optional_args{$last_nonblank_token} )
 2060             {
 2061                 $is_pattern = 0;
 2062             }
 2063 
 2064             # patch for RT#131288, user constant function without prototype
 2065             # last type is 'U' followed by ?.
 2066             elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
 2067                 $is_pattern = 0;
 2068             }
 2069             elsif ( $expecting == UNKNOWN ) {
 2070 
 2071                 # In older versions of Perl, a bare ? can be a pattern
 2072                 # delimiter.  Sometime after Perl 5.10 this seems to have
 2073                 # been dropped, but we have to support it in order to format
 2074                 # older programs.  For example, the following line worked
 2075                 # at one time:
 2076                 #      ?(.*)? && (print $1,"\n");
 2077                 # In current versions it would have to be written with slashes:
 2078                 #      /(.*)/ && (print $1,"\n");
 2079                 my $msg;
 2080                 ( $is_pattern, $msg ) =
 2081                   guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
 2082                     $max_token_index );
 2083 
 2084                 if ($msg) { write_logfile_entry($msg) }
 2085             }
 2086             else { $is_pattern = ( $expecting == TERM ) }
 2087 
 2088             if ($is_pattern) {
 2089                 $in_quote                = 1;
 2090                 $type                    = 'Q';
 2091                 $allowed_quote_modifiers = '[msixpodualngc]';
 2092             }
 2093             else {
 2094                 ( $type_sequence, $indent_flag ) =
 2095                   increase_nesting_depth( QUESTION_COLON,
 2096                     $rtoken_map->[$i_tok] );
 2097             }
 2098         },
 2099         '*' => sub {    # typeglob, or multiply?
 2100 
 2101             if ( $expecting == TERM ) {
 2102                 scan_identifier();
 2103             }
 2104             else {
 2105 
 2106                 if ( $rtokens->[ $i + 1 ] eq '=' ) {
 2107                     $tok  = '*=';
 2108                     $type = $tok;
 2109                     $i++;
 2110                 }
 2111                 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
 2112                     $tok  = '**';
 2113                     $type = $tok;
 2114                     $i++;
 2115                     if ( $rtokens->[ $i + 1 ] eq '=' ) {
 2116                         $tok  = '**=';
 2117                         $type = $tok;
 2118                         $i++;
 2119                     }
 2120                 }
 2121             }
 2122         },
 2123         '.' => sub {    # what kind of . ?
 2124 
 2125             if ( $expecting != OPERATOR ) {
 2126                 scan_number();
 2127                 if ( $type eq '.' ) {
 2128                     error_if_expecting_TERM()
 2129                       if ( $expecting == TERM );
 2130                 }
 2131             }
 2132             else {
 2133             }
 2134         },
 2135         ':' => sub {
 2136 
 2137             # if this is the first nonblank character, call it a label
 2138             # since perl seems to just swallow it
 2139             if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
 2140                 $type = 'J';
 2141             }
 2142 
 2143             # ATTRS: check for a ':' which introduces an attribute list
 2144             # (this might eventually get its own token type)
 2145             elsif ( $statement_type =~ /^sub\b/ ) {
 2146                 $type              = 'A';
 2147                 $in_attribute_list = 1;
 2148             }
 2149 
 2150             # check for scalar attribute, such as
 2151             # my $foo : shared = 1;
 2152             elsif ($is_my_our{$statement_type}
 2153                 && $current_depth[QUESTION_COLON] == 0 )
 2154             {
 2155                 $type              = 'A';
 2156                 $in_attribute_list = 1;
 2157             }
 2158 
 2159             # otherwise, it should be part of a ?/: operator
 2160             else {
 2161                 ( $type_sequence, $indent_flag ) =
 2162                   decrease_nesting_depth( QUESTION_COLON,
 2163                     $rtoken_map->[$i_tok] );
 2164                 if ( $last_nonblank_token eq '?' ) {
 2165                     warning("Syntax error near ? :\n");
 2166                 }
 2167             }
 2168         },
 2169         '+' => sub {    # what kind of plus?
 2170 
 2171             if ( $expecting == TERM ) {
 2172                 my $number = scan_number();
 2173 
 2174                 # unary plus is safest assumption if not a number
 2175                 if ( !defined($number) ) { $type = 'p'; }
 2176             }
 2177             elsif ( $expecting == OPERATOR ) {
 2178             }
 2179             else {
 2180                 if ( $next_type eq 'w' ) { $type = 'p' }
 2181             }
 2182         },
 2183         '@' => sub {
 2184 
 2185             error_if_expecting_OPERATOR("Array")
 2186               if ( $expecting == OPERATOR );
 2187             scan_identifier();
 2188         },
 2189         '%' => sub {    # hash or modulo?
 2190 
 2191             # first guess is hash if no following blank
 2192             if ( $expecting == UNKNOWN ) {
 2193                 if ( $next_type ne 'b' ) { $expecting = TERM }
 2194             }
 2195             if ( $expecting == TERM ) {
 2196                 scan_identifier();
 2197             }
 2198         },
 2199         '[' => sub {
 2200             $square_bracket_type[ ++$square_bracket_depth ] =
 2201               $last_nonblank_token;
 2202             ( $type_sequence, $indent_flag ) =
 2203               increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
 2204 
 2205             # It may seem odd, but structural square brackets have
 2206             # type '{' and '}'.  This simplifies the indentation logic.
 2207             if ( !is_non_structural_brace() ) {
 2208                 $type = '{';
 2209             }
 2210             $square_bracket_structural_type[$square_bracket_depth] = $type;
 2211         },
 2212         ']' => sub {
 2213             ( $type_sequence, $indent_flag ) =
 2214               decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
 2215 
 2216             if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
 2217             {
 2218                 $type = '}';
 2219             }
 2220 
 2221             # propagate type information for smartmatch operator.  This is
 2222             # necessary to enable us to know if an operator or term is expected
 2223             # next.
 2224             if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
 2225                 $tok = $square_bracket_type[$square_bracket_depth];
 2226             }
 2227 
 2228             if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
 2229         },
 2230         '-' => sub {    # what kind of minus?
 2231 
 2232             if ( ( $expecting != OPERATOR )
 2233                 && $is_file_test_operator{$next_tok} )
 2234             {
 2235                 my ( $next_nonblank_token, $i_next ) =
 2236                   find_next_nonblank_token( $i + 1, $rtokens,
 2237                     $max_token_index );
 2238 
 2239                 # check for a quoted word like "-w=>xx";
 2240                 # it is sufficient to just check for a following '='
 2241                 if ( $next_nonblank_token eq '=' ) {
 2242                     $type = 'm';
 2243                 }
 2244                 else {
 2245                     $i++;
 2246                     $tok .= $next_tok;
 2247                     $type = 'F';
 2248                 }
 2249             }
 2250             elsif ( $expecting == TERM ) {
 2251                 my $number = scan_number();
 2252 
 2253                 # maybe part of bareword token? unary is safest
 2254                 if ( !defined($number) ) { $type = 'm'; }
 2255 
 2256             }
 2257             elsif ( $expecting == OPERATOR ) {
 2258             }
 2259             else {
 2260 
 2261                 if ( $next_type eq 'w' ) {
 2262                     $type = 'm';
 2263                 }
 2264             }
 2265         },
 2266 
 2267         '^' => sub {
 2268 
 2269             # check for special variables like ${^WARNING_BITS}
 2270             if ( $expecting == TERM ) {
 2271 
 2272                 # FIXME: this should work but will not catch errors
 2273                 # because we also have to be sure that previous token is
 2274                 # a type character ($,@,%).
 2275                 if ( $last_nonblank_token eq '{'
 2276                     && ( $next_tok =~ /^[A-Za-z_]/ ) )
 2277                 {
 2278 
 2279                     if ( $next_tok eq 'W' ) {
 2280                         $tokenizer_self->{_saw_perl_dash_w} = 1;
 2281                     }
 2282                     $tok  = $tok . $next_tok;
 2283                     $i    = $i + 1;
 2284                     $type = 'w';
 2285                 }
 2286 
 2287                 else {
 2288                     unless ( error_if_expecting_TERM() ) {
 2289 
 2290                         # Something like this is valid but strange:
 2291                         # undef ^I;
 2292                         complain("The '^' seems unusual here\n");
 2293                     }
 2294                 }
 2295             }
 2296         },
 2297 
 2298         '::' => sub {    # probably a sub call
 2299             scan_bare_identifier();
 2300         },
 2301         '<<' => sub {    # maybe a here-doc?
 2302             return
 2303               unless ( $i < $max_token_index )
 2304               ;          # here-doc not possible if end of line
 2305 
 2306             if ( $expecting != OPERATOR ) {
 2307                 my ( $found_target, $here_doc_target, $here_quote_character,
 2308                     $saw_error );
 2309                 (
 2310                     $found_target, $here_doc_target, $here_quote_character, $i,
 2311                     $saw_error
 2312                   )
 2313                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
 2314                     $max_token_index );
 2315 
 2316                 if ($found_target) {
 2317                     push @{$rhere_target_list},
 2318                       [ $here_doc_target, $here_quote_character ];
 2319                     $type = 'h';
 2320                     if ( length($here_doc_target) > 80 ) {
 2321                         my $truncated = substr( $here_doc_target, 0, 80 );
 2322                         complain("Long here-target: '$truncated' ...\n");
 2323                     }
 2324                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
 2325                         complain(
 2326                             "Unconventional here-target: '$here_doc_target'\n");
 2327                     }
 2328                 }
 2329                 elsif ( $expecting == TERM ) {
 2330                     unless ($saw_error) {
 2331 
 2332                         # shouldn't happen..
 2333                         warning("Program bug; didn't find here doc target\n");
 2334                         report_definite_bug();
 2335                     }
 2336                 }
 2337             }
 2338             else {
 2339             }
 2340         },
 2341         '<<~' => sub {    # a here-doc, new type added in v26
 2342             return
 2343               unless ( $i < $max_token_index )
 2344               ;           # here-doc not possible if end of line
 2345             if ( $expecting != OPERATOR ) {
 2346                 my ( $found_target, $here_doc_target, $here_quote_character,
 2347                     $saw_error );
 2348                 (
 2349                     $found_target, $here_doc_target, $here_quote_character, $i,
 2350                     $saw_error
 2351                   )
 2352                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
 2353                     $max_token_index );
 2354 
 2355                 if ($found_target) {
 2356 
 2357                     if ( length($here_doc_target) > 80 ) {
 2358                         my $truncated = substr( $here_doc_target, 0, 80 );
 2359                         complain("Long here-target: '$truncated' ...\n");
 2360                     }
 2361                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
 2362                         complain(
 2363                             "Unconventional here-target: '$here_doc_target'\n");
 2364                     }
 2365 
 2366                     # Note that we put a leading space on the here quote
 2367                     # character indicate that it may be preceded by spaces
 2368                     $here_quote_character = " " . $here_quote_character;
 2369                     push @{$rhere_target_list},
 2370                       [ $here_doc_target, $here_quote_character ];
 2371                     $type = 'h';
 2372                 }
 2373                 elsif ( $expecting == TERM ) {
 2374                     unless ($saw_error) {
 2375 
 2376                         # shouldn't happen..
 2377                         warning("Program bug; didn't find here doc target\n");
 2378                         report_definite_bug();
 2379                     }
 2380                 }
 2381             }
 2382             else {
 2383             }
 2384         },
 2385         '->' => sub {
 2386 
 2387             # if -> points to a bare word, we must scan for an identifier,
 2388             # otherwise something like ->y would look like the y operator
 2389             scan_identifier();
 2390         },
 2391 
 2392         # type = 'pp' for pre-increment, '++' for post-increment
 2393         '++' => sub {
 2394             if ( $expecting == TERM ) { $type = 'pp' }
 2395             elsif ( $expecting == UNKNOWN ) {
 2396                 my ( $next_nonblank_token, $i_next ) =
 2397                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 2398                 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
 2399             }
 2400         },
 2401 
 2402         '=>' => sub {
 2403             if ( $last_nonblank_type eq $tok ) {
 2404                 complain("Repeated '=>'s \n");
 2405             }
 2406 
 2407             # patch for operator_expected: note if we are in the list (use.t)
 2408             # TODO: make version numbers a new token type
 2409             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
 2410         },
 2411 
 2412         # type = 'mm' for pre-decrement, '--' for post-decrement
 2413         '--' => sub {
 2414 
 2415             if ( $expecting == TERM ) { $type = 'mm' }
 2416             elsif ( $expecting == UNKNOWN ) {
 2417                 my ( $next_nonblank_token, $i_next ) =
 2418                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 2419                 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
 2420             }
 2421         },
 2422 
 2423         '&&' => sub {
 2424             error_if_expecting_TERM()
 2425               if ( $expecting == TERM );
 2426         },
 2427 
 2428         '||' => sub {
 2429             error_if_expecting_TERM()
 2430               if ( $expecting == TERM );
 2431         },
 2432 
 2433         '//' => sub {
 2434             error_if_expecting_TERM()
 2435               if ( $expecting == TERM );
 2436         },
 2437     };
 2438 
 2439     # ------------------------------------------------------------
 2440     # end hash of code for handling individual token types
 2441     # ------------------------------------------------------------
 2442 
 2443     my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
 2444 
 2445     # These block types terminate statements and do not need a trailing
 2446     # semicolon
 2447     # patched for SWITCH/CASE/
 2448     my %is_zero_continuation_block_type;
 2449     @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
 2450       if elsif else unless while until for foreach switch case given when);
 2451     @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
 2452 
 2453     my %is_not_zero_continuation_block_type;
 2454     @_ = qw(sort grep map do eval);
 2455     @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
 2456 
 2457     my %is_logical_container;
 2458     @_ = qw(if elsif unless while and or err not && !  || for foreach);
 2459     @is_logical_container{@_} = (1) x scalar(@_);
 2460 
 2461     my %is_binary_type;
 2462     @_ = qw(|| &&);
 2463     @is_binary_type{@_} = (1) x scalar(@_);
 2464 
 2465     my %is_binary_keyword;
 2466     @_ = qw(and or err eq ne cmp);
 2467     @is_binary_keyword{@_} = (1) x scalar(@_);
 2468 
 2469     # 'L' is token for opening { at hash key
 2470     my %is_opening_type;
 2471     @_ = qw< L { ( [ >;
 2472     @is_opening_type{@_} = (1) x scalar(@_);
 2473 
 2474     # 'R' is token for closing } at hash key
 2475     my %is_closing_type;
 2476     @_ = qw< R } ) ] >;
 2477     @is_closing_type{@_} = (1) x scalar(@_);
 2478 
 2479     my %is_redo_last_next_goto;
 2480     @_ = qw(redo last next goto);
 2481     @is_redo_last_next_goto{@_} = (1) x scalar(@_);
 2482 
 2483     my %is_use_require;
 2484     @_ = qw(use require);
 2485     @is_use_require{@_} = (1) x scalar(@_);
 2486 
 2487     # This hash holds the hash key in $tokenizer_self for these keywords:
 2488     my %is_format_END_DATA = (
 2489         'format'   => '_in_format',
 2490         '__END__'  => '_in_end',
 2491         '__DATA__' => '_in_data',
 2492     );
 2493 
 2494     # original ref: camel 3 p 147,
 2495     # but perl may accept undocumented flags
 2496     # perl 5.10 adds 'p' (preserve)
 2497     # Perl version 5.22 added 'n'
 2498     # From http://perldoc.perl.org/perlop.html we have
 2499     # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
 2500     # s/PATTERN/REPLACEMENT/msixpodualngcer
 2501     # y/SEARCHLIST/REPLACEMENTLIST/cdsr
 2502     # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
 2503     # qr/STRING/msixpodualn
 2504     my %quote_modifiers = (
 2505         's'  => '[msixpodualngcer]',
 2506         'y'  => '[cdsr]',
 2507         'tr' => '[cdsr]',
 2508         'm'  => '[msixpodualngc]',
 2509         'qr' => '[msixpodualn]',
 2510         'q'  => "",
 2511         'qq' => "",
 2512         'qw' => "",
 2513         'qx' => "",
 2514     );
 2515 
 2516     # table showing how many quoted things to look for after quote operator..
 2517     # s, y, tr have 2 (pattern and replacement)
 2518     # others have 1 (pattern only)
 2519     my %quote_items = (
 2520         's'  => 2,
 2521         'y'  => 2,
 2522         'tr' => 2,
 2523         'm'  => 1,
 2524         'qr' => 1,
 2525         'q'  => 1,
 2526         'qq' => 1,
 2527         'qw' => 1,
 2528         'qx' => 1,
 2529     );
 2530 
 2531     sub tokenize_this_line {
 2532 
 2533   # This routine breaks a line of perl code into tokens which are of use in
 2534   # indentation and reformatting.  One of my goals has been to define tokens
 2535   # such that a newline may be inserted between any pair of tokens without
 2536   # changing or invalidating the program. This version comes close to this,
 2537   # although there are necessarily a few exceptions which must be caught by
 2538   # the formatter.  Many of these involve the treatment of bare words.
 2539   #
 2540   # The tokens and their types are returned in arrays.  See previous
 2541   # routine for their names.
 2542   #
 2543   # See also the array "valid_token_types" in the BEGIN section for an
 2544   # up-to-date list.
 2545   #
 2546   # To simplify things, token types are either a single character, or they
 2547   # are identical to the tokens themselves.
 2548   #
 2549   # As a debugging aid, the -D flag creates a file containing a side-by-side
 2550   # comparison of the input string and its tokenization for each line of a file.
 2551   # This is an invaluable debugging aid.
 2552   #
 2553   # In addition to tokens, and some associated quantities, the tokenizer
 2554   # also returns flags indication any special line types.  These include
 2555   # quotes, here_docs, formats.
 2556   #
 2557   # -----------------------------------------------------------------------
 2558   #
 2559   # How to add NEW_TOKENS:
 2560   #
 2561   # New token types will undoubtedly be needed in the future both to keep up
 2562   # with changes in perl and to help adapt the tokenizer to other applications.
 2563   #
 2564   # Here are some notes on the minimal steps.  I wrote these notes while
 2565   # adding the 'v' token type for v-strings, which are things like version
 2566   # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
 2567   # can use your editor to search for the string "NEW_TOKENS" to find the
 2568   # appropriate sections to change):
 2569   #
 2570   # *. Try to talk somebody else into doing it!  If not, ..
 2571   #
 2572   # *. Make a backup of your current version in case things don't work out!
 2573   #
 2574   # *. Think of a new, unused character for the token type, and add to
 2575   # the array @valid_token_types in the BEGIN section of this package.
 2576   # For example, I used 'v' for v-strings.
 2577   #
 2578   # *. Implement coding to recognize the $type of the token in this routine.
 2579   # This is the hardest part, and is best done by imitating or modifying
 2580   # some of the existing coding.  For example, to recognize v-strings, I
 2581   # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
 2582   # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
 2583   #
 2584   # *. Update sub operator_expected.  This update is critically important but
 2585   # the coding is trivial.  Look at the comments in that routine for help.
 2586   # For v-strings, which should behave like numbers, I just added 'v' to the
 2587   # regex used to handle numbers and strings (types 'n' and 'Q').
 2588   #
 2589   # *. Implement a 'bond strength' rule in sub set_bond_strengths in
 2590   # Perl::Tidy::Formatter for breaking lines around this token type.  You can
 2591   # skip this step and take the default at first, then adjust later to get
 2592   # desired results.  For adding type 'v', I looked at sub bond_strength and
 2593   # saw that number type 'n' was using default strengths, so I didn't do
 2594   # anything.  I may tune it up someday if I don't like the way line
 2595   # breaks with v-strings look.
 2596   #
 2597   # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
 2598   # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
 2599   # and saw that type 'n' used spaces on both sides, so I just added 'v'
 2600   # to the array @spaces_both_sides.
 2601   #
 2602   # *. Update HtmlWriter package so that users can colorize the token as
 2603   # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
 2604   # that package.  For v-strings, I initially chose to use a default color
 2605   # equal to the default for numbers, but it might be nice to change that
 2606   # eventually.
 2607   #
 2608   # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
 2609   #
 2610   # *. Run lots and lots of debug tests.  Start with special files designed
 2611   # to test the new token type.  Run with the -D flag to create a .DEBUG
 2612   # file which shows the tokenization.  When these work ok, test as many old
 2613   # scripts as possible.  Start with all of the '.t' files in the 'test'
 2614   # directory of the distribution file.  Compare .tdy output with previous
 2615   # version and updated version to see the differences.  Then include as
 2616   # many more files as possible. My own technique has been to collect a huge
 2617   # number of perl scripts (thousands!) into one directory and run perltidy
 2618   # *, then run diff between the output of the previous version and the
 2619   # current version.
 2620   #
 2621   # *. For another example, search for the smartmatch operator '~~'
 2622   # with your editor to see where updates were made for it.
 2623   #
 2624   # -----------------------------------------------------------------------
 2625 
 2626         my $line_of_tokens = shift;
 2627         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
 2628 
 2629         # patch while coding change is underway
 2630         # make callers private data to allow access
 2631         # $tokenizer_self = $caller_tokenizer_self;
 2632 
 2633         # extract line number for use in error messages
 2634         $input_line_number = $line_of_tokens->{_line_number};
 2635 
 2636         # reinitialize for multi-line quote
 2637         $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
 2638 
 2639         # check for pod documentation
 2640         if ( ( $untrimmed_input_line =~ /^=[A-Za-z_]/ ) ) {
 2641 
 2642             # must not be in multi-line quote
 2643             # and must not be in an equation
 2644             if ( !$in_quote && ( operator_expected( 'b', '=', 'b' ) == TERM ) )
 2645             {
 2646                 $tokenizer_self->{_in_pod} = 1;
 2647                 return;
 2648             }
 2649         }
 2650 
 2651         $input_line = $untrimmed_input_line;
 2652 
 2653         chomp $input_line;
 2654 
 2655         # trim start of this line unless we are continuing a quoted line
 2656         # do not trim end because we might end in a quote (test: deken4.pl)
 2657         # Perl::Tidy::Formatter will delete needless trailing blanks
 2658         unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
 2659             $input_line =~ s/^\s*//;    # trim left end
 2660         }
 2661 
 2662         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
 2663         # This will be used below to avoid quoting a bare word followed by
 2664         # a fat comma.
 2665         my $is_END_or_DATA = $input_line =~ /^\s*__(END|DATA)__\s*$/;
 2666 
 2667         # update the copy of the line for use in error messages
 2668         # This must be exactly what we give the pre_tokenizer
 2669         $tokenizer_self->{_line_text} = $input_line;
 2670 
 2671         # re-initialize for the main loop
 2672         $routput_token_list     = [];    # stack of output token indexes
 2673         $routput_token_type     = [];    # token types
 2674         $routput_block_type     = [];    # types of code block
 2675         $routput_container_type = [];    # paren types, such as if, elsif, ..
 2676         $routput_type_sequence  = [];    # nesting sequential number
 2677 
 2678         $rhere_target_list = [];
 2679 
 2680         $tok             = $last_nonblank_token;
 2681         $type            = $last_nonblank_type;
 2682         $prototype       = $last_nonblank_prototype;
 2683         $last_nonblank_i = -1;
 2684         $block_type      = $last_nonblank_block_type;
 2685         $container_type  = $last_nonblank_container_type;
 2686         $type_sequence   = $last_nonblank_type_sequence;
 2687         $indent_flag     = 0;
 2688         $peeked_ahead    = 0;
 2689 
 2690         # tokenization is done in two stages..
 2691         # stage 1 is a very simple pre-tokenization
 2692         my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
 2693 
 2694         # a little optimization for a full-line comment
 2695         if ( !$in_quote && ( $input_line =~ /^#/ ) ) {
 2696             $max_tokens_wanted = 1    # no use tokenizing a comment
 2697         }
 2698 
 2699         # start by breaking the line into pre-tokens
 2700         ( $rtokens, $rtoken_map, $rtoken_type ) =
 2701           pre_tokenize( $input_line, $max_tokens_wanted );
 2702 
 2703         $max_token_index = scalar( @{$rtokens} ) - 1;
 2704         push( @{$rtokens}, ' ', ' ', ' ' );  # extra whitespace simplifies logic
 2705         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
 2706         push( @{$rtoken_type}, 'b', 'b', 'b' );
 2707 
 2708         # initialize for main loop
 2709         foreach my $ii ( 0 .. $max_token_index + 3 ) {
 2710             $routput_token_type->[$ii]     = "";
 2711             $routput_block_type->[$ii]     = "";
 2712             $routput_container_type->[$ii] = "";
 2713             $routput_type_sequence->[$ii]  = "";
 2714             $routput_indent_flag->[$ii]    = 0;
 2715         }
 2716         $i     = -1;
 2717         $i_tok = -1;
 2718 
 2719         # ------------------------------------------------------------
 2720         # begin main tokenization loop
 2721         # ------------------------------------------------------------
 2722 
 2723         # we are looking at each pre-token of one line and combining them
 2724         # into tokens
 2725         while ( ++$i <= $max_token_index ) {
 2726 
 2727             if ($in_quote) {    # continue looking for end of a quote
 2728                 $type = $quote_type;
 2729 
 2730                 unless ( @{$routput_token_list} )
 2731                 {               # initialize if continuation line
 2732                     push( @{$routput_token_list}, $i );
 2733                     $routput_token_type->[$i] = $type;
 2734 
 2735                 }
 2736                 $tok = $quote_character unless ( $quote_character =~ /^\s*$/ );
 2737 
 2738                 # scan for the end of the quote or pattern
 2739                 (
 2740                     $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
 2741                     $quoted_string_1, $quoted_string_2
 2742                   )
 2743                   = do_quote(
 2744                     $i,               $in_quote,    $quote_character,
 2745                     $quote_pos,       $quote_depth, $quoted_string_1,
 2746                     $quoted_string_2, $rtokens,     $rtoken_map,
 2747                     $max_token_index
 2748                   );
 2749 
 2750                 # all done if we didn't find it
 2751                 last if ($in_quote);
 2752 
 2753                 # save pattern and replacement text for rescanning
 2754                 my $qs1 = $quoted_string_1;
 2755                 my $qs2 = $quoted_string_2;
 2756 
 2757                 # re-initialize for next search
 2758                 $quote_character = '';
 2759                 $quote_pos       = 0;
 2760                 $quote_type      = 'Q';
 2761                 $quoted_string_1 = "";
 2762                 $quoted_string_2 = "";
 2763                 last if ( ++$i > $max_token_index );
 2764 
 2765                 # look for any modifiers
 2766                 if ($allowed_quote_modifiers) {
 2767 
 2768                     # check for exact quote modifiers
 2769                     if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
 2770                         my $str = $rtokens->[$i];
 2771                         my $saw_modifier_e;
 2772                         while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
 2773                             my $pos  = pos($str);
 2774                             my $char = substr( $str, $pos - 1, 1 );
 2775                             $saw_modifier_e ||= ( $char eq 'e' );
 2776                         }
 2777 
 2778                         # For an 'e' quote modifier we must scan the replacement
 2779                         # text for here-doc targets.
 2780                         if ($saw_modifier_e) {
 2781 
 2782                             my $rht = scan_replacement_text($qs1);
 2783 
 2784                             # Change type from 'Q' to 'h' for quotes with
 2785                             # here-doc targets so that the formatter (see sub
 2786                             # print_line_of_tokens) will not make any line
 2787                             # breaks after this point.
 2788                             if ($rht) {
 2789                                 push @{$rhere_target_list}, @{$rht};
 2790                                 $type = 'h';
 2791                                 if ( $i_tok < 0 ) {
 2792                                     my $ilast = $routput_token_list->[-1];
 2793                                     $routput_token_type->[$ilast] = $type;
 2794                                 }
 2795                             }
 2796                         }
 2797 
 2798                         if ( defined( pos($str) ) ) {
 2799 
 2800                             # matched
 2801                             if ( pos($str) == length($str) ) {
 2802                                 last if ( ++$i > $max_token_index );
 2803                             }
 2804 
 2805                             # Looks like a joined quote modifier
 2806                             # and keyword, maybe something like
 2807                             # s/xxx/yyy/gefor @k=...
 2808                             # Example is "galgen.pl".  Would have to split
 2809                             # the word and insert a new token in the
 2810                             # pre-token list.  This is so rare that I haven't
 2811                             # done it.  Will just issue a warning citation.
 2812 
 2813                             # This error might also be triggered if my quote
 2814                             # modifier characters are incomplete
 2815                             else {
 2816                                 warning(<<EOM);
 2817 
 2818 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
 2819 Please put a space between quote modifiers and trailing keywords.
 2820 EOM
 2821 
 2822                          # print "token $rtokens->[$i]\n";
 2823                          # my $num = length($str) - pos($str);
 2824                          # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
 2825                          # print "continuing with new token $rtokens->[$i]\n";
 2826 
 2827                                 # skipping past this token does least damage
 2828                                 last if ( ++$i > $max_token_index );
 2829                             }
 2830                         }
 2831                         else {
 2832 
 2833                             # example file: rokicki4.pl
 2834                             # This error might also be triggered if my quote
 2835                             # modifier characters are incomplete
 2836                             write_logfile_entry(
 2837 "Note: found word $str at quote modifier location\n"
 2838                             );
 2839                         }
 2840                     }
 2841 
 2842                     # re-initialize
 2843                     $allowed_quote_modifiers = "";
 2844                 }
 2845             }
 2846 
 2847             unless ( $tok =~ /^\s*$/ || $tok eq 'CORE::' ) {
 2848 
 2849                 # try to catch some common errors
 2850                 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
 2851 
 2852                     if ( $last_nonblank_token eq 'eq' ) {
 2853                         complain("Should 'eq' be '==' here ?\n");
 2854                     }
 2855                     elsif ( $last_nonblank_token eq 'ne' ) {
 2856                         complain("Should 'ne' be '!=' here ?\n");
 2857                     }
 2858                 }
 2859 
 2860                 $last_last_nonblank_token      = $last_nonblank_token;
 2861                 $last_last_nonblank_type       = $last_nonblank_type;
 2862                 $last_last_nonblank_block_type = $last_nonblank_block_type;
 2863                 $last_last_nonblank_container_type =
 2864                   $last_nonblank_container_type;
 2865                 $last_last_nonblank_type_sequence =
 2866                   $last_nonblank_type_sequence;
 2867                 $last_nonblank_token          = $tok;
 2868                 $last_nonblank_type           = $type;
 2869                 $last_nonblank_prototype      = $prototype;
 2870                 $last_nonblank_block_type     = $block_type;
 2871                 $last_nonblank_container_type = $container_type;
 2872                 $last_nonblank_type_sequence  = $type_sequence;
 2873                 $last_nonblank_i              = $i_tok;
 2874             }
 2875 
 2876             # store previous token type
 2877             if ( $i_tok >= 0 ) {
 2878                 $routput_token_type->[$i_tok]     = $type;
 2879                 $routput_block_type->[$i_tok]     = $block_type;
 2880                 $routput_container_type->[$i_tok] = $container_type;
 2881                 $routput_type_sequence->[$i_tok]  = $type_sequence;
 2882                 $routput_indent_flag->[$i_tok]    = $indent_flag;
 2883             }
 2884             my $pre_tok  = $rtokens->[$i];        # get the next pre-token
 2885             my $pre_type = $rtoken_type->[$i];    # and type
 2886             $tok  = $pre_tok;
 2887             $type = $pre_type;                    # to be modified as necessary
 2888             $block_type = "";    # blank for all tokens except code block braces
 2889             $container_type = "";    # blank for all tokens except some parens
 2890             $type_sequence  = "";    # blank for all tokens except ?/:
 2891             $indent_flag    = 0;
 2892             $prototype = "";    # blank for all tokens except user defined subs
 2893             $i_tok     = $i;
 2894 
 2895             # this pre-token will start an output token
 2896             push( @{$routput_token_list}, $i_tok );
 2897 
 2898             # continue gathering identifier if necessary
 2899             # but do not start on blanks and comments
 2900             if ( $id_scan_state && $pre_type !~ /[b#]/ ) {
 2901 
 2902                 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
 2903                     scan_id();
 2904                 }
 2905                 else {
 2906                     scan_identifier();
 2907                 }
 2908 
 2909                 last if ($id_scan_state);
 2910                 next if ( ( $i > 0 ) || $type );
 2911 
 2912                 # didn't find any token; start over
 2913                 $type = $pre_type;
 2914                 $tok  = $pre_tok;
 2915             }
 2916 
 2917             # handle whitespace tokens..
 2918             next if ( $type eq 'b' );
 2919             my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : ' ';
 2920             my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
 2921 
 2922             # Build larger tokens where possible, since we are not in a quote.
 2923             #
 2924             # First try to assemble digraphs.  The following tokens are
 2925             # excluded and handled specially:
 2926             # '/=' is excluded because the / might start a pattern.
 2927             # 'x=' is excluded since it might be $x=, with $ on previous line
 2928             # '**' and *= might be typeglobs of punctuation variables
 2929             # I have allowed tokens starting with <, such as <=,
 2930             # because I don't think these could be valid angle operators.
 2931             # test file: storrs4.pl
 2932             my $test_tok   = $tok . $rtokens->[ $i + 1 ];
 2933             my $combine_ok = $is_digraph{$test_tok};
 2934 
 2935             # check for special cases which cannot be combined
 2936             if ($combine_ok) {
 2937 
 2938                 # '//' must be defined_or operator if an operator is expected.
 2939                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
 2940                 # could be migrated here for clarity
 2941 
 2942               # Patch for RT#102371, misparsing a // in the following snippet:
 2943               #     state $b //= ccc();
 2944               # The solution is to always accept the digraph (or trigraph) after
 2945               # token type 'Z' (possible file handle).  The reason is that
 2946               # sub operator_expected gives TERM expected here, which is
 2947               # wrong in this case.
 2948                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
 2949                     my $next_type = $rtokens->[ $i + 1 ];
 2950                     my $expecting =
 2951                       operator_expected( $prev_type, $tok, $next_type );
 2952 
 2953                     # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
 2954                     $combine_ok = 0 if ( $expecting == TERM );
 2955                 }
 2956 
 2957                 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
 2958                 # Accept the digraphs '**' only after type 'Z'
 2959                 # Otherwise postpone the decision.
 2960                 if ( $test_tok eq '**' ) {
 2961                     if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
 2962                 }
 2963             }
 2964 
 2965             if (
 2966                 $combine_ok
 2967 
 2968                 && ( $test_tok ne '/=' )    # might be pattern
 2969                 && ( $test_tok ne 'x=' )    # might be $x
 2970                 && ( $test_tok ne '*=' )    # typeglob?
 2971 
 2972                 # Moved above as part of fix for
 2973                 # RT #114359: Missparsing of "print $x ** 0.5;
 2974                 # && ( $test_tok ne '**' )    # typeglob?
 2975               )
 2976             {
 2977                 $tok = $test_tok;
 2978                 $i++;
 2979 
 2980                 # Now try to assemble trigraphs.  Note that all possible
 2981                 # perl trigraphs can be constructed by appending a character
 2982                 # to a digraph.
 2983                 $test_tok = $tok . $rtokens->[ $i + 1 ];
 2984 
 2985                 if ( $is_trigraph{$test_tok} ) {
 2986                     $tok = $test_tok;
 2987                     $i++;
 2988                 }
 2989 
 2990                 # The only current tetragraph is the double diamond operator
 2991                 # and its first three characters are not a trigraph, so
 2992                 # we do can do a special test for it
 2993                 elsif ( $test_tok eq '<<>' ) {
 2994                     $test_tok .= $rtokens->[ $i + 2 ];
 2995                     if ( $is_tetragraph{$test_tok} ) {
 2996                         $tok = $test_tok;
 2997                         $i += 2;
 2998                     }
 2999                 }
 3000             }
 3001 
 3002             $type      = $tok;
 3003             $next_tok  = $rtokens->[ $i + 1 ];
 3004             $next_type = $rtoken_type->[ $i + 1 ];
 3005 
 3006             TOKENIZER_DEBUG_FLAG_TOKENIZE && do {
 3007                 local $" = ')(';
 3008                 my @debug_list = (
 3009                     $last_nonblank_token,      $tok,
 3010                     $next_tok,                 $brace_depth,
 3011                     $brace_type[$brace_depth], $paren_depth,
 3012                     $paren_type[$paren_depth]
 3013                 );
 3014                 print STDOUT "TOKENIZE:(@debug_list)\n";
 3015             };
 3016 
 3017             # turn off attribute list on first non-blank, non-bareword
 3018             if ( $pre_type ne 'w' ) { $in_attribute_list = 0 }
 3019 
 3020             ###############################################################
 3021             # We have the next token, $tok.
 3022             # Now we have to examine this token and decide what it is
 3023             # and define its $type
 3024             #
 3025             # section 1: bare words
 3026             ###############################################################
 3027 
 3028             if ( $pre_type eq 'w' ) {
 3029                 $expecting = operator_expected( $prev_type, $tok, $next_type );
 3030                 my ( $next_nonblank_token, $i_next ) =
 3031                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 3032 
 3033                 # ATTRS: handle sub and variable attributes
 3034                 if ($in_attribute_list) {
 3035 
 3036                     # treat bare word followed by open paren like qw(
 3037                     if ( $next_nonblank_token eq '(' ) {
 3038                         $in_quote                = $quote_items{'q'};
 3039                         $allowed_quote_modifiers = $quote_modifiers{'q'};
 3040                         $type                    = 'q';
 3041                         $quote_type              = 'q';
 3042                         next;
 3043                     }
 3044 
 3045                     # handle bareword not followed by open paren
 3046                     else {
 3047                         $type = 'w';
 3048                         next;
 3049                     }
 3050                 }
 3051 
 3052                 # quote a word followed by => operator
 3053                 # unless the word __END__ or __DATA__ and the only word on
 3054                 # the line.
 3055                 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
 3056 
 3057                     if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
 3058                         if ( $is_constant{$current_package}{$tok} ) {
 3059                             $type = 'C';
 3060                         }
 3061                         elsif ( $is_user_function{$current_package}{$tok} ) {
 3062                             $type = 'U';
 3063                             $prototype =
 3064                               $user_function_prototype{$current_package}{$tok};
 3065                         }
 3066                         elsif ( $tok =~ /^v\d+$/ ) {
 3067                             $type = 'v';
 3068                             report_v_string($tok);
 3069                         }
 3070                         else { $type = 'w' }
 3071 
 3072                         next;
 3073                     }
 3074                 }
 3075 
 3076      # quote a bare word within braces..like xxx->{s}; note that we
 3077      # must be sure this is not a structural brace, to avoid
 3078      # mistaking {s} in the following for a quoted bare word:
 3079      #     for(@[){s}bla}BLA}
 3080      # Also treat q in something like var{-q} as a bare word, not qoute operator
 3081                 if (
 3082                     $next_nonblank_token eq '}'
 3083                     && (
 3084                         $last_nonblank_type eq 'L'
 3085                         || (   $last_nonblank_type eq 'm'
 3086                             && $last_last_nonblank_type eq 'L' )
 3087                     )
 3088                   )
 3089                 {
 3090                     $type = 'w';
 3091                     next;
 3092                 }
 3093 
 3094                 # a bare word immediately followed by :: is not a keyword;
 3095                 # use $tok_kw when testing for keywords to avoid a mistake
 3096                 my $tok_kw = $tok;
 3097                 if (   $rtokens->[ $i + 1 ] eq ':'
 3098                     && $rtokens->[ $i + 2 ] eq ':' )
 3099                 {
 3100                     $tok_kw .= '::';
 3101                 }
 3102 
 3103                 # handle operator x (now we know it isn't $x=)
 3104                 if ( ( $tok =~ /^x\d*$/ ) && ( $expecting == OPERATOR ) ) {
 3105                     if ( $tok eq 'x' ) {
 3106 
 3107                         if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
 3108                             $tok  = 'x=';
 3109                             $type = $tok;
 3110                             $i++;
 3111                         }
 3112                         else {
 3113                             $type = 'x';
 3114                         }
 3115                     }
 3116 
 3117                     # FIXME: Patch: mark something like x4 as an integer for now
 3118                     # It gets fixed downstream.  This is easier than
 3119                     # splitting the pretoken.
 3120                     else {
 3121                         $type = 'n';
 3122                     }
 3123                 }
 3124                 elsif ( $tok_kw eq 'CORE::' ) {
 3125                     $type = $tok = $tok_kw;
 3126                     $i += 2;
 3127                 }
 3128                 elsif ( ( $tok eq 'strict' )
 3129                     and ( $last_nonblank_token eq 'use' ) )
 3130                 {
 3131                     $tokenizer_self->{_saw_use_strict} = 1;
 3132                     scan_bare_identifier();
 3133                 }
 3134 
 3135                 elsif ( ( $tok eq 'warnings' )
 3136                     and ( $last_nonblank_token eq 'use' ) )
 3137                 {
 3138                     $tokenizer_self->{_saw_perl_dash_w} = 1;
 3139 
 3140                     # scan as identifier, so that we pick up something like:
 3141                     # use warnings::register
 3142                     scan_bare_identifier();
 3143                 }
 3144 
 3145                 elsif (
 3146                        $tok eq 'AutoLoader'
 3147                     && $tokenizer_self->{_look_for_autoloader}
 3148                     && (
 3149                         $last_nonblank_token eq 'use'
 3150 
 3151                         # these regexes are from AutoSplit.pm, which we want
 3152                         # to mimic
 3153                         || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
 3154                         || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
 3155                     )
 3156                   )
 3157                 {
 3158                     write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
 3159                     $tokenizer_self->{_saw_autoloader}      = 1;
 3160                     $tokenizer_self->{_look_for_autoloader} = 0;
 3161                     scan_bare_identifier();
 3162                 }
 3163 
 3164                 elsif (
 3165                        $tok eq 'SelfLoader'
 3166                     && $tokenizer_self->{_look_for_selfloader}
 3167                     && (   $last_nonblank_token eq 'use'
 3168                         || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
 3169                         || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
 3170                   )
 3171                 {
 3172                     write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
 3173                     $tokenizer_self->{_saw_selfloader}      = 1;
 3174                     $tokenizer_self->{_look_for_selfloader} = 0;
 3175                     scan_bare_identifier();
 3176                 }
 3177 
 3178                 elsif ( ( $tok eq 'constant' )
 3179                     and ( $last_nonblank_token eq 'use' ) )
 3180                 {
 3181                     scan_bare_identifier();
 3182                     my ( $next_nonblank_token, $i_next ) =
 3183                       find_next_nonblank_token( $i, $rtokens,
 3184                         $max_token_index );
 3185 
 3186                     if ($next_nonblank_token) {
 3187 
 3188                         if ( $is_keyword{$next_nonblank_token} ) {
 3189 
 3190                             # Assume qw is used as a quote and okay, as in:
 3191                             #  use constant qw{ DEBUG 0 };
 3192                             # Not worth trying to parse for just a warning
 3193 
 3194                             # NOTE: This warning is deactivated because recent
 3195                             # versions of perl do not complain here, but
 3196                             # the coding is retained for reference.
 3197                             if ( 0 && $next_nonblank_token ne 'qw' ) {
 3198                                 warning(
 3199 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
 3200                                 );
 3201                             }
 3202                         }
 3203 
 3204                         # FIXME: could check for error in which next token is
 3205                         # not a word (number, punctuation, ..)
 3206                         else {
 3207                             $is_constant{$current_package}{$next_nonblank_token}
 3208                               = 1;
 3209                         }
 3210                     }
 3211                 }
 3212 
 3213                 # various quote operators
 3214                 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
 3215 ##NICOL PATCH
 3216                     if ( $expecting == OPERATOR ) {
 3217 
 3218                         # Be careful not to call an error for a qw quote
 3219                         # where a parenthesized list is allowed.  For example,
 3220                         # it could also be a for/foreach construct such as
 3221                         #
 3222                         #    foreach my $key qw\Uno Due Tres Quadro\ {
 3223                         #        print "Set $key\n";
 3224                         #    }
 3225                         #
 3226 
 3227                         # Or it could be a function call.
 3228                         # NOTE: Braces in something like &{ xxx } are not
 3229                         # marked as a block, we might have a method call.
 3230                         # &method(...), $method->(..), &{method}(...),
 3231                         # $ref[2](list) is ok & short for $ref[2]->(list)
 3232                         #
 3233                         # See notes in 'sub code_block_type' and
 3234                         # 'sub is_non_structural_brace'
 3235 
 3236                         unless (
 3237                             $tok eq 'qw'
 3238                             && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
 3239                                 || $is_for_foreach{$want_paren} )
 3240                           )
 3241                         {
 3242                             error_if_expecting_OPERATOR();
 3243                         }
 3244                     }
 3245                     $in_quote                = $quote_items{$tok};
 3246                     $allowed_quote_modifiers = $quote_modifiers{$tok};
 3247 
 3248                    # All quote types are 'Q' except possibly qw quotes.
 3249                    # qw quotes are special in that they may generally be trimmed
 3250                    # of leading and trailing whitespace.  So they are given a
 3251                    # separate type, 'q', unless requested otherwise.
 3252                     $type =
 3253                       ( $tok eq 'qw' && $tokenizer_self->{_trim_qw} )
 3254                       ? 'q'
 3255                       : 'Q';
 3256                     $quote_type = $type;
 3257                 }
 3258 
 3259                 # check for a statement label
 3260                 elsif (
 3261                        ( $next_nonblank_token eq ':' )
 3262                     && ( $rtokens->[ $i_next + 1 ] ne ':' )
 3263                     && ( $i_next <= $max_token_index )    # colon on same line
 3264                     && label_ok()
 3265                   )
 3266                 {
 3267                     if ( $tok !~ /[A-Z]/ ) {
 3268                         push @{ $tokenizer_self->{_rlower_case_labels_at} },
 3269                           $input_line_number;
 3270                     }
 3271                     $type = 'J';
 3272                     $tok .= ':';
 3273                     $i = $i_next;
 3274                     next;
 3275                 }
 3276 
 3277                 #      'sub' || 'package'
 3278                 elsif ( $is_sub{$tok_kw} || $is_package{$tok_kw} ) {
 3279                     error_if_expecting_OPERATOR()
 3280                       if ( $expecting == OPERATOR );
 3281                     scan_id();
 3282                 }
 3283 
 3284                 # Note on token types for format, __DATA__, __END__:
 3285                 # It simplifies things to give these type ';', so that when we
 3286                 # start rescanning we will be expecting a token of type TERM.
 3287                 # We will switch to type 'k' before outputting the tokens.
 3288                 elsif ( $is_format_END_DATA{$tok_kw} ) {
 3289                     $type = ';';    # make tokenizer look for TERM next
 3290                     $tokenizer_self->{ $is_format_END_DATA{$tok_kw} } = 1;
 3291                     last;
 3292                 }
 3293 
 3294                 elsif ( $is_keyword{$tok_kw} ) {
 3295                     $type = 'k';
 3296 
 3297                     # Since for and foreach may not be followed immediately
 3298                     # by an opening paren, we have to remember which keyword
 3299                     # is associated with the next '('
 3300                     if ( $is_for_foreach{$tok} ) {
 3301                         if ( new_statement_ok() ) {
 3302                             $want_paren = $tok;
 3303                         }
 3304                     }
 3305 
 3306                     # recognize 'use' statements, which are special
 3307                     elsif ( $is_use_require{$tok} ) {
 3308                         $statement_type = $tok;
 3309                         error_if_expecting_OPERATOR()
 3310                           if ( $expecting == OPERATOR );
 3311                     }
 3312 
 3313                     # remember my and our to check for trailing ": shared"
 3314                     elsif ( $is_my_our{$tok} ) {
 3315                         $statement_type = $tok;
 3316                     }
 3317 
 3318                     # Check for misplaced 'elsif' and 'else', but allow isolated
 3319                     # else or elsif blocks to be formatted.  This is indicated
 3320                     # by a last noblank token of ';'
 3321                     elsif ( $tok eq 'elsif' ) {
 3322                         if (   $last_nonblank_token ne ';'
 3323                             && $last_nonblank_block_type !~
 3324                             /^(if|elsif|unless)$/ )
 3325                         {
 3326                             warning(
 3327 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
 3328                             );
 3329                         }
 3330                     }
 3331                     elsif ( $tok eq 'else' ) {
 3332 
 3333                         # patched for SWITCH/CASE
 3334                         if (
 3335                                $last_nonblank_token ne ';'
 3336                             && $last_nonblank_block_type !~
 3337                             /^(if|elsif|unless|case|when)$/
 3338 
 3339                             # patch to avoid an unwanted error message for
 3340                             # the case of a parenless 'case' (RT 105484):
 3341                             # switch ( 1 ) { case x { 2 } else { } }
 3342                             && $statement_type !~
 3343                             /^(if|elsif|unless|case|when)$/
 3344                           )
 3345                         {
 3346                             warning(
 3347 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
 3348                             );
 3349                         }
 3350                     }
 3351                     elsif ( $tok eq 'continue' ) {
 3352                         if (   $last_nonblank_token ne ';'
 3353                             && $last_nonblank_block_type !~
 3354                             /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
 3355                         {
 3356 
 3357                             # note: ';' '{' and '}' in list above
 3358                             # because continues can follow bare blocks;
 3359                             # ':' is labeled block
 3360                             #
 3361                             ############################################
 3362                             # NOTE: This check has been deactivated because
 3363                             # continue has an alternative usage for given/when
 3364                             # blocks in perl 5.10
 3365                             ## warning("'$tok' should follow a block\n");
 3366                             ############################################
 3367                         }
 3368                     }
 3369 
 3370                     # patch for SWITCH/CASE if 'case' and 'when are
 3371                     # treated as keywords.
 3372                     elsif ( $tok eq 'when' || $tok eq 'case' ) {
 3373                         $statement_type = $tok;    # next '{' is block
 3374                     }
 3375 
 3376                     #
 3377                     # indent trailing if/unless/while/until
 3378                     # outdenting will be handled by later indentation loop
 3379 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
 3380 ##$opt_o = 1
 3381 ##  if !(
 3382 ##             $opt_b
 3383 ##          || $opt_c
 3384 ##          || $opt_d
 3385 ##          || $opt_f
 3386 ##          || $opt_i
 3387 ##          || $opt_l
 3388 ##          || $opt_o
 3389 ##          || $opt_x
 3390 ##  );
 3391 ##                    if (   $tok =~ /^(if|unless|while|until)$/
 3392 ##                        && $next_nonblank_token ne '(' )
 3393 ##                    {
 3394 ##                        $indent_flag = 1;
 3395 ##                    }
 3396                 }
 3397 
 3398                 # check for inline label following
 3399                 #         /^(redo|last|next|goto)$/
 3400                 elsif (( $last_nonblank_type eq 'k' )
 3401                     && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
 3402                 {
 3403                     $type = 'j';
 3404                     next;
 3405                 }
 3406 
 3407                 # something else --
 3408                 else {
 3409 
 3410                     scan_bare_identifier();
 3411                     if ( $type eq 'w' ) {
 3412 
 3413                         if ( $expecting == OPERATOR ) {
 3414 
 3415                             # don't complain about possible indirect object
 3416                             # notation.
 3417                             # For example:
 3418                             #   package main;
 3419                             #   sub new($) { ... }
 3420                             #   $b = new A::;  # calls A::new
 3421                             #   $c = new A;    # same thing but suspicious
 3422                             # This will call A::new but we have a 'new' in
 3423                             # main:: which looks like a constant.
 3424                             #
 3425                             if ( $last_nonblank_type eq 'C' ) {
 3426                                 if ( $tok !~ /::$/ ) {
 3427                                     complain(<<EOM);
 3428 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
 3429        Maybe indirectet object notation?
 3430 EOM
 3431                                 }
 3432                             }
 3433                             else {
 3434                                 error_if_expecting_OPERATOR("bareword");
 3435                             }
 3436                         }
 3437 
 3438                         # mark bare words immediately followed by a paren as
 3439                         # functions
 3440                         $next_tok = $rtokens->[ $i + 1 ];
 3441                         if ( $next_tok eq '(' ) {
 3442                             $type = 'U';
 3443                         }
 3444 
 3445                         # underscore after file test operator is file handle
 3446                         if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
 3447                             $type = 'Z';
 3448                         }
 3449 
 3450                         # patch for SWITCH/CASE if 'case' and 'when are
 3451                         # not treated as keywords:
 3452                         if (
 3453                             (
 3454                                    $tok eq 'case'
 3455                                 && $brace_type[$brace_depth] eq 'switch'
 3456                             )
 3457                             || (   $tok eq 'when'
 3458                                 && $brace_type[$brace_depth] eq 'given' )
 3459                           )
 3460                         {
 3461                             $statement_type = $tok;    # next '{' is block
 3462                             $type = 'k';    # for keyword syntax coloring
 3463                         }
 3464 
 3465                         # patch for SWITCH/CASE if switch and given not keywords
 3466                         # Switch is not a perl 5 keyword, but we will gamble
 3467                         # and mark switch followed by paren as a keyword.  This
 3468                         # is only necessary to get html syntax coloring nice,
 3469                         # and does not commit this as being a switch/case.
 3470                         if ( $next_nonblank_token eq '('
 3471                             && ( $tok eq 'switch' || $tok eq 'given' ) )
 3472                         {
 3473                             $type = 'k';    # for keyword syntax coloring
 3474                         }
 3475                     }
 3476                 }
 3477             }
 3478 
 3479             ###############################################################
 3480             # section 2: strings of digits
 3481             ###############################################################
 3482             elsif ( $pre_type eq 'd' ) {
 3483                 $expecting = operator_expected( $prev_type, $tok, $next_type );
 3484                 error_if_expecting_OPERATOR("Number")
 3485                   if ( $expecting == OPERATOR );
 3486                 my $number = scan_number();
 3487                 if ( !defined($number) ) {
 3488 
 3489                     # shouldn't happen - we should always get a number
 3490                     warning("non-number beginning with digit--program bug\n");
 3491                     report_definite_bug();
 3492                 }
 3493             }
 3494 
 3495             ###############################################################
 3496             # section 3: all other tokens
 3497             ###############################################################
 3498 
 3499             else {
 3500                 last if ( $tok eq '#' );
 3501                 my $code = $tokenization_code->{$tok};
 3502                 if ($code) {
 3503                     $expecting =
 3504                       operator_expected( $prev_type, $tok, $next_type );
 3505                     $code->();
 3506                     redo if $in_quote;
 3507                 }
 3508             }
 3509         }
 3510 
 3511         # -----------------------------
 3512         # end of main tokenization loop
 3513         # -----------------------------
 3514 
 3515         if ( $i_tok >= 0 ) {
 3516             $routput_token_type->[$i_tok]     = $type;
 3517             $routput_block_type->[$i_tok]     = $block_type;
 3518             $routput_container_type->[$i_tok] = $container_type;
 3519             $routput_type_sequence->[$i_tok]  = $type_sequence;
 3520             $routput_indent_flag->[$i_tok]    = $indent_flag;
 3521         }
 3522 
 3523         unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
 3524             $last_last_nonblank_token          = $last_nonblank_token;
 3525             $last_last_nonblank_type           = $last_nonblank_type;
 3526             $last_last_nonblank_block_type     = $last_nonblank_block_type;
 3527             $last_last_nonblank_container_type = $last_nonblank_container_type;
 3528             $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
 3529             $last_nonblank_token               = $tok;
 3530             $last_nonblank_type                = $type;
 3531             $last_nonblank_block_type          = $block_type;
 3532             $last_nonblank_container_type      = $container_type;
 3533             $last_nonblank_type_sequence       = $type_sequence;
 3534             $last_nonblank_prototype           = $prototype;
 3535         }
 3536 
 3537         # reset indentation level if necessary at a sub or package
 3538         # in an attempt to recover from a nesting error
 3539         if ( $level_in_tokenizer < 0 ) {
 3540             if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
 3541                 reset_indentation_level(0);
 3542                 brace_warning("resetting level to 0 at $1 $2\n");
 3543             }
 3544         }
 3545 
 3546         # all done tokenizing this line ...
 3547         # now prepare the final list of tokens and types
 3548 
 3549         my @token_type     = ();   # stack of output token types
 3550         my @block_type     = ();   # stack of output code block types
 3551         my @container_type = ();   # stack of output code container types
 3552         my @type_sequence  = ();   # stack of output type sequence numbers
 3553         my @tokens         = ();   # output tokens
 3554         my @levels         = ();   # structural brace levels of output tokens
 3555         my @slevels        = ();   # secondary nesting levels of output tokens
 3556         my @nesting_tokens = ();   # string of tokens leading to this depth
 3557         my @nesting_types  = ();   # string of token types leading to this depth
 3558         my @nesting_blocks = ();   # string of block types leading to this depth
 3559         my @nesting_lists  = ();   # string of list types leading to this depth
 3560         my @ci_string = ();  # string needed to compute continuation indentation
 3561         my @container_environment = ();    # BLOCK or LIST
 3562         my $container_environment = '';
 3563         my $im                    = -1;    # previous $i value
 3564         my $num;
 3565         my $ci_string_sum = ones_count($ci_string_in_tokenizer);
 3566 
 3567 # Computing Token Indentation
 3568 #
 3569 #     The final section of the tokenizer forms tokens and also computes
 3570 #     parameters needed to find indentation.  It is much easier to do it
 3571 #     in the tokenizer than elsewhere.  Here is a brief description of how
 3572 #     indentation is computed.  Perl::Tidy computes indentation as the sum
 3573 #     of 2 terms:
 3574 #
 3575 #     (1) structural indentation, such as if/else/elsif blocks
 3576 #     (2) continuation indentation, such as long parameter call lists.
 3577 #
 3578 #     These are occasionally called primary and secondary indentation.
 3579 #
 3580 #     Structural indentation is introduced by tokens of type '{', although
 3581 #     the actual tokens might be '{', '(', or '['.  Structural indentation
 3582 #     is of two types: BLOCK and non-BLOCK.  Default structural indentation
 3583 #     is 4 characters if the standard indentation scheme is used.
 3584 #
 3585 #     Continuation indentation is introduced whenever a line at BLOCK level
 3586 #     is broken before its termination.  Default continuation indentation
 3587 #     is 2 characters in the standard indentation scheme.
 3588 #
 3589 #     Both types of indentation may be nested arbitrarily deep and
 3590 #     interlaced.  The distinction between the two is somewhat arbitrary.
 3591 #
 3592 #     For each token, we will define two variables which would apply if
 3593 #     the current statement were broken just before that token, so that
 3594 #     that token started a new line:
 3595 #
 3596 #     $level = the structural indentation level,
 3597 #     $ci_level = the continuation indentation level
 3598 #
 3599 #     The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
 3600 #     assuming defaults.  However, in some special cases it is customary
 3601 #     to modify $ci_level from this strict value.
 3602 #
 3603 #     The total structural indentation is easy to compute by adding and
 3604 #     subtracting 1 from a saved value as types '{' and '}' are seen.  The
 3605 #     running value of this variable is $level_in_tokenizer.
 3606 #
 3607 #     The total continuation is much more difficult to compute, and requires
 3608 #     several variables.  These variables are:
 3609 #
 3610 #     $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
 3611 #       each indentation level, if there are intervening open secondary
 3612 #       structures just prior to that level.
 3613 #     $continuation_string_in_tokenizer = a string of 1's and 0's indicating
 3614 #       if the last token at that level is "continued", meaning that it
 3615 #       is not the first token of an expression.
 3616 #     $nesting_block_string = a string of 1's and 0's indicating, for each
 3617 #       indentation level, if the level is of type BLOCK or not.
 3618 #     $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
 3619 #     $nesting_list_string = a string of 1's and 0's indicating, for each
 3620 #       indentation level, if it is appropriate for list formatting.
 3621 #       If so, continuation indentation is used to indent long list items.
 3622 #     $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
 3623 #     @{$rslevel_stack} = a stack of total nesting depths at each
 3624 #       structural indentation level, where "total nesting depth" means
 3625 #       the nesting depth that would occur if every nesting token -- '{', '[',
 3626 #       and '(' -- , regardless of context, is used to compute a nesting
 3627 #       depth.
 3628 
 3629         #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
 3630         #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
 3631 
 3632         my ( $ci_string_i, $level_i, $nesting_block_string_i,
 3633             $nesting_list_string_i, $nesting_token_string_i,
 3634             $nesting_type_string_i, );
 3635 
 3636         foreach my $i ( @{$routput_token_list} )
 3637         {    # scan the list of pre-tokens indexes
 3638 
 3639             # self-checking for valid token types
 3640             my $type                    = $routput_token_type->[$i];
 3641             my $forced_indentation_flag = $routput_indent_flag->[$i];
 3642 
 3643             # See if we should undo the $forced_indentation_flag.
 3644             # Forced indentation after 'if', 'unless', 'while' and 'until'
 3645             # expressions without trailing parens is optional and doesn't
 3646             # always look good.  It is usually okay for a trailing logical
 3647             # expression, but if the expression is a function call, code block,
 3648             # or some kind of list it puts in an unwanted extra indentation
 3649             # level which is hard to remove.
 3650             #
 3651             # Example where extra indentation looks ok:
 3652             # return 1
 3653             #   if $det_a < 0 and $det_b > 0
 3654             #       or $det_a > 0 and $det_b < 0;
 3655             #
 3656             # Example where extra indentation is not needed because
 3657             # the eval brace also provides indentation:
 3658             # print "not " if defined eval {
 3659             #     reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
 3660             # };
 3661             #
 3662             # The following rule works fairly well:
 3663             #   Undo the flag if the end of this line, or start of the next
 3664             #   line, is an opening container token or a comma.
 3665             # This almost always works, but if not after another pass it will
 3666             # be stable.
 3667             if ( $forced_indentation_flag && $type eq 'k' ) {
 3668                 my $ixlast  = -1;
 3669                 my $ilast   = $routput_token_list->[$ixlast];
 3670                 my $toklast = $routput_token_type->[$ilast];
 3671                 if ( $toklast eq '#' ) {
 3672                     $ixlast--;
 3673                     $ilast   = $routput_token_list->[$ixlast];
 3674                     $toklast = $routput_token_type->[$ilast];
 3675                 }
 3676                 if ( $toklast eq 'b' ) {
 3677                     $ixlast--;
 3678                     $ilast   = $routput_token_list->[$ixlast];
 3679                     $toklast = $routput_token_type->[$ilast];
 3680                 }
 3681                 if ( $toklast =~ /^[\{,]$/ ) {
 3682                     $forced_indentation_flag = 0;
 3683                 }
 3684                 else {
 3685                     ( $toklast, my $i_next ) =
 3686                       find_next_nonblank_token( $max_token_index, $rtokens,
 3687                         $max_token_index );
 3688                     if ( $toklast =~ /^[\{,]$/ ) {
 3689                         $forced_indentation_flag = 0;
 3690                     }
 3691                 }
 3692             }
 3693 
 3694             # if we are already in an indented if, see if we should outdent
 3695             if ($indented_if_level) {
 3696 
 3697                 # don't try to nest trailing if's - shouldn't happen
 3698                 if ( $type eq 'k' ) {
 3699                     $forced_indentation_flag = 0;
 3700                 }
 3701 
 3702                 # check for the normal case - outdenting at next ';'
 3703                 elsif ( $type eq ';' ) {
 3704                     if ( $level_in_tokenizer == $indented_if_level ) {
 3705                         $forced_indentation_flag = -1;
 3706                         $indented_if_level       = 0;
 3707                     }
 3708                 }
 3709 
 3710                 # handle case of missing semicolon
 3711                 elsif ( $type eq '}' ) {
 3712                     if ( $level_in_tokenizer == $indented_if_level ) {
 3713                         $indented_if_level = 0;
 3714 
 3715                         # TBD: This could be a subroutine call
 3716                         $level_in_tokenizer--;
 3717                         if ( @{$rslevel_stack} > 1 ) {
 3718                             pop( @{$rslevel_stack} );
 3719                         }
 3720                         if ( length($nesting_block_string) > 1 )
 3721                         {    # true for valid script
 3722                             chop $nesting_block_string;
 3723                             chop $nesting_list_string;
 3724                         }
 3725 
 3726                     }
 3727                 }
 3728             }
 3729 
 3730             my $tok = $rtokens->[$i];  # the token, but ONLY if same as pretoken
 3731             $level_i = $level_in_tokenizer;
 3732 
 3733             # This can happen by running perltidy on non-scripts
 3734             # although it could also be bug introduced by programming change.
 3735             # Perl silently accepts a 032 (^Z) and takes it as the end
 3736             if ( !$is_valid_token_type{$type} ) {
 3737                 my $val = ord($type);
 3738                 warning(
 3739                     "unexpected character decimal $val ($type) in script\n");
 3740                 $tokenizer_self->{_in_error} = 1;
 3741             }
 3742 
 3743             # ----------------------------------------------------------------
 3744             # TOKEN TYPE PATCHES
 3745             #  output __END__, __DATA__, and format as type 'k' instead of ';'
 3746             # to make html colors correct, etc.
 3747             my $fix_type = $type;
 3748             if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
 3749 
 3750             # output anonymous 'sub' as keyword
 3751             if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
 3752 
 3753             # -----------------------------------------------------------------
 3754 
 3755             $nesting_token_string_i = $nesting_token_string;
 3756             $nesting_type_string_i  = $nesting_type_string;
 3757             $nesting_block_string_i = $nesting_block_string;
 3758             $nesting_list_string_i  = $nesting_list_string;
 3759 
 3760             # set primary indentation levels based on structural braces
 3761             # Note: these are set so that the leading braces have a HIGHER
 3762             # level than their CONTENTS, which is convenient for indentation
 3763             # Also, define continuation indentation for each token.
 3764             if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
 3765             {
 3766 
 3767                 # use environment before updating
 3768                 $container_environment =
 3769                     $nesting_block_flag ? 'BLOCK'
 3770                   : $nesting_list_flag  ? 'LIST'
 3771                   :                       "";
 3772 
 3773                 # if the difference between total nesting levels is not 1,
 3774                 # there are intervening non-structural nesting types between
 3775                 # this '{' and the previous unclosed '{'
 3776                 my $intervening_secondary_structure = 0;
 3777                 if ( @{$rslevel_stack} ) {
 3778                     $intervening_secondary_structure =
 3779                       $slevel_in_tokenizer - $rslevel_stack->[-1];
 3780                 }
 3781 
 3782      # Continuation Indentation
 3783      #
 3784      # Having tried setting continuation indentation both in the formatter and
 3785      # in the tokenizer, I can say that setting it in the tokenizer is much,
 3786      # much easier.  The formatter already has too much to do, and can't
 3787      # make decisions on line breaks without knowing what 'ci' will be at
 3788      # arbitrary locations.
 3789      #
 3790      # But a problem with setting the continuation indentation (ci) here
 3791      # in the tokenizer is that we do not know where line breaks will actually
 3792      # be.  As a result, we don't know if we should propagate continuation
 3793      # indentation to higher levels of structure.
 3794      #
 3795      # For nesting of only structural indentation, we never need to do this.
 3796      # For example, in a long if statement, like this
 3797      #
 3798      #   if ( !$output_block_type[$i]
 3799      #     && ($in_statement_continuation) )
 3800      #   {           <--outdented
 3801      #       do_something();
 3802      #   }
 3803      #
 3804      # the second line has ci but we do normally give the lines within the BLOCK
 3805      # any ci.  This would be true if we had blocks nested arbitrarily deeply.
 3806      #
 3807      # But consider something like this, where we have created a break after
 3808      # an opening paren on line 1, and the paren is not (currently) a
 3809      # structural indentation token:
 3810      #
 3811      # my $file = $menubar->Menubutton(
 3812      #   qw/-text File -underline 0 -menuitems/ => [
 3813      #       [
 3814      #           Cascade    => '~View',
 3815      #           -menuitems => [
 3816      #           ...
 3817      #
 3818      # The second line has ci, so it would seem reasonable to propagate it
 3819      # down, giving the third line 1 ci + 1 indentation.  This suggests the
 3820      # following rule, which is currently used to propagating ci down: if there
 3821      # are any non-structural opening parens (or brackets, or braces), before
 3822      # an opening structural brace, then ci is propagated down, and otherwise
 3823      # not.  The variable $intervening_secondary_structure contains this
 3824      # information for the current token, and the string
 3825      # "$ci_string_in_tokenizer" is a stack of previous values of this
 3826      # variable.
 3827 
 3828                 # save the current states
 3829                 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
 3830                 $level_in_tokenizer++;
 3831 
 3832                 if ($forced_indentation_flag) {
 3833 
 3834                     # break BEFORE '?' when there is forced indentation
 3835                     if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
 3836                     if ( $type eq 'k' ) {
 3837                         $indented_if_level = $level_in_tokenizer;
 3838                     }
 3839 
 3840                     # do not change container environment here if we are not
 3841                     # at a real list. Adding this check prevents "blinkers"
 3842                     # often near 'unless" clauses, such as in the following
 3843                     # code:
 3844 ##          next
 3845 ##            unless -e (
 3846 ##                    $archive =
 3847 ##                      File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
 3848 ##            );
 3849 
 3850                     $nesting_block_string .= "$nesting_block_flag";
 3851                 }
 3852                 else {
 3853 
 3854                     if ( $routput_block_type->[$i] ) {
 3855                         $nesting_block_flag = 1;
 3856                         $nesting_block_string .= '1';
 3857                     }
 3858                     else {
 3859                         $nesting_block_flag = 0;
 3860                         $nesting_block_string .= '0';
 3861                     }
 3862                 }
 3863 
 3864                 # we will use continuation indentation within containers
 3865                 # which are not blocks and not logical expressions
 3866                 my $bit = 0;
 3867                 if ( !$routput_block_type->[$i] ) {
 3868 
 3869                     # propagate flag down at nested open parens
 3870                     if ( $routput_container_type->[$i] eq '(' ) {
 3871                         $bit = 1 if $nesting_list_flag;
 3872                     }
 3873 
 3874                   # use list continuation if not a logical grouping
 3875                   # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
 3876                     else {
 3877                         $bit = 1
 3878                           unless
 3879                           $is_logical_container{ $routput_container_type->[$i]
 3880                           };
 3881                     }
 3882                 }
 3883                 $nesting_list_string .= $bit;
 3884                 $nesting_list_flag = $bit;
 3885 
 3886                 $ci_string_in_tokenizer .=
 3887                   ( $intervening_secondary_structure != 0 ) ? '1' : '0';
 3888                 $ci_string_sum = ones_count($ci_string_in_tokenizer);
 3889                 $continuation_string_in_tokenizer .=
 3890                   ( $in_statement_continuation > 0 ) ? '1' : '0';
 3891 
 3892    #  Sometimes we want to give an opening brace continuation indentation,
 3893    #  and sometimes not.  For code blocks, we don't do it, so that the leading
 3894    #  '{' gets outdented, like this:
 3895    #
 3896    #   if ( !$output_block_type[$i]
 3897    #     && ($in_statement_continuation) )
 3898    #   {           <--outdented
 3899    #
 3900    #  For other types, we will give them continuation indentation.  For example,
 3901    #  here is how a list looks with the opening paren indented:
 3902    #
 3903    #     @LoL =
 3904    #       ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
 3905    #         [ "homer", "marge", "bart" ], );
 3906    #
 3907    #  This looks best when 'ci' is one-half of the indentation  (i.e., 2 and 4)
 3908 
 3909                 my $total_ci = $ci_string_sum;
 3910                 if (
 3911                     !$routput_block_type->[$i]    # patch: skip for BLOCK
 3912                     && ($in_statement_continuation)
 3913                     && !( $forced_indentation_flag && $type eq ':' )
 3914                   )
 3915                 {
 3916                     $total_ci += $in_statement_continuation
 3917                       unless ( $ci_string_in_tokenizer =~ /1$/ );
 3918                 }
 3919 
 3920                 $ci_string_i               = $total_ci;
 3921                 $in_statement_continuation = 0;
 3922             }
 3923 
 3924             elsif ($type eq '}'
 3925                 || $type eq 'R'
 3926                 || $forced_indentation_flag < 0 )
 3927             {
 3928 
 3929                 # only a nesting error in the script would prevent popping here
 3930                 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
 3931 
 3932                 $level_i = --$level_in_tokenizer;
 3933 
 3934                 # restore previous level values
 3935                 if ( length($nesting_block_string) > 1 )
 3936                 {    # true for valid script
 3937                     chop $nesting_block_string;
 3938                     $nesting_block_flag = ( $nesting_block_string =~ /1$/ );
 3939                     chop $nesting_list_string;
 3940                     $nesting_list_flag = ( $nesting_list_string =~ /1$/ );
 3941 
 3942                     chop $ci_string_in_tokenizer;
 3943                     $ci_string_sum = ones_count($ci_string_in_tokenizer);
 3944 
 3945                     $in_statement_continuation =
 3946                       chop $continuation_string_in_tokenizer;
 3947 
 3948                     # zero continuation flag at terminal BLOCK '}' which
 3949                     # ends a statement.
 3950                     if ( $routput_block_type->[$i] ) {
 3951 
 3952                         # ...These include non-anonymous subs
 3953                         # note: could be sub ::abc { or sub 'abc
 3954                         if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
 3955 
 3956                          # note: older versions of perl require the /gc modifier
 3957                          # here or else the \G does not work.
 3958                             if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
 3959                             {
 3960                                 $in_statement_continuation = 0;
 3961                             }
 3962                         }
 3963 
 3964 # ...and include all block types except user subs with
 3965 # block prototypes and these: (sort|grep|map|do|eval)
 3966 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
 3967                         elsif (
 3968                             $is_zero_continuation_block_type{
 3969                                 $routput_block_type->[$i]
 3970                             } )
 3971                         {
 3972                             $in_statement_continuation = 0;
 3973                         }
 3974 
 3975                         # ..but these are not terminal types:
 3976                         #     /^(sort|grep|map|do|eval)$/ )
 3977                         elsif (
 3978                             $is_not_zero_continuation_block_type{
 3979                                 $routput_block_type->[$i]
 3980                             } )
 3981                         {
 3982                         }
 3983 
 3984                         # ..and a block introduced by a label
 3985                         # /^\w+\s*:$/gc ) {
 3986                         elsif ( $routput_block_type->[$i] =~ /:$/ ) {
 3987                             $in_statement_continuation = 0;
 3988                         }
 3989 
 3990                         # user function with block prototype
 3991                         else {
 3992                             $in_statement_continuation = 0;
 3993                         }
 3994                     }
 3995 
 3996                     # If we are in a list, then
 3997                     # we must set continuation indentation at the closing
 3998                     # paren of something like this (paren after $check):
 3999                     #     assert(
 4000                     #         __LINE__,
 4001                     #         ( not defined $check )
 4002                     #           or ref $check
 4003                     #           or $check eq "new"
 4004                     #           or $check eq "old",
 4005                     #     );
 4006                     elsif ( $tok eq ')' ) {
 4007                         $in_statement_continuation = 1
 4008                           if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
 4009                     }
 4010 
 4011                     elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
 4012                 }
 4013 
 4014                 # use environment after updating
 4015                 $container_environment =
 4016                     $nesting_block_flag ? 'BLOCK'
 4017                   : $nesting_list_flag  ? 'LIST'
 4018                   :                       "";
 4019                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
 4020                 $nesting_block_string_i = $nesting_block_string;
 4021                 $nesting_list_string_i  = $nesting_list_string;
 4022             }
 4023 
 4024             # not a structural indentation type..
 4025             else {
 4026 
 4027                 $container_environment =
 4028                     $nesting_block_flag ? 'BLOCK'
 4029                   : $nesting_list_flag  ? 'LIST'
 4030                   :                       "";
 4031 
 4032                 # zero the continuation indentation at certain tokens so
 4033                 # that they will be at the same level as its container.  For
 4034                 # commas, this simplifies the -lp indentation logic, which
 4035                 # counts commas.  For ?: it makes them stand out.
 4036                 if ($nesting_list_flag) {
 4037                     if ( $type =~ /^[,\?\:]$/ ) {
 4038                         $in_statement_continuation = 0;
 4039                     }
 4040                 }
 4041 
 4042                 # be sure binary operators get continuation indentation
 4043                 if (
 4044                     $container_environment
 4045                     && (   $type eq 'k' && $is_binary_keyword{$tok}
 4046                         || $is_binary_type{$type} )
 4047                   )
 4048                 {
 4049                     $in_statement_continuation = 1;
 4050                 }
 4051 
 4052                 # continuation indentation is sum of any open ci from previous
 4053                 # levels plus the current level
 4054                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
 4055 
 4056                 # update continuation flag ...
 4057                 # if this isn't a blank or comment..
 4058                 if ( $type ne 'b' && $type ne '#' ) {
 4059 
 4060                     # and we are in a BLOCK
 4061                     if ($nesting_block_flag) {
 4062 
 4063                         # the next token after a ';' and label starts a new stmt
 4064                         if ( $type eq ';' || $type eq 'J' ) {
 4065                             $in_statement_continuation = 0;
 4066                         }
 4067 
 4068                         # otherwise, we are continuing the current statement
 4069                         else {
 4070                             $in_statement_continuation = 1;
 4071                         }
 4072                     }
 4073 
 4074                     # if we are not in a BLOCK..
 4075                     else {
 4076 
 4077                         # do not use continuation indentation if not list
 4078                         # environment (could be within if/elsif clause)
 4079                         if ( !$nesting_list_flag ) {
 4080                             $in_statement_continuation = 0;
 4081                         }
 4082 
 4083                         # otherwise, the token after a ',' starts a new term
 4084 
 4085                         # Patch FOR RT#99961; no continuation after a ';'
 4086                         # This is needed because perltidy currently marks
 4087                         # a block preceded by a type character like % or @
 4088                         # as a non block, to simplify formatting. But these
 4089                         # are actually blocks and can have semicolons.
 4090                         # See code_block_type() and is_non_structural_brace().
 4091                         elsif ( $type eq ',' || $type eq ';' ) {
 4092                             $in_statement_continuation = 0;
 4093                         }
 4094 
 4095                         # otherwise, we are continuing the current term
 4096                         else {
 4097                             $in_statement_continuation = 1;
 4098                         }
 4099                     }
 4100                 }
 4101             }
 4102 
 4103             if ( $level_in_tokenizer < 0 ) {
 4104                 unless ( $tokenizer_self->{_saw_negative_indentation} ) {
 4105                     $tokenizer_self->{_saw_negative_indentation} = 1;
 4106                     warning("Starting negative indentation\n");
 4107                 }
 4108             }
 4109 
 4110             # set secondary nesting levels based on all containment token types
 4111             # Note: these are set so that the nesting depth is the depth
 4112             # of the PREVIOUS TOKEN, which is convenient for setting
 4113             # the strength of token bonds
 4114             my $slevel_i = $slevel_in_tokenizer;
 4115 
 4116             #    /^[L\{\(\[]$/
 4117             if ( $is_opening_type{$type} ) {
 4118                 $slevel_in_tokenizer++;
 4119                 $nesting_token_string .= $tok;
 4120                 $nesting_type_string  .= $type;
 4121             }
 4122 
 4123             #       /^[R\}\)\]]$/
 4124             elsif ( $is_closing_type{$type} ) {
 4125                 $slevel_in_tokenizer--;
 4126                 my $char = chop $nesting_token_string;
 4127 
 4128                 if ( $char ne $matching_start_token{$tok} ) {
 4129                     $nesting_token_string .= $char . $tok;
 4130                     $nesting_type_string  .= $type;
 4131                 }
 4132                 else {
 4133                     chop $nesting_type_string;
 4134                 }
 4135             }
 4136 
 4137             push( @block_type,            $routput_block_type->[$i] );
 4138             push( @ci_string,             $ci_string_i );
 4139             push( @container_environment, $container_environment );
 4140             push( @container_type,        $routput_container_type->[$i] );
 4141             push( @levels,                $level_i );
 4142             push( @nesting_tokens,        $nesting_token_string_i );
 4143             push( @nesting_types,         $nesting_type_string_i );
 4144             push( @slevels,               $slevel_i );
 4145             push( @token_type,            $fix_type );
 4146             push( @type_sequence,         $routput_type_sequence->[$i] );
 4147             push( @nesting_blocks,        $nesting_block_string );
 4148             push( @nesting_lists,         $nesting_list_string );
 4149 
 4150             # now form the previous token
 4151             if ( $im >= 0 ) {
 4152                 $num =
 4153                   $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
 4154 
 4155                 if ( $num > 0 ) {
 4156                     push( @tokens,
 4157                         substr( $input_line, $rtoken_map->[$im], $num ) );
 4158                 }
 4159             }
 4160             $im = $i;
 4161         }
 4162 
 4163         $num = length($input_line) - $rtoken_map->[$im];   # make the last token
 4164         if ( $num > 0 ) {
 4165             push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
 4166         }
 4167 
 4168         $tokenizer_self->{_in_attribute_list} = $in_attribute_list;
 4169         $tokenizer_self->{_in_quote}          = $in_quote;
 4170         $tokenizer_self->{_quote_target} =
 4171           $in_quote ? matching_end_token($quote_character) : "";
 4172         $tokenizer_self->{_rhere_target_list} = $rhere_target_list;
 4173 
 4174         $line_of_tokens->{_rtoken_type}            = \@token_type;
 4175         $line_of_tokens->{_rtokens}                = \@tokens;
 4176         $line_of_tokens->{_rblock_type}            = \@block_type;
 4177         $line_of_tokens->{_rcontainer_type}        = \@container_type;
 4178         $line_of_tokens->{_rcontainer_environment} = \@container_environment;
 4179         $line_of_tokens->{_rtype_sequence}         = \@type_sequence;
 4180         $line_of_tokens->{_rlevels}                = \@levels;
 4181         $line_of_tokens->{_rslevels}               = \@slevels;
 4182         $line_of_tokens->{_rnesting_tokens}        = \@nesting_tokens;
 4183         $line_of_tokens->{_rci_levels}             = \@ci_string;
 4184         $line_of_tokens->{_rnesting_blocks}        = \@nesting_blocks;
 4185 
 4186         return;
 4187     }
 4188 }    # end tokenize_this_line
 4189 
 4190 #########i#############################################################
 4191 # Tokenizer routines which assist in identifying token types
 4192 #######################################################################
 4193 
 4194 sub operator_expected {
 4195 
 4196     # Many perl symbols have two or more meanings.  For example, '<<'
 4197     # can be a shift operator or a here-doc operator.  The
 4198     # interpretation of these symbols depends on the current state of
 4199     # the tokenizer, which may either be expecting a term or an
 4200     # operator.  For this example, a << would be a shift if an operator
 4201     # is expected, and a here-doc if a term is expected.  This routine
 4202     # is called to make this decision for any current token.  It returns
 4203     # one of three possible values:
 4204     #
 4205     #     OPERATOR - operator expected (or at least, not a term)
 4206     #     UNKNOWN  - can't tell
 4207     #     TERM     - a term is expected (or at least, not an operator)
 4208     #
 4209     # The decision is based on what has been seen so far.  This
 4210     # information is stored in the "$last_nonblank_type" and
 4211     # "$last_nonblank_token" variables.  For example, if the
 4212     # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
 4213     # if $last_nonblank_type is 'n' (numeric), we are expecting an
 4214     # OPERATOR.
 4215     #
 4216     # If a UNKNOWN is returned, the calling routine must guess. A major
 4217     # goal of this tokenizer is to minimize the possibility of returning
 4218     # UNKNOWN, because a wrong guess can spoil the formatting of a
 4219     # script.
 4220     #
 4221     # adding NEW_TOKENS: it is critically important that this routine be
 4222     # updated to allow it to determine if an operator or term is to be
 4223     # expected after the new token.  Doing this simply involves adding
 4224     # the new token character to one of the regexes in this routine or
 4225     # to one of the hash lists
 4226     # that it uses, which are initialized in the BEGIN section.
 4227     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
 4228     # $statement_type
 4229 
 4230     my ( $prev_type, $tok, $next_type ) = @_;
 4231 
 4232     my $op_expected = UNKNOWN;
 4233 
 4234 ##print "tok=$tok last type=$last_nonblank_type last tok=$last_nonblank_token\n";
 4235 
 4236 # Note: function prototype is available for token type 'U' for future
 4237 # program development.  It contains the leading and trailing parens,
 4238 # and no blanks.  It might be used to eliminate token type 'C', for
 4239 # example (prototype = '()'). Thus:
 4240 # if ($last_nonblank_type eq 'U') {
 4241 #     print "previous token=$last_nonblank_token  type=$last_nonblank_type prototype=$last_nonblank_prototype\n";
 4242 # }
 4243 
 4244     # A possible filehandle (or object) requires some care...
 4245     if ( $last_nonblank_type eq 'Z' ) {
 4246 
 4247         # angle.t
 4248         if ( $last_nonblank_token =~ /^[A-Za-z_]/ ) {
 4249             $op_expected = UNKNOWN;
 4250         }
 4251 
 4252         # For possible file handle like "$a", Perl uses weird parsing rules.
 4253         # For example:
 4254         # print $a/2,"/hi";   - division
 4255         # print $a / 2,"/hi"; - division
 4256         # print $a/ 2,"/hi";  - division
 4257         # print $a /2,"/hi";  - pattern (and error)!
 4258         elsif ( ( $prev_type eq 'b' ) && ( $next_type ne 'b' ) ) {
 4259             $op_expected = TERM;
 4260         }
 4261 
 4262         # Note when an operation is being done where a
 4263         # filehandle might be expected, since a change in whitespace
 4264         # could change the interpretation of the statement.
 4265         else {
 4266             if ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
 4267 
 4268                # Do not complain in 'use' statements, which have special syntax.
 4269                # For example, from RT#130344:
 4270                #   use lib $FindBin::Bin . '/lib';
 4271                 if ( $statement_type ne 'use' ) {
 4272                     complain("operator in print statement not recommended\n");
 4273                 }
 4274                 $op_expected = OPERATOR;
 4275             }
 4276         }
 4277     }
 4278 
 4279     # Check for smartmatch operator before preceding brace or square bracket.
 4280     # For example, at the ? after the ] in the following expressions we are
 4281     # expecting an operator:
 4282     #
 4283     # qr/3/ ~~ ['1234'] ? 1 : 0;
 4284     # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
 4285     elsif ( $last_nonblank_type eq '}' && $last_nonblank_token eq '~~' ) {
 4286         $op_expected = OPERATOR;
 4287     }
 4288 
 4289     # handle something after 'do' and 'eval'
 4290     elsif ( $is_block_operator{$last_nonblank_token} ) {
 4291 
 4292         # something like $a = eval "expression";
 4293         #                          ^
 4294         if ( $last_nonblank_type eq 'k' ) {
 4295             $op_expected = TERM;    # expression or list mode following keyword
 4296         }
 4297 
 4298         # something like $a = do { BLOCK } / 2;
 4299         # or this ? after a smartmatch anonynmous hash or array reference:
 4300         #   qr/3/ ~~ ['1234'] ? 1 : 0;
 4301         #                                  ^
 4302         else {
 4303             $op_expected = OPERATOR;    # block mode following }
 4304         }
 4305     }
 4306 
 4307     # handle bare word..
 4308     elsif ( $last_nonblank_type eq 'w' ) {
 4309 
 4310         # unfortunately, we can't tell what type of token to expect next
 4311         # after most bare words
 4312         $op_expected = UNKNOWN;
 4313     }
 4314 
 4315     # operator, but not term possible after these types
 4316     # Note: moved ')' from type to token because parens in list context
 4317     # get marked as '{' '}' now.  This is a minor glitch in the following:
 4318     #    my %opts = (ref $_[0] eq 'HASH') ? %{shift()} : ();
 4319     #
 4320     elsif (( $last_nonblank_type =~ /^[\]RnviQh]$/ )
 4321         || ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) )
 4322     {
 4323         $op_expected = OPERATOR;
 4324 
 4325         # in a 'use' statement, numbers and v-strings are not true
 4326         # numbers, so to avoid incorrect error messages, we will
 4327         # mark them as unknown for now (use.t)
 4328         # TODO: it would be much nicer to create a new token V for VERSION
 4329         # number in a use statement.  Then this could be a check on type V
 4330         # and related patches which change $statement_type for '=>'
 4331         # and ',' could be removed.  Further, it would clean things up to
 4332         # scan the 'use' statement with a separate subroutine.
 4333         if (   ( $statement_type eq 'use' )
 4334             && ( $last_nonblank_type =~ /^[nv]$/ ) )
 4335         {
 4336             $op_expected = UNKNOWN;
 4337         }
 4338 
 4339         # expecting VERSION or {} after package NAMESPACE
 4340         elsif ($statement_type =~ /^package\b/
 4341             && $last_nonblank_token =~ /^package\b/ )
 4342         {
 4343             $op_expected = TERM;
 4344         }
 4345     }
 4346 
 4347     # no operator after many keywords, such as "die", "warn", etc
 4348     elsif ( $expecting_term_token{$last_nonblank_token} ) {
 4349 
 4350         # // may follow perl functions which may be unary operators
 4351         # see test file dor.t (defined or);
 4352         if (   $tok eq '/'
 4353             && $next_type eq '/'
 4354             && $last_nonblank_type eq 'k'
 4355             && $is_keyword_taking_optional_args{$last_nonblank_token} )
 4356         {
 4357             $op_expected = OPERATOR;
 4358         }
 4359         else {
 4360             $op_expected = TERM;
 4361         }
 4362     }
 4363 
 4364     # no operator after things like + - **  (i.e., other operators)
 4365     elsif ( $expecting_term_types{$last_nonblank_type} ) {
 4366         $op_expected = TERM;
 4367     }
 4368 
 4369     # a few operators, like "time", have an empty prototype () and so
 4370     # take no parameters but produce a value to operate on
 4371     elsif ( $expecting_operator_token{$last_nonblank_token} ) {
 4372         $op_expected = OPERATOR;
 4373     }
 4374 
 4375     # post-increment and decrement produce values to be operated on
 4376     elsif ( $expecting_operator_types{$last_nonblank_type} ) {
 4377         $op_expected = OPERATOR;
 4378     }
 4379 
 4380     # no value to operate on after sub block
 4381     elsif ( $last_nonblank_token =~ /^sub\s/ ) { $op_expected = TERM; }
 4382 
 4383     # a right brace here indicates the end of a simple block.
 4384     # all non-structural right braces have type 'R'
 4385     # all braces associated with block operator keywords have been given those
 4386     # keywords as "last_nonblank_token" and caught above.
 4387     # (This statement is order dependent, and must come after checking
 4388     # $last_nonblank_token).
 4389     elsif ( $last_nonblank_type eq '}' ) {
 4390 
 4391         # patch for dor.t (defined or).
 4392         if (   $tok eq '/'
 4393             && $next_type eq '/'
 4394             && $last_nonblank_token eq ']' )
 4395         {
 4396             $op_expected = OPERATOR;
 4397         }
 4398 
 4399         # Patch for RT #116344: misparse a ternary operator after an anonymous
 4400         # hash, like this:
 4401         #   return ref {} ? 1 : 0;
 4402         # The right brace should really be marked type 'R' in this case, and
 4403         # it is safest to return an UNKNOWN here. Expecting a TERM will
 4404         # cause the '?' to always be interpreted as a pattern delimiter
 4405         # rather than introducing a ternary operator.
 4406         elsif ( $tok eq '?' ) {
 4407             $op_expected = UNKNOWN;
 4408         }
 4409         else {
 4410             $op_expected = TERM;
 4411         }
 4412     }
 4413 
 4414     # something else..what did I forget?
 4415     else {
 4416 
 4417         # collecting diagnostics on unknown operator types..see what was missed
 4418         $op_expected = UNKNOWN;
 4419         write_diagnostics(
 4420 "OP: unknown after type=$last_nonblank_type  token=$last_nonblank_token\n"
 4421         );
 4422     }
 4423 
 4424     TOKENIZER_DEBUG_FLAG_EXPECT && do {
 4425         print STDOUT
 4426 "EXPECT: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
 4427     };
 4428     return $op_expected;
 4429 }
 4430 
 4431 sub new_statement_ok {
 4432 
 4433     # return true if the current token can start a new statement
 4434     # USES GLOBAL VARIABLES: $last_nonblank_type
 4435 
 4436     return label_ok()    # a label would be ok here
 4437 
 4438       || $last_nonblank_type eq 'J';    # or we follow a label
 4439 
 4440 }
 4441 
 4442 sub label_ok {
 4443 
 4444     # Decide if a bare word followed by a colon here is a label
 4445     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
 4446     # $brace_depth, @brace_type
 4447 
 4448     # if it follows an opening or closing code block curly brace..
 4449     if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
 4450         && $last_nonblank_type eq $last_nonblank_token )
 4451     {
 4452 
 4453         # it is a label if and only if the curly encloses a code block
 4454         return $brace_type[$brace_depth];
 4455     }
 4456 
 4457     # otherwise, it is a label if and only if it follows a ';' (real or fake)
 4458     # or another label
 4459     else {
 4460         return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
 4461     }
 4462 }
 4463 
 4464 sub code_block_type {
 4465 
 4466     # Decide if this is a block of code, and its type.
 4467     # Must be called only when $type = $token = '{'
 4468     # The problem is to distinguish between the start of a block of code
 4469     # and the start of an anonymous hash reference
 4470     # Returns "" if not code block, otherwise returns 'last_nonblank_token'
 4471     # to indicate the type of code block.  (For example, 'last_nonblank_token'
 4472     # might be 'if' for an if block, 'else' for an else block, etc).
 4473     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
 4474     # $last_nonblank_block_type, $brace_depth, @brace_type
 4475 
 4476     # handle case of multiple '{'s
 4477 
 4478 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
 4479 
 4480     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
 4481     if (   $last_nonblank_token eq '{'
 4482         && $last_nonblank_type eq $last_nonblank_token )
 4483     {
 4484 
 4485         # opening brace where a statement may appear is probably
 4486         # a code block but might be and anonymous hash reference
 4487         if ( $brace_type[$brace_depth] ) {
 4488             return decide_if_code_block( $i, $rtokens, $rtoken_type,
 4489                 $max_token_index );
 4490         }
 4491 
 4492         # cannot start a code block within an anonymous hash
 4493         else {
 4494             return "";
 4495         }
 4496     }
 4497 
 4498     elsif ( $last_nonblank_token eq ';' ) {
 4499 
 4500         # an opening brace where a statement may appear is probably
 4501         # a code block but might be and anonymous hash reference
 4502         return decide_if_code_block( $i, $rtokens, $rtoken_type,
 4503             $max_token_index );
 4504     }
 4505 
 4506     # handle case of '}{'
 4507     elsif ($last_nonblank_token eq '}'
 4508         && $last_nonblank_type eq $last_nonblank_token )
 4509     {
 4510 
 4511         # a } { situation ...
 4512         # could be hash reference after code block..(blktype1.t)
 4513         if ($last_nonblank_block_type) {
 4514             return decide_if_code_block( $i, $rtokens, $rtoken_type,
 4515                 $max_token_index );
 4516         }
 4517 
 4518         # must be a block if it follows a closing hash reference
 4519         else {
 4520             return $last_nonblank_token;
 4521         }
 4522     }
 4523 
 4524     ################################################################
 4525     # NOTE: braces after type characters start code blocks, but for
 4526     # simplicity these are not identified as such.  See also
 4527     # sub is_non_structural_brace.
 4528     ################################################################
 4529 
 4530 ##    elsif ( $last_nonblank_type eq 't' ) {
 4531 ##       return $last_nonblank_token;
 4532 ##    }
 4533 
 4534     # brace after label:
 4535     elsif ( $last_nonblank_type eq 'J' ) {
 4536         return $last_nonblank_token;
 4537     }
 4538 
 4539 # otherwise, look at previous token.  This must be a code block if
 4540 # it follows any of these:
 4541 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
 4542     elsif ( $is_code_block_token{$last_nonblank_token} ) {
 4543 
 4544         # Bug Patch: Note that the opening brace after the 'if' in the following
 4545         # snippet is an anonymous hash ref and not a code block!
 4546         #   print 'hi' if { x => 1, }->{x};
 4547         # We can identify this situation because the last nonblank type
 4548         # will be a keyword (instead of a closing peren)
 4549         if (   $last_nonblank_token =~ /^(if|unless)$/
 4550             && $last_nonblank_type eq 'k' )
 4551         {
 4552             return "";
 4553         }
 4554         else {
 4555             return $last_nonblank_token;
 4556         }
 4557     }
 4558 
 4559     # or a sub or package BLOCK
 4560     elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
 4561         && $last_nonblank_token =~ /^(sub|package)\b/ )
 4562     {
 4563         return $last_nonblank_token;
 4564     }
 4565 
 4566     # or a sub alias
 4567     elsif (( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
 4568         && ( $is_sub{$last_nonblank_token} ) )
 4569     {
 4570         return 'sub';
 4571     }
 4572 
 4573     elsif ( $statement_type =~ /^(sub|package)\b/ ) {
 4574         return $statement_type;
 4575     }
 4576 
 4577     # user-defined subs with block parameters (like grep/map/eval)
 4578     elsif ( $last_nonblank_type eq 'G' ) {
 4579         return $last_nonblank_token;
 4580     }
 4581 
 4582     # check bareword
 4583     elsif ( $last_nonblank_type eq 'w' ) {
 4584         return decide_if_code_block( $i, $rtokens, $rtoken_type,
 4585             $max_token_index );
 4586     }
 4587 
 4588     # Patch for bug # RT #94338 reported by Daniel Trizen
 4589     # for-loop in a parenthesized block-map triggering an error message:
 4590     #    map( { foreach my $item ( '0', '1' ) { print $item} } qw(a b c) );
 4591     # Check for a code block within a parenthesized function call
 4592     elsif ( $last_nonblank_token eq '(' ) {
 4593         my $paren_type = $paren_type[$paren_depth];
 4594         if ( $paren_type && $paren_type =~ /^(map|grep|sort)$/ ) {
 4595 
 4596             # We will mark this as a code block but use type 't' instead
 4597             # of the name of the contining function.  This will allow for
 4598             # correct parsing but will usually produce better formatting.
 4599             # Braces with block type 't' are not broken open automatically
 4600             # in the formatter as are other code block types, and this usually
 4601             # works best.
 4602             return 't';    # (Not $paren_type)
 4603         }
 4604         else {
 4605             return "";
 4606         }
 4607     }
 4608 
 4609     # handle unknown syntax ') {'
 4610     # we previously appended a '()' to mark this case
 4611     elsif ( $last_nonblank_token =~ /\(\)$/ ) {
 4612         return $last_nonblank_token;
 4613     }
 4614 
 4615     # anything else must be anonymous hash reference
 4616     else {
 4617         return "";
 4618     }
 4619 }
 4620 
 4621 sub decide_if_code_block {
 4622 
 4623     # USES GLOBAL VARIABLES: $last_nonblank_token
 4624     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
 4625 
 4626     my ( $next_nonblank_token, $i_next ) =
 4627       find_next_nonblank_token( $i, $rtokens, $max_token_index );
 4628 
 4629     # we are at a '{' where a statement may appear.
 4630     # We must decide if this brace starts an anonymous hash or a code
 4631     # block.
 4632     # return "" if anonymous hash, and $last_nonblank_token otherwise
 4633 
 4634     # initialize to be code BLOCK
 4635     my $code_block_type = $last_nonblank_token;
 4636 
 4637     # Check for the common case of an empty anonymous hash reference:
 4638     # Maybe something like sub { { } }
 4639     if ( $next_nonblank_token eq '}' ) {
 4640         $code_block_type = "";
 4641     }
 4642 
 4643     else {
 4644 
 4645         # To guess if this '{' is an anonymous hash reference, look ahead
 4646         # and test as follows:
 4647         #
 4648         # it is a hash reference if next come:
 4649         #   - a string or digit followed by a comma or =>
 4650         #   - bareword followed by =>
 4651         # otherwise it is a code block
 4652         #
 4653         # Examples of anonymous hash ref:
 4654         # {'aa',};
 4655         # {1,2}
 4656         #
 4657         # Examples of code blocks:
 4658         # {1; print "hello\n", 1;}
 4659         # {$a,1};
 4660 
 4661         # We are only going to look ahead one more (nonblank/comment) line.
 4662         # Strange formatting could cause a bad guess, but that's unlikely.
 4663         my @pre_types;
 4664         my @pre_tokens;
 4665 
 4666         # Ignore the rest of this line if it is a side comment
 4667         if ( $next_nonblank_token ne '#' ) {
 4668             @pre_types  = @{$rtoken_type}[ $i + 1 .. $max_token_index ];
 4669             @pre_tokens = @{$rtokens}[ $i + 1 .. $max_token_index ];
 4670         }
 4671         my ( $rpre_tokens, $rpre_types ) =
 4672           peek_ahead_for_n_nonblank_pre_tokens(20);    # 20 is arbitrary but
 4673                                                        # generous, and prevents
 4674                                                        # wasting lots of
 4675                                                        # time in mangled files
 4676         if ( defined($rpre_types) && @{$rpre_types} ) {
 4677             push @pre_types,  @{$rpre_types};
 4678             push @pre_tokens, @{$rpre_tokens};
 4679         }
 4680 
 4681         # put a sentinel token to simplify stopping the search
 4682         push @pre_types, '}';
 4683         push @pre_types, '}';
 4684 
 4685         my $jbeg = 0;
 4686         $jbeg = 1 if $pre_types[0] eq 'b';
 4687 
 4688         # first look for one of these
 4689         #  - bareword
 4690         #  - bareword with leading -
 4691         #  - digit
 4692         #  - quoted string
 4693         my $j = $jbeg;
 4694         if ( $pre_types[$j] =~ /^[\'\"]/ ) {
 4695 
 4696             # find the closing quote; don't worry about escapes
 4697             my $quote_mark = $pre_types[$j];
 4698             foreach my $k ( $j + 1 .. @pre_types - 2 ) {
 4699                 if ( $pre_types[$k] eq $quote_mark ) {
 4700                     $j = $k + 1;
 4701                     my $next = $pre_types[$j];
 4702                     last;
 4703                 }
 4704             }
 4705         }
 4706         elsif ( $pre_types[$j] eq 'd' ) {
 4707             $j++;
 4708         }
 4709         elsif ( $pre_types[$j] eq 'w' ) {
 4710             $j++;
 4711         }
 4712         elsif ( $pre_types[$j] eq '-' && $pre_types[ ++$j ] eq 'w' ) {
 4713             $j++;
 4714         }
 4715         if ( $j > $jbeg ) {
 4716 
 4717             $j++ if $pre_types[$j] eq 'b';
 4718 
 4719             # Patched for RT #95708
 4720             if (
 4721 
 4722                 # it is a comma which is not a pattern delimeter except for qw
 4723                 (
 4724                        $pre_types[$j] eq ','
 4725                     && $pre_tokens[$jbeg] !~ /^(s|m|y|tr|qr|q|qq|qx)$/
 4726                 )
 4727 
 4728                 # or a =>
 4729                 || ( $pre_types[$j] eq '=' && $pre_types[ ++$j ] eq '>' )
 4730               )
 4731             {
 4732                 $code_block_type = "";
 4733             }
 4734         }
 4735     }
 4736 
 4737     return $code_block_type;
 4738 }
 4739 
 4740 sub report_unexpected {
 4741 
 4742     # report unexpected token type and show where it is
 4743     # USES GLOBAL VARIABLES: $tokenizer_self
 4744     my ( $found, $expecting, $i_tok, $last_nonblank_i, $rpretoken_map,
 4745         $rpretoken_type, $input_line )
 4746       = @_;
 4747 
 4748     if ( ++$tokenizer_self->{_unexpected_error_count} <= MAX_NAG_MESSAGES ) {
 4749         my $msg = "found $found where $expecting expected";
 4750         my $pos = $rpretoken_map->[$i_tok];
 4751         interrupt_logfile();
 4752         my $input_line_number = $tokenizer_self->{_last_line_number};
 4753         my ( $offset, $numbered_line, $underline ) =
 4754           make_numbered_line( $input_line_number, $input_line, $pos );
 4755         $underline = write_on_underline( $underline, $pos - $offset, '^' );
 4756 
 4757         my $trailer = "";
 4758         if ( ( $i_tok > 0 ) && ( $last_nonblank_i >= 0 ) ) {
 4759             my $pos_prev = $rpretoken_map->[$last_nonblank_i];
 4760             my $num;
 4761             if ( $rpretoken_type->[ $i_tok - 1 ] eq 'b' ) {
 4762                 $num = $rpretoken_map->[ $i_tok - 1 ] - $pos_prev;
 4763             }
 4764             else {
 4765                 $num = $pos - $pos_prev;
 4766             }
 4767             if ( $num > 40 ) { $num = 40; $pos_prev = $pos - 40; }
 4768 
 4769             $underline =
 4770               write_on_underline( $underline, $pos_prev - $offset, '-' x $num );
 4771             $trailer = " (previous token underlined)";
 4772         }
 4773         $underline =~ s/\s+$//;
 4774         warning( $numbered_line . "\n" );
 4775         warning( $underline . "\n" );
 4776         warning( $msg . $trailer . "\n" );
 4777         resume_logfile();
 4778     }
 4779     return;
 4780 }
 4781 
 4782 sub is_non_structural_brace {
 4783 
 4784     # Decide if a brace or bracket is structural or non-structural
 4785     # by looking at the previous token and type
 4786     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token
 4787 
 4788     # EXPERIMENTAL: Mark slices as structural; idea was to improve formatting.
 4789     # Tentatively deactivated because it caused the wrong operator expectation
 4790     # for this code:
 4791     #      $user = @vars[1] / 100;
 4792     # Must update sub operator_expected before re-implementing.
 4793     # if ( $last_nonblank_type eq 'i' && $last_nonblank_token =~ /^@/ ) {
 4794     #    return 0;
 4795     # }
 4796 
 4797     ################################################################
 4798     # NOTE: braces after type characters start code blocks, but for
 4799     # simplicity these are not identified as such.  See also
 4800     # sub code_block_type
 4801     ################################################################
 4802 
 4803     ##if ($last_nonblank_type eq 't') {return 0}
 4804 
 4805     # otherwise, it is non-structural if it is decorated
 4806     # by type information.
 4807     # For example, the '{' here is non-structural:   ${xxx}
 4808     return (
 4809         $last_nonblank_token =~ /^([\$\@\*\&\%\)]|->|::)/
 4810 
 4811           # or if we follow a hash or array closing curly brace or bracket
 4812           # For example, the second '{' in this is non-structural: $a{'x'}{'y'}
 4813           # because the first '}' would have been given type 'R'
 4814           || $last_nonblank_type =~ /^([R\]])$/
 4815     );
 4816 }
 4817 
 4818 #########i#############################################################
 4819 # Tokenizer routines for tracking container nesting depths
 4820 #######################################################################
 4821 
 4822 # The following routines keep track of nesting depths of the nesting
 4823 # types, ( [ { and ?.  This is necessary for determining the indentation
 4824 # level, and also for debugging programs.  Not only do they keep track of
 4825 # nesting depths of the individual brace types, but they check that each
 4826 # of the other brace types is balanced within matching pairs.  For
 4827 # example, if the program sees this sequence:
 4828 #
 4829 #         {  ( ( ) }
 4830 #
 4831 # then it can determine that there is an extra left paren somewhere
 4832 # between the { and the }.  And so on with every other possible
 4833 # combination of outer and inner brace types.  For another
 4834 # example:
 4835 #
 4836 #         ( [ ..... ]  ] )
 4837 #
 4838 # which has an extra ] within the parens.
 4839 #
 4840 # The brace types have indexes 0 .. 3 which are indexes into
 4841 # the matrices.
 4842 #
 4843 # The pair ? : are treated as just another nesting type, with ? acting
 4844 # as the opening brace and : acting as the closing brace.
 4845 #
 4846 # The matrix
 4847 #
 4848 #         $depth_array[$a][$b][ $current_depth[$a] ] = $current_depth[$b];
 4849 #
 4850 # saves the nesting depth of brace type $b (where $b is either of the other
 4851 # nesting types) when brace type $a enters a new depth.  When this depth
 4852 # decreases, a check is made that the current depth of brace types $b is
 4853 # unchanged, or otherwise there must have been an error.  This can
 4854 # be very useful for localizing errors, particularly when perl runs to
 4855 # the end of a large file (such as this one) and announces that there
 4856 # is a problem somewhere.
 4857 #
 4858 # A numerical sequence number is maintained for every nesting type,
 4859 # so that each matching pair can be uniquely identified in a simple
 4860 # way.
 4861 
 4862 sub increase_nesting_depth {
 4863     my ( $aa, $pos ) = @_;
 4864 
 4865     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
 4866     # @current_sequence_number, @depth_array, @starting_line_of_current_depth,
 4867     # $statement_type
 4868     $current_depth[$aa]++;
 4869     $total_depth++;
 4870     $total_depth[$aa][ $current_depth[$aa] ] = $total_depth;
 4871     my $input_line_number = $tokenizer_self->{_last_line_number};
 4872     my $input_line        = $tokenizer_self->{_line_text};
 4873 
 4874     # Sequence numbers increment by number of items.  This keeps
 4875     # a unique set of numbers but still allows the relative location
 4876     # of any type to be determined.
 4877     $nesting_sequence_number[$aa] += scalar(@closing_brace_names);
 4878     my $seqno = $nesting_sequence_number[$aa];
 4879     $current_sequence_number[$aa][ $current_depth[$aa] ] = $seqno;
 4880 
 4881     $starting_line_of_current_depth[$aa][ $current_depth[$aa] ] =
 4882       [ $input_line_number, $input_line, $pos ];
 4883 
 4884     for my $bb ( 0 .. @closing_brace_names - 1 ) {
 4885         next if ( $bb == $aa );
 4886         $depth_array[$aa][$bb][ $current_depth[$aa] ] = $current_depth[$bb];
 4887     }
 4888 
 4889     # set a flag for indenting a nested ternary statement
 4890     my $indent = 0;
 4891     if ( $aa == QUESTION_COLON ) {
 4892         $nested_ternary_flag[ $current_depth[$aa] ] = 0;
 4893         if ( $current_depth[$aa] > 1 ) {
 4894             if ( $nested_ternary_flag[ $current_depth[$aa] - 1 ] == 0 ) {
 4895                 my $pdepth = $total_depth[$aa][ $current_depth[$aa] - 1 ];
 4896                 if ( $pdepth == $total_depth - 1 ) {
 4897                     $indent = 1;
 4898                     $nested_ternary_flag[ $current_depth[$aa] - 1 ] = -1;
 4899                 }
 4900             }
 4901         }
 4902     }
 4903     $nested_statement_type[$aa][ $current_depth[$aa] ] = $statement_type;
 4904     $statement_type = "";
 4905     return ( $seqno, $indent );
 4906 }
 4907 
 4908 sub decrease_nesting_depth {
 4909 
 4910     my ( $aa, $pos ) = @_;
 4911 
 4912     # USES GLOBAL VARIABLES: $tokenizer_self, @current_depth,
 4913     # @current_sequence_number, @depth_array, @starting_line_of_current_depth
 4914     # $statement_type
 4915     my $seqno             = 0;
 4916     my $input_line_number = $tokenizer_self->{_last_line_number};
 4917     my $input_line        = $tokenizer_self->{_line_text};
 4918 
 4919     my $outdent = 0;
 4920     $total_depth--;
 4921     if ( $current_depth[$aa] > 0 ) {
 4922 
 4923         # set a flag for un-indenting after seeing a nested ternary statement
 4924         $seqno = $current_sequence_number[$aa][ $current_depth[$aa] ];
 4925         if ( $aa == QUESTION_COLON ) {
 4926             $outdent = $nested_ternary_flag[ $current_depth[$aa] ];
 4927         }
 4928         $statement_type = $nested_statement_type[$aa][ $current_depth[$aa] ];
 4929 
 4930         # check that any brace types $bb contained within are balanced
 4931         for my $bb ( 0 .. @closing_brace_names - 1 ) {
 4932             next if ( $bb == $aa );
 4933 
 4934             unless ( $depth_array[$aa][$bb][ $current_depth[$aa] ] ==
 4935                 $current_depth[$bb] )
 4936             {
 4937                 my $diff =
 4938                   $current_depth[$bb] -
 4939                   $depth_array[$aa][$bb][ $current_depth[$aa] ];
 4940 
 4941                 # don't whine too many times
 4942                 my $saw_brace_error = get_saw_brace_error();
 4943                 if (
 4944                     $saw_brace_error <= MAX_NAG_MESSAGES
 4945 
 4946                     # if too many closing types have occurred, we probably
 4947                     # already caught this error
 4948                     && ( ( $diff > 0 ) || ( $saw_brace_error <= 0 ) )
 4949                   )
 4950                 {
 4951                     interrupt_logfile();
 4952                     my $rsl =
 4953                       $starting_line_of_current_depth[$aa]
 4954                       [ $current_depth[$aa] ];
 4955                     my $sl  = $rsl->[0];
 4956                     my $rel = [ $input_line_number, $input_line, $pos ];
 4957                     my $el  = $rel->[0];
 4958                     my ($ess);
 4959 
 4960                     if ( $diff == 1 || $diff == -1 ) {
 4961                         $ess = '';
 4962                     }
 4963                     else {
 4964                         $ess = 's';
 4965                     }
 4966                     my $bname =
 4967                       ( $diff > 0 )
 4968                       ? $opening_brace_names[$bb]
 4969                       : $closing_brace_names[$bb];
 4970                     write_error_indicator_pair( @{$rsl}, '^' );
 4971                     my $msg = <<"EOM";
 4972 Found $diff extra $bname$ess between $opening_brace_names[$aa] on line $sl and $closing_brace_names[$aa] on line $el
 4973 EOM
 4974 
 4975                     if ( $diff > 0 ) {
 4976                         my $rml =
 4977                           $starting_line_of_current_depth[$bb]
 4978                           [ $current_depth[$bb] ];
 4979                         my $ml = $rml->[0];
 4980                         $msg .=
 4981 "    The most recent un-matched $bname is on line $ml\n";
 4982                         write_error_indicator_pair( @{$rml}, '^' );
 4983                     }
 4984                     write_error_indicator_pair( @{$rel}, '^' );
 4985                     warning($msg);
 4986                     resume_logfile();
 4987                 }
 4988                 increment_brace_error();
 4989             }
 4990         }
 4991         $current_depth[$aa]--;
 4992     }
 4993     else {
 4994 
 4995         my $saw_brace_error = get_saw_brace_error();
 4996         if ( $saw_brace_error <= MAX_NAG_MESSAGES ) {
 4997             my $msg = <<"EOM";
 4998 There is no previous $opening_brace_names[$aa] to match a $closing_brace_names[$aa] on line $input_line_number
 4999 EOM
 5000             indicate_error( $msg, $input_line_number, $input_line, $pos, '^' );
 5001         }
 5002         increment_brace_error();
 5003     }
 5004     return ( $seqno, $outdent );
 5005 }
 5006 
 5007 sub check_final_nesting_depths {
 5008 
 5009     # USES GLOBAL VARIABLES: @current_depth, @starting_line_of_current_depth
 5010 
 5011     for my $aa ( 0 .. @closing_brace_names - 1 ) {
 5012 
 5013         if ( $current_depth[$aa] ) {
 5014             my $rsl =
 5015               $starting_line_of_current_depth[$aa][ $current_depth[$aa] ];
 5016             my $sl  = $rsl->[0];
 5017             my $msg = <<"EOM";
 5018 Final nesting depth of $opening_brace_names[$aa]s is $current_depth[$aa]
 5019 The most recent un-matched $opening_brace_names[$aa] is on line $sl
 5020 EOM
 5021             indicate_error( $msg, @{$rsl}, '^' );
 5022             increment_brace_error();
 5023         }
 5024     }
 5025     return;
 5026 }
 5027 
 5028 #########i#############################################################
 5029 # Tokenizer routines for looking ahead in input stream
 5030 #######################################################################
 5031 
 5032 sub peek_ahead_for_n_nonblank_pre_tokens {
 5033 
 5034     # returns next n pretokens if they exist
 5035     # returns undef's if hits eof without seeing any pretokens
 5036     # USES GLOBAL VARIABLES: $tokenizer_self
 5037     my $max_pretokens = shift;
 5038     my $line;
 5039     my $i = 0;
 5040     my ( $rpre_tokens, $rmap, $rpre_types );
 5041 
 5042     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
 5043     {
 5044         $line =~ s/^\s*//;    # trim leading blanks
 5045         next if ( length($line) <= 0 );    # skip blank
 5046         next if ( $line =~ /^#/ );         # skip comment
 5047         ( $rpre_tokens, $rmap, $rpre_types ) =
 5048           pre_tokenize( $line, $max_pretokens );
 5049         last;
 5050     }
 5051     return ( $rpre_tokens, $rpre_types );
 5052 }
 5053 
 5054 # look ahead for next non-blank, non-comment line of code
 5055 sub peek_ahead_for_nonblank_token {
 5056 
 5057     # USES GLOBAL VARIABLES: $tokenizer_self
 5058     my ( $rtokens, $max_token_index ) = @_;
 5059     my $line;
 5060     my $i = 0;
 5061 
 5062     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $i++ ) )
 5063     {
 5064         $line =~ s/^\s*//;    # trim leading blanks
 5065         next if ( length($line) <= 0 );    # skip blank
 5066         next if ( $line =~ /^#/ );         # skip comment
 5067         my ( $rtok, $rmap, $rtype ) =
 5068           pre_tokenize( $line, 2 );        # only need 2 pre-tokens
 5069         my $j = $max_token_index + 1;
 5070 
 5071         foreach my $tok ( @{$rtok} ) {
 5072             last if ( $tok =~ "\n" );
 5073             $rtokens->[ ++$j ] = $tok;
 5074         }
 5075         last;
 5076     }
 5077     return $rtokens;
 5078 }
 5079 
 5080 #########i#############################################################
 5081 # Tokenizer guessing routines for ambiguous situations
 5082 #######################################################################
 5083 
 5084 sub guess_if_pattern_or_conditional {
 5085 
 5086     # this routine is called when we have encountered a ? following an
 5087     # unknown bareword, and we must decide if it starts a pattern or not
 5088     # input parameters:
 5089     #   $i - token index of the ? starting possible pattern
 5090     # output parameters:
 5091     #   $is_pattern = 0 if probably not pattern,  =1 if probably a pattern
 5092     #   msg = a warning or diagnostic message
 5093     # USES GLOBAL VARIABLES: $last_nonblank_token
 5094 
 5095     # FIXME: this needs to be rewritten
 5096 
 5097     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
 5098     my $is_pattern = 0;
 5099     my $msg        = "guessing that ? after $last_nonblank_token starts a ";
 5100 
 5101     if ( $i >= $max_token_index ) {
 5102         $msg .= "conditional (no end to pattern found on the line)\n";
 5103     }
 5104     else {
 5105         my $ibeg = $i;
 5106         $i = $ibeg + 1;
 5107         my $next_token = $rtokens->[$i];    # first token after ?
 5108 
 5109         # look for a possible ending ? on this line..
 5110         my $in_quote        = 1;
 5111         my $quote_depth     = 0;
 5112         my $quote_character = '';
 5113         my $quote_pos       = 0;
 5114         my $quoted_string;
 5115         (
 5116             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
 5117             $quoted_string
 5118           )
 5119           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
 5120             $quote_pos, $quote_depth, $max_token_index );
 5121 
 5122         if ($in_quote) {
 5123 
 5124             # we didn't find an ending ? on this line,
 5125             # so we bias towards conditional
 5126             $is_pattern = 0;
 5127             $msg .= "conditional (no ending ? on this line)\n";
 5128 
 5129             # we found an ending ?, so we bias towards a pattern
 5130         }
 5131         else {
 5132 
 5133             # Watch out for an ending ? in quotes, like this
 5134             #    my $case_flag = File::Spec->case_tolerant ? '(?i)' : '';
 5135             my $s_quote = 0;
 5136             my $d_quote = 0;
 5137             my $colons  = 0;
 5138             foreach my $ii ( $ibeg + 1 .. $i - 1 ) {
 5139                 my $tok = $rtokens->[$ii];
 5140                 if ( $tok eq ":" ) { $colons++ }
 5141                 if ( $tok eq "'" ) { $s_quote++ }
 5142                 if ( $tok eq '"' ) { $d_quote++ }
 5143             }
 5144             if ( $s_quote % 2 || $d_quote % 2 || $colons ) {
 5145                 $is_pattern = 0;
 5146                 $msg .= "found ending ? but unbalanced quote chars\n";
 5147             }
 5148             elsif ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
 5149                 $is_pattern = 1;
 5150                 $msg .= "pattern (found ending ? and pattern expected)\n";
 5151             }
 5152             else {
 5153                 $msg .= "pattern (uncertain, but found ending ?)\n";
 5154             }
 5155         }
 5156     }
 5157     return ( $is_pattern, $msg );
 5158 }
 5159 
 5160 sub guess_if_pattern_or_division {
 5161 
 5162     # this routine is called when we have encountered a / following an
 5163     # unknown bareword, and we must decide if it starts a pattern or is a
 5164     # division
 5165     # input parameters:
 5166     #   $i - token index of the / starting possible pattern
 5167     # output parameters:
 5168     #   $is_pattern = 0 if probably division,  =1 if probably a pattern
 5169     #   msg = a warning or diagnostic message
 5170     # USES GLOBAL VARIABLES: $last_nonblank_token
 5171     my ( $i, $rtokens, $rtoken_map, $max_token_index ) = @_;
 5172     my $is_pattern = 0;
 5173     my $msg        = "guessing that / after $last_nonblank_token starts a ";
 5174 
 5175     if ( $i >= $max_token_index ) {
 5176         $msg .= "division (no end to pattern found on the line)\n";
 5177     }
 5178     else {
 5179         my $ibeg = $i;
 5180         my $divide_expected =
 5181           numerator_expected( $i, $rtokens, $max_token_index );
 5182         $i = $ibeg + 1;
 5183         my $next_token = $rtokens->[$i];    # first token after slash
 5184 
 5185         # look for a possible ending / on this line..
 5186         my $in_quote        = 1;
 5187         my $quote_depth     = 0;
 5188         my $quote_character = '';
 5189         my $quote_pos       = 0;
 5190         my $quoted_string;
 5191         (
 5192             $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
 5193             $quoted_string
 5194           )
 5195           = follow_quoted_string( $ibeg, $in_quote, $rtokens, $quote_character,
 5196             $quote_pos, $quote_depth, $max_token_index );
 5197 
 5198         if ($in_quote) {
 5199 
 5200             # we didn't find an ending / on this line,
 5201             # so we bias towards division
 5202             if ( $divide_expected >= 0 ) {
 5203                 $is_pattern = 0;
 5204                 $msg .= "division (no ending / on this line)\n";
 5205             }
 5206             else {
 5207                 $msg        = "multi-line pattern (division not possible)\n";
 5208                 $is_pattern = 1;
 5209             }
 5210 
 5211         }
 5212 
 5213         # we found an ending /, so we bias towards a pattern
 5214         else {
 5215 
 5216             if ( pattern_expected( $i, $rtokens, $max_token_index ) >= 0 ) {
 5217 
 5218                 if ( $divide_expected >= 0 ) {
 5219 
 5220                     if ( $i - $ibeg > 60 ) {
 5221                         $msg .= "division (matching / too distant)\n";
 5222                         $is_pattern = 0;
 5223                     }
 5224                     else {
 5225                         $msg .= "pattern (but division possible too)\n";
 5226                         $is_pattern = 1;
 5227                     }
 5228                 }
 5229                 else {
 5230                     $is_pattern = 1;
 5231                     $msg .= "pattern (division not possible)\n";
 5232                 }
 5233             }
 5234             else {
 5235 
 5236                 if ( $divide_expected >= 0 ) {
 5237                     $is_pattern = 0;
 5238                     $msg .= "division (pattern not possible)\n";
 5239                 }
 5240                 else {
 5241                     $is_pattern = 1;
 5242                     $msg .=
 5243                       "pattern (uncertain, but division would not work here)\n";
 5244                 }
 5245             }
 5246         }
 5247     }
 5248     return ( $is_pattern, $msg );
 5249 }
 5250 
 5251 # try to resolve here-doc vs. shift by looking ahead for
 5252 # non-code or the end token (currently only looks for end token)
 5253 # returns 1 if it is probably a here doc, 0 if not
 5254 sub guess_if_here_doc {
 5255 
 5256     # This is how many lines we will search for a target as part of the
 5257     # guessing strategy.  It is a constant because there is probably
 5258     # little reason to change it.
 5259     # USES GLOBAL VARIABLES: $tokenizer_self, $current_package
 5260     # %is_constant,
 5261     my $HERE_DOC_WINDOW = 40;
 5262 
 5263     my $next_token        = shift;
 5264     my $here_doc_expected = 0;
 5265     my $line;
 5266     my $k   = 0;
 5267     my $msg = "checking <<";
 5268 
 5269     while ( $line = $tokenizer_self->{_line_buffer_object}->peek_ahead( $k++ ) )
 5270     {
 5271         chomp $line;
 5272 
 5273         if ( $line =~ /^$next_token$/ ) {
 5274             $msg .= " -- found target $next_token ahead $k lines\n";
 5275             $here_doc_expected = 1;    # got it
 5276             last;
 5277         }
 5278         last if ( $k >= $HERE_DOC_WINDOW );
 5279     }
 5280 
 5281     unless ($here_doc_expected) {
 5282 
 5283         if ( !defined($line) ) {
 5284             $here_doc_expected = -1;    # hit eof without seeing target
 5285             $msg .= " -- must be shift; target $next_token not in file\n";
 5286 
 5287         }
 5288         else {                          # still unsure..taking a wild guess
 5289 
 5290             if ( !$is_constant{$current_package}{$next_token} ) {
 5291                 $here_doc_expected = 1;
 5292                 $msg .=
 5293                   " -- guessing it's a here-doc ($next_token not a constant)\n";
 5294             }
 5295             else {
 5296                 $msg .=
 5297                   " -- guessing it's a shift ($next_token is a constant)\n";
 5298             }
 5299         }
 5300     }
 5301     write_logfile_entry($msg);
 5302     return $here_doc_expected;
 5303 }
 5304 
 5305 #########i#############################################################
 5306 # Tokenizer Routines for scanning identifiers and related items
 5307 #######################################################################
 5308 
 5309 sub scan_bare_identifier_do {
 5310 
 5311     # this routine is called to scan a token starting with an alphanumeric
 5312     # variable or package separator, :: or '.
 5313     # USES GLOBAL VARIABLES: $current_package, $last_nonblank_token,
 5314     # $last_nonblank_type,@paren_type, $paren_depth
 5315 
 5316     my ( $input_line, $i, $tok, $type, $prototype, $rtoken_map,
 5317         $max_token_index )
 5318       = @_;
 5319     my $i_begin = $i;
 5320     my $package = undef;
 5321 
 5322     my $i_beg = $i;
 5323 
 5324     # we have to back up one pretoken at a :: since each : is one pretoken
 5325     if ( $tok eq '::' ) { $i_beg-- }
 5326     if ( $tok eq '->' ) { $i_beg-- }
 5327     my $pos_beg = $rtoken_map->[$i_beg];
 5328     pos($input_line) = $pos_beg;
 5329 
 5330     #  Examples:
 5331     #   A::B::C
 5332     #   A::
 5333     #   ::A
 5334     #   A'B
 5335     if ( $input_line =~ m/\G\s*((?:\w*(?:'|::)))*(?:(?:->)?(\w+))?/gc ) {
 5336 
 5337         my $pos  = pos($input_line);
 5338         my $numc = $pos - $pos_beg;
 5339         $tok = substr( $input_line, $pos_beg, $numc );
 5340 
 5341         # type 'w' includes anything without leading type info
 5342         # ($,%,@,*) including something like abc::def::ghi
 5343         $type = 'w';
 5344 
 5345         my $sub_name = "";
 5346         if ( defined($2) ) { $sub_name = $2; }
 5347         if ( defined($1) ) {
 5348             $package = $1;
 5349 
 5350             # patch: don't allow isolated package name which just ends
 5351             # in the old style package separator (single quote).  Example:
 5352             #   use CGI':all';
 5353             if ( !($sub_name) && substr( $package, -1, 1 ) eq '\'' ) {
 5354                 $pos--;
 5355             }
 5356 
 5357             $package =~ s/\'/::/g;
 5358             if ( $package =~ /^\:/ ) { $package = 'main' . $package }
 5359             $package =~ s/::$//;
 5360         }
 5361         else {
 5362             $package = $current_package;
 5363 
 5364             if ( $is_keyword{$tok} ) {
 5365                 $type = 'k';
 5366             }
 5367         }
 5368 
 5369         # if it is a bareword..
 5370         if ( $type eq 'w' ) {
 5371 
 5372             # check for v-string with leading 'v' type character
 5373             # (This seems to have precedence over filehandle, type 'Y')
 5374             if ( $tok =~ /^v\d[_\d]*$/ ) {
 5375 
 5376                 # we only have the first part - something like 'v101' -
 5377                 # look for more
 5378                 if ( $input_line =~ m/\G(\.\d[_\d]*)+/gc ) {
 5379                     $pos  = pos($input_line);
 5380                     $numc = $pos - $pos_beg;
 5381                     $tok  = substr( $input_line, $pos_beg, $numc );
 5382                 }
 5383                 $type = 'v';
 5384 
 5385                 # warn if this version can't handle v-strings
 5386                 report_v_string($tok);
 5387             }
 5388 
 5389             elsif ( $is_constant{$package}{$sub_name} ) {
 5390                 $type = 'C';
 5391             }
 5392 
 5393             # bareword after sort has implied empty prototype; for example:
 5394             # @sorted = sort numerically ( 53, 29, 11, 32, 7 );
 5395             # This has priority over whatever the user has specified.
 5396             elsif ($last_nonblank_token eq 'sort'
 5397                 && $last_nonblank_type eq 'k' )
 5398             {
 5399                 $type = 'Z';
 5400             }
 5401 
 5402             # Note: strangely, perl does not seem to really let you create
 5403             # functions which act like eval and do, in the sense that eval
 5404             # and do may have operators following the final }, but any operators
 5405             # that you create with prototype (&) apparently do not allow
 5406             # trailing operators, only terms.  This seems strange.
 5407             # If this ever changes, here is the update
 5408             # to make perltidy behave accordingly:
 5409 
 5410             # elsif ( $is_block_function{$package}{$tok} ) {
 5411             #    $tok='eval'; # patch to do braces like eval  - doesn't work
 5412             #    $type = 'k';
 5413             #}
 5414             # FIXME: This could become a separate type to allow for different
 5415             # future behavior:
 5416             elsif ( $is_block_function{$package}{$sub_name} ) {
 5417                 $type = 'G';
 5418             }
 5419 
 5420             elsif ( $is_block_list_function{$package}{$sub_name} ) {
 5421                 $type = 'G';
 5422             }
 5423             elsif ( $is_user_function{$package}{$sub_name} ) {
 5424                 $type      = 'U';
 5425                 $prototype = $user_function_prototype{$package}{$sub_name};
 5426             }
 5427 
 5428             # check for indirect object
 5429             elsif (
 5430 
 5431                 # added 2001-03-27: must not be followed immediately by '('
 5432                 # see fhandle.t
 5433                 (