"Fossies" - the Fresh Open Source Software Archive

Member "Perl-Tidy-20210717/lib/Perl/Tidy/Tokenizer.pm" (14 Jul 2021, 341319 Bytes) of package /linux/misc/Perl-Tidy-20210717.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 #####################################################################
    2 #
    3 # The Perl::Tidy::Tokenizer package is essentially a filter which
    4 # reads lines of perl source code from a source object and provides
    5 # corresponding tokenized lines through its get_line() method.  Lines
    6 # flow from the source_object to the caller like this:
    7 #
    8 # source_object --> LineBuffer_object --> Tokenizer -->  calling routine
    9 #   get_line()         get_line()           get_line()     line_of_tokens
   10 #
   11 # The source object can be any object with a get_line() method which
   12 # supplies one line (a character string) perl call.
   13 # The LineBuffer object is created by the Tokenizer.
   14 # The Tokenizer returns a reference to a data structure 'line_of_tokens'
   15 # containing one tokenized line for each call to its get_line() method.
   16 #
   17 # WARNING: This is not a real class.  Only one tokenizer my be used.
   18 #
   19 ########################################################################
   20 
   21 package Perl::Tidy::Tokenizer;
   22 use strict;
   23 use warnings;
   24 our $VERSION = '20210717';
   25 
   26 use Perl::Tidy::LineBuffer;
   27 use Carp;
   28 
   29 # PACKAGE VARIABLES for processing an entire FILE.
   30 # These must be package variables because most may get localized during
   31 # processing.  Most are initialized in sub prepare_for_a_new_file.
   32 use vars qw{
   33   $tokenizer_self
   34 
   35   $last_nonblank_token
   36   $last_nonblank_type
   37   $last_nonblank_block_type
   38   $statement_type
   39   $in_attribute_list
   40   $current_package
   41   $context
   42 
   43   %is_constant
   44   %is_user_function
   45   %user_function_prototype
   46   %is_block_function
   47   %is_block_list_function
   48   %saw_function_definition
   49   %saw_use_module
   50 
   51   $brace_depth
   52   $paren_depth
   53   $square_bracket_depth
   54 
   55   @current_depth
   56   @total_depth
   57   $total_depth
   58   @nesting_sequence_number
   59   @current_sequence_number
   60   @paren_type
   61   @paren_semicolon_count
   62   @paren_structural_type
   63   @brace_type
   64   @brace_structural_type
   65   @brace_context
   66   @brace_package
   67   @square_bracket_type
   68   @square_bracket_structural_type
   69   @depth_array
   70   @nested_ternary_flag
   71   @nested_statement_type
   72   @starting_line_of_current_depth
   73 };
   74 
   75 # GLOBAL CONSTANTS for routines in this package,
   76 # Initialized in a BEGIN block.
   77 use vars qw{
   78   %is_indirect_object_taker
   79   %is_block_operator
   80   %expecting_operator_token
   81   %expecting_operator_types
   82   %expecting_term_types
   83   %expecting_term_token
   84   %is_digraph
   85   %is_file_test_operator
   86   %is_trigraph
   87   %is_tetragraph
   88   %is_valid_token_type
   89   %is_keyword
   90   %is_code_block_token
   91   %really_want_term
   92   @opening_brace_names
   93   @closing_brace_names
   94   %is_keyword_taking_list
   95   %is_keyword_taking_optional_arg
   96   %is_keyword_rejecting_slash_as_pattern_delimiter
   97   %is_keyword_rejecting_question_as_pattern_delimiter
   98   %is_q_qq_qw_qx_qr_s_y_tr_m
   99   %is_sub
  100   %is_package
  101   %is_comma_question_colon
  102   %other_line_endings
  103   $code_skipping_pattern_begin
  104   $code_skipping_pattern_end
  105 };
  106 
  107 # GLOBAL VARIABLES which are constant after being configured by user-supplied
  108 # parameters.  They remain constant as a file is being processed.
  109 my (
  110 
  111     $rOpts_code_skipping,
  112     $code_skipping_pattern_begin,
  113     $code_skipping_pattern_end,
  114 );
  115 
  116 # possible values of operator_expected()
  117 use constant TERM     => -1;
  118 use constant UNKNOWN  => 0;
  119 use constant OPERATOR => 1;
  120 
  121 # possible values of context
  122 use constant SCALAR_CONTEXT  => -1;
  123 use constant UNKNOWN_CONTEXT => 0;
  124 use constant LIST_CONTEXT    => 1;
  125 
  126 # Maximum number of little messages; probably need not be changed.
  127 use constant MAX_NAG_MESSAGES => 6;
  128 
  129 BEGIN {
  130 
  131     # Array index names for $self
  132     my $i = 0;
  133     use constant {
  134         _rhere_target_list_                  => $i++,
  135         _in_here_doc_                        => $i++,
  136         _here_doc_target_                    => $i++,
  137         _here_quote_character_               => $i++,
  138         _in_data_                            => $i++,
  139         _in_end_                             => $i++,
  140         _in_format_                          => $i++,
  141         _in_error_                           => $i++,
  142         _in_pod_                             => $i++,
  143         _in_skipped_                         => $i++,
  144         _in_attribute_list_                  => $i++,
  145         _in_quote_                           => $i++,
  146         _quote_target_                       => $i++,
  147         _line_start_quote_                   => $i++,
  148         _starting_level_                     => $i++,
  149         _know_starting_level_                => $i++,
  150         _tabsize_                            => $i++,
  151         _indent_columns_                     => $i++,
  152         _look_for_hash_bang_                 => $i++,
  153         _trim_qw_                            => $i++,
  154         _continuation_indentation_           => $i++,
  155         _outdent_labels_                     => $i++,
  156         _last_line_number_                   => $i++,
  157         _saw_perl_dash_P_                    => $i++,
  158         _saw_perl_dash_w_                    => $i++,
  159         _saw_use_strict_                     => $i++,
  160         _saw_v_string_                       => $i++,
  161         _hit_bug_                            => $i++,
  162         _look_for_autoloader_                => $i++,
  163         _look_for_selfloader_                => $i++,
  164         _saw_autoloader_                     => $i++,
  165         _saw_selfloader_                     => $i++,
  166         _saw_hash_bang_                      => $i++,
  167         _saw_end_                            => $i++,
  168         _saw_data_                           => $i++,
  169         _saw_negative_indentation_           => $i++,
  170         _started_tokenizing_                 => $i++,
  171         _line_buffer_object_                 => $i++,
  172         _debugger_object_                    => $i++,
  173         _diagnostics_object_                 => $i++,
  174         _logger_object_                      => $i++,
  175         _unexpected_error_count_             => $i++,
  176         _started_looking_for_here_target_at_ => $i++,
  177         _nearly_matched_here_target_at_      => $i++,
  178         _line_of_text_                       => $i++,
  179         _rlower_case_labels_at_              => $i++,
  180         _extended_syntax_                    => $i++,
  181         _maximum_level_                      => $i++,
  182         _true_brace_error_count_             => $i++,
  183         _rOpts_maximum_level_errors_         => $i++,
  184         _rOpts_maximum_unexpected_errors_    => $i++,
  185         _rOpts_logfile_                      => $i++,
  186         _rOpts_                              => $i++,
  187     };
  188 }
  189 
  190 {    ## closure for subs to count instances
  191 
  192     # methods to count instances
  193     my $_count = 0;
  194     sub get_count        { return $_count; }
  195     sub _increment_count { return ++$_count }
  196     sub _decrement_count { return --$_count }
  197 }
  198 
  199 sub DESTROY {
  200     my $self = shift;
  201     $self->_decrement_count();
  202     return;
  203 }
  204 
  205 sub AUTOLOAD {
  206 
  207     # Catch any undefined sub calls so that we are sure to get
  208     # some diagnostic information.  This sub should never be called
  209     # except for a programming error.
  210     our $AUTOLOAD;
  211     return if ( $AUTOLOAD =~ /\bDESTROY$/ );
  212     my ( $pkg, $fname, $lno ) = caller();
  213     my $my_package = __PACKAGE__;
  214     print STDERR <<EOM;
  215 ======================================================================
  216 Error detected in package '$my_package', version $VERSION
  217 Received unexpected AUTOLOAD call for sub '$AUTOLOAD'
  218 Called from package: '$pkg'  
  219 Called from File '$fname'  at line '$lno'
  220 This error is probably due to a recent programming change
  221 ======================================================================
  222 EOM
  223     exit 1;
  224 }
  225 
  226 sub Die {
  227     my ($msg) = @_;
  228     Perl::Tidy::Die($msg);
  229     croak "unexpected return from Perl::Tidy::Die";
  230 }
  231 
  232 sub bad_pattern {
  233 
  234     # See if a pattern will compile. We have to use a string eval here,
  235     # but it should be safe because the pattern has been constructed
  236     # by this program.
  237     my ($pattern) = @_;
  238     eval "'##'=~/$pattern/";
  239     return $@;
  240 }
  241 
  242 sub make_code_skipping_pattern {
  243     my ( $rOpts, $opt_name, $default ) = @_;
  244     my $param = $rOpts->{$opt_name};
  245     unless ($param) { $param = $default }
  246     $param =~ s/^\s*//;    # allow leading spaces to be like format-skipping
  247     if ( $param !~ /^#/ ) {
  248         Die("ERROR: the $opt_name parameter '$param' must begin with '#'\n");
  249     }
  250     my $pattern = '^\s*' . $param . '\b';
  251     if ( bad_pattern($pattern) ) {
  252         Die(
  253 "ERROR: the $opt_name parameter '$param' causes the invalid regex '$pattern'\n"
  254         );
  255     }
  256     return $pattern;
  257 }
  258 
  259 sub check_options {
  260 
  261     # Check Tokenizer parameters
  262     my $rOpts = shift;
  263 
  264     %is_sub = ();
  265     $is_sub{'sub'} = 1;
  266 
  267     # Install any aliases to 'sub'
  268     if ( $rOpts->{'sub-alias-list'} ) {
  269 
  270         # Note that any 'sub-alias-list' has been preprocessed to
  271         # be a trimmed, space-separated list which includes 'sub'
  272         # for example, it might be 'sub method fun'
  273         my @sub_alias_list = split /\s+/, $rOpts->{'sub-alias-list'};
  274         foreach my $word (@sub_alias_list) {
  275             $is_sub{$word} = 1;
  276         }
  277     }
  278 
  279     $rOpts_code_skipping = $rOpts->{'code-skipping'};
  280     $code_skipping_pattern_begin =
  281       make_code_skipping_pattern( $rOpts, 'code-skipping-begin', '#<<V' );
  282     $code_skipping_pattern_end =
  283       make_code_skipping_pattern( $rOpts, 'code-skipping-end', '#>>V' );
  284     return;
  285 }
  286 
  287 sub new {
  288 
  289     my ( $class, @args ) = @_;
  290 
  291     # Note: 'tabs' and 'indent_columns' are temporary and should be
  292     # removed asap
  293     my %defaults = (
  294         source_object        => undef,
  295         debugger_object      => undef,
  296         diagnostics_object   => undef,
  297         logger_object        => undef,
  298         starting_level       => undef,
  299         indent_columns       => 4,
  300         tabsize              => 8,
  301         look_for_hash_bang   => 0,
  302         trim_qw              => 1,
  303         look_for_autoloader  => 1,
  304         look_for_selfloader  => 1,
  305         starting_line_number => 1,
  306         extended_syntax      => 0,
  307         rOpts                => {},
  308     );
  309     my %args = ( %defaults, @args );
  310 
  311     # we are given an object with a get_line() method to supply source lines
  312     my $source_object = $args{source_object};
  313     my $rOpts         = $args{rOpts};
  314 
  315     # we create another object with a get_line() and peek_ahead() method
  316     my $line_buffer_object = Perl::Tidy::LineBuffer->new($source_object);
  317 
  318     # Tokenizer state data is as follows:
  319     # _rhere_target_list_    reference to list of here-doc targets
  320     # _here_doc_target_      the target string for a here document
  321     # _here_quote_character_ the type of here-doc quoting (" ' ` or none)
  322     #                        to determine if interpolation is done
  323     # _quote_target_         character we seek if chasing a quote
  324     # _line_start_quote_     line where we started looking for a long quote
  325     # _in_here_doc_          flag indicating if we are in a here-doc
  326     # _in_pod_               flag set if we are in pod documentation
  327     # _in_skipped_           flag set if we are in a skipped section
  328     # _in_error_             flag set if we saw severe error (binary in script)
  329     # _in_data_              flag set if we are in __DATA__ section
  330     # _in_end_               flag set if we are in __END__ section
  331     # _in_format_            flag set if we are in a format description
  332     # _in_attribute_list_    flag telling if we are looking for attributes
  333     # _in_quote_             flag telling if we are chasing a quote
  334     # _starting_level_       indentation level of first line
  335     # _line_buffer_object_   object with get_line() method to supply source code
  336     # _diagnostics_object_   place to write debugging information
  337     # _unexpected_error_count_ error count used to limit output
  338     # _lower_case_labels_at_ line numbers where lower case labels seen
  339     # _hit_bug_              program bug detected
  340 
  341     my $self = [];
  342     $self->[_rhere_target_list_]        = [];
  343     $self->[_in_here_doc_]              = 0;
  344     $self->[_here_doc_target_]          = "";
  345     $self->[_here_quote_character_]     = "";
  346     $self->[_in_data_]                  = 0;
  347     $self->[_in_end_]                   = 0;
  348     $self->[_in_format_]                = 0;
  349     $self->[_in_error_]                 = 0;
  350     $self->[_in_pod_]                   = 0;
  351     $self->[_in_skipped_]               = 0;
  352     $self->[_in_attribute_list_]        = 0;
  353     $self->[_in_quote_]                 = 0;
  354     $self->[_quote_target_]             = "";
  355     $self->[_line_start_quote_]         = -1;
  356     $self->[_starting_level_]           = $args{starting_level};
  357     $self->[_know_starting_level_]      = defined( $args{starting_level} );
  358     $self->[_tabsize_]                  = $args{tabsize};
  359     $self->[_indent_columns_]           = $args{indent_columns};
  360     $self->[_look_for_hash_bang_]       = $args{look_for_hash_bang};
  361     $self->[_trim_qw_]                  = $args{trim_qw};
  362     $self->[_continuation_indentation_] = $args{continuation_indentation};
  363     $self->[_outdent_labels_]           = $args{outdent_labels};
  364     $self->[_last_line_number_]         = $args{starting_line_number} - 1;
  365     $self->[_saw_perl_dash_P_]          = 0;
  366     $self->[_saw_perl_dash_w_]          = 0;
  367     $self->[_saw_use_strict_]           = 0;
  368     $self->[_saw_v_string_]             = 0;
  369     $self->[_hit_bug_]                  = 0;
  370     $self->[_look_for_autoloader_]      = $args{look_for_autoloader};
  371     $self->[_look_for_selfloader_]      = $args{look_for_selfloader};
  372     $self->[_saw_autoloader_]           = 0;
  373     $self->[_saw_selfloader_]           = 0;
  374     $self->[_saw_hash_bang_]            = 0;
  375     $self->[_saw_end_]                  = 0;
  376     $self->[_saw_data_]                 = 0;
  377     $self->[_saw_negative_indentation_] = 0;
  378     $self->[_started_tokenizing_]       = 0;
  379     $self->[_line_buffer_object_]       = $line_buffer_object;
  380     $self->[_debugger_object_]          = $args{debugger_object};
  381     $self->[_diagnostics_object_]       = $args{diagnostics_object};
  382     $self->[_logger_object_]            = $args{logger_object};
  383     $self->[_unexpected_error_count_]   = 0;
  384     $self->[_started_looking_for_here_target_at_] = 0;
  385     $self->[_nearly_matched_here_target_at_]      = undef;
  386     $self->[_line_of_text_]                       = "";
  387     $self->[_rlower_case_labels_at_]              = undef;
  388     $self->[_extended_syntax_]                    = $args{extended_syntax};
  389     $self->[_maximum_level_]                      = 0;
  390     $self->[_true_brace_error_count_]             = 0;
  391     $self->[_rOpts_maximum_level_errors_] = $rOpts->{'maximum-level-errors'};
  392     $self->[_rOpts_maximum_unexpected_errors_] =
  393       $rOpts->{'maximum-unexpected-errors'};
  394     $self->[_rOpts_logfile_] = $rOpts->{'logfile'};
  395     $self->[_rOpts_]         = $rOpts;
  396     bless $self, $class;
  397 
  398     $tokenizer_self = $self;
  399 
  400     prepare_for_a_new_file();
  401     find_starting_indentation_level();
  402 
  403     # This is not a full class yet, so die if an attempt is made to
  404     # create more than one object.
  405 
  406     if ( _increment_count() > 1 ) {
  407         confess
  408 "Attempt to create more than 1 object in $class, which is not a true class yet\n";
  409     }
  410 
  411     return $self;
  412 
  413 }
  414 
  415 # interface to Perl::Tidy::Logger routines
  416 sub warning {
  417     my $msg           = shift;
  418     my $logger_object = $tokenizer_self->[_logger_object_];
  419     if ($logger_object) {
  420         $logger_object->warning($msg);
  421     }
  422     return;
  423 }
  424 
  425 sub complain {
  426     my $msg           = shift;
  427     my $logger_object = $tokenizer_self->[_logger_object_];
  428     if ($logger_object) {
  429         $logger_object->complain($msg);
  430     }
  431     return;
  432 }
  433 
  434 sub write_logfile_entry {
  435     my $msg           = shift;
  436     my $logger_object = $tokenizer_self->[_logger_object_];
  437     if ($logger_object) {
  438         $logger_object->write_logfile_entry($msg);
  439     }
  440     return;
  441 }
  442 
  443 sub interrupt_logfile {
  444     my $logger_object = $tokenizer_self->[_logger_object_];
  445     if ($logger_object) {
  446         $logger_object->interrupt_logfile();
  447     }
  448     return;
  449 }
  450 
  451 sub resume_logfile {
  452     my $logger_object = $tokenizer_self->[_logger_object_];
  453     if ($logger_object) {
  454         $logger_object->resume_logfile();
  455     }
  456     return;
  457 }
  458 
  459 sub increment_brace_error {
  460     my $logger_object = $tokenizer_self->[_logger_object_];
  461     if ($logger_object) {
  462         $logger_object->increment_brace_error();
  463     }
  464     return;
  465 }
  466 
  467 sub report_definite_bug {
  468     $tokenizer_self->[_hit_bug_] = 1;
  469     my $logger_object = $tokenizer_self->[_logger_object_];
  470     if ($logger_object) {
  471         $logger_object->report_definite_bug();
  472     }
  473     return;
  474 }
  475 
  476 sub brace_warning {
  477     my $msg           = shift;
  478     my $logger_object = $tokenizer_self->[_logger_object_];
  479     if ($logger_object) {
  480         $logger_object->brace_warning($msg);
  481     }
  482     return;
  483 }
  484 
  485 sub get_saw_brace_error {
  486     my $logger_object = $tokenizer_self->[_logger_object_];
  487     if ($logger_object) {
  488         return $logger_object->get_saw_brace_error();
  489     }
  490     else {
  491         return 0;
  492     }
  493 }
  494 
  495 sub get_unexpected_error_count {
  496     my ($self) = @_;
  497     return $self->[_unexpected_error_count_];
  498 }
  499 
  500 # interface to Perl::Tidy::Diagnostics routines
  501 sub write_diagnostics {
  502     my $msg = shift;
  503     if ( $tokenizer_self->[_diagnostics_object_] ) {
  504         $tokenizer_self->[_diagnostics_object_]->write_diagnostics($msg);
  505     }
  506     return;
  507 }
  508 
  509 sub get_maximum_level {
  510     return $tokenizer_self->[_maximum_level_];
  511 }
  512 
  513 sub report_tokenization_errors {
  514 
  515     my ($self) = @_;
  516 
  517     # Report any tokenization errors and return a flag '$severe_error'.
  518     # Set $severe_error = 1 if the tokenizations errors are so severe that
  519     # the formatter should not attempt to format the file. Instead, it will
  520     # just output the file verbatim.
  521 
  522     # set severe error flag if tokenizer has encountered file reading problems
  523     # (i.e. unexpected binary characters)
  524     my $severe_error = $self->[_in_error_];
  525 
  526     my $maxle = $self->[_rOpts_maximum_level_errors_];
  527     my $maxue = $self->[_rOpts_maximum_unexpected_errors_];
  528     $maxle = 1 unless defined($maxle);
  529     $maxue = 0 unless defined($maxue);
  530 
  531     my $level = get_indentation_level();
  532     if ( $level != $tokenizer_self->[_starting_level_] ) {
  533         warning("final indentation level: $level\n");
  534         my $level_diff = $tokenizer_self->[_starting_level_] - $level;
  535         if ( $level_diff < 0 ) { $level_diff = -$level_diff }
  536 
  537         # Set severe error flag if the level error is greater than 1.
  538         # The formatter can function for any level error but it is probably
  539         # best not to attempt formatting for a high level error.
  540         if ( $maxle >= 0 && $level_diff > $maxle ) {
  541             $severe_error = 1;
  542             warning(<<EOM);
  543 Formatting will be skipped because level error '$level_diff' exceeds -maxle=$maxle; use -maxle=-1 to force formatting
  544 EOM
  545         }
  546     }
  547 
  548     check_final_nesting_depths();
  549 
  550     # Likewise, large numbers of brace errors usually indicate non-perl
  551     # scirpts, so set the severe error flag at a low number.  This is similar
  552     # to the level check, but different because braces may balance but be
  553     # incorrectly interlaced.
  554     if ( $tokenizer_self->[_true_brace_error_count_] > 2 ) {
  555         $severe_error = 1;
  556     }
  557 
  558     if ( $tokenizer_self->[_look_for_hash_bang_]
  559         && !$tokenizer_self->[_saw_hash_bang_] )
  560     {
  561         warning(
  562             "hit EOF without seeing hash-bang line; maybe don't need -x?\n");
  563     }
  564 
  565     if ( $tokenizer_self->[_in_format_] ) {
  566         warning("hit EOF while in format description\n");
  567     }
  568 
  569     if ( $tokenizer_self->[_in_skipped_] ) {
  570         write_logfile_entry(
  571             "hit EOF while in lines skipped with --code-skipping\n");
  572     }
  573 
  574     if ( $tokenizer_self->[_in_pod_] ) {
  575 
  576         # Just write log entry if this is after __END__ or __DATA__
  577         # because this happens to often, and it is not likely to be
  578         # a parsing error.
  579         if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] ) {
  580             write_logfile_entry(
  581 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
  582             );
  583         }
  584 
  585         else {
  586             complain(
  587 "hit eof while in pod documentation (no =cut seen)\n\tthis can cause trouble with some pod utilities\n"
  588             );
  589         }
  590 
  591     }
  592 
  593     if ( $tokenizer_self->[_in_here_doc_] ) {
  594         $severe_error = 1;
  595         my $here_doc_target = $tokenizer_self->[_here_doc_target_];
  596         my $started_looking_for_here_target_at =
  597           $tokenizer_self->[_started_looking_for_here_target_at_];
  598         if ($here_doc_target) {
  599             warning(
  600 "hit EOF in here document starting at line $started_looking_for_here_target_at with target: $here_doc_target\n"
  601             );
  602         }
  603         else {
  604             warning(
  605 "hit EOF in here document starting at line $started_looking_for_here_target_at with empty target string\n"
  606             );
  607         }
  608         my $nearly_matched_here_target_at =
  609           $tokenizer_self->[_nearly_matched_here_target_at_];
  610         if ($nearly_matched_here_target_at) {
  611             warning(
  612 "NOTE: almost matched at input line $nearly_matched_here_target_at except for whitespace\n"
  613             );
  614         }
  615     }
  616 
  617     # Something is seriously wrong if we ended inside a quote
  618     if ( $tokenizer_self->[_in_quote_] ) {
  619         $severe_error = 1;
  620         my $line_start_quote = $tokenizer_self->[_line_start_quote_];
  621         my $quote_target     = $tokenizer_self->[_quote_target_];
  622         my $what =
  623           ( $tokenizer_self->[_in_attribute_list_] )
  624           ? "attribute list"
  625           : "quote/pattern";
  626         warning(
  627 "hit EOF seeking end of $what starting at line $line_start_quote ending in $quote_target\n"
  628         );
  629     }
  630 
  631     if ( $tokenizer_self->[_hit_bug_] ) {
  632         $severe_error = 1;
  633     }
  634 
  635     # Multiple "unexpected" type tokenization errors usually indicate parsing
  636     # non-perl scripts, or that something is seriously wrong, so we should
  637     # avoid formatting them.  This can happen for example if we run perltidy on
  638     # a shell script or an html file.  But unfortunately this check can
  639     # interfere with some extended syntaxes, such as RPerl, so it has to be off
  640     # by default.
  641     my $ue_count = $tokenizer_self->[_unexpected_error_count_];
  642     if ( $maxue > 0 && $ue_count > $maxue ) {
  643         warning(<<EOM);
  644 Formatting will be skipped since unexpected token count = $ue_count > -maxue=$maxue; use -maxue=0 to force formatting
  645 EOM
  646         $severe_error = 1;
  647     }
  648 
  649     unless ( $tokenizer_self->[_saw_perl_dash_w_] ) {
  650         if ( $] < 5.006 ) {
  651             write_logfile_entry("Suggest including '-w parameter'\n");
  652         }
  653         else {
  654             write_logfile_entry("Suggest including 'use warnings;'\n");
  655         }
  656     }
  657 
  658     if ( $tokenizer_self->[_saw_perl_dash_P_] ) {
  659         write_logfile_entry("Use of -P parameter for defines is discouraged\n");
  660     }
  661 
  662     unless ( $tokenizer_self->[_saw_use_strict_] ) {
  663         write_logfile_entry("Suggest including 'use strict;'\n");
  664     }
  665 
  666     # it is suggested that labels have at least one upper case character
  667     # for legibility and to avoid code breakage as new keywords are introduced
  668     if ( $tokenizer_self->[_rlower_case_labels_at_] ) {
  669         my @lower_case_labels_at =
  670           @{ $tokenizer_self->[_rlower_case_labels_at_] };
  671         write_logfile_entry(
  672             "Suggest using upper case characters in label(s)\n");
  673         local $" = ')(';
  674         write_logfile_entry("  defined at line(s): (@lower_case_labels_at)\n");
  675     }
  676     return $severe_error;
  677 }
  678 
  679 sub report_v_string {
  680 
  681     # warn if this version can't handle v-strings
  682     my $tok = shift;
  683     unless ( $tokenizer_self->[_saw_v_string_] ) {
  684         $tokenizer_self->[_saw_v_string_] =
  685           $tokenizer_self->[_last_line_number_];
  686     }
  687     if ( $] < 5.006 ) {
  688         warning(
  689 "Found v-string '$tok' but v-strings are not implemented in your version of perl; see Camel 3 book ch 2\n"
  690         );
  691     }
  692     return;
  693 }
  694 
  695 sub get_input_line_number {
  696     return $tokenizer_self->[_last_line_number_];
  697 }
  698 
  699 # returns the next tokenized line
  700 sub get_line {
  701 
  702     my $self = shift;
  703 
  704     # USES GLOBAL VARIABLES: $tokenizer_self, $brace_depth,
  705     # $square_bracket_depth, $paren_depth
  706 
  707     my $input_line = $tokenizer_self->[_line_buffer_object_]->get_line();
  708     $tokenizer_self->[_line_of_text_] = $input_line;
  709 
  710     return unless ($input_line);
  711 
  712     my $input_line_number = ++$tokenizer_self->[_last_line_number_];
  713 
  714     my $write_logfile_entry = sub {
  715         my ($msg) = @_;
  716         write_logfile_entry("Line $input_line_number: $msg");
  717     };
  718 
  719     # Find and remove what characters terminate this line, including any
  720     # control r
  721     my $input_line_separator = "";
  722     if ( chomp($input_line) ) { $input_line_separator = $/ }
  723 
  724     # The first test here very significantly speeds things up, but be sure to
  725     # keep the regex and hash %other_line_endings the same.
  726     if ( $other_line_endings{ substr( $input_line, -1 ) } ) {
  727         if ( $input_line =~ s/((\r|\035|\032)+)$// ) {
  728             $input_line_separator = $2 . $input_line_separator;
  729         }
  730     }
  731 
  732     # for backwards compatibility we keep the line text terminated with
  733     # a newline character
  734     $input_line .= "\n";
  735     $tokenizer_self->[_line_of_text_] = $input_line;    # update
  736 
  737     # create a data structure describing this line which will be
  738     # returned to the caller.
  739 
  740     # _line_type codes are:
  741     #   SYSTEM         - system-specific code before hash-bang line
  742     #   CODE           - line of perl code (including comments)
  743     #   POD_START      - line starting pod, such as '=head'
  744     #   POD            - pod documentation text
  745     #   POD_END        - last line of pod section, '=cut'
  746     #   HERE           - text of here-document
  747     #   HERE_END       - last line of here-doc (target word)
  748     #   FORMAT         - format section
  749     #   FORMAT_END     - last line of format section, '.'
  750     #   DATA_START     - __DATA__ line
  751     #   DATA           - unidentified text following __DATA__
  752     #   END_START      - __END__ line
  753     #   END            - unidentified text following __END__
  754     #   ERROR          - we are in big trouble, probably not a perl script
  755 
  756     # Other variables:
  757     #   _curly_brace_depth     - depth of curly braces at start of line
  758     #   _square_bracket_depth  - depth of square brackets at start of line
  759     #   _paren_depth           - depth of parens at start of line
  760     #   _starting_in_quote     - this line continues a multi-line quote
  761     #                            (so don't trim leading blanks!)
  762     #   _ending_in_quote       - this line ends in a multi-line quote
  763     #                            (so don't trim trailing blanks!)
  764     my $line_of_tokens = {
  765         _line_type                 => 'EOF',
  766         _line_text                 => $input_line,
  767         _line_number               => $input_line_number,
  768         _guessed_indentation_level => 0,
  769         _curly_brace_depth         => $brace_depth,
  770         _square_bracket_depth      => $square_bracket_depth,
  771         _paren_depth               => $paren_depth,
  772         _quote_character           => '',
  773 ##        _rtoken_type               => undef,
  774 ##        _rtokens                   => undef,
  775 ##        _rlevels                   => undef,
  776 ##        _rslevels                  => undef,
  777 ##        _rblock_type               => undef,
  778 ##        _rcontainer_type           => undef,
  779 ##        _rcontainer_environment    => undef,
  780 ##        _rtype_sequence            => undef,
  781 ##        _rnesting_tokens           => undef,
  782 ##        _rci_levels                => undef,
  783 ##        _rnesting_blocks           => undef,
  784 ##        _starting_in_quote         => 0,
  785 ##        _ending_in_quote           => 0,
  786     };
  787 
  788     # must print line unchanged if we are in a here document
  789     if ( $tokenizer_self->[_in_here_doc_] ) {
  790 
  791         $line_of_tokens->{_line_type} = 'HERE';
  792         my $here_doc_target      = $tokenizer_self->[_here_doc_target_];
  793         my $here_quote_character = $tokenizer_self->[_here_quote_character_];
  794         my $candidate_target     = $input_line;
  795         chomp $candidate_target;
  796 
  797         # Handle <<~ targets, which are indicated here by a leading space on
  798         # the here quote character
  799         if ( $here_quote_character =~ /^\s/ ) {
  800             $candidate_target =~ s/^\s*//;
  801         }
  802         if ( $candidate_target eq $here_doc_target ) {
  803             $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
  804             $line_of_tokens->{_line_type} = 'HERE_END';
  805             $write_logfile_entry->("Exiting HERE document $here_doc_target\n");
  806 
  807             my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
  808             if ( @{$rhere_target_list} ) {  # there can be multiple here targets
  809                 ( $here_doc_target, $here_quote_character ) =
  810                   @{ shift @{$rhere_target_list} };
  811                 $tokenizer_self->[_here_doc_target_] = $here_doc_target;
  812                 $tokenizer_self->[_here_quote_character_] =
  813                   $here_quote_character;
  814                 $write_logfile_entry->(
  815                     "Entering HERE document $here_doc_target\n");
  816                 $tokenizer_self->[_nearly_matched_here_target_at_] = undef;
  817                 $tokenizer_self->[_started_looking_for_here_target_at_] =
  818                   $input_line_number;
  819             }
  820             else {
  821                 $tokenizer_self->[_in_here_doc_]          = 0;
  822                 $tokenizer_self->[_here_doc_target_]      = "";
  823                 $tokenizer_self->[_here_quote_character_] = "";
  824             }
  825         }
  826 
  827         # check for error of extra whitespace
  828         # note for PERL6: leading whitespace is allowed
  829         else {
  830             $candidate_target =~ s/\s*$//;
  831             $candidate_target =~ s/^\s*//;
  832             if ( $candidate_target eq $here_doc_target ) {
  833                 $tokenizer_self->[_nearly_matched_here_target_at_] =
  834                   $input_line_number;
  835             }
  836         }
  837         return $line_of_tokens;
  838     }
  839 
  840     # Print line unchanged if we are in a format section
  841     elsif ( $tokenizer_self->[_in_format_] ) {
  842 
  843         if ( $input_line =~ /^\.[\s#]*$/ ) {
  844 
  845             # Decrement format depth count at a '.' after a 'format'
  846             $tokenizer_self->[_in_format_]--;
  847 
  848             # This is the end when count reaches 0
  849             if ( !$tokenizer_self->[_in_format_] ) {
  850                 $write_logfile_entry->("Exiting format section\n");
  851                 $line_of_tokens->{_line_type} = 'FORMAT_END';
  852             }
  853         }
  854         else {
  855             $line_of_tokens->{_line_type} = 'FORMAT';
  856             if ( $input_line =~ /^\s*format\s+\w+/ ) {
  857 
  858                 # Increment format depth count at a 'format' within a 'format'
  859                 # This is a simple way to handle nested formats (issue c019).
  860                 $tokenizer_self->[_in_format_]++;
  861             }
  862         }
  863         return $line_of_tokens;
  864     }
  865 
  866     # must print line unchanged if we are in pod documentation
  867     elsif ( $tokenizer_self->[_in_pod_] ) {
  868 
  869         $line_of_tokens->{_line_type} = 'POD';
  870         if ( $input_line =~ /^=cut/ ) {
  871             $line_of_tokens->{_line_type} = 'POD_END';
  872             $write_logfile_entry->("Exiting POD section\n");
  873             $tokenizer_self->[_in_pod_] = 0;
  874         }
  875         if ( $input_line =~ /^\#\!.*perl\b/ && !$tokenizer_self->[_in_end_] ) {
  876             warning(
  877                 "Hash-bang in pod can cause older versions of perl to fail! \n"
  878             );
  879         }
  880 
  881         return $line_of_tokens;
  882     }
  883 
  884     # print line unchanged if in skipped section
  885     elsif ( $tokenizer_self->[_in_skipped_] ) {
  886 
  887         # NOTE: marked as the existing type 'FORMAT' to keep html working
  888         $line_of_tokens->{_line_type} = 'FORMAT';
  889         if ( $input_line =~ /$code_skipping_pattern_end/ ) {
  890             $write_logfile_entry->("Exiting code-skipping section\n");
  891             $tokenizer_self->[_in_skipped_] = 0;
  892         }
  893         return $line_of_tokens;
  894     }
  895 
  896     # must print line unchanged if we have seen a severe error (i.e., we
  897     # are seeing illegal tokens and cannot continue.  Syntax errors do
  898     # not pass this route).  Calling routine can decide what to do, but
  899     # the default can be to just pass all lines as if they were after __END__
  900     elsif ( $tokenizer_self->[_in_error_] ) {
  901         $line_of_tokens->{_line_type} = 'ERROR';
  902         return $line_of_tokens;
  903     }
  904 
  905     # print line unchanged if we are __DATA__ section
  906     elsif ( $tokenizer_self->[_in_data_] ) {
  907 
  908         # ...but look for POD
  909         # Note that the _in_data and _in_end flags remain set
  910         # so that we return to that state after seeing the
  911         # end of a pod section
  912         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
  913             $line_of_tokens->{_line_type} = 'POD_START';
  914             $write_logfile_entry->("Entering POD section\n");
  915             $tokenizer_self->[_in_pod_] = 1;
  916             return $line_of_tokens;
  917         }
  918         else {
  919             $line_of_tokens->{_line_type} = 'DATA';
  920             return $line_of_tokens;
  921         }
  922     }
  923 
  924     # print line unchanged if we are in __END__ section
  925     elsif ( $tokenizer_self->[_in_end_] ) {
  926 
  927         # ...but look for POD
  928         # Note that the _in_data and _in_end flags remain set
  929         # so that we return to that state after seeing the
  930         # end of a pod section
  931         if ( $input_line =~ /^=(\w+)\b/ && $1 ne 'cut' ) {
  932             $line_of_tokens->{_line_type} = 'POD_START';
  933             $write_logfile_entry->("Entering POD section\n");
  934             $tokenizer_self->[_in_pod_] = 1;
  935             return $line_of_tokens;
  936         }
  937         else {
  938             $line_of_tokens->{_line_type} = 'END';
  939             return $line_of_tokens;
  940         }
  941     }
  942 
  943     # check for a hash-bang line if we haven't seen one
  944     if ( !$tokenizer_self->[_saw_hash_bang_] ) {
  945         if ( $input_line =~ /^\#\!.*perl\b/ ) {
  946             $tokenizer_self->[_saw_hash_bang_] = $input_line_number;
  947 
  948             # check for -w and -P flags
  949             if ( $input_line =~ /^\#\!.*perl\s.*-.*P/ ) {
  950                 $tokenizer_self->[_saw_perl_dash_P_] = 1;
  951             }
  952 
  953             if ( $input_line =~ /^\#\!.*perl\s.*-.*w/ ) {
  954                 $tokenizer_self->[_saw_perl_dash_w_] = 1;
  955             }
  956 
  957             if (
  958                 $input_line_number > 1
  959 
  960                 # leave any hash bang in a BEGIN block alone
  961                 # i.e. see 'debugger-duck_type.t'
  962                 && !(
  963                        $last_nonblank_block_type
  964                     && $last_nonblank_block_type eq 'BEGIN'
  965                 )
  966                 && !$tokenizer_self->[_look_for_hash_bang_]
  967 
  968                 # Try to avoid giving a false alarm at a simple comment.
  969                 # These look like valid hash-bang lines:
  970 
  971                 #!/usr/bin/perl -w
  972                 #!   /usr/bin/perl -w
  973                 #!c:\perl\bin\perl.exe
  974 
  975                 # These are comments:
  976                 #! I love perl
  977                 #!  sunos does not yet provide a /usr/bin/perl
  978 
  979                 # Comments typically have multiple spaces, which suggests
  980                 # the filter
  981                 && $input_line =~ /^\#\!(\s+)?(\S+)?perl/
  982               )
  983             {
  984 
  985                 # this is helpful for VMS systems; we may have accidentally
  986                 # tokenized some DCL commands
  987                 if ( $tokenizer_self->[_started_tokenizing_] ) {
  988                     warning(
  989 "There seems to be a hash-bang after line 1; do you need to run with -x ?\n"
  990                     );
  991                 }
  992                 else {
  993                     complain("Useless hash-bang after line 1\n");
  994                 }
  995             }
  996 
  997             # Report the leading hash-bang as a system line
  998             # This will prevent -dac from deleting it
  999             else {
 1000                 $line_of_tokens->{_line_type} = 'SYSTEM';
 1001                 return $line_of_tokens;
 1002             }
 1003         }
 1004     }
 1005 
 1006     # wait for a hash-bang before parsing if the user invoked us with -x
 1007     if ( $tokenizer_self->[_look_for_hash_bang_]
 1008         && !$tokenizer_self->[_saw_hash_bang_] )
 1009     {
 1010         $line_of_tokens->{_line_type} = 'SYSTEM';
 1011         return $line_of_tokens;
 1012     }
 1013 
 1014     # a first line of the form ': #' will be marked as SYSTEM
 1015     # since lines of this form may be used by tcsh
 1016     if ( $input_line_number == 1 && $input_line =~ /^\s*\:\s*\#/ ) {
 1017         $line_of_tokens->{_line_type} = 'SYSTEM';
 1018         return $line_of_tokens;
 1019     }
 1020 
 1021     # now we know that it is ok to tokenize the line...
 1022     # the line tokenizer will modify any of these private variables:
 1023     #        _rhere_target_list_
 1024     #        _in_data_
 1025     #        _in_end_
 1026     #        _in_format_
 1027     #        _in_error_
 1028     #        _in_skipped_
 1029     #        _in_pod_
 1030     #        _in_quote_
 1031     my $ending_in_quote_last = $tokenizer_self->[_in_quote_];
 1032     tokenize_this_line($line_of_tokens);
 1033 
 1034     # Now finish defining the return structure and return it
 1035     $line_of_tokens->{_ending_in_quote} = $tokenizer_self->[_in_quote_];
 1036 
 1037     # handle severe error (binary data in script)
 1038     if ( $tokenizer_self->[_in_error_] ) {
 1039         $tokenizer_self->[_in_quote_] = 0;    # to avoid any more messages
 1040         warning("Giving up after error\n");
 1041         $line_of_tokens->{_line_type} = 'ERROR';
 1042         reset_indentation_level(0);           # avoid error messages
 1043         return $line_of_tokens;
 1044     }
 1045 
 1046     # handle start of pod documentation
 1047     if ( $tokenizer_self->[_in_pod_] ) {
 1048 
 1049         # This gets tricky..above a __DATA__ or __END__ section, perl
 1050         # accepts '=cut' as the start of pod section. But afterwards,
 1051         # only pod utilities see it and they may ignore an =cut without
 1052         # leading =head.  In any case, this isn't good.
 1053         if ( $input_line =~ /^=cut\b/ ) {
 1054             if ( $tokenizer_self->[_saw_data_] || $tokenizer_self->[_saw_end_] )
 1055             {
 1056                 complain("=cut while not in pod ignored\n");
 1057                 $tokenizer_self->[_in_pod_] = 0;
 1058                 $line_of_tokens->{_line_type} = 'POD_END';
 1059             }
 1060             else {
 1061                 $line_of_tokens->{_line_type} = 'POD_START';
 1062                 warning(
 1063 "=cut starts a pod section .. this can fool pod utilities.\n"
 1064                 );
 1065                 $write_logfile_entry->("Entering POD section\n");
 1066             }
 1067         }
 1068 
 1069         else {
 1070             $line_of_tokens->{_line_type} = 'POD_START';
 1071             $write_logfile_entry->("Entering POD section\n");
 1072         }
 1073 
 1074         return $line_of_tokens;
 1075     }
 1076 
 1077     # handle start of skipped section
 1078     if ( $tokenizer_self->[_in_skipped_] ) {
 1079 
 1080         # NOTE: marked as the existing type 'FORMAT' to keep html working
 1081         $line_of_tokens->{_line_type} = 'FORMAT';
 1082         $write_logfile_entry->("Entering code-skipping section\n");
 1083         return $line_of_tokens;
 1084     }
 1085 
 1086     # Update indentation levels for log messages.
 1087     # Skip blank lines and also block comments, unless a logfile is requested.
 1088     # Note that _line_of_text_ is the input line but trimmed from left to right.
 1089     my $lot = $tokenizer_self->[_line_of_text_];
 1090     if ( $lot && ( $self->[_rOpts_logfile_] || substr( $lot, 0, 1 ) ne '#' ) ) {
 1091         my $rlevels = $line_of_tokens->{_rlevels};
 1092         $line_of_tokens->{_guessed_indentation_level} =
 1093           guess_old_indentation_level($input_line);
 1094     }
 1095 
 1096     # see if this line contains here doc targets
 1097     my $rhere_target_list = $tokenizer_self->[_rhere_target_list_];
 1098     if ( @{$rhere_target_list} ) {
 1099 
 1100         my ( $here_doc_target, $here_quote_character ) =
 1101           @{ shift @{$rhere_target_list} };
 1102         $tokenizer_self->[_in_here_doc_]          = 1;
 1103         $tokenizer_self->[_here_doc_target_]      = $here_doc_target;
 1104         $tokenizer_self->[_here_quote_character_] = $here_quote_character;
 1105         $write_logfile_entry->("Entering HERE document $here_doc_target\n");
 1106         $tokenizer_self->[_started_looking_for_here_target_at_] =
 1107           $input_line_number;
 1108     }
 1109 
 1110     # NOTE: __END__ and __DATA__ statements are written unformatted
 1111     # because they can theoretically contain additional characters
 1112     # which are not tokenized (and cannot be read with <DATA> either!).
 1113     if ( $tokenizer_self->[_in_data_] ) {
 1114         $line_of_tokens->{_line_type} = 'DATA_START';
 1115         $write_logfile_entry->("Starting __DATA__ section\n");
 1116         $tokenizer_self->[_saw_data_] = 1;
 1117 
 1118         # keep parsing after __DATA__ if use SelfLoader was seen
 1119         if ( $tokenizer_self->[_saw_selfloader_] ) {
 1120             $tokenizer_self->[_in_data_] = 0;
 1121             $write_logfile_entry->(
 1122                 "SelfLoader seen, continuing; -nlsl deactivates\n");
 1123         }
 1124 
 1125         return $line_of_tokens;
 1126     }
 1127 
 1128     elsif ( $tokenizer_self->[_in_end_] ) {
 1129         $line_of_tokens->{_line_type} = 'END_START';
 1130         $write_logfile_entry->("Starting __END__ section\n");
 1131         $tokenizer_self->[_saw_end_] = 1;
 1132 
 1133         # keep parsing after __END__ if use AutoLoader was seen
 1134         if ( $tokenizer_self->[_saw_autoloader_] ) {
 1135             $tokenizer_self->[_in_end_] = 0;
 1136             $write_logfile_entry->(
 1137                 "AutoLoader seen, continuing; -nlal deactivates\n");
 1138         }
 1139         return $line_of_tokens;
 1140     }
 1141 
 1142     # now, finally, we know that this line is type 'CODE'
 1143     $line_of_tokens->{_line_type} = 'CODE';
 1144 
 1145     # remember if we have seen any real code
 1146     if (  !$tokenizer_self->[_started_tokenizing_]
 1147         && $input_line !~ /^\s*$/
 1148         && $input_line !~ /^\s*#/ )
 1149     {
 1150         $tokenizer_self->[_started_tokenizing_] = 1;
 1151     }
 1152 
 1153     if ( $tokenizer_self->[_debugger_object_] ) {
 1154         $tokenizer_self->[_debugger_object_]
 1155           ->write_debug_entry($line_of_tokens);
 1156     }
 1157 
 1158     # Note: if keyword 'format' occurs in this line code, it is still CODE
 1159     # (keyword 'format' need not start a line)
 1160     if ( $tokenizer_self->[_in_format_] ) {
 1161         $write_logfile_entry->("Entering format section\n");
 1162     }
 1163 
 1164     if ( $tokenizer_self->[_in_quote_]
 1165         and ( $tokenizer_self->[_line_start_quote_] < 0 ) )
 1166     {
 1167 
 1168         #if ( ( my $quote_target = get_quote_target() ) !~ /^\s*$/ ) {
 1169         if ( ( my $quote_target = $tokenizer_self->[_quote_target_] ) !~
 1170             /^\s*$/ )
 1171         {
 1172             $tokenizer_self->[_line_start_quote_] = $input_line_number;
 1173             $write_logfile_entry->(
 1174                 "Start multi-line quote or pattern ending in $quote_target\n");
 1175         }
 1176     }
 1177     elsif ( ( $tokenizer_self->[_line_start_quote_] >= 0 )
 1178         && !$tokenizer_self->[_in_quote_] )
 1179     {
 1180         $tokenizer_self->[_line_start_quote_] = -1;
 1181         $write_logfile_entry->("End of multi-line quote or pattern\n");
 1182     }
 1183 
 1184     # we are returning a line of CODE
 1185     return $line_of_tokens;
 1186 }
 1187 
 1188 sub find_starting_indentation_level {
 1189 
 1190     # We need to find the indentation level of the first line of the
 1191     # script being formatted.  Often it will be zero for an entire file,
 1192     # but if we are formatting a local block of code (within an editor for
 1193     # example) it may not be zero.  The user may specify this with the
 1194     # -sil=n parameter but normally doesn't so we have to guess.
 1195     #
 1196     # USES GLOBAL VARIABLES: $tokenizer_self
 1197     my $starting_level = 0;
 1198 
 1199     # use value if given as parameter
 1200     if ( $tokenizer_self->[_know_starting_level_] ) {
 1201         $starting_level = $tokenizer_self->[_starting_level_];
 1202     }
 1203 
 1204     # if we know there is a hash_bang line, the level must be zero
 1205     elsif ( $tokenizer_self->[_look_for_hash_bang_] ) {
 1206         $tokenizer_self->[_know_starting_level_] = 1;
 1207     }
 1208 
 1209     # otherwise figure it out from the input file
 1210     else {
 1211         my $line;
 1212         my $i = 0;
 1213 
 1214         # keep looking at lines until we find a hash bang or piece of code
 1215         my $msg = "";
 1216         while ( $line =
 1217             $tokenizer_self->[_line_buffer_object_]->peek_ahead( $i++ ) )
 1218         {
 1219 
 1220             # if first line is #! then assume starting level is zero
 1221             if ( $i == 1 && $line =~ /^\#\!/ ) {
 1222                 $starting_level = 0;
 1223                 last;
 1224             }
 1225             next if ( $line =~ /^\s*#/ );    # skip past comments
 1226             next if ( $line =~ /^\s*$/ );    # skip past blank lines
 1227             $starting_level = guess_old_indentation_level($line);
 1228             last;
 1229         }
 1230         $msg = "Line $i implies starting-indentation-level = $starting_level\n";
 1231         write_logfile_entry("$msg");
 1232     }
 1233     $tokenizer_self->[_starting_level_] = $starting_level;
 1234     reset_indentation_level($starting_level);
 1235     return;
 1236 }
 1237 
 1238 sub guess_old_indentation_level {
 1239     my ($line) = @_;
 1240 
 1241     # Guess the indentation level of an input line.
 1242     #
 1243     # For the first line of code this result will define the starting
 1244     # indentation level.  It will mainly be non-zero when perltidy is applied
 1245     # within an editor to a local block of code.
 1246     #
 1247     # This is an impossible task in general because we can't know what tabs
 1248     # meant for the old script and how many spaces were used for one
 1249     # indentation level in the given input script.  For example it may have
 1250     # been previously formatted with -i=7 -et=3.  But we can at least try to
 1251     # make sure that perltidy guesses correctly if it is applied repeatedly to
 1252     # a block of code within an editor, so that the block stays at the same
 1253     # level when perltidy is applied repeatedly.
 1254     #
 1255     # USES GLOBAL VARIABLES: $tokenizer_self
 1256     my $level = 0;
 1257 
 1258     # find leading tabs, spaces, and any statement label
 1259     my $spaces = 0;
 1260     if ( $line =~ /^(\t+)?(\s+)?(\w+:[^:])?/ ) {
 1261 
 1262         # If there are leading tabs, we use the tab scheme for this run, if
 1263         # any, so that the code will remain stable when editing.
 1264         if ($1) { $spaces += length($1) * $tokenizer_self->[_tabsize_] }
 1265 
 1266         if ($2) { $spaces += length($2) }
 1267 
 1268         # correct for outdented labels
 1269         if ( $3 && $tokenizer_self->[_outdent_labels_] ) {
 1270             $spaces += $tokenizer_self->[_continuation_indentation_];
 1271         }
 1272     }
 1273 
 1274     # compute indentation using the value of -i for this run.
 1275     # If -i=0 is used for this run (which is possible) it doesn't matter
 1276     # what we do here but we'll guess that the old run used 4 spaces per level.
 1277     my $indent_columns = $tokenizer_self->[_indent_columns_];
 1278     $indent_columns = 4 if ( !$indent_columns );
 1279     $level          = int( $spaces / $indent_columns );
 1280     return ($level);
 1281 }
 1282 
 1283 # This is a currently unused debug routine
 1284 sub dump_functions {
 1285 
 1286     my $fh = *STDOUT;
 1287     foreach my $pkg ( keys %is_user_function ) {
 1288         $fh->print("\nnon-constant subs in package $pkg\n");
 1289 
 1290         foreach my $sub ( keys %{ $is_user_function{$pkg} } ) {
 1291             my $msg = "";
 1292             if ( $is_block_list_function{$pkg}{$sub} ) {
 1293                 $msg = 'block_list';
 1294             }
 1295 
 1296             if ( $is_block_function{$pkg}{$sub} ) {
 1297                 $msg = 'block';
 1298             }
 1299             $fh->print("$sub $msg\n");
 1300         }
 1301     }
 1302 
 1303     foreach my $pkg ( keys %is_constant ) {
 1304         $fh->print("\nconstants and constant subs in package $pkg\n");
 1305 
 1306         foreach my $sub ( keys %{ $is_constant{$pkg} } ) {
 1307             $fh->print("$sub\n");
 1308         }
 1309     }
 1310     return;
 1311 }
 1312 
 1313 sub prepare_for_a_new_file {
 1314 
 1315     # previous tokens needed to determine what to expect next
 1316     $last_nonblank_token      = ';';    # the only possible starting state which
 1317     $last_nonblank_type       = ';';    # will make a leading brace a code block
 1318     $last_nonblank_block_type = '';
 1319 
 1320     # scalars for remembering statement types across multiple lines
 1321     $statement_type    = '';            # '' or 'use' or 'sub..' or 'case..'
 1322     $in_attribute_list = 0;
 1323 
 1324     # scalars for remembering where we are in the file
 1325     $current_package = "main";
 1326     $context         = UNKNOWN_CONTEXT;
 1327 
 1328     # hashes used to remember function information
 1329     %is_constant             = ();      # user-defined constants
 1330     %is_user_function        = ();      # user-defined functions
 1331     %user_function_prototype = ();      # their prototypes
 1332     %is_block_function       = ();
 1333     %is_block_list_function  = ();
 1334     %saw_function_definition = ();
 1335     %saw_use_module          = ();
 1336 
 1337     # variables used to track depths of various containers
 1338     # and report nesting errors
 1339     $paren_depth             = 0;
 1340     $brace_depth             = 0;
 1341     $square_bracket_depth    = 0;
 1342     @current_depth           = (0) x scalar @closing_brace_names;
 1343     $total_depth             = 0;
 1344     @total_depth             = ();
 1345     @nesting_sequence_number = ( 0 .. @closing_brace_names - 1 );
 1346     @current_sequence_number = ();
 1347 
 1348     @paren_type                     = ();
 1349     @paren_semicolon_count          = ();
 1350     @paren_structural_type          = ();
 1351     @brace_type                     = ();
 1352     @brace_structural_type          = ();
 1353     @brace_context                  = ();
 1354     @brace_package                  = ();
 1355     @square_bracket_type            = ();
 1356     @square_bracket_structural_type = ();
 1357     @depth_array                    = ();
 1358     @nested_ternary_flag            = ();
 1359     @nested_statement_type          = ();
 1360     @starting_line_of_current_depth = ();
 1361 
 1362     $paren_type[$paren_depth]            = '';
 1363     $paren_semicolon_count[$paren_depth] = 0;
 1364     $paren_structural_type[$brace_depth] = '';
 1365     $brace_type[$brace_depth] = ';';    # identify opening brace as code block
 1366     $brace_structural_type[$brace_depth]                   = '';
 1367     $brace_context[$brace_depth]                           = UNKNOWN_CONTEXT;
 1368     $brace_package[$paren_depth]                           = $current_package;
 1369     $square_bracket_type[$square_bracket_depth]            = '';
 1370     $square_bracket_structural_type[$square_bracket_depth] = '';
 1371 
 1372     initialize_tokenizer_state();
 1373     return;
 1374 }
 1375 
 1376 {    ## closure for sub tokenize_this_line
 1377 
 1378     use constant BRACE          => 0;
 1379     use constant SQUARE_BRACKET => 1;
 1380     use constant PAREN          => 2;
 1381     use constant QUESTION_COLON => 3;
 1382 
 1383     # TV1: scalars for processing one LINE.
 1384     # Re-initialized on each entry to sub tokenize_this_line.
 1385     my (
 1386         $block_type,        $container_type,    $expecting,
 1387         $i,                 $i_tok,             $input_line,
 1388         $input_line_number, $last_nonblank_i,   $max_token_index,
 1389         $next_tok,          $next_type,         $peeked_ahead,
 1390         $prototype,         $rhere_target_list, $rtoken_map,
 1391         $rtoken_type,       $rtokens,           $tok,
 1392         $type,              $type_sequence,     $indent_flag,
 1393     );
 1394 
 1395     # TV2: refs to ARRAYS for processing one LINE
 1396     # Re-initialized on each call.
 1397     my $routput_token_list     = [];    # stack of output token indexes
 1398     my $routput_token_type     = [];    # token types
 1399     my $routput_block_type     = [];    # types of code block
 1400     my $routput_container_type = [];    # paren types, such as if, elsif, ..
 1401     my $routput_type_sequence  = [];    # nesting sequential number
 1402     my $routput_indent_flag    = [];    #
 1403 
 1404     # TV3: SCALARS for quote variables.  These are initialized with a
 1405     # subroutine call and continually updated as lines are processed.
 1406     my ( $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
 1407         $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers, );
 1408 
 1409     # TV4: SCALARS for multi-line identifiers and
 1410     # statements. These are initialized with a subroutine call
 1411     # and continually updated as lines are processed.
 1412     my ( $id_scan_state, $identifier, $want_paren, $indented_if_level );
 1413 
 1414     # TV5: SCALARS for tracking indentation level.
 1415     # Initialized once and continually updated as lines are
 1416     # processed.
 1417     my (
 1418         $nesting_token_string,      $nesting_type_string,
 1419         $nesting_block_string,      $nesting_block_flag,
 1420         $nesting_list_string,       $nesting_list_flag,
 1421         $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1422         $in_statement_continuation, $level_in_tokenizer,
 1423         $slevel_in_tokenizer,       $rslevel_stack,
 1424     );
 1425 
 1426     # TV6: SCALARS for remembering several previous
 1427     # tokens. Initialized once and continually updated as
 1428     # lines are processed.
 1429     my (
 1430         $last_nonblank_container_type,     $last_nonblank_type_sequence,
 1431         $last_last_nonblank_token,         $last_last_nonblank_type,
 1432         $last_last_nonblank_block_type,    $last_last_nonblank_container_type,
 1433         $last_last_nonblank_type_sequence, $last_nonblank_prototype,
 1434     );
 1435 
 1436     # ----------------------------------------------------------------
 1437     # beginning of tokenizer variable access and manipulation routines
 1438     # ----------------------------------------------------------------
 1439 
 1440     sub initialize_tokenizer_state {
 1441 
 1442         # TV1: initialized on each call
 1443         # TV2: initialized on each call
 1444         # TV3:
 1445         $in_quote                = 0;
 1446         $quote_type              = 'Q';
 1447         $quote_character         = "";
 1448         $quote_pos               = 0;
 1449         $quote_depth             = 0;
 1450         $quoted_string_1         = "";
 1451         $quoted_string_2         = "";
 1452         $allowed_quote_modifiers = "";
 1453 
 1454         # TV4:
 1455         $id_scan_state     = '';
 1456         $identifier        = '';
 1457         $want_paren        = "";
 1458         $indented_if_level = 0;
 1459 
 1460         # TV5:
 1461         $nesting_token_string             = "";
 1462         $nesting_type_string              = "";
 1463         $nesting_block_string             = '1';    # initially in a block
 1464         $nesting_block_flag               = 1;
 1465         $nesting_list_string              = '0';    # initially not in a list
 1466         $nesting_list_flag                = 0;      # initially not in a list
 1467         $ci_string_in_tokenizer           = "";
 1468         $continuation_string_in_tokenizer = "0";
 1469         $in_statement_continuation        = 0;
 1470         $level_in_tokenizer               = 0;
 1471         $slevel_in_tokenizer              = 0;
 1472         $rslevel_stack                    = [];
 1473 
 1474         # TV6:
 1475         $last_nonblank_container_type      = '';
 1476         $last_nonblank_type_sequence       = '';
 1477         $last_last_nonblank_token          = ';';
 1478         $last_last_nonblank_type           = ';';
 1479         $last_last_nonblank_block_type     = '';
 1480         $last_last_nonblank_container_type = '';
 1481         $last_last_nonblank_type_sequence  = '';
 1482         $last_nonblank_prototype           = "";
 1483         return;
 1484     }
 1485 
 1486     sub save_tokenizer_state {
 1487 
 1488         my $rTV1 = [
 1489             $block_type,        $container_type,    $expecting,
 1490             $i,                 $i_tok,             $input_line,
 1491             $input_line_number, $last_nonblank_i,   $max_token_index,
 1492             $next_tok,          $next_type,         $peeked_ahead,
 1493             $prototype,         $rhere_target_list, $rtoken_map,
 1494             $rtoken_type,       $rtokens,           $tok,
 1495             $type,              $type_sequence,     $indent_flag,
 1496         ];
 1497 
 1498         my $rTV2 = [
 1499             $routput_token_list,    $routput_token_type,
 1500             $routput_block_type,    $routput_container_type,
 1501             $routput_type_sequence, $routput_indent_flag,
 1502         ];
 1503 
 1504         my $rTV3 = [
 1505             $in_quote,        $quote_type,
 1506             $quote_character, $quote_pos,
 1507             $quote_depth,     $quoted_string_1,
 1508             $quoted_string_2, $allowed_quote_modifiers,
 1509         ];
 1510 
 1511         my $rTV4 =
 1512           [ $id_scan_state, $identifier, $want_paren, $indented_if_level ];
 1513 
 1514         my $rTV5 = [
 1515             $nesting_token_string,      $nesting_type_string,
 1516             $nesting_block_string,      $nesting_block_flag,
 1517             $nesting_list_string,       $nesting_list_flag,
 1518             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1519             $in_statement_continuation, $level_in_tokenizer,
 1520             $slevel_in_tokenizer,       $rslevel_stack,
 1521         ];
 1522 
 1523         my $rTV6 = [
 1524             $last_nonblank_container_type,
 1525             $last_nonblank_type_sequence,
 1526             $last_last_nonblank_token,
 1527             $last_last_nonblank_type,
 1528             $last_last_nonblank_block_type,
 1529             $last_last_nonblank_container_type,
 1530             $last_last_nonblank_type_sequence,
 1531             $last_nonblank_prototype,
 1532         ];
 1533         return [ $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ];
 1534     }
 1535 
 1536     sub restore_tokenizer_state {
 1537         my ($rstate) = @_;
 1538         my ( $rTV1, $rTV2, $rTV3, $rTV4, $rTV5, $rTV6 ) = @{$rstate};
 1539         (
 1540             $block_type,        $container_type,    $expecting,
 1541             $i,                 $i_tok,             $input_line,
 1542             $input_line_number, $last_nonblank_i,   $max_token_index,
 1543             $next_tok,          $next_type,         $peeked_ahead,
 1544             $prototype,         $rhere_target_list, $rtoken_map,
 1545             $rtoken_type,       $rtokens,           $tok,
 1546             $type,              $type_sequence,     $indent_flag,
 1547         ) = @{$rTV1};
 1548 
 1549         (
 1550             $routput_token_list,    $routput_token_type,
 1551             $routput_block_type,    $routput_container_type,
 1552             $routput_type_sequence, $routput_type_sequence,
 1553         ) = @{$rTV2};
 1554 
 1555         (
 1556             $in_quote, $quote_type, $quote_character, $quote_pos, $quote_depth,
 1557             $quoted_string_1, $quoted_string_2, $allowed_quote_modifiers,
 1558         ) = @{$rTV3};
 1559 
 1560         ( $id_scan_state, $identifier, $want_paren, $indented_if_level ) =
 1561           @{$rTV4};
 1562 
 1563         (
 1564             $nesting_token_string,      $nesting_type_string,
 1565             $nesting_block_string,      $nesting_block_flag,
 1566             $nesting_list_string,       $nesting_list_flag,
 1567             $ci_string_in_tokenizer,    $continuation_string_in_tokenizer,
 1568             $in_statement_continuation, $level_in_tokenizer,
 1569             $slevel_in_tokenizer,       $rslevel_stack,
 1570         ) = @{$rTV5};
 1571 
 1572         (
 1573             $last_nonblank_container_type,
 1574             $last_nonblank_type_sequence,
 1575             $last_last_nonblank_token,
 1576             $last_last_nonblank_type,
 1577             $last_last_nonblank_block_type,
 1578             $last_last_nonblank_container_type,
 1579             $last_last_nonblank_type_sequence,
 1580             $last_nonblank_prototype,
 1581         ) = @{$rTV6};
 1582         return;
 1583     }
 1584 
 1585     sub get_indentation_level {
 1586 
 1587         # patch to avoid reporting error if indented if is not terminated
 1588         if ($indented_if_level) { return $level_in_tokenizer - 1 }
 1589         return $level_in_tokenizer;
 1590     }
 1591 
 1592     sub reset_indentation_level {
 1593         $level_in_tokenizer = $slevel_in_tokenizer = shift;
 1594         push @{$rslevel_stack}, $slevel_in_tokenizer;
 1595         return;
 1596     }
 1597 
 1598     sub peeked_ahead {
 1599         my $flag = shift;
 1600         $peeked_ahead = defined($flag) ? $flag : $peeked_ahead;
 1601         return $peeked_ahead;
 1602     }
 1603 
 1604     # ------------------------------------------------------------
 1605     # end of tokenizer variable access and manipulation routines
 1606     # ------------------------------------------------------------
 1607 
 1608     # ------------------------------------------------------------
 1609     # beginning of various scanner interface routines
 1610     # ------------------------------------------------------------
 1611     sub scan_replacement_text {
 1612 
 1613         # check for here-docs in replacement text invoked by
 1614         # a substitution operator with executable modifier 'e'.
 1615         #
 1616         # given:
 1617         #  $replacement_text
 1618         # return:
 1619         #  $rht = reference to any here-doc targets
 1620         my ($replacement_text) = @_;
 1621 
 1622         # quick check
 1623         return unless ( $replacement_text =~ /<</ );
 1624 
 1625         write_logfile_entry("scanning replacement text for here-doc targets\n");
 1626 
 1627         # save the logger object for error messages
 1628         my $logger_object = $tokenizer_self->[_logger_object_];
 1629 
 1630         # localize all package variables
 1631         local (
 1632             $tokenizer_self,                 $last_nonblank_token,
 1633             $last_nonblank_type,             $last_nonblank_block_type,
 1634             $statement_type,                 $in_attribute_list,
 1635             $current_package,                $context,
 1636             %is_constant,                    %is_user_function,
 1637             %user_function_prototype,        %is_block_function,
 1638             %is_block_list_function,         %saw_function_definition,
 1639             $brace_depth,                    $paren_depth,
 1640             $square_bracket_depth,           @current_depth,
 1641             @total_depth,                    $total_depth,
 1642             @nesting_sequence_number,        @current_sequence_number,
 1643             @paren_type,                     @paren_semicolon_count,
 1644             @paren_structural_type,          @brace_type,
 1645             @brace_structural_type,          @brace_context,
 1646             @brace_package,                  @square_bracket_type,
 1647             @square_bracket_structural_type, @depth_array,
 1648             @starting_line_of_current_depth, @nested_ternary_flag,
 1649             @nested_statement_type,
 1650         );
 1651 
 1652         # save all lexical variables
 1653         my $rstate = save_tokenizer_state();
 1654         _decrement_count();    # avoid error check for multiple tokenizers
 1655 
 1656         # make a new tokenizer
 1657         my $rOpts = {};
 1658         my $rpending_logfile_message;
 1659         my $source_object = Perl::Tidy::LineSource->new(
 1660             input_file               => \$replacement_text,
 1661             rOpts                    => $rOpts,
 1662             rpending_logfile_message => $rpending_logfile_message,
 1663         );
 1664         my $tokenizer = Perl::Tidy::Tokenizer->new(
 1665             source_object        => $source_object,
 1666             logger_object        => $logger_object,
 1667             starting_line_number => $input_line_number,
 1668         );
 1669 
 1670         # scan the replacement text
 1671         1 while ( $tokenizer->get_line() );
 1672 
 1673         # remove any here doc targets
 1674         my $rht = undef;
 1675         if ( $tokenizer_self->[_in_here_doc_] ) {
 1676             $rht = [];
 1677             push @{$rht},
 1678               [
 1679                 $tokenizer_self->[_here_doc_target_],
 1680                 $tokenizer_self->[_here_quote_character_]
 1681               ];
 1682             if ( $tokenizer_self->[_rhere_target_list_] ) {
 1683                 push @{$rht}, @{ $tokenizer_self->[_rhere_target_list_] };
 1684                 $tokenizer_self->[_rhere_target_list_] = undef;
 1685             }
 1686             $tokenizer_self->[_in_here_doc_] = undef;
 1687         }
 1688 
 1689         # now its safe to report errors
 1690         my $severe_error = $tokenizer->report_tokenization_errors();
 1691 
 1692         # TODO: Could propagate a severe error up
 1693 
 1694         # restore all tokenizer lexical variables
 1695         restore_tokenizer_state($rstate);
 1696 
 1697         # return the here doc targets
 1698         return $rht;
 1699     }
 1700 
 1701     sub scan_bare_identifier {
 1702         ( $i, $tok, $type, $prototype ) =
 1703           scan_bare_identifier_do( $input_line, $i, $tok, $type, $prototype,
 1704             $rtoken_map, $max_token_index );
 1705         return;
 1706     }
 1707 
 1708     sub scan_identifier {
 1709         ( $i, $tok, $type, $id_scan_state, $identifier ) =
 1710           scan_identifier_do( $i, $id_scan_state, $identifier, $rtokens,
 1711             $max_token_index, $expecting, $paren_type[$paren_depth] );
 1712         return;
 1713     }
 1714 
 1715     use constant VERIFY_FASTSCAN => 0;
 1716     my %fast_scan_context;
 1717 
 1718     BEGIN {
 1719         %fast_scan_context = (
 1720             '$' => SCALAR_CONTEXT,
 1721             '*' => SCALAR_CONTEXT,
 1722             '@' => LIST_CONTEXT,
 1723             '%' => LIST_CONTEXT,
 1724             '&' => UNKNOWN_CONTEXT,
 1725         );
 1726     }
 1727 
 1728     sub scan_identifier_fast {
 1729 
 1730         # This is a wrapper for sub scan_identifier. It does a fast preliminary
 1731         # scan for certain common identifiers:
 1732         #   '$var', '@var', %var, *var, &var, '@{...}', '%{...}'
 1733         # If it does not find one of these, or this is a restart, it calls the
 1734         # original scanner directly.
 1735 
 1736         # This gives the same results as the full scanner in about 1/4 the
 1737         # total runtime for a typical input stream.
 1738 
 1739         my $i_begin   = $i;
 1740         my $tok_begin = $tok;
 1741         my $fast_scan_type;
 1742 
 1743         ###############################
 1744         # quick scan with leading sigil
 1745         ###############################
 1746         if (  !$id_scan_state
 1747             && $i + 1 <= $max_token_index
 1748             && $fast_scan_context{$tok} )
 1749         {
 1750             $context = $fast_scan_context{$tok};
 1751 
 1752             # look for $var, @var, ...
 1753             if ( $rtoken_type->[ $i + 1 ] eq 'w' ) {
 1754                 my $pretype_next = "";
 1755                 my $i_next       = $i + 2;
 1756                 if ( $i_next <= $max_token_index ) {
 1757                     if (   $rtoken_type->[$i_next] eq 'b'
 1758                         && $i_next < $max_token_index )
 1759                     {
 1760                         $i_next += 1;
 1761                     }
 1762                     $pretype_next = $rtoken_type->[$i_next];
 1763                 }
 1764                 if ( $pretype_next ne ':' && $pretype_next ne "'" ) {
 1765 
 1766                     # Found type 'i' like '$var', '@var', or '%var'
 1767                     $identifier     = $tok . $rtokens->[ $i + 1 ];
 1768                     $tok            = $identifier;
 1769                     $type           = 'i';
 1770                     $i              = $i + 1;
 1771                     $fast_scan_type = $type;
 1772                 }
 1773             }
 1774 
 1775             # Look for @{ or %{  .
 1776             # But we must let the full scanner handle things ${ because it may
 1777             # keep going to get a complete identifier like '${#}'  .
 1778             elsif (
 1779                 $rtoken_type->[ $i + 1 ] eq '{'
 1780                 && (   $tok_begin eq '@'
 1781                     || $tok_begin eq '%' )
 1782               )
 1783             {
 1784 
 1785                 $identifier     = $tok;
 1786                 $type           = 't';
 1787                 $fast_scan_type = $type;
 1788             }
 1789         }
 1790 
 1791         ############################
 1792         # Quick scan with leading ->
 1793         # Look for ->[ and ->{
 1794         ############################
 1795         elsif (
 1796                $tok eq '->'
 1797             && $i < $max_token_index
 1798             && (   $rtokens->[ $i + 1 ] eq '{'
 1799                 || $rtokens->[ $i + 1 ] eq '[' )
 1800           )
 1801         {
 1802             $type           = $tok;
 1803             $fast_scan_type = $type;
 1804             $identifier     = $tok;
 1805             $context        = UNKNOWN_CONTEXT;
 1806         }
 1807 
 1808         #######################################
 1809         # Verify correctness during development
 1810         #######################################
 1811         if ( VERIFY_FASTSCAN && $fast_scan_type ) {
 1812 
 1813             # We will call the full method
 1814             my $identifier_simple = $identifier;
 1815             my $tok_simple        = $tok;
 1816             my $fast_scan_type    = $type;
 1817             my $i_simple          = $i;
 1818             my $context_simple    = $context;
 1819 
 1820             $tok = $tok_begin;
 1821             $i   = $i_begin;
 1822             scan_identifier();
 1823 
 1824             if (   $tok ne $tok_simple
 1825                 || $type ne $fast_scan_type
 1826                 || $i != $i_simple
 1827                 || $identifier ne $identifier_simple
 1828                 || $id_scan_state
 1829                 || $context ne $context_simple )
 1830             {
 1831                 print STDERR <<EOM;
 1832 scan_identifier_fast differs from scan_identifier:
 1833 simple:  i=$i_simple, tok=$tok_simple, type=$fast_scan_type, ident=$identifier_simple, context='$context_simple
 1834 full:    i=$i, tok=$tok, type=$type, ident=$identifier, context='$context state=$id_scan_state
 1835 EOM
 1836             }
 1837         }
 1838 
 1839         ###################################################
 1840         # call full scanner if fast method did not succeed
 1841         ###################################################
 1842         if ( !$fast_scan_type ) {
 1843             scan_identifier();
 1844         }
 1845         return;
 1846     }
 1847 
 1848     sub scan_id {
 1849         ( $i, $tok, $type, $id_scan_state ) =
 1850           scan_id_do( $input_line, $i, $tok, $rtokens, $rtoken_map,
 1851             $id_scan_state, $max_token_index );
 1852         return;
 1853     }
 1854 
 1855     sub scan_number {
 1856         my $number;
 1857         ( $i, $type, $number ) =
 1858           scan_number_do( $input_line, $i, $rtoken_map, $type,
 1859             $max_token_index );
 1860         return $number;
 1861     }
 1862 
 1863     use constant VERIFY_FASTNUM => 0;
 1864 
 1865     sub scan_number_fast {
 1866 
 1867         # This is a wrapper for sub scan_number. It does a fast preliminary
 1868         # scan for a simple integer.  It calls the original scan_number if it
 1869         # does not find one.
 1870 
 1871         my $i_begin   = $i;
 1872         my $tok_begin = $tok;
 1873         my $number;
 1874 
 1875         ##################################
 1876         # Quick check for (signed) integer
 1877         ##################################
 1878 
 1879         # This will be the string of digits:
 1880         my $i_d   = $i;
 1881         my $tok_d = $tok;
 1882         my $typ_d = $rtoken_type->[$i_d];
 1883 
 1884         # check for signed integer
 1885         my $sign = "";
 1886         if (   $typ_d ne 'd'
 1887             && ( $typ_d eq '+' || $typ_d eq '-' )
 1888             && $i_d < $max_token_index )
 1889         {
 1890             $sign = $tok_d;
 1891             $i_d++;
 1892             $tok_d = $rtokens->[$i_d];
 1893             $typ_d = $rtoken_type->[$i_d];
 1894         }
 1895 
 1896         # Handle integers
 1897         if (
 1898             $typ_d eq 'd'
 1899             && (
 1900                 $i_d == $max_token_index
 1901                 || (   $i_d < $max_token_index
 1902                     && $rtoken_type->[ $i_d + 1 ] ne '.'
 1903                     && $rtoken_type->[ $i_d + 1 ] ne 'w' )
 1904             )
 1905           )
 1906         {
 1907             # Let let full scanner handle multi-digit integers beginning with
 1908             # '0' because there could be error messages.  For example, '009' is
 1909             # not a valid number.
 1910 
 1911             if ( $tok_d eq '0' || substr( $tok_d, 0, 1 ) ne '0' ) {
 1912                 $number = $sign . $tok_d;
 1913                 $type   = 'n';
 1914                 $i      = $i_d;
 1915             }
 1916         }
 1917 
 1918         #######################################
 1919         # Verify correctness during development
 1920         #######################################
 1921         if ( VERIFY_FASTNUM && defined($number) ) {
 1922 
 1923             # We will call the full method
 1924             my $type_simple   = $type;
 1925             my $i_simple      = $i;
 1926             my $number_simple = $number;
 1927 
 1928             $tok    = $tok_begin;
 1929             $i      = $i_begin;
 1930             $number = scan_number();
 1931 
 1932             if (   $type ne $type_simple
 1933                 || ( $i != $i_simple && $i <= $max_token_index )
 1934                 || $number ne $number_simple )
 1935             {
 1936                 print STDERR <<EOM;
 1937 scan_number_fast differs from scan_number:
 1938 simple:  i=$i_simple, type=$type_simple, number=$number_simple
 1939 full:  i=$i, type=$type, number=$number
 1940 EOM
 1941             }
 1942         }
 1943 
 1944         #########################################
 1945         # call full scanner if may not be integer
 1946         #########################################
 1947         if ( !defined($number) ) {
 1948             $number = scan_number();
 1949         }
 1950         return $number;
 1951     }
 1952 
 1953     # a sub to warn if token found where term expected
 1954     sub error_if_expecting_TERM {
 1955         if ( $expecting == TERM ) {
 1956             if ( $really_want_term{$last_nonblank_type} ) {
 1957                 report_unexpected( $tok, "term", $i_tok, $last_nonblank_i,
 1958                     $rtoken_map, $rtoken_type, $input_line );
 1959                 return 1;
 1960             }
 1961         }
 1962         return;
 1963     }
 1964 
 1965     # a sub to warn if token found where operator expected
 1966     sub error_if_expecting_OPERATOR {
 1967         my $thing = shift;
 1968         if ( $expecting == OPERATOR ) {
 1969             if ( !defined($thing) ) { $thing = $tok }
 1970             report_unexpected( $thing, "operator", $i_tok, $last_nonblank_i,
 1971                 $rtoken_map, $rtoken_type, $input_line );
 1972             if ( $i_tok == 0 ) {
 1973                 interrupt_logfile();
 1974                 warning("Missing ';' or ',' above?\n");
 1975                 resume_logfile();
 1976             }
 1977             return 1;
 1978         }
 1979         return;
 1980     }
 1981 
 1982     # ------------------------------------------------------------
 1983     # end scanner interfaces
 1984     # ------------------------------------------------------------
 1985 
 1986     my %is_for_foreach;
 1987     @_ = qw(for foreach);
 1988     @is_for_foreach{@_} = (1) x scalar(@_);
 1989 
 1990     my %is_my_our_state;
 1991     @_ = qw(my our state);
 1992     @is_my_our_state{@_} = (1) x scalar(@_);
 1993 
 1994     # These keywords may introduce blocks after parenthesized expressions,
 1995     # in the form:
 1996     # keyword ( .... ) { BLOCK }
 1997     # patch for SWITCH/CASE: added 'switch' 'case' 'given' 'when'
 1998     my %is_blocktype_with_paren;
 1999     @_ =
 2000       qw(if elsif unless while until for foreach switch case given when catch);
 2001     @is_blocktype_with_paren{@_} = (1) x scalar(@_);
 2002 
 2003     my %is_case_default;
 2004     @_ = qw(case default);
 2005     @is_case_default{@_} = (1) x scalar(@_);
 2006 
 2007     # ------------------------------------------------------------
 2008     # begin hash of code for handling most token types
 2009     # ------------------------------------------------------------
 2010     my $tokenization_code = {
 2011 
 2012         # no special code for these types yet, but syntax checks
 2013         # could be added
 2014 
 2015 ##      '!'   => undef,
 2016 ##      '!='  => undef,
 2017 ##      '!~'  => undef,
 2018 ##      '%='  => undef,
 2019 ##      '&&=' => undef,
 2020 ##      '&='  => undef,
 2021 ##      '+='  => undef,
 2022 ##      '-='  => undef,
 2023 ##      '..'  => undef,
 2024 ##      '..'  => undef,
 2025 ##      '...' => undef,
 2026 ##      '.='  => undef,
 2027 ##      '<<=' => undef,
 2028 ##      '<='  => undef,
 2029 ##      '<=>' => undef,
 2030 ##      '<>'  => undef,
 2031 ##      '='   => undef,
 2032 ##      '=='  => undef,
 2033 ##      '=~'  => undef,
 2034 ##      '>='  => undef,
 2035 ##      '>>'  => undef,
 2036 ##      '>>=' => undef,
 2037 ##      '\\'  => undef,
 2038 ##      '^='  => undef,
 2039 ##      '|='  => undef,
 2040 ##      '||=' => undef,
 2041 ##      '//=' => undef,
 2042 ##      '~'   => undef,
 2043 ##      '~~'  => undef,
 2044 ##      '!~~'  => undef,
 2045 
 2046         '>' => sub {
 2047             error_if_expecting_TERM()
 2048               if ( $expecting == TERM );
 2049         },
 2050         '|' => sub {
 2051             error_if_expecting_TERM()
 2052               if ( $expecting == TERM );
 2053         },
 2054         '$' => sub {
 2055 
 2056             # start looking for a scalar
 2057             error_if_expecting_OPERATOR("Scalar")
 2058               if ( $expecting == OPERATOR );
 2059             scan_identifier_fast();
 2060 
 2061             if ( $identifier eq '$^W' ) {
 2062                 $tokenizer_self->[_saw_perl_dash_w_] = 1;
 2063             }
 2064 
 2065             # Check for identifier in indirect object slot
 2066             # (vorboard.pl, sort.t).  Something like:
 2067             #   /^(print|printf|sort|exec|system)$/
 2068             if (
 2069                 $is_indirect_object_taker{$last_nonblank_token}
 2070                 || ( ( $last_nonblank_token eq '(' )
 2071                     && $is_indirect_object_taker{ $paren_type[$paren_depth] } )
 2072                 || (   $last_nonblank_type eq 'w'
 2073                     || $last_nonblank_type eq 'U' )    # possible object
 2074               )
 2075             {
 2076                 $type = 'Z';
 2077             }
 2078         },
 2079         '(' => sub {
 2080 
 2081             ++$paren_depth;
 2082             $paren_semicolon_count[$paren_depth] = 0;
 2083             if ($want_paren) {
 2084                 $container_type = $want_paren;
 2085                 $want_paren     = "";
 2086             }
 2087             elsif ( $statement_type =~ /^sub\b/ ) {
 2088                 $container_type = $statement_type;
 2089             }
 2090             else {
 2091                 $container_type = $last_nonblank_token;
 2092 
 2093                 # We can check for a syntax error here of unexpected '(',
 2094                 # but this is going to get messy...
 2095                 if (
 2096                     $expecting == OPERATOR
 2097 
 2098                     # Be sure this is not a method call of the form
 2099                     # &method(...), $method->(..), &{method}(...),
 2100                     # $ref[2](list) is ok & short for $ref[2]->(list)
 2101                     # NOTE: at present, braces in something like &{ xxx }
 2102                     # are not marked as a block, we might have a method call.
 2103                     # Added ')' to fix case c017, something like ()()()
 2104                     && $last_nonblank_token !~ /^([\]\}\)\&]|\-\>)/
 2105 
 2106                   )
 2107                 {
 2108 
 2109                     # ref: camel 3 p 703.
 2110                     if ( $last_last_nonblank_token eq 'do' ) {
 2111                         complain(
 2112 "do SUBROUTINE is deprecated; consider & or -> notation\n"
 2113                         );
 2114                     }
 2115                     else {
 2116 
 2117                         # if this is an empty list, (), then it is not an
 2118                         # error; for example, we might have a constant pi and
 2119                         # invoke it with pi() or just pi;
 2120                         my ( $next_nonblank_token, $i_next ) =
 2121                           find_next_nonblank_token( $i, $rtokens,
 2122                             $max_token_index );
 2123 
 2124                         # Patch for c029: give up error check if
 2125                         # a side comment follows
 2126                         if (   $next_nonblank_token ne ')'
 2127                             && $next_nonblank_token ne '#' )
 2128                         {
 2129                             my $hint;
 2130 
 2131                             error_if_expecting_OPERATOR('(');
 2132 
 2133                             if ( $last_nonblank_type eq 'C' ) {
 2134                                 $hint =
 2135                                   "$last_nonblank_token has a void prototype\n";
 2136                             }
 2137                             elsif ( $last_nonblank_type eq 'i' ) {
 2138                                 if (   $i_tok > 0
 2139                                     && $last_nonblank_token =~ /^\$/ )
 2140                                 {
 2141                                     $hint =
 2142 "Do you mean '$last_nonblank_token->(' ?\n";
 2143                                 }
 2144                             }
 2145                             if ($hint) {
 2146                                 interrupt_logfile();
 2147                                 warning($hint);
 2148                                 resume_logfile();
 2149                             }
 2150                         } ## end if ( $next_nonblank_token...
 2151                     } ## end else [ if ( $last_last_nonblank_token...
 2152                 } ## end if ( $expecting == OPERATOR...
 2153             }
 2154             $paren_type[$paren_depth] = $container_type;
 2155             ( $type_sequence, $indent_flag ) =
 2156               increase_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
 2157 
 2158             # propagate types down through nested parens
 2159             # for example: the second paren in 'if ((' would be structural
 2160             # since the first is.
 2161 
 2162             if ( $last_nonblank_token eq '(' ) {
 2163                 $type = $last_nonblank_type;
 2164             }
 2165 
 2166             #     We exclude parens as structural after a ',' because it
 2167             #     causes subtle problems with continuation indentation for
 2168             #     something like this, where the first 'or' will not get
 2169             #     indented.
 2170             #
 2171             #         assert(
 2172             #             __LINE__,
 2173             #             ( not defined $check )
 2174             #               or ref $check
 2175             #               or $check eq "new"
 2176             #               or $check eq "old",
 2177             #         );
 2178             #
 2179             #     Likewise, we exclude parens where a statement can start
 2180             #     because of problems with continuation indentation, like
 2181             #     these:
 2182             #
 2183             #         ($firstline =~ /^#\!.*perl/)
 2184             #         and (print $File::Find::name, "\n")
 2185             #           and (return 1);
 2186             #
 2187             #         (ref($usage_fref) =~ /CODE/)
 2188             #         ? &$usage_fref
 2189             #           : (&blast_usage, &blast_params, &blast_general_params);
 2190 
 2191             else {
 2192                 $type = '{';
 2193             }
 2194 
 2195             if ( $last_nonblank_type eq ')' ) {
 2196                 warning(
 2197                     "Syntax error? found token '$last_nonblank_type' then '('\n"
 2198                 );
 2199             }
 2200             $paren_structural_type[$paren_depth] = $type;
 2201 
 2202         },
 2203         ')' => sub {
 2204             ( $type_sequence, $indent_flag ) =
 2205               decrease_nesting_depth( PAREN, $rtoken_map->[$i_tok] );
 2206 
 2207             if ( $paren_structural_type[$paren_depth] eq '{' ) {
 2208                 $type = '}';
 2209             }
 2210 
 2211             $container_type = $paren_type[$paren_depth];
 2212 
 2213             # restore statement type as 'sub' at closing paren of a signature
 2214             # so that a subsequent ':' is identified as an attribute
 2215             if ( $container_type =~ /^sub\b/ ) {
 2216                 $statement_type = $container_type;
 2217             }
 2218 
 2219             #    /^(for|foreach)$/
 2220             if ( $is_for_foreach{ $paren_type[$paren_depth] } ) {
 2221                 my $num_sc = $paren_semicolon_count[$paren_depth];
 2222                 if ( $num_sc > 0 && $num_sc != 2 ) {
 2223                     warning("Expected 2 ';' in 'for(;;)' but saw $num_sc\n");
 2224                 }
 2225             }
 2226 
 2227             if ( $paren_depth > 0 ) { $paren_depth-- }
 2228         },
 2229         ',' => sub {
 2230             if ( $last_nonblank_type eq ',' ) {
 2231                 complain("Repeated ','s \n");
 2232             }
 2233 
 2234             # Note that we have to check both token and type here because a
 2235             # comma following a qw list can have last token='(' but type = 'q'
 2236             elsif ( $last_nonblank_token eq '(' && $last_nonblank_type eq '{' )
 2237             {
 2238                 warning("Unexpected leading ',' after a '('\n");
 2239             }
 2240 
 2241             # patch for operator_expected: note if we are in the list (use.t)
 2242             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
 2243 
 2244         },
 2245         ';' => sub {
 2246             $context        = UNKNOWN_CONTEXT;
 2247             $statement_type = '';
 2248             $want_paren     = "";
 2249 
 2250             #    /^(for|foreach)$/
 2251             if ( $is_for_foreach{ $paren_type[$paren_depth] } )
 2252             {    # mark ; in for loop
 2253 
 2254                 # Be careful: we do not want a semicolon such as the
 2255                 # following to be included:
 2256                 #
 2257                 #    for (sort {strcoll($a,$b);} keys %investments) {
 2258 
 2259                 if (   $brace_depth == $depth_array[PAREN][BRACE][$paren_depth]
 2260                     && $square_bracket_depth ==
 2261                     $depth_array[PAREN][SQUARE_BRACKET][$paren_depth] )
 2262                 {
 2263 
 2264                     $type = 'f';
 2265                     $paren_semicolon_count[$paren_depth]++;
 2266                 }
 2267             }
 2268 
 2269         },
 2270         '"' => sub {
 2271             error_if_expecting_OPERATOR("String")
 2272               if ( $expecting == OPERATOR );
 2273             $in_quote                = 1;
 2274             $type                    = 'Q';
 2275             $allowed_quote_modifiers = "";
 2276         },
 2277         "'" => sub {
 2278             error_if_expecting_OPERATOR("String")
 2279               if ( $expecting == OPERATOR );
 2280             $in_quote                = 1;
 2281             $type                    = 'Q';
 2282             $allowed_quote_modifiers = "";
 2283         },
 2284         '`' => sub {
 2285             error_if_expecting_OPERATOR("String")
 2286               if ( $expecting == OPERATOR );
 2287             $in_quote                = 1;
 2288             $type                    = 'Q';
 2289             $allowed_quote_modifiers = "";
 2290         },
 2291         '/' => sub {
 2292             my $is_pattern;
 2293 
 2294             # a pattern cannot follow certain keywords which take optional
 2295             # arguments, like 'shift' and 'pop'. See also '?'.
 2296             if (
 2297                 $last_nonblank_type eq 'k'
 2298                 && $is_keyword_rejecting_slash_as_pattern_delimiter{
 2299                     $last_nonblank_token}
 2300               )
 2301             {
 2302                 $is_pattern = 0;
 2303             }
 2304             elsif ( $expecting == UNKNOWN ) {    # indeterminate, must guess..
 2305                 my $msg;
 2306                 ( $is_pattern, $msg ) =
 2307                   guess_if_pattern_or_division( $i, $rtokens, $rtoken_map,
 2308                     $max_token_index );
 2309 
 2310                 if ($msg) {
 2311                     write_diagnostics("DIVIDE:$msg\n");
 2312                     write_logfile_entry($msg);
 2313                 }
 2314             }
 2315             else { $is_pattern = ( $expecting == TERM ) }
 2316 
 2317             if ($is_pattern) {
 2318                 $in_quote                = 1;
 2319                 $type                    = 'Q';
 2320                 $allowed_quote_modifiers = '[msixpodualngc]';
 2321             }
 2322             else {    # not a pattern; check for a /= token
 2323 
 2324                 if ( $rtokens->[ $i + 1 ] eq '=' ) {    # form token /=
 2325                     $i++;
 2326                     $tok  = '/=';
 2327                     $type = $tok;
 2328                 }
 2329 
 2330            #DEBUG - collecting info on what tokens follow a divide
 2331            # for development of guessing algorithm
 2332            #if ( is_possible_numerator( $i, $rtokens, $max_token_index ) < 0 ) {
 2333            #    #write_diagnostics( "DIVIDE? $input_line\n" );
 2334            #}
 2335             }
 2336         },
 2337         '{' => sub {
 2338 
 2339             # if we just saw a ')', we will label this block with
 2340             # its type.  We need to do this to allow sub
 2341             # code_block_type to determine if this brace starts a
 2342             # code block or anonymous hash.  (The type of a paren
 2343             # pair is the preceding token, such as 'if', 'else',
 2344             # etc).
 2345             $container_type = "";
 2346 
 2347             # ATTRS: for a '{' following an attribute list, reset
 2348             # things to look like we just saw the sub name
 2349             if ( $statement_type =~ /^sub\b/ ) {
 2350                 $last_nonblank_token = $statement_type;
 2351                 $last_nonblank_type  = 'i';
 2352                 $statement_type      = "";
 2353             }
 2354 
 2355             # patch for SWITCH/CASE: hide these keywords from an immediately
 2356             # following opening brace
 2357             elsif ( ( $statement_type eq 'case' || $statement_type eq 'when' )
 2358                 && $statement_type eq $last_nonblank_token )
 2359             {
 2360                 $last_nonblank_token = ";";
 2361             }
 2362 
 2363             elsif ( $last_nonblank_token eq ')' ) {
 2364                 $last_nonblank_token = $paren_type[ $paren_depth + 1 ];
 2365 
 2366                 # defensive move in case of a nesting error (pbug.t)
 2367                 # in which this ')' had no previous '('
 2368                 # this nesting error will have been caught
 2369                 if ( !defined($last_nonblank_token) ) {
 2370                     $last_nonblank_token = 'if';
 2371                 }
 2372 
 2373                 # check for syntax error here;
 2374                 unless ( $is_blocktype_with_paren{$last_nonblank_token} ) {
 2375                     if ( $tokenizer_self->[_extended_syntax_] ) {
 2376 
 2377                         # we append a trailing () to mark this as an unknown
 2378                         # block type.  This allows perltidy to format some
 2379                         # common extensions of perl syntax.
 2380                         # This is used by sub code_block_type
 2381                         $last_nonblank_token .= '()';
 2382                     }
 2383                     else {
 2384                         my $list =
 2385                           join( ' ', sort keys %is_blocktype_with_paren );
 2386                         warning(
 2387 "syntax error at ') {', didn't see one of: <<$list>>; If this code is okay try using the -xs flag\n"
 2388                         );
 2389                     }
 2390                 }
 2391             }
 2392 
 2393             # patch for paren-less for/foreach glitch, part 2.
 2394             # see note below under 'qw'
 2395             elsif ($last_nonblank_token eq 'qw'
 2396                 && $is_for_foreach{$want_paren} )
 2397             {
 2398                 $last_nonblank_token = $want_paren;
 2399                 if ( $last_last_nonblank_token eq $want_paren ) {
 2400                     warning(
 2401 "syntax error at '$want_paren .. {' -- missing \$ loop variable\n"
 2402                     );
 2403 
 2404                 }
 2405                 $want_paren = "";
 2406             }
 2407 
 2408             # now identify which of the three possible types of
 2409             # curly braces we have: hash index container, anonymous
 2410             # hash reference, or code block.
 2411 
 2412             # non-structural (hash index) curly brace pair
 2413             # get marked 'L' and 'R'
 2414             if ( is_non_structural_brace() ) {
 2415                 $type = 'L';
 2416 
 2417                 # patch for SWITCH/CASE:
 2418                 # allow paren-less identifier after 'when'
 2419                 # if the brace is preceded by a space
 2420                 if (   $statement_type eq 'when'
 2421                     && $last_nonblank_type eq 'i'
 2422                     && $last_last_nonblank_type eq 'k'
 2423                     && ( $i_tok == 0 || $rtoken_type->[ $i_tok - 1 ] eq 'b' ) )
 2424                 {
 2425                     $type       = '{';
 2426                     $block_type = $statement_type;
 2427                 }
 2428             }
 2429 
 2430             # code and anonymous hash have the same type, '{', but are
 2431             # distinguished by 'block_type',
 2432             # which will be blank for an anonymous hash
 2433             else {
 2434 
 2435                 $block_type = code_block_type( $i_tok, $rtokens, $rtoken_type,
 2436                     $max_token_index );
 2437 
 2438                 # patch to promote bareword type to function taking block
 2439                 if (   $block_type
 2440                     && $last_nonblank_type eq 'w'
 2441                     && $last_nonblank_i >= 0 )
 2442                 {
 2443                     if ( $routput_token_type->[$last_nonblank_i] eq 'w' ) {
 2444                         $routput_token_type->[$last_nonblank_i] = 'G';
 2445                     }
 2446                 }
 2447 
 2448                 # patch for SWITCH/CASE: if we find a stray opening block brace
 2449                 # where we might accept a 'case' or 'when' block, then take it
 2450                 if (   $statement_type eq 'case'
 2451                     || $statement_type eq 'when' )
 2452                 {
 2453                     if ( !$block_type || $block_type eq '}' ) {
 2454                         $block_type = $statement_type;
 2455                     }
 2456                 }
 2457             }
 2458 
 2459             $brace_type[ ++$brace_depth ]        = $block_type;
 2460             $brace_package[$brace_depth]         = $current_package;
 2461             $brace_structural_type[$brace_depth] = $type;
 2462             $brace_context[$brace_depth]         = $context;
 2463             ( $type_sequence, $indent_flag ) =
 2464               increase_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
 2465         },
 2466         '}' => sub {
 2467             $block_type = $brace_type[$brace_depth];
 2468             if ($block_type) { $statement_type = '' }
 2469             if ( defined( $brace_package[$brace_depth] ) ) {
 2470                 $current_package = $brace_package[$brace_depth];
 2471             }
 2472 
 2473             # can happen on brace error (caught elsewhere)
 2474             else {
 2475             }
 2476             ( $type_sequence, $indent_flag ) =
 2477               decrease_nesting_depth( BRACE, $rtoken_map->[$i_tok] );
 2478 
 2479             if ( $brace_structural_type[$brace_depth] eq 'L' ) {
 2480                 $type = 'R';
 2481             }
 2482 
 2483             # propagate type information for 'do' and 'eval' blocks, and also
 2484             # for smartmatch operator.  This is necessary to enable us to know
 2485             # if an operator or term is expected next.
 2486             if ( $is_block_operator{$block_type} ) {
 2487                 $tok = $block_type;
 2488             }
 2489 
 2490             $context = $brace_context[$brace_depth];
 2491             if ( $brace_depth > 0 ) { $brace_depth--; }
 2492         },
 2493         '&' => sub {    # maybe sub call? start looking
 2494 
 2495             # We have to check for sub call unless we are sure we
 2496             # are expecting an operator.  This example from s2p
 2497             # got mistaken as a q operator in an early version:
 2498             #   print BODY &q(<<'EOT');
 2499             if ( $expecting != OPERATOR ) {
 2500 
 2501                 # But only look for a sub call if we are expecting a term or
 2502                 # if there is no existing space after the &.
 2503                 # For example we probably don't want & as sub call here:
 2504                 #    Fcntl::S_IRUSR & $mode;
 2505                 if ( $expecting == TERM || $next_type ne 'b' ) {
 2506                     scan_identifier_fast();
 2507                 }
 2508             }
 2509             else {
 2510             }
 2511         },
 2512         '<' => sub {    # angle operator or less than?
 2513 
 2514             if ( $expecting != OPERATOR ) {
 2515                 ( $i, $type ) =
 2516                   find_angle_operator_termination( $input_line, $i, $rtoken_map,
 2517                     $expecting, $max_token_index );
 2518 
 2519                 ##  This message is not very helpful and quite confusing if the above
 2520                 ##  routine decided not to write a message with the line number.
 2521                 ##  if ( $type eq '<' && $expecting == TERM ) {
 2522                 ##      error_if_expecting_TERM();
 2523                 ##      interrupt_logfile();
 2524                 ##      warning("Unterminated <> operator?\n");
 2525                 ##      resume_logfile();
 2526                 ##  }
 2527 
 2528             }
 2529             else {
 2530             }
 2531         },
 2532         '?' => sub {    # ?: conditional or starting pattern?
 2533 
 2534             my $is_pattern;
 2535 
 2536             # Patch for rt #126965
 2537             # a pattern cannot follow certain keywords which take optional
 2538             # arguments, like 'shift' and 'pop'. See also '/'.
 2539             if (
 2540                 $last_nonblank_type eq 'k'
 2541                 && $is_keyword_rejecting_question_as_pattern_delimiter{
 2542                     $last_nonblank_token}
 2543               )
 2544             {
 2545                 $is_pattern = 0;
 2546             }
 2547 
 2548             # patch for RT#131288, user constant function without prototype
 2549             # last type is 'U' followed by ?.
 2550             elsif ( $last_nonblank_type =~ /^[FUY]$/ ) {
 2551                 $is_pattern = 0;
 2552             }
 2553             elsif ( $expecting == UNKNOWN ) {
 2554 
 2555                 # In older versions of Perl, a bare ? can be a pattern
 2556                 # delimiter.  In perl version 5.22 this was
 2557                 # dropped, but we have to support it in order to format
 2558                 # older programs. See:
 2559                 ## https://perl.developpez.com/documentations/en/5.22.0/perl5211delta.html
 2560                 # For example, the following line worked
 2561                 # at one time:
 2562                 #      ?(.*)? && (print $1,"\n");
 2563                 # In current versions it would have to be written with slashes:
 2564                 #      /(.*)/ && (print $1,"\n");
 2565                 my $msg;
 2566                 ( $is_pattern, $msg ) =
 2567                   guess_if_pattern_or_conditional( $i, $rtokens, $rtoken_map,
 2568                     $max_token_index );
 2569 
 2570                 if ($msg) { write_logfile_entry($msg) }
 2571             }
 2572             else { $is_pattern = ( $expecting == TERM ) }
 2573 
 2574             if ($is_pattern) {
 2575                 $in_quote                = 1;
 2576                 $type                    = 'Q';
 2577                 $allowed_quote_modifiers = '[msixpodualngc]';
 2578             }
 2579             else {
 2580                 ( $type_sequence, $indent_flag ) =
 2581                   increase_nesting_depth( QUESTION_COLON,
 2582                     $rtoken_map->[$i_tok] );
 2583             }
 2584         },
 2585         '*' => sub {    # typeglob, or multiply?
 2586 
 2587             if ( $expecting == UNKNOWN && $last_nonblank_type eq 'Z' ) {
 2588                 if (   $next_type ne 'b'
 2589                     && $next_type ne '('
 2590                     && $next_type ne '#' )    # Fix c036
 2591                 {
 2592                     $expecting = TERM;
 2593                 }
 2594             }
 2595             if ( $expecting == TERM ) {
 2596                 scan_identifier_fast();
 2597             }
 2598             else {
 2599 
 2600                 if ( $rtokens->[ $i + 1 ] eq '=' ) {
 2601                     $tok  = '*=';
 2602                     $type = $tok;
 2603                     $i++;
 2604                 }
 2605                 elsif ( $rtokens->[ $i + 1 ] eq '*' ) {
 2606                     $tok  = '**';
 2607                     $type = $tok;
 2608                     $i++;
 2609                     if ( $rtokens->[ $i + 1 ] eq '=' ) {
 2610                         $tok  = '**=';
 2611                         $type = $tok;
 2612                         $i++;
 2613                     }
 2614                 }
 2615             }
 2616         },
 2617         '.' => sub {    # what kind of . ?
 2618 
 2619             if ( $expecting != OPERATOR ) {
 2620                 scan_number();
 2621                 if ( $type eq '.' ) {
 2622                     error_if_expecting_TERM()
 2623                       if ( $expecting == TERM );
 2624                 }
 2625             }
 2626             else {
 2627             }
 2628         },
 2629         ':' => sub {
 2630 
 2631             # if this is the first nonblank character, call it a label
 2632             # since perl seems to just swallow it
 2633             if ( $input_line_number == 1 && $last_nonblank_i == -1 ) {
 2634                 $type = 'J';
 2635             }
 2636 
 2637             # ATTRS: check for a ':' which introduces an attribute list
 2638             # either after a 'sub' keyword or within a paren list
 2639             elsif ( $statement_type =~ /^sub\b/ ) {
 2640                 $type              = 'A';
 2641                 $in_attribute_list = 1;
 2642             }
 2643 
 2644             # Within a signature, unless we are in a ternary.  For example,
 2645             # from 't/filter_example.t':
 2646             #    method foo4 ( $class: $bar ) { $class->bar($bar) }
 2647             elsif ( $paren_type[$paren_depth] =~ /^sub\b/
 2648                 && !is_balanced_closing_container(QUESTION_COLON) )
 2649             {
 2650                 $type              = 'A';
 2651                 $in_attribute_list = 1;
 2652             }
 2653 
 2654             # check for scalar attribute, such as
 2655             # my $foo : shared = 1;
 2656             elsif ($is_my_our_state{$statement_type}
 2657                 && $current_depth[QUESTION_COLON] == 0 )
 2658             {
 2659                 $type              = 'A';
 2660                 $in_attribute_list = 1;
 2661             }
 2662 
 2663             # Look for Switch::Plain syntax if an error would otherwise occur
 2664             # here. Note that we do not need to check if the extended syntax
 2665             # flag is set because otherwise an error would occur, and we would
 2666             # then have to output a message telling the user to set the
 2667             # extended syntax flag to avoid the error.
 2668             #  case 1: {
 2669             #  default: {
 2670             #  default:
 2671             # Note that the line 'default:' will be parsed as a label elsewhere.
 2672             elsif ( $is_case_default{$statement_type}
 2673                 && !is_balanced_closing_container(QUESTION_COLON) )
 2674             {
 2675                 # mark it as a perltidy label type
 2676                 $type = 'J';
 2677             }
 2678 
 2679             # otherwise, it should be part of a ?/: operator
 2680             else {
 2681                 ( $type_sequence, $indent_flag ) =
 2682                   decrease_nesting_depth( QUESTION_COLON,
 2683                     $rtoken_map->[$i_tok] );
 2684                 if ( $last_nonblank_token eq '?' ) {
 2685                     warning("Syntax error near ? :\n");
 2686                 }
 2687             }
 2688         },
 2689         '+' => sub {    # what kind of plus?
 2690 
 2691             if ( $expecting == TERM ) {
 2692                 my $number = scan_number_fast();
 2693 
 2694                 # unary plus is safest assumption if not a number
 2695                 if ( !defined($number) ) { $type = 'p'; }
 2696             }
 2697             elsif ( $expecting == OPERATOR ) {
 2698             }
 2699             else {
 2700                 if ( $next_type eq 'w' ) { $type = 'p' }
 2701             }
 2702         },
 2703         '@' => sub {
 2704 
 2705             error_if_expecting_OPERATOR("Array")
 2706               if ( $expecting == OPERATOR );
 2707             scan_identifier_fast();
 2708         },
 2709         '%' => sub {    # hash or modulo?
 2710 
 2711             # first guess is hash if no following blank or paren
 2712             if ( $expecting == UNKNOWN ) {
 2713                 if ( $next_type ne 'b' && $next_type ne '(' ) {
 2714                     $expecting = TERM;
 2715                 }
 2716             }
 2717             if ( $expecting == TERM ) {
 2718                 scan_identifier_fast();
 2719             }
 2720         },
 2721         '[' => sub {
 2722             $square_bracket_type[ ++$square_bracket_depth ] =
 2723               $last_nonblank_token;
 2724             ( $type_sequence, $indent_flag ) =
 2725               increase_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
 2726 
 2727             # It may seem odd, but structural square brackets have
 2728             # type '{' and '}'.  This simplifies the indentation logic.
 2729             if ( !is_non_structural_brace() ) {
 2730                 $type = '{';
 2731             }
 2732             $square_bracket_structural_type[$square_bracket_depth] = $type;
 2733         },
 2734         ']' => sub {
 2735             ( $type_sequence, $indent_flag ) =
 2736               decrease_nesting_depth( SQUARE_BRACKET, $rtoken_map->[$i_tok] );
 2737 
 2738             if ( $square_bracket_structural_type[$square_bracket_depth] eq '{' )
 2739             {
 2740                 $type = '}';
 2741             }
 2742 
 2743             # propagate type information for smartmatch operator.  This is
 2744             # necessary to enable us to know if an operator or term is expected
 2745             # next.
 2746             if ( $square_bracket_type[$square_bracket_depth] eq '~~' ) {
 2747                 $tok = $square_bracket_type[$square_bracket_depth];
 2748             }
 2749 
 2750             if ( $square_bracket_depth > 0 ) { $square_bracket_depth--; }
 2751         },
 2752         '-' => sub {    # what kind of minus?
 2753 
 2754             if ( ( $expecting != OPERATOR )
 2755                 && $is_file_test_operator{$next_tok} )
 2756             {
 2757                 my ( $next_nonblank_token, $i_next ) =
 2758                   find_next_nonblank_token( $i + 1, $rtokens,
 2759                     $max_token_index );
 2760 
 2761                 # check for a quoted word like "-w=>xx";
 2762                 # it is sufficient to just check for a following '='
 2763                 if ( $next_nonblank_token eq '=' ) {
 2764                     $type = 'm';
 2765                 }
 2766                 else {
 2767                     $i++;
 2768                     $tok .= $next_tok;
 2769                     $type = 'F';
 2770                 }
 2771             }
 2772             elsif ( $expecting == TERM ) {
 2773                 my $number = scan_number_fast();
 2774 
 2775                 # maybe part of bareword token? unary is safest
 2776                 if ( !defined($number) ) { $type = 'm'; }
 2777 
 2778             }
 2779             elsif ( $expecting == OPERATOR ) {
 2780             }
 2781             else {
 2782 
 2783                 if ( $next_type eq 'w' ) {
 2784                     $type = 'm';
 2785                 }
 2786             }
 2787         },
 2788 
 2789         '^' => sub {
 2790 
 2791             # check for special variables like ${^WARNING_BITS}
 2792             if ( $expecting == TERM ) {
 2793 
 2794                 # FIXME: this should work but will not catch errors
 2795                 # because we also have to be sure that previous token is
 2796                 # a type character ($,@,%).
 2797                 if (   $last_nonblank_token eq '{'
 2798                     && ( $next_tok !~ /^\d/ )
 2799                     && ( $next_tok =~ /^\w/ ) )
 2800                 {
 2801 
 2802                     if ( $next_tok eq 'W' ) {
 2803                         $tokenizer_self->[_saw_perl_dash_w_] = 1;
 2804                     }
 2805                     $tok  = $tok . $next_tok;
 2806                     $i    = $i + 1;
 2807                     $type = 'w';
 2808                 }
 2809 
 2810                 else {
 2811                     unless ( error_if_expecting_TERM() ) {
 2812 
 2813                         # Something like this is valid but strange:
 2814                         # undef ^I;
 2815                         complain("The '^' seems unusual here\n");
 2816                     }
 2817                 }
 2818             }
 2819         },
 2820 
 2821         '::' => sub {    # probably a sub call
 2822             scan_bare_identifier();
 2823         },
 2824         '<<' => sub {    # maybe a here-doc?
 2825 
 2826 ##      This check removed because it could be a deprecated here-doc with
 2827 ##      no specified target.  See example in log 16 Sep 2020.
 2828 ##            return
 2829 ##              unless ( $i < $max_token_index )
 2830 ##              ;          # here-doc not possible if end of line
 2831 
 2832             if ( $expecting != OPERATOR ) {
 2833                 my ( $found_target, $here_doc_target, $here_quote_character,
 2834                     $saw_error );
 2835                 (
 2836                     $found_target, $here_doc_target, $here_quote_character, $i,
 2837                     $saw_error
 2838                   )
 2839                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
 2840                     $max_token_index );
 2841 
 2842                 if ($found_target) {
 2843                     push @{$rhere_target_list},
 2844                       [ $here_doc_target, $here_quote_character ];
 2845                     $type = 'h';
 2846                     if ( length($here_doc_target) > 80 ) {
 2847                         my $truncated = substr( $here_doc_target, 0, 80 );
 2848                         complain("Long here-target: '$truncated' ...\n");
 2849                     }
 2850                     elsif ( !$here_doc_target ) {
 2851                         warning(
 2852                             'Use of bare << to mean <<"" is deprecated' . "\n" )
 2853                           unless ($here_quote_character);
 2854                     }
 2855                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
 2856                         complain(
 2857                             "Unconventional here-target: '$here_doc_target'\n");
 2858                     }
 2859                 }
 2860                 elsif ( $expecting == TERM ) {
 2861                     unless ($saw_error) {
 2862 
 2863                         # shouldn't happen..
 2864                         warning("Program bug; didn't find here doc target\n");
 2865                         report_definite_bug();
 2866                     }
 2867                 }
 2868             }
 2869             else {
 2870             }
 2871         },
 2872         '<<~' => sub {    # a here-doc, new type added in v26
 2873             return
 2874               unless ( $i < $max_token_index )
 2875               ;           # here-doc not possible if end of line
 2876             if ( $expecting != OPERATOR ) {
 2877                 my ( $found_target, $here_doc_target, $here_quote_character,
 2878                     $saw_error );
 2879                 (
 2880                     $found_target, $here_doc_target, $here_quote_character, $i,
 2881                     $saw_error
 2882                   )
 2883                   = find_here_doc( $expecting, $i, $rtokens, $rtoken_map,
 2884                     $max_token_index );
 2885 
 2886                 if ($found_target) {
 2887 
 2888                     if ( length($here_doc_target) > 80 ) {
 2889                         my $truncated = substr( $here_doc_target, 0, 80 );
 2890                         complain("Long here-target: '$truncated' ...\n");
 2891                     }
 2892                     elsif ( $here_doc_target !~ /^[A-Z_]\w+$/ ) {
 2893                         complain(
 2894                             "Unconventional here-target: '$here_doc_target'\n");
 2895                     }
 2896 
 2897                     # Note that we put a leading space on the here quote
 2898                     # character indicate that it may be preceded by spaces
 2899                     $here_quote_character = " " . $here_quote_character;
 2900                     push @{$rhere_target_list},
 2901                       [ $here_doc_target, $here_quote_character ];
 2902                     $type = 'h';
 2903                 }
 2904                 elsif ( $expecting == TERM ) {
 2905                     unless ($saw_error) {
 2906 
 2907                         # shouldn't happen..
 2908                         warning("Program bug; didn't find here doc target\n");
 2909                         report_definite_bug();
 2910                     }
 2911                 }
 2912             }
 2913             else {
 2914             }
 2915         },
 2916         '->' => sub {
 2917 
 2918             # if -> points to a bare word, we must scan for an identifier,
 2919             # otherwise something like ->y would look like the y operator
 2920 
 2921             # NOTE: this will currently allow things like
 2922             #     '->@array'    '->*VAR'  '->%hash'
 2923             # to get parsed as identifiers, even though these are not currently
 2924             # allowed syntax.  To catch syntax errors like this we could first
 2925             # check that the next character and skip this call if it is one of
 2926             # ' @ % * '.  A disadvantage with doing this is that this would
 2927             # have to be fixed if the perltidy syntax is ever extended to make
 2928             # any of these valid.  So for now this check is not done.
 2929             scan_identifier_fast();
 2930         },
 2931 
 2932         # type = 'pp' for pre-increment, '++' for post-increment
 2933         '++' => sub {
 2934             if    ( $expecting == TERM ) { $type = 'pp' }
 2935             elsif ( $expecting == UNKNOWN ) {
 2936 
 2937                 my ( $next_nonblank_token, $i_next ) =
 2938                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 2939 
 2940                 # Fix for c042: look past a side comment
 2941                 if ( $next_nonblank_token eq '#' ) {
 2942                     ( $next_nonblank_token, $i_next ) =
 2943                       find_next_nonblank_token( $max_token_index,
 2944                         $rtokens, $max_token_index );
 2945                 }
 2946 
 2947                 if ( $next_nonblank_token eq '$' ) { $type = 'pp' }
 2948             }
 2949         },
 2950 
 2951         '=>' => sub {
 2952             if ( $last_nonblank_type eq $tok ) {
 2953                 complain("Repeated '=>'s \n");
 2954             }
 2955 
 2956             # patch for operator_expected: note if we are in the list (use.t)
 2957             # TODO: make version numbers a new token type
 2958             if ( $statement_type eq 'use' ) { $statement_type = '_use' }
 2959         },
 2960 
 2961         # type = 'mm' for pre-decrement, '--' for post-decrement
 2962         '--' => sub {
 2963 
 2964             if    ( $expecting == TERM ) { $type = 'mm' }
 2965             elsif ( $expecting == UNKNOWN ) {
 2966                 my ( $next_nonblank_token, $i_next ) =
 2967                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 2968 
 2969                 # Fix for c042: look past a side comment
 2970                 if ( $next_nonblank_token eq '#' ) {
 2971                     ( $next_nonblank_token, $i_next ) =
 2972                       find_next_nonblank_token( $max_token_index,
 2973                         $rtokens, $max_token_index );
 2974                 }
 2975 
 2976                 if ( $next_nonblank_token eq '$' ) { $type = 'mm' }
 2977             }
 2978         },
 2979 
 2980         '&&' => sub {
 2981             error_if_expecting_TERM()
 2982               if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
 2983         },
 2984 
 2985         '||' => sub {
 2986             error_if_expecting_TERM()
 2987               if ( $expecting == TERM && $last_nonblank_token ne ',' );    #c015
 2988         },
 2989 
 2990         '//' => sub {
 2991             error_if_expecting_TERM()
 2992               if ( $expecting == TERM );
 2993         },
 2994     };
 2995 
 2996     # ------------------------------------------------------------
 2997     # end hash of code for handling individual token types
 2998     # ------------------------------------------------------------
 2999 
 3000     my %matching_start_token = ( '}' => '{', ']' => '[', ')' => '(' );
 3001 
 3002     # These block types terminate statements and do not need a trailing
 3003     # semicolon
 3004     # patched for SWITCH/CASE/
 3005     my %is_zero_continuation_block_type;
 3006     @_ = qw( } { BEGIN END CHECK INIT AUTOLOAD DESTROY UNITCHECK continue ;
 3007       if elsif else unless while until for foreach switch case given when);
 3008     @is_zero_continuation_block_type{@_} = (1) x scalar(@_);
 3009 
 3010     my %is_not_zero_continuation_block_type;
 3011     @_ = qw(sort grep map do eval);
 3012     @is_not_zero_continuation_block_type{@_} = (1) x scalar(@_);
 3013 
 3014     my %is_logical_container;
 3015     @_ = qw(if elsif unless while and or err not && !  || for foreach);
 3016     @is_logical_container{@_} = (1) x scalar(@_);
 3017 
 3018     my %is_binary_type;
 3019     @_ = qw(|| &&);
 3020     @is_binary_type{@_} = (1) x scalar(@_);
 3021 
 3022     my %is_binary_keyword;
 3023     @_ = qw(and or err eq ne cmp);
 3024     @is_binary_keyword{@_} = (1) x scalar(@_);
 3025 
 3026     # 'L' is token for opening { at hash key
 3027     my %is_opening_type;
 3028     @_ = qw< L { ( [ >;
 3029     @is_opening_type{@_} = (1) x scalar(@_);
 3030 
 3031     # 'R' is token for closing } at hash key
 3032     my %is_closing_type;
 3033     @_ = qw< R } ) ] >;
 3034     @is_closing_type{@_} = (1) x scalar(@_);
 3035 
 3036     my %is_redo_last_next_goto;
 3037     @_ = qw(redo last next goto);
 3038     @is_redo_last_next_goto{@_} = (1) x scalar(@_);
 3039 
 3040     my %is_use_require;
 3041     @_ = qw(use require);
 3042     @is_use_require{@_} = (1) x scalar(@_);
 3043 
 3044     # This hash holds the array index in $tokenizer_self for these keywords:
 3045     # Fix for issue c035: removed 'format' from this hash
 3046     my %is_END_DATA = (
 3047         '__END__'  => _in_end_,
 3048         '__DATA__' => _in_data_,
 3049     );
 3050 
 3051     # original ref: camel 3 p 147,
 3052     # but perl may accept undocumented flags
 3053     # perl 5.10 adds 'p' (preserve)
 3054     # Perl version 5.22 added 'n'
 3055     # From http://perldoc.perl.org/perlop.html we have
 3056     # /PATTERN/msixpodualngc or m?PATTERN?msixpodualngc
 3057     # s/PATTERN/REPLACEMENT/msixpodualngcer
 3058     # y/SEARCHLIST/REPLACEMENTLIST/cdsr
 3059     # tr/SEARCHLIST/REPLACEMENTLIST/cdsr
 3060     # qr/STRING/msixpodualn
 3061     my %quote_modifiers = (
 3062         's'  => '[msixpodualngcer]',
 3063         'y'  => '[cdsr]',
 3064         'tr' => '[cdsr]',
 3065         'm'  => '[msixpodualngc]',
 3066         'qr' => '[msixpodualn]',
 3067         'q'  => "",
 3068         'qq' => "",
 3069         'qw' => "",
 3070         'qx' => "",
 3071     );
 3072 
 3073     # table showing how many quoted things to look for after quote operator..
 3074     # s, y, tr have 2 (pattern and replacement)
 3075     # others have 1 (pattern only)
 3076     my %quote_items = (
 3077         's'  => 2,
 3078         'y'  => 2,
 3079         'tr' => 2,
 3080         'm'  => 1,
 3081         'qr' => 1,
 3082         'q'  => 1,
 3083         'qq' => 1,
 3084         'qw' => 1,
 3085         'qx' => 1,
 3086     );
 3087 
 3088     use constant DEBUG_TOKENIZE => 0;
 3089 
 3090     sub tokenize_this_line {
 3091 
 3092   # This routine breaks a line of perl code into tokens which are of use in
 3093   # indentation and reformatting.  One of my goals has been to define tokens
 3094   # such that a newline may be inserted between any pair of tokens without
 3095   # changing or invalidating the program. This version comes close to this,
 3096   # although there are necessarily a few exceptions which must be caught by
 3097   # the formatter.  Many of these involve the treatment of bare words.
 3098   #
 3099   # The tokens and their types are returned in arrays.  See previous
 3100   # routine for their names.
 3101   #
 3102   # See also the array "valid_token_types" in the BEGIN section for an
 3103   # up-to-date list.
 3104   #
 3105   # To simplify things, token types are either a single character, or they
 3106   # are identical to the tokens themselves.
 3107   #
 3108   # As a debugging aid, the -D flag creates a file containing a side-by-side
 3109   # comparison of the input string and its tokenization for each line of a file.
 3110   # This is an invaluable debugging aid.
 3111   #
 3112   # In addition to tokens, and some associated quantities, the tokenizer
 3113   # also returns flags indication any special line types.  These include
 3114   # quotes, here_docs, formats.
 3115   #
 3116   # -----------------------------------------------------------------------
 3117   #
 3118   # How to add NEW_TOKENS:
 3119   #
 3120   # New token types will undoubtedly be needed in the future both to keep up
 3121   # with changes in perl and to help adapt the tokenizer to other applications.
 3122   #
 3123   # Here are some notes on the minimal steps.  I wrote these notes while
 3124   # adding the 'v' token type for v-strings, which are things like version
 3125   # numbers 5.6.0, and ip addresses, and will use that as an example.  ( You
 3126   # can use your editor to search for the string "NEW_TOKENS" to find the
 3127   # appropriate sections to change):
 3128   #
 3129   # *. Try to talk somebody else into doing it!  If not, ..
 3130   #
 3131   # *. Make a backup of your current version in case things don't work out!
 3132   #
 3133   # *. Think of a new, unused character for the token type, and add to
 3134   # the array @valid_token_types in the BEGIN section of this package.
 3135   # For example, I used 'v' for v-strings.
 3136   #
 3137   # *. Implement coding to recognize the $type of the token in this routine.
 3138   # This is the hardest part, and is best done by imitating or modifying
 3139   # some of the existing coding.  For example, to recognize v-strings, I
 3140   # patched 'sub scan_bare_identifier' to recognize v-strings beginning with
 3141   # 'v' and 'sub scan_number' to recognize v-strings without the leading 'v'.
 3142   #
 3143   # *. Update sub operator_expected.  This update is critically important but
 3144   # the coding is trivial.  Look at the comments in that routine for help.
 3145   # For v-strings, which should behave like numbers, I just added 'v' to the
 3146   # regex used to handle numbers and strings (types 'n' and 'Q').
 3147   #
 3148   # *. Implement a 'bond strength' rule in sub set_bond_strengths in
 3149   # Perl::Tidy::Formatter for breaking lines around this token type.  You can
 3150   # skip this step and take the default at first, then adjust later to get
 3151   # desired results.  For adding type 'v', I looked at sub bond_strength and
 3152   # saw that number type 'n' was using default strengths, so I didn't do
 3153   # anything.  I may tune it up someday if I don't like the way line
 3154   # breaks with v-strings look.
 3155   #
 3156   # *. Implement a 'whitespace' rule in sub set_whitespace_flags in
 3157   # Perl::Tidy::Formatter.  For adding type 'v', I looked at this routine
 3158   # and saw that type 'n' used spaces on both sides, so I just added 'v'
 3159   # to the array @spaces_both_sides.
 3160   #
 3161   # *. Update HtmlWriter package so that users can colorize the token as
 3162   # desired.  This is quite easy; see comments identified by 'NEW_TOKENS' in
 3163   # that package.  For v-strings, I initially chose to use a default color
 3164   # equal to the default for numbers, but it might be nice to change that
 3165   # eventually.
 3166   #
 3167   # *. Update comments in Perl::Tidy::Tokenizer::dump_token_types.
 3168   #
 3169   # *. Run lots and lots of debug tests.  Start with special files designed
 3170   # to test the new token type.  Run with the -D flag to create a .DEBUG
 3171   # file which shows the tokenization.  When these work ok, test as many old
 3172   # scripts as possible.  Start with all of the '.t' files in the 'test'
 3173   # directory of the distribution file.  Compare .tdy output with previous
 3174   # version and updated version to see the differences.  Then include as
 3175   # many more files as possible. My own technique has been to collect a huge
 3176   # number of perl scripts (thousands!) into one directory and run perltidy
 3177   # *, then run diff between the output of the previous version and the
 3178   # current version.
 3179   #
 3180   # *. For another example, search for the smartmatch operator '~~'
 3181   # with your editor to see where updates were made for it.
 3182   #
 3183   # -----------------------------------------------------------------------
 3184 
 3185         my $line_of_tokens = shift;
 3186         my ($untrimmed_input_line) = $line_of_tokens->{_line_text};
 3187 
 3188         # patch while coding change is underway
 3189         # make callers private data to allow access
 3190         # $tokenizer_self = $caller_tokenizer_self;
 3191 
 3192         # extract line number for use in error messages
 3193         $input_line_number = $line_of_tokens->{_line_number};
 3194 
 3195         # reinitialize for multi-line quote
 3196         $line_of_tokens->{_starting_in_quote} = $in_quote && $quote_type eq 'Q';
 3197 
 3198         # check for pod documentation
 3199         if ( substr( $untrimmed_input_line, 0, 1 ) eq '='
 3200             && $untrimmed_input_line =~ /^=[A-Za-z_]/ )
 3201         {
 3202 
 3203             # must not be in multi-line quote
 3204             # and must not be in an equation
 3205             if ( !$in_quote
 3206                 && ( operator_expected( [ 'b', '=', 'b' ] ) == TERM ) )
 3207             {
 3208                 $tokenizer_self->[_in_pod_] = 1;
 3209                 return;
 3210             }
 3211         }
 3212 
 3213         $input_line = $untrimmed_input_line;
 3214 
 3215         chomp $input_line;
 3216 
 3217         # Set a flag to indicate if we might be at an __END__ or __DATA__ line
 3218         # This will be used below to avoid quoting a bare word followed by
 3219         # a fat comma.
 3220         my $is_END_or_DATA;
 3221 
 3222         # trim start of this line unless we are continuing a quoted line
 3223         # do not trim end because we might end in a quote (test: deken4.pl)
 3224         # Perl::Tidy::Formatter will delete needless trailing blanks
 3225         unless ( $in_quote && ( $quote_type eq 'Q' ) ) {
 3226             $input_line =~ s/^\s+//;    # trim left end
 3227 
 3228             $is_END_or_DATA = substr( $input_line, 0, 1 ) eq '_'
 3229               && $input_line =~ /^\s*__(END|DATA)__\s*$/;
 3230         }
 3231 
 3232         # update the copy of the line for use in error messages
 3233         # This must be exactly what we give the pre_tokenizer
 3234         $tokenizer_self->[_line_of_text_] = $input_line;
 3235 
 3236         # re-initialize for the main loop
 3237         $routput_token_list     = [];    # stack of output token indexes
 3238         $routput_token_type     = [];    # token types
 3239         $routput_block_type     = [];    # types of code block
 3240         $routput_container_type = [];    # paren types, such as if, elsif, ..
 3241         $routput_type_sequence  = [];    # nesting sequential number
 3242 
 3243         $rhere_target_list = [];
 3244 
 3245         $tok             = $last_nonblank_token;
 3246         $type            = $last_nonblank_type;
 3247         $prototype       = $last_nonblank_prototype;
 3248         $last_nonblank_i = -1;
 3249         $block_type      = $last_nonblank_block_type;
 3250         $container_type  = $last_nonblank_container_type;
 3251         $type_sequence   = $last_nonblank_type_sequence;
 3252         $indent_flag     = 0;
 3253         $peeked_ahead    = 0;
 3254 
 3255         # tokenization is done in two stages..
 3256         # stage 1 is a very simple pre-tokenization
 3257         my $max_tokens_wanted = 0; # this signals pre_tokenize to get all tokens
 3258 
 3259         # optimize for a full-line comment
 3260         if ( !$in_quote && substr( $input_line, 0, 1 ) eq '#' ) {
 3261             $max_tokens_wanted = 1;    # no use tokenizing a comment
 3262 
 3263             # and check for skipped section
 3264             if (   $rOpts_code_skipping
 3265                 && $input_line =~ /$code_skipping_pattern_begin/ )
 3266             {
 3267                 $tokenizer_self->[_in_skipped_] = 1;
 3268                 return;
 3269             }
 3270         }
 3271 
 3272         # start by breaking the line into pre-tokens
 3273         ( $rtokens, $rtoken_map, $rtoken_type ) =
 3274           pre_tokenize( $input_line, $max_tokens_wanted );
 3275 
 3276         $max_token_index = scalar( @{$rtokens} ) - 1;
 3277         push( @{$rtokens}, ' ', ' ', ' ' );  # extra whitespace simplifies logic
 3278         push( @{$rtoken_map},  0,   0,   0 );     # shouldn't be referenced
 3279         push( @{$rtoken_type}, 'b', 'b', 'b' );
 3280 
 3281         # initialize for main loop
 3282         foreach my $ii ( 0 .. $max_token_index + 3 ) {
 3283             $routput_token_type->[$ii]     = "";
 3284             $routput_block_type->[$ii]     = "";
 3285             $routput_container_type->[$ii] = "";
 3286             $routput_type_sequence->[$ii]  = "";
 3287             $routput_indent_flag->[$ii]    = 0;
 3288         }
 3289         $i     = -1;
 3290         $i_tok = -1;
 3291 
 3292         # ------------------------------------------------------------
 3293         # begin main tokenization loop
 3294         # ------------------------------------------------------------
 3295 
 3296         # we are looking at each pre-token of one line and combining them
 3297         # into tokens
 3298         while ( ++$i <= $max_token_index ) {
 3299 
 3300             if ($in_quote) {    # continue looking for end of a quote
 3301                 $type = $quote_type;
 3302 
 3303                 unless ( @{$routput_token_list} )
 3304                 {               # initialize if continuation line
 3305                     push( @{$routput_token_list}, $i );
 3306                     $routput_token_type->[$i] = $type;
 3307 
 3308                 }
 3309                 $tok = $quote_character if ($quote_character);
 3310 
 3311                 # scan for the end of the quote or pattern
 3312                 (
 3313                     $i, $in_quote, $quote_character, $quote_pos, $quote_depth,
 3314                     $quoted_string_1, $quoted_string_2
 3315                   )
 3316                   = do_quote(
 3317                     $i,               $in_quote,    $quote_character,
 3318                     $quote_pos,       $quote_depth, $quoted_string_1,
 3319                     $quoted_string_2, $rtokens,     $rtoken_map,
 3320                     $max_token_index
 3321                   );
 3322 
 3323                 # all done if we didn't find it
 3324                 last if ($in_quote);
 3325 
 3326                 # save pattern and replacement text for rescanning
 3327                 my $qs1 = $quoted_string_1;
 3328                 my $qs2 = $quoted_string_2;
 3329 
 3330                 # re-initialize for next search
 3331                 $quote_character = '';
 3332                 $quote_pos       = 0;
 3333                 $quote_type      = 'Q';
 3334                 $quoted_string_1 = "";
 3335                 $quoted_string_2 = "";
 3336                 last if ( ++$i > $max_token_index );
 3337 
 3338                 # look for any modifiers
 3339                 if ($allowed_quote_modifiers) {
 3340 
 3341                     # check for exact quote modifiers
 3342                     if ( $rtokens->[$i] =~ /^[A-Za-z_]/ ) {
 3343                         my $str = $rtokens->[$i];
 3344                         my $saw_modifier_e;
 3345                         while ( $str =~ /\G$allowed_quote_modifiers/gc ) {
 3346                             my $pos  = pos($str);
 3347                             my $char = substr( $str, $pos - 1, 1 );
 3348                             $saw_modifier_e ||= ( $char eq 'e' );
 3349                         }
 3350 
 3351                         # For an 'e' quote modifier we must scan the replacement
 3352                         # text for here-doc targets...
 3353                         # but if the modifier starts a new line we can skip
 3354                         # this because either the here doc will be fully
 3355                         # contained in the replacement text (so we can
 3356                         # ignore it) or Perl will not find it.
 3357                         # See test 'here2.in'.
 3358                         if ( $saw_modifier_e && $i_tok >= 0 ) {
 3359 
 3360                             my $rht = scan_replacement_text($qs1);
 3361 
 3362                             # Change type from 'Q' to 'h' for quotes with
 3363                             # here-doc targets so that the formatter (see sub
 3364                             # process_line_of_CODE) will not make any line
 3365                             # breaks after this point.
 3366                             if ($rht) {
 3367                                 push @{$rhere_target_list}, @{$rht};
 3368                                 $type = 'h';
 3369                                 if ( $i_tok < 0 ) {
 3370                                     my $ilast = $routput_token_list->[-1];
 3371                                     $routput_token_type->[$ilast] = $type;
 3372                                 }
 3373                             }
 3374                         }
 3375 
 3376                         if ( defined( pos($str) ) ) {
 3377 
 3378                             # matched
 3379                             if ( pos($str) == length($str) ) {
 3380                                 last if ( ++$i > $max_token_index );
 3381                             }
 3382 
 3383                             # Looks like a joined quote modifier
 3384                             # and keyword, maybe something like
 3385                             # s/xxx/yyy/gefor @k=...
 3386                             # Example is "galgen.pl".  Would have to split
 3387                             # the word and insert a new token in the
 3388                             # pre-token list.  This is so rare that I haven't
 3389                             # done it.  Will just issue a warning citation.
 3390 
 3391                             # This error might also be triggered if my quote
 3392                             # modifier characters are incomplete
 3393                             else {
 3394                                 warning(<<EOM);
 3395 
 3396 Partial match to quote modifier $allowed_quote_modifiers at word: '$str'
 3397 Please put a space between quote modifiers and trailing keywords.
 3398 EOM
 3399 
 3400                          # print "token $rtokens->[$i]\n";
 3401                          # my $num = length($str) - pos($str);
 3402                          # $rtokens->[$i]=substr($rtokens->[$i],pos($str),$num);
 3403                          # print "continuing with new token $rtokens->[$i]\n";
 3404 
 3405                                 # skipping past this token does least damage
 3406                                 last if ( ++$i > $max_token_index );
 3407                             }
 3408                         }
 3409                         else {
 3410 
 3411                             # example file: rokicki4.pl
 3412                             # This error might also be triggered if my quote
 3413                             # modifier characters are incomplete
 3414                             write_logfile_entry(
 3415 "Note: found word $str at quote modifier location\n"
 3416                             );
 3417                         }
 3418                     }
 3419 
 3420                     # re-initialize
 3421                     $allowed_quote_modifiers = "";
 3422                 }
 3423             }
 3424 
 3425             unless ( $type eq 'b' || $tok eq 'CORE::' ) {
 3426 
 3427                 # try to catch some common errors
 3428                 if ( ( $type eq 'n' ) && ( $tok ne '0' ) ) {
 3429 
 3430                     if ( $last_nonblank_token eq 'eq' ) {
 3431                         complain("Should 'eq' be '==' here ?\n");
 3432                     }
 3433                     elsif ( $last_nonblank_token eq 'ne' ) {
 3434                         complain("Should 'ne' be '!=' here ?\n");
 3435                     }
 3436                 }
 3437 
 3438                 $last_last_nonblank_token      = $last_nonblank_token;
 3439                 $last_last_nonblank_type       = $last_nonblank_type;
 3440                 $last_last_nonblank_block_type = $last_nonblank_block_type;
 3441                 $last_last_nonblank_container_type =
 3442                   $last_nonblank_container_type;
 3443                 $last_last_nonblank_type_sequence =
 3444                   $last_nonblank_type_sequence;
 3445                 $last_nonblank_token          = $tok;
 3446                 $last_nonblank_type           = $type;
 3447                 $last_nonblank_prototype      = $prototype;
 3448                 $last_nonblank_block_type     = $block_type;
 3449                 $last_nonblank_container_type = $container_type;
 3450                 $last_nonblank_type_sequence  = $type_sequence;
 3451                 $last_nonblank_i              = $i_tok;
 3452 
 3453                 # Patch for c030: Fix things in case a '->' got separated from
 3454                 # the subsequent identifier by a side comment.  We need the
 3455                 # last_nonblank_token to have a leading -> to avoid triggering
 3456                 # an operator expected error message at the next '('. See also
 3457                 # fix for git #63.
 3458                 if ( $last_last_nonblank_token eq '->' ) {
 3459                     if (   $last_nonblank_type eq 'w'
 3460                         || $last_nonblank_type eq 'i'
 3461                         && substr( $last_nonblank_token, 0, 1 ) eq '$' )
 3462                     {
 3463                         $last_nonblank_token = '->' . $last_nonblank_token;
 3464                         $last_nonblank_type  = 'i';
 3465                     }
 3466                 }
 3467             }
 3468 
 3469             # store previous token type
 3470             if ( $i_tok >= 0 ) {
 3471                 $routput_token_type->[$i_tok]     = $type;
 3472                 $routput_block_type->[$i_tok]     = $block_type;
 3473                 $routput_container_type->[$i_tok] = $container_type;
 3474                 $routput_type_sequence->[$i_tok]  = $type_sequence;
 3475                 $routput_indent_flag->[$i_tok]    = $indent_flag;
 3476             }
 3477             my $pre_tok  = $rtokens->[$i];        # get the next pre-token
 3478             my $pre_type = $rtoken_type->[$i];    # and type
 3479             $tok        = $pre_tok;
 3480             $type       = $pre_type;              # to be modified as necessary
 3481             $block_type = "";    # blank for all tokens except code block braces
 3482             $container_type = "";    # blank for all tokens except some parens
 3483             $type_sequence  = "";    # blank for all tokens except ?/:
 3484             $indent_flag    = 0;
 3485             $prototype = "";    # blank for all tokens except user defined subs
 3486             $i_tok     = $i;
 3487 
 3488             # this pre-token will start an output token
 3489             push( @{$routput_token_list}, $i_tok );
 3490 
 3491             # continue gathering identifier if necessary
 3492             # but do not start on blanks and comments
 3493             if ( $id_scan_state && $pre_type ne 'b' && $pre_type ne '#' ) {
 3494 
 3495                 if ( $is_sub{$id_scan_state} || $is_package{$id_scan_state} ) {
 3496                     scan_id();
 3497                 }
 3498                 else {
 3499                     scan_identifier();
 3500                 }
 3501 
 3502                 last if ($id_scan_state);
 3503                 next if ( ( $i > 0 ) || $type );
 3504 
 3505                 # didn't find any token; start over
 3506                 $type = $pre_type;
 3507                 $tok  = $pre_tok;
 3508             }
 3509 
 3510             # handle whitespace tokens..
 3511             next if ( $type eq 'b' );
 3512             my $prev_tok  = $i > 0 ? $rtokens->[ $i - 1 ]     : ' ';
 3513             my $prev_type = $i > 0 ? $rtoken_type->[ $i - 1 ] : 'b';
 3514 
 3515             # Build larger tokens where possible, since we are not in a quote.
 3516             #
 3517             # First try to assemble digraphs.  The following tokens are
 3518             # excluded and handled specially:
 3519             # '/=' is excluded because the / might start a pattern.
 3520             # 'x=' is excluded since it might be $x=, with $ on previous line
 3521             # '**' and *= might be typeglobs of punctuation variables
 3522             # I have allowed tokens starting with <, such as <=,
 3523             # because I don't think these could be valid angle operators.
 3524             # test file: storrs4.pl
 3525             my $test_tok   = $tok . $rtokens->[ $i + 1 ];
 3526             my $combine_ok = $is_digraph{$test_tok};
 3527 
 3528             # check for special cases which cannot be combined
 3529             if ($combine_ok) {
 3530 
 3531                 # '//' must be defined_or operator if an operator is expected.
 3532                 # TODO: Code for other ambiguous digraphs (/=, x=, **, *=)
 3533                 # could be migrated here for clarity
 3534 
 3535               # Patch for RT#102371, misparsing a // in the following snippet:
 3536               #     state $b //= ccc();
 3537               # The solution is to always accept the digraph (or trigraph) after
 3538               # token type 'Z' (possible file handle).  The reason is that
 3539               # sub operator_expected gives TERM expected here, which is
 3540               # wrong in this case.
 3541                 if ( $test_tok eq '//' && $last_nonblank_type ne 'Z' ) {
 3542                     my $next_type = $rtokens->[ $i + 1 ];
 3543                     my $expecting =
 3544                       operator_expected( [ $prev_type, $tok, $next_type ] );
 3545 
 3546                     # Patched for RT#101547, was 'unless ($expecting==OPERATOR)'
 3547                     $combine_ok = 0 if ( $expecting == TERM );
 3548                 }
 3549 
 3550                 # Patch for RT #114359: Missparsing of "print $x ** 0.5;
 3551                 # Accept the digraphs '**' only after type 'Z'
 3552                 # Otherwise postpone the decision.
 3553                 if ( $test_tok eq '**' ) {
 3554                     if ( $last_nonblank_type ne 'Z' ) { $combine_ok = 0 }
 3555                 }
 3556             }
 3557 
 3558             if (
 3559                 $combine_ok
 3560 
 3561                 && ( $test_tok ne '/=' )    # might be pattern
 3562                 && ( $test_tok ne 'x=' )    # might be $x
 3563                 && ( $test_tok ne '*=' )    # typeglob?
 3564 
 3565                 # Moved above as part of fix for
 3566                 # RT #114359: Missparsing of "print $x ** 0.5;
 3567                 # && ( $test_tok ne '**' )    # typeglob?
 3568               )
 3569             {
 3570                 $tok = $test_tok;
 3571                 $i++;
 3572 
 3573                 # Now try to assemble trigraphs.  Note that all possible
 3574                 # perl trigraphs can be constructed by appending a character
 3575                 # to a digraph.
 3576                 $test_tok = $tok . $rtokens->[ $i + 1 ];
 3577 
 3578                 if ( $is_trigraph{$test_tok} ) {
 3579                     $tok = $test_tok;
 3580                     $i++;
 3581                 }
 3582 
 3583                 # The only current tetragraph is the double diamond operator
 3584                 # and its first three characters are not a trigraph, so
 3585                 # we do can do a special test for it
 3586                 elsif ( $test_tok eq '<<>' ) {
 3587                     $test_tok .= $rtokens->[ $i + 2 ];
 3588                     if ( $is_tetragraph{$test_tok} ) {
 3589                         $tok = $test_tok;
 3590                         $i += 2;
 3591                     }
 3592                 }
 3593             }
 3594 
 3595             $type      = $tok;
 3596             $next_tok  = $rtokens->[ $i + 1 ];
 3597             $next_type = $rtoken_type->[ $i + 1 ];
 3598 
 3599             DEBUG_TOKENIZE && do {
 3600                 local $" = ')(';
 3601                 my @debug_list = (
 3602                     $last_nonblank_token,      $tok,
 3603                     $next_tok,                 $brace_depth,
 3604                     $brace_type[$brace_depth], $paren_depth,
 3605                     $paren_type[$paren_depth]
 3606                 );
 3607                 print STDOUT "TOKENIZE:(@debug_list)\n";
 3608             };
 3609 
 3610             # Turn off attribute list on first non-blank, non-bareword.
 3611             # Added '#' to fix c038.
 3612             if ( $pre_type ne 'w' && $pre_type ne '#' ) {
 3613                 $in_attribute_list = 0;
 3614             }
 3615 
 3616             ###############################################################
 3617             # We have the next token, $tok.
 3618             # Now we have to examine this token and decide what it is
 3619             # and define its $type
 3620             #
 3621             # section 1: bare words
 3622             ###############################################################
 3623 
 3624             if ( $pre_type eq 'w' ) {
 3625                 $expecting =
 3626                   operator_expected( [ $prev_type, $tok, $next_type ] );
 3627 
 3628                 # Patch for c043, part 3: A bareword after '->' expects a TERM
 3629                 # FIXME: It would be cleaner to give method calls a new type 'M'
 3630                 # and update sub operator_expected to handle this.
 3631                 if ( $last_nonblank_type eq '->' ) {
 3632                     $expecting = TERM;
 3633                 }
 3634 
 3635                 my ( $next_nonblank_token, $i_next ) =
 3636                   find_next_nonblank_token( $i, $rtokens, $max_token_index );
 3637 
 3638                 # ATTRS: handle sub and variable attributes
 3639                 if ($in_attribute_list) {
 3640 
 3641                     # treat bare word followed by open paren like qw(
 3642                     if ( $next_nonblank_token eq '(' ) {
 3643 
 3644                         # For something like:
 3645                         #     : prototype($$)
 3646                         # we should let do_scan_sub see it so that it can see
 3647                         # the prototype.  All other attributes get parsed as a
 3648                         # quoted string.
 3649                         if ( $tok eq 'prototype' ) {
 3650                             $id_scan_state = 'prototype';
 3651 
 3652                             # start just after the word 'prototype'
 3653                             my $i_beg = $i + 1;
 3654                             ( $i, $tok, $type, $id_scan_state ) = do_scan_sub(
 3655                                 {
 3656                                     input_line      => $input_line,
 3657                                     i               => $i,
 3658                                     i_beg           => $i_beg,
 3659                                     tok             => $tok,
 3660                                     type            => $type,
 3661                                     rtokens         => $rtokens,
 3662                                     rtoken_map      => $rtoken_map,
 3663                                     id_scan_state   => $id_scan_state,
 3664                                     max_token_index => $max_token_index
 3665                                 }
 3666                             );
 3667 
 3668                    # If successful, mark as type 'q' to be consistent with other
 3669                    # attributes.  Note that type 'w' would also work.
 3670                             if ( $i > $i_beg ) {
 3671                                 $type = 'q';
 3672                                 next;
 3673                             }
 3674 
 3675                             # If not successful, continue and parse as a quote.
 3676                         }
 3677 
 3678                         # All other attribute lists must be parsed as quotes
 3679                         # (see 'signatures.t' for good examples)
 3680                         $in_quote                = $quote_items{'q'};
 3681                         $allowed_quote_modifiers = $quote_modifiers{'q'};
 3682                         $type                    = 'q';
 3683                         $quote_type              = 'q';
 3684                         next;
 3685                     }
 3686 
 3687                     # handle bareword not followed by open paren
 3688                     else {
 3689                         $type = 'w';
 3690                         next;
 3691                     }
 3692                 }
 3693 
 3694                 # quote a word followed by => operator
 3695                 # unless the word __END__ or __DATA__ and the only word on
 3696                 # the line.
 3697                 if ( !$is_END_or_DATA && $next_nonblank_token eq '=' ) {
 3698 
 3699                     if ( $rtokens->[ $i_next + 1 ] eq '>' ) {
 3700                         if ( $is_constant{$current_package}{$tok} ) {
 3701                             $type = 'C';
 3702                         }
 3703                         elsif ( $is_user_function{$current_package}{$tok} ) {
 3704                             $type = 'U';
 3705                             $prototype =
 3706                               $user_function_prototype{$current_package}{$tok};
 3707                         }
 3708                         elsif ( $tok =~ /^v\d+$/ ) {
 3709                             $type = 'v';
 3710                             report_v_string($tok);
 3711                         }
 3712                         else {
 3713 
 3714                            # Bareword followed by a fat comma ... see 'git18.in'
 3715                            # If tok is something like 'x17' then it could
 3716                            # actually be operator x followed by number 17.
 3717                            # For example, here:
 3718                            #     123x17 => [ 792, 1224 ],
 3719                            # (a key of 123 repeated 17 times, perhaps not
 3720                            # what was intended). We will mark x17 as type
 3721                            # 'n' and it will be split. If the previous token
 3722                            # was also a bareword then it is not very clear is
 3723                            # going on.  In this case we will not be sure that
 3724                            # an operator is expected, so we just mark it as a
 3725                            # bareword.  Perl is a little murky in what it does
 3726                            # with stuff like this, and its behavior can change
 3727                            # over time.  Something like
 3728                            #    a x18 => [792, 1224], will compile as
 3729                            # a key with 18 a's.  But something like
 3730                            #    push @array, a x18;
 3731                            # is a syntax error.
 3732                             if ( $expecting == OPERATOR && $tok =~ /^x\d+$/ ) {
 3733                                 $type = 'n';
 3734                             }
 3735                             else {
 3736 
 3737                                 # git #18
 3738                                 $type = 'w';
 3739                                 error_if_expecting_OPERATOR();
 3740                             }
 3741                         }
 3742 
 3743                         next;
 3744                     }
 3745                 }
 3746 
 3747      # quote a bare word within braces..like xxx->{s}; note that we
 3748      # must be sure this is not a structural brace, to avoid
 3749      # mistaking {s} in the following for a quoted bare word:
 3750      #     for(@[){s}bla}BLA}
 3751      # Also treat q in something like var{-q} as a bare word, not qoute operator
 3752                 if (
 3753                     $next_nonblank_token eq '}'
 3754                     && (
 3755                         $last_nonblank_type eq 'L'
 3756                         || (   $last_nonblank_type eq 'm'
 3757                             && $last_last_nonblank_type eq 'L' )
 3758                     )
 3759                   )
 3760                 {
 3761                     $type = 'w';
 3762                     next;
 3763                 }
 3764 
 3765                 # Scan a bare word following a -> as an identifir; it could
 3766                 # have a long package name.  Fixes c037, c041.
 3767                 if ( $last_nonblank_token eq '->' ) {
 3768                     scan_bare_identifier();
 3769 
 3770                     # Patch for c043, part 4; use type 'w' after a '->'.
 3771                     # This is just a safety check on sub scan_bare_identifier,
 3772                     # which should get this case correct.
 3773                     $type = 'w';
 3774                     next;
 3775                 }
 3776 
 3777                 # a bare word immediately followed by :: is not a keyword;
 3778                 # use $tok_kw when testing for keywords to avoid a mistake
 3779                 my $tok_kw = $tok;
 3780                 if (   $rtokens->[ $i + 1 ] eq ':'
 3781                     && $rtokens->[ $i + 2 ] eq ':' )
 3782                 {
 3783                     $tok_kw .= '::';
 3784                 }
 3785 
 3786                 # Decide if 'sub :' can be the start of a sub attribute list.
 3787                 # We will decide based on if the colon is followed by a
 3788                 # bareword which is not a keyword.
 3789                 my $sub_attribute_ok_here;
 3790                 if (   $is_sub{$tok_kw}
 3791                     && $expecting != OPERATOR
 3792                     && $next_nonblank_token eq ':' )
 3793                 {
 3794                     my ( $nn_nonblank_token, $i_nn ) =
 3795                       find_next_nonblank_token( $i_next + 1,
 3796                         $rtokens, $max_token_index );
 3797                     $sub_attribute_ok_here =
 3798                          $nn_nonblank_token =~ /^\w/
 3799                       && $nn_nonblank_token !~ /^\d/
 3800                       && !$is_keyword{$nn_nonblank_token};
 3801                 }
 3802 
 3803                 # handle operator x (now we know it isn't $x=)
 3804                 if (   $expecting == OPERATOR
 3805                     && substr( $tok, 0, 1 ) eq 'x'
 3806                     && $tok =~ /^x\d*$/ )
 3807                 {
 3808                     if ( $tok eq 'x' ) {
 3809 
 3810                         if ( $rtokens->[ $i + 1 ] eq '=' ) {    # x=
 3811                             $tok  = 'x=';
 3812                             $type = $tok;
 3813                             $i++;
 3814                         }
 3815                         else {
 3816                             $type = 'x';
 3817                         }
 3818                     }
 3819 
 3820                     # NOTE: mark something like x4 as an integer for now
 3821                     # It gets fixed downstream.  This is easier than
 3822                     # splitting the pretoken.
 3823                     else {
 3824                         $type = 'n';
 3825                     }
 3826                 }
 3827                 elsif ( $tok_kw eq 'CORE::' ) {
 3828                     $type = $tok = $tok_kw;
 3829                     $i += 2;
 3830                 }
 3831                 elsif ( ( $tok eq 'strict' )
 3832                     and ( $last_nonblank_token eq 'use' ) )
 3833                 {
 3834                     $tokenizer_self->[_saw_use_strict_] = 1;
 3835                     scan_bare_identifier();
 3836                 }
 3837 
 3838                 elsif ( ( $tok eq 'warnings' )
 3839                     and ( $last_nonblank_token eq 'use' ) )
 3840                 {
 3841                     $tokenizer_self->[_saw_perl_dash_w_] = 1;
 3842 
 3843                     # scan as identifier, so that we pick up something like:
 3844                     # use warnings::register
 3845                     scan_bare_identifier();
 3846                 }
 3847 
 3848                 elsif (
 3849                        $tok eq 'AutoLoader'
 3850                     && $tokenizer_self->[_look_for_autoloader_]
 3851                     && (
 3852                         $last_nonblank_token eq 'use'
 3853 
 3854                         # these regexes are from AutoSplit.pm, which we want
 3855                         # to mimic
 3856                         || $input_line =~ /^\s*(use|require)\s+AutoLoader\b/
 3857                         || $input_line =~ /\bISA\s*=.*\bAutoLoader\b/
 3858                     )
 3859                   )
 3860                 {
 3861                     write_logfile_entry("AutoLoader seen, -nlal deactivates\n");
 3862                     $tokenizer_self->[_saw_autoloader_]      = 1;
 3863                     $tokenizer_self->[_look_for_autoloader_] = 0;
 3864                     scan_bare_identifier();
 3865                 }
 3866 
 3867                 elsif (
 3868                        $tok eq 'SelfLoader'
 3869                     && $tokenizer_self->[_look_for_selfloader_]
 3870                     && (   $last_nonblank_token eq 'use'
 3871                         || $input_line =~ /^\s*(use|require)\s+SelfLoader\b/
 3872                         || $input_line =~ /\bISA\s*=.*\bSelfLoader\b/ )
 3873                   )
 3874                 {
 3875                     write_logfile_entry("SelfLoader seen, -nlsl deactivates\n");
 3876                     $tokenizer_self->[_saw_selfloader_]      = 1;
 3877                     $tokenizer_self->[_look_for_selfloader_] = 0;
 3878                     scan_bare_identifier();
 3879                 }
 3880 
 3881                 elsif ( ( $tok eq 'constant' )
 3882                     and ( $last_nonblank_token eq 'use' ) )
 3883                 {
 3884                     scan_bare_identifier();
 3885                     my ( $next_nonblank_token, $i_next ) =
 3886                       find_next_nonblank_token( $i, $rtokens,
 3887                         $max_token_index );
 3888 
 3889                     if ($next_nonblank_token) {
 3890 
 3891                         if ( $is_keyword{$next_nonblank_token} ) {
 3892 
 3893                             # Assume qw is used as a quote and okay, as in:
 3894                             #  use constant qw{ DEBUG 0 };
 3895                             # Not worth trying to parse for just a warning
 3896 
 3897                             # NOTE: This warning is deactivated because recent
 3898                             # versions of perl do not complain here, but
 3899                             # the coding is retained for reference.
 3900                             if ( 0 && $next_nonblank_token ne 'qw' ) {
 3901                                 warning(
 3902 "Attempting to define constant '$next_nonblank_token' which is a perl keyword\n"
 3903                                 );
 3904                             }
 3905                         }
 3906 
 3907                         else {
 3908                             $is_constant{$current_package}{$next_nonblank_token}
 3909                               = 1;
 3910                         }
 3911                     }
 3912                 }
 3913 
 3914                 # various quote operators
 3915                 elsif ( $is_q_qq_qw_qx_qr_s_y_tr_m{$tok} ) {
 3916 ##NICOL PATCH
 3917                     if ( $expecting == OPERATOR ) {
 3918 
 3919                         # Be careful not to call an error for a qw quote
 3920                         # where a parenthesized list is allowed.  For example,
 3921                         # it could also be a for/foreach construct such as
 3922                         #
 3923                         #    foreach my $key qw\Uno Due Tres Quadro\ {
 3924                         #        print "Set $key\n";
 3925                         #    }
 3926                         #
 3927 
 3928                         # Or it could be a function call.
 3929                         # NOTE: Braces in something like &{ xxx } are not
 3930                         # marked as a block, we might have a method call.
 3931                         # &method(...), $method->(..), &{method}(...),
 3932                         # $ref[2](list) is ok & short for $ref[2]->(list)
 3933                         #
 3934                         # See notes in 'sub code_block_type' and
 3935                         # 'sub is_non_structural_brace'
 3936 
 3937                         unless (
 3938                             $tok eq 'qw'
 3939                             && (   $last_nonblank_token =~ /^([\]\}\&]|\-\>)/
 3940                                 || $is_for_foreach{$want_paren} )
 3941                           )
 3942                         {
 3943                             error_if_expecting_OPERATOR();
 3944                         }
 3945                     }
 3946                     $in_quote                = $quote_items{$tok};
 3947                     $allowed_quote_modifiers = $quote_modifiers{$tok};
 3948 
 3949                    # All quote types are 'Q' except possibly qw quotes.
 3950                    # qw quotes are special in that they may generally be trimmed
 3951                    # of leading and trailing whitespace.  So they are given a
 3952                    # separate type, 'q', unless requested otherwise.
 3953                     $type =
 3954                       ( $tok eq 'qw' && $tokenizer_self->[_trim_qw_] )
 3955                       ? 'q'
 3956                       : 'Q';
 3957                     $quote_type = $type;
 3958                 }
 3959 
 3960                 # check for a statement label
 3961                 elsif (
 3962                        ( $next_nonblank_token eq ':' )
 3963                     && ( $rtokens->[ $i_next + 1 ] ne ':' )
 3964                     && ( $i_next <= $max_token_index )   # colon on same line
 3965                     && !$sub_attribute_ok_here           # like 'sub : lvalue' ?
 3966                     && label_ok()
 3967                   )
 3968                 {
 3969                     if ( $tok !~ /[A-Z]/ ) {
 3970                         push @{ $tokenizer_self->[_rlower_case_labels_at_] },
 3971                           $input_line_number;
 3972                     }
 3973                     $type = 'J';
 3974                     $tok .= ':';
 3975                     $i = $i_next;
 3976                     next;
 3977                 }
 3978 
 3979                 #      'sub' or alias
 3980                 elsif ( $is_sub{$tok_kw} ) {
 3981                     error_if_expecting_OPERATOR()
 3982                       if ( $expecting == OPERATOR );
 3983                     initialize_subname();
 3984                     scan_id();
 3985                 }
 3986 
 3987                 #      'package'
 3988                 elsif ( $is_package{$tok_kw} ) {
 3989                     error_if_expecting_OPERATOR()
 3990                       if ( $expecting == OPERATOR );
 3991                     scan_id();
 3992                 }
 3993 
 3994                 # Fix for c035: split 'format' from 'is_format_END_DATA' to be
 3995                 # more restrictive. Require a new statement to be ok here.
 3996                 elsif ( $tok_kw eq 'format' && new_statement_ok() ) {
 3997                     $type = ';';    # make tokenizer look for TERM next
 3998                     $tokenizer_self->[_in_format_] = 1;
 3999                     last;
 4000                 }
 4001 
 4002                 # Note on token types for format, __DATA__, __END__:
 4003                 # It simplifies things to give these type ';', so that when we
 4004                 # start rescanning we will be expecting a token of type TERM.
 4005                 # We will switch to type 'k' before outputting the tokens.
 4006                 elsif ( $is_END_DATA{$tok_kw} ) {
 4007                     $type = ';';    # make tokenizer look for TERM next
 4008 
 4009                     # Remember that we are in one of these three sections
 4010                     $tokenizer_self->[ $is_END_DATA{$tok_kw} ] = 1;
 4011                     last;
 4012                 }
 4013 
 4014                 elsif ( $is_keyword{$tok_kw} ) {
 4015                     $type = 'k';
 4016 
 4017                     # Since for and foreach may not be followed immediately
 4018                     # by an opening paren, we have to remember which keyword
 4019                     # is associated with the next '('
 4020                     if ( $is_for_foreach{$tok} ) {
 4021                         if ( new_statement_ok() ) {
 4022                             $want_paren = $tok;
 4023                         }
 4024                     }
 4025 
 4026                     # recognize 'use' statements, which are special
 4027                     elsif ( $is_use_require{$tok} ) {
 4028                         $statement_type = $tok;
 4029                         error_if_expecting_OPERATOR()
 4030                           if ( $expecting == OPERATOR );
 4031                     }
 4032 
 4033                     # remember my and our to check for trailing ": shared"
 4034                     elsif ( $is_my_our_state{$tok} ) {
 4035                         $statement_type = $tok;
 4036                     }
 4037 
 4038                     # Check for misplaced 'elsif' and 'else', but allow isolated
 4039                     # else or elsif blocks to be formatted.  This is indicated
 4040                     # by a last noblank token of ';'
 4041                     elsif ( $tok eq 'elsif' ) {
 4042                         if (   $last_nonblank_token ne ';'
 4043                             && $last_nonblank_block_type !~
 4044                             /^(if|elsif|unless)$/ )
 4045                         {
 4046                             warning(
 4047 "expecting '$tok' to follow one of 'if|elsif|unless'\n"
 4048                             );
 4049                         }
 4050                     }
 4051                     elsif ( $tok eq 'else' ) {
 4052 
 4053                         # patched for SWITCH/CASE
 4054                         if (
 4055                                $last_nonblank_token ne ';'
 4056                             && $last_nonblank_block_type !~
 4057                             /^(if|elsif|unless|case|when)$/
 4058 
 4059                             # patch to avoid an unwanted error message for
 4060                             # the case of a parenless 'case' (RT 105484):
 4061                             # switch ( 1 ) { case x { 2 } else { } }
 4062                             && $statement_type !~
 4063                             /^(if|elsif|unless|case|when)$/
 4064                           )
 4065                         {
 4066                             warning(
 4067 "expecting '$tok' to follow one of 'if|elsif|unless|case|when'\n"
 4068                             );
 4069                         }
 4070                     }
 4071                     elsif ( $tok eq 'continue' ) {
 4072                         if (   $last_nonblank_token ne ';'
 4073                             && $last_nonblank_block_type !~
 4074                             /(^(\{|\}|;|while|until|for|foreach)|:$)/ )
 4075                         {
 4076 
 4077                             # note: ';' '{' and '}' in list above
 4078                             # because continues can follow bare blocks;
 4079                             # ':' is labeled block
 4080                             #
 4081                             ############################################
 4082                             # NOTE: This check has been deactivated because
 4083                             # continue has an alternative usage for given/when
 4084                             # blocks in perl 5.10
 4085                             ## warning("'$tok' should follow a block\n");
 4086                             ############################################
 4087                         }
 4088                     }
 4089 
 4090                     # patch for SWITCH/CASE if 'case' and 'when are
 4091                     # treated as keywords.  Also 'default' for Switch::Plain
 4092                     elsif ($tok eq 'when'
 4093                         || $tok eq 'case'
 4094                         || $tok eq 'default' )
 4095                     {
 4096                         $statement_type = $tok;    # next '{' is block
 4097                     }
 4098 
 4099                     #
 4100                     # indent trailing if/unless/while/until
 4101                     # outdenting will be handled by later indentation loop
 4102 ## DEACTIVATED: unfortunately this can cause some unwanted indentation like:
 4103 ##$opt_o = 1
 4104 ##  if !(
 4105 ##             $opt_b
 4106 ##          || $opt_c
 4107 ##          || $opt_d
 4108 ##          || $opt_f
 4109 ##          || $opt_i
 4110 ##          || $opt_l
 4111 ##          || $opt_o
 4112 ##          || $opt_x
 4113 ##  );
 4114 ##                    if (   $tok =~ /^(if|unless|while|until)$/
 4115 ##                        && $next_nonblank_token ne '(' )
 4116 ##                    {
 4117 ##                        $indent_flag = 1;
 4118 ##                    }
 4119                 }
 4120 
 4121                 # check for inline label following
 4122                 #         /^(redo|last|next|goto)$/
 4123                 elsif (( $last_nonblank_type eq 'k' )
 4124                     && ( $is_redo_last_next_goto{$last_nonblank_token} ) )
 4125                 {
 4126                     $type = 'j';
 4127                     next;
 4128                 }
 4129 
 4130                 # something else --
 4131                 else {
 4132 
 4133                     scan_bare_identifier();
 4134 
 4135                     if (   $statement_type eq 'use'
 4136                         && $last_nonblank_token eq 'use' )
 4137                     {
 4138                         $saw_use_module{$current_package}->{$tok} = 1;
 4139                     }
 4140 
 4141                     if ( $type eq 'w' ) {
 4142 
 4143                         if ( $expecting == OPERATOR ) {
 4144 
 4145                             # Patch to avoid error message for RPerl overloaded
 4146                             # operator functions: use overload
 4147                             #    '+' => \&sse_add,
 4148                             #    '-' => \&sse_sub,
 4149                             #    '*' => \&sse_mul,
 4150                             #    '/' => \&sse_div;
 4151                             # FIXME: this should eventually be generalized
 4152                             if (   $saw_use_module{$current_package}->{'RPerl'}
 4153                                 && $tok =~ /^sse_(mul|div|add|sub)$/ )
 4154                             {
 4155 
 4156                             }
 4157 
 4158                             # Fix part 1 for git #63 in which a comment falls
 4159                             # between an -> and the following word.  An
 4160                             # alternate fix would be to change operator_expected
 4161                             # to return an UNKNOWN for this type.
 4162                             elsif ( $last_nonblank_type eq '->' ) {
 4163 
 4164                             }
 4165 
 4166                             # don't complain about possible indirect object
 4167                             # notation.
 4168                             # For example:
 4169                             #   package main;
 4170                             #   sub new($) { ... }
 4171                             #   $b = new A::;  # calls A::new
 4172                             #   $c = new A;    # same thing but suspicious
 4173                             # This will call A::new but we have a 'new' in
 4174                             # main:: which looks like a constant.
 4175                             #
 4176                             elsif ( $last_nonblank_type eq 'C' ) {
 4177                                 if ( $tok !~ /::$/ ) {
 4178                                     complain(<<EOM);
 4179 Expecting operator after '$last_nonblank_token' but found bare word '$tok'
 4180        Maybe indirectet object notation?
 4181 EOM
 4182                                 }
 4183                             }
 4184                             else {
 4185                                 error_if_expecting_OPERATOR("bareword");
 4186                             }
 4187                         }
 4188 
 4189                         # mark bare words immediately followed by a paren as
 4190                         # functions
 4191                         $next_tok = $rtokens->[ $i + 1 ];
 4192                         if ( $next_tok eq '(' ) {
 4193 
 4194                             # Fix part 2 for git #63.  Leave type as 'w' to keep
 4195                             # the type the same as if the -> were not separated
 4196                             $type = 'U' unless ( $last_nonblank_type eq '->' );
 4197                         }
 4198 
 4199                         # underscore after file test operator is file handle
 4200                         if ( $tok eq '_' && $last_nonblank_type eq 'F' ) {
 4201                             $type = 'Z';
 4202                         }
 4203 
 4204                         # patch for SWITCH/CASE if 'case' and 'when are
 4205                         # not treated as keywords:
 4206                         if (
 4207                             (
 4208                                    $tok eq 'case'
 4209                                 && $brace_type[$brace_depth] eq 'switch'
 4210                             )
 4211                             || (   $tok eq 'when'
 4212                                 && $brace_type[$brace_depth] eq 'given' )
 4213                           )
 4214                         {
 4215                             $statement_type = $tok;    # next '{' is block
 4216                             $type           = 'k'; # for keyword syntax coloring
 4217                         }
 4218 
 4219                         # patch for SWITCH/CASE if switch and given not keywords
 4220                         # Switch is not a perl 5 keyword, but we will gamble
 4221                         # and mark switch followed by paren as a keyword.  This
 4222                         # is only necessary to get html syntax coloring nice,
 4223                         # and does not commit this as being a switch/case.
 4224                         if ( $next_nonblank_token eq '('
 4225                             && ( $tok eq 'switch' || $tok eq 'given' ) )
 4226                         {
 4227                             $type = 'k';    # for keyword syntax coloring
 4228                         }
 4229                     }
 4230                 }
 4231             }
 4232 
 4233             ###############################################################
 4234             # section 2: strings of digits
 4235             ###############################################################
 4236             elsif ( $pre_type eq 'd' ) {
 4237                 $expecting =
 4238                   operator_expected( [ $prev_type, $tok, $next_type ] );
 4239                 error_if_expecting_OPERATOR("Number")
 4240                   if ( $expecting == OPERATOR );
 4241 
 4242                 my $number = scan_number_fast();
 4243                 if ( !defined($number) ) {
 4244 
 4245                     # shouldn't happen - we should always get a number
 4246                     warning("non-number beginning with digit--program bug\n");
 4247                     report_definite_bug();
 4248                 }
 4249             }
 4250 
 4251             ###############################################################
 4252             # section 3: all other tokens
 4253             ###############################################################
 4254 
 4255             else {
 4256                 last if ( $tok eq '#' );
 4257                 my $code = $tokenization_code->{$tok};
 4258                 if ($code) {
 4259                     $expecting =
 4260                       operator_expected( [ $prev_type, $tok, $next_type ] );
 4261                     $code->();
 4262                     redo if $in_quote;
 4263                 }
 4264             }
 4265         }
 4266 
 4267         # -----------------------------
 4268         # end of main tokenization loop
 4269         # -----------------------------
 4270 
 4271         if ( $i_tok >= 0 ) {
 4272             $routput_token_type->[$i_tok]     = $type;
 4273             $routput_block_type->[$i_tok]     = $block_type;
 4274             $routput_container_type->[$i_tok] = $container_type;
 4275             $routput_type_sequence->[$i_tok]  = $type_sequence;
 4276             $routput_indent_flag->[$i_tok]    = $indent_flag;
 4277         }
 4278 
 4279         unless ( ( $type eq 'b' ) || ( $type eq '#' ) ) {
 4280             $last_last_nonblank_token          = $last_nonblank_token;
 4281             $last_last_nonblank_type           = $last_nonblank_type;
 4282             $last_last_nonblank_block_type     = $last_nonblank_block_type;
 4283             $last_last_nonblank_container_type = $last_nonblank_container_type;
 4284             $last_last_nonblank_type_sequence  = $last_nonblank_type_sequence;
 4285             $last_nonblank_token               = $tok;
 4286             $last_nonblank_type                = $type;
 4287             $last_nonblank_block_type          = $block_type;
 4288             $last_nonblank_container_type      = $container_type;
 4289             $last_nonblank_type_sequence       = $type_sequence;
 4290             $last_nonblank_prototype           = $prototype;
 4291         }
 4292 
 4293         # reset indentation level if necessary at a sub or package
 4294         # in an attempt to recover from a nesting error
 4295         if ( $level_in_tokenizer < 0 ) {
 4296             if ( $input_line =~ /^\s*(sub|package)\s+(\w+)/ ) {
 4297                 reset_indentation_level(0);
 4298                 brace_warning("resetting level to 0 at $1 $2\n");
 4299             }
 4300         }
 4301 
 4302         # all done tokenizing this line ...
 4303         # now prepare the final list of tokens and types
 4304 
 4305         my @token_type     = ();   # stack of output token types
 4306         my @block_type     = ();   # stack of output code block types
 4307         my @container_type = ();   # stack of output code container types
 4308         my @type_sequence  = ();   # stack of output type sequence numbers
 4309         my @tokens         = ();   # output tokens
 4310         my @levels         = ();   # structural brace levels of output tokens
 4311         my @slevels        = ();   # secondary nesting levels of output tokens
 4312         my @nesting_tokens = ();   # string of tokens leading to this depth
 4313         my @nesting_types  = ();   # string of token types leading to this depth
 4314         my @nesting_blocks = ();   # string of block types leading to this depth
 4315         my @nesting_lists  = ();   # string of list types leading to this depth
 4316         my @ci_string = ();  # string needed to compute continuation indentation
 4317         my @container_environment = ();    # BLOCK or LIST
 4318         my $container_environment = '';
 4319         my $im                    = -1;    # previous $i value
 4320         my $num;
 4321 
 4322         # Count the number of '1's in the string (previously sub ones_count)
 4323         my $ci_string_sum = ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
 4324 
 4325 # Computing Token Indentation
 4326 #
 4327 #     The final section of the tokenizer forms tokens and also computes
 4328 #     parameters needed to find indentation.  It is much easier to do it
 4329 #     in the tokenizer than elsewhere.  Here is a brief description of how
 4330 #     indentation is computed.  Perl::Tidy computes indentation as the sum
 4331 #     of 2 terms:
 4332 #
 4333 #     (1) structural indentation, such as if/else/elsif blocks
 4334 #     (2) continuation indentation, such as long parameter call lists.
 4335 #
 4336 #     These are occasionally called primary and secondary indentation.
 4337 #
 4338 #     Structural indentation is introduced by tokens of type '{', although
 4339 #     the actual tokens might be '{', '(', or '['.  Structural indentation
 4340 #     is of two types: BLOCK and non-BLOCK.  Default structural indentation
 4341 #     is 4 characters if the standard indentation scheme is used.
 4342 #
 4343 #     Continuation indentation is introduced whenever a line at BLOCK level
 4344 #     is broken before its termination.  Default continuation indentation
 4345 #     is 2 characters in the standard indentation scheme.
 4346 #
 4347 #     Both types of indentation may be nested arbitrarily deep and
 4348 #     interlaced.  The distinction between the two is somewhat arbitrary.
 4349 #
 4350 #     For each token, we will define two variables which would apply if
 4351 #     the current statement were broken just before that token, so that
 4352 #     that token started a new line:
 4353 #
 4354 #     $level = the structural indentation level,
 4355 #     $ci_level = the continuation indentation level
 4356 #
 4357 #     The total indentation will be $level * (4 spaces) + $ci_level * (2 spaces),
 4358 #     assuming defaults.  However, in some special cases it is customary
 4359 #     to modify $ci_level from this strict value.
 4360 #
 4361 #     The total structural indentation is easy to compute by adding and
 4362 #     subtracting 1 from a saved value as types '{' and '}' are seen.  The
 4363 #     running value of this variable is $level_in_tokenizer.
 4364 #
 4365 #     The total continuation is much more difficult to compute, and requires
 4366 #     several variables.  These variables are:
 4367 #
 4368 #     $ci_string_in_tokenizer = a string of 1's and 0's indicating, for
 4369 #       each indentation level, if there are intervening open secondary
 4370 #       structures just prior to that level.
 4371 #     $continuation_string_in_tokenizer = a string of 1's and 0's indicating
 4372 #       if the last token at that level is "continued", meaning that it
 4373 #       is not the first token of an expression.
 4374 #     $nesting_block_string = a string of 1's and 0's indicating, for each
 4375 #       indentation level, if the level is of type BLOCK or not.
 4376 #     $nesting_block_flag = the most recent 1 or 0 of $nesting_block_string
 4377 #     $nesting_list_string = a string of 1's and 0's indicating, for each
 4378 #       indentation level, if it is appropriate for list formatting.
 4379 #       If so, continuation indentation is used to indent long list items.
 4380 #     $nesting_list_flag = the most recent 1 or 0 of $nesting_list_string
 4381 #     @{$rslevel_stack} = a stack of total nesting depths at each
 4382 #       structural indentation level, where "total nesting depth" means
 4383 #       the nesting depth that would occur if every nesting token -- '{', '[',
 4384 #       and '(' -- , regardless of context, is used to compute a nesting
 4385 #       depth.
 4386 
 4387         #my $nesting_block_flag = ($nesting_block_string =~ /1$/);
 4388         #my $nesting_list_flag = ($nesting_list_string =~ /1$/);
 4389 
 4390         my ( $ci_string_i, $level_i, $nesting_block_string_i,
 4391             $nesting_list_string_i, $nesting_token_string_i,
 4392             $nesting_type_string_i, );
 4393 
 4394         foreach my $i ( @{$routput_token_list} )
 4395         {    # scan the list of pre-tokens indexes
 4396 
 4397             # self-checking for valid token types
 4398             my $type                    = $routput_token_type->[$i];
 4399             my $forced_indentation_flag = $routput_indent_flag->[$i];
 4400 
 4401             # See if we should undo the $forced_indentation_flag.
 4402             # Forced indentation after 'if', 'unless', 'while' and 'until'
 4403             # expressions without trailing parens is optional and doesn't
 4404             # always look good.  It is usually okay for a trailing logical
 4405             # expression, but if the expression is a function call, code block,
 4406             # or some kind of list it puts in an unwanted extra indentation
 4407             # level which is hard to remove.
 4408             #
 4409             # Example where extra indentation looks ok:
 4410             # return 1
 4411             #   if $det_a < 0 and $det_b > 0
 4412             #       or $det_a > 0 and $det_b < 0;
 4413             #
 4414             # Example where extra indentation is not needed because
 4415             # the eval brace also provides indentation:
 4416             # print "not " if defined eval {
 4417             #     reduce { die if $b > 2; $a + $b } 0, 1, 2, 3, 4;
 4418             # };
 4419             #
 4420             # The following rule works fairly well:
 4421             #   Undo the flag if the end of this line, or start of the next
 4422             #   line, is an opening container token or a comma.
 4423             # This almost always works, but if not after another pass it will
 4424             # be stable.
 4425             if ( $forced_indentation_flag && $type eq 'k' ) {
 4426                 my $ixlast  = -1;
 4427                 my $ilast   = $routput_token_list->[$ixlast];
 4428                 my $toklast = $routput_token_type->[$ilast];
 4429                 if ( $toklast eq '#' ) {
 4430                     $ixlast--;
 4431                     $ilast   = $routput_token_list->[$ixlast];
 4432                     $toklast = $routput_token_type->[$ilast];
 4433                 }
 4434                 if ( $toklast eq 'b' ) {
 4435                     $ixlast--;
 4436                     $ilast   = $routput_token_list->[$ixlast];
 4437                     $toklast = $routput_token_type->[$ilast];
 4438                 }
 4439                 if ( $toklast =~ /^[\{,]$/ ) {
 4440                     $forced_indentation_flag = 0;
 4441                 }
 4442                 else {
 4443                     ( $toklast, my $i_next ) =
 4444                       find_next_nonblank_token( $max_token_index, $rtokens,
 4445                         $max_token_index );
 4446                     if ( $toklast =~ /^[\{,]$/ ) {
 4447                         $forced_indentation_flag = 0;
 4448                     }
 4449                 }
 4450             }
 4451 
 4452             # if we are already in an indented if, see if we should outdent
 4453             if ($indented_if_level) {
 4454 
 4455                 # don't try to nest trailing if's - shouldn't happen
 4456                 if ( $type eq 'k' ) {
 4457                     $forced_indentation_flag = 0;
 4458                 }
 4459 
 4460                 # check for the normal case - outdenting at next ';'
 4461                 elsif ( $type eq ';' ) {
 4462                     if ( $level_in_tokenizer == $indented_if_level ) {
 4463                         $forced_indentation_flag = -1;
 4464                         $indented_if_level       = 0;
 4465                     }
 4466                 }
 4467 
 4468                 # handle case of missing semicolon
 4469                 elsif ( $type eq '}' ) {
 4470                     if ( $level_in_tokenizer == $indented_if_level ) {
 4471                         $indented_if_level = 0;
 4472 
 4473                         # TBD: This could be a subroutine call
 4474                         $level_in_tokenizer--;
 4475                         if ( @{$rslevel_stack} > 1 ) {
 4476                             pop( @{$rslevel_stack} );
 4477                         }
 4478                         if ( length($nesting_block_string) > 1 )
 4479                         {    # true for valid script
 4480                             chop $nesting_block_string;
 4481                             chop $nesting_list_string;
 4482                         }
 4483 
 4484                     }
 4485                 }
 4486             }
 4487 
 4488             my $tok = $rtokens->[$i];  # the token, but ONLY if same as pretoken
 4489             $level_i = $level_in_tokenizer;
 4490 
 4491             # This can happen by running perltidy on non-scripts
 4492             # although it could also be bug introduced by programming change.
 4493             # Perl silently accepts a 032 (^Z) and takes it as the end
 4494             if ( !$is_valid_token_type{$type} ) {
 4495                 my $val = ord($type);
 4496                 warning(
 4497                     "unexpected character decimal $val ($type) in script\n");
 4498                 $tokenizer_self->[_in_error_] = 1;
 4499             }
 4500 
 4501             # ----------------------------------------------------------------
 4502             # TOKEN TYPE PATCHES
 4503             #  output __END__, __DATA__, and format as type 'k' instead of ';'
 4504             # to make html colors correct, etc.
 4505             my $fix_type = $type;
 4506             if ( $type eq ';' && $tok =~ /\w/ ) { $fix_type = 'k' }
 4507 
 4508             # output anonymous 'sub' as keyword
 4509             if ( $type eq 't' && $is_sub{$tok} ) { $fix_type = 'k' }
 4510 
 4511             # -----------------------------------------------------------------
 4512 
 4513             $nesting_token_string_i = $nesting_token_string;
 4514             $nesting_type_string_i  = $nesting_type_string;
 4515             $nesting_block_string_i = $nesting_block_string;
 4516             $nesting_list_string_i  = $nesting_list_string;
 4517 
 4518             # set primary indentation levels based on structural braces
 4519             # Note: these are set so that the leading braces have a HIGHER
 4520             # level than their CONTENTS, which is convenient for indentation
 4521             # Also, define continuation indentation for each token.
 4522             if ( $type eq '{' || $type eq 'L' || $forced_indentation_flag > 0 )
 4523             {
 4524 
 4525                 # use environment before updating
 4526                 $container_environment =
 4527                     $nesting_block_flag ? 'BLOCK'
 4528                   : $nesting_list_flag  ? 'LIST'
 4529                   :                       "";
 4530 
 4531                 # if the difference between total nesting levels is not 1,
 4532                 # there are intervening non-structural nesting types between
 4533                 # this '{' and the previous unclosed '{'
 4534                 my $intervening_secondary_structure = 0;
 4535                 if ( @{$rslevel_stack} ) {
 4536                     $intervening_secondary_structure =
 4537                       $slevel_in_tokenizer - $rslevel_stack->[-1];
 4538                 }
 4539 
 4540      # Continuation Indentation
 4541      #
 4542      # Having tried setting continuation indentation both in the formatter and
 4543      # in the tokenizer, I can say that setting it in the tokenizer is much,
 4544      # much easier.  The formatter already has too much to do, and can't
 4545      # make decisions on line breaks without knowing what 'ci' will be at
 4546      # arbitrary locations.
 4547      #
 4548      # But a problem with setting the continuation indentation (ci) here
 4549      # in the tokenizer is that we do not know where line breaks will actually
 4550      # be.  As a result, we don't know if we should propagate continuation
 4551      # indentation to higher levels of structure.
 4552      #
 4553      # For nesting of only structural indentation, we never need to do this.
 4554      # For example, in a long if statement, like this
 4555      #
 4556      #   if ( !$output_block_type[$i]
 4557      #     && ($in_statement_continuation) )
 4558      #   {           <--outdented
 4559      #       do_something();
 4560      #   }
 4561      #
 4562      # the second line has ci but we do normally give the lines within the BLOCK
 4563      # any ci.  This would be true if we had blocks nested arbitrarily deeply.
 4564      #
 4565      # But consider something like this, where we have created a break after
 4566      # an opening paren on line 1, and the paren is not (currently) a
 4567      # structural indentation token:
 4568      #
 4569      # my $file = $menubar->Menubutton(
 4570      #   qw/-text File -underline 0 -menuitems/ => [
 4571      #       [
 4572      #           Cascade    => '~View',
 4573      #           -menuitems => [
 4574      #           ...
 4575      #
 4576      # The second line has ci, so it would seem reasonable to propagate it
 4577      # down, giving the third line 1 ci + 1 indentation.  This suggests the
 4578      # following rule, which is currently used to propagating ci down: if there
 4579      # are any non-structural opening parens (or brackets, or braces), before
 4580      # an opening structural brace, then ci is propagated down, and otherwise
 4581      # not.  The variable $intervening_secondary_structure contains this
 4582      # information for the current token, and the string
 4583      # "$ci_string_in_tokenizer" is a stack of previous values of this
 4584      # variable.
 4585 
 4586                 # save the current states
 4587                 push( @{$rslevel_stack}, 1 + $slevel_in_tokenizer );
 4588                 $level_in_tokenizer++;
 4589 
 4590                 if ( $level_in_tokenizer > $tokenizer_self->[_maximum_level_] )
 4591                 {
 4592                     $tokenizer_self->[_maximum_level_] = $level_in_tokenizer;
 4593                 }
 4594 
 4595                 if ($forced_indentation_flag) {
 4596 
 4597                     # break BEFORE '?' when there is forced indentation
 4598                     if ( $type eq '?' ) { $level_i = $level_in_tokenizer; }
 4599                     if ( $type eq 'k' ) {
 4600                         $indented_if_level = $level_in_tokenizer;
 4601                     }
 4602 
 4603                     # do not change container environment here if we are not
 4604                     # at a real list. Adding this check prevents "blinkers"
 4605                     # often near 'unless" clauses, such as in the following
 4606                     # code:
 4607 ##          next
 4608 ##            unless -e (
 4609 ##                    $archive =
 4610 ##                      File::Spec->catdir( $_, "auto", $root, "$sub$lib_ext" )
 4611 ##            );
 4612 
 4613                     $nesting_block_string .= "$nesting_block_flag";
 4614                 }
 4615                 else {
 4616 
 4617                     if ( $routput_block_type->[$i] ) {
 4618                         $nesting_block_flag = 1;
 4619                         $nesting_block_string .= '1';
 4620                     }
 4621                     else {
 4622                         $nesting_block_flag = 0;
 4623                         $nesting_block_string .= '0';
 4624                     }
 4625                 }
 4626 
 4627                 # we will use continuation indentation within containers
 4628                 # which are not blocks and not logical expressions
 4629                 my $bit = 0;
 4630                 if ( !$routput_block_type->[$i] ) {
 4631 
 4632                     # propagate flag down at nested open parens
 4633                     if ( $routput_container_type->[$i] eq '(' ) {
 4634                         $bit = 1 if $nesting_list_flag;
 4635                     }
 4636 
 4637                   # use list continuation if not a logical grouping
 4638                   # /^(if|elsif|unless|while|and|or|not|&&|!|\|\||for|foreach)$/
 4639                     else {
 4640                         $bit = 1
 4641                           unless
 4642                           $is_logical_container{ $routput_container_type->[$i]
 4643                           };
 4644                     }
 4645                 }
 4646                 $nesting_list_string .= $bit;
 4647                 $nesting_list_flag = $bit;
 4648 
 4649                 $ci_string_in_tokenizer .=
 4650                   ( $intervening_secondary_structure != 0 ) ? '1' : '0';
 4651                 $ci_string_sum =
 4652                   ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
 4653                 $continuation_string_in_tokenizer .=
 4654                   ( $in_statement_continuation > 0 ) ? '1' : '0';
 4655 
 4656    #  Sometimes we want to give an opening brace continuation indentation,
 4657    #  and sometimes not.  For code blocks, we don't do it, so that the leading
 4658    #  '{' gets outdented, like this:
 4659    #
 4660    #   if ( !$output_block_type[$i]
 4661    #     && ($in_statement_continuation) )
 4662    #   {           <--outdented
 4663    #
 4664    #  For other types, we will give them continuation indentation.  For example,
 4665    #  here is how a list looks with the opening paren indented:
 4666    #
 4667    #     @LoL =
 4668    #       ( [ "fred", "barney" ], [ "george", "jane", "elroy" ],
 4669    #         [ "homer", "marge", "bart" ], );
 4670    #
 4671    #  This looks best when 'ci' is one-half of the indentation  (i.e., 2 and 4)
 4672 
 4673                 my $total_ci = $ci_string_sum;
 4674                 if (
 4675                     !$routput_block_type->[$i]    # patch: skip for BLOCK
 4676                     && ($in_statement_continuation)
 4677                     && !( $forced_indentation_flag && $type eq ':' )
 4678                   )
 4679                 {
 4680                     $total_ci += $in_statement_continuation
 4681                       unless ( substr( $ci_string_in_tokenizer, -1 ) eq '1' );
 4682                 }
 4683 
 4684                 $ci_string_i               = $total_ci;
 4685                 $in_statement_continuation = 0;
 4686             }
 4687 
 4688             elsif ($type eq '}'
 4689                 || $type eq 'R'
 4690                 || $forced_indentation_flag < 0 )
 4691             {
 4692 
 4693                 # only a nesting error in the script would prevent popping here
 4694                 if ( @{$rslevel_stack} > 1 ) { pop( @{$rslevel_stack} ); }
 4695 
 4696                 $level_i = --$level_in_tokenizer;
 4697 
 4698                 # restore previous level values
 4699                 if ( length($nesting_block_string) > 1 )
 4700                 {    # true for valid script
 4701                     chop $nesting_block_string;
 4702                     $nesting_block_flag =
 4703                       substr( $nesting_block_string, -1 ) eq '1';
 4704                     chop $nesting_list_string;
 4705                     $nesting_list_flag =
 4706                       substr( $nesting_list_string, -1 ) eq '1';
 4707 
 4708                     chop $ci_string_in_tokenizer;
 4709                     $ci_string_sum =
 4710                       ( my $str = $ci_string_in_tokenizer ) =~ tr/1/0/;
 4711 
 4712                     $in_statement_continuation =
 4713                       chop $continuation_string_in_tokenizer;
 4714 
 4715                     # zero continuation flag at terminal BLOCK '}' which
 4716                     # ends a statement.
 4717                     if ( $routput_block_type->[$i] ) {
 4718 
 4719                         # ...These include non-anonymous subs
 4720                         # note: could be sub ::abc { or sub 'abc
 4721                         if ( $routput_block_type->[$i] =~ m/^sub\s*/gc ) {
 4722 
 4723                          # note: older versions of perl require the /gc modifier
 4724                          # here or else the \G does not work.
 4725                             if ( $routput_block_type->[$i] =~ /\G('|::|\w)/gc )
 4726                             {
 4727                                 $in_statement_continuation = 0;
 4728                             }
 4729                         }
 4730 
 4731 # ...and include all block types except user subs with
 4732 # block prototypes and these: (sort|grep|map|do|eval)
 4733 # /^(\}|\{|BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|;|if|elsif|else|unless|while|until|for|foreach)$/
 4734                         elsif (
 4735                             $is_zero_continuation_block_type{
 4736                                 $routput_block_type->[$i]
 4737                             }
 4738                           )
 4739                         {
 4740                             $in_statement_continuation = 0;
 4741                         }
 4742 
 4743                         # ..but these are not terminal types:
 4744                         #     /^(sort|grep|map|do|eval)$/ )
 4745                         elsif (
 4746                             $is_not_zero_continuation_block_type{
 4747                                 $routput_block_type->[$i]
 4748                             }
 4749                           )
 4750                         {
 4751                         }
 4752 
 4753                         # ..and a block introduced by a label
 4754                         # /^\w+\s*:$/gc ) {
 4755                         elsif ( $routput_block_type->[$i] =~ /:$/ ) {
 4756                             $in_statement_continuation = 0;
 4757                         }
 4758 
 4759                         # user function with block prototype
 4760                         else {
 4761                             $in_statement_continuation = 0;
 4762                         }
 4763                     }
 4764 
 4765                     # If we are in a list, then
 4766                     # we must set continuation indentation at the closing
 4767                     # paren of something like this (paren after $check):
 4768                     #     assert(
 4769                     #         __LINE__,
 4770                     #         ( not defined $check )
 4771                     #           or ref $check
 4772                     #           or $check eq "new"
 4773                     #           or $check eq "old",
 4774                     #     );
 4775                     elsif ( $tok eq ')' ) {
 4776                         $in_statement_continuation = 1
 4777                           if $routput_container_type->[$i] =~ /^[;,\{\}]$/;
 4778                     }
 4779 
 4780                     elsif ( $tok eq ';' ) { $in_statement_continuation = 0 }
 4781                 }
 4782 
 4783                 # use environment after updating
 4784                 $container_environment =
 4785                     $nesting_block_flag ? 'BLOCK'
 4786                   : $nesting_list_flag  ? 'LIST'
 4787                   :                       "";
 4788                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
 4789                 $nesting_block_string_i = $nesting_block_string;
 4790                 $nesting_list_string_i  = $nesting_list_string;
 4791             }
 4792 
 4793             # not a structural indentation type..
 4794             else {
 4795 
 4796                 $container_environment =
 4797                     $nesting_block_flag ? 'BLOCK'
 4798                   : $nesting_list_flag  ? 'LIST'
 4799                   :                       "";
 4800 
 4801                 # zero the continuation indentation at certain tokens so
 4802                 # that they will be at the same level as its container.  For
 4803                 # commas, this simplifies the -lp indentation logic, which
 4804                 # counts commas.  For ?: it makes them stand out.
 4805                 if ($nesting_list_flag) {
 4806                     ##      $type =~ /^[,\?\:]$/
 4807                     if ( $is_comma_question_colon{$type} ) {
 4808                         $in_statement_continuation = 0;
 4809                     }
 4810                 }
 4811 
 4812                 # be sure binary operators get continuation indentation
 4813                 if (
 4814                     $container_environment
 4815                     && (   $type eq 'k' && $is_binary_keyword{$tok}
 4816                         || $is_binary_type{$type} )
 4817                   )
 4818                 {
 4819                     $in_statement_continuation = 1;
 4820                 }
 4821 
 4822                 # continuation indentation is sum of any open ci from previous
 4823                 # levels plus the current level
 4824                 $ci_string_i = $ci_string_sum + $in_statement_continuation;
 4825 
 4826                 # update continuation flag ...
 4827                 # if this isn't a blank or comment..
 4828                 if ( $type ne 'b' && $type ne '#' ) {
 4829 
 4830                     # and we are in a BLOCK
 4831                     if ($nesting_block_flag) {
 4832 
 4833                         # the next token after a ';' and label starts a new stmt
 4834                         if ( $type eq ';' || $type eq 'J' ) {
 4835                             $in_statement_continuation = 0;
 4836                         }
 4837 
 4838                         # otherwise, we are continuing the current statement
 4839                         else {
 4840                             $in_statement_continuation = 1;
 4841                         }
 4842                     }
 4843 
 4844                     # if we are not in a BLOCK..
 4845                     else {
 4846 
 4847                         # do not use continuation indentation if not list
 4848                         # environment (could be within if/elsif clause)
 4849                         if ( !$nesting_list_flag ) {
 4850                             $in_statement_continuation = 0;
 4851                         }
 4852 
 4853                         # otherwise, the token after a ',' starts a new term
 4854 
 4855                         # Patch FOR RT#99961; no continuation after a ';'
 4856                         # This is needed because perltidy currently marks
 4857                         # a block preceded by a type character like % or @
 4858                         # as a non block, to simplify formatting. But these
 4859                         # are actually blocks and can have semicolons.
 4860                         # See code_block_type() and is_non_structural_brace().
 4861                         elsif ( $type eq ',' || $type eq ';' ) {
 4862                             $in_statement_continuation = 0;
 4863                         }
 4864 
 4865                         # otherwise, we are continuing the current term
 4866                         else {
 4867                             $in_statement_continuation = 1;
 4868                         }
 4869                     }
 4870                 }
 4871             }
 4872 
 4873             if ( $level_in_tokenizer < 0 ) {
 4874                 unless ( $tokenizer_self->[_saw_negative_indentation_] ) {
 4875                     $tokenizer_self->[_saw_negative_indentation_] = 1;
 4876                     warning("Starting negative indentation\n");
 4877                 }
 4878             }
 4879 
 4880             # set secondary nesting levels based on all containment token types
 4881             # Note: these are set so that the nesting depth is the depth
 4882             # of the PREVIOUS TOKEN, which is convenient for setting
 4883             # the strength of token bonds
 4884             my $slevel_i = $slevel_in_tokenizer;
 4885 
 4886             #    /^[L\{\(\[]$/
 4887             if ( $is_opening_type{$type} ) {
 4888                 $slevel_in_tokenizer++;
 4889                 $nesting_token_string .= $tok;
 4890                 $nesting_type_string  .= $type;
 4891             }
 4892 
 4893             #       /^[R\}\)\]]$/
 4894             elsif ( $is_closing_type{$type} ) {
 4895                 $slevel_in_tokenizer--;
 4896                 my $char = chop $nesting_token_string;
 4897 
 4898                 if ( $char ne $matching_start_token{$tok} ) {
 4899                     $nesting_token_string .= $char . $tok;
 4900                     $nesting_type_string  .= $type;
 4901                 }
 4902                 else {
 4903                     chop $nesting_type_string;
 4904                 }
 4905             }
 4906 
 4907             push( @block_type,            $routput_block_type->[$i] );
 4908             push( @ci_string,             $ci_string_i );
 4909             push( @container_environment, $container_environment );
 4910             push( @container_type,        $routput_container_type->[$i] );
 4911             push( @levels,                $level_i );
 4912             push( @nesting_tokens,        $nesting_token_string_i );
 4913             push( @nesting_types,         $nesting_type_string_i );
 4914             push( @slevels,               $slevel_i );
 4915             push( @token_type,            $fix_type );
 4916             push( @type_sequence,         $routput_type_sequence->[$i] );
 4917             push( @nesting_blocks,        $nesting_block_string );
 4918             push( @nesting_lists,         $nesting_list_string );
 4919 
 4920             # now form the previous token
 4921             if ( $im >= 0 ) {
 4922                 $num =
 4923                   $rtoken_map->[$i] - $rtoken_map->[$im];  # how many characters
 4924 
 4925                 if ( $num > 0 ) {
 4926                     push( @tokens,
 4927                         substr( $input_line, $rtoken_map->[$im], $num ) );
 4928                 }
 4929             }
 4930             $im = $i;
 4931         }
 4932 
 4933         $num = length($input_line) - $rtoken_map->[$im];   # make the last token
 4934         if ( $num > 0 ) {
 4935             push( @tokens, substr( $input_line, $rtoken_map->[$im], $num ) );
 4936         }
 4937 
 4938         $tokenizer_self->[_in_attribute_list_] = $in_attribute_list;
 4939         $tokenizer_self->[_in_quote_]          = $in_quote;
 4940         $tokenizer_self->[_quote_target_] =
 4941           $in_quote ? matching_end_token($quote_character) : "";
 4942         $tokenizer_self->[_rhere_target_list_] = $rhere_target_list;
 4943 
 4944         $line_of_tokens->{_rtoken_type}            = \@token_type;
 4945         $line_of_tokens->{_rtokens}                = \@tokens;
 4946         $line_of_tokens->{_rblock_type}            = \@block_type;
 4947         $line_of_tokens->{_rcontainer_type}        = \@container_type;
 4948         $line_of_tokens->{_rcontainer_environment} = \@container_environment;
 4949         $line_of_tokens->{_rtype_sequence}         = \@type_sequence;
 4950         $line_of_tokens->{_rlevels}                = \@levels;
 4951         $line_of_tokens->{_rslevels}               = \@slevels;
 4952         $line_of_tokens->{_rnesting_tokens}        = \@nesting_tokens;
 4953         $line_of_tokens->{_rci_levels}             = \@ci_string;
 4954         $line_of_tokens->{_rnesting_blocks}        = \@nesting_blocks;
 4955 
 4956         return;
 4957     }
 4958 }    # end tokenize_this_line
 4959 
 4960 #########i#############################################################
 4961 # Tokenizer routines which assist in identifying token types
 4962 #######################################################################
 4963 
 4964 # hash lookup table of operator expected values
 4965 my %op_expected_table;
 4966 
 4967 # exceptions to perl's weird parsing rules after type 'Z'
 4968 my %is_weird_parsing_rule_exception;
 4969 
 4970 BEGIN {
 4971 
 4972     # Always expecting TERM following these types:
 4973     # note: this is identical to '@value_requestor_type' defined later.
 4974     my @q = qw(
 4975       ; ! + x & ?  F J - p / Y : % f U ~ A G j L * . | ^ < = [ m { \ > t
 4976       || >= != mm *= => .. !~ == && |= .= pp -= =~ += <= %= ^= x= ~~ ** << /=
 4977       &= // >> ~. &. |. ^.
 4978       ... **= <<= >>= &&= ||= //= <=> !~~ &.= |.= ^.= <<~
 4979     );
 4980     push @q, ',';
 4981     push @q, '(';    # for completeness, not currently a token type
 4982     @{op_expected_table}{@q} = (TERM) x scalar(@q);
 4983 
 4984     # Always UNKNOWN following these types:
 4985     # Fix for c030: added '->' to this list
 4986     @q = qw( w -> );
 4987     @{op_expected_table}{@q} = (UNKNOWN) x scalar(@q);
 4988 
 4989     # Always expecting OPERATOR ...
 4990     # 'n' and 'v' are currently excluded because they might be VERSION numbers
 4991     # 'i' is currently excluded because it might be a package
 4992     # 'q' is currently excluded because it might be a prototype
 4993     # Fix for c030: removed '->' from this list:
 4994     @q = qw( -- C h R ++ ] Q <> );    ## n v q i );
 4995     push @q, ')';
 4996     @{op_expected_table}{@q} = (OPERATOR) x scalar(@q);
 4997 
 4998     # Fix for git #62: added '*' and '%'
 4999     @q = qw( < ? * % );
 5000     @{is_weird_parsing_rule_exception}{@q} = (OPERATOR) x scalar(@q);
 5001 
 5002 }
 5003 
 5004 use constant DEBUG_OPERATOR_EXPECTED => 0;
 5005 
 5006 sub operator_expected {
 5007 
 5008     # Returns a parameter indicating what types of tokens can occur next
 5009 
 5010     # Call format:
 5011     #    $op_expected = operator_expected( [ $prev_type, $tok, $next_type ] );
 5012     # where
 5013     #    $prev_type is the type of the previous token (blank or not)
 5014     #    $tok is the current token
 5015     #    $next_type is the type of the next token (blank or not)
 5016 
 5017     # Many perl symbols have two or more meanings.  For example, '<<'
 5018     # can be a shift operator or a here-doc operator.  The
 5019     # interpretation of these symbols depends on the current state of
 5020     # the tokenizer, which may either be expecting a term or an
 5021     # operator.  For this example, a << would be a shift if an OPERATOR
 5022     # is expected, and a here-doc if a TERM is expected.  This routine
 5023     # is called to make this decision for any current token.  It returns
 5024     # one of three possible values:
 5025     #
 5026     #     OPERATOR - operator expected (or at least, not a term)
 5027     #     UNKNOWN  - can't tell
 5028     #     TERM     - a term is expected (or at least, not an operator)
 5029     #
 5030     # The decision is based on what has been seen so far.  This
 5031     # information is stored in the "$last_nonblank_type" and
 5032     # "$last_nonblank_token" variables.  For example, if the
 5033     # $last_nonblank_type is '=~', then we are expecting a TERM, whereas
 5034     # if $last_nonblank_type is 'n' (numeric), we are expecting an
 5035     # OPERATOR.
 5036     #
 5037     # If a UNKNOWN is returned, the calling routine must guess. A major
 5038     # goal of this tokenizer is to minimize the possibility of returning
 5039     # UNKNOWN, because a wrong guess can spoil the formatting of a
 5040     # script.
 5041     #
 5042     # Adding NEW_TOKENS: it is critically important that this routine be
 5043     # updated to allow it to determine if an operator or term is to be
 5044     # expected after the new token.  Doing this simply involves adding
 5045     # the new token character to one of the regexes in this routine or
 5046     # to one of the hash lists
 5047     # that it uses, which are initialized in the BEGIN section.
 5048     # USES GLOBAL VARIABLES: $last_nonblank_type, $last_nonblank_token,
 5049     # $statement_type
 5050 
 5051     # When possible, token types should be selected such that we can determine
 5052     # the 'operator_expected' value by a simple hash lookup.  If there are
 5053     # exceptions, that is an indication that a new type is needed.
 5054 
 5055     my ($rarg) = @_;
 5056 
 5057     my $msg = "";
 5058 
 5059     ##############
 5060     # Table lookup
 5061     ##############
 5062 
 5063     # Many types are can be obtained by a table lookup given the previous type.
 5064     # This typically handles half or more of the calls.
 5065     my $op_expected = $op_expected_table{$last_nonblank_type};
 5066     if ( defined($op_expected) ) {
 5067         $msg = "Table lookup";
 5068         goto RETURN;
 5069     }
 5070 
 5071     ######################
 5072     # Handle special cases
 5073     ######################
 5074 
 5075     $op_expected = UNKNOWN;
 5076     my ( $prev_type, $tok, $next_type ) = @{$rarg};
 5077 
 5078     # Types 'k', '}' and 'Z' depend on context
 5079     # FIXME: Types 'i', 'n', 'v', 'q' currently also temporarily depend on
 5080     # context but that dependence could eventually be eliminated with better
 5081     # token type definition
 5082 
 5083     # identifier...
 5084     if ( $last_nonblank_type eq 'i' ) {
 5085         $op_expected = OPERATOR;
 5086 
 5087         # FIXME: it would be cleaner to make this a special type
 5088         # expecting VERSION or {} after package NAMESPACE
 5089         # TODO: maybe mark these words as type 'Y'?
 5090         if (   $statement_type =~ /^package\b/
 5091             && $last_nonblank_token =~ /^package\b/ )
 5092         {
 5093             $op_expected = TERM;
 5094         }
 5095     }
 5096 
 5097     # keyword...
 5098     elsif ( $last_nonblank_type eq 'k' ) {
 5099         $op_expected = TERM;
 5100         if ( $expecting_operator_token{$last_nonblank_token} ) {
 5101             $op_expected = OPERATOR;
 5102         }
 5103         elsif ( $expecting_term_token{$last_nonblank_token} ) {
 5104 
 5105             # Exceptions from TERM:
 5106 
 5107             # // may follow perl functions which may be unary operators
 5108             # see test file dor.t (defined or);
 5109             if (
 5110                    $tok eq '/'
 5111                 && $next_type eq '/'
 5112                 && $is_keyword_rejecting_slash_as_pattern_delimiter{
 5113                     $last_nonblank_token}
 5114               )
 5115             {
 5116                 $op_expected = OPERATOR;
 5117             }
 5118 
 5119             # Patch to allow a ? following 'split' to be a depricated pattern
 5120             # delimiter.  This patch is coordinated with the omission of split
 5121             # from the list
 5122             # %is_keyword_rejecting_question_as_pattern_delimiter. This patch
 5123             # will force perltidy to guess.
 5124             elsif ($tok eq '?'
 5125                 && $last_nonblank_token eq 'split' )
 5126             {
 5127                 $op_expected = UNKNOWN;
 5128             }
 5129         }
 5130     } ## end type 'k'
 5131 
 5132     # closing container token...
 5133 
 5134     # Note that the actual token for type '}' may also be a ')'.
 5135 
 5136     # Also note that $last_nonblank_token is not the token corresponding to
 5137     # $last_nonblank_type when the type is a closing container.  In that
 5138     # case it is the token before the corresponding opening container token.
 5139     # So for example, for this snippet
 5140     #       $a = do { BLOCK } / 2;
 5141     # the $last_nonblank_token is 'do' when $last_nonblank_type eq '}'.
 5142 
 5143     elsif ( $last_nonblank_type eq '}' ) {
 5144         $op_expected = UNKNOWN;
 5145 
 5146         # handle something after 'do' and 'eval'
 5147         if ( $is_block_operator{$last_nonblank_token} ) {
 5148 
 5149             # something like $a = do { BLOCK } / 2;
 5150             $op_expected = OPERATOR;    # block mode following }
 5151         }
 5152 
 5153         elsif ( $last_nonblank_token =~ /^(\)|\$|\-\>)/ ) {
 5154             $op_expected = OPERATOR;
 5155             if ( $last_nonblank_token eq '$' ) { $op_expected = UNKNOWN }
 5156 
 5157         }
 5158 
 5159         # Check for smartmatch operator before preceding brace or square
 5160         # bracket.  For example, at the ? after the ] in the following
 5161         # expressions we are expecting an operator:
 5162         #
 5163         # qr/3/ ~~ ['1234'] ? 1 : 0;
 5164         # map { $_ ~~ [ '0', '1' ] ? 'x' : 'o' } @a;
 5165         elsif ( $last_nonblank_token eq '~~' ) {
 5166             $op_expected = OPERATOR;
 5167         }
 5168 
 5169         # A right brace here indicates the end of a simple block.  All
 5170         # non-structural right braces have type 'R' all braces associated with
 5171         # block operator keywords have been given those keywords as
 5172         # "last_nonblank_token" and caught above.  (This statement is order
 5173         # dependent, and must come after checking $last_nonblank_token).
 5174         else {
 5175 
 5176             # patch for dor.t (defined or).
 5177             if (   $tok eq '/'
 5178                 && $next_type eq '/'
 5179                 && $last_nonblank_token eq ']' )
 5180             {
 5181                 $op_expected = OPERATOR;
 5182             }
 5183 
 5184             # Patch for RT #116344: misparse a ternary operator after an
 5185             # anonymous hash, like this:
 5186             #   return ref {} ? 1 : 0;
 5187             # The right brace should really be marked type 'R' in this case,
 5188             # and it is safest to return an UNKNOWN here. Expecting a TERM will
 5189             # cause the '?' to always be interpreted as a pattern delimiter
 5190             # rather than introducing a ternary operator.
 5191             elsif ( $tok eq '?' ) {
 5192                 $op_expected = UNKNOWN;
 5193             }
 5194             else {
 5195                 $op_expected = TERM;
 5196             }
 5197         }
 5198     } ## end type '}'
 5199 
 5200     # number or v-string...
 5201     # An exception is for VERSION numbers a 'use' statement. It has the format
 5202     #     use Module VERSION LIST
 5203     # We could avoid this exception by writing a special sub to parse 'use'
 5204     # statements and perhaps mark these numbers with a new type V (for VERSION)
 5205     elsif ( $last_nonblank_type =~ /^[nv]$/ ) {
 5206         $op_expected = OPERATOR;
 5207         if ( $statement_type eq 'use' ) {
 5208             $op_expected = UNKNOWN;
 5209         }
 5210     }
 5211 
 5212     # quote...
 5213     # FIXME: labeled prototype words should probably be given type 'A' or maybe
 5214     # 'J'; not 'q'; or maybe mark as type 'Y'
 5215     elsif ( $last_nonblank_type eq 'q' ) {
 5216         $op_expected = OPERATOR;
 5217         if ( $last_nonblank_token eq 'prototype' )
 5218           ##|| $last_nonblank_token eq 'switch' )
 5219         {
 5220             $op_expected = TERM;
 5221         }
 5222     }
 5223 
 5224     # file handle or similar
 5225     elsif ( $last_nonblank_type eq 'Z' ) {
 5226 
 5227         $op_expected = UNKNOWN;
 5228 
 5229         # angle.t
 5230         if ( $last_nonblank_token =~ /^\w/ ) {
 5231             $op_expected = UNKNOWN;
 5232         }
 5233 
 5234         # The 'weird parsing rules' of next section do not work for '<' and '?'
 5235         # It is best to mark them as unknown.  Test case:
 5236         #  print $fh <DATA>;
 5237         elsif ( $is_weird_parsing_rule_exception{$tok} ) {
 5238             $op_expected = UNKNOWN;
 5239         }
 5240 
 5241         # For possible file handle like "$a", Perl uses weird parsing rules.
 5242         # For example:
 5243         # print $a/2,"/hi";   - division
 5244         # print $a / 2,"/hi"; - division
 5245         # print $a/ 2,"/hi";  - division
 5246         # print $a /2,"/hi";  - pattern (and error)!
 5247         # Some examples where this logic works okay, for '&','*','+':
 5248         #    print $fh &xsi_protos(@mods);
 5249         #    my $x = new $CompressClass *FH;
 5250         #    print $OUT +( $count % 15 ? ", " : "\n\t" );
 5251         elsif ($prev_type eq 'b'
 5252             && $next_type ne 'b' )
 5253         {
 5254             $op_expected = TERM;
 5255         }
 5256 
 5257         # Note that '?' and '<' have been moved above
 5258         # ( $tok =~ /^([x\/\+\-\*\%\&\.\?\<]|\>\>)$/ ) {
 5259         elsif ( $tok =~ /^([x\/\+\-\*\%\&\.]|\>\>)$/ ) {
 5260 
 5261             # Do not complain in 'use' statements, which have special syntax.
 5262             # For example, from RT#130344:
 5263             #   use lib $FindBin::Bin . '/lib';
 5264             if ( $statement_type ne 'use' ) {
 5265                 complain(
 5266 "operator in possible indirect object location not recommended\n"
 5267                 );
 5268             }
 5269             $op_expected = OPERATOR;
 5270         }
 5271     }
 5272 
 5273     # anything else...
 5274     else {
 5275         $op_expected = UNKNOWN;
 5276     }
 5277 
 5278   RETURN:
 5279 
 5280     DEBUG_OPERATOR_EXPECTED && do {
 5281         print STDOUT
 5282 "OPERATOR_EXPECTED: $msg: returns $op_expected for last type $last_nonblank_type token $last_nonblank_token\n";
 5283     };
 5284 
 5285     return $op_expected;
 5286 
 5287 } ## end of sub operator_expected
 5288 
 5289 sub new_statement_ok {
 5290 
 5291     # return true if the current token can start a new statement
 5292     # USES GLOBAL VARIABLES: $last_nonblank_type
 5293 
 5294     return label_ok()    # a label would be ok here
 5295 
 5296       || $last_nonblank_type eq 'J';    # or we follow a label
 5297 
 5298 }
 5299 
 5300 sub label_ok {
 5301 
 5302     # Decide if a bare word followed by a colon here is a label
 5303     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
 5304     # $brace_depth, @brace_type
 5305 
 5306     # if it follows an opening or closing code block curly brace..
 5307     if ( ( $last_nonblank_token eq '{' || $last_nonblank_token eq '}' )
 5308         && $last_nonblank_type eq $last_nonblank_token )
 5309     {
 5310 
 5311         # it is a label if and only if the curly encloses a code block
 5312         return $brace_type[$brace_depth];
 5313     }
 5314 
 5315     # otherwise, it is a label if and only if it follows a ';' (real or fake)
 5316     # or another label
 5317     else {
 5318         return ( $last_nonblank_type eq ';' || $last_nonblank_type eq 'J' );
 5319     }
 5320 }
 5321 
 5322 sub code_block_type {
 5323 
 5324     # Decide if this is a block of code, and its type.
 5325     # Must be called only when $type = $token = '{'
 5326     # The problem is to distinguish between the start of a block of code
 5327     # and the start of an anonymous hash reference
 5328     # Returns "" if not code block, otherwise returns 'last_nonblank_token'
 5329     # to indicate the type of code block.  (For example, 'last_nonblank_token'
 5330     # might be 'if' for an if block, 'else' for an else block, etc).
 5331     # USES GLOBAL VARIABLES: $last_nonblank_token, $last_nonblank_type,
 5332     # $last_nonblank_block_type, $brace_depth, @brace_type
 5333 
 5334     # handle case of multiple '{'s
 5335 
 5336 # print "BLOCK_TYPE EXAMINING: type=$last_nonblank_type tok=$last_nonblank_token\n";
 5337 
 5338     my ( $i, $rtokens, $rtoken_type, $max_token_index ) = @_;
 5339     if (   $last_nonblank_token eq '{'
 5340         && $last_nonblank_type eq $last_nonblank_token )
 5341     {
 5342 
 5343         # opening brace where a statement may appear is probably
 5344         # a code block but might be and anonymous hash reference
 5345         if ( $brace_type[$brace_depth] ) {
 5346             return decide_if_code_block( $i, $rtokens, $rtoken_type,
 5347                 $max_token_index );
 5348         }
 5349 
 5350         # cannot start a code block within an anonymous hash
 5351         else {
 5352             return "";
 5353         }
 5354     }
 5355 
 5356     elsif ( $last_nonblank_token eq ';' ) {
 5357 
 5358         # an opening brace where a statement may appear is probably
 5359         # a code block but might be and anonymous hash reference
 5360         return decide_if_code_block( $i, $rtokens, $rtoken_type,
 5361             $max_token_index );
 5362     }
 5363 
 5364     # handle case of '}{'
 5365     elsif ($last_nonblank_token eq '}'
 5366         && $last_nonblank_type eq $last_nonblank_token )
 5367     {
 5368 
 5369         # a } { situation ...
 5370         # could be hash reference after code block..(blktype1.t)
 5371         if ($last_nonblank_block_type) {
 5372             return decide_if_code_block( $i, $rtokens, $rtoken_type,
 5373                 $max_token_index );
 5374         }
 5375 
 5376         # must be a block if it follows a closing hash reference
 5377         else {
 5378             return $last_nonblank_token;
 5379         }
 5380     }
 5381 
 5382     ################################################################
 5383     # NOTE: braces after type characters start code blocks, but for
 5384     # simplicity these are not identified as such.  See also
 5385     # sub is_non_structural_brace.
 5386     ################################################################
 5387 
 5388 ##    elsif ( $last_nonblank_type eq 't' ) {
 5389 ##       return $last_nonblank_token;
 5390 ##    }
 5391 
 5392     # brace after label:
 5393     elsif ( $last_nonblank_type eq 'J' ) {
 5394         return $last_nonblank_token;
 5395     }
 5396 
 5397 # otherwise, look at previous token.  This must be a code block if
 5398 # it follows any of these:
 5399 # /^(BEGIN|END|CHECK|INIT|AUTOLOAD|DESTROY|UNITCHECK|continue|if|elsif|else|unless|do|while|until|eval|for|foreach|map|grep|sort)$/
 5400     elsif ( $is_code_block_token{$last_nonblank_token} ) {
 5401 
 5402         # Bug Patch: Note that the opening brace after the 'if' in the following
 5403         # snippet is an anonymous hash ref and not a code block!
 5404         #   print 'hi' if { x => 1, }->{x};
 5405         # We can identify this situation because the last nonblank type
 5406         # will be a keyword (instead of a closing peren)
 5407         if (   $last_nonblank_token =~ /^(if|unless)$/
 5408             && $last_nonblank_type eq 'k' )
 5409         {
 5410             return "";
 5411         }
 5412         else {
 5413             return $last_nonblank_token;
 5414         }
 5415     }
 5416 
 5417     # or a sub or package BLOCK
 5418     elsif ( ( $last_nonblank_type eq 'i' || $last_nonblank_type eq 't' )
 5419         && $last_nonblank_token =~ /^(sub|package)\b/ )
 5420     {
 5421         return $last_nonblank_token;
 5422     }
 5423 
 5424     # or a sub alias
 5425     elsif (( $last_nonblank_type eq 'i' || $last_nonblank