"Fossies" - the Fresh Open Source Software Archive

Member "install-tl-20200916/tlpkg/tlperl/lib/Encode/CN/HZ.pm" (10 Mar 2019, 6084 Bytes) of package /windows/misc/install-tl.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 package Encode::CN::HZ;
    2 
    3 use strict;
    4 use warnings;
    5 use utf8 ();
    6 
    7 use vars qw($VERSION);
    8 $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
    9 
   10 use Encode qw(:fallbacks);
   11 
   12 use parent qw(Encode::Encoding);
   13 __PACKAGE__->Define('hz');
   14 
   15 # HZ is a combination of ASCII and escaped GB, so we implement it
   16 # with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
   17 
   18 # not ported for EBCDIC.  Which should be used, "~" or "\x7E"?
   19 
   20 sub needs_lines { 1 }
   21 
   22 sub decode ($$;$) {
   23     my ( $obj, $str, $chk ) = @_;
   24     return undef unless defined $str;
   25 
   26     my $GB  = Encode::find_encoding('gb2312-raw');
   27     my $ret = substr($str, 0, 0); # to propagate taintedness
   28     my $in_ascii = 1;    # default mode is ASCII.
   29 
   30     while ( length $str ) {
   31         if ($in_ascii) {    # ASCII mode
   32             if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) {    # no '~' => ASCII
   33                 $ret .= $1;
   34 
   35                 # EBCDIC should need ascii2native, but not ported.
   36             }
   37             elsif ( $str =~ s/^\x7E\x7E// ) {           # escaped tilde
   38                 $ret .= '~';
   39             }
   40             elsif ( $str =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
   41                 1;                              # no-op
   42             }
   43             elsif ( $str =~ s/^\x7E\x7B// ) {    # '~{'
   44                 $in_ascii = 0;                   # to GB
   45             }
   46             else {    # encounters an invalid escape, \x80 or greater
   47                 last;
   48             }
   49         }
   50         else {        # GB mode; the byte ranges are as in RFC 1843.
   51             no warnings 'uninitialized';
   52             if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
   53                 my $prefix = $1;
   54                 $ret .= $GB->decode( $prefix, $chk );
   55             }
   56             elsif ( $str =~ s/^\x7E\x7D// ) {    # '~}'
   57                 $in_ascii = 1;
   58             }
   59             else {                               # invalid
   60                 last;
   61             }
   62         }
   63     }
   64     $_[1] = '' if $chk;    # needs_lines guarantees no partial character
   65     return $ret;
   66 }
   67 
   68 sub cat_decode {
   69     my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
   70     my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
   71 
   72     my $GB  = Encode::find_encoding('gb2312-raw');
   73     my $ret = '';
   74     my $in_ascii = 1;      # default mode is ASCII.
   75 
   76     my $ini_pos = pos($$rsrc);
   77 
   78     substr( $src, 0, $pos ) = '';
   79 
   80     my $ini_len = bytes::length($src);
   81 
   82     # $trm is the first of the pair '~~', then 2nd tilde is to be removed.
   83     # XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
   84     $src =~ s/^\x7E// if $trm eq "\x7E";
   85 
   86     while ( length $src ) {
   87         my $now;
   88         if ($in_ascii) {    # ASCII mode
   89             if ( $src =~ s/^([\x00-\x7D\x7F])// ) {    # no '~' => ASCII
   90                 $now = $1;
   91             }
   92             elsif ( $src =~ s/^\x7E\x7E// ) {          # escaped tilde
   93                 $now = '~';
   94             }
   95             elsif ( $src =~ s/^\x7E\cJ// ) {    # '\cJ' == LF in ASCII
   96                 next;
   97             }
   98             elsif ( $src =~ s/^\x7E\x7B// ) {    # '~{'
   99                 $in_ascii = 0;                   # to GB
  100                 next;
  101             }
  102             else {    # encounters an invalid escape, \x80 or greater
  103                 last;
  104             }
  105         }
  106         else {        # GB mode; the byte ranges are as in RFC 1843.
  107             if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
  108                 $now = $GB->decode( $1, $chk );
  109             }
  110             elsif ( $src =~ s/^\x7E\x7D// ) {    # '~}'
  111                 $in_ascii = 1;
  112                 next;
  113             }
  114             else {                               # invalid
  115                 last;
  116             }
  117         }
  118 
  119         next if !defined $now;
  120 
  121         $ret .= $now;
  122 
  123         if ( $now eq $trm ) {
  124             $$rdst .= $ret;
  125             $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
  126             pos($$rsrc) = $ini_pos;
  127             return 1;
  128         }
  129     }
  130 
  131     $$rdst .= $ret;
  132     $$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
  133     pos($$rsrc) = $ini_pos;
  134     return '';    # terminator not found
  135 }
  136 
  137 sub encode($$;$) {
  138      my ( $obj, $str, $chk ) = @_;
  139     return undef unless defined $str;
  140 
  141     my $GB  = Encode::find_encoding('gb2312-raw');
  142     my $ret = substr($str, 0, 0); # to propagate taintedness;
  143     my $in_ascii = 1;    # default mode is ASCII.
  144 
  145     no warnings 'utf8';  # $str may be malformed UTF8 at the end of a chunk.
  146 
  147     while ( length $str ) {
  148         if ( $str =~ s/^([[:ascii:]]+)// ) {
  149             my $tmp = $1;
  150             $tmp =~ s/~/~~/g;    # escapes tildes
  151             if ( !$in_ascii ) {
  152                 $ret .= "\x7E\x7D";    # '~}'
  153                 $in_ascii = 1;
  154             }
  155             $ret .= pack 'a*', $tmp;    # remove UTF8 flag.
  156         }
  157         elsif ( $str =~ s/(.)// ) {
  158             my $s = $1;
  159             my $tmp = $GB->encode( $s, $chk || 0 );
  160             last if !defined $tmp;
  161             if ( length $tmp == 2 ) {    # maybe a valid GB char (XXX)
  162                 if ($in_ascii) {
  163                     $ret .= "\x7E\x7B";    # '~{'
  164                     $in_ascii = 0;
  165                 }
  166                 $ret .= $tmp;
  167             }
  168             elsif ( length $tmp ) {        # maybe FALLBACK in ASCII (XXX)
  169                 if ( !$in_ascii ) {
  170                     $ret .= "\x7E\x7D";    # '~}'
  171                     $in_ascii = 1;
  172                 }
  173                 $ret .= $tmp;
  174             }
  175         }
  176         else {    # if $str is malformed UTF8 *and* if length $str != 0.
  177             last;
  178         }
  179     }
  180     $_[1] = $str if $chk;
  181 
  182     # The state at the end of the chunk is discarded, even if in GB mode.
  183     # That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
  184     # Parhaps it is harmless, but further investigations may be required...
  185 
  186     if ( !$in_ascii ) {
  187         $ret .= "\x7E\x7D";    # '~}'
  188         $in_ascii = 1;
  189     }
  190     utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
  191     return $ret;
  192 }
  193 
  194 1;
  195 __END__
  196 
  197 =head1 NAME
  198 
  199 Encode::CN::HZ -- internally used by Encode::CN
  200 
  201 =cut