"Fossies" - the Fresh Open Source Software Archive

Member "install-tl-20200916/tlpkg/tlperl/lib/Encode/MIME/Header.pm" (10 Mar 2019, 16663 Bytes) of package /windows/misc/install-tl.zip:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file.

    1 package Encode::MIME::Header;
    2 use strict;
    3 use warnings;
    4 
    5 our $VERSION = do { my @r = ( q$Revision: 2.28 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
    6 
    7 use Carp ();
    8 use Encode ();
    9 use MIME::Base64 ();
   10 
   11 my %seed = (
   12     decode_b => 1,       # decodes 'B' encoding ?
   13     decode_q => 1,       # decodes 'Q' encoding ?
   14     encode   => 'B',     # encode with 'B' or 'Q' ?
   15     charset  => 'UTF-8', # encode charset
   16     bpl      => 75,      # bytes per line
   17 );
   18 
   19 my @objs;
   20 
   21 push @objs, bless {
   22     %seed,
   23     Name     => 'MIME-Header',
   24 } => __PACKAGE__;
   25 
   26 push @objs, bless {
   27     %seed,
   28     decode_q => 0,
   29     Name     => 'MIME-B',
   30 } => __PACKAGE__;
   31 
   32 push @objs, bless {
   33     %seed,
   34     decode_b => 0,
   35     encode   => 'Q',
   36     Name     => 'MIME-Q',
   37 } => __PACKAGE__;
   38 
   39 Encode::define_encoding($_, $_->{Name}) foreach @objs;
   40 
   41 use parent qw(Encode::Encoding);
   42 
   43 sub needs_lines { 1 }
   44 sub perlio_ok   { 0 }
   45 
   46 # RFC 2047 and RFC 2231 grammar
   47 my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
   48 my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
   49 my $re_encoding = qr/[QqBb]/;
   50 my $re_encoded_text = qr/[^\?]*/;
   51 my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/;
   52 my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/;
   53 my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/;
   54 
   55 # in strict mode check also for valid base64 characters and also for valid quoted printable codes
   56 my $re_encoding_strict_b = qr/[Bb]/;
   57 my $re_encoding_strict_q = qr/[Qq]/;
   58 my $re_encoded_text_strict_b = qr/[0-9A-Za-z\+\/]*={0,2}/;
   59 my $re_encoded_text_strict_q = qr/(?:[\x21-\x3C\x3E\x40-\x7E]|=[0-9A-Fa-f]{2})*/; # NOTE: first part are printable US-ASCII except ?, =, SPACE and TAB
   60 my $re_encoded_word_strict = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
   61 my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
   62 
   63 my $re_newline = qr/(?:\r\n|[\r\n])/;
   64 
   65 # in strict mode encoded words must be always separated by spaces or tabs (or folded newline)
   66 # except in comments when separator between words and comment round brackets can be omitted
   67 my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/;
   68 my $re_word_sep_strict = qr/(?:$re_newline?[ \t])+/;
   69 my $re_word_end_strict = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/;
   70 
   71 my $re_match = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/;
   72 my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/;
   73 
   74 my $re_capture = qr/$re_capture_encoded_word(?:\s*)?/;
   75 my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/;
   76 
   77 our $STRICT_DECODE = 0;
   78 
   79 sub decode($$;$) {
   80     my ($obj, $str, $chk) = @_;
   81     return undef unless defined $str;
   82 
   83     my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match;
   84     my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture;
   85 
   86     my $stop = 0;
   87     my $output = substr($str, 0, 0); # to propagate taintedness
   88 
   89     # decode each line separately, match whole continuous folded line at one call
   90     1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{
   91 
   92         my $line = $1;
   93         my $sep = defined $2 ? $2 : '';
   94 
   95         $stop = 1 unless length($line) or length($sep);
   96 
   97         # NOTE: this code partially could break $chk support
   98         # in non strict mode concat consecutive encoded mime words with same charset, language and encoding
   99         # fixes breaking inside multi-byte characters
  100         1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so;
  101 
  102         # process sequence of encoded MIME words at once
  103         1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{
  104 
  105             my $begin = $1 . $2;
  106             my $words = $3;
  107 
  108             $begin =~ tr/\r\n//d;
  109             $output .= $begin;
  110 
  111             # decode one MIME word
  112             1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{
  113 
  114                 $output .= $1;
  115                 my $orig = $2;
  116                 my $charset = $3;
  117                 my ($mime_enc, $text) = split /\?/, $5;
  118 
  119                 $text =~ tr/\r\n//d;
  120 
  121                 my $enc = Encode::find_mime_encoding($charset);
  122 
  123                 # in non strict mode allow also perl encoding aliases
  124                 if ( not defined $enc and not $STRICT_DECODE ) {
  125                     # make sure that decoded string will be always strict UTF-8
  126                     $charset = 'UTF-8' if lc($charset) eq 'utf8';
  127                     $enc = Encode::find_encoding($charset);
  128                 }
  129 
  130                 if ( not defined $enc ) {
  131                     Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
  132                     Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
  133                     $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
  134                     $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
  135                     $stop ? $orig : '';
  136                 } else {
  137                     if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) {
  138                         my $decoded = _decode_b($enc, $text, $chk);
  139                         $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
  140                         $output .= (defined $decoded ? $decoded : $text) unless $stop;
  141                         $stop ? $orig : '';
  142                     } elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) {
  143                         my $decoded = _decode_q($enc, $text, $chk);
  144                         $stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
  145                         $output .= (defined $decoded ? $decoded : $text) unless $stop;
  146                         $stop ? $orig : '';
  147                     } else {
  148                         Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
  149                         Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
  150                         $stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
  151                         $output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
  152                         $stop ? $orig : '';
  153                     }
  154                 }
  155 
  156             }se;
  157 
  158             if ( not $stop ) {
  159                 $output .= $words;
  160                 $words = '';
  161             }
  162 
  163             $words;
  164 
  165         }se;
  166 
  167         if ( not $stop ) {
  168             $line =~ tr/\r\n//d;
  169             $output .= $line . $sep;
  170             $line = '';
  171             $sep = '';
  172         }
  173 
  174         $line . $sep;
  175 
  176     }se;
  177 
  178     $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
  179     return $output;
  180 }
  181 
  182 sub _decode_b {
  183     my ($enc, $text, $chk) = @_;
  184     # MIME::Base64::decode ignores everything after a '=' padding character
  185     # in non strict mode split string after each sequence of padding characters and decode each substring
  186     my $octets = $STRICT_DECODE ?
  187         MIME::Base64::decode($text) :
  188         join('', map { MIME::Base64::decode($_) } split /(?<==)(?=[^=])/, $text);
  189     return _decode_octets($enc, $octets, $chk);
  190 }
  191 
  192 sub _decode_q {
  193     my ($enc, $text, $chk) = @_;
  194     $text =~ s/_/ /go;
  195     $text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/ego;
  196     return _decode_octets($enc, $text, $chk);
  197 }
  198 
  199 sub _decode_octets {
  200     my ($enc, $octets, $chk) = @_;
  201     $chk = 0 unless defined $chk;
  202     $chk &= ~Encode::LEAVE_SRC if not ref $chk and $chk;
  203     my $output = $enc->decode($octets, $chk);
  204     return undef if not ref $chk and $chk and $octets ne '';
  205     return $output;
  206 }
  207 
  208 sub encode($$;$) {
  209     my ($obj, $str, $chk) = @_;
  210     return undef unless defined $str;
  211     my $output = $obj->_fold_line($obj->_encode_string($str, $chk));
  212     $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
  213     return $output . substr($str, 0, 0); # to propagate taintedness
  214 }
  215 
  216 sub _fold_line {
  217     my ($obj, $line) = @_;
  218     my $bpl = $obj->{bpl};
  219     my $output = '';
  220 
  221     while ( length($line) ) {
  222         if ( $line =~ s/^(.{0,$bpl})(\s|\z)// ) {
  223             $output .= $1;
  224             $output .= "\r\n" . $2 if length($line);
  225         } elsif ( $line =~ s/(\s)(.*)$// ) {
  226             $output .= $line;
  227             $line = $2;
  228             $output .= "\r\n" . $1 if length($line);
  229         } else {
  230             $output .= $line;
  231             last;
  232         }
  233     }
  234 
  235     return $output;
  236 }
  237 
  238 sub _encode_string {
  239     my ($obj, $str, $chk) = @_;
  240     my $wordlen = $obj->{bpl} > 76 ? 76 : $obj->{bpl};
  241     my $enc = Encode::find_mime_encoding($obj->{charset});
  242     my $enc_chk = $chk;
  243     $enc_chk = 0 unless defined $enc_chk;
  244     $enc_chk |= Encode::LEAVE_SRC if not ref $enc_chk and $enc_chk;
  245     my @result = ();
  246     my $octets = '';
  247     while ( length( my $chr = substr($str, 0, 1, '') ) ) {
  248         my $seq = $enc->encode($chr, $enc_chk);
  249         if ( not length($seq) ) {
  250             substr($str, 0, 0, $chr);
  251             last;
  252         }
  253         if ( $obj->_encoded_word_len($octets . $seq) > $wordlen ) {
  254             push @result, $obj->_encode_word($octets);
  255             $octets = '';
  256         }
  257         $octets .= $seq;
  258     }
  259     length($octets) and push @result, $obj->_encode_word($octets);
  260     $_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
  261     return join(' ', @result);
  262 }
  263 
  264 sub _encode_word {
  265     my ($obj, $octets) = @_;
  266     my $charset = $obj->{charset};
  267     my $encode = $obj->{encode};
  268     my $text = $encode eq 'B' ? _encode_b($octets) : _encode_q($octets);
  269     return "=?$charset?$encode?$text?=";
  270 }
  271 
  272 sub _encoded_word_len {
  273     my ($obj, $octets) = @_;
  274     my $charset = $obj->{charset};
  275     my $encode = $obj->{encode};
  276     my $text_len = $encode eq 'B' ? _encoded_b_len($octets) : _encoded_q_len($octets);
  277     return length("=?$charset?$encode??=") + $text_len;
  278 }
  279 
  280 sub _encode_b {
  281     my ($octets) = @_;
  282     return MIME::Base64::encode($octets, '');
  283 }
  284 
  285 sub _encoded_b_len {
  286     my ($octets) = @_;
  287     return ( length($octets) + 2 ) / 3 * 4;
  288 }
  289 
  290 my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/;
  291 
  292 sub _encode_q {
  293     my ($octets) = @_;
  294     $octets =~ s{($re_invalid_q_char)}{
  295         join('', map { sprintf('=%02X', $_) } unpack('C*', $1))
  296     }egox;
  297     $octets =~ s/ /_/go;
  298     return $octets;
  299 }
  300 
  301 sub _encoded_q_len {
  302     my ($octets) = @_;
  303     my $invalid_count = () = $octets =~ /$re_invalid_q_char/sgo;
  304     return ( $invalid_count * 3 ) + ( length($octets) - $invalid_count );
  305 }
  306 
  307 1;
  308 __END__
  309 
  310 =head1 NAME
  311 
  312 Encode::MIME::Header -- MIME encoding for an unstructured email header
  313 
  314 =head1 SYNOPSIS
  315 
  316     use Encode qw(encode decode);
  317 
  318     my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}");
  319     # $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?="
  320 
  321     my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}");
  322     # $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?="
  323 
  324     my $str = decode("MIME-Header",
  325         "=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " .
  326         "=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?="
  327     );
  328     # $str is "If you can read this you understand the example."
  329 
  330     use Encode qw(decode :fallbacks);
  331     use Encode::MIME::Header;
  332     local $Encode::MIME::Header::STRICT_DECODE = 1;
  333     my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK);
  334     # use strict decoding and croak on errors
  335 
  336 =head1 ABSTRACT
  337 
  338 This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME
  339 encoding for an unstructured field body of the email header.  It can also be
  340 used for L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token.  However,
  341 it cannot be used directly for the whole header with the field name or for the
  342 structured header fields like From, To, Cc, Message-Id, etc...  There are 3
  343 encoding names supported by this module: C<MIME-Header>, C<MIME-B> and
  344 C<MIME-Q>.
  345 
  346 =head1 DESCRIPTION
  347 
  348 Decode method takes an unstructured field body of the email header (or
  349 L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and
  350 decodes each MIME encoded-word from input string to a sequence of bytes
  351 according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
  352 L<RFC 2231|https://tools.ietf.org/html/rfc2231>.  Subsequently, each sequence
  353 of bytes with the corresponding MIME charset is decoded with
  354 L<the Encode module|Encode> and finally, one output string is returned.  Text
  355 parts of the input string which do not contain MIME encoded-word stay
  356 unmodified in the output string.  Folded newlines between two consecutive MIME
  357 encoded-words are discarded, others are preserved in the output string.
  358 C<MIME-B> can decode Base64 variant, C<MIME-Q> can decode Quoted-Printable
  359 variant and C<MIME-Header> can decode both of them.  If L<Encode module|Encode>
  360 does not support particular MIME charset or chosen variant then an action based
  361 on L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the
  362 MIME encoded-word is not decoded).
  363 
  364 Encode method takes a scalar string as its input and uses
  365 L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8
  366 bytes.  Then a sequence of UTF-8 bytes is encoded into MIME encoded-words
  367 (C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a
  368 Quoted-Printable variant) where each MIME encoded-word is limited to 75
  369 characters.  MIME encoded-words are separated by C<CRLF SPACE> and joined to
  370 one output string.  Output string is suitable for unstructured field body of
  371 the email header.
  372 
  373 Both encode and decode methods propagate
  374 L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the
  375 MIME charset.
  376 
  377 =head1 BUGS
  378 
  379 Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder
  380 and encoder.  The MIME encoder infamously inserted additional spaces or
  381 discarded white spaces between consecutive MIME encoded-words, which led to
  382 invalid MIME headers produced by this module.  The MIME decoder had a tendency
  383 to discard white spaces, incorrectly interpret data or attempt to decode Base64
  384 MIME encoded-words as Quoted-Printable.  These problems were fixed in version
  385 2.22.  It is highly recommended not to use any version prior 2.22!
  386 
  387 Versions prior to 2.24 (part of Encode 2.87) ignored
  388 L<CHECK flags|Encode/Handling Malformed Data>.  The MIME encoder used
  389 L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode
  390 strings which could lead to invalid UTF-8 sequences.  MIME decoder used also
  391 L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and additionally
  392 called the decode method with a C<Encode::FB_PERLQQ> flag (thus user-specified
  393 L<CHECK flags|Encode/Handling Malformed Data> were ignored).  Moreover, it
  394 automatically croaked when a MIME encoded-word contained unknown encoding.
  395 Since version 2.24, this module uses
  396 L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder.  And
  397 L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated.
  398 
  399 Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully
  400 compliant to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
  401 L<RFC 2231|https://tools.ietf.org/html/rfc2231>.  Due to the aforementioned
  402 bugs in previous versions of the MIME encoder, there is a I<less strict>
  403 compatible mode for the MIME decoder which is used by default.  It should be
  404 able to decode MIME encoded-words encoded by pre 2.22 versions of this module.
  405 However, note that this is not correct according to
  406 L<RFC 2047|https://tools.ietf.org/html/rfc2047>.
  407 
  408 In default I<not strict> mode the MIME decoder attempts to decode every substring
  409 which looks like a MIME encoded-word.  Therefore, the MIME encoded-words do not
  410 need to be separated by white space.  To enforce a correct I<strict> mode, set
  411 variable C<$Encode::MIME::Header::STRICT_DECODE> to 1 e.g. by localizing:
  412 
  413   use Encode::MIME::Header;
  414   local $Encode::MIME::Header::STRICT_DECODE = 1;
  415 
  416 =head1 AUTHORS
  417 
  418 Pali E<lt>pali@cpan.orgE<gt>
  419 
  420 =head1 SEE ALSO
  421 
  422 L<Encode>,
  423 L<RFC 822|https://tools.ietf.org/html/rfc822>,
  424 L<RFC 2047|https://tools.ietf.org/html/rfc2047>,
  425 L<RFC 2231|https://tools.ietf.org/html/rfc2231>
  426 
  427 =cut