"Fossies" - the Fresh Open Source Software Archive

Member "formmail_modules-3.14m1/lib/CGI/NMS/Charset.pm" (11 Aug 2004, 7687 Bytes) of package /linux/www/old/formmail_modules-3.14m1.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Perl source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "Charset.pm" see the Fossies "Dox" file reference documentation.

    1 package CGI::NMS::Charset;
    2 use strict;
    3 
    4 require 5.00404;
    5 
    6 use vars qw($VERSION);
    7 $VERSION = sprintf '%d.%.2d', (q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/);
    8 
    9 =head1 NAME
   10 
   11 CGI::NMS::Charset - a charset-aware object for handling text strings
   12 
   13 =head1 SYNOPSIS
   14 
   15   my $cs = CGI::NMS::Charset->new('iso-8859-1');
   16 
   17   my $safe_to_put_in_html = $cs->escape($untrusted_user_input);
   18 
   19   my $printable = &{ $cs->strip_nonprint_coderef }( $input );
   20   my $escaped = &{ $cs->escape_html_coderef }( $printable );
   21 
   22 =head1 DESCRIPTION
   23 
   24 Each object of class C<CGI::NMS::Charset> is bound to a particular
   25 character set when it is created.  The object provides methods to
   26 generate coderefs to perform a couple of character set dependent
   27 operations on text strings.
   28 
   29 =cut
   30 
   31 =head1 CONSTRUCTORS
   32 
   33 =over
   34 
   35 =item new ( CHARSET )
   36 
   37 Creates a new C<CGI::NMS::Charset> object, suitable for handing text
   38 in the character set CHARSET.  The CHARSET parameter must be a
   39 character set string, such as C<us-ascii> or C<utf-8> for example.
   40 
   41 =cut
   42 
   43 sub new
   44 {
   45    my ($pkg, $charset) = @_;
   46 
   47    my $self = { CHARSET => $charset };
   48 
   49    if ($charset =~ /^utf-8$/i)
   50    {
   51       $self->{SN} = \&_strip_nonprint_utf8;
   52       $self->{EH} = \&_escape_html_utf8;
   53    }
   54    elsif ($charset =~ /^iso-8859/i)
   55    {
   56       $self->{SN} = \&_strip_nonprint_8859;
   57       if ($charset =~ /^iso-8859-1$/i)
   58       {
   59          $self->{EH} = \&_escape_html_8859_1;
   60       }
   61       else
   62       {
   63          $self->{EH} = \&_escape_html_8859;
   64       }
   65    }
   66    elsif ($charset =~ /^us-ascii$/i)
   67    {
   68       $self->{SN} = \&_strip_nonprint_ascii;
   69       $self->{EH} = \&_escape_html_8859_1;
   70    }
   71    else
   72    {
   73       $self->{SN} = \&_strip_nonprint_weak;
   74       $self->{EH} = \&_escape_html_weak;
   75    }
   76 
   77    return bless $self, $pkg;
   78 }
   79 
   80 =back
   81 
   82 =head1 METHODS
   83 
   84 =over
   85 
   86 =item charset ()
   87 
   88 Returns the CHARSET string that was passed to the constructor.
   89 
   90 =cut
   91 
   92 sub charset
   93 {
   94    my ($self) = @_;
   95 
   96    return $self->{CHARSET};
   97 }
   98 
   99 =item escape ( STRING )
  100 
  101 Returns a copy of STRING with runs of non-printable characters
  102 replaced with spaces and HTML metacharacters replaced with the
  103 equivalent entities.
  104 
  105 If STRING is undef then the empty string will be returned.
  106 
  107 =cut
  108 
  109 sub escape
  110 {
  111    my ($self, $string) = @_;
  112 
  113    return &{ $self->{EH} }(  &{ $self->{SN} }($string)  );
  114 }
  115 
  116 =item strip_nonprint_coderef ()
  117 
  118 Returns a reference to a sub to replace runs of non-printable
  119 characters with spaces, in a manner suited to the charset in
  120 use.
  121 
  122 The returned coderef points to a sub that takes a single readonly
  123 string argument and returns a modified version of the string.  If
  124 undef is passed to the function then the empty string will be
  125 returned.
  126 
  127 =cut
  128 
  129 sub strip_nonprint_coderef
  130 {
  131    my ($self) = @_;
  132 
  133    return $self->{SN};
  134 }
  135 
  136 =item escape_html_coderef ()
  137 
  138 Returns a reference to a sub to escape HTML metacharacters in
  139 a manner suited to the charset in use.
  140 
  141 The returned coderef points to a sub that takes a single readonly
  142 string argument and returns a modified version of the string.
  143 
  144 =cut
  145 
  146 sub escape_html_coderef
  147 {
  148    my ($self) = @_;
  149 
  150    return $self->{EH};
  151 }
  152 
  153 =back
  154 
  155 =head1 DATA TABLES
  156 
  157 =over
  158 
  159 =item C<%eschtml_map>
  160 
  161 The C<%eschtml_map> hash maps C<iso-8859-1> characters to the
  162 equivalent HTML entities.
  163 
  164 =cut
  165 
  166 use vars qw(%eschtml_map);
  167 %eschtml_map = ( 
  168                  ( map {chr($_) => "&#$_;"} (0..255) ),
  169                  '<' => '&lt;',
  170                  '>' => '&gt;',
  171                  '&' => '&amp;',
  172                  '"' => '&quot;',
  173                );
  174 
  175 =back
  176 
  177 =head1 PRIVATE FUNCTIONS
  178 
  179 These functions are returned by the strip_nonprint_coderef() and
  180 escape_html_coderef() methods and invoked by the escape() method.
  181 The function most appropriate to the character set in use will be
  182 chosen.
  183 
  184 =over
  185 
  186 =item _strip_nonprint_utf8
  187 
  188 Returns a copy of STRING with everything but printable C<us-ascii>
  189 characters and valid C<utf-8> multibyte sequences replaced with
  190 space characters.
  191 
  192 =cut
  193 
  194 sub _strip_nonprint_utf8
  195 {
  196    my ($string) = @_;
  197    return '' unless defined $string;
  198 
  199    $string =~
  200    s%
  201     ( [\t\n\040-\176]               # printable us-ascii
  202     | [\xC2-\xDF][\x80-\xBF]        # U+00000080 to U+000007FF
  203     | \xE0[\xA0-\xBF][\x80-\xBF]    # U+00000800 to U+00000FFF
  204     | [\xE1-\xEF][\x80-\xBF]{2}     # U+00001000 to U+0000FFFF
  205     | \xF0[\x90-\xBF][\x80-\xBF]{2} # U+00010000 to U+0003FFFF
  206     | [\xF1-\xF7][\x80-\xBF]{3}     # U+00040000 to U+001FFFFF
  207     | \xF8[\x88-\xBF][\x80-\xBF]{3} # U+00200000 to U+00FFFFFF
  208     | [\xF9-\xFB][\x80-\xBF]{4}     # U+01000000 to U+03FFFFFF
  209     | \xFC[\x84-\xBF][\x80-\xBF]{4} # U+04000000 to U+3FFFFFFF
  210     | \xFD[\x80-\xBF]{5}            # U+40000000 to U+7FFFFFFF
  211     ) | .
  212    %
  213     defined $1 ? $1 : ' '
  214    %gexs;
  215 
  216    #
  217    # U+FFFE, U+FFFF and U+D800 to U+DFFF are dangerous and
  218    # should be treated as invalid combinations, according to
  219    # http://www.cl.cam.ac.uk/~mgk25/unicode.html
  220    #
  221    $string =~ s%\xEF\xBF[\xBE-\xBF]% %g;
  222    $string =~ s%\xED[\xA0-\xBF][\x80-\xBF]% %g;
  223 
  224    return $string;
  225 }
  226 
  227 =item _escape_html_utf8 ( STRING )
  228 
  229 Returns a copy of STRING with any HTML metacharacters
  230 escaped.  Escapes all but the most commonly occurring C<us-ascii>
  231 characters and bytes that might form part of valid C<utf-8>
  232 multibyte sequences.
  233 
  234 =cut
  235 
  236 sub _escape_html_utf8
  237 {
  238    my ($string) = @_;
  239 
  240    $string =~ s|([^\w \t\r\n\-\.\,\x80-\xFD])| $eschtml_map{$1} |ge;
  241    return $string;
  242 }
  243 
  244 =item _strip_nonprint_weak ( STRING )
  245 
  246 Returns a copy of STRING with sequences of NULL characters
  247 replaced with space characters.
  248 
  249 =cut
  250 
  251 sub _strip_nonprint_weak
  252 {
  253    my ($string) = @_;
  254    return '' unless defined $string;
  255 
  256    $string =~ s/\0+/ /g;
  257    return $string;
  258 }
  259    
  260 =item _escape_html_weak ( STRING )
  261 
  262 Returns a copy of STRING with any HTML metacharacters escaped.
  263 In order to work in any charset, escapes only E<lt>, E<gt>, C<">
  264 and C<&> characters.
  265 
  266 =cut
  267 
  268 sub _escape_html_weak
  269 {
  270    my ($string) = @_;
  271 
  272    $string =~ s/[<>"&]/$eschtml_map{$1}/eg;
  273    return $string;
  274 }
  275 
  276 =item _escape_html_8859_1 ( STRING )
  277 
  278 Returns a copy of STRING with all but the most commonly
  279 occurring printable characters replaced with HTML entities.
  280 Only suitable for C<us-ascii> or C<iso-8859-1> input.
  281 
  282 =cut
  283 
  284 sub _escape_html_8859_1
  285 {
  286    my ($string) = @_;
  287 
  288    $string =~ s|([^\w \t\r\n\-\.\,\/\:])| $eschtml_map{$1} |ge;
  289    return $string;
  290 }
  291 
  292 =item _escape_html_8859 ( STRING )
  293 
  294 Returns a copy of STRING with all but the most commonly
  295 occurring printable C<us-ascii> characters and characters
  296 that might be printable in some C<iso-8859-*> charset
  297 replaced with HTML entities.
  298 
  299 =cut
  300 
  301 sub _escape_html_8859
  302 {
  303    my ($string) = @_;
  304 
  305    $string =~ s|([^\w \t\r\n\-\.\,\/\:\240-\377])| $eschtml_map{$1} |ge;
  306    return $string;
  307 }
  308 
  309 =item _strip_nonprint_8859 ( STRING )
  310 
  311 Returns a copy of STRING with runs of characters that are not
  312 printable in any C<iso-8859-*> charset replaced with spaces.
  313 
  314 =cut
  315 
  316 sub _strip_nonprint_8859
  317 {
  318    my ($string) = @_;
  319    return '' unless defined $string;
  320 
  321    $string =~ tr#\t\n\040-\176\240-\377# #cs;
  322    return $string;
  323 }
  324 
  325 =item _strip_nonprint_ascii ( STRING )
  326 
  327 Returns a copy of STRING with runs of characters that are not
  328 printable C<us-ascii> replaced with spaces.
  329 
  330 =cut
  331 
  332 sub _strip_nonprint_ascii
  333 {
  334    my ($string) = @_;
  335    return '' unless defined $string;
  336 
  337    $string =~ tr#\t\n\040-\176# #cs;
  338    return $string;
  339 }
  340 
  341 =back
  342 
  343 =head1 MAINTAINERS
  344 
  345 The NMS project, E<lt>http://nms-cgi.sourceforge.net/E<gt>
  346 
  347 To request support or report bugs, please email
  348 E<lt>nms-cgi-support@lists.sourceforge.netE<gt>
  349 
  350 =head1 COPYRIGHT
  351 
  352 Copyright 2002-2003 London Perl Mongers, All rights reserved
  353 
  354 =head1 LICENSE
  355 
  356 This module is free software; you are free to redistribute it
  357 and/or modify it under the same terms as Perl itself.
  358 
  359 =cut
  360 
  361 1;
  362