"Fossies" - the Fresh Open Source Software Archive

Member "detox-1.4.5/src/clean_string.c" (15 Aug 2021, 14275 Bytes) of package /linux/privat/detox-1.4.5.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "clean_string.c", see the Fossies "Dox" file reference documentation.

    1 /**
    2  * This file is part of the Detox package.
    3  *
    4  * Copyright (c) Doug Harple <detox.dharple@gmail.com>
    5  *
    6  * For the full copyright and license information, please view the LICENSE
    7  * file that was distributed with this source code.
    8  */
    9 
   10 #include "config.h"
   11 
   12 #include <stdio.h>
   13 #include <stdlib.h>
   14 #include <string.h>
   15 #include <ctype.h>
   16 #include <errno.h>
   17 
   18 #include "clean_string.h"
   19 
   20 /* translation array for ISO8859.1 characters */
   21 #include "iso8859_1.h"
   22 
   23 /* translation array for unicode characters */
   24 #include "unicode.h"
   25 
   26 #include "parse_table.h"
   27 #include "table.h"
   28 
   29 
   30 /*
   31  * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
   32  */
   33 unsigned char *clean_iso8859_1_basic(unsigned char *s, void *opts)
   34 {
   35     unsigned char *output, *input_walk, *output_walk, *replace_walk;
   36     int replace_pos;
   37 
   38     if (s == NULL) {
   39         return NULL;
   40     }
   41 
   42     output = malloc((strlen(s) * ISO8859_1_MAXLEN) + 1);
   43     if (output == NULL) {
   44         fprintf(stderr, "out of memory: %s\n", strerror(errno));
   45         return NULL;
   46     }
   47 
   48     input_walk = s;
   49     output_walk = output;
   50 
   51     while (*input_walk != '\0') {
   52         if (*input_walk >= ISO8859_1_OFFSET) {
   53             replace_pos = *input_walk - ISO8859_1_OFFSET;
   54             replace_walk = (unsigned char *)&iso8859_1_trans[replace_pos];
   55 
   56             while (*replace_walk != '\0') {
   57                 *output_walk++ = *replace_walk++;
   58             }
   59             input_walk++;
   60         }
   61         else {
   62             *output_walk++ = *input_walk++;
   63         }
   64     }
   65 
   66     *output_walk = '\0';
   67 
   68     return output;
   69 }
   70 
   71 /*
   72  * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
   73  */
   74 unsigned char *clean_iso8859_1(unsigned char *s, void *opts)
   75 {
   76     unsigned char *output, *input_walk, *output_walk, *replace_walk;
   77 
   78     struct translation_table *table = NULL;
   79     struct clean_string_options *options = NULL;
   80 
   81     if (s == NULL) {
   82         return NULL;
   83     }
   84 
   85     if (opts == NULL) {
   86         fprintf(stderr, "this shouldn't happen\n");
   87         exit(EXIT_FAILURE);
   88     }
   89 
   90     options = (struct clean_string_options *)opts;
   91     table = options->translation_table;
   92 
   93     output = malloc((strlen(s) * table->max_data_length) + 1);
   94     if (output == NULL) {
   95         fprintf(stderr, "out of memory: %s\n", strerror(errno));
   96         return NULL;
   97     }
   98 
   99     input_walk = s;
  100     output_walk = output;
  101 
  102     while (*input_walk != '\0') {
  103         if (*input_walk >= ISO8859_1_OFFSET) {
  104             replace_walk = table_get(table, *input_walk);
  105             if (replace_walk == NULL) {
  106                 if (table->default_translation == NULL) {
  107                     /*
  108                      * Null translation == leave it alone
  109                      */
  110                     *output_walk++ = *input_walk++;
  111                     continue;
  112                 }
  113                 else {
  114                     replace_walk = table->default_translation;
  115                 }
  116             }
  117 
  118             while (*replace_walk != '\0') {
  119                 *output_walk++ = *replace_walk++;
  120             }
  121 
  122             input_walk++;
  123         }
  124         else {
  125             *output_walk++ = *input_walk++;
  126         }
  127     }
  128 
  129     *output_walk = '\0';
  130 
  131     return output;
  132 }
  133 
  134 
  135 /*
  136  * Cleans up any unsafe characters.
  137  *
  138  * The rules are:
  139  *   Leave alone:
  140  *     - # ~ % ^ _ , . + =
  141  *
  142  *   Translate:
  143  *     &  into  _and_
  144  *
  145  *   Replace with _:
  146  *     ` ! @ $ * \ | : ; " ' < ? / '\n' '\r' '\t'
  147  *
  148  *   Replace with -:
  149  *     ( ) [ ] { }
  150  *
  151  */
  152 unsigned char *clean_safe_basic(unsigned char *s, void *opts)
  153 {
  154     unsigned char *output, *input_walk, *output_walk;
  155 
  156     if (s == NULL) {
  157         return NULL;
  158     }
  159 
  160     output = malloc((strlen(s) * 5) + 1);
  161     if (output == NULL) {
  162         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  163         return NULL;
  164     }
  165 
  166     input_walk = s;
  167     output_walk = output;
  168 
  169     while (*input_walk != '\0') {
  170         if (isalnum(*input_walk)) {
  171             *output_walk++ = *input_walk++;
  172             continue;
  173         }
  174 
  175         switch (*input_walk) {
  176             case '-':
  177             case '#':
  178             case '~':
  179             case '%':
  180             case '^':
  181             case '_':
  182             case ',':
  183             case '.':
  184             case '+':
  185             case '=':
  186                 *output_walk++ = *input_walk;
  187                 break;
  188 
  189             case '&':
  190                 *output_walk++ = '_';
  191                 *output_walk++ = 'a';
  192                 *output_walk++ = 'n';
  193                 *output_walk++ = 'd';
  194                 *output_walk++ = '_';
  195                 break;
  196 
  197             case ' ':
  198             case '`':
  199             case '!':
  200             case '@':
  201             case '$':
  202             case '*':
  203             case '\\':
  204             case '|':
  205             case ':':
  206             case ';':
  207             case '"':
  208             case '\'':
  209             case '<':
  210             case '>':
  211             case '?':
  212             case '/':
  213             case '\n':
  214             case '\r':
  215             case '\t':
  216                 *output_walk++ = '_';
  217                 break;
  218 
  219             case '(':
  220             case ')':
  221             case '[':
  222             case ']':
  223             case '{':
  224             case '}':
  225                 *output_walk++ = '-';
  226                 break;
  227         }
  228 
  229         input_walk++;
  230     }
  231 
  232     *output_walk = '\0';
  233 
  234     return output;
  235 }
  236 
  237 
  238 /*
  239  * Translates unsafe characters
  240  */
  241 unsigned char *clean_safe(unsigned char *s, void *opts)
  242 {
  243     unsigned char *output, *input_walk, *output_walk, *replace_walk;
  244 
  245     struct translation_table *table = NULL;
  246     struct clean_string_options *options = NULL;
  247 
  248     if (s == NULL) {
  249         return NULL;
  250     }
  251 
  252     if (opts == NULL) {
  253         fprintf(stderr, "this shouldn't happen\n");
  254         exit(EXIT_FAILURE);
  255     }
  256 
  257     options = (struct clean_string_options *)opts;
  258     table = options->translation_table;
  259 
  260     output = malloc((strlen(s) * table->max_data_length) + 1);
  261     if (output == NULL) {
  262         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  263         return NULL;
  264     }
  265 
  266     input_walk = s;
  267     output_walk = output;
  268 
  269     while (*input_walk != '\0') {
  270         replace_walk = table_get(table, *input_walk);
  271         if (replace_walk == NULL) {
  272             if (table->default_translation == NULL) {
  273 
  274                 /*
  275                  * Null translation == leave it alone
  276                  */
  277                 *output_walk++ = *input_walk++;
  278                 continue;
  279             }
  280             else {
  281                 replace_walk = table->default_translation;
  282             }
  283         }
  284 
  285         while (*replace_walk != '\0') {
  286             *output_walk++ = *replace_walk++;
  287         }
  288 
  289         input_walk++;
  290     }
  291 
  292     *output_walk = '\0';
  293 
  294     return output;
  295 }
  296 
  297 
  298 
  299 /*
  300  * Cleans up any CGI encoded characters, in the form "%" followed by 2 hex
  301  * digits.
  302  */
  303 unsigned char *clean_uncgi(unsigned char *s, void *opts)
  304 {
  305     unsigned char *output, *input_walk, *output_walk;
  306     unsigned char conv[3];
  307 
  308     if (s == NULL) {
  309         return NULL;
  310     }
  311 
  312     output = malloc(strlen(s) + 1);
  313     if (output == NULL) {
  314         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  315         return NULL;
  316     }
  317 
  318     input_walk = s;
  319     output_walk = output;
  320 
  321     while (*input_walk != '\0') {
  322         if (input_walk[0] == '%' && isxdigit(input_walk[1]) && isxdigit(input_walk[2])) {
  323             conv[0] = input_walk[1];
  324             conv[1] = input_walk[2];
  325             conv[2] = 0;
  326             *output_walk++ = (unsigned char)strtol(conv, NULL, 16);
  327             input_walk += 3;
  328         }
  329         else {
  330             *output_walk++ = *input_walk++;
  331         }
  332     }
  333 
  334     *output_walk = '\0';
  335 
  336     return output;
  337 }
  338 
  339 
  340 /*
  341  * Reduces any series of "_" and "-" to a single character.  "-" takes
  342  * precedence.
  343  *
  344  * If "remove_trailing" is set to non-zero, then "." is added to the
  345  * comparison, and takes precedence.  This has the effect of reducing "-." or
  346  * "._", etc, to ".".
  347  *
  348  * Strips any "-", "_" or "#" from the beginning of a string.
  349  *
  350  */
  351 unsigned char *clean_wipeup(unsigned char *s, void *opts)
  352 {
  353     unsigned char *output, *input_walk, *output_walk;
  354     int matched;
  355     int remove_trailing;
  356 
  357     if (s == NULL) {
  358         return NULL;
  359     }
  360 
  361     remove_trailing = 0;
  362     if (opts != NULL) {
  363         remove_trailing = ((struct clean_string_options *)opts)->remove_trailing;
  364     }
  365 
  366     /* remove any -, _, or # at beginning of string */
  367     while (*s == '-' || *s == '_' || *s == '#') {
  368         s++;
  369     }
  370 
  371     output = malloc(strlen(s) + 1);
  372     if (output == NULL) {
  373         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  374         return NULL;
  375     }
  376 
  377     input_walk = s;
  378     output_walk = output;
  379     matched = 0;
  380 
  381     while (*input_walk != '\0') {
  382         switch (*input_walk) {
  383             case '-':
  384                 if (matched) {
  385                     if (*output_walk == '_') {
  386                         *output_walk = '-';
  387                     }
  388                 }
  389                 else {
  390                     *output_walk = '-';
  391                 }
  392 
  393                 matched = 1;
  394                 break;
  395 
  396             case '_':
  397                 if (!matched) {
  398                     *output_walk = '_';
  399                 }
  400 
  401                 matched = 1;
  402                 break;
  403 
  404             case '.':
  405                 if (remove_trailing) {
  406                     *output_walk = '.';
  407                     matched = 1;
  408                     break;
  409                 }   /* else fall through */
  410             default:
  411                 if (matched) {
  412                     output_walk++;
  413                     matched = 0;
  414                 }
  415 
  416                 *output_walk++ = *input_walk;
  417         }
  418         input_walk++;
  419     }
  420 
  421     if (matched) {
  422         output_walk++;
  423     }
  424 
  425     *output_walk = '\0';
  426 
  427     return output;
  428 }
  429 
  430 #define UTF_8_ENCODED 0x80
  431 #define UTF_8_ENCODED_4_CHARS 0xf0
  432 #define UTF_8_ENCODED_3_CHARS 0xe0
  433 #define UTF_8_ENCODED_2_CHARS 0xc0
  434 
  435 /*
  436  * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
  437  * Unicode and then lower ASCII characters.
  438  */
  439 unsigned char *clean_utf_8_basic(unsigned char *s, void *opts)
  440 {
  441     unsigned char *output, *input_walk, *output_walk, *replace_walk;
  442     int new_value, expected_chars;
  443 
  444     if (s == NULL) {
  445         return NULL;
  446     }
  447 
  448     output = malloc((strlen(s) * UNICODE_MAXLEN) + 1);
  449     if (output == NULL) {
  450         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  451         return NULL;
  452     }
  453 
  454     input_walk = s;
  455     output_walk = output;
  456 
  457     while (*input_walk != '\0') {
  458         if ((*input_walk & UTF_8_ENCODED) == 0) {
  459             *output_walk++ = *input_walk++;
  460             continue;
  461         }
  462 
  463         new_value = 0;
  464         expected_chars = 0;
  465 
  466         /*
  467          * Needs to be done in descending orders due to the fact that
  468          * the 2 char mask will match on the 4 char mask, but not
  469          * vice versa.
  470          */
  471         if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
  472 
  473             /*
  474              * 11110aaa 10bbbbbb 10cccccc 10dddddd
  475              */
  476 
  477             new_value = *input_walk & 0x07;
  478             expected_chars = 3;
  479         }
  480         else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
  481 
  482             /*
  483              * 1110aaaa 10bbbbbb 10cccccc
  484              */
  485 
  486             new_value = *input_walk & 0x0f;
  487             expected_chars = 2;
  488         }
  489         else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
  490 
  491             /*
  492              * 110aaaaa 10bbbbbb
  493              */
  494 
  495             new_value = *input_walk & 0x1f;
  496             expected_chars = 1;
  497         }
  498         else {
  499             input_walk++;
  500             continue;
  501         }
  502 
  503         while (expected_chars > 0) {
  504             new_value <<= 6;
  505 
  506             input_walk++;
  507 
  508             if (*input_walk == '\0') {
  509                 new_value = -1;
  510                 break;
  511             }
  512 
  513             if ((*input_walk & UTF_8_ENCODED) == 0) {
  514                 new_value = -1;
  515                 break;
  516             }
  517 
  518             new_value += *input_walk & 0x3f;
  519 
  520             expected_chars--;
  521         }
  522 
  523         if (new_value == -1) {
  524             continue;
  525         }
  526 
  527         if (new_value >= UNICODE_COUNT) {
  528             *output_walk++ = '_';
  529             continue;
  530         }
  531 
  532         replace_walk = (unsigned char *)&unicode_trans[new_value];
  533 
  534         while (*replace_walk != '\0') {
  535             *output_walk++ = *replace_walk++;
  536         }
  537     }
  538 
  539     *output_walk = '\0';
  540 
  541     return output;
  542 }
  543 
  544 /*
  545  * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
  546  * Unicode and then runs the translation table.
  547  */
  548 unsigned char *clean_utf_8(unsigned char *s, void *opts)
  549 {
  550     unsigned char *output, *input_walk, *output_walk, *replace_walk;
  551     int new_value, expected_chars;
  552 
  553     struct translation_table *table = NULL;
  554     struct clean_string_options *options = NULL;
  555 
  556     int characters_eaten;
  557 
  558     if (s == NULL) {
  559         return NULL;
  560     }
  561 
  562     if (opts == NULL) {
  563         fprintf(stderr, "this shouldn't happen\n");
  564         exit(EXIT_FAILURE);
  565     }
  566 
  567     options = (struct clean_string_options *)opts;
  568     table = options->translation_table;
  569 
  570     output = malloc((strlen(s) * table->max_data_length) + 1);
  571     if (output == NULL) {
  572         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  573         return NULL;
  574     }
  575 
  576     input_walk = s;
  577     output_walk = output;
  578 
  579     while (*input_walk != '\0') {
  580         new_value = 0;
  581         expected_chars = 0;
  582         characters_eaten = 0;
  583 
  584         /*
  585          * Needs to be done in descending orders due to the fact that
  586          * the 2 char mask will match on the 4 char mask, but not
  587          * vice versa.
  588          */
  589         if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
  590 
  591             /*
  592              * 11110aaa 10bbbbbb 10cccccc 10dddddd
  593              */
  594 
  595             new_value = *input_walk & 0x07;
  596             expected_chars = 3;
  597             characters_eaten = 4;
  598         }
  599         else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
  600 
  601             /*
  602              * 1110aaaa 10bbbbbb 10cccccc
  603              */
  604 
  605             new_value = *input_walk & 0x0f;
  606             expected_chars = 2;
  607             characters_eaten = 3;
  608         }
  609         else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
  610 
  611             /*
  612              * 110aaaaa 10bbbbbb
  613              */
  614 
  615             new_value = *input_walk & 0x1f;
  616             expected_chars = 1;
  617             characters_eaten = 2;
  618         }
  619         else if ((*input_walk & UTF_8_ENCODED) == UTF_8_ENCODED) {
  620             fprintf(stderr, "unsupported unicode length\n");
  621             exit(EXIT_FAILURE);
  622         }
  623         else {
  624             new_value = *input_walk;
  625             expected_chars = 0;
  626             characters_eaten = 1;
  627         }
  628 
  629         while (expected_chars > 0) {
  630             new_value <<= 6;
  631 
  632             input_walk++;
  633 
  634             if (*input_walk == '\0') {
  635                 new_value = -1;
  636                 break;
  637             }
  638 
  639             if ((*input_walk & UTF_8_ENCODED) == 0) {
  640                 new_value = -1;
  641                 break;
  642             }
  643 
  644             new_value += *input_walk & 0x3f;
  645 
  646             expected_chars--;
  647         }
  648         input_walk++;
  649 
  650         if (new_value == -1) {
  651             continue;
  652         }
  653 
  654         replace_walk = table_get(table, new_value);
  655 
  656         if (replace_walk == NULL) {
  657             replace_walk = table->default_translation;
  658         }
  659 
  660         if (replace_walk == NULL) {
  661 
  662             /*
  663              * Null translation == leave it alone
  664              */
  665             input_walk -= characters_eaten;
  666 
  667             while (characters_eaten > 0) {
  668                 *output_walk++ = *input_walk++;
  669                 characters_eaten--;
  670             }
  671 
  672             continue;
  673         }
  674 
  675         while (*replace_walk != '\0') {
  676             *output_walk++ = *replace_walk++;
  677         }
  678     }
  679 
  680     *output_walk = '\0';
  681 
  682     return output;
  683 }
  684 
  685 
  686 
  687 /*
  688  * Trims a file down to specified length.
  689  */
  690 unsigned char *clean_max_length(unsigned char *s, void *opts)
  691 {
  692     unsigned char *output, *input_walk, *output_walk;
  693     size_t max_length;
  694     size_t s_length;
  695     size_t ext_length;
  696 
  697     if (s == NULL) {
  698         return NULL;
  699     }
  700 
  701     max_length = 256;
  702     if (opts != NULL) {
  703         max_length = ((struct clean_string_options *)opts)->max_length;
  704     }
  705 
  706     s_length = strlen(s);
  707 
  708     output = malloc(max_length + 1);
  709     if (output == NULL) {
  710         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  711         return NULL;
  712     }
  713 
  714     snprintf(output, max_length + 1, "%s", s);
  715 
  716     if (s_length <= max_length) {
  717         return output;
  718     }
  719 
  720     input_walk = strrchr(s, '.');
  721 
  722     if (input_walk == NULL) {
  723         return output;
  724     }
  725 
  726     ext_length = strlen(input_walk);
  727 
  728     output_walk = output;
  729     output_walk += max_length - ext_length;
  730 
  731     while (*(output_walk - 1) == '.' && output_walk > output) {
  732         output_walk--;
  733     }
  734 
  735     snprintf(output_walk, ext_length + 1, "%s", input_walk);
  736 
  737     return output;
  738 }
  739 
  740 
  741 /*
  742  * Converts all characters to lowercase.
  743  */
  744 unsigned char *clean_lower(unsigned char *s, void *opts)
  745 {
  746     unsigned char *output, *input_walk, *output_walk;
  747 
  748     if (s == NULL) {
  749         return NULL;
  750     }
  751 
  752     output = malloc(strlen(s) + 1);
  753     if (output == NULL) {
  754         fprintf(stderr, "out of memory: %s\n", strerror(errno));
  755         return NULL;
  756     }
  757 
  758     input_walk = s;
  759     output_walk = output;
  760 
  761     while (*input_walk != '\0') {
  762         if (isupper(*input_walk)) {
  763             *output_walk++ = tolower(*input_walk++);
  764         }
  765         else {
  766             *output_walk++ = *input_walk++;
  767         }
  768     }
  769 
  770     *output_walk = '\0';
  771 
  772     return output;
  773 }