"Fossies" - the Fresh Open Source Software Archive 
Member "detox-1.4.5/src/clean_string.c" (15 Aug 2021, 14275 Bytes) of package /linux/privat/detox-1.4.5.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "clean_string.c" see the
Fossies "Dox" file reference documentation.
1 /**
2 * This file is part of the Detox package.
3 *
4 * Copyright (c) Doug Harple <detox.dharple@gmail.com>
5 *
6 * For the full copyright and license information, please view the LICENSE
7 * file that was distributed with this source code.
8 */
9
10 #include "config.h"
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <errno.h>
17
18 #include "clean_string.h"
19
20 /* translation array for ISO8859.1 characters */
21 #include "iso8859_1.h"
22
23 /* translation array for unicode characters */
24 #include "unicode.h"
25
26 #include "parse_table.h"
27 #include "table.h"
28
29
30 /*
31 * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
32 */
33 unsigned char *clean_iso8859_1_basic(unsigned char *s, void *opts)
34 {
35 unsigned char *output, *input_walk, *output_walk, *replace_walk;
36 int replace_pos;
37
38 if (s == NULL) {
39 return NULL;
40 }
41
42 output = malloc((strlen(s) * ISO8859_1_MAXLEN) + 1);
43 if (output == NULL) {
44 fprintf(stderr, "out of memory: %s\n", strerror(errno));
45 return NULL;
46 }
47
48 input_walk = s;
49 output_walk = output;
50
51 while (*input_walk != '\0') {
52 if (*input_walk >= ISO8859_1_OFFSET) {
53 replace_pos = *input_walk - ISO8859_1_OFFSET;
54 replace_walk = (unsigned char *)&iso8859_1_trans[replace_pos];
55
56 while (*replace_walk != '\0') {
57 *output_walk++ = *replace_walk++;
58 }
59 input_walk++;
60 }
61 else {
62 *output_walk++ = *input_walk++;
63 }
64 }
65
66 *output_walk = '\0';
67
68 return output;
69 }
70
71 /*
72 * Translates ISO8859.1 characters (Latin-1) into lower ASCII characters.
73 */
74 unsigned char *clean_iso8859_1(unsigned char *s, void *opts)
75 {
76 unsigned char *output, *input_walk, *output_walk, *replace_walk;
77
78 struct translation_table *table = NULL;
79 struct clean_string_options *options = NULL;
80
81 if (s == NULL) {
82 return NULL;
83 }
84
85 if (opts == NULL) {
86 fprintf(stderr, "this shouldn't happen\n");
87 exit(EXIT_FAILURE);
88 }
89
90 options = (struct clean_string_options *)opts;
91 table = options->translation_table;
92
93 output = malloc((strlen(s) * table->max_data_length) + 1);
94 if (output == NULL) {
95 fprintf(stderr, "out of memory: %s\n", strerror(errno));
96 return NULL;
97 }
98
99 input_walk = s;
100 output_walk = output;
101
102 while (*input_walk != '\0') {
103 if (*input_walk >= ISO8859_1_OFFSET) {
104 replace_walk = table_get(table, *input_walk);
105 if (replace_walk == NULL) {
106 if (table->default_translation == NULL) {
107 /*
108 * Null translation == leave it alone
109 */
110 *output_walk++ = *input_walk++;
111 continue;
112 }
113 else {
114 replace_walk = table->default_translation;
115 }
116 }
117
118 while (*replace_walk != '\0') {
119 *output_walk++ = *replace_walk++;
120 }
121
122 input_walk++;
123 }
124 else {
125 *output_walk++ = *input_walk++;
126 }
127 }
128
129 *output_walk = '\0';
130
131 return output;
132 }
133
134
/*
 * Rewrites characters that are awkward in file names, using fixed rules.
 *
 * Kept as is:
 *   letters and digits (per isalnum), and  - # ~ % ^ _ , . + =
 *
 * Expanded:
 *   &  becomes  _and_
 *
 * Replaced with _:
 *   space ` ! @ $ * \ | : ; " ' < > ? / '\n' '\r' '\t'
 *
 * Replaced with -:
 *   ( ) [ ] { }
 *
 * Any other byte is dropped from the output.  The opts argument is unused.
 *
 * Returns a newly malloc()ed string, or NULL if s is NULL or the allocation
 * fails.
 */
unsigned char *clean_safe_basic(unsigned char *s, void *opts)
{
    static const char keep_set[] = "-#~%^_,.+=";
    static const char underscore_set[] = " `!@$*\\|:;\"'<>?/\n\r\t";
    static const char dash_set[] = "()[]{}";
    static const char ampersand_text[] = "_and_";

    unsigned char *result, *dst;
    const unsigned char *src;
    const char *p;
    int c;

    if (s == NULL) {
        return NULL;
    }

    /* the longest expansion is '&' -> "_and_", i.e. five bytes per input byte */
    result = malloc((strlen((const char *)s) * 5) + 1);
    if (result == NULL) {
        fprintf(stderr, "out of memory: %s\n", strerror(errno));
        return NULL;
    }

    dst = result;

    for (src = s; *src != '\0'; src++) {
        c = *src;

        if (isalnum(c) || strchr(keep_set, c) != NULL) {
            *dst++ = (unsigned char)c;
        }
        else if (c == '&') {
            for (p = ampersand_text; *p != '\0'; p++) {
                *dst++ = (unsigned char)*p;
            }
        }
        else if (strchr(underscore_set, c) != NULL) {
            *dst++ = '_';
        }
        else if (strchr(dash_set, c) != NULL) {
            *dst++ = '-';
        }
        /* everything else produces no output */
    }

    *dst = '\0';

    return result;
}
236
237
238 /*
239 * Translates unsafe characters
240 */
241 unsigned char *clean_safe(unsigned char *s, void *opts)
242 {
243 unsigned char *output, *input_walk, *output_walk, *replace_walk;
244
245 struct translation_table *table = NULL;
246 struct clean_string_options *options = NULL;
247
248 if (s == NULL) {
249 return NULL;
250 }
251
252 if (opts == NULL) {
253 fprintf(stderr, "this shouldn't happen\n");
254 exit(EXIT_FAILURE);
255 }
256
257 options = (struct clean_string_options *)opts;
258 table = options->translation_table;
259
260 output = malloc((strlen(s) * table->max_data_length) + 1);
261 if (output == NULL) {
262 fprintf(stderr, "out of memory: %s\n", strerror(errno));
263 return NULL;
264 }
265
266 input_walk = s;
267 output_walk = output;
268
269 while (*input_walk != '\0') {
270 replace_walk = table_get(table, *input_walk);
271 if (replace_walk == NULL) {
272 if (table->default_translation == NULL) {
273
274 /*
275 * Null translation == leave it alone
276 */
277 *output_walk++ = *input_walk++;
278 continue;
279 }
280 else {
281 replace_walk = table->default_translation;
282 }
283 }
284
285 while (*replace_walk != '\0') {
286 *output_walk++ = *replace_walk++;
287 }
288
289 input_walk++;
290 }
291
292 *output_walk = '\0';
293
294 return output;
295 }
296
297
298
/*
 * Cleans up any CGI encoded characters, in the form "%" followed by 2 hex
 * digits, by replacing each such triplet with the byte it encodes.
 *
 * "%00" is deliberately NOT decoded: the result is returned as a C string,
 * so an embedded NUL byte would silently cut off everything after it.  The
 * three characters are copied through literally instead.
 *
 * A '%' that is not followed by two hex digits is copied unchanged.
 * The opts argument is not used.
 *
 * Returns a newly malloc()ed string, or NULL if s is NULL or the allocation
 * fails.
 */
unsigned char *clean_uncgi(unsigned char *s, void *opts)
{
    unsigned char *output, *input_walk, *output_walk;
    unsigned char decoded;
    char conv[3];

    if (s == NULL) {
        return NULL;
    }

    /* decoding only ever shrinks the string, so the input length suffices */
    output = malloc(strlen((const char *)s) + 1);
    if (output == NULL) {
        fprintf(stderr, "out of memory: %s\n", strerror(errno));
        return NULL;
    }

    input_walk = s;
    output_walk = output;

    while (*input_walk != '\0') {
        /*
         * The && chain stops at the first non-hex byte, so the terminating
         * NUL is never stepped over.
         */
        if (input_walk[0] == '%' && isxdigit(input_walk[1]) && isxdigit(input_walk[2])) {
            conv[0] = (char)input_walk[1];
            conv[1] = (char)input_walk[2];
            conv[2] = '\0';
            decoded = (unsigned char)strtol(conv, NULL, 16);

            if (decoded != '\0') {
                *output_walk++ = decoded;
                input_walk += 3;
                continue;
            }

            /* "%00": fall through and copy the '%' literally */
        }

        *output_walk++ = *input_walk++;
    }

    *output_walk = '\0';

    return output;
}
338
339
340 /*
341 * Reduces any series of "_" and "-" to a single character. "-" takes
342 * precedence.
343 *
344 * If "remove_trailing" is set to non-zero, then "." is added to the
345 * comparison, and takes precedence. This has the effect of reducing "-." or
346 * "._", etc, to ".".
347 *
348 * Strips any "-", "_" or "#" from the beginning of a string.
349 *
350 */
351 unsigned char *clean_wipeup(unsigned char *s, void *opts)
352 {
353 unsigned char *output, *input_walk, *output_walk;
354 int matched;
355 int remove_trailing;
356
357 if (s == NULL) {
358 return NULL;
359 }
360
361 remove_trailing = 0;
362 if (opts != NULL) {
363 remove_trailing = ((struct clean_string_options *)opts)->remove_trailing;
364 }
365
366 /* remove any -, _, or # at beginning of string */
367 while (*s == '-' || *s == '_' || *s == '#') {
368 s++;
369 }
370
371 output = malloc(strlen(s) + 1);
372 if (output == NULL) {
373 fprintf(stderr, "out of memory: %s\n", strerror(errno));
374 return NULL;
375 }
376
377 input_walk = s;
378 output_walk = output;
379 matched = 0;
380
381 while (*input_walk != '\0') {
382 switch (*input_walk) {
383 case '-':
384 if (matched) {
385 if (*output_walk == '_') {
386 *output_walk = '-';
387 }
388 }
389 else {
390 *output_walk = '-';
391 }
392
393 matched = 1;
394 break;
395
396 case '_':
397 if (!matched) {
398 *output_walk = '_';
399 }
400
401 matched = 1;
402 break;
403
404 case '.':
405 if (remove_trailing) {
406 *output_walk = '.';
407 matched = 1;
408 break;
409 } /* else fall through */
410 default:
411 if (matched) {
412 output_walk++;
413 matched = 0;
414 }
415
416 *output_walk++ = *input_walk;
417 }
418 input_walk++;
419 }
420
421 if (matched) {
422 output_walk++;
423 }
424
425 *output_walk = '\0';
426
427 return output;
428 }
429
/*
 * Bit masks used to classify the first byte of a UTF-8 sequence.
 */
enum {
    UTF_8_ENCODED         = 0x80, /* top bit set: not a plain 7-bit byte */
    UTF_8_ENCODED_4_CHARS = 0xf0, /* lead byte of a 4 byte sequence */
    UTF_8_ENCODED_3_CHARS = 0xe0, /* lead byte of a 3 byte sequence */
    UTF_8_ENCODED_2_CHARS = 0xc0  /* lead byte of a 2 byte sequence */
};
434
435 /*
436 * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
437 * Unicode and then lower ASCII characters.
438 */
439 unsigned char *clean_utf_8_basic(unsigned char *s, void *opts)
440 {
441 unsigned char *output, *input_walk, *output_walk, *replace_walk;
442 int new_value, expected_chars;
443
444 if (s == NULL) {
445 return NULL;
446 }
447
448 output = malloc((strlen(s) * UNICODE_MAXLEN) + 1);
449 if (output == NULL) {
450 fprintf(stderr, "out of memory: %s\n", strerror(errno));
451 return NULL;
452 }
453
454 input_walk = s;
455 output_walk = output;
456
457 while (*input_walk != '\0') {
458 if ((*input_walk & UTF_8_ENCODED) == 0) {
459 *output_walk++ = *input_walk++;
460 continue;
461 }
462
463 new_value = 0;
464 expected_chars = 0;
465
466 /*
467 * Needs to be done in descending orders due to the fact that
468 * the 2 char mask will match on the 4 char mask, but not
469 * vice versa.
470 */
471 if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
472
473 /*
474 * 11110aaa 10bbbbbb 10cccccc 10dddddd
475 */
476
477 new_value = *input_walk & 0x07;
478 expected_chars = 3;
479 }
480 else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
481
482 /*
483 * 1110aaaa 10bbbbbb 10cccccc
484 */
485
486 new_value = *input_walk & 0x0f;
487 expected_chars = 2;
488 }
489 else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
490
491 /*
492 * 110aaaaa 10bbbbbb
493 */
494
495 new_value = *input_walk & 0x1f;
496 expected_chars = 1;
497 }
498 else {
499 input_walk++;
500 continue;
501 }
502
503 while (expected_chars > 0) {
504 new_value <<= 6;
505
506 input_walk++;
507
508 if (*input_walk == '\0') {
509 new_value = -1;
510 break;
511 }
512
513 if ((*input_walk & UTF_8_ENCODED) == 0) {
514 new_value = -1;
515 break;
516 }
517
518 new_value += *input_walk & 0x3f;
519
520 expected_chars--;
521 }
522
523 if (new_value == -1) {
524 continue;
525 }
526
527 if (new_value >= UNICODE_COUNT) {
528 *output_walk++ = '_';
529 continue;
530 }
531
532 replace_walk = (unsigned char *)&unicode_trans[new_value];
533
534 while (*replace_walk != '\0') {
535 *output_walk++ = *replace_walk++;
536 }
537 }
538
539 *output_walk = '\0';
540
541 return output;
542 }
543
544 /*
545 * Translates UTF-8 characters (Unicode Translation Format - 8 Bit) into
546 * Unicode and then runs the translation table.
547 */
548 unsigned char *clean_utf_8(unsigned char *s, void *opts)
549 {
550 unsigned char *output, *input_walk, *output_walk, *replace_walk;
551 int new_value, expected_chars;
552
553 struct translation_table *table = NULL;
554 struct clean_string_options *options = NULL;
555
556 int characters_eaten;
557
558 if (s == NULL) {
559 return NULL;
560 }
561
562 if (opts == NULL) {
563 fprintf(stderr, "this shouldn't happen\n");
564 exit(EXIT_FAILURE);
565 }
566
567 options = (struct clean_string_options *)opts;
568 table = options->translation_table;
569
570 output = malloc((strlen(s) * table->max_data_length) + 1);
571 if (output == NULL) {
572 fprintf(stderr, "out of memory: %s\n", strerror(errno));
573 return NULL;
574 }
575
576 input_walk = s;
577 output_walk = output;
578
579 while (*input_walk != '\0') {
580 new_value = 0;
581 expected_chars = 0;
582 characters_eaten = 0;
583
584 /*
585 * Needs to be done in descending orders due to the fact that
586 * the 2 char mask will match on the 4 char mask, but not
587 * vice versa.
588 */
589 if ((*input_walk & UTF_8_ENCODED_4_CHARS) == UTF_8_ENCODED_4_CHARS) {
590
591 /*
592 * 11110aaa 10bbbbbb 10cccccc 10dddddd
593 */
594
595 new_value = *input_walk & 0x07;
596 expected_chars = 3;
597 characters_eaten = 4;
598 }
599 else if ((*input_walk & UTF_8_ENCODED_3_CHARS) == UTF_8_ENCODED_3_CHARS) {
600
601 /*
602 * 1110aaaa 10bbbbbb 10cccccc
603 */
604
605 new_value = *input_walk & 0x0f;
606 expected_chars = 2;
607 characters_eaten = 3;
608 }
609 else if ((*input_walk & UTF_8_ENCODED_2_CHARS) == UTF_8_ENCODED_2_CHARS) {
610
611 /*
612 * 110aaaaa 10bbbbbb
613 */
614
615 new_value = *input_walk & 0x1f;
616 expected_chars = 1;
617 characters_eaten = 2;
618 }
619 else if ((*input_walk & UTF_8_ENCODED) == UTF_8_ENCODED) {
620 fprintf(stderr, "unsupported unicode length\n");
621 exit(EXIT_FAILURE);
622 }
623 else {
624 new_value = *input_walk;
625 expected_chars = 0;
626 characters_eaten = 1;
627 }
628
629 while (expected_chars > 0) {
630 new_value <<= 6;
631
632 input_walk++;
633
634 if (*input_walk == '\0') {
635 new_value = -1;
636 break;
637 }
638
639 if ((*input_walk & UTF_8_ENCODED) == 0) {
640 new_value = -1;
641 break;
642 }
643
644 new_value += *input_walk & 0x3f;
645
646 expected_chars--;
647 }
648 input_walk++;
649
650 if (new_value == -1) {
651 continue;
652 }
653
654 replace_walk = table_get(table, new_value);
655
656 if (replace_walk == NULL) {
657 replace_walk = table->default_translation;
658 }
659
660 if (replace_walk == NULL) {
661
662 /*
663 * Null translation == leave it alone
664 */
665 input_walk -= characters_eaten;
666
667 while (characters_eaten > 0) {
668 *output_walk++ = *input_walk++;
669 characters_eaten--;
670 }
671
672 continue;
673 }
674
675 while (*replace_walk != '\0') {
676 *output_walk++ = *replace_walk++;
677 }
678 }
679
680 *output_walk = '\0';
681
682 return output;
683 }
684
685
686
687 /*
688 * Trims a file down to specified length.
689 */
690 unsigned char *clean_max_length(unsigned char *s, void *opts)
691 {
692 unsigned char *output, *input_walk, *output_walk;
693 size_t max_length;
694 size_t s_length;
695 size_t ext_length;
696
697 if (s == NULL) {
698 return NULL;
699 }
700
701 max_length = 256;
702 if (opts != NULL) {
703 max_length = ((struct clean_string_options *)opts)->max_length;
704 }
705
706 s_length = strlen(s);
707
708 output = malloc(max_length + 1);
709 if (output == NULL) {
710 fprintf(stderr, "out of memory: %s\n", strerror(errno));
711 return NULL;
712 }
713
714 snprintf(output, max_length + 1, "%s", s);
715
716 if (s_length <= max_length) {
717 return output;
718 }
719
720 input_walk = strrchr(s, '.');
721
722 if (input_walk == NULL) {
723 return output;
724 }
725
726 ext_length = strlen(input_walk);
727
728 output_walk = output;
729 output_walk += max_length - ext_length;
730
731 while (*(output_walk - 1) == '.' && output_walk > output) {
732 output_walk--;
733 }
734
735 snprintf(output_walk, ext_length + 1, "%s", input_walk);
736
737 return output;
738 }
739
740
/*
 * Produces a copy of s in which every uppercase character (per the current
 * locale's tolower) has been replaced by its lowercase form.  All other
 * bytes are copied unchanged.  The opts argument is not used.
 *
 * Returns a newly malloc()ed string, or NULL if s is NULL or the allocation
 * fails.
 */
unsigned char *clean_lower(unsigned char *s, void *opts)
{
    unsigned char *result;
    size_t length, i;

    if (s == NULL) {
        return NULL;
    }

    length = strlen((const char *)s);

    result = malloc(length + 1);
    if (result == NULL) {
        fprintf(stderr, "out of memory: %s\n", strerror(errno));
        return NULL;
    }

    for (i = 0; i < length; i++) {
        /* tolower() hands back its argument untouched unless it is uppercase */
        result[i] = (unsigned char)tolower(s[i]);
    }

    result[length] = '\0';

    return result;
}