"Fossies" - the Fresh Open Source Software Archive

Member "file-5.35/src/ascmagic.c" (15 Oct 2018, 9975 Bytes) of package /linux/misc/file-5.35.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "ascmagic.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.34_vs_5.35.

    1 /*
    2  * Copyright (c) Ian F. Darwin 1986-1995.
    3  * Software written by Ian F. Darwin and others;
    4  * maintained 1995-present by Christos Zoulas and others.
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice immediately at the beginning of the file, without modification,
   11  *    this list of conditions, and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
   20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 /*
   29  * ASCII magic -- try to detect text encoding.
   30  *
   31  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
   32  * to handle character codes other than ASCII on a unified basis.
   33  */
   34 
   35 #include "file.h"
   36 
   37 #ifndef lint
   38 FILE_RCSID("@(#)$File: ascmagic.c,v 1.100 2018/10/15 16:29:16 christos Exp $")
   39 #endif  /* lint */
   40 
   41 #include "magic.h"
   42 #include <string.h>
   43 #include <memory.h>
   44 #include <ctype.h>
   45 #include <stdlib.h>
   46 #ifdef HAVE_UNISTD_H
   47 #include <unistd.h>
   48 #endif
   49 
   50 #define MAXLINELEN 300  /* longest sane line length */
   51 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
   52           || (x) == 0x85 || (x) == '\f')
   53 
   54 private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
   55 private size_t trim_nuls(const unsigned char *, size_t);
   56 
   57 /*
   58  * Undo the NUL-termination kindly provided by process()
   59  * but leave at least one byte to look at
   60  */
   61 private size_t
   62 trim_nuls(const unsigned char *buf, size_t nbytes)
   63 {
   64     while (nbytes > 1 && buf[nbytes - 1] == '\0')
   65         nbytes--;
   66 
   67     return nbytes;
   68 }
   69 
   70 protected int
   71 file_ascmagic(struct magic_set *ms, const struct buffer *b, int text)
   72 {
   73     unichar *ubuf = NULL;
   74     size_t ulen = 0;
   75     int rv = 1;
   76     struct buffer bb;
   77 
   78     const char *code = NULL;
   79     const char *code_mime = NULL;
   80     const char *type = NULL;
   81 
   82     bb = *b;
   83     bb.flen = trim_nuls(CAST(const unsigned char *, b->fbuf), b->flen);
   84 
   85     /* If file doesn't look like any sort of text, give up. */
   86     if (file_encoding(ms, &bb, &ubuf, &ulen, &code, &code_mime,
   87         &type) == 0)
   88         rv = 0;
   89         else
   90         rv = file_ascmagic_with_encoding(ms, &bb,
   91             ubuf, ulen, code, type, text);
   92 
   93     free(ubuf);
   94 
   95     return rv;
   96 }
   97 
   98 protected int
   99 file_ascmagic_with_encoding(struct magic_set *ms,
  100     const struct buffer *b, unichar *ubuf, size_t ulen, const char *code,
  101     const char *type, int text)
  102 {
  103     struct buffer bb;
  104     const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
  105     size_t nbytes = b->flen;
  106     unsigned char *utf8_buf = NULL, *utf8_end;
  107     size_t mlen, i;
  108     int rv = -1;
  109     int mime = ms->flags & MAGIC_MIME;
  110 
  111     const char *subtype = NULL;
  112     const char *subtype_mime = NULL;
  113 
  114     int has_escapes = 0;
  115     int has_backspace = 0;
  116     int seen_cr = 0;
  117 
  118     int n_crlf = 0;
  119     int n_lf = 0;
  120     int n_cr = 0;
  121     int n_nel = 0;
  122     int executable = 0;
  123 
  124     size_t last_line_end = (size_t)-1;
  125     int has_long_lines = 0;
  126 
  127     nbytes = trim_nuls(buf, nbytes);
  128 
  129     /* If we have fewer than 2 bytes, give up. */
  130     if (nbytes <= 1) {
  131         rv = 0;
  132         goto done;
  133     }
  134 
  135     if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
  136         /* Convert ubuf to UTF-8 and try text soft magic */
  137         /* malloc size is a conservative overestimate; could be
  138            improved, or at least realloced after conversion. */
  139         mlen = ulen * 6;
  140         if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
  141             file_oomem(ms, mlen);
  142             goto done;
  143         }
  144         if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
  145             == NULL)
  146             goto done;
  147         buffer_init(&bb, b->fd, utf8_buf,
  148             (size_t)(utf8_end - utf8_buf));
  149 
  150         if ((rv = file_softmagic(ms, &bb, NULL, NULL,
  151             TEXTTEST, text)) == 0)
  152             rv = -1;
  153         buffer_fini(&bb);
  154         if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
  155             rv = rv == -1 ? 0 : 1;
  156             goto done;
  157         }
  158     }
  159     if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION)))
  160         return 0;
  161 
  162     /* Now try to discover other details about the file. */
  163     for (i = 0; i < ulen; i++) {
  164         if (ubuf[i] == '\n') {
  165             if (seen_cr)
  166                 n_crlf++;
  167             else
  168                 n_lf++;
  169             last_line_end = i;
  170         } else if (seen_cr)
  171             n_cr++;
  172 
  173         seen_cr = (ubuf[i] == '\r');
  174         if (seen_cr)
  175             last_line_end = i;
  176 
  177         if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
  178             n_nel++;
  179             last_line_end = i;
  180         }
  181 
  182         /* If this line is _longer_ than MAXLINELEN, remember it. */
  183         if (i > last_line_end + MAXLINELEN)
  184             has_long_lines = 1;
  185 
  186         if (ubuf[i] == '\033')
  187             has_escapes = 1;
  188         if (ubuf[i] == '\b')
  189             has_backspace = 1;
  190     }
  191 
  192     /* Beware, if the data has been truncated, the final CR could have
  193        been followed by a LF.  If we have ms->bytes_max bytes, it indicates
  194        that the data might have been truncated, probably even before
  195        this function was called. */
  196     if (seen_cr && nbytes < ms->bytes_max)
  197         n_cr++;
  198 
  199     if (strcmp(type, "binary") == 0) {
  200         rv = 0;
  201         goto done;
  202     }
  203     if (mime) {
  204         if (!file_printedlen(ms) && (mime & MAGIC_MIME_TYPE) != 0) {
  205             if (subtype_mime) {
  206                 if (file_printf(ms, "%s", subtype_mime) == -1)
  207                     goto done;
  208             } else {
  209                 if (file_printf(ms, "text/plain") == -1)
  210                     goto done;
  211             }
  212         }
  213     } else {
  214         if (file_printedlen(ms)) {
  215             switch (file_replace(ms, " text$", ", ")) {
  216             case 0:
  217                 switch (file_replace(ms, " text executable$",
  218                     ", ")) {
  219                 case 0:
  220                     if (file_printf(ms, ", ") == -1)
  221                         goto done;
  222                     break;
  223                 case -1:
  224                     goto done;
  225                 default:
  226                     executable = 1;
  227                     break;
  228                 }
  229                 break;
  230             case -1:
  231                 goto done;
  232             default:
  233                 break;
  234             }
  235         }
  236 
  237         if (file_printf(ms, "%s", code) == -1)
  238             goto done;
  239 
  240         if (subtype) {
  241             if (file_printf(ms, " %s", subtype) == -1)
  242                 goto done;
  243         }
  244 
  245         if (file_printf(ms, " %s", type) == -1)
  246             goto done;
  247 
  248         if (executable)
  249             if (file_printf(ms, " executable") == -1)
  250                 goto done;
  251 
  252         if (has_long_lines)
  253             if (file_printf(ms, ", with very long lines") == -1)
  254                 goto done;
  255 
  256         /*
  257          * Only report line terminators if we find one other than LF,
  258          * or if we find none at all.
  259          */
  260         if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
  261             (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
  262             if (file_printf(ms, ", with") == -1)
  263                 goto done;
  264 
  265             if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) {
  266                 if (file_printf(ms, " no") == -1)
  267                     goto done;
  268             } else {
  269                 if (n_crlf) {
  270                     if (file_printf(ms, " CRLF") == -1)
  271                         goto done;
  272                     if (n_cr || n_lf || n_nel)
  273                         if (file_printf(ms, ",") == -1)
  274                             goto done;
  275                 }
  276                 if (n_cr) {
  277                     if (file_printf(ms, " CR") == -1)
  278                         goto done;
  279                     if (n_lf || n_nel)
  280                         if (file_printf(ms, ",") == -1)
  281                             goto done;
  282                 }
  283                 if (n_lf) {
  284                     if (file_printf(ms, " LF") == -1)
  285                         goto done;
  286                     if (n_nel)
  287                         if (file_printf(ms, ",") == -1)
  288                             goto done;
  289                 }
  290                 if (n_nel)
  291                     if (file_printf(ms, " NEL") == -1)
  292                         goto done;
  293             }
  294 
  295             if (file_printf(ms, " line terminators") == -1)
  296                 goto done;
  297         }
  298 
  299         if (has_escapes)
  300             if (file_printf(ms, ", with escape sequences") == -1)
  301                 goto done;
  302         if (has_backspace)
  303             if (file_printf(ms, ", with overstriking") == -1)
  304                 goto done;
  305     }
  306     rv = 1;
  307 done:
  308     free(utf8_buf);
  309 
  310     return rv;
  311 }
  312 
  313 /*
  314  * Encode Unicode string as UTF-8, returning pointer to character
  315  * after end of string, or NULL if an invalid character is found.
  316  */
  317 private unsigned char *
  318 encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
  319 {
  320     size_t i;
  321     unsigned char *end = buf + len;
  322 
  323     for (i = 0; i < ulen; i++) {
  324         if (ubuf[i] <= 0x7f) {
  325             if (end - buf < 1)
  326                 return NULL;
  327             *buf++ = (unsigned char)ubuf[i];
  328         } else if (ubuf[i] <= 0x7ff) {
  329             if (end - buf < 2)
  330                 return NULL;
  331             *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
  332             *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
  333         } else if (ubuf[i] <= 0xffff) {
  334             if (end - buf < 3)
  335                 return NULL;
  336             *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
  337             *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
  338             *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
  339         } else if (ubuf[i] <= 0x1fffff) {
  340             if (end - buf < 4)
  341                 return NULL;
  342             *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
  343             *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
  344             *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
  345             *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
  346         } else if (ubuf[i] <= 0x3ffffff) {
  347             if (end - buf < 5)
  348                 return NULL;
  349             *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
  350             *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
  351             *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
  352             *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
  353             *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
  354         } else if (ubuf[i] <= 0x7fffffff) {
  355             if (end - buf < 6)
  356                 return NULL;
  357             *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
  358             *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
  359             *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
  360             *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
  361             *buf++ = (unsigned char)(((ubuf[i] >>  6) & 0x3f) + 0x80);
  362             *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
  363         } else /* Invalid character */
  364             return NULL;
  365     }
  366 
  367     return buf;
  368 }