"Fossies" - the Fresh Open Source Software Archive

Member "s-nail-14.9.7/filter.c" (16 Feb 2018, 48111 Bytes) of package /linux/misc/s-nail-14.9.7.tar.xz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "filter.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 14.9.6_vs_14.9.7.

    1 /*@ S-nail - a mail user agent derived from Berkeley Mail.
    2  *@ Filter objects.
    3  *
    4  * Copyright (c) 2013 - 2018 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
    5  *
    6  * Permission to use, copy, modify, and/or distribute this software for any
    7  * purpose with or without fee is hereby granted, provided that the above
    8  * copyright notice and this permission notice appear in all copies.
    9  *
   10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   17  */
   18 #undef n_FILE
   19 #define n_FILE filter
   20 
   21 #ifndef HAVE_AMALGAMATION
   22 # include "nail.h"
   23 #endif
   24 
   25 /*
   26  * Quotation filter
   27  */
   28 
   29 /*
   30  * TODO quotation filter: anticipate in future data: don't break if only WS
   31  * TODO or a LF escaping \ follows on the line (simply reuse the latter).
   32  */
   33 
   34 #ifdef HAVE_QUOTE_FOLD
   /* _qf_state_prefix() marks an overlong quote by storing three '.' at
    * indices n_QUOTE_MAX - 3 .. n_QUOTE_MAX - 1, so the limit must exceed 3 */
   35 n_CTAV(n_QUOTE_MAX > 3);
   36 
   37 enum qf_state {
   38    _QF_CLEAN,
   39    _QF_PREFIX,
   40    _QF_DATA
   41 };
   42 
   /* Argument carrier for the state handlers: the filter object plus the input
    * that is still to be consumed.  The handlers store the updated .buf and
    * .len back before they return */
   43 struct qf_vc {
   44    struct quoteflt   *self;  /* Filter that is operated on */
   45    char const        *buf;   /* Next input byte that was not yet consumed */
   46    size_t            len;    /* Number of bytes remaining at .buf */
   47 };
   48 
   49 /* Write .qf_pfix and the current quote (.qf_currq) to .qf_os; returns the
       * number of bytes written or -1 on a short write */
   50 static ssize_t _qf_dump_prefix(struct quoteflt *self);
   51 
   52 /* Add one data character to the line buffer, folding / flushing as
       * necessary; returns the number of bytes output (0 if nothing was flushed)
       * or -1 on error */
   53 static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);
   54 
   55 /* State machine handlers; they consume input from vc, update vc->buf and
       * vc->len, and return the number of bytes output or -1 on error */
   56 static ssize_t _qf_state_prefix(struct qf_vc *vc);
   57 static ssize_t _qf_state_data(struct qf_vc *vc);
   58 
   59 static ssize_t
   60 _qf_dump_prefix(struct quoteflt *self)
   61 {
   62    ssize_t rv;
   63    size_t i;
   64    NYD_ENTER;
   65 
   66    if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
   67          self->qf_os))
   68       goto jerr;
   69    rv = i;
   70 
   71    if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
   72          self->qf_os))
   73       goto jerr;
   74    rv += i;
   75 jleave:
   76    NYD_LEAVE;
   77    return rv;
   78 jerr:
   79    rv = -1;
   80    goto jleave;
   81 }
   82 
   83 static ssize_t
   84 _qf_add_data(struct quoteflt *self, wchar_t wc)
   85 {
   86    char *save_b;
   87    ui32_t save_l, save_w;
   88    ssize_t rv = 0;
   89    int w, l;
   90    NYD_ENTER;
   91 
   92    save_l = save_w = 0; /* silence cc */
   93    save_b = NULL;
   94    /* <newline> ends state */
   95    if (wc == L'\n')
   96       goto jflush;
   97    if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
   98       goto jleave;
   99 
  100    /* Unroll <tab> to spaces */
  101    if (wc == L'\t') {
  102       save_l = self->qf_datw;
  103       save_w = (save_l + n_QUOTE_TAB_SPACES) & ~(n_QUOTE_TAB_SPACES - 1);
  104       save_w -= save_l;
  105       while (save_w-- > 0) {
  106          ssize_t j = _qf_add_data(self, L' ');
  107          if (j < 0) {
  108             rv = j;
  109             break;
  110          }
  111          rv += j;
  112       }
  113       goto jleave;
  114    }
  115 
  116    w = wcwidth(wc);
  117    if (w == -1) {
  118 jbad:
  119       ++self->qf_datw;
  120       self->qf_dat.s[self->qf_dat.l++] = '?';
  121    } else {
  122       l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
  123       if (l < 0)
  124          goto jbad;
  125       self->qf_datw += (ui32_t)w;
  126       self->qf_dat.l += (size_t)l;
  127    }
  128 
  129    /* TODO The last visual may excess (adjusted!) *qfold-max* if it's a wide;
  130     * TODO place it on the next line, break before */
  131    if (self->qf_datw >= self->qf_qfold_max) {
  132       /* If we have seen a nice breakpoint during traversal, shuffle data
  133        * around a bit so as to restore the trailing part after flushing */
  134       if (self->qf_brkl > 0) {
  135          save_w = self->qf_datw - self->qf_brkw;
  136          save_l = self->qf_dat.l - self->qf_brkl;
  137          save_b = self->qf_dat.s + self->qf_brkl + 2;
  138          memmove(save_b, save_b - 2, save_l);
  139          self->qf_dat.l = self->qf_brkl;
  140       }
  141 
  142       self->qf_dat.s[self->qf_dat.l++] = '\\';
  143 jflush:
  144       self->qf_dat.s[self->qf_dat.l++] = '\n';
  145       rv = quoteflt_flush(self);
  146 
  147       /* Restore takeovers, if any */
  148       if (save_b != NULL) {
  149          self->qf_brk_isws = FAL0;
  150          self->qf_datw += save_w;
  151          self->qf_dat.l = save_l;
  152          memmove(self->qf_dat.s, save_b, save_l);
  153       }
  154    } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
  155       bool_t isws = (iswspace(wc) != 0);
  156 
  157       if (isws || !self->qf_brk_isws || self->qf_brkl == 0) {
  158          self->qf_brkl = self->qf_dat.l;
  159          self->qf_brkw = self->qf_datw;
  160          self->qf_brk_isws = isws;
  161       }
  162    }
  163 
  164    /* If state changed to prefix, perform full reset (note this implies that
  165     * quoteflt_flush() performs too much work..) */
  166    if (wc == '\n') {
  167       self->qf_state = _QF_PREFIX;
  168       self->qf_wscnt = self->qf_datw = 0;
  169       self->qf_currq.l = 0;
  170    }
  171 jleave:
  172    NYD_LEAVE;
  173    return rv;
  174 }
  175 
  176 static ssize_t
  177 _qf_state_prefix(struct qf_vc *vc)
  178 {
  179    struct quoteflt *self;
  180    ssize_t rv;
  181    char const *buf;
  182    size_t len, i;
  183    wchar_t wc;
  184    NYD_ENTER;
  185 
  186    self = vc->self;
  187    rv = 0;
  188 
  189    for (buf = vc->buf, len = vc->len; len > 0;) {
  190       /* xxx NULL BYTE! */
  191       i = mbrtowc(&wc, buf, len, self->qf_mbps);
  192       if (i == (size_t)-1) {
  193          /* On hard error, don't modify mbstate_t and step one byte */
  194          self->qf_mbps[0] = self->qf_mbps[1];
  195          ++buf;
  196          --len;
  197          self->qf_wscnt = 0;
  198          continue;
  199       }
  200       self->qf_mbps[1] = self->qf_mbps[0];
  201       if (i == (size_t)-2) {
  202          /* Redundant shift sequence, out of buffer */
  203          len = 0;
  204          break;
  205       }
  206       buf += i;
  207       len -= i;
  208 
  209       if (wc == L'\n')
  210          goto jfin;
  211       if (iswspace(wc)) {
  212          ++self->qf_wscnt;
  213          continue;
  214       }
  215       if (i == 1 && n_uasciichar(wc) &&
  216             strchr(self->qf_quote_chars, (char)wc) != NULL){
  217          self->qf_wscnt = 0;
  218          if (self->qf_currq.l >= n_QUOTE_MAX - 3) {
  219             self->qf_currq.s[n_QUOTE_MAX - 3] = '.';
  220             self->qf_currq.s[n_QUOTE_MAX - 2] = '.';
  221             self->qf_currq.s[n_QUOTE_MAX - 1] = '.';
  222             self->qf_currq.l = n_QUOTE_MAX;
  223          } else
  224             self->qf_currq.s[self->qf_currq.l++] = buf[-1];
  225          continue;
  226       }
  227 
  228       /* The quote is parsed and compressed; dump it */
  229 jfin:
  230       self->qf_state = _QF_DATA;
  231       /* Overtake WS to the current quote in order to preserve it for eventual
  232        * necessary follow lines, too */
  233       /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
  234       while (self->qf_wscnt-- > 0 && self->qf_currq.l < n_QUOTE_MAX)
  235          self->qf_currq.s[self->qf_currq.l++] = ' ';
  236       self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
  237       self->qf_wscnt = 0;
  238       rv = _qf_add_data(self, wc);
  239       break;
  240    }
  241 
  242    vc->buf = buf;
  243    vc->len = len;
  244    NYD_LEAVE;
  245    return rv;
  246 }
  247 
  248 static ssize_t
  249 _qf_state_data(struct qf_vc *vc)
  250 {
  251    struct quoteflt *self;
  252    ssize_t rv;
  253    char const *buf;
  254    size_t len, i;
  255    wchar_t wc;
  256    NYD_ENTER;
  257 
  258    self = vc->self;
  259    rv = 0;
  260 
  261    for (buf = vc->buf, len = vc->len; len > 0;) {
  262       /* xxx NULL BYTE! */
  263       i = mbrtowc(&wc, buf, len, self->qf_mbps);
  264       if (i == (size_t)-1) {
  265          /* On hard error, don't modify mbstate_t and step one byte */
  266          self->qf_mbps[0] = self->qf_mbps[1];
  267          ++buf;
  268          --len;
  269          continue;
  270       }
  271       self->qf_mbps[1] = self->qf_mbps[0];
  272       if (i == (size_t)-2) {
  273          /* Redundant shift sequence, out of buffer */
  274          len = 0;
  275          break;
  276       }
  277       buf += i;
  278       len -= i;
  279 
  280       {  ssize_t j = _qf_add_data(self, wc);
  281          if (j < 0) {
  282             rv = j;
  283             break;
  284          }
  285          rv += j;
  286       }
  287 
  288       if (self->qf_state != _QF_DATA)
  289          break;
  290    }
  291 
  292    vc->buf = buf;
  293    vc->len = len;
  294    NYD_LEAVE;
  295    return rv;
  296 }
  297 #endif /* HAVE_QUOTE_FOLD */
  298 
  299 FL struct quoteflt *
  300 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
  301 {
  302    static struct quoteflt qf_i;
  303 
  304    return &qf_i;
  305 }
  306 
  307 FL void
  308 quoteflt_init(struct quoteflt *self, char const *prefix)
  309 {
  310 #ifdef HAVE_QUOTE_FOLD
  311    char const *xcp, *cp;
  312 #endif
  313    NYD_ENTER;
  314 
  315    memset(self, 0, sizeof *self);
  316 
  317    if ((self->qf_pfix = prefix) != NULL)
  318       self->qf_pfix_len = (ui32_t)strlen(prefix);
  319 
  320    /* Check whether the user wants the more fancy quoting algorithm */
  321    /* TODO *quote-fold*: n_QUOTE_MAX may excess it! */
  322 #ifdef HAVE_QUOTE_FOLD
  323    if (self->qf_pfix_len > 0 && (cp = ok_vlook(quote_fold)) != NULL) {
  324       ui32_t qmin, qmax;
  325 
  326       /* These magic values ensure we don't bail */
  327       n_idec_ui32_cp(&qmax, cp, 10, &xcp);
  328       if (qmax < self->qf_pfix_len + 6)
  329          qmax = self->qf_pfix_len + 6;
  330       --qmax; /* The newline escape */
  331       if (cp == xcp || *xcp == '\0')
  332          qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
  333       else {
  334          n_idec_ui32_cp(&qmin, &xcp[1], 10, NULL);
  335          if (qmin < qmax >> 1)
  336             qmin = qmax >> 1;
  337          else if (qmin > qmax - 2)
  338             qmin = qmax - 2;
  339       }
  340       self->qf_qfold_min = qmin;
  341       self->qf_qfold_max = qmax;
  342       self->qf_quote_chars = ok_vlook(quote_chars);
  343 
  344       /* Add pad for takeover copies, reverse solidus and newline */
  345       self->qf_dat.s = salloc((qmax + 3) * n_mb_cur_max);
  346       self->qf_currq.s = salloc((n_QUOTE_MAX + 1) * n_mb_cur_max);
  347    }
  348 #endif
  349    NYD_LEAVE;
  350 }
  351 
  352 FL void
  353 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
  354 {
  355    NYD_ENTER;
  356    n_UNUSED(self);
  357    NYD_LEAVE;
  358 }
  359 
  360 FL void
  361 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
  362 {
  363    NYD_ENTER;
  364    self->qf_os = f;
  365 #ifdef HAVE_QUOTE_FOLD
  366    self->qf_state = _QF_CLEAN;
  367    self->qf_dat.l =
  368    self->qf_currq.l = 0;
  369    memset(self->qf_mbps, 0, sizeof self->qf_mbps);
  370 #endif
  371    NYD_LEAVE;
  372 }
  373 
  /* Push len bytes at dat through the filter, writing to .qf_os.
   * Three modes of operation:
   * - without a prefix the data is written as-is;
   * - without *quote-fold* (.qf_qfold_max is 0, or HAVE_QUOTE_FOLD is not
   *   defined) the prefix is written at the beginning of each line;
   * - otherwise the data is driven through the folding state machine.
   * Returns the number of bytes output, or -1 on error */
  374 FL ssize_t
  375 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
  376 {
  377    /* (xxx Ideally the actual push() [and flush()] would be functions on their
  378     * xxx own, via indirect vtbl call ..) */
  379    ssize_t rv = 0;
  380    NYD_ENTER;
  381 
  382    self->qf_nl_last = (len > 0 && dat[len - 1] == '\n'); /* TODO HACK */
  383 
  384    if (len == 0)
  385       goto jleave;
  386 
  387    /* Bypass? TODO Finally, this filter simply should not be used, then
  388     * (TODO It supercedes prefix_write() or something) */
  389    if (self->qf_pfix_len == 0) {
  390       if (len != fwrite(dat, 1, len, self->qf_os))
  391          goto jerr;
  392       rv = len;
  393    }
  394    /* Normal: place *indentprefix* at every BOL */
  395    else
  396 #ifdef HAVE_QUOTE_FOLD
  397       if (self->qf_qfold_max == 0)
  398 #endif
  399    {
  400       void *vp;
  401       size_t ll;
               /* In this mode .qf_qfold_min is (ab)used as a boolean which is
                * carried across calls: non-zero means the prefix of the current
                * line has already been written */
  402       bool_t pxok = (self->qf_qfold_min != 0);
  403 
  404       for (;;) {
  405          if (!pxok) {
  406             ll = self->qf_pfix_len;
  407             if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
  408                goto jerr;
  409             rv += ll;
  410             pxok = TRU1;
  411          }
  412 
  413          /* xxx Strictly speaking this is invalid, because only `/' and `.' are
  414           * xxx mandated by POSIX.1-2008 as "invariant across all locales
  415           * xxx supported"; though there is no charset known which uses this
  416           * xxx control char as part of a multibyte character; note that S-nail
  417           * XXX (and the Mail codebase as such) do not support EBCDIC */
  418          if ((vp = memchr(dat, '\n', len)) == NULL)
  419             ll = len;
  420          else {
                  /* A complete line: the next one needs the prefix again */
  421             pxok = FAL0;
  422             ll = PTR2SIZE((char*)vp - dat) + 1;
  423          }
  424 
  425          if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
  426             goto jerr;
  427          rv += ll;
  428          if ((len -= ll) == 0)
  429             break;
  430          dat += ll;
  431       }
  432 
            /* Remember for the next call whether we stopped within a line */
  433       self->qf_qfold_min = pxok;
  434    }
  435    /* Overly complicated, though still only line-per-line: *quote-fold*.
  436     * - If .qf_currq.l is 0, then we are in a clean state.  Reset .qf_mbps;
  437     *   TODO note this means we assume that lines start with reset escape seq,
  438     *   TODO but i don't think this is any worse than what we currently do;
  439     *   TODO in 15.0, with the value carrier, we should carry conversion states
  440     *   TODO all along, only resetting on error (or at words for header =???=);
  441     *   TODO this still is weird for error handling, but we need to act more
  442     *   TODO stream-alike (though in practice i don't think cross-line states
  443     *   TODO can be found, because of compatibility reasons; however, being
  444     *   TODO a problem rather than a solution is not a good thing (tm))
  445     * - Lookout for a newline */
  446 #ifdef HAVE_QUOTE_FOLD
  447    else {
  448       struct qf_vc vc;
  449       ssize_t i;
  450 
  451       vc.self = self;
  452       vc.buf = dat;
  453       vc.len = len;
            /* The handlers shrink vc.len as they consume input, and switch
             * .qf_state among each other */
  454       while (vc.len > 0) {
  455          switch (self->qf_state) {
  456          case _QF_CLEAN:
  457          case _QF_PREFIX:
  458             i = _qf_state_prefix(&vc);
  459             break;
  460          default: /* silence cc (`i' unused) */
  461          case _QF_DATA:
  462             i = _qf_state_data(&vc);
  463             break;
  464          }
  465          if (i < 0)
  466             goto jerr;
  467          rv += i;
  468       }
  469    }
  470 #endif /* HAVE_QUOTE_FOLD */
  471 
  472 jleave:
  473    NYD_LEAVE;
  474    return rv;
  475 jerr:
  476    rv = -1;
  477    goto jleave;
  478 }
  479 
  480 FL ssize_t
  481 quoteflt_flush(struct quoteflt *self)
  482 {
  483    ssize_t rv = 0;
  484    NYD_ENTER;
  485    n_UNUSED(self);
  486 
  487 #ifdef HAVE_QUOTE_FOLD
  488    if (self->qf_dat.l > 0) {
  489       rv = _qf_dump_prefix(self);
  490       if (rv >= 0) {
  491          size_t i = self->qf_dat.l;
  492          if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
  493             rv += i;
  494          else
  495             rv = -1;
  496          self->qf_dat.l = 0;
  497          self->qf_brk_isws = FAL0;
  498          self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
  499          self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
  500       }
  501    }
  502 #endif
  503    NYD_LEAVE;
  504    return rv;
  505 }
  506 
  507 /*
  508  * HTML tagsoup filter TODO rewrite wchar_t based (require HAVE_C90AMEND1)
  509  * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
  510  * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
  511  * TODO   (nonstandard) and watch out for style="gmail_quote" (or so, VERY
  512  * TODO   nonstandard) and tracking a stack of such elements (to be popped
  513  * TODO   once the closing element is seen).  Then, after writing a newline,
  514  * TODO   place sizeof(stack) ">"s first.  But aren't these HTML mails rude?
  515  * TODO Interlocking and non-well-formed data will break us down
  516  */
  517 #ifdef HAVE_FILTER_HTML_TAGSOUP
  518 
  519 enum hf_limits {
  520    _HF_MINLEN  = 10,       /* Minimum line length (can't really be smaller) */
  521    _HF_BRKSUB  = 8         /* Start considering line break MAX - BRKSUB */
  522 };
  523 
  524 enum hf_flags {
  525    _HF_BQUOTE_MASK = 0xFFFFu,
  526    _HF_UTF8 = 1u<<16,   /* Data is in UTF-8 */
  527    _HF_ERROR = 1u<<17,  /* A hard error occurred, bail as soon as possible */
  528    _HF_NOPUT = 1u<<18,  /* (In a tag,) Don't generate output */
  529    _HF_IGN = 1u<<19,    /* Ignore mode on */
  530    _HF_ANY = 1u<<20,    /* Yet seen just any output */
  531    _HF_PRE = 1u<<21,    /* In <pre>formatted mode */
  532    _HF_ENT = 1u<<22,    /* Currently parsing an entity */
  533    _HF_BLANK = 1u<<23,  /* Whitespace last */
  534    _HF_HREF = 1u<<24,   /* External <a href=> was the last href seen */
  535 
  536    _HF_NL_1 = 1u<<25,   /* One \n seen */
  537    _HF_NL_2 = 2u<<25,   /* We have produced an all empty line */
  538    _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
  539 };
  540 
  541 enum hf_special_actions {
  542    _HFSA_NEEDSEP  = -1,    /* Need an empty line (paragraph separator) */
  543    _HFSA_NEEDNL   = -2,    /* Need a new line start (table row) */
  544    _HFSA_IGN      = -3,    /* Things like <style>..</style>, <script>.. */
  545    _HFSA_PRE      = -4,    /* <pre>.. */
  546    _HFSA_PRE_END  = -5,
  547    _HFSA_IMG      = -6,    /* <img> */
  548    _HFSA_HREF     = -7,    /* <a>.. */
  549    _HFSA_HREF_END = -8,
  550    _HFSA_BQUOTE   = -9,    /* <blockquote>, interpreted as citation! */
  551    _HFSA_BQUOTE_END = -10
  552 };
  553 
  554 enum hf_entity_flags {
  555    _HFE_HAVE_UNI  = 1<<6,  /* Have a Unicode replacement character */
  556    _HFE_HAVE_CSTR = 1<<7,  /* Have a string replacement */
  557    /* We store the length of the entity name in the flags, too */
  558    _HFE_LENGTH_MASK = (1<<6) - 1
  559 };
  560 
  /* Node of the singly linked list of collected links; _hf_dump_hrefs()
   * reverses that list, prints each node as "  [no] dat" and frees it */
  561 struct htmlflt_href {
  562    struct htmlflt_href *hfh_next;   /* Next node, or NULL */
  563    ui32_t      hfh_no;     /* Running sequence */
  564    ui32_t      hfh_len;    /* of .hfh_dat */
           /* Trailing variable size storage; it is printed with an explicit
            * length, so presumably it is not NUL terminated -- confirm */
  565    char        hfh_dat[n_VFIELD_SIZE(0)];
  566 };
  567 
  /* One entry of the table of known tags (_hf_tags).  The table initializes
   * entries by position, so the member order must not be changed */
  568 struct htmlflt_tag {
  569    si32_t      hft_act;    /* char or hf_special_actions */
  570    /* Not NUL: character to inject, with high bit set: place a space
  571     * afterwards.  Note: only recognized with _HFSA_NEEDSEP or _HFSA_NEEDNL */
  572    char        hft_injc;
  573    ui8_t       hft_len;    /* Useful bytes in (NUL terminated) .hft_tag */
  574    char const  hft_tag[10]; /* Tag less < and > surroundings (TR, /TR, ..) */
  575 };
  /* Compile-time check that a tag name is smaller than LINESIZE */
  576 n_CTA(n_SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE,
  577    "Structure field too large a size"); /* .hf_ign_tag */
  578 
  /* One entry of the table of known entities (_hf_ents).  The table
   * initializes entries by position, so the member order must not be changed */
  579 struct hf_ent {
  580    ui8_t       hfe_flags;  /* enum hf_entity_flags plus length of .hfe_ent */
  581    char        hfe_c;      /* Plain replacement character */
  582    ui16_t      hfe_uni;    /* Unicode codepoint if _HFE_HAVE_UNI */
  583    char        hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
  584    char const  hfe_ent[7]; /* Entity less & and ; surroundings */
  585 };
  586 
  587 /* Tag list; not binary searched :(, so try to take care a bit */
  588 static struct htmlflt_tag const  _hf_tags[] = {
  589 # undef _X
  590 # undef _XC
  591 # define _X(S,A)     {A, '\0', sizeof(S) -1, S "\0"}
  592 # define _XC(S,C,A)  {A, C, sizeof(S) -1, S "\0"}
  593 
  594 # if 0 /* This is treated very special (to avoid wasting space in .hft_tag) */
  595    _X("BLOCKQUOTE", _HFSA_BQUOTE), _X("/BLOCKQUOTE", _HFSA_BQUOTE_END),
  596 # endif
  597 
  598    _X("P", _HFSA_NEEDSEP),       _X("/P", _HFSA_NEEDNL),
  599    _X("DIV", _HFSA_NEEDSEP),     _X("/DIV", _HFSA_NEEDNL),
  600    _X("TR", _HFSA_NEEDNL),
  601                                  _X("/TH", '\t'),
  602                                  _X("/TD", '\t'),
  603    /* Let it stand out; also since we don't support implicit paragraphs after
  604     * block elements, plain running text after a list (seen in Unicode
  605     * announcement via Firefox) */
  606    _X("UL", _HFSA_NEEDSEP),      _X("/UL", _HFSA_NEEDSEP),
  607    _XC("LI", (char)0x80 | '*', _HFSA_NEEDSEP),
  608    _X("DL", _HFSA_NEEDSEP),
  609    _X("DT", _HFSA_NEEDNL),
  610 
  611    _X("A", _HFSA_HREF),          _X("/A", _HFSA_HREF_END),
  612    _X("IMG", _HFSA_IMG),
  613    _X("BR", '\n'),
  614    _X("PRE", _HFSA_PRE),         _X("/PRE", _HFSA_PRE_END),
  615    _X("TITLE", _HFSA_NEEDSEP),   /*_X("/TITLE", '\n'),*/
  616    _X("H1", _HFSA_NEEDSEP),      /*_X("/H1", '\n'),*/
  617    _X("H2", _HFSA_NEEDSEP),      /*_X("/H2", '\n'),*/
  618    _X("H3", _HFSA_NEEDSEP),      /*_X("/H3", '\n'),*/
  619    _X("H4", _HFSA_NEEDSEP),      /*_X("/H4", '\n'),*/
  620    _X("H5", _HFSA_NEEDSEP),      /*_X("/H5", '\n'),*/
  621    _X("H6", _HFSA_NEEDSEP),      /*_X("/H6", '\n'),*/
  622 
  623    _X("STYLE", _HFSA_IGN),
  624    _X("SCRIPT", _HFSA_IGN),
  625 
  626 # undef _X
  627 };
  628 
  629 /* Entity list; not binary searched.. */
  630 static struct hf_ent const       _hf_ents[] = {
  631 # undef _X
  632 # undef _XU
  633 # undef _XS
  634 # undef _XUS
  635 # define _X(E,C)     {(sizeof(E) -1), C, 0x0u, "", E "\0"}
  636 # define _XU(E,C,U)  {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E "\0"}
  637 # define _XS(E,S)    {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u,S "\0",E "\0"}
  638 # define _XSU(E,S,U) \
  639    {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E "\0"}
  640 
  641    _X("quot", '"'),
  642    _X("amp", '&'),
  643    _X("lt", '<'),                _X("gt", '>'),
  644 
  645    _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
  646    _XU("middot", '.', 0x00B7),
  647    _XSU("hellip", "...", 0x2026),
  648    _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
  649    _XSU("laquo", "<<", 0x00AB),  _XSU("raquo", ">>", 0x00BB),
  650    _XSU("lsaquo", "<", 0x2039),  _XSU("rsaquo", ">", 0x203A),
  651    _XSU("lsquo", "'", 0x2018),   _XSU("rsquo", "'", 0x2019),
  652    _XSU("ldquo", "\"", 0x201C),  _XSU("rdquo", "\"", 0x201D),
  653    _XSU("uarr", "^|", 0x2191),   _XSU("darr", "|v", 0x2193),
  654 
  655    _XSU("cent", "CENT", 0x00A2),
  656    _XSU("copy", "(C)", 0x00A9),
  657    _XSU("euro", "EUR", 0x20AC),
  658    _XSU("infin", "INFY", 0x221E),
  659    _XSU("pound", "GBP", 0x00A3),
  660    _XSU("reg", "(R)", 0x00AE),
  661    _XSU("sect", "S:", 0x00A7),
  662    _XSU("yen", "JPY", 0x00A5),
  663 
  664    /* German umlauts */
  665    _XSU("Auml", "Ae", 0x00C4),   _XSU("auml", "ae", 0x00E4),
  666    _XSU("Ouml", "Oe", 0x00D6),   _XSU("ouml", "oe", 0x00F6),
  667    _XSU("Uuml", "Ue", 0x00DC),   _XSU("uuml", "ue", 0x00FC),
  668    _XSU("szlig", "ss", 0x00DF)
  669 
  670 # undef _X
  671 # undef _XU
  672 # undef _XS
  673 # undef _XSU
  674 };
  675 
  676 /* Real output */
  /* (These write to the output stream .hf_os, or store into the line buffer) */
  677 static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
  678 static struct htmlflt * _hf_dump(struct htmlflt *self);
  679 static struct htmlflt * _hf_store(struct htmlflt *self, char c);
  680 # ifdef HAVE_NATCH_CHAR
  681 static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
  682 # endif
  683 
  684 /* Virtual output */
  685 static struct htmlflt * _hf_nl(struct htmlflt *self);
  686 static struct htmlflt * _hf_nl_force(struct htmlflt *self);
  687 static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
  688 static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
  689 static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
  690 static struct htmlflt * _hf_putbuf(struct htmlflt *self,
  691                            char const *cp, size_t len);
  692 
  693 /* Try to locate the parameter param in .hf_bdat, store it (non-terminated!)
      * or NULL */
  694 static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
  695                            char const *param);
  696 
  697 /* Expand all entities in the given parameter */
  698 static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
  699                            struct str const *param);
  700 
  701 /* Completely parsed over a tag / an entity, interpret that */
  702 static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
  703 static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
  704                            size_t l);
  705 
  706 /* Input handler */
  707 static ssize_t          _hf_add_data(struct htmlflt *self,
  708                            char const *dat, size_t len);
  709 
  710 static struct htmlflt *
  711 _hf_dump_hrefs(struct htmlflt *self)
  712 {
  713    struct htmlflt_href *hhp;
  714    NYD2_ENTER;
  715 
  716    if (!(self->hf_flags & _HF_NL_2) && putc('\n', self->hf_os) == EOF) {
  717       self->hf_flags |= _HF_ERROR;
  718       goto jleave;
  719    }
  720 
  721    /* Reverse the list */
  722    for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
  723       struct htmlflt_href *tmp = hhp->hfh_next;
  724       hhp->hfh_next = self->hf_hrefs;
  725       self->hf_hrefs = hhp;
  726       hhp = tmp;
  727    }
  728 
  729    /* Then dump it */
  730    while ((hhp = self->hf_hrefs) != NULL) {
  731       self->hf_hrefs = hhp->hfh_next;
  732 
  733       if (!(self->hf_flags & _HF_ERROR)) {
  734          int w = fprintf(self->hf_os, "  [%u] %.*s\n",
  735                hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
  736          if (w < 0)
  737             self->hf_flags |= _HF_ERROR;
  738       }
  739       free(hhp);
  740    }
  741 
  742    self->hf_flags |= (putc('\n', self->hf_os) == EOF)
  743          ?  _HF_ERROR : _HF_NL_1 | _HF_NL_2;
  744    self->hf_href_dist = (ui32_t)n_realscreenheight >> 1;
  745 jleave:
  746    NYD2_LEAVE;
  747    return self;
  748 }
  749 
  /* Write the .hf_len bytes of the line buffer to .hf_os and empty the buffer;
   * a newline is appended unless the last byte written was one (thus an empty
   * buffer produces an empty line).  _HF_BLANK is cleared, and the newline
   * flags are advanced if a newline had to be appended.  On a write error
   * _HF_ERROR is set and the function returns immediately.
   * Afterwards the collected links are dumped if enough lines were written
   * since the last time and an empty line has just been produced.
   * Always returns self */
  750 static struct htmlflt *
  751 _hf_dump(struct htmlflt *self)
  752 {
  753    ui32_t f, l;
  754    char c, *cp;
  755    NYD2_ENTER;
  756 
  757    f = self->hf_flags & ~_HF_BLANK;
  758    l = self->hf_len;
  759    cp = self->hf_line;
         /* The buffer is logically emptied up front; l and cp keep the data */
  760    self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
  761 
  762    for (c = '\0'; l > 0; --l) {
  763       c = *cp++;
  764 jput:
  765       if (putc(c, self->hf_os) == EOF) {
  766          self->hf_flags = (f |= _HF_ERROR);
  767          goto jleave;
  768       }
  769    }
  770 
  771    if (c != '\n') {
            /* One newline seen -> _HF_NL_1, a second in a row -> _HF_NL_2 */
  772       f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
            /* Jump back into the loop body to write exactly one more byte: with
             * l == 1 the loop terminates after it, and c then is a newline */
  773       l = 1;
  774       c = '\n';
  775       goto jput;
  776    }
  777    self->hf_flags = f;
  778 
  779    /* Check whether there are HREFs to dump; there is so much messy tagsoup out
  780     * there that it seems best not to simply dump HREFs in each _dump(), but
  781     * only with some gap, let's say half the real screen height */
  782    if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
  783       self = _hf_dump_hrefs(self);
  784 jleave:
  785    NYD2_LEAVE;
  786    return self;
  787 }
  788 
  /* Store the byte c (which must not be a newline) in the line buffer.
   * - If the buffer is empty and a quote level is set in the flags, the line is
   *   started with a quote indicator first.
   * - A tabulator is stored as spaces.
   * - Blanks and some punctuation are remembered as possible breakpoints.
   * - With HAVE_NATCH_CHAR in a multibyte locale the visual width is tracked,
   *   and some invisible characters are dropped again.
   * - Once (.hf_lmax - _HF_BRKSUB) columns are used the line is broken and
   *   dumped via _hf_dump(), preferably at a remembered breakpoint.
   * Returns self (as returned by the functions that have been called) */
  789 static struct htmlflt *
  790 _hf_store(struct htmlflt *self, char c)
  791 {
  792    ui32_t l, i;
  793    NYD2_ENTER;
  794 
  795    assert(c != '\n');
  796 
  797    l = self->hf_len;
         /* At the beginning of a line with a quote level > 0: inject the quote
          * indicator; i is the quote level here */
  798    if(n_UNLIKELY(l == 0) && (i = (self->hf_flags & _HF_BQUOTE_MASK)) != 0 &&
  799          self->hf_lmax > _HF_MINLEN){
  800       ui32_t len, j;
  801       char const *ip;
  802 
            /* NOTE(review): the result is passed to strlen(3) unchecked;
             * presumably *indentprefix* always has a value -- confirm */
  803       ip = ok_vlook(indentprefix);
  804       len = strlen(ip);
  805       if(len == 0 || len >= _HF_MINLEN){
  806          ip = "   |"; /* XXX something from *quote-chars* */
  807          len = sizeof("   |") -1;
  808       }
  809 
            /* Copy it directly into the buffer, with tabulators as spaces */
  810       self->hf_len = len;
  811       for(j = len; j-- != 0;){
  812          char x;
  813 
  814          if((x = ip[j]) == '\t')
  815             x = ' ';
  816          self->hf_line[j] = x;
  817       }
  818 
            /* One more indicator for each further quote level, space permitting
             * (.hf_len is not 0 here, so the recursion cannot end up in this
             * branch again) */
  819       while(--i > 0 && self->hf_len < self->hf_lmax - _HF_BRKSUB)
  820          self = _hf_store(self, '|'); /* XXX something from *quote-chars* */
  821 
  822       l = self->hf_len;
  823    }
  824 
  825    self->hf_line[l] = (c == '\t' ? ' ' : c);
  826    self->hf_len = ++l;
  827    if (blankspacechar(c)) {
  828       if (c == '\t') {
               /* i is in the range 1..8 here, so the test below always succeeds.
                * NOTE(review): one blank has already been stored above, so a
                * tabulator in column 0 results in 9 blanks in total -- confirm
                * this is intended */
  829          i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
  830          if (i > 0) {
  831             do
  832                self = _hf_store(self, ' ');
  833             while (--i > 0);
  834             goto jleave;
  835          }
  836       }
  837       self->hf_last_ws = l;
  838    } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
  839       self->hf_last_ws = l;
  840 
         /* i: the used width, which is the byte count unless it is replaced by
          * the visual width below */
  841    i = l;
  842 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
  843    if (n_mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least */
  844       wchar_t wc;
  845       int w, x;
  846 
            /* Try to convert the bytes stored since the last complete character */
  847       if((x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff)
  848             ) > 0){
  849          if ((w = wcwidth(wc)) == -1 ||
  850                /* Actively filter out L-TO-R and R-TO-R marks TODO ctext */
  851                (wc == 0x200E || wc == 0x200F ||
  852                   (wc >= 0x202A && wc <= 0x202E)) ||
  853                /* And some zero-width messes */
  854                wc == 0x00AD || (wc >= 0x200B && wc <= 0x200D) ||
  855                /* Oh about the ISO C wide character interfaces, baby! */
  856                (wc == 0xFEFF)){
                  /* Drop the bytes of that character from the buffer again */
  857             self->hf_len -= x;
  858             goto jleave;
  859          } else if (iswspace(wc))
  860             self->hf_last_ws = l;
  861          self->hf_mboff += x;
  862          i = (self->hf_mbwidth += w);
  863       } else {
  864          if (x < 0) {
                  /* Reset the conversion state; if as many bytes are pending as
                   * a character can consist of at most, skip over one of them */
  865             (void)mbtowc(&wc, NULL, n_mb_cur_max);
  866             if (UICMP(32, l - self->hf_mboff, >=, n_mb_cur_max)) { /* XXX */
  867                ++self->hf_mboff;
  868                ++self->hf_mbwidth;
  869             }
  870          }
  871          i = self->hf_mbwidth;
  872       }
  873    }
  874 # endif
  875 
  876    /* Do we need to break the line? */
  877    if (i >= self->hf_lmax - _HF_BRKSUB) {
  878       ui32_t f, lim;
  879 
  880 
  881       /* Let's hope we saw a sane place to break this line! */
  882       if (self->hf_last_ws >= (lim = self->hf_lmax >> 1)) {
  883 jput:
               /* Dump the bytes up to the breakpoint, then move the remainder
                * (l - i bytes) to the front of the buffer */
  884          i = self->hf_len = self->hf_last_ws;
  885          self = _hf_dump(self);
  886          if ((self->hf_len = (l -= i)) > 0) {
  887             self->hf_flags &= ~_HF_NL_MASK;
  888             memmove(self->hf_line, self->hf_line + i, l);
  889 # ifdef HAVE_NATCH_CHAR
  890             __hf_sync_mbstuff(self);
  891 # endif
  892          }
  893          goto jleave;
  894       }
  895 
  896       /* Any 7-bit characters? */
            /* Scan backwards from the last stored byte down to index lim - 1:
             * break behind a 7-bit character, or, for UTF-8 data, in front of
             * a byte that is not a continuation byte */
  897       f = self->hf_flags;
  898       for (i = l; i-- >= lim;)
  899          if (asciichar((c = self->hf_line[i]))) {
  900             self->hf_last_ws = ++i;
  901             goto jput;
  902          } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
  903             self->hf_last_ws = i;
  904             goto jput;
  905          }
  906 
  907       /* Hard break necessary!  xxx really badly done */
  908       if (l >= self->hf_lmax - 1)
  909          self = _hf_dump(self);
  910    }
  911 jleave:
  912    NYD2_LEAVE;
  913    return self;
  914 }
  915 
  916 # ifdef HAVE_NATCH_CHAR
  917 static struct htmlflt *
  918 __hf_sync_mbstuff(struct htmlflt *self)
  919 {
  920    wchar_t wc;
  921    char const *b;
  922    ui32_t o, w, l;
  923    NYD2_ENTER;
  924 
  925    b = self->hf_line;
  926    o = w = 0;
  927    l = self->hf_len;
  928    goto jumpin;
  929 
  930    while (l > 0) {
  931       int x = mbtowc(&wc, b, l);
  932 
  933       if (x == 0)
  934          break;
  935 
  936       if (x > 0) {
  937          b += x;
  938          l -= x;
  939          o += x;
  940          if ((x = wcwidth(wc)) == -1)
  941             x = 1;
  942          w += x;
  943          continue;
  944       }
  945 
  946       /* Bad, skip over a single character.. XXX very bad indeed */
  947       ++b;
  948       ++o;
  949       ++w;
  950       --l;
  951 jumpin:
  952       (void)mbtowc(&wc, NULL, n_mb_cur_max);
  953    }
  954 
  955    self->hf_mboff = o;
  956    self->hf_mbwidth = w;
  957 
  958    NYD2_LEAVE;
  959    return self;
  960 }
  961 # endif /* HAVE_NATCH_CHAR */
  962 
  963 static struct htmlflt *
  964 _hf_nl(struct htmlflt *self)
  965 {
  966    ui32_t f;
  967    NYD2_ENTER;
  968 
  969    if (!((f = self->hf_flags) & _HF_ERROR)) {
  970       if (f & _HF_ANY) {
  971          if ((f & _HF_NL_MASK) != _HF_NL_MASK)
  972             self = _hf_dump(self);
  973       } else
  974          self->hf_flags = (f |= _HF_NL_MASK);
  975    }
  976    NYD2_LEAVE;
  977    return self;
  978 }
  979 
  980 static struct htmlflt *
  981 _hf_nl_force(struct htmlflt *self)
  982 {
  983    NYD2_ENTER;
  984    if (!(self->hf_flags & _HF_ERROR))
  985       self = _hf_dump(self);
  986    NYD2_LEAVE;
  987    return self;
  988 }
  989 
  990 static struct htmlflt *
  991 _hf_putc(struct htmlflt *self, char c)
  992 {
  993    ui32_t f;
  994    NYD2_ENTER;
  995 
  996    if ((f = self->hf_flags) & _HF_ERROR)
  997       goto jleave;
  998 
  999    if (c == '\n') {
 1000       self = _hf_nl(self);
 1001       goto jleave;
 1002    } else if (c == ' ' || c == '\t') {
 1003       if ((f & _HF_BLANK) || self->hf_len == 0)
 1004          goto jleave;
 1005       f |= _HF_BLANK;
 1006    } else
 1007       f &= ~_HF_BLANK;
 1008    f &= ~_HF_NL_MASK;
 1009    self->hf_flags = (f |= _HF_ANY);
 1010    self = _hf_store(self, c);
 1011 jleave:
 1012    NYD2_LEAVE;
 1013    return self;
 1014 }
 1015 
 1016 static struct htmlflt *
 1017 _hf_putc_premode(struct htmlflt *self, char c)
 1018 {
 1019    ui32_t f;
 1020    NYD2_ENTER;
 1021 
 1022    if ((f = self->hf_flags) & _HF_ERROR) {
 1023       ;
 1024    } else if (c == '\n')
 1025       self = _hf_nl_force(self);
 1026    else {
 1027       f &= ~_HF_NL_MASK;
 1028       self->hf_flags = (f |= _HF_ANY);
 1029       self = _hf_store(self, c);
 1030    }
 1031    NYD2_LEAVE;
 1032    return self;
 1033 }
 1034 
 1035 static struct htmlflt *
 1036 _hf_puts(struct htmlflt *self, char const *cp)
 1037 {
 1038    char c;
 1039    NYD2_ENTER;
 1040 
 1041    while ((c = *cp++) != '\0')
 1042       self = _hf_putc(self, c);
 1043    NYD2_LEAVE;
 1044    return self;
 1045 }
 1046 
 1047 static struct htmlflt *
 1048 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
 1049 {
 1050    NYD2_ENTER;
 1051 
 1052    while (len-- > 0)
 1053       self = _hf_putc(self, *cp++);
 1054    NYD2_LEAVE;
 1055    return self;
 1056 }
 1057 
 1058 static struct htmlflt *
 1059 _hf_param(struct htmlflt *self, struct str *store, char const *param)
 1060 {
 1061    char const *cp;
 1062    char c, x, quote;
 1063    size_t i;
 1064    bool_t hot;
 1065    NYD2_ENTER;
 1066 
 1067    store->s = NULL;
 1068    store->l = 0;
 1069    cp = self->hf_bdat;
 1070 
 1071    /* Skip over any non-WS first; be aware of soup, if it slipped through */
 1072    for(;;){
 1073       if((c = *cp++) == '\0' || c == '>')
 1074          goto jleave;
 1075       if(whitechar(c))
 1076          break;
 1077    }
 1078 
 1079    /* Search for the parameter, take care of other quoting along the way */
 1080    x = *param++;
 1081    x = upperconv(x);
 1082    i = strlen(param);
 1083 
 1084    for(hot = TRU1;;){
 1085       if((c = *cp++) == '\0' || c == '>')
 1086          goto jleave;
 1087       if(whitechar(c)){
 1088          hot = TRU1;
 1089          continue;
 1090       }
 1091 
 1092       /* Could it be a parameter? */
 1093       if(hot){
 1094          hot = FAL0;
 1095 
 1096          /* Is it the desired one? */
 1097          if((c = upperconv(c)) == x && !ascncasecmp(param, cp, i)){
 1098             char const *cp2 = cp + i;
 1099 
 1100             if((quote = *cp2++) != '='){
 1101                if(quote == '\0' || quote == '>')
 1102                   goto jleave;
 1103                while(whitechar(quote))
 1104                   quote = *cp2++;
 1105             }
 1106             if(quote == '='){
 1107                cp = cp2;
 1108                break;
 1109             }
 1110             continue; /* XXX Optimize: i bytes or even cp2 can't be it! */
 1111          }
 1112       }
 1113 
 1114       /* Not the desired one; but a parameter? */
 1115       if(c != '=')
 1116          continue;
 1117       /* If so, properly skip over the value */
 1118       if((c = *cp++) == '"' || c == '\''){
 1119          /* TODO i have forgotten whether reverse solidus quoting is allowed
 1120           * TODO quoted HTML parameter values?  not supporting that for now.. */
 1121          for(quote = c; (c = *cp++) != '\0' && c != quote;)
 1122             ;
 1123       }else
 1124          while(c != '\0' && !whitechar(c) && c != '>')
 1125             c = *++cp;
 1126       if(c == '\0')
 1127          goto jleave;
 1128    }
 1129 
 1130    /* Skip further whitespace */
 1131    for(;;){
 1132       if((c = *cp++) == '\0' || c == '>')
 1133          goto jleave;
 1134       if(!whitechar(c))
 1135          break;
 1136    }
 1137 
 1138    if(c == '"' || c == '\''){
 1139       /* TODO i have forgotten whether reverse solisud quoting is allowed in
 1140        * TODO quoted HTML parameter values?  not supporting that for now.. */
 1141       store->s = n_UNCONST(cp);
 1142       for(quote = c; (c = *cp) != '\0' && c != quote; ++cp)
 1143          ;
 1144       /* XXX ... and we simply ignore a missing trailing " :> */
 1145    }else{
 1146       store->s = n_UNCONST(cp - 1);
 1147       if(!whitechar(c))
 1148          while((c = *cp) != '\0' && !whitechar(c) && c != '>')
 1149             ++cp;
 1150    }
 1151    i = PTR2SIZE(cp - store->s);
 1152 
 1153    /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
 1154     * parameter values prefixed and suffixed by newlines!  Therefore trim the
 1155     * value content TODO join into the parse step above! */
 1156    for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
 1157       ;
 1158    store->s = n_UNCONST(cp);
 1159    for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
 1160       ;
 1161    if ((store->l = i) == 0)
 1162       store->s = NULL;
 1163 jleave:
 1164    NYD2_LEAVE;
 1165    return self;
 1166 }
 1167 
 1168 static struct htmlflt *
 1169 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
 1170 {
 1171    char const *cp, *maxcp, *ep;
 1172    char c;
 1173    size_t i;
 1174    NYD2_ENTER;
 1175 
 1176    for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
 1177       if ((c = *cp++) != '&')
 1178 jputc:
 1179          self = _hf_putc(self, c);
 1180       else {
 1181          for (ep = cp--;;) {
 1182             if (ep == maxcp || (c = *ep++) == '\0') {
 1183                for (; cp < ep; ++cp)
 1184                   self = _hf_putc(self, *cp);
 1185                goto jleave;
 1186             } else if (c == ';') {
 1187                if ((i = PTR2SIZE(ep - cp)) > 1) {
 1188                   self = _hf_check_ent(self, cp, i);
 1189                   break;
 1190                } else {
 1191                   c = *cp++;
 1192                   goto jputc;
 1193                }
 1194             }
 1195          }
 1196          cp = ep;
 1197       }
 1198 jleave:
 1199    NYD2_LEAVE;
 1200    return self;
 1201 }
 1202 
/* Act upon the complete tag s, which starts with "<" and, for the caller
 * visible in this file, is the NUL terminated content of self->hf_bdat
 * including the closing ">".  The tag name is searched in the _hf_tags table
 * and the configured action is performed (newline management, ignore mode,
 * IMG alt text, HREF bookkeeping, or injection of a single character);
 * <blockquote> is handled specially.  Unknown tags are dropped unless they do
 * not look like tags at all, in which case the buffered text is written
 * unchanged.  Returns self */
static struct htmlflt *
_hf_check_tag(struct htmlflt *self, char const *s)
{
   char nobuf[32], c;
   struct str param;
   size_t i;
   struct htmlflt_tag const *hftp;
   ui32_t f;
   NYD2_ENTER;

   /* Extra check only */
   assert(s != NULL);
   if (*s != '<') {
      DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
jput_as_is:
      /* (Note this writes the buffer self->hf_bdat, not s) */
      self = _hf_puts(self, self->hf_bdat);
      goto jleave;
   }

   /* Determine the length i of the tag name, which starts after the "<" */
   for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
      /* Special massage for things like <br/>: after the slash only whitespace
       * may separate us from the closing right angle! */
      if (c == '/') {
         size_t j = i + 1;

         while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
            ++j;
         if (c == '>')
            break;
      }

   /* Linear search for an entry of identical length that compares equal
    * case-insensitively and is properly delimited in s */
   for (hftp = _hf_tags;;) {
      if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
         c = s[hftp->hft_len];
         if (c == '>' || c == '/' || whitechar(c))
            break;
      }
      /* Table exhausted: either it is a blockquote or the tag is unknown */
      if (n_UNLIKELY(PTRCMP(++hftp, >=, _hf_tags + n_NELEM(_hf_tags)))){
         /* A <blockquote> is very special xxx */
         bool_t isct;

         /* isct: it is a closing tag, temporarily skip over the solidus */
         if((isct = (i > 1 && *s == '/'))){
            ++s;
            --i;
         }

         if(i != sizeof("blockquote") -1 || ascncasecmp(s, "blockquote", i) ||
               ((c = s[sizeof("blockquote") -1]) != '>' && !whitechar(c))){
            /* Not a blockquote, undo the above adjustment */
            s -= isct;
            i += isct;
            goto jnotknown;
         }

         /* Newline requests; these tests look at the current flags */
         if(!isct && !(self->hf_flags & _HF_NL_2))
            self = _hf_nl(self);
         if(!(self->hf_flags & _HF_NL_1))
            self = _hf_nl(self);
         /* The _HF_BQUOTE_MASK bits are used like a counter: an opening tag
          * increments unless all bits are set already, a closing tag
          * decrements unless zero.  All other flags are kept as they are */
         f = self->hf_flags;
         f &= _HF_BQUOTE_MASK;
         if(!isct){
            if(f != _HF_BQUOTE_MASK)
               ++f;
         }else if(f > 0)
            --f;
         f |= (self->hf_flags & ~_HF_BQUOTE_MASK);
         self->hf_flags = f;
         goto jleave;
      }
   }

   /* f is a snapshot of the flags as of before the action is performed */
   f = self->hf_flags;
   switch (hftp->hft_act) {
   case _HFSA_PRE_END:
      f &= ~_HF_PRE;
      /* The if(0) body is only entered via the case label it contains, so
       * that _HFSA_PRE and _HFSA_PRE_END share the code that follows */
      if (0) {
         /* FALLTHRU */
   case _HFSA_PRE:
         f |= _HF_PRE;
      }
      self->hf_flags = f;
      /* FALLTHRU */

   case _HFSA_NEEDSEP:
      if (!(self->hf_flags & _HF_NL_2))
         self = _hf_nl(self);
      /* FALLTHRU */
   case _HFSA_NEEDNL:
      /* NOTE(review): this tests the snapshot f, not self->hf_flags, i.e.,
       * an effect of the _hf_nl() call above is not seen here; it looks
       * deliberate (a second newline request for NEEDSEP) -- confirm */
      if (!(f & _HF_NL_1))
         self = _hf_nl(self);
      /* The low seven bits of .hft_injc are a character to be written, bit
       * 0x80 requests that a space follows */
      if (hftp->hft_injc != '\0') {
         self = _hf_putc(self, hftp->hft_injc & 0x7F);
         if ((uc_i)hftp->hft_injc & 0x80)
            self = _hf_putc(self, ' ');
      }
      break;

   case _HFSA_IGN:
      /* Remember the tag that started the ignore mode and enter it */
      self->hf_ign_tag = hftp;
      self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
      break;

   case _HFSA_IMG:
      /* Write "[alt-text]", or "[IMG]" if there is no usable alt= parameter;
       * entities are only expanded if the text contains an ampersand */
      self = _hf_param(self, &param, "alt");
      self = _hf_putc(self, '[');
      if (param.s == NULL) {
         param.s = n_UNCONST("IMG");
         param.l = 3;
         goto jimg_put;
      } /* else */ if (memchr(param.s, '&', param.l) != NULL)
         self = _hf_expand_all_ents(self, &param);
      else
jimg_put:
         self = _hf_putbuf(self, param.s, param.l);
      self = _hf_putc(self, ']');
      break;

   case _HFSA_HREF:
      self = _hf_param(self, &param, "href");
      /* Ignore non-external links */
      if (param.s != NULL && *param.s != '#') {
         /* Prepend a numbered node to the .hf_hrefs list and write "[NO]".
          * One byte more than the value is allocated, but no terminating NUL
          * is stored here: .hfh_len is the authoritative length */
         struct htmlflt_href *hhp = smalloc(
               n_VSTRUCT_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);

         hhp->hfh_next = self->hf_hrefs;
         hhp->hfh_no = ++self->hf_href_no;
         hhp->hfh_len = (ui32_t)param.l;
         memcpy(hhp->hfh_dat, param.s, param.l);

         snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
         self->hf_flags = (f |= _HF_HREF);
         self->hf_hrefs = hhp;
         self = _hf_puts(self, nobuf);
      } else
         self->hf_flags = (f &= ~_HF_HREF);
      break;
   case _HFSA_HREF_END:
      /* Write "[/NO]" only if the last HREF has been recorded above */
      if (f & _HF_HREF) {
         snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
         self = _hf_puts(self, nobuf);
      }
      break;

   default:
      /* Any other action value is itself the character to be written */
      c = (char)(hftp->hft_act & 0xFF);
      self = _hf_putc(self, c);
      break;
   case '\0':
      break;
   }
jleave:
   NYD2_LEAVE;
   return self;

   /* The problem is that even invalid tagsoup is widely used, without real
    * searching i have seen e-mail address in <N@H.D> notation, and more.
    * To protect us a bit look around and possibly write the content as such */
jnotknown:
   switch (*s) {
   case '!':
   case '?':
      /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
      goto jleave;
   case '>':
      /* Print out an empty tag as such */
      if (s[1] == '\0') {
         --s;
         goto jput_as_is;
      }
      break;
   case '/':
      ++s;
      break;
   default:
      break;
   }

   /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
   /* If the name contains a non-ASCII or punctuation character it is not
    * taken for a tag and the buffer is written as-is; otherwise the unknown
    * tag produces no output at all */
   while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
      if (!asciichar(c) || punctchar(c)) {
         self = _hf_puts(self, self->hf_bdat);
         break;
      }
   goto jleave;
}
 1387 
 1388 static struct htmlflt *
 1389 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
 1390 {
 1391    char nobuf[32];
 1392    char const *s_save;
 1393    size_t l_save;
 1394    struct hf_ent const *hfep;
 1395    size_t i;
 1396    NYD2_ENTER;
 1397 
 1398    s_save = s;
 1399    l_save = l;
 1400    assert(*s == '&');
 1401    assert(l > 0);
 1402    /* False entities seen in the wild assert(s[l - 1] == ';'); */
 1403    ++s;
 1404    l -= 2;
 1405 
 1406    /* Numeric entity, or try named search */
 1407    if (*s == '#') {
 1408       i = (*++s == 'x' ? 16 : 10);
 1409 
 1410       if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
 1411          memcpy(nobuf, s, l);
 1412          nobuf[l] = '\0';
 1413          n_idec_uiz_cp(&i, nobuf, i, NULL);
 1414          if (i <= 0x7F)
 1415             self = _hf_putc(self, (char)i);
 1416          else if (self->hf_flags & _HF_UTF8) {
 1417 jputuni:
 1418             l = n_utf32_to_utf8((ui32_t)i, nobuf);
 1419             self = _hf_putbuf(self, nobuf, l);
 1420          } else
 1421             goto jeent;
 1422       } else
 1423          goto jeent;
 1424    } else {
 1425       ui32_t f = self->hf_flags, hf;
 1426 
 1427       for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + n_NELEM(_hf_ents));
 1428             ++hfep)
 1429          if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
 1430                !strncmp(s, hfep->hfe_ent, l)) {
 1431             if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
 1432                i = hfep->hfe_uni;
 1433                goto jputuni;
 1434             } else if (hf & _HFE_HAVE_CSTR)
 1435                self = _hf_puts(self, hfep->hfe_cstr);
 1436             else
 1437                self = _hf_putc(self, hfep->hfe_c);
 1438             goto jleave;
 1439          }
 1440 jeent:
 1441       self = _hf_putbuf(self, s_save, l_save);
 1442    }
 1443 jleave:
 1444    NYD2_LEAVE;
 1445    return self;
 1446 }
 1447 
 1448 static ssize_t
 1449 _hf_add_data(struct htmlflt *self, char const *dat, size_t len)
 1450 {
 1451    char c, *cp, *cp_max;
 1452    bool_t hot;
 1453    ssize_t rv = 0;
 1454    NYD_ENTER;
 1455 
 1456    /* Final put request? */
 1457    if (dat == NULL) {
 1458       if (self->hf_len > 0 || self->hf_hrefs != NULL) {
 1459          self = _hf_dump(self);
 1460          if (self->hf_hrefs != NULL)
 1461             self = _hf_dump_hrefs(self);
 1462          rv = 1;
 1463       }
 1464       goto jleave;
 1465    }
 1466 
 1467    /* Always ensure some initial buffer */
 1468    if ((cp = self->hf_curr) != NULL)
 1469       cp_max = self->hf_bmax;
 1470    else {
 1471       cp = self->hf_curr = self->hf_bdat = smalloc(LINESIZE);
 1472       cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
 1473    }
 1474    hot = (cp != self->hf_bdat);
 1475 
 1476    for (rv = (ssize_t)len; len > 0; --len) {
 1477       ui32_t f = self->hf_flags;
 1478 
 1479       if (f & _HF_ERROR)
 1480          break;
 1481       c = *dat++;
 1482 
 1483       /* Soup is really weird, and scripts may contain almost anything (and
 1484        * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
 1485        * test and walk until we see the required end tag */
 1486       /* TODO For real safety _HF_IGN soup condome would also need to know
 1487        * TODO about quoted strings so that 'var i = "</script>";' couldn't
 1488        * TODO fool it!   We really want this mode also for _HF_NOPUT to be
 1489        * TODO able to *gracefully* detect the tag-closing '>', but then if
 1490        * TODO that is a single mechanism we should have made it! */
 1491       if (f & _HF_IGN) {
 1492          struct htmlflt_tag const *hftp = self->hf_ign_tag;
 1493          size_t i;
 1494 
 1495          if (c == '<') {
 1496             hot = TRU1;
 1497 jcp_reset:
 1498             cp = self->hf_bdat;
 1499          } else if (c == '>') {
 1500             if (hot) {
 1501                if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
 1502                      --i == hftp->hft_len &&
 1503                      !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
 1504                   self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
 1505                hot = FAL0;
 1506                goto jcp_reset;
 1507             }
 1508          } else if (hot) {
 1509             *cp++ = c;
 1510             i = PTR2SIZE(cp - self->hf_bdat);
 1511             if ((i == 1 && c != '/') || --i > hftp->hft_len) {
 1512                hot = FAL0;
 1513                goto jcp_reset;
 1514             }
 1515          }
 1516       } else switch (c) {
 1517       case '<':
 1518          /* People are using & without &amp;ing it, ditto <; be aware */
 1519          if (f & (_HF_NOPUT | _HF_ENT)) {
 1520             f &= ~_HF_ENT;
 1521             /* Special case "<!--" buffer content to deal with really weird
 1522              * things that can be done with "<!--[if gte mso 9]>" syntax */
 1523             if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
 1524                   memcmp(self->hf_bdat, "<!--", 4)) {
 1525                self->hf_flags = f;
 1526                *cp = '\0';
 1527                self = _hf_puts(self, self->hf_bdat);
 1528                f = self->hf_flags;
 1529             }
 1530          }
 1531          cp = self->hf_bdat;
 1532          *cp++ = c;
 1533          self->hf_flags = (f |= _HF_NOPUT);
 1534          break;
 1535       case '>':
 1536          /* Weird tagsoup around, do we actually parse a tag? */
 1537          if (!(f & _HF_NOPUT))
 1538             goto jdo_c;
 1539          cp[0] = c;
 1540          cp[1] = '\0';
 1541          f &= ~(_HF_NOPUT | _HF_ENT);
 1542          self->hf_flags = f;
 1543          self = _hf_check_tag(self, self->hf_bdat);
 1544          *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
 1545          /* Quick hack to get rid of redundant newline after <pre> XXX */
 1546          if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
 1547                len > 1 && *dat == '\n')
 1548             ++dat, --len;
 1549          break;
 1550 
 1551       case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
 1552          break;
 1553       case '\n':
 1554          /* End of line is not considered unless we are in PRE section.
 1555           * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
 1556           * newlines for separating parameters */
 1557          if (f & _HF_NOPUT)
 1558             goto jdo_c;
 1559          self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
 1560          break;
 1561 
 1562       case '\t':
 1563          if (!(f & _HF_PRE))
 1564             c = ' ';
 1565          /* FALLTHRU */
 1566       default:
 1567 jdo_c:
 1568          /* If not currently parsing a tag and bypassing normal output.. */
 1569          if (!(f & _HF_NOPUT)) {
 1570             if (cntrlchar(c))
 1571                break;
 1572             if (c == '&') {
 1573                cp = self->hf_bdat;
 1574                *cp++ = c;
 1575                self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
 1576             } else if (f & _HF_PRE) {
 1577                self = _hf_putc_premode(self, c);
 1578                self->hf_flags &= ~_HF_BLANK;
 1579             } else
 1580               self = _hf_putc(self, c);
 1581          } else if ((f & _HF_ENT) && c == ';') {
 1582             cp[0] = c;
 1583             cp[1] = '\0';
 1584             f &= ~(_HF_NOPUT | _HF_ENT);
 1585             self->hf_flags = f;
 1586            self = _hf_check_ent(self, self->hf_bdat,
 1587                PTR2SIZE(cp + 1 - self->hf_bdat));
 1588          } else {
 1589             /* We may need to grow the buffer */
 1590             if (PTRCMP(cp + 42/2, >=, cp_max)) {
 1591                size_t i = PTR2SIZE(cp - self->hf_bdat),
 1592                   m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;
 1593 
 1594                cp = self->hf_bdat = srealloc(self->hf_bdat, m);
 1595                self->hf_bmax = cp + m -1;
 1596                self->hf_curr = (cp += i);
 1597             }
 1598             *cp++ = c;
 1599          }
 1600       }
 1601    }
 1602    self->hf_curr = cp;
 1603 jleave:
 1604   NYD_LEAVE;
 1605   return (self->hf_flags & _HF_ERROR) ? -1 : rv;
 1606 }
 1607 
 1608 /*
 1609  * TODO Because we don't support filter chains yet this filter will be run
 1610  * TODO in a dedicated subprocess, driven via a special Popen() mode
 1611  */
 1612 static bool_t __hf_hadpipesig;
 1613 static void
 1614 __hf_onpipe(int signo)
 1615 {
 1616    NYD_X; /* Signal handler */
 1617    n_UNUSED(signo);
 1618    __hf_hadpipesig = TRU1;
 1619 }
 1620 
 1621 FL int
 1622 htmlflt_process_main(void)
 1623 {
 1624    char buf[BUFFER_SIZE];
 1625    struct htmlflt hf;
 1626    size_t i;
 1627    int rv;
 1628    NYD_ENTER;
 1629 
 1630    __hf_hadpipesig = FAL0;
 1631    safe_signal(SIGPIPE, &__hf_onpipe);
 1632 
 1633    htmlflt_init(&hf);
 1634    htmlflt_reset(&hf, n_stdout);
 1635 
 1636    for (;;) {
 1637       if ((i = fread(buf, sizeof(buf[0]), n_NELEM(buf), n_stdin)) == 0) {
 1638          rv = !feof(n_stdin);
 1639          break;
 1640       }
 1641 
 1642       if ((rv = __hf_hadpipesig))
 1643          break;
 1644       /* Just use this directly.. */
 1645       if (htmlflt_push(&hf, buf, i) < 0) {
 1646          rv = 1;
 1647          break;
 1648       }
 1649    }
 1650    if (rv == 0 && htmlflt_flush(&hf) < 0)
 1651       rv = 1;
 1652 
 1653    htmlflt_destroy(&hf);
 1654 
 1655    rv |= __hf_hadpipesig;
 1656    NYD_LEAVE;
 1657    return rv;
 1658 }
 1659 
 1660 FL void
 1661 htmlflt_init(struct htmlflt *self)
 1662 {
 1663    NYD_ENTER;
 1664    /* (Rather redundant though) */
 1665    memset(self, 0, sizeof *self);
 1666    NYD_LEAVE;
 1667 }
 1668 
 1669 FL void
 1670 htmlflt_destroy(struct htmlflt *self)
 1671 {
 1672    NYD_ENTER;
 1673    htmlflt_reset(self, NULL);
 1674    NYD_LEAVE;
 1675 }
 1676 
 1677 FL void
 1678 htmlflt_reset(struct htmlflt *self, FILE *f)
 1679 {
 1680    struct htmlflt_href *hfhp;
 1681    NYD_ENTER;
 1682 
 1683    while ((hfhp = self->hf_hrefs) != NULL) {
 1684       self->hf_hrefs = hfhp->hfh_next;
 1685       free(hfhp);
 1686    }
 1687 
 1688    if (self->hf_bdat != NULL)
 1689       free(self->hf_bdat);
 1690    if (self->hf_line != NULL)
 1691       free(self->hf_line);
 1692 
 1693    memset(self, 0, sizeof *self);
 1694 
 1695    if (f != NULL) {
 1696       ui32_t sw = n_MAX(_HF_MINLEN, (ui32_t)n_scrnwidth);
 1697 
 1698       self->hf_line = smalloc((size_t)sw * n_mb_cur_max +1);
 1699       self->hf_lmax = sw;
 1700 
 1701       if (n_psonce & n_PSO_UNICODE) /* TODO not truly generic */
 1702          self->hf_flags = _HF_UTF8;
 1703       self->hf_os = f;
 1704    }
 1705    NYD_LEAVE;
 1706 }
 1707 
 1708 FL ssize_t
 1709 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
 1710 {
 1711    ssize_t rv;
 1712    NYD_ENTER;
 1713 
 1714    rv = _hf_add_data(self, dat, len);
 1715    NYD_LEAVE;
 1716    return rv;
 1717 }
 1718 
 1719 FL ssize_t
 1720 htmlflt_flush(struct htmlflt *self)
 1721 {
 1722    ssize_t rv;
 1723    NYD_ENTER;
 1724 
 1725    rv = _hf_add_data(self, NULL, 0);
 1726    rv |= !fflush(self->hf_os) ? 0 : -1;
 1727    NYD_LEAVE;
 1728    return rv;
 1729 }
 1730 #endif /* HAVE_FILTER_HTML_TAGSOUP */
 1731 
 1732 /* s-it-mode */