"Fossies" - the Fresh Open Source Software Archive

Member "s-nail-14.9.11/filter.c" (8 Aug 2018, 48965 Bytes) of package /linux/misc/s-nail-14.9.11.tar.xz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "filter.c", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 14.9.10_vs_14.9.11.

    1 /*@ S-nail - a mail user agent derived from Berkeley Mail.
    2  *@ Filter objects.
    3  *
    4  * Copyright (c) 2013 - 2018 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
    5  * SPDX-License-Identifier: ISC
    6  *
    7  * Permission to use, copy, modify, and/or distribute this software for any
    8  * purpose with or without fee is hereby granted, provided that the above
    9  * copyright notice and this permission notice appear in all copies.
   10  *
   11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
   12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
   13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
   14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
   15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
   18  */
   19 #undef n_FILE
   20 #define n_FILE filter
   21 
   22 #ifndef HAVE_AMALGAMATION
   23 # include "nail.h"
   24 #endif
   25 
   26 /*
   27  * Quotation filter
   28  */
   29 
   30 /*
   31  * TODO quotation filter: anticipate in future data: don't break if only WS
   32  * TODO or a LF escaping \ follows on the line (simply reuse the latter).
   33  */
   34 
   35 #ifdef HAVE_QUOTE_FOLD
   36 n_CTAV(n_QUOTE_MAX > 3);
   37 
      /* Parser states of the quote-fold filter (stored in .qf_state) */
   38 enum qf_state {
      /* Set by quoteflt_reset(); quoteflt_push() treats it like _QF_PREFIX */
   39    _QF_CLEAN,
      /* Collecting the quote prefix of a line; _qf_add_data() switches to this
       * state after it has seen a newline */
   40    _QF_PREFIX,
      /* Line content; _qf_state_prefix() switches to this state once the quote
       * prefix is complete */
   41    _QF_DATA
   42 };
   43 
      /* Input cursor handed to the state machine handlers: the handlers consume
       * input from .buf and write back the advanced .buf / remaining .len */
   44 struct qf_vc {
   45    struct quoteflt   *self;
   46    char const        *buf;
   47    size_t            len;
   48 };
   49 
   50 /* Print out prefix and current quote */
   51 static ssize_t _qf_dump_prefix(struct quoteflt *self);
   52 
   53 /* Add one data character */
   54 static ssize_t _qf_add_data(struct quoteflt *self, wchar_t wc);
   55 
   56 /* State machine handlers */
   57 static ssize_t _qf_state_prefix(struct qf_vc *vc);
   58 static ssize_t _qf_state_data(struct qf_vc *vc);
   59 
   60 static ssize_t
   61 _qf_dump_prefix(struct quoteflt *self)
   62 {
   63    ssize_t rv;
   64    size_t i;
   65    NYD_ENTER;
   66 
   67    if ((i = self->qf_pfix_len) > 0 && i != fwrite(self->qf_pfix, 1, i,
   68          self->qf_os))
   69       goto jerr;
   70    rv = i;
   71 
   72    if ((i = self->qf_currq.l) > 0 && i != fwrite(self->qf_currq.s, 1, i,
   73          self->qf_os))
   74       goto jerr;
   75    rv += i;
   76 jleave:
   77    NYD_LEAVE;
   78    return rv;
   79 jerr:
   80    rv = -1;
   81    goto jleave;
   82 }
   83 
      /* Add the wide character wc to the pending output line of self.
       * A newline flushes the line and switches back to _QF_PREFIX state, a CR
       * is dropped, a tab is expanded to spaces (via recursion).  Once the line
       * reaches .qf_qfold_max columns it is flushed with a trailing reverse
       * solidus plus newline, at the last remembered breakpoint if there is one.
       * Returns the summed up results of quoteflt_flush() and of the recursive
       * calls, or a negative value on error.
       * NOTE(review): characters of width 0 grow .qf_dat.l but not .qf_datw, so
       * the fold check does not trigger for them -- presumably long runs of such
       * characters could overrun .qf_dat.s; confirm against the allocation in
       * quoteflt_init() */
   84 static ssize_t
   85 _qf_add_data(struct quoteflt *self, wchar_t wc)
   86 {
   87    int w, l;
   88    char *save_b;
   89    ui32_t save_l, save_w;
   90    ssize_t rv;
   91    NYD_ENTER;
   92 
   93    rv = 0;
   94    save_l = save_w = 0; /* silence cc */
   95    save_b = NULL;
   96 
   97    /* <newline> ends state */
   98    if (wc == L'\n') {
   99       w = 0;
  100       goto jflush;
  101    }
  102    if (wc == L'\r') /* TODO CR should be stripped in lower level!! */
  103       goto jleave;
  104 
  105    /* Unroll <tab> to spaces */
  106    if (wc == L'\t') {
  107       save_l = self->qf_datw;
            /* Columns up to the next tab stop; the mask arithmetic presumes that
             * n_QUOTE_TAB_SPACES is a power of two -- TODO confirm */
  108       save_w = (save_l + n_QUOTE_TAB_SPACES) & ~(n_QUOTE_TAB_SPACES - 1);
  109       save_w -= save_l;
  110       while (save_w-- > 0) {
  111          ssize_t j = _qf_add_data(self, L' ');
  112          if (j < 0) {
  113             rv = j;
  114             break;
  115          }
  116          rv += j;
  117       }
  118       goto jleave;
  119    }
  120 
  121    /* To avoid that the last visual excesses *qfold-max*, which may happen for
  122     * multi-column characters, use w as an indicator for this and move that
  123     * thing to the next line */
  124    w = wcwidth(wc);
  125    if (w == -1) {
  126       w = 0;
            /* Also reached if wctomb() fails: store a one column replacement */
  127 jbad:
  128       ++self->qf_datw;
  129       self->qf_dat.s[self->qf_dat.l++] = '?';
  130    } else if (self->qf_datw > self->qf_qfold_max - w) {
  131       w = -1;
  132       goto jneednl;
  133    } else {
  134       l = wctomb(self->qf_dat.s + self->qf_dat.l, wc);
  135       if (l < 0)
  136          goto jbad;
  137       self->qf_datw += (ui32_t)w;
  138       self->qf_dat.l += (size_t)l;
  139    }
  140 
  141    if (self->qf_datw >= self->qf_qfold_max) {
  142       /* If we have seen a nice breakpoint during traversal, shuffle data
  143        * around a bit so as to restore the trailing part after flushing */
  144 jneednl:
  145       if (self->qf_brkl > 0) {
  146          save_w = self->qf_datw - self->qf_brkw;
  147          save_l = self->qf_dat.l - self->qf_brkl;
               /* Open a two byte gap behind the breakpoint: the reverse solidus
                * and the newline stored below end up at .qf_brkl and .qf_brkl+1 */
  148          save_b = self->qf_dat.s + self->qf_brkl + 2;
  149          memmove(save_b, save_b - 2, save_l);
  150          self->qf_dat.l = self->qf_brkl;
  151       }
  152 
  153       self->qf_dat.s[self->qf_dat.l++] = '\\';
  154 jflush:
  155       self->qf_dat.s[self->qf_dat.l++] = '\n';
  156       rv = quoteflt_flush(self);
  157 
  158       /* Restore takeovers, if any */
  159       if (save_b != NULL) {
  160          self->qf_brk_isws = FAL0;
               /* quoteflt_flush() has reset .qf_datw to the width of prefix plus
                * quote, so add the width of the part which is taken over */
  161          self->qf_datw += save_w;
  162          self->qf_dat.l = save_l;
  163          memmove(self->qf_dat.s, save_b, save_l);
  164       }
  165    } else if (self->qf_datw >= self->qf_qfold_min && !self->qf_brk_isws) {
  166       bool_t isws = (iswspace(wc) != 0);
  167 
  168       if (isws || !self->qf_brk_isws || self->qf_brkl == 0) {
  169          if((self->qf_brk_isws = isws) ||
  170                self->qf_brkl < self->qf_qfold_maxnws){
  171             self->qf_brkl = self->qf_dat.l;
  172             self->qf_brkw = self->qf_datw;
  173          }
  174       }
  175    }
  176 
  177    /* Did we hold this back to avoid qf_fold_max excess?  Then do it now */
  178    if(rv >= 0 && w == -1){
  179       ssize_t j = _qf_add_data(self, wc);
  180       if(j < 0)
  181          rv = j;
  182       else
  183          rv += j;
  184    }
  185    /* If state changed to prefix, perform full reset (note this implies that
  186     * quoteflt_flush() performs too much work..) */
  187    else if (wc == '\n') {
  188       self->qf_state = _QF_PREFIX;
  189       self->qf_wscnt = self->qf_datw = 0;
  190       self->qf_currq.l = 0;
  191    }
  192 jleave:
  193    NYD_LEAVE;
  194    return rv;
  195 }
  196 
  197 static ssize_t
  198 _qf_state_prefix(struct qf_vc *vc)
  199 {
  200    struct quoteflt *self;
  201    ssize_t rv;
  202    char const *buf;
  203    size_t len, i;
  204    wchar_t wc;
  205    NYD_ENTER;
  206 
  207    self = vc->self;
  208    rv = 0;
  209 
  210    for (buf = vc->buf, len = vc->len; len > 0;) {
  211       /* xxx NULL BYTE! */
  212       i = mbrtowc(&wc, buf, len, self->qf_mbps);
  213       if (i == (size_t)-1) {
  214          /* On hard error, don't modify mbstate_t and step one byte */
  215          self->qf_mbps[0] = self->qf_mbps[1];
  216          ++buf;
  217          --len;
  218          self->qf_wscnt = 0;
  219          continue;
  220       }
  221       self->qf_mbps[1] = self->qf_mbps[0];
  222       if (i == (size_t)-2) {
  223          /* Redundant shift sequence, out of buffer */
  224          len = 0;
  225          break;
  226       }
  227       buf += i;
  228       len -= i;
  229 
  230       if (wc == L'\n')
  231          goto jfin;
  232       if (iswspace(wc)) {
  233          ++self->qf_wscnt;
  234          continue;
  235       }
  236       if (i == 1 && n_uasciichar(wc) &&
  237             strchr(self->qf_quote_chars, (char)wc) != NULL){
  238          self->qf_wscnt = 0;
  239          if (self->qf_currq.l >= n_QUOTE_MAX - 3) {
  240             self->qf_currq.s[n_QUOTE_MAX - 3] = '.';
  241             self->qf_currq.s[n_QUOTE_MAX - 2] = '.';
  242             self->qf_currq.s[n_QUOTE_MAX - 1] = '.';
  243             self->qf_currq.l = n_QUOTE_MAX;
  244          } else
  245             self->qf_currq.s[self->qf_currq.l++] = buf[-1];
  246          continue;
  247       }
  248 
  249       /* The quote is parsed and compressed; dump it */
  250 jfin:
  251       self->qf_state = _QF_DATA;
  252       /* Overtake WS to the current quote in order to preserve it for eventual
  253        * necessary follow lines, too */
  254       /* TODO we de-facto "normalize" to ASCII SP here which MESSES tabs!! */
  255       while (self->qf_wscnt-- > 0 && self->qf_currq.l < n_QUOTE_MAX)
  256          self->qf_currq.s[self->qf_currq.l++] = ' ';
  257       self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
  258       self->qf_wscnt = 0;
  259       rv = _qf_add_data(self, wc);
  260       break;
  261    }
  262 
  263    vc->buf = buf;
  264    vc->len = len;
  265    NYD_LEAVE;
  266    return rv;
  267 }
  268 
  269 static ssize_t
  270 _qf_state_data(struct qf_vc *vc)
  271 {
  272    struct quoteflt *self;
  273    ssize_t rv;
  274    char const *buf;
  275    size_t len, i;
  276    wchar_t wc;
  277    NYD_ENTER;
  278 
  279    self = vc->self;
  280    rv = 0;
  281 
  282    for (buf = vc->buf, len = vc->len; len > 0;) {
  283       /* xxx NULL BYTE! */
  284       i = mbrtowc(&wc, buf, len, self->qf_mbps);
  285       if (i == (size_t)-1) {
  286          /* On hard error, don't modify mbstate_t and step one byte */
  287          self->qf_mbps[0] = self->qf_mbps[1];
  288          ++buf;
  289          --len;
  290          continue;
  291       }
  292       self->qf_mbps[1] = self->qf_mbps[0];
  293       if (i == (size_t)-2) {
  294          /* Redundant shift sequence, out of buffer */
  295          len = 0;
  296          break;
  297       }
  298       buf += i;
  299       len -= i;
  300 
  301       {  ssize_t j = _qf_add_data(self, wc);
  302          if (j < 0) {
  303             rv = j;
  304             break;
  305          }
  306          rv += j;
  307       }
  308 
  309       if (self->qf_state != _QF_DATA)
  310          break;
  311    }
  312 
  313    vc->buf = buf;
  314    vc->len = len;
  315    NYD_LEAVE;
  316    return rv;
  317 }
  318 #endif /* HAVE_QUOTE_FOLD */
  319 
  320 FL struct quoteflt *
  321 quoteflt_dummy(void) /* TODO LEGACY (until filters are plugged when needed) */
  322 {
  323    static struct quoteflt qf_i;
  324 
  325    qf_i.qf_bypass = TRU1;
  326    return &qf_i;
  327 }
  328 
  329 FL void
  330 quoteflt_init(struct quoteflt *self, char const *prefix, bool_t bypass)
  331 {
  332 #ifdef HAVE_QUOTE_FOLD
  333    char const *xcp, *cp;
  334 #endif
  335    NYD_ENTER;
  336 
  337    memset(self, 0, sizeof *self);
  338 
  339    if ((self->qf_pfix = prefix) != NULL)
  340       self->qf_pfix_len = (ui32_t)strlen(prefix);
  341    self->qf_bypass = bypass;
  342 
  343    /* Check whether the user wants the more fancy quoting algorithm */
  344    /* TODO *quote-fold*: n_QUOTE_MAX may excess it! */
  345 #ifdef HAVE_QUOTE_FOLD
  346    if (!bypass && (cp = ok_vlook(quote_fold)) != NULL) {
  347       ui32_t qmax, qmaxnws, qmin;
  348 
  349       /* These magic values ensure we don't bail */
  350       n_idec_ui32_cp(&qmax, cp, 10, &xcp);
  351       if (qmax < self->qf_pfix_len + 6)
  352          qmax = self->qf_pfix_len + 6;
  353       qmaxnws = --qmax; /* The newline escape */
  354       if (cp == xcp || *xcp == '\0')
  355          qmin = (qmax >> 1) + (qmax >> 2) + (qmax >> 5);
  356       else {
  357          n_idec_ui32_cp(&qmin, &xcp[1], 10, &xcp);
  358          if (qmin < qmax >> 1)
  359             qmin = qmax >> 1;
  360          else if (qmin > qmax - 2)
  361             qmin = qmax - 2;
  362 
  363          if (cp != xcp && *xcp != '\0') {
  364             n_idec_ui32_cp(&qmaxnws, &xcp[1], 10, &xcp);
  365             if (qmaxnws > qmax || qmaxnws < qmin)
  366                qmaxnws = qmax;
  367          }
  368       }
  369       self->qf_qfold_min = qmin;
  370       self->qf_qfold_max = qmax;
  371       self->qf_qfold_maxnws = qmaxnws;
  372       self->qf_quote_chars = ok_vlook(quote_chars);
  373 
  374       /* Add pad for takeover copies, reverse solidus and newline */
  375       self->qf_dat.s = n_autorec_alloc((qmax + 3) * n_mb_cur_max);
  376       self->qf_currq.s = n_autorec_alloc((n_QUOTE_MAX + 1) * n_mb_cur_max);
  377    }
  378 #endif
  379    NYD_LEAVE;
  380 }
  381 
      /* Counterpart of quoteflt_init(): the body releases nothing, self is only
       * marked as unused */
  382 FL void
  383 quoteflt_destroy(struct quoteflt *self) /* xxx inline */
  384 {
  385    NYD_ENTER;
  386    n_UNUSED(self);
  387    NYD_LEAVE;
  388 }
  389 
  390 FL void
  391 quoteflt_reset(struct quoteflt *self, FILE *f) /* xxx inline */
  392 {
  393    NYD_ENTER;
  394    self->qf_os = f;
  395 #ifdef HAVE_QUOTE_FOLD
  396    self->qf_state = _QF_CLEAN;
  397    self->qf_dat.l =
  398    self->qf_currq.l = 0;
  399    memset(self->qf_mbps, 0, sizeof self->qf_mbps);
  400 #endif
  401    NYD_LEAVE;
  402 }
  403 
  404 FL ssize_t
  405 quoteflt_push(struct quoteflt *self, char const *dat, size_t len)
  406 {
  407    /* (xxx Ideally the actual push() [and flush()] would be functions on their
  408     * xxx own, via indirect vtbl call ..) */
  409    ssize_t rv = 0;
  410    NYD_ENTER;
  411 
  412    self->qf_nl_last = (len > 0 && dat[len - 1] == '\n'); /* TODO HACK */
  413 
  414    if (len == 0)
  415       goto jleave;
  416 
  417    /* Bypass? TODO Finally, this filter simply should not be used, then
  418     * (TODO It supercedes prefix_write() or something) */
  419    if (self->qf_bypass) {
  420       if (len != fwrite(dat, 1, len, self->qf_os))
  421          goto jerr;
  422       rv = len;
  423    }
  424    /* Normal: place *indentprefix* at every BOL */
  425    else
  426 #ifdef HAVE_QUOTE_FOLD
  427       if (self->qf_qfold_max == 0)
  428 #endif
  429    {
  430       void *vp;
  431       size_t ll;
  432       bool_t pxok = (self->qf_qfold_min != 0);
  433 
  434       for (;;) {
  435          if (!pxok && (ll = self->qf_pfix_len) > 0) {
  436             if (ll != fwrite(self->qf_pfix, 1, ll, self->qf_os))
  437                goto jerr;
  438             rv += ll;
  439             pxok = TRU1;
  440          }
  441 
  442          /* xxx Strictly speaking this is invalid, because only `/' and `.' are
  443           * xxx mandated by POSIX.1-2008 as "invariant across all locales
  444           * xxx supported"; though there is no charset known which uses this
  445           * xxx control char as part of a multibyte character; note that S-nail
  446           * XXX (and the Mail codebase as such) do not support EBCDIC */
  447          if ((vp = memchr(dat, '\n', len)) == NULL)
  448             ll = len;
  449          else {
  450             pxok = FAL0;
  451             ll = PTR2SIZE((char*)vp - dat) + 1;
  452          }
  453 
  454          if (ll != fwrite(dat, sizeof *dat, ll, self->qf_os))
  455             goto jerr;
  456          rv += ll;
  457          if ((len -= ll) == 0)
  458             break;
  459          dat += ll;
  460       }
  461 
  462       self->qf_qfold_min = pxok;
  463    }
  464    /* Overly complicated, though still only line-per-line: *quote-fold*.
  465     * - If .qf_currq.l is 0, then we are in a clean state.  Reset .qf_mbps;
  466     *   TODO note this means we assume that lines start with reset escape seq,
  467     *   TODO but i don't think this is any worse than what we currently do;
  468     *   TODO in 15.0, with the value carrier, we should carry conversion states
  469     *   TODO all along, only resetting on error (or at words for header =???=);
  470     *   TODO this still is weird for error handling, but we need to act more
  471     *   TODO stream-alike (though in practice i don't think cross-line states
  472     *   TODO can be found, because of compatibility reasons; however, being
  473     *   TODO a problem rather than a solution is not a good thing (tm))
  474     * - Lookout for a newline */
  475 #ifdef HAVE_QUOTE_FOLD
  476    else {
  477       struct qf_vc vc;
  478       ssize_t i;
  479 
  480       vc.self = self;
  481       vc.buf = dat;
  482       vc.len = len;
  483       while (vc.len > 0) {
  484          switch (self->qf_state) {
  485          case _QF_CLEAN:
  486          case _QF_PREFIX:
  487             i = _qf_state_prefix(&vc);
  488             break;
  489          default: /* silence cc (`i' unused) */
  490          case _QF_DATA:
  491             i = _qf_state_data(&vc);
  492             break;
  493          }
  494          if (i < 0)
  495             goto jerr;
  496          rv += i;
  497       }
  498    }
  499 #endif /* HAVE_QUOTE_FOLD */
  500 
  501 jleave:
  502    NYD_LEAVE;
  503    return rv;
  504 jerr:
  505    rv = -1;
  506    goto jleave;
  507 }
  508 
  509 FL ssize_t
  510 quoteflt_flush(struct quoteflt *self)
  511 {
  512    ssize_t rv = 0;
  513    NYD_ENTER;
  514    n_UNUSED(self);
  515 
  516 #ifdef HAVE_QUOTE_FOLD
  517    if (self->qf_dat.l > 0) {
  518       rv = _qf_dump_prefix(self);
  519       if (rv >= 0) {
  520          size_t i = self->qf_dat.l;
  521          if (i == fwrite(self->qf_dat.s, 1, i, self->qf_os))
  522             rv += i;
  523          else
  524             rv = -1;
  525          self->qf_dat.l = 0;
  526          self->qf_brk_isws = FAL0;
  527          self->qf_wscnt = self->qf_brkl = self->qf_brkw = 0;
  528          self->qf_datw = self->qf_pfix_len + self->qf_currq.l;
  529       }
  530    }
  531 #endif
  532    NYD_LEAVE;
  533    return rv;
  534 }
  535 
  536 /*
  537  * HTML tagsoup filter TODO rewrite wchar_t based (require HAVE_C90AMEND1)
  538  * TODO . Numeric &#NO; entities should also be treated by struct hf_ent
  539  * TODO . Yes, we COULD support CSS based quoting when we'd check type="quote"
  540  * TODO   (nonstandard) and watch out for style="gmail_quote" (or so, VERY
  541  * TODO   nonstandard) and tracking a stack of such elements (to be popped
  542  * TODO   once the closing element is seen).  Then, after writing a newline,
  543  * TODO   place sizeof(stack) ">"s first.  But aren't these HTML mails rude?
  544  * TODO Interlocking and non-well-formed data will break us down
  545  */
  546 #ifdef HAVE_FILTER_HTML_TAGSOUP
  547 
      /* Line length limits of the HTML filter; _hf_store() compares both against
       * the configured maximum line length, .hf_lmax */
  548 enum hf_limits {
  549    _HF_MINLEN  = 10,       /* Minimum line length (can't really be smaller) */
  550    _HF_BRKSUB  = 8         /* Start considering line break MAX - BRKSUB */
  551 };
  552 
      /* Bits of .hf_flags.  The low 16 bits (_HF_BQUOTE_MASK) form a counter:
       * _hf_store() uses it as the number of quote markers to place at the
       * beginning of a line (presumably the <blockquote> nesting depth).
       * _HF_NL_1 and _HF_NL_2 occupy two adjacent bits (2u<<25 equals 1u<<26) */
  553 enum hf_flags {
  554    _HF_BQUOTE_MASK = 0xFFFFu,
  555    _HF_UTF8 = 1u<<16,   /* Data is in UTF-8 */
  556    _HF_ERROR = 1u<<17,  /* A hard error occurred, bail as soon as possible */
  557    _HF_NOPUT = 1u<<18,  /* (In a tag,) Don't generate output */
  558    _HF_IGN = 1u<<19,    /* Ignore mode on */
  559    _HF_ANY = 1u<<20,    /* Yet seen just any output */
  560    _HF_PRE = 1u<<21,    /* In <pre>formatted mode */
  561    _HF_ENT = 1u<<22,    /* Currently parsing an entity */
  562    _HF_BLANK = 1u<<23,  /* Whitespace last */
  563    _HF_HREF = 1u<<24,   /* External <a href=> was the last href seen */
  564 
  565    _HF_NL_1 = 1u<<25,   /* One \n seen */
  566    _HF_NL_2 = 2u<<25,   /* We have produced an all empty line */
  567    _HF_NL_MASK = _HF_NL_1 | _HF_NL_2
  568 };
  569 
      /* Actions for tags that need more than a replacement character.  They are
       * stored in .hft_act of struct htmlflt_tag, which otherwise holds a plain
       * character; all values here are negative */
  570 enum hf_special_actions {
  571    _HFSA_NEEDSEP  = -1,    /* Need an empty line (paragraph separator) */
  572    _HFSA_NEEDNL   = -2,    /* Need a new line start (table row) */
  573    _HFSA_IGN      = -3,    /* Things like <style>..</style>, <script>.. */
  574    _HFSA_PRE      = -4,    /* <pre>.. */
  575    _HFSA_PRE_END  = -5,
  576    _HFSA_IMG      = -6,    /* <img> */
  577    _HFSA_HREF     = -7,    /* <a>.. */
  578    _HFSA_HREF_END = -8,
  579    _HFSA_BQUOTE   = -9,    /* <blockquote>, interpreted as citation! */
  580    _HFSA_BQUOTE_END = -10
  581 };
  582 
      /* Layout of .hfe_flags of struct hf_ent: the two high bits of the byte are
       * flags, the low six bits hold the length of the entity name */
  583 enum hf_entity_flags {
  584    _HFE_HAVE_UNI  = 1<<6,  /* Have a Unicode replacement character */
  585    _HFE_HAVE_CSTR = 1<<7,  /* Have a string replacement */
  586    /* We store the length of the entity name in the flags, too */
  587    _HFE_LENGTH_MASK = (1<<6) - 1
  588 };
  589 
      /* Node of the singly linked list of collected link targets (.hf_hrefs);
       * _hf_dump_hrefs() prints each node as "[no] dat" and frees it */
  590 struct htmlflt_href {
  591    struct htmlflt_href *hfh_next;
  592    ui32_t      hfh_no;     /* Running sequence */
  593    ui32_t      hfh_len;    /* of .hfh_dat */
  594    char        hfh_dat[n_VFIELD_SIZE(0)];
  595 };
  596 
      /* Entry of the table of known tags, _hf_tags[].  The compile-time
       * assertion which follows ensures that .hft_tag is smaller than LINESIZE */
  597 struct htmlflt_tag {
  598    si32_t      hft_act;    /* char or hf_special_actions */
  599    /* Not NUL: character to inject, with high bit set: place a space
  600     * afterwards.  Note: only recognized with _HFSA_NEEDSEP or _HFSA_NEEDNL */
  601    char        hft_injc;
  602    ui8_t       hft_len;    /* Useful bytes in (NUL terminated) .hft_tag */
  603    char const  hft_tag[10]; /* Tag less < and > surroundings (TR, /TR, ..) */
  604 };
  605 n_CTA(n_SIZEOF_FIELD(struct htmlflt_tag, hft_tag) < LINESIZE,
  606    "Structure field too large a size"); /* .hf_ign_tag */
  607 
      /* Entry of the table of known entities, _hf_ents[]: an entity name plus
       * its replacement(s) -- a plain character, and optionally a Unicode
       * codepoint and/or a replacement string */
  608 struct hf_ent {
  609    ui8_t       hfe_flags;  /* enum hf_entity_flags plus length of .hfe_ent */
  610    char        hfe_c;      /* Plain replacement character */
  611    ui16_t      hfe_uni;    /* Unicode codepoint if _HFE_HAVE_UNI */
  612    char        hfe_cstr[5]; /* _HFE_HAVE_CSTR (e.g., &hellip; -> ...) */
  613    char const  hfe_ent[7]; /* Entity less & and ; surroundings */
  614 };
  615 
  616 /* Tag list; not binary searched :(, so try to take care a bit */
  617 static struct htmlflt_tag const  _hf_tags[] = {
  618 # undef _X
  619 # undef _XC
  620 # define _X(S,A)     {A, '\0', sizeof(S) -1, S "\0"}
  621 # define _XC(S,C,A)  {A, C, sizeof(S) -1, S "\0"}
  622 
  623 # if 0 /* This is treated very special (to avoid wasting space in .hft_tag) */
  624    _X("BLOCKQUOTE", _HFSA_BQUOTE), _X("/BLOCKQUOTE", _HFSA_BQUOTE_END),
  625 # endif
  626 
  627    _X("P", _HFSA_NEEDSEP),       _X("/P", _HFSA_NEEDNL),
  628    _X("DIV", _HFSA_NEEDSEP),     _X("/DIV", _HFSA_NEEDNL),
  629    _X("TR", _HFSA_NEEDNL),
  630                                  _X("/TH", '\t'),
  631                                  _X("/TD", '\t'),
  632    /* Let it stand out; also since we don't support implicit paragraphs after
  633     * block elements, plain running text after a list (seen in Unicode
  634     * announcement via Firefox) */
  635    _X("UL", _HFSA_NEEDSEP),      _X("/UL", _HFSA_NEEDSEP),
  636    _XC("LI", (char)0x80 | '*', _HFSA_NEEDSEP),
  637    _X("DL", _HFSA_NEEDSEP),
  638    _X("DT", _HFSA_NEEDNL),
  639 
  640    _X("A", _HFSA_HREF),          _X("/A", _HFSA_HREF_END),
  641    _X("IMG", _HFSA_IMG),
  642    _X("BR", '\n'),
  643    _X("PRE", _HFSA_PRE),         _X("/PRE", _HFSA_PRE_END),
  644    _X("TITLE", _HFSA_NEEDSEP),   /*_X("/TITLE", '\n'),*/
  645    _X("H1", _HFSA_NEEDSEP),      /*_X("/H1", '\n'),*/
  646    _X("H2", _HFSA_NEEDSEP),      /*_X("/H2", '\n'),*/
  647    _X("H3", _HFSA_NEEDSEP),      /*_X("/H3", '\n'),*/
  648    _X("H4", _HFSA_NEEDSEP),      /*_X("/H4", '\n'),*/
  649    _X("H5", _HFSA_NEEDSEP),      /*_X("/H5", '\n'),*/
  650    _X("H6", _HFSA_NEEDSEP),      /*_X("/H6", '\n'),*/
  651 
  652    _X("STYLE", _HFSA_IGN),
  653    _X("SCRIPT", _HFSA_IGN),
  654 
  655 # undef _X
  656 };
  657 
  658 /* Entity list; not binary searched.. */
  659 static struct hf_ent const       _hf_ents[] = {
  660 # undef _X
  661 # undef _XU
  662 # undef _XS
  663 # undef _XUS
  664 # define _X(E,C)     {(sizeof(E) -1), C, 0x0u, "", E "\0"}
  665 # define _XU(E,C,U)  {(sizeof(E) -1) | _HFE_HAVE_UNI, C, U, "", E "\0"}
  666 # define _XS(E,S)    {(sizeof(E) -1) | _HFE_HAVE_CSTR, '\0', 0x0u,S "\0",E "\0"}
  667 # define _XSU(E,S,U) \
  668    {(sizeof(E) -1) | _HFE_HAVE_UNI | _HFE_HAVE_CSTR, '\0', U, S "\0", E "\0"}
  669 
  670    _X("quot", '"'),
  671    _X("amp", '&'),
  672    _X("lt", '<'),                _X("gt", '>'),
  673 
  674    _XU("nbsp", ' ', 0x0020 /* Note: not 0x00A0 seems to be better for us */),
  675    _XU("middot", '.', 0x00B7),
  676    _XSU("hellip", "...", 0x2026),
  677    _XSU("mdash", "---", 0x2014), _XSU("ndash", "--", 0x2013),
  678    _XSU("laquo", "<<", 0x00AB),  _XSU("raquo", ">>", 0x00BB),
  679    _XSU("lsaquo", "<", 0x2039),  _XSU("rsaquo", ">", 0x203A),
  680    _XSU("lsquo", "'", 0x2018),   _XSU("rsquo", "'", 0x2019),
  681    _XSU("ldquo", "\"", 0x201C),  _XSU("rdquo", "\"", 0x201D),
  682    _XSU("uarr", "^|", 0x2191),   _XSU("darr", "|v", 0x2193),
  683 
  684    _XSU("cent", "CENT", 0x00A2),
  685    _XSU("copy", "(C)", 0x00A9),
  686    _XSU("euro", "EUR", 0x20AC),
  687    _XSU("infin", "INFY", 0x221E),
  688    _XSU("pound", "GBP", 0x00A3),
  689    _XSU("reg", "(R)", 0x00AE),
  690    _XSU("sect", "S:", 0x00A7),
  691    _XSU("yen", "JPY", 0x00A5),
  692 
  693    /* German umlauts */
  694    _XSU("Auml", "Ae", 0x00C4),   _XSU("auml", "ae", 0x00E4),
  695    _XSU("Ouml", "Oe", 0x00D6),   _XSU("ouml", "oe", 0x00F6),
  696    _XSU("Uuml", "Ue", 0x00DC),   _XSU("uuml", "ue", 0x00FC),
  697    _XSU("szlig", "ss", 0x00DF)
  698 
  699 # undef _X
  700 # undef _XU
  701 # undef _XS
  702 # undef _XSU
  703 };
  704 
  705 /* Real output */
  706 static struct htmlflt * _hf_dump_hrefs(struct htmlflt *self);
  707 static struct htmlflt * _hf_dump(struct htmlflt *self);
  708 static struct htmlflt * _hf_store(struct htmlflt *self, char c);
  709 # ifdef HAVE_NATCH_CHAR
  710 static struct htmlflt * __hf_sync_mbstuff(struct htmlflt *self);
  711 # endif
  712 
  713 /* Virtual output */
  714 static struct htmlflt * _hf_nl(struct htmlflt *self);
  715 static struct htmlflt * _hf_nl_force(struct htmlflt *self);
  716 static struct htmlflt * _hf_putc(struct htmlflt *self, char c);
  717 static struct htmlflt * _hf_putc_premode(struct htmlflt *self, char c);
  718 static struct htmlflt * _hf_puts(struct htmlflt *self, char const *cp);
  719 static struct htmlflt * _hf_putbuf(struct htmlflt *self,
  720                            char const *cp, size_t len);
  721 
  722 /* Try to locate a param'eter in >hf_bdat, store it (non-terminated!) or NULL */
  723 static struct htmlflt * _hf_param(struct htmlflt *self, struct str *store,
  724                            char const *param);
  725 
  726 /* Expand all entities in the given parameter */
  727 static struct htmlflt * _hf_expand_all_ents(struct htmlflt *self,
  728                            struct str const *param);
  729 
  730 /* Completely parsed over a tag / an entity, interpret that */
  731 static struct htmlflt * _hf_check_tag(struct htmlflt *self, char const *s);
  732 static struct htmlflt * _hf_check_ent(struct htmlflt *self, char const *s,
  733                            size_t l);
  734 
  735 /* Input handler */
  736 static ssize_t          _hf_add_data(struct htmlflt *self,
  737                            char const *dat, size_t len);
  738 
  739 static struct htmlflt *
  740 _hf_dump_hrefs(struct htmlflt *self)
  741 {
  742    struct htmlflt_href *hhp;
  743    NYD2_ENTER;
  744 
  745    if (!(self->hf_flags & _HF_NL_2) && putc('\n', self->hf_os) == EOF) {
  746       self->hf_flags |= _HF_ERROR;
  747       goto jleave;
  748    }
  749 
  750    /* Reverse the list */
  751    for (hhp = self->hf_hrefs, self->hf_hrefs = NULL; hhp != NULL;) {
  752       struct htmlflt_href *tmp = hhp->hfh_next;
  753       hhp->hfh_next = self->hf_hrefs;
  754       self->hf_hrefs = hhp;
  755       hhp = tmp;
  756    }
  757 
  758    /* Then dump it */
  759    while ((hhp = self->hf_hrefs) != NULL) {
  760       self->hf_hrefs = hhp->hfh_next;
  761 
  762       if (!(self->hf_flags & _HF_ERROR)) {
  763          int w = fprintf(self->hf_os, "  [%u] %.*s\n",
  764                hhp->hfh_no, (int)hhp->hfh_len, hhp->hfh_dat);
  765          if (w < 0)
  766             self->hf_flags |= _HF_ERROR;
  767       }
  768       n_free(hhp);
  769    }
  770 
  771    self->hf_flags |= (putc('\n', self->hf_os) == EOF)
  772          ?  _HF_ERROR : _HF_NL_1 | _HF_NL_2;
  773    self->hf_href_dist = (ui32_t)n_realscreenheight >> 1;
  774 jleave:
  775    NYD2_LEAVE;
  776    return self;
  777 }
  778 
  779 static struct htmlflt *
  780 _hf_dump(struct htmlflt *self)
  781 {
  782    ui32_t f, l;
  783    char c, *cp;
  784    NYD2_ENTER;
  785 
  786    f = self->hf_flags & ~_HF_BLANK;
  787    l = self->hf_len;
  788    cp = self->hf_line;
  789    self->hf_mbwidth = self->hf_mboff = self->hf_last_ws = self->hf_len = 0;
  790 
  791    for (c = '\0'; l > 0; --l) {
  792       c = *cp++;
  793 jput:
  794       if (putc(c, self->hf_os) == EOF) {
  795          self->hf_flags = (f |= _HF_ERROR);
  796          goto jleave;
  797       }
  798    }
  799 
  800    if (c != '\n') {
  801       f |= (f & _HF_NL_1) ? _HF_NL_2 : _HF_NL_1;
  802       l = 1;
  803       c = '\n';
  804       goto jput;
  805    }
  806    self->hf_flags = f;
  807 
  808    /* Check whether there are HREFs to dump; there is so much messy tagsoup out
  809     * there that it seems best not to simply dump HREFs in each _dump(), but
  810     * only with some gap, let's say half the real screen height */
  811    if (--self->hf_href_dist < 0 && (f & _HF_NL_2) && self->hf_hrefs != NULL)
  812       self = _hf_dump_hrefs(self);
  813 jleave:
  814    NYD2_LEAVE;
  815    return self;
  816 }
  817 
  818 static struct htmlflt *
  819 _hf_store(struct htmlflt *self, char c)
  820 {
  821    ui32_t l, i;
  822    NYD2_ENTER;
  823 
  824    assert(c != '\n');
  825 
  826    l = self->hf_len;
  827    if(n_UNLIKELY(l == 0) && (i = (self->hf_flags & _HF_BQUOTE_MASK)) != 0 &&
  828          self->hf_lmax > _HF_MINLEN){
  829       ui32_t len, j;
  830       char const *ip;
  831 
  832       ip = ok_vlook(indentprefix);
  833       len = strlen(ip);
  834       if(len == 0 || len >= _HF_MINLEN){
  835          ip = "   |"; /* XXX something from *quote-chars* */
  836          len = sizeof("   |") -1;
  837       }
  838 
  839       self->hf_len = len;
  840       for(j = len; j-- != 0;){
  841          char x;
  842 
  843          if((x = ip[j]) == '\t')
  844             x = ' ';
  845          self->hf_line[j] = x;
  846       }
  847 
  848       while(--i > 0 && self->hf_len < self->hf_lmax - _HF_BRKSUB)
  849          self = _hf_store(self, '|'); /* XXX something from *quote-chars* */
  850 
  851       l = self->hf_len;
  852    }
  853 
  854    self->hf_line[l] = (c == '\t' ? ' ' : c);
  855    self->hf_len = ++l;
  856    if (blankspacechar(c)) {
  857       if (c == '\t') {
  858          i = 8 - ((l - 1) & 7); /* xxx magic tab width of 8 */
  859          if (i > 0) {
  860             do
  861                self = _hf_store(self, ' ');
  862             while (--i > 0);
  863             goto jleave;
  864          }
  865       }
  866       self->hf_last_ws = l;
  867    } else if (/*c == '.' ||*/ c == ',' || c == ';' || c == '-')
  868       self->hf_last_ws = l;
  869 
  870    i = l;
  871 # ifdef HAVE_NATCH_CHAR /* XXX This code is really ridiculous! */
  872    if (n_mb_cur_max > 1) { /* XXX should mbrtowc() and THEN store, at least */
  873       wchar_t wc;
  874       int w, x;
  875 
  876       if((x = mbtowc(&wc, self->hf_line + self->hf_mboff, l - self->hf_mboff)
  877             ) > 0){
  878          if ((w = wcwidth(wc)) == -1 ||
  879                /* Actively filter out L-TO-R and R-TO-R marks TODO ctext */
  880                (wc == 0x200E || wc == 0x200F ||
  881                   (wc >= 0x202A && wc <= 0x202E)) ||
  882                /* And some zero-width messes */
  883                wc == 0x00AD || (wc >= 0x200B && wc <= 0x200D) ||
  884                /* Oh about the ISO C wide character interfaces, baby! */
  885                (wc == 0xFEFF)){
  886             self->hf_len -= x;
  887             goto jleave;
  888          } else if (iswspace(wc))
  889             self->hf_last_ws = l;
  890          self->hf_mboff += x;
  891          i = (self->hf_mbwidth += w);
  892       } else {
  893          if (x < 0) {
  894             (void)mbtowc(&wc, NULL, n_mb_cur_max);
  895             if (UICMP(32, l - self->hf_mboff, >=, n_mb_cur_max)) { /* XXX */
  896                ++self->hf_mboff;
  897                ++self->hf_mbwidth;
  898             }
  899          }
  900          i = self->hf_mbwidth;
  901       }
  902    }
  903 # endif
  904 
  905    /* Do we need to break the line? */
  906    if (i >= self->hf_lmax - _HF_BRKSUB) {
  907       ui32_t f, lim;
  908 
  909 
  910       /* Let's hope we saw a sane place to break this line! */
  911       if (self->hf_last_ws >= (lim = self->hf_lmax >> 1)) {
  912 jput:
  913          i = self->hf_len = self->hf_last_ws;
  914          self = _hf_dump(self);
  915          if ((self->hf_len = (l -= i)) > 0) {
  916             self->hf_flags &= ~_HF_NL_MASK;
  917             memmove(self->hf_line, self->hf_line + i, l);
  918 # ifdef HAVE_NATCH_CHAR
  919             __hf_sync_mbstuff(self);
  920 # endif
  921          }
  922          goto jleave;
  923       }
  924 
  925       /* Any 7-bit characters? */
  926       f = self->hf_flags;
  927       for (i = l; i-- >= lim;)
  928          if (asciichar((c = self->hf_line[i]))) {
  929             self->hf_last_ws = ++i;
  930             goto jput;
  931          } else if ((f & _HF_UTF8) && ((ui8_t)c & 0xC0) != 0x80) {
  932             self->hf_last_ws = i;
  933             goto jput;
  934          }
  935 
  936       /* Hard break necessary!  xxx really badly done */
  937       if (l >= self->hf_lmax - 1)
  938          self = _hf_dump(self);
  939    }
  940 jleave:
  941    NYD2_LEAVE;
  942    return self;
  943 }
  944 
  945 # ifdef HAVE_NATCH_CHAR
  946 static struct htmlflt *
  947 __hf_sync_mbstuff(struct htmlflt *self)
  948 {
  949    wchar_t wc;
  950    char const *b;
  951    ui32_t o, w, l;
  952    NYD2_ENTER;
  953 
  954    b = self->hf_line;
  955    o = w = 0;
  956    l = self->hf_len;
  957    goto jumpin;
  958 
  959    while (l > 0) {
  960       int x = mbtowc(&wc, b, l);
  961 
  962       if (x == 0)
  963          break;
  964 
  965       if (x > 0) {
  966          b += x;
  967          l -= x;
  968          o += x;
  969          if ((x = wcwidth(wc)) == -1)
  970             x = 1;
  971          w += x;
  972          continue;
  973       }
  974 
  975       /* Bad, skip over a single character.. XXX very bad indeed */
  976       ++b;
  977       ++o;
  978       ++w;
  979       --l;
  980 jumpin:
  981       (void)mbtowc(&wc, NULL, n_mb_cur_max);
  982    }
  983 
  984    self->hf_mboff = o;
  985    self->hf_mbwidth = w;
  986 
  987    NYD2_LEAVE;
  988    return self;
  989 }
  990 # endif /* HAVE_NATCH_CHAR */
  991 
  992 static struct htmlflt *
  993 _hf_nl(struct htmlflt *self)
  994 {
  995    ui32_t f;
  996    NYD2_ENTER;
  997 
  998    if (!((f = self->hf_flags) & _HF_ERROR)) {
  999       if (f & _HF_ANY) {
 1000          if ((f & _HF_NL_MASK) != _HF_NL_MASK)
 1001             self = _hf_dump(self);
 1002       } else
 1003          self->hf_flags = (f |= _HF_NL_MASK);
 1004    }
 1005    NYD2_LEAVE;
 1006    return self;
 1007 }
 1008 
 1009 static struct htmlflt *
 1010 _hf_nl_force(struct htmlflt *self)
 1011 {
 1012    NYD2_ENTER;
 1013    if (!(self->hf_flags & _HF_ERROR))
 1014       self = _hf_dump(self);
 1015    NYD2_LEAVE;
 1016    return self;
 1017 }
 1018 
 1019 static struct htmlflt *
 1020 _hf_putc(struct htmlflt *self, char c)
 1021 {
 1022    ui32_t f;
 1023    NYD2_ENTER;
 1024 
 1025    if ((f = self->hf_flags) & _HF_ERROR)
 1026       goto jleave;
 1027 
 1028    if (c == '\n') {
 1029       self = _hf_nl(self);
 1030       goto jleave;
 1031    } else if (c == ' ' || c == '\t') {
 1032       if ((f & _HF_BLANK) || self->hf_len == 0)
 1033          goto jleave;
 1034       f |= _HF_BLANK;
 1035    } else
 1036       f &= ~_HF_BLANK;
 1037    f &= ~_HF_NL_MASK;
 1038    self->hf_flags = (f |= _HF_ANY);
 1039    self = _hf_store(self, c);
 1040 jleave:
 1041    NYD2_LEAVE;
 1042    return self;
 1043 }
 1044 
 1045 static struct htmlflt *
 1046 _hf_putc_premode(struct htmlflt *self, char c)
 1047 {
 1048    ui32_t f;
 1049    NYD2_ENTER;
 1050 
 1051    if ((f = self->hf_flags) & _HF_ERROR) {
 1052       ;
 1053    } else if (c == '\n')
 1054       self = _hf_nl_force(self);
 1055    else {
 1056       f &= ~_HF_NL_MASK;
 1057       self->hf_flags = (f |= _HF_ANY);
 1058       self = _hf_store(self, c);
 1059    }
 1060    NYD2_LEAVE;
 1061    return self;
 1062 }
 1063 
 1064 static struct htmlflt *
 1065 _hf_puts(struct htmlflt *self, char const *cp)
 1066 {
 1067    char c;
 1068    NYD2_ENTER;
 1069 
 1070    while ((c = *cp++) != '\0')
 1071       self = _hf_putc(self, c);
 1072    NYD2_LEAVE;
 1073    return self;
 1074 }
 1075 
 1076 static struct htmlflt *
 1077 _hf_putbuf(struct htmlflt *self, char const *cp, size_t len)
 1078 {
 1079    NYD2_ENTER;
 1080 
 1081    while (len-- > 0)
 1082       self = _hf_putc(self, *cp++);
 1083    NYD2_LEAVE;
 1084    return self;
 1085 }
 1086 
 1087 static struct htmlflt *
 1088 _hf_param(struct htmlflt *self, struct str *store, char const *param)
 1089 {
 1090    char const *cp;
 1091    char c, x, quote;
 1092    size_t i;
 1093    bool_t hot;
 1094    NYD2_ENTER;
 1095 
 1096    store->s = NULL;
 1097    store->l = 0;
 1098    cp = self->hf_bdat;
 1099 
 1100    /* Skip over any non-WS first; be aware of soup, if it slipped through */
 1101    for(;;){
 1102       if((c = *cp++) == '\0' || c == '>')
 1103          goto jleave;
 1104       if(whitechar(c))
 1105          break;
 1106    }
 1107 
 1108    /* Search for the parameter, take care of other quoting along the way */
 1109    x = *param++;
 1110    x = upperconv(x);
 1111    i = strlen(param);
 1112 
 1113    for(hot = TRU1;;){
 1114       if((c = *cp++) == '\0' || c == '>')
 1115          goto jleave;
 1116       if(whitechar(c)){
 1117          hot = TRU1;
 1118          continue;
 1119       }
 1120 
 1121       /* Could it be a parameter? */
 1122       if(hot){
 1123          hot = FAL0;
 1124 
 1125          /* Is it the desired one? */
 1126          if((c = upperconv(c)) == x && !ascncasecmp(param, cp, i)){
 1127             char const *cp2 = cp + i;
 1128 
 1129             if((quote = *cp2++) != '='){
 1130                if(quote == '\0' || quote == '>')
 1131                   goto jleave;
 1132                while(whitechar(quote))
 1133                   quote = *cp2++;
 1134             }
 1135             if(quote == '='){
 1136                cp = cp2;
 1137                break;
 1138             }
 1139             continue; /* XXX Optimize: i bytes or even cp2 can't be it! */
 1140          }
 1141       }
 1142 
 1143       /* Not the desired one; but a parameter? */
 1144       if(c != '=')
 1145          continue;
 1146       /* If so, properly skip over the value */
 1147       if((c = *cp++) == '"' || c == '\''){
 1148          /* TODO i have forgotten whether reverse solidus quoting is allowed
 1149           * TODO quoted HTML parameter values?  not supporting that for now.. */
 1150          for(quote = c; (c = *cp++) != '\0' && c != quote;)
 1151             ;
 1152       }else
 1153          while(c != '\0' && !whitechar(c) && c != '>')
 1154             c = *++cp;
 1155       if(c == '\0')
 1156          goto jleave;
 1157    }
 1158 
 1159    /* Skip further whitespace */
 1160    for(;;){
 1161       if((c = *cp++) == '\0' || c == '>')
 1162          goto jleave;
 1163       if(!whitechar(c))
 1164          break;
 1165    }
 1166 
 1167    if(c == '"' || c == '\''){
 1168       /* TODO i have forgotten whether reverse solisud quoting is allowed in
 1169        * TODO quoted HTML parameter values?  not supporting that for now.. */
 1170       store->s = n_UNCONST(cp);
 1171       for(quote = c; (c = *cp) != '\0' && c != quote; ++cp)
 1172          ;
 1173       /* XXX ... and we simply ignore a missing trailing " :> */
 1174    }else{
 1175       store->s = n_UNCONST(cp - 1);
 1176       if(!whitechar(c))
 1177          while((c = *cp) != '\0' && !whitechar(c) && c != '>')
 1178             ++cp;
 1179    }
 1180    i = PTR2SIZE(cp - store->s);
 1181 
 1182    /* Terrible tagsoup out there, e.g., groups.google.com produces href=""
 1183     * parameter values prefixed and suffixed by newlines!  Therefore trim the
 1184     * value content TODO join into the parse step above! */
 1185    for (cp = store->s; i > 0 && spacechar(*cp); ++cp, --i)
 1186       ;
 1187    store->s = n_UNCONST(cp);
 1188    for (cp += i - 1; i > 0 && spacechar(*cp); --cp, --i)
 1189       ;
 1190    if ((store->l = i) == 0)
 1191       store->s = NULL;
 1192 jleave:
 1193    NYD2_LEAVE;
 1194    return self;
 1195 }
 1196 
 1197 static struct htmlflt *
 1198 _hf_expand_all_ents(struct htmlflt *self, struct str const *param)
 1199 {
 1200    char const *cp, *maxcp, *ep;
 1201    char c;
 1202    size_t i;
 1203    NYD2_ENTER;
 1204 
 1205    for (cp = param->s, maxcp = cp + param->l; cp < maxcp;)
 1206       if ((c = *cp++) != '&')
 1207 jputc:
 1208          self = _hf_putc(self, c);
 1209       else {
 1210          for (ep = cp--;;) {
 1211             if (ep == maxcp || (c = *ep++) == '\0') {
 1212                for (; cp < ep; ++cp)
 1213                   self = _hf_putc(self, *cp);
 1214                goto jleave;
 1215             } else if (c == ';') {
 1216                if ((i = PTR2SIZE(ep - cp)) > 1) {
 1217                   self = _hf_check_ent(self, cp, i);
 1218                   break;
 1219                } else {
 1220                   c = *cp++;
 1221                   goto jputc;
 1222                }
 1223             }
 1224          }
 1225          cp = ep;
 1226       }
 1227 jleave:
 1228    NYD2_LEAVE;
 1229    return self;
 1230 }
 1231 
/* Act upon the complete tag in s; the caller passes self->hf_bdat, which
 * holds the leading '<' as well as the closing '>'.
 * The tag name is searched for in _hf_tags, and the action of the matching
 * entry is performed: newline / separator handling plus an optional injected
 * character, entering ignore mode, putting the "alt" text of an image, or
 * putting a numbered reference for a link (which is remembered in
 * self->hf_hrefs).
 * <blockquote> and </blockquote> are not looked up in the table, they adjust
 * the counter which is stored in the _HF_BQUOTE_MASK bits of the flags.
 * Tags which are not known are dropped, unless they look like plain text that
 * merely was embraced by angle brackets (see jnotknown).
 * Always returns self */
static struct htmlflt *
_hf_check_tag(struct htmlflt *self, char const *s)
{
   char nobuf[32], c;
   struct str param;
   size_t i;
   struct htmlflt_tag const *hftp;
   ui32_t f;
   NYD2_ENTER;

   /* Extra check only */
   assert(s != NULL);
   if (*s != '<') {
      DBG( n_alert("HTML tagsoup filter _hf_check_tag() called on soup!"); )
jput_as_is:
      /* Write the entire buffered content, not only what s points to */
      self = _hf_puts(self, self->hf_bdat);
      goto jleave;
   }

   /* Step over '<' and determine the length i of the tag name */
   for (++s, i = 0; (c = s[i]) != '\0' && c != '>' && !whitechar(c); ++i)
      /* Special massage for things like <br/>: after the slash only whitespace
       * may separate us from the closing right angle! */
      if (c == '/') {
         size_t j = i + 1;

         while ((c = s[j]) != '\0' && c != '>' && whitechar(c))
            ++j;
         if (c == '>')
            break;
      }

   /* Linear search; leaves the loop via break with hftp on the match */
   for (hftp = _hf_tags;;) {
      if (i == hftp->hft_len && !ascncasecmp(s, hftp->hft_tag, i)) {
         c = s[hftp->hft_len];
         if (c == '>' || c == '/' || whitechar(c))
            break;
      }
      if (n_UNLIKELY(PTRCMP(++hftp, >=, _hf_tags + n_NELEM(_hf_tags)))){
         /* A <blockquote> is very special xxx */
         bool_t isct;

         /* isct: is it a closing tag?  If so look at the name after '/' */
         if((isct = (i > 1 && *s == '/'))){
            ++s;
            --i;
         }

         if(i != sizeof("blockquote") -1 || ascncasecmp(s, "blockquote", i) ||
               ((c = s[sizeof("blockquote") -1]) != '>' && !whitechar(c))){
            /* Not a blockquote: undo the adjustment from above */
            s -= isct;
            i += isct;
            goto jnotknown;
         }

         if(!isct && !(self->hf_flags & _HF_NL_2))
            self = _hf_nl(self);
         if(!(self->hf_flags & _HF_NL_1))
            self = _hf_nl(self);
         /* Increment (saturating) or decrement (not below 0) the counter in
          * the _HF_BQUOTE_MASK bits, keep all the other flag bits */
         f = self->hf_flags;
         f &= _HF_BQUOTE_MASK;
         if(!isct){
            if(f != _HF_BQUOTE_MASK)
               ++f;
         }else if(f > 0)
            --f;
         f |= (self->hf_flags & ~_HF_BQUOTE_MASK);
         self->hf_flags = f;
         goto jleave;
      }
   }

   f = self->hf_flags;
   switch (hftp->hft_act) {
   case _HFSA_PRE_END:
      f &= ~_HF_PRE;
      /* The if(0) only exists to share the code which follows: _PRE_END skips
       * the block, whereas _PRE enters via its label inside of the block */
      if (0) {
         /* FALLTHRU */
   case _HFSA_PRE:
         f |= _HF_PRE;
      }
      self->hf_flags = f;
      /* FALLTHRU */

   case _HFSA_NEEDSEP:
      if (!(self->hf_flags & _HF_NL_2))
         self = _hf_nl(self);
      /* FALLTHRU */
   case _HFSA_NEEDNL:
      /* NOTE(review): this tests the local copy f, not self->hf_flags which
       * the _hf_nl() above may have changed -- confirm that this is meant */
      if (!(f & _HF_NL_1))
         self = _hf_nl(self);
      if (hftp->hft_injc != '\0') {
         /* The lower seven bits are the character to inject, the high bit
          * asks for a space to follow it */
         self = _hf_putc(self, hftp->hft_injc & 0x7F);
         if ((uc_i)hftp->hft_injc & 0x80)
            self = _hf_putc(self, ' ');
      }
      break;

   case _HFSA_IGN:
      /* Remember the tag so that its end tag can be recognized later on */
      self->hf_ign_tag = hftp;
      self->hf_flags = (f |= _HF_IGN | _HF_NOPUT);
      break;

   case _HFSA_IMG:
      /* Put "[" alt-text "]", or "[IMG]" without (non-empty) alt text */
      self = _hf_param(self, &param, "alt");
      self = _hf_putc(self, '[');
      if (param.s == NULL) {
         param.s = n_UNCONST("IMG");
         param.l = 3;
         goto jimg_put;
      } /* else */ if (memchr(param.s, '&', param.l) != NULL)
         self = _hf_expand_all_ents(self, &param);
      else
jimg_put:
         self = _hf_putbuf(self, param.s, param.l);
      self = _hf_putc(self, ']');
      break;

   case _HFSA_HREF:
      self = _hf_param(self, &param, "href");
      /* Ignore non-external links */
      if (param.s != NULL && *param.s != '#') {
         struct htmlflt_href *hhp = n_alloc(
               n_VSTRUCT_SIZEOF(struct htmlflt_href, hfh_dat) + param.l +1);

         /* Prepend to the list; the data is copied without a terminating NUL
          * here, hfh_len carries its length */
         hhp->hfh_next = self->hf_hrefs;
         hhp->hfh_no = ++self->hf_href_no;
         hhp->hfh_len = (ui32_t)param.l;
         memcpy(hhp->hfh_dat, param.s, param.l);

         snprintf(nobuf, sizeof nobuf, "[%u]", hhp->hfh_no);
         self->hf_flags = (f |= _HF_HREF);
         self->hf_hrefs = hhp;
         self = _hf_puts(self, nobuf);
      } else
         self->hf_flags = (f &= ~_HF_HREF);
      break;
   case _HFSA_HREF_END:
      /* Only close what has been opened; uses the most recent link number */
      if (f & _HF_HREF) {
         snprintf(nobuf, sizeof nobuf, "[/%u]", self->hf_href_no);
         self = _hf_puts(self, nobuf);
      }
      break;

   default:
      /* Any other action value is itself the character to put */
      c = (char)(hftp->hft_act & 0xFF);
      self = _hf_putc(self, c);
      break;
   case '\0':
      break;
   }
jleave:
   NYD2_LEAVE;
   return self;

   /* The problem is that even invalid tagsoup is widely used, without real
    * searching i have seen e-mail address in <N@H.D> notation, and more.
    * To protect us a bit look around and possibly write the content as such */
jnotknown:
   switch (*s) {
   case '!':
   case '?':
      /* Ignore <!DOCTYPE, <!-- comments, <? PIs.. */
      goto jleave;
   case '>':
      /* Print out an empty tag as such */
      if (s[1] == '\0') {
         --s;
         goto jput_as_is;
      }
      break;
   case '/':
      ++s;
      break;
   default:
      break;
   }

   /* Inspect the name: if it contains a non-ASCII or a punctuation character
    * the buffered content is written as-is, otherwise the tag is dropped */
   /* Also skip over : in order to suppress v:roundrect, w:anchorlock.. */
   while ((c = *s++) != '\0' && c != '>' && !whitechar(c) && c != ':')
      if (!asciichar(c) || punctchar(c)) {
         self = _hf_puts(self, self->hf_bdat);
         break;
      }
   goto jleave;
}
 1416 
 1417 static struct htmlflt *
 1418 _hf_check_ent(struct htmlflt *self, char const *s, size_t l)
 1419 {
 1420    char nobuf[32];
 1421    char const *s_save;
 1422    size_t l_save;
 1423    struct hf_ent const *hfep;
 1424    size_t i;
 1425    NYD2_ENTER;
 1426 
 1427    s_save = s;
 1428    l_save = l;
 1429    assert(*s == '&');
 1430    assert(l > 0);
 1431    /* False entities seen in the wild assert(s[l - 1] == ';'); */
 1432    ++s;
 1433    l -= 2;
 1434 
 1435    /* Numeric entity, or try named search */
 1436    if (*s == '#') {
 1437       i = (*++s == 'x' ? 16 : 10);
 1438 
 1439       if ((i != 16 || (++s, --l) > 0) && l < sizeof(nobuf)) {
 1440          memcpy(nobuf, s, l);
 1441          nobuf[l] = '\0';
 1442          n_idec_uiz_cp(&i, nobuf, i, NULL);
 1443          if (i <= 0x7F)
 1444             self = _hf_putc(self, (char)i);
 1445          else if (self->hf_flags & _HF_UTF8) {
 1446 jputuni:
 1447             l = n_utf32_to_utf8((ui32_t)i, nobuf);
 1448             self = _hf_putbuf(self, nobuf, l);
 1449          } else
 1450             goto jeent;
 1451       } else
 1452          goto jeent;
 1453    } else {
 1454       ui32_t f = self->hf_flags, hf;
 1455 
 1456       for (hfep = _hf_ents; PTRCMP(hfep, <, _hf_ents + n_NELEM(_hf_ents));
 1457             ++hfep)
 1458          if (l == ((hf = hfep->hfe_flags) & _HFE_LENGTH_MASK) &&
 1459                !strncmp(s, hfep->hfe_ent, l)) {
 1460             if ((hf & _HFE_HAVE_UNI) && (f & _HF_UTF8)) {
 1461                i = hfep->hfe_uni;
 1462                goto jputuni;
 1463             } else if (hf & _HFE_HAVE_CSTR)
 1464                self = _hf_puts(self, hfep->hfe_cstr);
 1465             else
 1466                self = _hf_putc(self, hfep->hfe_c);
 1467             goto jleave;
 1468          }
 1469 jeent:
 1470       self = _hf_putbuf(self, s_save, l_save);
 1471    }
 1472 jleave:
 1473    NYD2_LEAVE;
 1474    return self;
 1475 }
 1476 
/* The filter state machine: feed len bytes of dat, or, if dat is NULL,
 * perform the final flush, which dumps a pending line as well as the list of
 * collected hrefs.
 * Tags and entities are collected in the buffer self->hf_bdat (the write
 * position survives in self->hf_curr in between calls) while _HF_NOPUT is
 * set, and are handed to _hf_check_tag() / _hf_check_ent() once complete.
 * Returns -1 if _HF_ERROR is set upon return.  Otherwise returns len when
 * data was fed, and for the final flush 1 if anything was dumped, else 0 */
static ssize_t
_hf_add_data(struct htmlflt *self, char const *dat, size_t len)
{
   char c, *cp, *cp_max;
   bool_t hot;
   ssize_t rv = 0;
   NYD_ENTER;

   /* Final put request? */
   if (dat == NULL) {
      if (self->hf_len > 0 || self->hf_hrefs != NULL) {
         self = _hf_dump(self);
         if (self->hf_hrefs != NULL)
            self = _hf_dump_hrefs(self);
         rv = 1;
      }
      goto jleave;
   }

   /* Always ensure some initial buffer */
   if ((cp = self->hf_curr) != NULL)
      cp_max = self->hf_bmax;
   else {
      cp = self->hf_curr = self->hf_bdat = n_alloc(LINESIZE);
      cp_max = self->hf_bmax = cp + LINESIZE -1; /* (Always room for NUL!) */
   }
   /* hot is only used in _HF_IGN mode: are bytes of a possible end tag being
    * collected?  A non-empty buffer left by an earlier call counts as such */
   hot = (cp != self->hf_bdat);

   for (rv = (ssize_t)len; len > 0; --len) {
      ui32_t f = self->hf_flags;

      if (f & _HF_ERROR)
         break;
      c = *dat++;

      /* Soup is really weird, and scripts may contain almost anything (and
       * newer CSS standards are also cryptic): therefore prefix the _HF_IGN
       * test and walk until we see the required end tag */
      /* TODO For real safety _HF_IGN soup condome would also need to know
       * TODO about quoted strings so that 'var i = "</script>";' couldn't
       * TODO fool it!   We really want this mode also for _HF_NOPUT to be
       * TODO able to *gracefully* detect the tag-closing '>', but then if
       * TODO that is a single mechanism we should have made it! */
      if (f & _HF_IGN) {
         struct htmlflt_tag const *hftp = self->hf_ign_tag;
         size_t i;

         if (c == '<') {
            /* (The '<' itself is not stored in this mode) */
            hot = TRU1;
jcp_reset:
            cp = self->hf_bdat;
         } else if (c == '>') {
            if (hot) {
               /* Leave ignore mode if the buffer is exactly '/' plus the
                * name of the tag which caused this mode to be entered */
               if ((i = PTR2SIZE(cp - self->hf_bdat)) > 1 &&
                     --i == hftp->hft_len &&
                     !ascncasecmp(self->hf_bdat + 1, hftp->hft_tag, i))
                  self->hf_flags = (f &= ~(_HF_IGN | _HF_NOPUT));
               hot = FAL0;
               goto jcp_reset;
            }
         } else if (hot) {
            *cp++ = c;
            i = PTR2SIZE(cp - self->hf_bdat);
            /* Give up on this candidate if it does not start with a slash, or
             * if it became longer than the end tag could possibly be */
            if ((i == 1 && c != '/') || --i > hftp->hft_len) {
               hot = FAL0;
               goto jcp_reset;
            }
         }
      } else switch (c) {
      case '<':
         /* People are using & without &amp;ing it, ditto <; be aware */
         if (f & (_HF_NOPUT | _HF_ENT)) {
            f &= ~_HF_ENT;
            /* Special case "<!--" buffer content to deal with really weird
             * things that can be done with "<!--[if gte mso 9]>" syntax */
            if (PTR2SIZE(cp - self->hf_bdat) != 4 ||
                  memcmp(self->hf_bdat, "<!--", 4)) {
               /* What was collected so far was no tag / entity: put it as
                * plain text */
               self->hf_flags = f;
               *cp = '\0';
               self = _hf_puts(self, self->hf_bdat);
               f = self->hf_flags;
            }
         }
         /* Start to collect a new tag */
         cp = self->hf_bdat;
         *cp++ = c;
         self->hf_flags = (f |= _HF_NOPUT);
         break;
      case '>':
         /* Weird tagsoup around, do we actually parse a tag? */
         if (!(f & _HF_NOPUT))
            goto jdo_c;
         /* Terminate the collected tag and evaluate it */
         cp[0] = c;
         cp[1] = '\0';
         f &= ~(_HF_NOPUT | _HF_ENT);
         self->hf_flags = f;
         self = _hf_check_tag(self, self->hf_bdat);
         *(cp = self->hf_bdat) = '\0'; /* xxx extra safety */
         /* Quick hack to get rid of redundant newline after <pre> XXX */
         /* (f is the state before the tag, self->hf_flags the state after) */
         if (!(f & _HF_PRE) && (self->hf_flags & _HF_PRE) &&
               len > 1 && *dat == '\n')
            ++dat, --len;
         break;

      case '\r': /* TODO CR should be stripped in lower level!! (Only B64!?!) */
         break;
      case '\n':
         /* End of line is not considered unless we are in PRE section.
          * However, in _HF_NOPUT mode we must be aware of tagsoup which uses
          * newlines for separating parameters */
         if (f & _HF_NOPUT)
            goto jdo_c;
         self = (f & _HF_PRE) ? _hf_nl_force(self) : _hf_putc(self, ' ');
         break;

      case '\t':
         if (!(f & _HF_PRE))
            c = ' ';
         /* FALLTHRU */
      default:
jdo_c:
         /* If not currently parsing a tag and bypassing normal output.. */
         if (!(f & _HF_NOPUT)) {
            /* Control characters are dropped from normal output */
            if (cntrlchar(c))
               break;
            if (c == '&') {
               /* Start to collect an entity */
               cp = self->hf_bdat;
               *cp++ = c;
               self->hf_flags = (f |= _HF_NOPUT | _HF_ENT);
            } else if (f & _HF_PRE) {
               self = _hf_putc_premode(self, c);
               self->hf_flags &= ~_HF_BLANK;
            } else
              self = _hf_putc(self, c);
         } else if ((f & _HF_ENT) && c == ';') {
            /* Terminate the collected entity and expand it */
            cp[0] = c;
            cp[1] = '\0';
            f &= ~(_HF_NOPUT | _HF_ENT);
            self->hf_flags = f;
           self = _hf_check_ent(self, self->hf_bdat,
               PTR2SIZE(cp + 1 - self->hf_bdat));
         } else {
            /* We may need to grow the buffer */
            /* (Grows by LINESIZE as soon as cp gets within 21 bytes of the
             * end, so that the two byte terminations above always fit) */
            if (PTRCMP(cp + 42/2, >=, cp_max)) {
               size_t i = PTR2SIZE(cp - self->hf_bdat),
                  m = PTR2SIZE(self->hf_bmax - self->hf_bdat) + LINESIZE;

               cp = self->hf_bdat = n_realloc(self->hf_bdat, m);
               self->hf_bmax = cp_max = &cp[m -1];
               self->hf_curr = (cp += i);
            }
            *cp++ = c;
         }
      }
   }
   /* Remember the write position for the next call */
   self->hf_curr = cp;
jleave:
  NYD_LEAVE;
  return (self->hf_flags & _HF_ERROR) ? -1 : rv;
}
 1636 
 1637 /*
 1638  * TODO Because we don't support filter chains yet this filter will be run
 1639  * TODO in a dedicated subprocess, driven via a special Popen() mode
 1640  */
 1641 static bool_t __hf_hadpipesig;
 1642 static void
 1643 __hf_onpipe(int signo)
 1644 {
 1645    NYD_X; /* Signal handler */
 1646    n_UNUSED(signo);
 1647    __hf_hadpipesig = TRU1;
 1648 }
 1649 
 1650 FL int
 1651 htmlflt_process_main(void)
 1652 {
 1653    char buf[BUFFER_SIZE];
 1654    struct htmlflt hf;
 1655    size_t i;
 1656    int rv;
 1657    NYD_ENTER;
 1658 
 1659    __hf_hadpipesig = FAL0;
 1660    safe_signal(SIGPIPE, &__hf_onpipe);
 1661 
 1662    htmlflt_init(&hf);
 1663    htmlflt_reset(&hf, n_stdout);
 1664 
 1665    for (;;) {
 1666       if ((i = fread(buf, sizeof(buf[0]), n_NELEM(buf), n_stdin)) == 0) {
 1667          rv = !feof(n_stdin);
 1668          break;
 1669       }
 1670 
 1671       if ((rv = __hf_hadpipesig))
 1672          break;
 1673       /* Just use this directly.. */
 1674       if (htmlflt_push(&hf, buf, i) < 0) {
 1675          rv = 1;
 1676          break;
 1677       }
 1678    }
 1679    if (rv == 0 && htmlflt_flush(&hf) < 0)
 1680       rv = 1;
 1681 
 1682    htmlflt_destroy(&hf);
 1683 
 1684    rv |= __hf_hadpipesig;
 1685    NYD_LEAVE;
 1686    return rv;
 1687 }
 1688 
 1689 FL void
 1690 htmlflt_init(struct htmlflt *self)
 1691 {
 1692    NYD_ENTER;
 1693    /* (Rather redundant though) */
 1694    memset(self, 0, sizeof *self);
 1695    NYD_LEAVE;
 1696 }
 1697 
 1698 FL void
 1699 htmlflt_destroy(struct htmlflt *self)
 1700 {
 1701    NYD_ENTER;
 1702    htmlflt_reset(self, NULL);
 1703    NYD_LEAVE;
 1704 }
 1705 
 1706 FL void
 1707 htmlflt_reset(struct htmlflt *self, FILE *f)
 1708 {
 1709    struct htmlflt_href *hfhp;
 1710    NYD_ENTER;
 1711 
 1712    while ((hfhp = self->hf_hrefs) != NULL) {
 1713       self->hf_hrefs = hfhp->hfh_next;
 1714       n_free(hfhp);
 1715    }
 1716 
 1717    if (self->hf_bdat != NULL)
 1718       n_free(self->hf_bdat);
 1719    if (self->hf_line != NULL)
 1720       n_free(self->hf_line);
 1721 
 1722    memset(self, 0, sizeof *self);
 1723 
 1724    if (f != NULL) {
 1725       ui32_t sw = n_MAX(_HF_MINLEN, (ui32_t)n_scrnwidth);
 1726 
 1727       self->hf_line = n_alloc((size_t)sw * n_mb_cur_max +1);
 1728       self->hf_lmax = sw;
 1729 
 1730       if (n_psonce & n_PSO_UNICODE) /* TODO not truly generic */
 1731          self->hf_flags = _HF_UTF8;
 1732       self->hf_os = f;
 1733    }
 1734    NYD_LEAVE;
 1735 }
 1736 
 1737 FL ssize_t
 1738 htmlflt_push(struct htmlflt *self, char const *dat, size_t len)
 1739 {
 1740    ssize_t rv;
 1741    NYD_ENTER;
 1742 
 1743    rv = _hf_add_data(self, dat, len);
 1744    NYD_LEAVE;
 1745    return rv;
 1746 }
 1747 
 1748 FL ssize_t
 1749 htmlflt_flush(struct htmlflt *self)
 1750 {
 1751    ssize_t rv;
 1752    NYD_ENTER;
 1753 
 1754    rv = _hf_add_data(self, NULL, 0);
 1755    rv |= !fflush(self->hf_os) ? 0 : -1;
 1756    NYD_LEAVE;
 1757    return rv;
 1758 }
 1759 #endif /* HAVE_FILTER_HTML_TAGSOUP */
 1760 
 1761 /* s-it-mode */