"Fossies" - the Fresh Open Source Software Archive

Member "tidy-html5-5.8.0/src/clean.c" (16 Jul 2021, 76946 Bytes) of package /linux/www/tidy-html5-5.8.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "clean.c" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 5.7.28_vs_5.8.0.

    1 /*
    2   clean.c -- clean up misuse of presentation markup
    3 
    4   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
    5   See tidy.h for the copyright notice.
    6 
    7   Filters from other formats such as Microsoft Word
    8   often make excessive use of presentation markup such
    9   as font tags, B, I, and the align attribute. By applying
   10   a set of production rules, it is straightforward to
   11   transform this to use CSS.
   12 
   13   Some rules replace some of the children of an element by
   14   style properties on the element, e.g.
   15 
   16   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
   17 
   18   Such rules are applied to the element's content and then
   19   to the element itself until no more rules apply.
   20   Having applied all the rules to an element, it will have
   21   a style attribute with one or more properties. 
   22 
   23   Other rules strip the element they apply to, replacing
   24   it by style properties on the contents, e.g.
   25   
   26   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
   27       
   28   These rules are applied to an element before processing
   29   its content and replace the current element by the first
   30   element in the exposed content.
   31 
   32   After applying both sets of rules, you can replace the
   33   style attribute by a class value and style rule in the
   34   document head. To support this, an association of styles
   35   and class names is built.
   36 
   37   A naive approach is to rely on string matching to test
   38   when two property lists are the same. A better approach
   39   would be to first sort the properties before matching.
   40 
   41 */
   42 
   43 #include <stdio.h>
   44 #include <stdlib.h>
   45 #include <string.h>
   46 
   47 #include "tidy-int.h"
   48 #include "clean.h"
   49 #include "lexer.h"
   50 #include "parser.h"
   51 #include "attrs.h"
   52 #include "message.h"
   53 #include "tmbstr.h"
   54 #include "utf8.h"
   55 
   56 static Node* CleanNode( TidyDocImpl* doc, Node *node );
   57 
   58 static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
   59 {
   60     const Dict* dict = TY_(LookupTagDef)( tid );
   61     TidyDocFree( doc, node->element );
   62     node->element = TY_(tmbstrdup)( doc->allocator, dict->name );
   63     node->tag = dict;
   64 }
   65 
   66 static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
   67 {
   68     StyleProp *next;
   69 
   70     while (props)
   71     {
   72         next = props->next;
   73         TidyDocFree(doc, props->name);
   74         TidyDocFree(doc, props->value);
   75         TidyDocFree(doc, props);
   76         props = next;
   77     }
   78 }
   79 
   80 static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
   81 {
   82     StyleProp *first, *prev, *prop;
   83     int cmp;
   84 
   85     prev = NULL;
   86     first = props;
   87 
   88     while (props)
   89     {
   90         cmp = TY_(tmbstrcmp)(props->name, name);
   91 
   92         if (cmp == 0)
   93         {
   94             /* this property is already defined, ignore new value */
   95             return first;
   96         }
   97 
   98         if (cmp > 0)
   99         {
  100             /* insert before this */
  101 
  102             prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
  103             prop->name = TY_(tmbstrdup)(doc->allocator, name);
  104             prop->value = TY_(tmbstrdup)(doc->allocator, value);
  105             prop->next = props;
  106 
  107             if (prev)
  108                 prev->next = prop;
  109             else
  110                 first = prop;
  111 
  112             return first;
  113         }
  114 
  115         prev = props;
  116         props = props->next;
  117     }
  118 
  119     prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
  120     prop->name = TY_(tmbstrdup)(doc->allocator, name);
  121     prop->value = TY_(tmbstrdup)(doc->allocator, value);
  122     prop->next = NULL;
  123 
  124     if (prev)
  125         prev->next = prop;
  126     else
  127         first = prop;
  128 
  129     return first;
  130 }
  131 
/*
 Create sorted linked list of properties from style string.

 The style string is read as "name: value; name: value ...". Each
 declaration found is added to the list passed in as prop through
 InsertProperty(), and the (possibly new) list head is returned.

 While parsing, the ':' after a name and the ';' after a value are
 temporarily overwritten with '\0' so that name and value can be
 passed on as C strings. A string literal may not be writable, so
 the work is done on a private copy of style which is freed before
 returning.

 Parsing stops at the first declaration that has no ':'.
 Only spaces in front of a name or value are skipped; trailing
 spaces are left in place.
*/
static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
{
    tmbstr name, value = NULL, name_end, value_end, line;
    Bool more;

    line = TY_(tmbstrdup)(doc->allocator, style);
    name = line;

    while (*name)
    {
        /* skip spaces in front of the property name */
        while (*name == ' ')
            ++name;

        name_end = name;

        /* the name runs up to the next ':' */
        while (*name_end)
        {
            if (*name_end == ':')
            {
                value = name_end + 1;
                break;
            }

            ++name_end;
        }

        /* no ':' left in the string: nothing more to parse */
        if (*name_end != ':')
            break;

        /* skip spaces in front of the value */
        while ( value && *value == ' ')
            ++value;

        value_end = value;
        more = no;

        /* the value runs up to the next ';' or the end of the string */
        while (*value_end)
        {
            if (*value_end == ';')
            {
                more = yes;
                break;
            }

            ++value_end;
        }

        /* terminate both pieces so they can be used as strings */
        *name_end = '\0';
        *value_end = '\0';

        prop = InsertProperty(doc, prop, name, value);
        *name_end = ':';

        if (more)
        {
            /* restore the delimiter and go on with the next declaration */
            *value_end = ';';
            name = value_end + 1;
            continue;
        }

        break;
    }

    TidyDocFree(doc, line);  /* free temporary copy */
    return prop;
}
  204 
  205 static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
  206 {
  207     tmbstr style, p, s;
  208     uint len;
  209     StyleProp *prop;
  210 
  211     /* compute length */
  212 
  213     for (len = 0, prop = props; prop; prop = prop->next)
  214     {
  215         len += TY_(tmbstrlen)(prop->name) + 2;
  216         if (prop->value)
  217             len += TY_(tmbstrlen)(prop->value) + 2;
  218     }
  219 
  220     style = (tmbstr) TidyDocAlloc(doc, len+1);
  221     style[0] = '\0';
  222 
  223     for (p = style, prop = props; prop; prop = prop->next)
  224     {
  225         s = prop->name;
  226 
  227         while((*p++ = *s++))
  228             continue;
  229 
  230         if (prop->value)
  231         {
  232             *--p = ':';
  233             *++p = ' ';
  234             ++p;
  235 
  236             s = prop->value;
  237             while((*p++ = *s++))
  238                 continue;
  239         }
  240         if (prop->next == NULL)
  241             break;
  242 
  243         *--p = ';';
  244         *++p = ' ';
  245         ++p;
  246     }
  247 
  248     return style;
  249 }
  250 
  251 /*
  252   create string with merged properties
  253 static tmbstr AddProperty( ctmbstr style, ctmbstr property )
  254 {
  255     tmbstr line;
  256     StyleProp *prop;
  257 
  258     prop = CreateProps(doc, NULL, style);
  259     prop = CreateProps(doc, prop, property);
  260     line = CreatePropString(doc, prop);
  261     FreeStyleProps(doc, prop);
  262     return line;
  263 }
  264 */
  265 
  266 void TY_(FreeStyles)( TidyDocImpl* doc )
  267 {
  268     Lexer* lexer = doc->lexer;
  269     if ( lexer )
  270     {
  271         TagStyle *style, *next;
  272         for ( style = lexer->styles; style; style = next )
  273         {
  274             next = style->next;
  275             TidyDocFree( doc, style->tag );
  276             TidyDocFree( doc, style->tag_class );
  277             TidyDocFree( doc, style->properties );
  278             TidyDocFree( doc, style );
  279         }
  280     }
  281 }
  282 
  283 static tmbstr GensymClass( TidyDocImpl* doc )
  284 {
  285     tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
  286     ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
  287     if ( pfx == NULL || *pfx == 0 )
  288       pfx = "c";
  289 
  290     TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
  291     return TY_(tmbstrdup)(doc->allocator, buf);
  292 }
  293 
  294 static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
  295 {
  296     Lexer* lexer = doc->lexer;
  297     TagStyle* style;
  298 
  299     for (style = lexer->styles; style; style=style->next)
  300     {
  301         if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
  302             TY_(tmbstrcmp)(style->properties, properties) == 0)
  303             return style->tag_class;
  304     }
  305 
  306     style = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
  307     style->tag = TY_(tmbstrdup)(doc->allocator, tag);
  308     style->tag_class = GensymClass( doc );
  309     style->properties = TY_(tmbstrdup)( doc->allocator, properties );
  310     style->next = lexer->styles;
  311     lexer->styles = style;
  312     return style->tag_class;
  313 }
  314 
  315 /*
  316  Add class="foo" to node
  317 */
  318 static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
  319 {
  320     AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
  321 
  322     /*
  323      if there already is a class attribute
  324      then append class name after a space.
  325     */
  326     if (classattr)
  327         TY_(AppendToClassAttr)( doc, classattr, classname );
  328     else /* create new class attribute */
  329         TY_(AddAttribute)( doc, node, "class", classname );
  330 }
  331 
  332 void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
  333 {
  334     ctmbstr classname;
  335 
  336     classname = FindStyle( doc, node->element, stylevalue );
  337     AddClass( doc, node, classname);
  338 }
  339 
  340 /*
  341  Find style attribute in node, and replace it
  342  by corresponding class attribute. Search for
  343  class in style dictionary otherwise gensym
  344  new class and add to dictionary.
  345 
  346  Assumes that node doesn't have a class attribute
  347 */
  348 static void Style2Rule( TidyDocImpl* doc, Node *node)
  349 {
  350     AttVal *styleattr, *classattr;
  351     ctmbstr classname;
  352 
  353     styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
  354 
  355     if (styleattr)
  356     {
  357         /* fix for http://tidy.sf.net/bug/850215 */
  358         if (!styleattr->value)
  359         {
  360             TY_(RemoveAttribute)(doc, node, styleattr);
  361             return;
  362         }
  363 
  364         classname = FindStyle( doc, node->element, styleattr->value );
  365         classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
  366 
  367         /*
  368          if there already is a class attribute
  369          then append class name after an underscore
  370         */
  371         if (classattr)
  372         {
  373             TY_(AppendToClassAttr)( doc, classattr, classname );
  374             TY_(RemoveAttribute)( doc, node, styleattr );
  375         }
  376         else /* reuse style attribute for class attribute */
  377         {
  378             TidyDocFree(doc, styleattr->attribute);
  379             TidyDocFree(doc, styleattr->value);
  380             styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
  381             styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
  382         }
  383     }
  384 }
  385 
  386 static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
  387 {
  388     if ( selector && color )
  389     {
  390         TY_(AddStringLiteral)(lexer, selector);
  391         TY_(AddStringLiteral)(lexer, " { color: ");
  392         TY_(AddStringLiteral)(lexer, color);
  393         TY_(AddStringLiteral)(lexer, " }\n");
  394     }
  395 }
  396 
  397 /*
  398  move presentation attribs from body to style element
  399 
  400  background="foo" ->  body { background-image: url(foo) }
  401  bgcolor="foo"    ->  body { background-color: foo }
  402  text="foo"       ->  body { color: foo }
  403  link="foo"       ->  :link { color: foo }
  404  vlink="foo"      ->  :visited { color: foo }
  405  alink="foo"      ->  :active { color: foo }
  406 */
  407 static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
  408 {
  409     Lexer* lexer  = doc->lexer;
  410     tmbstr bgurl   = NULL;
  411     tmbstr bgcolor = NULL;
  412     tmbstr color   = NULL;
  413     AttVal* attr;
  414     
  415     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
  416     {
  417         bgurl = attr->value;
  418         attr->value = NULL;
  419         TY_(RemoveAttribute)( doc, body, attr );
  420     }
  421 
  422     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
  423     {
  424         bgcolor = attr->value;
  425         attr->value = NULL;
  426         TY_(RemoveAttribute)( doc, body, attr );
  427     }
  428 
  429     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
  430     {
  431         color = attr->value;
  432         attr->value = NULL;
  433         TY_(RemoveAttribute)( doc, body, attr );
  434     }
  435 
  436     if ( bgurl || bgcolor || color )
  437     {
  438         TY_(AddStringLiteral)(lexer, " body {\n");
  439         if (bgurl)
  440         {
  441             TY_(AddStringLiteral)(lexer, "  background-image: url(");
  442             TY_(AddStringLiteral)(lexer, bgurl);
  443             TY_(AddStringLiteral)(lexer, ");\n");
  444             TidyDocFree(doc, bgurl);
  445         }
  446         if (bgcolor)
  447         {
  448             TY_(AddStringLiteral)(lexer, "  background-color: ");
  449             TY_(AddStringLiteral)(lexer, bgcolor);
  450             TY_(AddStringLiteral)(lexer, ";\n");
  451             TidyDocFree(doc, bgcolor);
  452         }
  453         if (color)
  454         {
  455             TY_(AddStringLiteral)(lexer, "  color: ");
  456             TY_(AddStringLiteral)(lexer, color);
  457             TY_(AddStringLiteral)(lexer, ";\n");
  458             TidyDocFree(doc, color);
  459         }
  460 
  461         TY_(AddStringLiteral)(lexer, " }\n");
  462     }
  463 
  464     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
  465     {
  466         AddColorRule(lexer, " :link", attr->value);
  467         TY_(RemoveAttribute)( doc, body, attr );
  468     }
  469 
  470     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
  471     {
  472         AddColorRule(lexer, " :visited", attr->value);
  473         TY_(RemoveAttribute)( doc, body, attr );
  474     }
  475 
  476     if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
  477     {
  478         AddColorRule(lexer, " :active", attr->value);
  479         TY_(RemoveAttribute)( doc, body, attr );
  480     }
  481 }
  482 
  483 static Bool NiceBody( TidyDocImpl* doc )
  484 {
  485     Node* node = TY_(FindBody)(doc);
  486     if (node)
  487     {
  488         if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
  489             TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
  490             TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
  491             TY_(AttrGetById)(node, TidyAttr_LINK)       ||
  492             TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
  493             TY_(AttrGetById)(node, TidyAttr_ALINK))
  494         {
  495             doc->badLayout |= USING_BODY;
  496             return no;
  497         }
  498     }
  499 
  500     return yes;
  501 }
  502 
  503 /* create style element using rules from dictionary */
  504 static void CreateStyleElement( TidyDocImpl* doc )
  505 {
  506     Lexer* lexer = doc->lexer;
  507     Node *node, *head, *body;
  508     TagStyle *style;
  509     AttVal *av;
  510 
  511     if ( lexer->styles == NULL && NiceBody(doc) )
  512         return;
  513 
  514     node = TY_(NewNode)( doc->allocator, lexer );
  515     node->type = StartTag;
  516     node->implicit = yes;
  517     node->element = TY_(tmbstrdup)(doc->allocator, "style");
  518     TY_(FindTag)( doc, node );
  519 
  520     /* insert type attribute */
  521     av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
  522     TY_(InsertAttributeAtStart)( node, av );
  523 
  524     body = TY_(FindBody)( doc );
  525     lexer->txtstart = lexer->lexsize;
  526     if ( body )
  527         CleanBodyAttrs( doc, body );
  528 
  529     for (style = lexer->styles; style; style = style->next)
  530     {
  531         TY_(AddCharToLexer)(lexer, ' ');
  532         TY_(AddStringLiteral)(lexer, style->tag);
  533         TY_(AddCharToLexer)(lexer, '.');
  534         TY_(AddStringLiteral)(lexer, style->tag_class);
  535         TY_(AddCharToLexer)(lexer, ' ');
  536         TY_(AddCharToLexer)(lexer, '{');
  537         TY_(AddStringLiteral)(lexer, style->properties);
  538         TY_(AddCharToLexer)(lexer, '}');
  539         TY_(AddCharToLexer)(lexer, '\n');
  540     }
  541 
  542     lexer->txtend = lexer->lexsize;
  543 
  544     TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
  545 
  546     /*
  547      now insert style element into document head
  548 
  549      doc is root node. search its children for html node
  550      the head node should be first child of html node
  551     */
  552     if ( NULL != (head = TY_(FindHEAD)( doc )) )
  553         TY_(InsertNodeAtEnd)( head, node );
  554 }
  555 
  556 
  557 /* ensure bidirectional links are consistent */
  558 void TY_(FixNodeLinks)(Node *node)
  559 {
  560     Node *child;
  561 
  562     if (node->prev)
  563         node->prev->next = node;
  564     else
  565         node->parent->content = node;
  566 
  567     if (node->next)
  568         node->next->prev = node;
  569     else
  570         node->parent->last = node;
  571 
  572     for (child = node->content; child; child = child->next)
  573         child->parent = node;
  574 }
  575 
  576 /*
  577  used to strip child of node when
  578  the node has one and only one child
  579 */
  580 static void StripOnlyChild(TidyDocImpl* doc, Node *node)
  581 {
  582     Node *child;
  583 
  584     child = node->content;
  585     node->content = child->content;
  586     node->last = child->last;
  587     child->content = NULL;
  588     TY_(FreeNode)(doc, child);
  589 
  590     for (child = node->content; child; child = child->next)
  591         child->parent = node;
  592 }
  593 
/*
  Used to strip font start and end tags.
  Extricate "element", replace it by its content and delete it.

  If element has children, they are spliced into the parent's list
  of children at element's position, *pnode is set to the first of
  them and element is freed.
  If element has no children, it is passed to DiscardElement() and
  *pnode receives that call's result.
*/
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
{
    if (element->content)
    {
        Node *node, *parent = element->parent;

        /* the last child is now followed by element's successor */
        element->last->next = element->next;

        if (element->next)
        {
            element->next->prev = element->last;
        }
        else
            parent->last = element->last;

        /* the first child now follows element's predecessor */
        if (element->prev)
        {
            element->content->prev = element->prev;
            element->prev->next = element->content;
        }
        else
            parent->content = element->content;

        /* re-parent the moved children; since the lists are already
           joined, the walk also passes over element's former
           following siblings */
        for (node = element->content; node; node = node->next)
            node->parent = parent;

        *pnode = element->content;

        /* clear element's links to the moved nodes before freeing it,
           presumably so that FreeNode does not follow them */
        element->next = element->content = NULL;
        TY_(FreeNode)(doc, element);
    }
    else
    {
        *pnode = TY_(DiscardElement)(doc, element);
    }
}
  634 
  635 /*
  636   Create new string that consists of the
  637   combined style properties in s1 and s2
  638 
  639   To merge property lists, we build a linked
  640   list of property/values and insert properties
  641   into the list in order, merging values for
  642   the same property name.
  643 */
  644 static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
  645 {
  646     tmbstr s;
  647     StyleProp *prop;
  648 
  649     prop = CreateProps(doc, NULL, s1);
  650     prop = CreateProps(doc, prop, s2);
  651     s = CreatePropString(doc, prop);
  652     FreeStyleProps(doc, prop);
  653     return s;
  654 }
  655 
  656 /*
  657  Add style property to element, creating style
  658  attribute as needed and adding ; delimiter
  659 */
  660 void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
  661 {
  662     AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
  663 
  664     /* if style attribute already exists then insert property */
  665 
  666     if ( av )
  667     {
  668         if (av->value != NULL)
  669         {
  670             tmbstr s = MergeProperties( doc, av->value, property );
  671             TidyDocFree( doc, av->value );
  672             av->value = s;
  673         }
  674         else
  675         {
  676             av->value = TY_(tmbstrdup)( doc->allocator, property );
  677         }
  678     }
  679     else /* else create new style attribute */
  680     {
  681         av = TY_(NewAttributeEx)( doc, "style", property, '"' );
  682         TY_(InsertAttributeAtStart)( node, av );
  683     }
  684 }
  685 
  686 static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
  687 {
  688     AttVal *av;
  689     tmbstr s1, s2, names;
  690 
  691     for (s2 = NULL, av = child->attributes; av; av = av->next)
  692     {
  693         if (attrIsCLASS(av))
  694         {
  695             s2 = av->value;
  696             break;
  697         }
  698     }
  699 
  700     for (s1 = NULL, av = node->attributes; av; av = av->next)
  701     {
  702         if (attrIsCLASS(av))
  703         {
  704             s1 = av->value;
  705             break;
  706         }
  707     }
  708 
  709     if (s1)
  710     {
  711         if (s2)  /* merge class names from both */
  712         {
  713             uint l1, l2;
  714             l1 = TY_(tmbstrlen)(s1);
  715             l2 = TY_(tmbstrlen)(s2);
  716             names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
  717             TY_(tmbstrcpy)(names, s1);
  718             names[l1] = ' ';
  719             TY_(tmbstrcpy)(names+l1+1, s2);
  720             TidyDocFree(doc, av->value);
  721             av->value = names;
  722         }
  723     }
  724     else if (s2)  /* copy class names from child */
  725     {
  726         av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
  727         TY_(InsertAttributeAtStart)( node, av );
  728     }
  729 }
  730 
  731 static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
  732 {
  733     AttVal *av;
  734     tmbstr s1, s2, style;
  735 
  736     /*
  737        the child may have a class attribute used
  738        for attaching styles, if so the class name
  739        needs to be copied to node's class
  740     */
  741     MergeClasses(doc, node, child);
  742 
  743     for (s2 = NULL, av = child->attributes; av; av = av->next)
  744     {
  745         if (attrIsSTYLE(av))
  746         {
  747             s2 = av->value;
  748             break;
  749         }
  750     }
  751 
  752     for (s1 = NULL, av = node->attributes; av; av = av->next)
  753     {
  754         if (attrIsSTYLE(av))
  755         {
  756             s1 = av->value;
  757             break;
  758         }
  759     }
  760 
  761     if (s1)
  762     {
  763         if (s2)  /* merge styles from both */
  764         {
  765             style = MergeProperties(doc, s1, s2);
  766             TidyDocFree(doc, av->value);
  767             av->value = style;
  768         }
  769     }
  770     else if (s2)  /* copy style of child */
  771     {
  772         av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
  773         TY_(InsertAttributeAtStart)( node, av );
  774     }
  775 }
  776 
  777 static ctmbstr FontSize2Name(ctmbstr size)
  778 {
  779     static const ctmbstr sizes[7] =
  780     {
  781         "60%", "70%", "80%", NULL,
  782         "120%", "150%", "200%"
  783     };
  784 
  785     /* increment of 0.8 */
  786     static const ctmbstr minussizes[] =
  787     {
  788         "100%", "80%", "64%", "51%",
  789         "40%", "32%", "26%"
  790     };
  791 
  792     /* increment of 1.2 */
  793     static const ctmbstr plussizes[] =
  794     {
  795         "100%", "120%", "144%", "172%",
  796         "207%", "248%", "298%"
  797     };
  798 
  799     if (size[0] == '\0')
  800         return NULL;
  801 
  802     if ('0' <= size[0] && size[0] <= '6')
  803     {
  804         int n = size[0] - '0';
  805         return sizes[n];
  806     }
  807 
  808     if (size[0] == '-')
  809     {
  810         if ('0' <= size[1] && size[1] <= '6')
  811         {
  812             int n = size[1] - '0';
  813             return minussizes[n];
  814         }
  815         return "smaller"; /*"70%"; */
  816     }
  817 
  818     if ('0' <= size[1] && size[1] <= '6')
  819     {
  820         int n = size[1] - '0';
  821         return plussizes[n];
  822     }
  823 
  824     return "larger"; /* "140%" */
  825 }
  826 
  827 static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
  828 {
  829     tmbchar buf[256];
  830     TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
  831     TY_(AddStyleProperty)( doc, node, buf );
  832 }
  833 
  834 static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
  835 {
  836     ctmbstr value = NULL;
  837 
  838     if (nodeIsP(node))
  839     {
  840         if (TY_(tmbstrcmp)(size, "6") == 0)
  841             value = "h1";
  842         else if (TY_(tmbstrcmp)(size, "5") == 0)
  843             value = "h2";
  844         else if (TY_(tmbstrcmp)(size, "4") == 0)
  845             value = "h3";
  846 
  847         if (value)
  848         {
  849             TidyDocFree(doc, node->element);
  850             node->element = TY_(tmbstrdup)(doc->allocator, value);
  851             TY_(FindTag)(doc, node);
  852             return;
  853         }
  854     }
  855 
  856     value = FontSize2Name(size);
  857 
  858     if (value)
  859     {
  860         tmbchar buf[64];
  861         TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
  862         TY_(AddStyleProperty)( doc, node, buf );
  863     }
  864 }
  865 
  866 static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
  867 {
  868     tmbchar buf[128];
  869     TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
  870     TY_(AddStyleProperty)( doc, node, buf );
  871 }
  872 
  873 /* force alignment value to lower case */
  874 static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
  875 {
  876     uint i;
  877     tmbchar buf[128];
  878 
  879     TY_(tmbstrcpy)( buf, "text-align: " );
  880     for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
  881     {
  882         if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
  883             break;
  884     }
  885     buf[i] = '\0';
  886     TY_(AddStyleProperty)( doc, node, buf );
  887 }
  888 
  889 /*
  890  add style properties to node corresponding to
  891  the font face, size and color attributes
  892 */
  893 static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
  894 {
  895     while (av)
  896     {
  897         if (AttrHasValue(av))
  898         {
  899             if (attrIsFACE(av))
  900                 AddFontFace( doc, node, av->value );
  901             else if (attrIsSIZE(av))
  902                 AddFontSize( doc, node, av->value );
  903             else if (attrIsCOLOR(av))
  904                 AddFontColor( doc, node, av->value );
  905         }
  906         av = av->next;
  907     }
  908 }
  909 
  910 /*
  911     Symptom: <p align=center>
  912     Action: <p style="text-align: center">
  913 */
  914 static void TextAlign( TidyDocImpl* doc, Node* node )
  915 {
  916     AttVal *av, *prev;
  917 
  918     prev = NULL;
  919 
  920     for (av = node->attributes; av; av = av->next)
  921     {
  922         if (attrIsALIGN(av))
  923         {
  924             if (prev)
  925                 prev->next = av->next;
  926             else
  927                 node->attributes = av->next;
  928 
  929             if (av->value)
  930                 AddAlign( doc, node, av->value );
  931 
  932             TY_(FreeAttribute)(doc, av);
  933             break;
  934         }
  935 
  936         prev = av;
  937     }
  938 }
  939 
  940 /*
  941     Symptom: <table bgcolor="red">
  942     Action: <table style="background-color: red">
  943 */
  944 static void TableBgColor( TidyDocImpl* doc, Node* node )
  945 {
  946     AttVal* attr;
  947     tmbchar buf[256];
  948 
  949     if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
  950     {
  951         TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
  952         TY_(RemoveAttribute)( doc, node, attr );
  953         TY_(AddStyleProperty)( doc, node, buf );
  954     }
  955 }
  956 
  957 /*
  958    The clean up rules use the pnode argument to return the
  959    next node when the original node has been deleted
  960 */
  961 
  962 /*
  963     Symptom: <dir> <li> where <li> is only child
  964     Action: coerce <dir> <li> to <div> with indent.
  965 */
  966 
  967 static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
  968 {
  969     Node *child;
  970 
  971     if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
  972     {
  973         child = node->content;
  974 
  975         if (child == NULL)
  976             return no;
  977 
  978         /* check child has no peers */
  979 
  980         if (child->next)
  981             return no;
  982 
  983         if ( !nodeIsLI(child) )
  984             return no;
  985 
  986         if ( !child->implicit )
  987             return no;
  988 
  989         /* coerce dir to div */
  990         node->tag = TY_(LookupTagDef)( TidyTag_DIV );
  991         TidyDocFree( doc, node->element );
  992         node->element = TY_(tmbstrdup)(doc->allocator, "div");
  993         TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
  994         StripOnlyChild( doc, node );
  995         return yes;
  996     }
  997 
  998     return no;
  999 }
 1000 
 1001 /*
 1002     Symptom: <center>
 1003     Action: replace <center> by <div style="text-align: center">
 1004 */
 1005 
 1006 static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
 1007 {
 1008     if ( nodeIsCENTER(node) )
 1009     {
 1010         RenameElem( doc, node, TidyTag_DIV );
 1011         TY_(AddStyleProperty)( doc, node, "text-align: center" );
 1012         return yes;
 1013     }
 1014 
 1015     return no;
 1016 }
 1017 
 1018 /* Copy child attributes to node. Duplicate attributes are overwritten.
 1019    Unique attributes (such as ID) disable the action.
 1020    Attributes style and class are not dealt with. A call to MergeStyles
 1021    will do that.
 1022 */
 1023 static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
 1024 {
 1025     AttVal *av1, *av2;
 1026     TidyAttrId id;
 1027 
 1028     /* Detect attributes that cannot be merged or overwritten. */
 1029     if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
 1030         && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
 1031         return no;
 1032 
 1033     /* Move child attributes to node. Attributes in node
 1034      can be overwritten or merged. */
 1035     for (av2 = child->attributes; av2; )
 1036     {
 1037         /* Dealt by MergeStyles. */
 1038         if (attrIsSTYLE(av2) || attrIsCLASS(av2))
 1039         {
 1040             av2 = av2->next;
 1041             continue;
 1042         }
 1043         /* Avoid duplicates in node */
 1044         if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
 1045             && (av1=TY_(AttrGetById)(node, id))!= NULL)
 1046             TY_(RemoveAttribute)( doc, node, av1 );
 1047 
 1048         /* Move attribute from child to node */
 1049         TY_(DetachAttribute)( child, av2 );
 1050         av1 = av2;
 1051         av2 = av2->next;
 1052         av1->next = NULL;
 1053         TY_(InsertAttributeAtEnd)( node, av1 );
 1054     }
 1055 
 1056     return yes;
 1057 }
 1058 
 1059 /*
 1060     Symptom <XX><XX>...</XX></XX>
 1061     Action: merge the two XXs
 1062 
 1063   For instance, this is useful after nested <dir>s used by Word
 1064   for indenting have been converted to <div>s
 1065 
 1066   If state is "no", no merging.
 1067   If state is "yes", inner element is discarded. Only Style and Class
 1068   attributes are merged using MergeStyles().
 1069   If state is "auto", atttibutes are merged as described in CopyAttrs().
 1070   Style and Class attributes are merged using MergeStyles().
 1071 */
 1072 static Bool MergeNestedElements( TidyDocImpl* doc,
 1073                                  TidyTagId Id, TidyTriState state, Node *node,
 1074                                  Node **ARG_UNUSED(pnode))
 1075 {
 1076     Node *child;
 1077 
 1078     if ( state == TidyNoState
 1079          || !TagIsId(node, Id) )
 1080         return no;
 1081 
 1082     child = node->content;
 1083 
 1084     if ( child == NULL
 1085          || child->next != NULL
 1086          || !TagIsId(child, Id) )
 1087         return no;
 1088 
 1089     if ( state == TidyAutoState
 1090          && CopyAttrs(doc, node, child) == no )
 1091         return no;
 1092 
 1093     MergeStyles( doc, node, child );
 1094     StripOnlyChild( doc, node );
 1095     return yes;
 1096 }
 1097 
 1098 /*
 1099     Symptom: <ul><li><ul>...</ul></li></ul>
 1100     Action: discard outer list
 1101 */
 1102 
 1103 static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
 1104 {
 1105     Node *child, *list;
 1106 
 1107     if ( nodeIsUL(node) || nodeIsOL(node) )
 1108     {
 1109         child = node->content;
 1110 
 1111         if (child == NULL)
 1112             return no;
 1113 
 1114         /* check child has no peers */
 1115 
 1116         if (child->next)
 1117             return no;
 1118 
 1119         list = child->content;
 1120 
 1121         if (!list)
 1122             return no;
 1123 
 1124         if (list->tag != node->tag)
 1125             return no;
 1126 
 1127         /* check list has no peers */
 1128         if (list->next)
 1129             return no;
 1130 
 1131         *pnode = list;  /* Set node to resume iteration */
 1132 
 1133         /* move inner list node into position of outer node */
 1134         list->prev = node->prev;
 1135         list->next = node->next;
 1136         list->parent = node->parent;
 1137         TY_(FixNodeLinks)(list);
 1138 
 1139         /* get rid of outer ul and its li */
 1140         child->content = NULL;
 1141         TY_(FreeNode)( doc, child ); /* See test #427841. */
 1142         child = NULL;
 1143         node->content = NULL;
 1144         node->next = NULL;
 1145         TY_(FreeNode)( doc, node );
 1146         node = NULL;
 1147 
 1148         /*
 1149           If prev node was a list the chances are this node
 1150           should be appended to that list. Word has no way of
 1151           recognizing nested lists and just uses indents
 1152         */
 1153 
 1154         if (list->prev)
 1155         {
 1156             if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
 1157                  && list->prev->last )
 1158             {
 1159                 node = list;
 1160                 list = node->prev;
 1161 
 1162                 child = list->last;  /* <li> */
 1163 
 1164                 list->next = node->next;
 1165                 TY_(FixNodeLinks)(list);
 1166 
 1167                 node->parent = child;
 1168                 node->next = NULL;
 1169                 node->prev = child->last;
 1170                 TY_(FixNodeLinks)(node);
 1171                 CleanNode( doc, node );
 1172             }
 1173         }
 1174 
 1175         return yes;
 1176     }
 1177 
 1178     return no;
 1179 }
 1180 
 1181 /* Find CSS equivalent in a SPAN element */
 1182 static
 1183 Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
 1184 {
 1185     struct
 1186     {
 1187         TidyTagId id;
 1188         ctmbstr CSSeq;
 1189         Bool deprecated;
 1190     }
 1191     const CSS_SpanEq[] =
 1192         {
 1193             { TidyTag_B, "font-weight: bold", no },
 1194             { TidyTag_I, "font-style: italic", no },
 1195             { TidyTag_S, "text-decoration: line-through", yes},
 1196             { TidyTag_STRIKE, "text-decoration: line-through", yes},
 1197             { TidyTag_U, "text-decoration: underline", yes},
 1198             { TidyTag_UNKNOWN, NULL, no }
 1199         };
 1200     uint i;
 1201 
 1202     for (i=0; CSS_SpanEq[i].CSSeq; ++i)
 1203         if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
 1204              && TagIsId(node, CSS_SpanEq[i].id) )
 1205         {
 1206             *s = CSS_SpanEq[i].CSSeq;
 1207             return yes;
 1208         }
 1209     return no; 
 1210 }
 1211 
 1212 /* Necessary conditions to apply BlockStyle(). */
 1213 static Bool CanApplyBlockStyle( Node *node )
 1214 {
 1215     if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
 1216         && !nodeIsDIV(node) && !nodeIsP(node)
 1217         && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
 1218     {
 1219         return yes;
 1220     }
 1221     return no;
 1222 }
 1223 
 1224 /*
 1225   Symptom: the only child of a block-level element is a
 1226   presentation element such as B, I or FONT
 1227 
 1228   Action: add style "font-weight: bold" to the block and
 1229   strip the <b> element, leaving its children.
 1230 
 1231   example:
 1232 
 1233     <p>
 1234       <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
 1235     </p>
 1236 
 1237   becomes:
 1238 
 1239       <p style="font-weight: bold; font-family: Arial; font-size: 6">
 1240         Draft Recommended Practice
 1241       </p>
 1242 
 1243   This code also replaces the align attribute by a style attribute.
 1244   However, to avoid CSS problems with Navigator 4, this isn't done
 1245   for the elements: caption, tr and table
 1246 */
 1247 static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
 1248 {
 1249     Node *child;
 1250     ctmbstr CSSeq;
 1251 
 1252     /* check for bgcolor */
 1253     if (   nodeIsTABLE(node)
 1254         || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
 1255         TableBgColor( doc, node );
 1256 
 1257     if (CanApplyBlockStyle(node))
 1258     {
 1259         /* check for align attribute */
 1260         if ( !nodeIsCAPTION(node) )
 1261             TextAlign( doc, node );
 1262 
 1263         child = node->content;
 1264         if (child == NULL)
 1265             return no;
 1266 
 1267         /* check child has no peers */
 1268         if (child->next)
 1269             return no;
 1270 
 1271         if ( FindCSSSpanEq(child, &CSSeq, no) )
 1272         {
 1273             MergeStyles( doc, node, child );
 1274             TY_(AddStyleProperty)( doc, node, CSSeq );
 1275             StripOnlyChild( doc, node );
 1276             return yes;
 1277         }
 1278         else if ( nodeIsFONT(child) )
 1279         {
 1280             MergeStyles( doc, node, child );
 1281             AddFontStyles( doc, node, child->attributes );
 1282             StripOnlyChild( doc, node );
 1283             return yes;
 1284         }
 1285     }
 1286 
 1287     return no;
 1288 }
 1289 
 1290 /* Necessary conditions to apply InlineStyle(). */
 1291 static Bool CanApplyInlineStyle( Node *node )
 1292 {
 1293     return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
 1294 }
 1295 
 1296 /* the only child of table cell or an inline element such as em */
 1297 static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
 1298 {
 1299     Node *child;
 1300     ctmbstr CSSeq;
 1301 
 1302     if ( CanApplyInlineStyle(node) )
 1303     {
 1304         child = node->content;
 1305 
 1306         if (child == NULL)
 1307             return no;
 1308 
 1309         /* check child has no peers */
 1310 
 1311         if (child->next)
 1312             return no;
 1313 
 1314         if ( FindCSSSpanEq(child, &CSSeq, no) )
 1315         {
 1316             MergeStyles( doc, node, child );
 1317             TY_(AddStyleProperty)( doc, node, CSSeq );
 1318             StripOnlyChild( doc, node );
 1319             return yes;
 1320         }
 1321         else if ( nodeIsFONT(child) )
 1322         {
 1323             MergeStyles( doc, node, child );
 1324             AddFontStyles( doc, node, child->attributes );
 1325             StripOnlyChild( doc, node );
 1326             return yes;
 1327         }
 1328     }
 1329 
 1330     return no;
 1331 }
 1332 
 1333 /*
 1334     Transform element to equivalent CSS
 1335 */
 1336 static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
 1337                                 Node **ARG_UNUSED(pnode)  )
 1338 {
 1339     ctmbstr CSSeq;
 1340 
 1341     /* if node is the only child of parent element then leave alone
 1342           Do so only if BlockStyle may be succesful. */
 1343     if ( node->parent->content == node && node->next == NULL &&
 1344          (CanApplyBlockStyle(node->parent)
 1345           || CanApplyInlineStyle(node->parent)) )
 1346         return no;
 1347 
 1348     if ( FindCSSSpanEq(node, &CSSeq, yes) )
 1349     {
 1350         RenameElem( doc, node, TidyTag_SPAN );
 1351         TY_(AddStyleProperty)( doc, node, CSSeq );
 1352         return yes;
 1353     }
 1354     return no;
 1355 } 
 1356 
 1357 /*
 1358   Replace font elements by span elements, deleting
 1359   the font element's attributes and replacing them
 1360   by a single style attribute.
 1361 */
 1362 static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
 1363 {
 1364     AttVal *av, *style, *next;
 1365 
 1366     if ( nodeIsFONT(node) )
 1367     {
 1368         /* if node is the only child of parent element then leave alone
 1369           Do so only if BlockStyle may be succesful. */
 1370         if ( node->parent->content == node && node->next == NULL &&
 1371              CanApplyBlockStyle(node->parent) )
 1372             return no;
 1373 
 1374         AddFontStyles( doc, node, node->attributes );
 1375 
 1376         /* extract style attribute and free the rest */
 1377         av = node->attributes;
 1378         style = NULL;
 1379 
 1380         while (av)
 1381         {
 1382             next = av->next;
 1383 
 1384             if (attrIsSTYLE(av))
 1385             {
 1386                 av->next = NULL;
 1387                 style = av;
 1388             }
 1389             else
 1390             {
 1391                 TY_(FreeAttribute)( doc, av );
 1392             }
 1393             av = next;
 1394         }
 1395 
 1396         node->attributes = style;
 1397         RenameElem( doc, node, TidyTag_SPAN );
 1398         return yes;
 1399     }
 1400 
 1401     return no;
 1402 }
 1403 
 1404 /*
 1405   Applies all matching rules to a node.
 1406 */
 1407 Node* CleanNode( TidyDocImpl* doc, Node *node )
 1408 {
 1409     Node *next = NULL;
 1410     TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
 1411     TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
 1412 
 1413     for (next = node; TY_(nodeIsElement)(node); node = next)
 1414     {
 1415         if ( Dir2Div(doc, node, &next) )
 1416             continue;
 1417 
 1418         /* Special case: true result means
 1419         ** that arg node and its parent no longer exist.
 1420         ** So we must jump back up the CreateStyleProperties()
 1421         ** call stack until we have a valid node reference.
 1422         */
 1423         if ( NestedList(doc, node, &next) )
 1424             return next;
 1425 
 1426         if ( Center2Div(doc, node, &next) )
 1427             continue;
 1428 
 1429         if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
 1430             continue;
 1431 
 1432         if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
 1433             continue;
 1434 
 1435         if ( BlockStyle(doc, node, &next) )
 1436             continue;
 1437 
 1438         if ( InlineStyle(doc, node, &next) )
 1439             continue;
 1440 
 1441         if ( InlineElementToCSS(doc, node, &next) )
 1442             continue;
 1443 
 1444         if ( Font2Span(doc, node, &next) )
 1445             continue;
 1446 
 1447         break;
 1448     }
 1449 
 1450     return next;
 1451 }
 1452 
 1453 /* Special case: if the current node is destroyed by
 1454 ** CleanNode() lower in the tree, this node and its parent
 1455 ** no longer exist.  So we must jump back up the CleanTree()
 1456 ** call stack until we have a valid node reference.
 1457 */
 1458 
 1459 static Node* CleanTree( TidyDocImpl* doc, Node *node )
 1460 {
 1461     if (node->content)
 1462     {
 1463         Node *child;
 1464         for (child = node->content; child != NULL; child = child->next)
 1465         {
 1466             child = CleanTree( doc, child );
 1467             if ( !child )
 1468                 break;
 1469         }
 1470     }
 1471 
 1472     return CleanNode( doc, node );
 1473 }
 1474 
 1475 static void DefineStyleRules( TidyDocImpl* doc, Node *node )
 1476 {
 1477     Node *child;
 1478 
 1479     if (node->content)
 1480     {
 1481         for (child = node->content;
 1482                 child != NULL; child = child->next)
 1483         {
 1484             DefineStyleRules( doc, child );
 1485         }
 1486     }
 1487 
 1488     Style2Rule( doc, node );
 1489 }
 1490 
 1491 void TY_(CleanDocument)( TidyDocImpl* doc )
 1492 {
 1493     /* placeholder.  CleanTree()/CleanNode() will not
 1494     ** zap root element 
 1495     */
 1496     CleanTree( doc, &doc->root );
 1497 
 1498     if ( cfgBool(doc, TidyMakeClean) )
 1499     {
 1500         DefineStyleRules( doc, &doc->root );
 1501         CreateStyleElement( doc );
 1502     }
 1503 }
 1504 
 1505 /* simplifies <b><b> ... </b> ...</b> etc. */
 1506 void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
 1507 {
 1508     Node *next;
 1509 
 1510     while (node)
 1511     {
 1512         next = node->next;
 1513 
 1514         if ( (nodeIsB(node) || nodeIsI(node))
 1515              && node->parent && node->parent->tag == node->tag)
 1516         {
 1517             /* strip redundant inner element */
 1518             DiscardContainer( doc, node, &next );
 1519             node = next;
 1520             continue;
 1521         }
 1522 
 1523         if ( node->content )
 1524             TY_(NestedEmphasis)( doc, node->content );
 1525 
 1526         node = next;
 1527     }
 1528 }
 1529 
 1530 
 1531 
 1532 /* replace i by em and b by strong */
 1533 void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
 1534 {
 1535     while (node)
 1536     {
 1537         if ( nodeIsI(node) )
 1538             RenameElem( doc, node, TidyTag_EM );
 1539         else if ( nodeIsB(node) )
 1540             RenameElem( doc, node, TidyTag_STRONG );
 1541 
 1542         if ( node->content )
 1543             TY_(EmFromI)( doc, node->content );
 1544 
 1545         node = node->next;
 1546     }
 1547 }
 1548 
 1549 static Bool HasOneChild(Node *node)
 1550 {
 1551     return (node->content && node->content->next == NULL);
 1552 }
 1553 
 1554 /*
 1555  Some people use dir or ul without an li
 1556  to indent the content. The pattern to
 1557  look for is a list with a single implicit
 1558  li. This is recursively replaced by an
 1559  implicit blockquote.
 1560 */
 1561 void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
 1562 {
 1563     while (node)
 1564     {
 1565         if (node->content)
 1566             TY_(List2BQ)( doc, node->content );
 1567 
 1568         if ( node->tag && node->tag->parser == TY_(ParseList) &&
 1569              HasOneChild(node) && node->content->implicit )
 1570         {
 1571             StripOnlyChild( doc, node );
 1572             RenameElem( doc, node, TidyTag_BLOCKQUOTE );
 1573             node->implicit = yes;
 1574         }
 1575 
 1576         node = node->next;
 1577     }
 1578 }
 1579 
 1580 
 1581 /*
 1582  Replace implicit blockquote by div with an indent
 1583  taking care to reduce nested blockquotes to a single
 1584  div with the indent set to match the nesting depth
 1585 */
 1586 void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
 1587 {
 1588     tmbchar indent_buf[ 32 ];
 1589     uint indent;
 1590 
 1591     while (node)
 1592     {
 1593         if ( nodeIsBLOCKQUOTE(node) && node->implicit )
 1594         {
 1595             indent = 1;
 1596 
 1597             while( HasOneChild(node) &&
 1598                    nodeIsBLOCKQUOTE(node->content) &&
 1599                    node->implicit)
 1600             {
 1601                 ++indent;
 1602                 StripOnlyChild( doc, node );
 1603             }
 1604 
 1605             if (node->content)
 1606                 TY_(BQ2Div)( doc, node->content );
 1607 
 1608             TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
 1609                              2*indent);
 1610 
 1611             RenameElem( doc, node, TidyTag_DIV );
 1612             TY_(AddStyleProperty)(doc, node, indent_buf );
 1613         }
 1614         else if (node->content)
 1615             TY_(BQ2Div)( doc, node->content );
 1616 
 1617         node = node->next;
 1618     }
 1619 }
 1620 
 1621 
 1622 static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
 1623 {
 1624     Node *check;
 1625 
 1626     for ( check=node; check; check = check->parent )
 1627     {
 1628       if ( nodeIsTD(check) )
 1629         return check;
 1630     }
 1631     return NULL;
 1632 }
 1633 
 1634 /* node is <![if ...]> prune up to <![endif]> */
 1635 static Node* PruneSection( TidyDocImpl* doc, Node *node )
 1636 {
 1637     Lexer* lexer = doc->lexer;
 1638 
 1639     for (;;)
 1640     {
 1641         if (node == NULL)
 1642             return NULL;
 1643         
 1644         ctmbstr lexbuf = lexer->lexbuf + node->start;
 1645         if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
 1646         {
 1647           Node* cell = FindEnclosingCell( doc, node );
 1648           if ( cell )
 1649           {
 1650             /* Need to put &nbsp; into cell so it doesn't look weird
 1651             */
 1652             Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
 1653             assert( (byte)'\240' == (byte)160 );
 1654             TY_(InsertNodeBeforeElement)( node, nbsp );
 1655           }
 1656         }
 1657 
 1658         /* discard node and returns next, unless it is a text node */
 1659         if ( node->type == TextNode )
 1660             node = node->next;
 1661         else
 1662             node = TY_(DiscardElement)( doc, node );
 1663 
 1664         if (node == NULL)
 1665             return NULL;
 1666         
 1667         if (node->type == SectionTag)
 1668         {
 1669             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
 1670             {
 1671                 node = PruneSection( doc, node );
 1672                 continue;
 1673             }
 1674 
 1675             if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
 1676             {
 1677                 node = TY_(DiscardElement)( doc, node );
 1678                 break;
 1679             }
 1680         }
 1681     }
 1682 
 1683     return node;
 1684 }
 1685 
 1686 void TY_(DropSections)( TidyDocImpl* doc, Node* node )
 1687 {
 1688     Lexer* lexer = doc->lexer;
 1689     while (node)
 1690     {
 1691         if (node->type == SectionTag)
 1692         {
 1693             /* prune up to matching endif */
 1694             if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
 1695                 (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
 1696             {
 1697                 node = PruneSection( doc, node );
 1698                 continue;
 1699             }
 1700 
 1701             /* discard others as well */
 1702             node = TY_(DiscardElement)( doc, node );
 1703             continue;
 1704         }
 1705 
 1706         if (node->content)
 1707             TY_(DropSections)( doc, node->content );
 1708 
 1709         node = node->next;
 1710     }
 1711 }
 1712 
 1713 static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
 1714 {
 1715     AttVal *attr, *next, *prev = NULL;
 1716 
 1717     for ( attr = node->attributes; attr; attr = next )
 1718     {
 1719         next = attr->next;
 1720 
 1721         /* special check for class="Code" denoting pre text */
 1722         /* Pass thru user defined styles as HTML class names */
 1723         if (attrIsCLASS(attr))
 1724         {
 1725             if (AttrValueIs(attr, "Code") ||
 1726                  TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
 1727             {
 1728                 prev = attr;
 1729                 continue;
 1730             }
 1731         }
 1732 
 1733         if (attrIsCLASS(attr) ||
 1734             attrIsSTYLE(attr) ||
 1735             attrIsLANG(attr)  ||
 1736              ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
 1737                (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
 1738              (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
 1739         {
 1740             if (prev)
 1741                 prev->next = next;
 1742             else
 1743                 node->attributes = next;
 1744 
 1745             TY_(FreeAttribute)( doc, attr );
 1746         }
 1747         else
 1748             prev = attr;
 1749     }
 1750 }
 1751 
 1752 /* Word2000 uses span excessively, so we strip span out */
 1753 static Node* StripSpan( TidyDocImpl* doc, Node* span )
 1754 {
 1755     Node *node, *prev = NULL, *content;
 1756 
 1757     /*
 1758      deal with span elements that have content
 1759      by splicing the content in place of the span
 1760      after having processed it
 1761     */
 1762 
 1763     TY_(CleanWord2000)( doc, span->content );
 1764     content = span->content;
 1765 
 1766     if (span->prev)
 1767         prev = span->prev;
 1768     else if (content)
 1769     {
 1770         node = content;
 1771         content = content->next;
 1772         TY_(RemoveNode)(node);
 1773         TY_(InsertNodeBeforeElement)(span, node);
 1774         prev = node;
 1775     }
 1776 
 1777     while (content)
 1778     {
 1779         node = content;
 1780         content = content->next;
 1781         TY_(RemoveNode)(node);
 1782         TY_(InsertNodeAfterElement)(prev, node);
 1783         prev = node;
 1784     }
 1785 
 1786     if (span->next == NULL)
 1787         span->parent->last = prev;
 1788 
 1789     node = span->next;
 1790     span->content = NULL;
 1791     TY_(DiscardElement)( doc, span );
 1792     return node;
 1793 }
 1794 
 1795 /* map non-breaking spaces to regular spaces */
 1796 void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
 1797 {
 1798     while ( node )
 1799     {
 1800         if ( node->content )
 1801             TY_(NormalizeSpaces)( lexer, node->content );
 1802 
 1803         if (TY_(nodeIsText)(node))
 1804         {
 1805             uint i, c;
 1806             tmbstr p = lexer->lexbuf + node->start;
 1807 
 1808             for (i = node->start; i < node->end; ++i)
 1809             {
 1810                 c = (byte) lexer->lexbuf[i];
 1811 
 1812                 /* look for UTF-8 multibyte character */
 1813                 if ( c > 0x7F )
 1814                     i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
 1815 
 1816                 if ( c == 160 )
 1817                     c = ' ';
 1818 
 1819                 p = TY_(PutUTF8)(p, c);
 1820             }
 1821             node->end = p - lexer->lexbuf;
 1822         }
 1823 
 1824         node = node->next;
 1825     }
 1826 }
 1827 
 1828 /* used to hunt for hidden preformatted sections */
 1829 static Bool NoMargins(Node *node)
 1830 {
 1831     AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
 1832 
 1833     if ( !AttrHasValue(attval) )
 1834         return no;
 1835 
 1836     /* search for substring "margin-top: 0" */
 1837     if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
 1838         return no;
 1839 
 1840     /* search for substring "margin-bottom: 0" */
 1841     if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
 1842         return no;
 1843 
 1844     return yes;
 1845 }
 1846 
 1847 /* does element have a single space as its content? */
 1848 static Bool SingleSpace( Lexer* lexer, Node* node )
 1849 {
 1850     if ( node->content )
 1851     {
 1852         node = node->content;
 1853 
 1854         if ( node->next != NULL )
 1855             return no;
 1856 
 1857         if ( node->type != TextNode )
 1858             return no;
 1859 
 1860         if ( (node->end - node->start) == 1 &&
 1861              lexer->lexbuf[node->start] == ' ' )
 1862             return yes;
 1863 
 1864         if ( (node->end - node->start) == 2 )
 1865         {
 1866             uint c = 0;
 1867             TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
 1868             if ( c == 160 )
 1869                 return yes;
 1870         }
 1871     }
 1872 
 1873     return no;
 1874 }
 1875 
 1876 /*
 1877  This is a major clean up to strip out all the extra stuff you get
 1878  when you save as web page from Word 2000. It doesn't yet know what
 1879  to do with VML tags, but these will appear as errors unless you
 1880  declare them as new tags, such as o:p which needs to be declared
 1881  as inline.
 1882 */
 1883 void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
 1884 {
 1885     /* used to a list from a sequence of bulletted p's */
 1886     Lexer* lexer = doc->lexer;
 1887     Node* list = NULL;
 1888     AttVal *next_attr, *attval;
 1889 
 1890     while ( node )
 1891     {
 1892         /* get rid of Word's xmlns attributes */
 1893         if ( nodeIsHTML(node) )
 1894         {
 1895             /* check that it's a Word 2000 document */
 1896             if ( !TY_(IsWord2000) (doc) ) /* Is. #896 */
 1897                 return;
 1898 
 1899             /* Output proprietary attributes to maintain errout compatability
 1900              * with traditional Tidy. This is a result of moving all of the
 1901              * proprietary checks to near the end of the cleanup process,
 1902              * meaning this result would not ordinarily be displayed. 
 1903              */
 1904             attval = node->attributes;
 1905             while ( attval ) {
 1906                 next_attr = attval->next;
 1907 
 1908                 /* Issue #591 - take care of a NULL attribute, too. */
 1909                 if ( !attval->attribute || ( strcmp(attval->attribute, "xmlns") != 0 ))
 1910                     TY_(ReportAttrError)(doc, node, attval, PROPRIETARY_ATTRIBUTE);
 1911                 attval = next_attr;
 1912             }
 1913 
 1914             TY_(FreeAttrs)( doc, node );
 1915         }
 1916 
 1917         /* fix up preformatted sections by looking for a
 1918         ** sequence of paragraphs with zero top/bottom margin
 1919         */
 1920         if ( nodeIsP(node) )
 1921         {
 1922             if (NoMargins(node))
 1923             {
 1924                 Node *pre, *next;
 1925                 TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
 1926 
 1927                 PurgeWord2000Attributes( doc, node );
 1928 
 1929                 if (node->content)
 1930                     TY_(CleanWord2000)( doc, node->content );
 1931 
 1932                 pre = node;
 1933                 node = node->next;
 1934 
 1935                 /* continue to strip p's */
 1936 
 1937                 while ( nodeIsP(node) && NoMargins(node) )
 1938                 {
 1939                     next = node->next;
 1940                     TY_(RemoveNode)(node);
 1941                     TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
 1942                     TY_(InsertNodeAtEnd)(pre, node);
 1943                     StripSpan( doc, node );
 1944                     node = next;
 1945                 }
 1946 
 1947                 if (node == NULL)
 1948                     break;
 1949             }
 1950         }
 1951 
 1952         if (node->tag && (node->tag->model & CM_BLOCK)
 1953             && SingleSpace(lexer, node))
 1954         {
 1955             node = StripSpan( doc, node );
 1956             continue;
 1957         }
 1958         /* discard Word's style verbiage */
 1959         if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
 1960              node->type == CommentTag )
 1961         {
 1962             node = TY_(DiscardElement)( doc, node );
 1963             continue;
 1964         }
 1965 
 1966         /* strip out all span and font tags Word scatters so liberally! */
 1967         if ( nodeIsSPAN(node) || nodeIsFONT(node) )
 1968         {
 1969             node = StripSpan( doc, node );
 1970             continue;
 1971         }
 1972 
 1973         if ( nodeIsLINK(node) )
 1974         {
 1975             AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
 1976 
 1977             if (AttrValueIs(attr, "File-List"))
 1978             {
 1979                 node = TY_(DiscardElement)( doc, node );
 1980                 continue;
 1981             }
 1982         }
 1983 
 1984         /* discards <o:p> which encodes the paragraph mark */
 1985         if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
 1986         {
 1987             /* Output proprietary elements to maintain errout compatability
 1988              * with traditional Tidy. This is a result of moving all of the
 1989              * proprietary checks to near the end of the cleanup process,
 1990              * meaning this result would not ordinarily be displayed.
 1991              */
 1992             Node* next;
 1993             TY_(Report)(doc, NULL, node, PROPRIETARY_ELEMENT);
 1994             DiscardContainer( doc, node, &next );
 1995             node = next;
 1996             continue;
 1997         }
 1998 
 1999         /* discard empty paragraphs */
 2000 
 2001         if ( node->content == NULL && nodeIsP(node) )
 2002         {
 2003             /*  Use the existing function to ensure consistency */
 2004             Node *next = TY_(TrimEmptyElement)( doc, node );
 2005             node = next;
 2006             continue;
 2007         }
 2008 
 2009         if ( nodeIsP(node) )
 2010         {
 2011             AttVal *attr, *atrStyle;
 2012             
 2013             attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
 2014             atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
 2015             /*
 2016                (JES) Sometimes Word marks a list item with the following hokie syntax
 2017                <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
 2018                 translate these into <li>
 2019             */
 2020             /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
 2021             /* map <p class="MsoListNumber"> to <ol>...</ol> */
 2022             if ( AttrValueIs(attr, "MsoListBullet") ||
 2023                  AttrValueIs(attr, "MsoListNumber") ||
 2024                  AttrContains(atrStyle, "mso-list:") )
 2025             {
 2026                 TidyTagId listType = TidyTag_UL;
 2027                 if (AttrValueIs(attr, "MsoListNumber"))
 2028                     listType = TidyTag_OL;
 2029 
 2030                 TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
 2031 
 2032                 if ( !list || TagId(list) != listType )
 2033                 {
 2034                     const Dict* tag = TY_(LookupTagDef)( listType );
 2035                     list = TY_(InferredTag)(doc, tag->id);
 2036                     TY_(InsertNodeBeforeElement)(node, list);
 2037                 }
 2038 
 2039                 PurgeWord2000Attributes( doc, node );
 2040 
 2041                 if ( node->content )
 2042                     TY_(CleanWord2000)( doc, node->content );
 2043 
 2044                 /* remove node and append to contents of list */
 2045                 TY_(RemoveNode)(node);
 2046                 TY_(InsertNodeAtEnd)(list, node);
 2047                 node = list;
 2048             }
 2049             /* map sequence of <p class="Code"> to <pre>...</pre> */
 2050             else if (AttrValueIs(attr, "Code"))
 2051             {
 2052                 Node *br = TY_(NewLineNode)(lexer);
 2053                 TY_(NormalizeSpaces)(lexer, node->content);
 2054 
 2055                 if ( !list || TagId(list) != TidyTag_PRE )
 2056                 {
 2057                     list = TY_(InferredTag)(doc, TidyTag_PRE);
 2058                     TY_(InsertNodeBeforeElement)(node, list);
 2059                 }
 2060 
 2061                 /* remove node and append to contents of list */
 2062                 TY_(RemoveNode)(node);
 2063                 TY_(InsertNodeAtEnd)(list, node);
 2064                 StripSpan( doc, node );
 2065                 TY_(InsertNodeAtEnd)(list, br);
 2066                 node = list->next;
 2067             }
 2068             else
 2069                 list = NULL;
 2070         }
 2071         else
 2072             list = NULL;
 2073 
 2074         if (!node)
 2075             return;
 2076 
 2077         /* strip out style and class attributes */
 2078         if (TY_(nodeIsElement)(node))
 2079             PurgeWord2000Attributes( doc, node );
 2080 
 2081         if (node->content)
 2082             TY_(CleanWord2000)( doc, node->content );
 2083 
 2084         node = node->next;
 2085     }
 2086 }
 2087 
 2088 Bool TY_(IsWord2000)( TidyDocImpl* doc )
 2089 {
 2090     AttVal *attval;
 2091     Node *node, *head;
 2092     Node *html = TY_(FindHTML)( doc );
 2093 
 2094     if (html && TY_(GetAttrByName)(html, "xmlns:o"))
 2095         return yes;
 2096     
 2097     /* search for <meta name="GENERATOR" content="Microsoft ..."> */
 2098     head = TY_(FindHEAD)( doc );
 2099 
 2100     if (head)
 2101     {
 2102         for (node = head->content; node; node = node->next)
 2103         {
 2104             if ( !nodeIsMETA(node) )
 2105                 continue;
 2106 
 2107             attval = TY_(AttrGetById)( node, TidyAttr_NAME );
 2108 
 2109             if ( !AttrValueIs(attval, "generator") )
 2110                 continue;
 2111 
 2112             attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
 2113 
 2114             if ( AttrContains(attval, "Microsoft") )
 2115                 return yes;
 2116         }
 2117     }
 2118 
 2119     return no;
 2120 }
 2121 
 2122 /* where appropriate move object elements from head to body */
 2123 void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
 2124 {
 2125     Node *node, *next, *head = NULL, *body = NULL;
 2126 
 2127     if (!html)
 2128         return;
 2129 
 2130     for ( node = html->content; node != NULL; node = node->next )
 2131     {
 2132         if ( nodeIsHEAD(node) )
 2133             head = node;
 2134 
 2135         if ( nodeIsBODY(node) )
 2136             body = node;
 2137     }
 2138 
 2139     if ( head != NULL && body != NULL )
 2140     {
 2141         for (node = head->content; node != NULL; node = next)
 2142         {
 2143             next = node->next;
 2144 
 2145             if ( nodeIsOBJECT(node) )
 2146             {
 2147                 Node *child;
 2148                 Bool bump = no;
 2149 
 2150                 for (child = node->content; child != NULL; child = child->next)
 2151                 {
 2152                     /* bump to body unless content is param */
 2153                     if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
 2154                          || !nodeIsPARAM(child) )
 2155                     {
 2156                             bump = yes;
 2157                             break;
 2158                     }
 2159                 }
 2160 
 2161                 if ( bump )
 2162                 {
 2163                     TY_(RemoveNode)( node );
 2164                     TY_(InsertNodeAtStart)( body, node );
 2165                 }
 2166             }
 2167         }
 2168     }
 2169 }
 2170 
 2171 
 2172 /*\
 2173 *  Issue #456 - Check meta charset
 2174 *  1. if there is no meta charset, it adds one, according to doctype, no warning.
 2175 *  2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
 2176 *  3. if it doesn't match the output encoding, and fix. Naybe no warning?
 2177 *  4. if there are duplicates, discard them, with warning.
 2178 \*/
 2179 Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
 2180 {
 2181     AttVal *charsetAttr;
 2182     AttVal *contentAttr;
 2183     AttVal *httpEquivAttr;
 2184     Bool charsetFound = no;
 2185     uint outenc = cfg(doc, TidyOutCharEncoding);
 2186     ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
 2187     Node *currentNode;
 2188     Node *head = TY_(FindHEAD)(doc);
 2189     Node *metaTag;
 2190     Node *prevNode;
 2191     TidyBuffer buf;
 2192     TidyBuffer charsetString;
 2193     /* tmbstr httpEquivAttrValue; */
 2194     /* tmbstr lcontent; */
 2195     tmbstr newValue;
 2196     Bool add_meta = cfgBool(doc, TidyMetaCharset);
 2197 
 2198     /* We can't do anything we don't have a head or encoding is NULL */
 2199     if (!head || !enc || !TY_(tmbstrlen)(enc))
 2200         return no;
 2201     if (outenc == RAW)
 2202         return no;
 2203 #ifndef NO_NATIVE_ISO2022_SUPPORT
 2204     if (outenc == ISO2022)
 2205         return no;
 2206 #endif
 2207     if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
 2208         return no; /* nothing to do here if showing body only */
 2209 
 2210     tidyBufInit(&charsetString);
 2211     /* Set up the content test 'charset=value' */
 2212     tidyBufClear(&charsetString);
 2213     tidyBufAppend(&charsetString, "charset=", 8);
 2214     tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
 2215     tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
 2216     /* process the children of the head */
 2217     /* Issue #656 - guard against 'currentNode' being set NULL in loop */
 2218     for (currentNode = head->content; currentNode; 
 2219         currentNode = (currentNode ? currentNode->next : NULL))
 2220     {
 2221         if (!nodeIsMETA(currentNode))
 2222             continue;   /* not a meta node */
 2223         charsetAttr = attrGetCHARSET(currentNode);
 2224         httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
 2225         if (!charsetAttr && !httpEquivAttr)
 2226             continue;   /* has no charset attribute */
 2227                         /*
 2228                         Meta charset comes in quite a few flavors:
 2229                         1. <meta charset="value"> - expected for (X)HTML5.
 2230                         */
 2231         if (charsetAttr && !httpEquivAttr)
 2232         {
 2233             /* we already found one, so remove the rest. */
 2234             if (charsetFound || !charsetAttr->value)
 2235             {
 2236                 prevNode = currentNode->prev;
 2237                 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
 2238                 TY_(DiscardElement)(doc, currentNode);
 2239                 currentNode = prevNode;
 2240                 continue;
 2241             }
 2242             charsetFound = yes;
 2243             /* Fix mismatched attribute value */
 2244             if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
 2245             {
 2246                 newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1);   /* allocate + 1 for 0 */
 2247                 TY_(tmbstrcpy)(newValue, enc);
 2248                 /* Note: previously http-equiv had been modified, without warning
 2249                 in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
 2250                 */
 2251                 TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
 2252                 TidyDocFree(doc, charsetAttr->value);   /* free current value */
 2253                 charsetAttr->value = newValue;
 2254             }
 2255             /* Make sure it's the first element. */
 2256             if (currentNode != head->content->next) {
 2257                 TY_(RemoveNode)(currentNode);
 2258                 TY_(InsertNodeAtStart)(head, currentNode);
 2259             }
 2260             continue;
 2261         }
 2262         /*
 2263         2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
 2264         expected for HTML4. This is normally ok - but can clash.
 2265         */
 2266         if (httpEquivAttr && !charsetAttr)
 2267         {
 2268             contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
 2269             if (!contentAttr)
 2270                 continue;   /* has no 'content' attribute */
 2271             if (!httpEquivAttr->value)
 2272             {
 2273                 prevNode = currentNode->prev;
 2274                 TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
 2275                 TY_(DiscardElement)(doc, currentNode);
 2276                 currentNode = prevNode;
 2277                 continue;
 2278             }
 2279             /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
 2280             if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
 2281                 continue;   /* is not 'content-type' */
 2282             if (!contentAttr->value)
 2283             {
 2284                 continue; /* has no 'content' attribute has NO VALUE! */
 2285             }
 2286             /* check encoding matches
 2287             If a miss-match found here, fix it. previous silently done
 2288             in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
 2289             lcontent = TY_(tmbstrtolower)(contentAttr->value);
 2290             */
 2291             if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
 2292             {
 2293                 /* we already found one, so remove the rest. */
 2294                 if (charsetFound)
 2295                 {
 2296                     prevNode = currentNode->prev;
 2297                     TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
 2298                     TY_(DiscardElement)(doc, currentNode);
 2299                     currentNode = prevNode;
 2300                     continue;
 2301                 }
 2302                 charsetFound = yes;
 2303             }
 2304             else
 2305             {
 2306                 /* fix a mis-match */
 2307                 if (charsetFound)
 2308                 {
 2309                     prevNode = currentNode->prev;
 2310                     TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
 2311                     TY_(DiscardElement)(doc, currentNode);
 2312                     currentNode = prevNode;
 2313                 }
 2314                 else
 2315                 {
 2316                     /* correct the content */
 2317                     newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
 2318                     TY_(tmbstrcpy)(newValue, "text/html; charset=");
 2319                     TY_(tmbstrcpy)(newValue + 19, enc);
 2320                     if (cfgBool(doc, TidyShowMetaChange))   /* Issue #456 - backward compatibility only */
 2321                         TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
 2322                     TidyDocFree(doc, contentAttr->value);
 2323                     contentAttr->value = newValue;
 2324                     charsetFound = yes;
 2325                 }
 2326             }
 2327             continue;
 2328         }
 2329         /*
 2330         3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
 2331         This is generally bad. Discard and warn.
 2332         */
 2333         if (httpEquivAttr && charsetAttr)
 2334         {
 2335             /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
 2336             prevNode = currentNode->prev;
 2337             TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
 2338             TY_(DiscardElement)(doc, currentNode);
 2339             currentNode = prevNode;
 2340         }
 2341     }
 2342 
 2343     /* completed head scan - add appropriate meta - if 'yes' and none exists */
 2344     if (add_meta && !charsetFound)
 2345     {
 2346         /* add appropriate meta charset tag - no warning */
 2347         metaTag = TY_(InferredTag)(doc, TidyTag_META);
 2348         switch (TY_(HTMLVersion)(doc))
 2349         {
 2350         case HT50:
 2351         case XH50:
 2352             TY_(AddAttribute)(doc, metaTag, "charset", enc);
 2353             break;
 2354         default:
 2355             tidyBufInit(&buf);
 2356             tidyBufAppend(&buf, "text/html; ", 11);
 2357             tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
 2358             tidyBufAppend(&buf, "\0", 1);   /* zero terminate the buffer */
 2359             TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
 2360             TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);  /* add 'content="<enc>"' */
 2361             tidyBufFree(&buf);
 2362         }
 2363         TY_(InsertNodeAtStart)(head, metaTag);
 2364         TY_(Report)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
 2365     }
 2366     tidyBufFree(&charsetString);
 2367     return yes;
 2368 }
 2369 
 2370 
 2371 void TY_(DropComments)(TidyDocImpl* doc, Node* node)
 2372 {
 2373     Node* next;
 2374 
 2375     while (node)
 2376     {
 2377         next = node->next;
 2378 
 2379         if (node->type == CommentTag)
 2380         {
 2381             TY_(RemoveNode)(node);
 2382             TY_(FreeNode)(doc, node);
 2383             node = next;
 2384             continue;
 2385         }
 2386 
 2387         if (node->content)
 2388             TY_(DropComments)(doc, node->content);
 2389 
 2390         node = next;
 2391     }
 2392 }
 2393 
 2394 void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
 2395 {
 2396     Node* next;
 2397 
 2398     while (node)
 2399     {
 2400         next = node->next;
 2401 
 2402         if (nodeIsFONT(node))
 2403         {
 2404             DiscardContainer(doc, node, &next);
 2405             node = next;
 2406             continue;
 2407         }
 2408 
 2409         if (node->content)
 2410             TY_(DropFontElements)(doc, node->content, &next);
 2411 
 2412         node = next;
 2413     }
 2414 }
 2415 
 2416 void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
 2417 {
 2418     Node* next;
 2419 
 2420     while (node)
 2421     {
 2422         next = node->next;
 2423 
 2424         if (nodeIsWBR(node))
 2425         {
 2426             Node* text;
 2427             text = TY_(NewLiteralTextNode)(doc->lexer, " ");
 2428             TY_(InsertNodeAfterElement)(node, text);
 2429             TY_(RemoveNode)(node);
 2430             TY_(FreeNode)(doc, node);
 2431             node = next;
 2432             continue;
 2433         }
 2434 
 2435         if (node->content)
 2436             TY_(WbrToSpace)(doc, node->content);
 2437 
 2438         node = next;
 2439    }
 2440 }
 2441 
 2442 /*
 2443   Filters from Word and PowerPoint often use smart
 2444   quotes resulting in character codes between 128
 2445   and 159. Unfortunately, the corresponding HTML 4.0
 2446   entities for these are not widely supported. The
 2447   following converts dashes and quotation marks to
 2448   the nearest ASCII equivalent. My thanks to
 2449   Andrzej Novosiolov for his help with this code.
 2450 
 2451   Note: The old code in the pretty printer applied
 2452   this to all node types and attribute values while
 2453   this routine applies it only to text nodes. First,
 2454   Microsoft Office products rarely put the relevant
 2455   characters into these tokens, second support for
 2456   them is much better now and last but not least, it
 2457   can be harmful to replace these characters since
 2458   US-ASCII quote marks are often used as syntax
 2459   characters, a simple
 2460 
 2461     <a onmouseover="alert('&#x2018;')">...</a>
 2462 
 2463   would be broken if the U+2018 is replaced by "'".
 2464   The old code would neither take care whether the
 2465   quote mark is already used as delimiter,
 2466 
 2467     <p title='&#x2018;'>...</p>
 2468 
 2469   got
 2470   
 2471     <p title='''>...</p>
 2472 
 2473   Since browser support is much better nowadays and
 2474   high-quality typography is better than ASCII it'd
 2475   be probably a good idea to drop the feature...
 2476 */
 2477 void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
 2478 {
 2479     Node* next;
 2480     Lexer* lexer = doc->lexer;
 2481 
 2482     while (node)
 2483     {
 2484         next = node->next;
 2485 
 2486         if (TY_(nodeIsText)(node))
 2487         {
 2488             uint i, c;
 2489             tmbstr p = lexer->lexbuf + node->start;
 2490 
 2491             for (i = node->start; i < node->end; ++i)
 2492             {
 2493                 c = (unsigned char) lexer->lexbuf[i];
 2494 
 2495                 if (c > 0x7F)
 2496                     i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
 2497 
 2498                 if (c >= 0x2013 && c <= 0x201E)
 2499                 {
 2500                     switch (c)
 2501                     {
 2502                     case 0x2013: /* en dash */
 2503                     case 0x2014: /* em dash */
 2504                         c = '-';
 2505                         break;
 2506                     case 0x2018: /* left single  quotation mark */
 2507                     case 0x2019: /* right single quotation mark */
 2508                     case 0x201A: /* single low-9 quotation mark */
 2509                         c = '\'';
 2510                         break;
 2511                     case 0x201C: /* left double  quotation mark */
 2512                     case 0x201D: /* right double quotation mark */
 2513                     case 0x201E: /* double low-9 quotation mark */
 2514                         c = '"';
 2515                         break;
 2516                     }
 2517                 }
 2518 
 2519                 p = TY_(PutUTF8)(p, c);
 2520             }
 2521 
 2522             node->end = p - lexer->lexbuf;
 2523         }
 2524 
 2525         if (node->content)
 2526             TY_(DowngradeTypography)(doc, node->content);
 2527 
 2528         node = next;
 2529     }
 2530 }
 2531 
 2532 void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
 2533 {
 2534     Node* next;
 2535 
 2536     while (node)
 2537     {
 2538         next = node->next;
 2539 
 2540         if (node->tag && node->tag->parser == TY_(ParsePre))
 2541         {
 2542             TY_(NormalizeSpaces)(doc->lexer, node->content);
 2543             node = next;
 2544             continue;
 2545         }
 2546 
 2547         if (node->content)
 2548             TY_(ReplacePreformattedSpaces)(doc, node->content);
 2549 
 2550         node = next;
 2551     }
 2552 }
 2553 
 2554 void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
 2555 {
 2556     Node* next;
 2557 
 2558     while (node)
 2559     {
 2560         next = node->next;
 2561 
 2562         if (node->type == CDATATag)
 2563             node->type = TextNode;
 2564 
 2565         if (node->content)
 2566             TY_(ConvertCDATANodes)(doc, node->content);
 2567 
 2568         node = next;
 2569     }
 2570 }
 2571 
 2572 /*
 2573   FixLanguageInformation ensures that the document contains (only)
 2574   the attributes for language information desired by the output
 2575   document type. For example, for XHTML 1.0 documents both
 2576   'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
 2577   is desired and for HTML 4.01 only 'lang' is desired.
 2578 */
 2579 void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
 2580 {
 2581     Node* next;
 2582 
 2583     while (node)
 2584     {
 2585         next = node->next;
 2586 
 2587         /* todo: report modifications made here to the report system */
 2588 
 2589         if (TY_(nodeIsElement)(node))
 2590         {
 2591             AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
 2592             AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
 2593 
 2594             if (lang && xmlLang)
 2595             {
 2596                 /*
 2597                   todo: check whether both attributes are in sync,
 2598                   here or elsewhere, where elsewhere is probably
 2599                   preferable.
 2600                   AD - March 2005: not mandatory according the standards.
 2601                 */
 2602             }
 2603             else if (lang && wantXmlLang)
 2604             {
 2605                 if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
 2606                     & doc->lexer->versionEmitted)
 2607                     TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
 2608             }
 2609             else if (xmlLang && wantLang)
 2610             {
 2611                 if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
 2612                     & doc->lexer->versionEmitted)
 2613                     TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
 2614             }
 2615 
 2616             if (lang && !wantLang)
 2617                 TY_(RemoveAttribute)(doc, node, lang);
 2618             
 2619             if (xmlLang && !wantXmlLang)
 2620                 TY_(RemoveAttribute)(doc, node, xmlLang);
 2621         }
 2622 
 2623         if (node->content)
 2624             TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
 2625 
 2626         node = next;
 2627     }
 2628 }
 2629 
 2630 /*
 2631   Set/fix/remove <html xmlns='...'>
 2632 */
 2633 void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
 2634 {
 2635     Node* html = TY_(FindHTML)(doc);
 2636     AttVal* xmlns;
 2637 
 2638     if (!html)
 2639         return;
 2640 
 2641     xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
 2642 
 2643     if (wantXmlns)
 2644     {
 2645         if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
 2646             TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
 2647     }
 2648     else if (xmlns)
 2649     {
 2650         TY_(RemoveAttribute)(doc, html, xmlns);
 2651     }
 2652 }
 2653 
 2654 /*
 2655   ...
 2656 */
 2657 void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
 2658 {
 2659     Node* next;
 2660 
 2661     while (node)
 2662     {
 2663         next = node->next;
 2664 
 2665         if (TY_(IsAnchorElement)(doc, node))
 2666         {
 2667             AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
 2668             AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
 2669             Bool hadName = name!=NULL;
 2670             Bool hadId = id!=NULL;
 2671             Bool IdEmitted = no;
 2672             Bool NameEmitted = no;
 2673 
 2674             /* todo: how are empty name/id attributes handled? */
 2675 
 2676             if (name && id)
 2677             {
 2678                 Bool NameHasValue = AttrHasValue(name);
 2679                 Bool IdHasValue = AttrHasValue(id);
 2680                 if ( (NameHasValue != IdHasValue) ||
 2681                      (NameHasValue && IdHasValue &&
 2682                      TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
 2683                     TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
 2684             }
 2685             else if (name && wantId)
 2686             {
 2687                 if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
 2688                     & doc->lexer->versionEmitted)
 2689                 {
 2690                     if (TY_(IsValidHTMLID)(name->value))
 2691                     {
 2692                         TY_(RepairAttrValue)(doc, node, "id", name->value);
 2693                         IdEmitted = yes;
 2694                     }
 2695                     else
 2696                         TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
 2697                  }
 2698             }
 2699             else if (id && wantName)
 2700             {
 2701                 if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
 2702                     & doc->lexer->versionEmitted)
 2703                 {
 2704                     /* todo: do not assume id is valid */
 2705                     TY_(RepairAttrValue)(doc, node, "name", id->value);
 2706                     NameEmitted = yes;
 2707                 }
 2708             }
 2709 
 2710             if (id && !wantId
 2711                 /* make sure that Name has been emitted if requested */
 2712                 && (hadName || !wantName || NameEmitted) ) {
 2713                 if (!wantId && !wantName)
 2714                     TY_(RemoveAnchorByNode)(doc, id->value, node);
 2715                 TY_(RemoveAttribute)(doc, node, id);
 2716             }
 2717 
 2718             if (name && !wantName
 2719                 /* make sure that Id has been emitted if requested */
 2720                 && (hadId || !wantId || IdEmitted) ) {
 2721                 if (!wantId && !wantName)
 2722                     TY_(RemoveAnchorByNode)(doc, name->value, node);
 2723                 TY_(RemoveAttribute)(doc, node, name);
 2724             }
 2725         }
 2726 
 2727         if (node->content)
 2728             TY_(FixAnchors)(doc, node->content, wantName, wantId);
 2729 
 2730         node = next;
 2731     }
 2732 }
 2733 
 2734 /* Issue #567 - move style elements from body to head 
 2735  * ==================================================
 2736  */
 2737 static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
 2738 {
 2739     Node *next;
 2740     while (node)
 2741     {
 2742         next = node->next;  /* get 'next' now , in case the node is moved */
 2743         /* dbg_show_node(doc, node, 0, indent); */
 2744         if (nodeIsSTYLE(node))
 2745         {
 2746             if (fix)
 2747             {
 2748                 TY_(RemoveNode)(node); /* unhook style node from body */
 2749                 TY_(InsertNodeAtEnd)(head, node);   /* add to end of head */
 2750                 TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
 2751             }
 2752             else
 2753             {
 2754                 TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
 2755             }
 2756         }
 2757         else if (node->content)
 2758         {
 2759             StyleToHead(doc, head, node->content, fix, indent + 1);
 2760         }
 2761         node = next;    /* process the 'next', if any */
 2762     }
 2763 }
 2764 
 2765 
 2766 void TY_(CleanStyle)(TidyDocImpl* doc, Node *html)
 2767 {
 2768     Node *head = NULL, *body = NULL;
 2769     Bool fix = cfgBool(doc, TidyStyleTags);
 2770 
 2771     if (!html)
 2772         return; /* oops, not given a start node */
 2773 
 2774     head = TY_(FindHEAD)( doc );
 2775     body = TY_(FindBody)( doc );
 2776 
 2777     if ((head != NULL) && (body != NULL))
 2778     {
 2779         StyleToHead(doc, head, body, fix, 0); /* found head and body */
 2780     }
 2781 }
 2782 /* ==================================================
 2783  */
 2784 
 2785 /*
 2786  * CleanHead - clean the head node, if it exists, and we
 2787  * are going to show it in the output.
 2788  * Issue #692 - Remove multiple title elements
 2789  */
 2790 void TY_(CleanHead)(TidyDocImpl* doc)
 2791 {
 2792     Node *head, *node, *next;
 2793     uint titles = 0;
 2794     if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
 2795         return; /* not going to show head, so forget it */
 2796     head = TY_(FindHEAD)(doc);
 2797     if (!head)
 2798         return;
 2799     node = head->content;
 2800     while (node)
 2801     {
 2802         next = node->next;  /* get any 'next' */
 2803         if (nodeIsTITLE(node))
 2804         {
 2805             titles++;
 2806             if (titles > 1)
 2807             {
 2808                 TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
 2809                 TY_(DiscardElement)(doc, node); /* delete this node */
 2810             }
 2811         }
 2812         node = next;
 2813     }
 2814 }
 2815 
 2816 /*
 2817  * local variables:
 2818  * mode: c
 2819  * indent-tabs-mode: nil
 2820  * c-basic-offset: 4
 2821  * eval: (c-set-offset 'substatement-open 0)
 2822  * end:
 2823  */