"Fossies" - the Fresh Open Source Software Archive

Member "tidy-html5-5.8.0/src/gdoc.c" (16 Jul 2021, 5107 Bytes) of package /linux/www/tidy-html5-5.8.0.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and code folding option. Alternatively you can here view or download the uninterpreted source code file. For more information about "gdoc.c" see the Fossies "Dox" file reference documentation and the last Fossies "Diffs" side-by-side code changes report: 5.4.0_vs_5.6.0.

    1 /*
    2   gdoc.c -- clean up html exported from Google Docs
    3 
    4   (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
    5   See tidy.h for the copyright notice.
    6 
    7   Filters from other formats such as Microsoft Word
    8   often make excessive use of presentation markup such
    9   as font tags, B, I, and the align attribute. By applying
   10   a set of production rules, it is straightforward to
   11   transform this to use CSS.
   12 
   13   Some rules replace some of the children of an element by
   14   style properties on the element, e.g.
   15 
   16   <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
   17 
   18   Such rules are applied to the element's content and then
   19   to the element itself until no more of the rules apply.
   20   Having applied all the rules to an element, it will have
   21   a style attribute with one or more properties. 
   22 
   23   Other rules strip the element they apply to, replacing
   24   it by style properties on the contents, e.g.
   25   
   26   <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
   27       
   28   These rules are applied to an element before processing
   29   its content and replace the current element by the first
   30   element in the exposed content.
   31 
   32   After applying both sets of rules, you can replace the
   33   style attribute by a class value and style rule in the
   34   document head. To support this, an association of styles
   35   and class names is built.
   36 
   37   A naive approach is to rely on string matching to test
   38   when two property lists are the same. A better approach
   39   would be to first sort the properties before matching.
   40 
   41 */
   42 
   43 #include <stdio.h>
   44 #include <stdlib.h>
   45 #include <string.h>
   46 
   47 #include "tidy-int.h"
   48 #include "gdoc.h"
   49 #include "lexer.h"
   50 #include "parser.h"
   51 #include "tags.h"
   52 #include "attrs.h"
   53 #include "message.h"
   54 #include "tmbstr.h"
   55 #include "utf8.h"
   56 
   57 /*
   58   Extricate "element", replace it by its content and delete it.
   59 */
   60 static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
   61 {
   62     if (element->content)
   63     {
   64         Node *node, *parent = element->parent;
   65 
   66         element->last->next = element->next;
   67 
   68         if (element->next)
   69         {
   70             element->next->prev = element->last;
   71         }
   72         else
   73             parent->last = element->last;
   74 
   75         if (element->prev)
   76         {
   77             element->content->prev = element->prev;
   78             element->prev->next = element->content;
   79         }
   80         else
   81             parent->content = element->content;
   82 
   83         for (node = element->content; node; node = node->next)
   84             node->parent = parent;
   85 
   86         *pnode = element->content;
   87 
   88         element->next = element->content = NULL;
   89         TY_(FreeNode)(doc, element);
   90     }
   91     else
   92     {
   93         *pnode = TY_(DiscardElement)(doc, element);
   94     }
   95 }
   96 
   97 static void CleanNode( TidyDocImpl* doc, Node *node )
   98 {
   99     Node *child, *next;
  100 
  101     if (node->content)
  102     {
  103         for (child = node->content; child != NULL; child = next)
  104         {
  105             next = child->next;
  106 
  107             if (TY_(nodeIsElement)(child))
  108             {
  109                 if (nodeIsSTYLE(child))
  110                     TY_(DiscardElement)(doc, child);
  111                 if (nodeIsP(child) && !child->content)
  112                     TY_(DiscardElement)(doc, child);
  113                 else if (nodeIsSPAN(child))
  114                     DiscardContainer( doc, child, &next);
  115                 else if (nodeIsA(child) && !child->content)
  116                  {
  117                     AttVal *id = TY_(GetAttrByName)( child, "name" );
  118                     /* Recent Google Docs is using "id" instead of "name" in
  119                     ** the exported html.
  120                     */
  121                     if (!id)
  122                         id = TY_(GetAttrByName)( child, "id" );
  123 
  124                     if (id)
  125                         TY_(RepairAttrValue)( doc, child->parent, "id", id->value );
  126 
  127                     TY_(DiscardElement)(doc, child);
  128                 }
  129                 else
  130                 {
  131                     if (child->attributes)
  132                         TY_(DropAttrByName)( doc, child, "class" );
  133 
  134                     CleanNode(doc, child);
  135                 }
  136             }
  137         }
  138     }
  139 }
  140 
  141 /* insert meta element to force browser to recognize doc as UTF8 */
  142 static void SetUTF8( TidyDocImpl* doc )
  143 {
  144     Node *head = TY_(FindHEAD)( doc );
  145 
  146     if (head)
  147     {
  148         Node *node = TY_(InferredTag)(doc, TidyTag_META);
  149         TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
  150         TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
  151         TY_(InsertNodeAtStart)( head, node );
  152     }
  153 }
  154 
  155 /* clean html exported by Google Docs
  156 
  157     - strip the script element, as the style sheet is a mess
  158     - strip class attributes
  159     - strip span elements, leaving their content in place
  160     - replace <a name=...></a> by id on parent element
  161     - strip empty <p> elements
  162 */
  163 void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
  164 {
  165     /* placeholder.  CleanTree()/CleanNode() will not
  166     ** zap root element 
  167     */
  168     CleanNode( doc, &doc->root );
  169     SetUTF8( doc );
  170 }
  171 
  172 /*
  173  * local variables:
  174  * mode: c
  175  * indent-tabs-mode: nil
  176  * c-basic-offset: 4
  177  * eval: (c-set-offset 'substatement-open 0)
  178  * end:
  179  */