"Fossies" - the Fresh Open Source Software Archive

Member "google-gadgets-for-linux-0.11.2/extensions/libxml2_xml_parser/libxml2_xml_parser.cc" (28 Dec 2009, 30996 Bytes) of package /linux/misc/old/google-gadgets-for-linux-0.11.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ syntax highlighting (style: standard), with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file.

    1 /*
    2   Copyright 2008 Google Inc.
    3 
    4   Licensed under the Apache License, Version 2.0 (the "License");
    5   you may not use this file except in compliance with the License.
    6   You may obtain a copy of the License at
    7 
    8        http://www.apache.org/licenses/LICENSE-2.0
    9 
   10   Unless required by applicable law or agreed to in writing, software
   11   distributed under the License is distributed on an "AS IS" BASIS,
   12   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   13   See the License for the specific language governing permissions and
   14   limitations under the License.
   15 */
   16 
   17 #include <cstring>
   18 #include <cmath>
   19 #include <libxml/encoding.h>
   20 #include <libxml/parser.h>
   21 // For xmlCreateMemoryParserCtxt and xmlParseName.
   22 #include <libxml/parserInternals.h>
   23 #include <libxml/tree.h>
   24 
   25 #include <ggadget/logger.h>
   26 #include <ggadget/string_utils.h>
   27 #include <ggadget/xml_parser_interface.h>
   28 #include <ggadget/xml_dom.h>
   29 
   30 namespace ggadget {
   31 namespace libxml2 {
   32 
   33 // Entity will be ignored if size bigger than this limit.
   34 static const size_t kMaxEntitySize = 65536U;
   35 
   36 static inline char *FromXmlCharPtr(xmlChar *xml_char_ptr) {
   37   return reinterpret_cast<char *>(xml_char_ptr);
   38 }
   39 
   40 static inline const char *FromXmlCharPtr(const xmlChar *xml_char_ptr) {
   41   return reinterpret_cast<const char *>(xml_char_ptr);
   42 }
   43 
   44 static inline const xmlChar *ToXmlCharPtr(const char *char_ptr) {
   45   return reinterpret_cast<const xmlChar *>(char_ptr);
   46 }
   47 // There is no non-const version of ToXmlChar *because it's not allowed to
   48 // transfer a non-const char * to libxml.
   49 
   50 static const char kXMLTag[] = {
   51   '<', '?', 'x', 'm', 'l', ' '
   52 };
   53 static const char kXMLTagUTF8[] = {
   54   '\xEF', '\xBB', '\xBF', '<', '?', 'x', 'm', 'l', ' '
   55 };
   56 static const char kXMLTagUTF16LE[] = {
   57   '\xFF', '\xFE', '<', 0, '?', 0, 'x', 0, 'm', 0, 'l', 0, ' ', 0
   58 };
   59 static const char kXMLTagUTF16BE[] = {
   60   '\xFE', '\xFF', 0, '<', 0, '?', 0, 'x', 0, 'm', 0, 'l', 0, ' '
   61 };
   62 static const char kXMLTagBOMLessUTF16LE[] = {
   63   '<', 0, '?', 0, 'x', 0, 'm', 0, 'l', 0, ' ', 0
   64 };
   65 static const char kXMLTagBOMLessUTF16BE[] = {
   66   0, '<', 0, '?', 0, 'x', 0, 'm', 0, 'l', 0, ' '
   67 };
   68 static const char kXMLTagUTF32LE[] = {
   69   '\xFF', '\xFE', 0, 0, '<', 0, 0, 0, '?', 0, 0, 0,
   70   'x', 0, 0, 0, 'm', 0, 0, 0, 'l', 0, 0, 0, ' ', 0, 0, 0
   71 };
   72 static const char kXMLTagUTF32BE[] = {
   73   0, 0, '\xFE', '\xFF', 0, 0, 0, '<', 0, 0, 0, '?',
   74   0, 0, 0, 'x', 0, 0, 0, 'm', 0, 0, 0, 'l', 0, 0, 0, ' '
   75 };
   76 // BOM-less UTF32 is seldom used, so we won't check.
   77 
   78 #define STARTS_WITH(content_ptr, content_size, pattern) \
   79     ((content_size) >= sizeof(pattern) && \
   80      memcmp((content_ptr), (pattern), sizeof(pattern)) == 0)
   81 
   82 // Used in ConvertStringToUTF8 to detect errors during conversion,
   83 // and in ParseXML to let the XML error go into our LOG pipe.
   84 // FIXME: Using global error reporter may have side-effect if another module
   85 // linked to our binary also uses libxml2, especially in other threads.
   86 static bool g_error_occurred = false;
   87 static std::string g_error_buffer;
   88 static void ErrorFunc(void *ctx, const char *msg, ...) {
   89   GGL_UNUSED(ctx);
   90   va_list ap;
   91   va_start(ap, msg);
   92   StringAppendVPrintf(&g_error_buffer, msg, ap);
   93   va_end(ap);
   94   g_error_occurred = true;
   95   if (g_error_buffer.size() &&
   96       g_error_buffer[g_error_buffer.size() - 1] == '\n') {
   97     // Only send to our log when a line break is received.
   98     g_error_buffer.erase(g_error_buffer.size() - 1);
   99     LOG("%s", g_error_buffer.c_str());
  100     g_error_buffer.clear();
  101   }
  102 }
  103 
  104 // Converts a string in given encoding to a utf8.
  105 // Here use libxml routines instead of normal iconv routines to simplify
  106 // compile-time dependencies.
  107 static bool ConvertStringToUTF8(const std::string &content,
  108                                 const char *encoding,
  109                                 std::string *utf8_content) {
  110   ASSERT(encoding);
  111   if (utf8_content)
  112     utf8_content->clear();
  113   if (content.empty())
  114     return true;
  115 
  116   xmlCharEncodingHandler *handler = xmlFindCharEncodingHandler(encoding);
  117   if (!handler)
  118     return false;
  119 
  120   xmlBuffer *input_buffer = xmlBufferCreateStatic(
  121       const_cast<char *>(content.c_str()), content.length());
  122   xmlBuffer *output_buffer = xmlBufferCreate();
  123   // xmlCharEncInFunc's result > 0 even if encoding error occurred, so use
  124   // ErrorFunc to detect errors.
  125   xmlGenericErrorFunc old_error_func = xmlGenericError;
  126   xmlSetGenericErrorFunc(NULL, ErrorFunc);
  127   g_error_occurred = false;
  128   bool success = false;
  129   int result = xmlCharEncInFunc(handler, output_buffer, input_buffer);
  130   xmlSetGenericErrorFunc(NULL, old_error_func);
  131   if (!g_error_occurred && result > 0) {
  132     ASSERT(result == xmlBufferLength(output_buffer));
  133     const char *output = FromXmlCharPtr(xmlBufferContent(output_buffer));
  134     if (IsLegalUTF8String(output, result)) {
  135       success = true;
  136       if (utf8_content)
  137         utf8_content->append(output, result);
  138     }
  139   }
  140 
  141   xmlCharEncCloseFunc(handler);
  142   xmlBufferFree(input_buffer);
  143   xmlBufferFree(output_buffer);
  144   return success;
  145 }
  146 
  147 static std::string GetXMLEncodingDecl(const std::string &xml) {
  148   std::string result;
  149   if (!STARTS_WITH(xml.c_str(), xml.size(), kXMLTag) &&
  150       !STARTS_WITH(xml.c_str(), xml.size(), kXMLTagUTF8))
  151     return result;
  152   size_t end_decl_pos = xml.find("?>");
  153   if (end_decl_pos == std::string::npos)
  154     return result;
  155   size_t encoding_pos = xml.rfind(" encoding=\"", end_decl_pos);
  156   if (encoding_pos == std::string::npos)
  157     return result;
  158   encoding_pos += 11;
  159   size_t end_encoding_pos = xml.find('"', encoding_pos);
  160   if (end_encoding_pos == std::string::npos)
  161     return result;
  162 
  163   return xml.substr(encoding_pos, end_encoding_pos - encoding_pos);
  164 }
  165 
  166 static void ReplaceXMLEncodingDecl(std::string *xml) {
  167   if (!STARTS_WITH(xml->c_str(), xml->size(), kXMLTag) &&
  168       !STARTS_WITH(xml->c_str(), xml->size(), kXMLTagUTF8))
  169     return;
  170 
  171   size_t end_decl_pos = xml->find("?>");
  172   if (end_decl_pos == std::string::npos)
  173     return;
  174   size_t encoding_pos = xml->rfind(" encoding=\"", end_decl_pos);
  175   if (encoding_pos == std::string::npos)
  176     return;
  177   size_t end_encoding_pos = xml->find('"', encoding_pos + 11);
  178   if (end_encoding_pos == std::string::npos)
  179     return;
  180   xml->replace(encoding_pos, end_encoding_pos - encoding_pos + 1,
  181                " encoding=\"UTF-8\"");
  182 }
  183 
  184 struct ContextData {
  185   const StringMap *extra_entities;
  186   getEntitySAXFunc original_get_entity_handler;
  187   entityDeclSAXFunc original_entity_decl_handler;
  188 };
  189 
  190 static void EntityDeclHandler(void *ctx, const xmlChar *name, int type,
  191                               const xmlChar *public_id,
  192                               const xmlChar *system_id,
  193                               xmlChar *content) {
  194   if (type == 1 && system_id == NULL) {
  195     // Only handle internal entities.
  196     xmlParserCtxt *ctxt = static_cast<xmlParserCtxt *>(ctx);
  197     ASSERT(ctxt && ctxt->_private);
  198     ContextData *data = static_cast<ContextData *>(ctxt->_private);
  199     data->original_entity_decl_handler(ctx, name, type, public_id,
  200                                        system_id, content);
  201   } else {
  202     DLOG("External or bad entity decl ignored: %d %s %s %s %s",
  203          type, name, public_id, system_id, content);
  204   }
  205 }
  206 
  207 // Expand the entity and check the length.
  208 static void ExpandEntity(xmlEntity *entity) {
  209   if (entity->children && (entity->children->next ||
  210                            entity->children->type != XML_TEXT_NODE)) {
  211     xmlNode *text = xmlNewText(ToXmlCharPtr(""));
  212     size_t size = 0;
  213     for (xmlNode *child = entity->children; child; child = child->next) {
  214       xmlChar *child_content = xmlNodeGetContent(child);
  215       size_t child_size = strlen(FromXmlCharPtr(child_content));
  216       size += child_size;
  217       if (size > kMaxEntitySize) {
  218         LOG("Entity '%s' is too long, truncated", entity->name);
  219         xmlFree(child_content);
  220         break;
  221       }
  222       xmlNodeAddContentLen(text, child_content, static_cast<int>(child_size));
  223       xmlFree(child_content);
  224     }
  225     xmlFreeNodeList(entity->children);
  226     entity->children = NULL;
  227     xmlAddChild(reinterpret_cast<xmlNode *>(entity), text);
  228     entity->length = static_cast<int>(size);
  229   }
  230 }
  231 
  232 static xmlEntity *GetEntityHandler(void *ctx, const xmlChar *name) {
  233   xmlParserCtxt *ctxt = static_cast<xmlParserCtxt *>(ctx);
  234   ASSERT(ctxt && ctxt->_private);
  235   ContextData *data = static_cast<ContextData *>(ctxt->_private);
  236   xmlEntity *result = data->original_get_entity_handler(ctx, name);
  237   if (result) {
  238     ExpandEntity(result);
  239   } else if (ctxt->myDoc) {
  240     if (!ctxt->myDoc->intSubset) {
  241       ctxt->myDoc->intSubset =
  242           xmlCreateIntSubset(ctxt->myDoc, NULL, NULL, NULL);
  243     }
  244     StringMap::const_iterator it =
  245         data->extra_entities->find(FromXmlCharPtr(name));
  246     if (it != data->extra_entities->end()) {
  247       xmlChar *encoded_value =
  248           xmlEncodeSpecialChars(NULL, ToXmlCharPtr(it->second.c_str()));
  249       result = xmlAddDocEntity(ctxt->myDoc, name, XML_INTERNAL_GENERAL_ENTITY,
  250                                NULL, NULL, encoded_value);
  251       xmlFree(encoded_value);
  252     } else {
  253       LOG("Entity '%s' not defined.", name);
  254       // If the entity is not defined, just use it's name.
  255       result = xmlAddDocEntity(ctxt->myDoc, name, XML_INTERNAL_GENERAL_ENTITY,
  256                                NULL, NULL, name);
  257     }
  258   }
  259   return result;
  260 }
  261 
  262 static xmlDoc *ParseXML(const std::string &xml,
  263                         const StringMap *extra_entities,
  264                         const char *filename,
  265                         const char *encoding_hint,
  266                         const char *encoding_fallback,
  267                         std::string *encoding,
  268                         std::string *utf8_content) {
  269   std::string converted_xml;
  270   std::string use_encoding;
  271   // Indicates whether the encoding is successfully converted before libxml2
  272   // parsing, or is detected by libxml2.
  273   bool converted = false;
  274   if (encoding) encoding->clear();
  275 
  276   // Although libxml2 will do almost the same things, we must do it ourselves
  277   // to make encoding_hint have higher priority than the encoding declaration
  278   // with xml file, according to the XML standard.
  279   if (!DetectUTFEncoding(xml, &use_encoding) &&
  280       encoding_hint && *encoding_hint) {
  281     use_encoding = encoding_hint;
  282   }
  283 
  284   xmlDoc *result = NULL;
  285   bool retry;
  286   do {
  287     retry = false;
  288     if (!use_encoding.empty()) {
  289       if (ConvertStringToUTF8(xml, use_encoding.c_str(), &converted_xml)) {
  290         converted = true;
  291         if (utf8_content)
  292           *utf8_content = converted_xml;
  293         // We have successfully converted the encoding to UTF8, insert a BOM and
  294         // remove the original encoding declaration to prevent libxml2 from
  295         // converting again.
  296         ReplaceXMLEncodingDecl(&converted_xml);
  297       } else if (encoding_fallback && use_encoding != encoding_fallback) {
  298         // Encoding conversion failed, try fallback_encoding if it has not
  299         // been tried.
  300         use_encoding = encoding_fallback;
  301         retry = true;
  302         continue;
  303       }
  304     } else {
  305       converted_xml = xml;
  306     }
  307 
  308     xmlParserCtxt *ctxt = xmlCreateMemoryParserCtxt(
  309         converted_xml.c_str(), static_cast<int>(converted_xml.length()));
  310     if (!ctxt)
  311       return NULL;
  312 
  313     ASSERT(ctxt->sax);
  314     ContextData data;
  315     ctxt->_private = &data;
  316     if (extra_entities) {
  317       // Hook getEntity handler to provide extra entities.
  318       data.extra_entities = extra_entities;
  319       data.original_get_entity_handler = ctxt->sax->getEntity;
  320       ctxt->sax->getEntity = GetEntityHandler;
  321     }
  322 
  323     // Disable external entities to avoid security troubles.
  324     data.original_entity_decl_handler = ctxt->sax->entityDecl;
  325     ctxt->sax->entityDecl = EntityDeclHandler;
  326     ctxt->sax->resolveEntity = NULL;
  327 
  328     // Let the built-in libxml2 error reporter print the correct filename.
  329     ctxt->input->filename = xmlMemStrdup(filename);
  330 
  331     xmlGenericErrorFunc old_error_func = xmlGenericError;
  332     xmlSetGenericErrorFunc(NULL, ErrorFunc);
  333     xmlParseDocument(ctxt);
  334     xmlSetGenericErrorFunc(NULL, old_error_func);
  335 
  336     if (ctxt->wellFormed) {
  337       // Successfully parsed the document.
  338       result = ctxt->myDoc;
  339       if (!converted) {
  340         if (ctxt->input && ctxt->input->encoding)
  341           use_encoding = FromXmlCharPtr(ctxt->input->encoding);
  342         else
  343           use_encoding = "UTF-8";
  344         if (utf8_content)
  345           ConvertStringToUTF8(xml, use_encoding.c_str(), utf8_content);
  346       }
  347     } else if ((ctxt->errNo == XML_ERR_INVALID_CHAR ||
  348                 ctxt->errNo == XML_ERR_UNKNOWN_ENCODING ||
  349                 ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) &&
  350                encoding_fallback && use_encoding != encoding_fallback) {
  351       xmlFreeDoc(ctxt->myDoc);
  352       ctxt->myDoc = NULL;
  353       // libxml2 encoding conversion failed, try fallback_encoding if it has
  354       // not been tried.
  355       use_encoding = encoding_fallback;
  356       retry = true;
  357     } else {
  358       xmlFreeDoc(ctxt->myDoc);
  359       ctxt->myDoc = NULL;
  360 
  361       if (!converted) {
  362         use_encoding.clear();
  363         if (utf8_content)
  364           utf8_content->clear();
  365       }
  366     }
  367     xmlFreeParserCtxt(ctxt);
  368   } while (retry);
  369 
  370   if (encoding)
  371     *encoding = use_encoding;
  372   return result;
  373 }
  374 
  375 static bool IsBlankText(const char *text) {
  376   for (const char *p = text; *p; p++) {
  377     if (strchr(" \r\n\t", *p) == NULL)
  378       return false;
  379   }
  380   return true;
  381 }
  382 
  383 static bool IsTextNode(xmlNode *xmlnode) {
  384   return xmlnode && (xmlnode->type == XML_TEXT_NODE ||
  385                      xmlnode->type == XML_ENTITY_REF_NODE);
  386 }
  387 
  388 static void ConvertCharacterDataIntoDOM(DOMDocumentInterface *domdoc,
  389                                         DOMNodeInterface *parent,
  390                                         xmlNode *xmlnode) {
  391   char *text = FromXmlCharPtr(xmlNodeGetContent(xmlnode));
  392   UTF16String utf16_text;
  393   if (text) {
  394     if (domdoc->PreservesWhiteSpace() ||
  395         xmlnode->type != XML_TEXT_NODE ||
  396         IsTextNode(xmlnode->prev) || IsTextNode(xmlnode->next) ||
  397         !IsBlankText(text)) {
  398       // Don't trim the text. The caller can trim based on their own
  399       // requirements.
  400       ConvertStringUTF8ToUTF16(text, strlen(text), &utf16_text);
  401     }
  402     xmlFree(text);
  403   }
  404 
  405   DOMCharacterDataInterface *data = NULL;
  406   switch (xmlnode->type) {
  407     case XML_TEXT_NODE:
  408       // Don't create empty text nodes.
  409       if (!utf16_text.empty())
  410         data = domdoc->CreateTextNode(utf16_text.c_str());
  411       break;
  412     case XML_ENTITY_REF_NODE:
  413       data = domdoc->CreateTextNode(utf16_text.c_str());
  414       break;
  415     case XML_CDATA_SECTION_NODE:
  416       data = domdoc->CreateCDATASection(utf16_text.c_str());
  417       break;
  418     case XML_COMMENT_NODE:
  419       data = domdoc->CreateComment(utf16_text.c_str());
  420       break;
  421     default:
  422       ASSERT(false);
  423       break;
  424   }
  425   if (data) {
  426     data->SetRow(static_cast<int>(xmlGetLineNo(xmlnode)));
  427     parent->AppendChild(data);
  428   }
  429 }
  430 
  431 static void ConvertPIIntoDOM(DOMDocumentInterface *domdoc,
  432                              DOMNodeInterface *parent,
  433                              xmlNode *xmlpi) {
  434   const char *target = FromXmlCharPtr(xmlpi->name);
  435   char *data = FromXmlCharPtr(xmlNodeGetContent(xmlpi));
  436   DOMProcessingInstructionInterface *pi;
  437   domdoc->CreateProcessingInstruction(target, data, &pi);
  438   if (pi) {
  439     pi->SetRow(static_cast<int>(xmlGetLineNo(xmlpi)));
  440     parent->AppendChild(pi);
  441   }
  442   if (data)
  443     xmlFree(data);
  444 }
  445 
  446 static void ConvertElementIntoDOM(DOMDocumentInterface *domdoc,
  447                                   DOMNodeInterface *parent,
  448                                   xmlNode *xmlele);
  449 
  450 static void ConvertChildrenIntoDOM(DOMDocumentInterface *domdoc,
  451                                    DOMNodeInterface *parent,
  452                                    xmlNode *xmlnode) {
  453   for (xmlNode *child = xmlnode->children; child != NULL; child = child->next) {
  454     switch (child->type) {
  455       case XML_ELEMENT_NODE:
  456         ConvertElementIntoDOM(domdoc, parent, child);
  457         break;
  458       case XML_TEXT_NODE:
  459       case XML_ENTITY_REF_NODE:
  460       case XML_CDATA_SECTION_NODE:
  461       case XML_COMMENT_NODE:
  462         ConvertCharacterDataIntoDOM(domdoc, parent, child);
  463         break;
  464       case XML_PI_NODE:
  465         ConvertPIIntoDOM(domdoc, parent, child);
  466         break;
  467       case XML_DTD_NODE:
  468         break;
  469       default:
  470         DLOG("Ignore XML Node of type %d", child->type);
  471         break;
  472     }
  473   }
  474 }
  475 
  476 static void ConvertElementIntoDOM(DOMDocumentInterface *domdoc,
  477                                   DOMNodeInterface *parent,
  478                                   xmlNode *xmlele) {
  479   DOMElementInterface *element;
  480   domdoc->CreateElement(FromXmlCharPtr(xmlele->name), &element);
  481   if (!element || DOM_NO_ERR != parent->AppendChild(element)) {
  482     // Unlikely to happen.
  483     DLOG("Failed to create DOM element or to add it to parent");
  484     delete element;
  485     return;
  486   }
  487 
  488   // We don't support full DOM2 namespaces, but we must keep all namespace
  489   // related information in the result DOM.
  490   if (xmlele->ns && xmlele->ns->prefix)
  491     element->SetPrefix(FromXmlCharPtr(xmlele->ns->prefix));
  492   for (xmlNsPtr ns = xmlele->nsDef; ns; ns = ns->next) {
  493     DOMAttrInterface *attr;
  494     if (ns->prefix && *ns->prefix) {
  495       // xmlns:prefix="uri" case.
  496       domdoc->CreateAttribute(FromXmlCharPtr(ns->prefix), &attr);
  497       if (attr)
  498         attr->SetPrefix("xmlns");
  499     } else {
  500       // xmlns="uri" case.
  501       domdoc->CreateAttribute("xmlns", &attr);
  502     }
  503     if (!attr || DOM_NO_ERR != element->SetAttributeNode(attr)) {
  504       // Unlikely to happen.
  505       DLOG("Failed to create xmlns attribute or to add it to element");
  506       delete attr;
  507       continue;
  508     }
  509     attr->SetValue(FromXmlCharPtr(ns->href));
  510   }
  511 
  512   // libxml2 doesn't support node column position for now.
  513   element->SetRow(static_cast<int>(xmlGetLineNo(xmlele)));
  514   for (xmlAttr *xmlattr = xmlele->properties; xmlattr != NULL;
  515        xmlattr = xmlattr->next) {
  516     const char *name = FromXmlCharPtr(xmlattr->name);
  517     DOMAttrInterface *attr;
  518     domdoc->CreateAttribute(name, &attr);
  519     if (!attr || DOM_NO_ERR != element->SetAttributeNode(attr)) {
  520       // Unlikely to happen.
  521       DLOG("Failed to create DOM attribute or to add it to element");
  522       delete attr;
  523       continue;
  524     }
  525 
  526     char *value = FromXmlCharPtr(
  527         xmlNodeGetContent(reinterpret_cast<xmlNode *>(xmlattr)));
  528     attr->SetValue(value);
  529     if (xmlattr->ns)
  530       attr->SetPrefix(FromXmlCharPtr(xmlattr->ns->prefix));
  531     if (value)
  532       xmlFree(value);
  533   }
  534 
  535   ConvertChildrenIntoDOM(domdoc, element, xmlele);
  536 }
  537 
  538 static const char* SkipSpaces(const char* str) {
  539   while (*str && isspace(*str))
  540     str++;
  541   return str;
  542 }
  543 
  544 static const int kMaxDetectionDepth = 2048;
  545 static const char kMetaTag[] = "meta";
  546 static const char kHttpEquivAttrName[] = "http-equiv";
  547 static const char kHttpContentType[] = "content-type";
  548 static const char kContentAttrName[] = "content";
  549 static const char kCharsetPrefix[] = "charset=";
  550 
  551 std::string GetHTMLCharset(const char* html_content) {
  552   std::string charset;
  553   const char* cursor = html_content;
  554   while (cursor - html_content < kMaxDetectionDepth) {
  555     cursor = strchr(cursor, '<');
  556     if (!cursor)
  557       break;
  558 
  559     if (strncmp(cursor, "<!--", 3) == 0) {
  560       cursor = strstr(cursor, "-->");
  561       if (!cursor)
  562         break;
  563       continue;
  564     }
  565 
  566     cursor = SkipSpaces(cursor + 1);
  567     if (!strncasecmp(cursor, kMetaTag, arraysize(kMetaTag) - 1)) {
  568       const char* element_end = strchr(cursor, '>');
  569       if (!element_end)
  570         break;
  571 
  572       std::string meta_content(cursor, element_end - cursor);
  573       meta_content = ToLower(meta_content);
  574       if (meta_content.find(kHttpEquivAttrName) != meta_content.npos &&
  575           meta_content.find(kHttpContentType) != meta_content.npos &&
  576           meta_content.find(kContentAttrName) != meta_content.npos) {
  577         size_t charset_pos = meta_content.find(kCharsetPrefix);
  578         if (charset_pos != meta_content.npos) {
  579           const char* charset_start = meta_content.c_str() + charset_pos +
  580                                       arraysize(kCharsetPrefix) - 1;
  581           charset_start = SkipSpaces(charset_start);
  582           const char* charset_end = charset_start;
  583           while (isalnum(*charset_end) || *charset_end == '_' ||
  584                  *charset_end == '.' || *charset_end == '-')
  585             charset_end++;
  586           charset.assign(charset_start, charset_end - charset_start);
  587         }
  588         // Don't try to find another, because there should be only one
  589         // <meta http-equiv="content-type" ...>.
  590         break;
  591       }
  592     }
  593   }
  594   return charset;
  595 }
  596 
  597 // Count the sequence of a child in the elements of the same tag name.
  598 static int CountTagSequence(const xmlNode *child, const char *tag) {
  599   static xmlNode *last_parent = NULL;
  600   static int last_count = 1;
  601   static std::string last_tag;
  602 
  603   if (last_parent == child->parent &&
  604       GadgetStrCmp(last_tag.c_str(), tag) == 0) {
  605     return ++last_count;
  606   }
  607 
  608   last_parent = child->parent;
  609   last_count = 1;
  610   last_tag = tag;
  611   for (const xmlNode *node = child->prev; node != NULL; node = node->prev) {
  612     if (node->type == XML_ELEMENT_NODE &&
  613         GadgetStrCmp(tag, FromXmlCharPtr(node->name)) == 0)
  614       last_count++;
  615   }
  616   return last_count;
  617 }
  618 
  619 static void ConvertElementIntoXPathMap(const xmlNode *element,
  620                                        const std::string &prefix,
  621                                        StringMap *table) {
  622   for (xmlAttr *attribute = element->properties;
  623        attribute != NULL; attribute = attribute->next) {
  624     const char *name = FromXmlCharPtr(attribute->name);
  625     char *value = FromXmlCharPtr(
  626         xmlNodeGetContent(reinterpret_cast<xmlNode *>(attribute)));
  627     (*table)[prefix + '@' + name] = std::string(value ? value : "");
  628     if (value)
  629       xmlFree(value);
  630   }
  631 
  632   for (xmlNode *child = element->children; child != NULL; child = child->next) {
  633     if (child->type == XML_ELEMENT_NODE) {
  634       const char *tag = FromXmlCharPtr(child->name);
  635       char *text = FromXmlCharPtr(xmlNodeGetContent(child));
  636       std::string key(prefix);
  637       if (!prefix.empty()) key += '/';
  638       key += tag;
  639 
  640       if (table->find(key) != table->end()) {
  641         // Postpend the sequence if there are multiple elements with the same
  642         // name.
  643         char buf[20];
  644         snprintf(buf, sizeof(buf), "[%d]", CountTagSequence(child, tag));
  645         key += buf;
  646       }
  647       (*table)[key] = text ? text : "";
  648       if (text) xmlFree(text);
  649 
  650       ConvertElementIntoXPathMap(child, key, table);
  651     }
  652   }
  653 }
  654 
  655 // Check if the content is XML according to XMLHttpRequest standard rule.
  656 static bool ContentTypeIsXML(const char *content_type) {
  657   size_t content_type_len = content_type ? strlen(content_type) : 0;
  658   return content_type_len == 0 ||
  659          strcasecmp(content_type, "text/xml") == 0 ||
  660          strcasecmp(content_type, "application/xml") == 0 ||
  661          (content_type_len > 4 &&
  662           strcasecmp(content_type + content_type_len - 4, "+xml") == 0);
  663 }
  664 
  665 class XMLParser : public XMLParserInterface {
  666  public:
  667   virtual bool CheckXMLName(const char *name) {
  668     return name && *name && xmlValidateName(ToXmlCharPtr(name), 0) == 0;
  669   }
  670 
  671   virtual bool HasXMLDecl(const std::string &content) {
  672     const char *content_ptr = content.c_str();
  673     size_t content_size = content.size();
  674     return STARTS_WITH(content_ptr, content_size, kXMLTag) ||
  675            STARTS_WITH(content_ptr, content_size, kXMLTagUTF8) ||
  676            STARTS_WITH(content_ptr, content_size, kXMLTagUTF16LE) ||
  677            STARTS_WITH(content_ptr, content_size, kXMLTagUTF16BE) ||
  678            STARTS_WITH(content_ptr, content_size, kXMLTagBOMLessUTF16LE) ||
  679            STARTS_WITH(content_ptr, content_size, kXMLTagBOMLessUTF16BE) ||
  680            STARTS_WITH(content_ptr, content_size, kXMLTagUTF32LE) ||
  681            STARTS_WITH(content_ptr, content_size, kXMLTagUTF32BE);
  682   }
  683 
  684   virtual DOMDocumentInterface *CreateDOMDocument() {
  685     return ::ggadget::CreateDOMDocument(this, false, false);
  686   }
  687 
  688 
  689   virtual bool ConvertContentToUTF8(const std::string &content,
  690                                     const char *filename,
  691                                     const char *content_type,
  692                                     const char *encoding_hint,
  693                                     const char *encoding_fallback,
  694                                     std::string *encoding,
  695                                     std::string *utf8_content) {
  696     GGL_UNUSED(filename);
  697     // The caller wants nothing?
  698     if (!encoding && !utf8_content)
  699       return true;
  700 
  701     bool result = true;
  702     std::string encoding_to_use;
  703     if (!DetectUTFEncoding(content, &encoding_to_use)) {
  704       if (encoding_hint && *encoding_hint) {
  705         encoding_to_use = encoding_hint;
  706       } else if (STARTS_WITH(content.c_str(), content.size(),
  707                              kXMLTagBOMLessUTF16LE)) {
  708         encoding_to_use = "UTF-16LE";
  709       } else if (STARTS_WITH(content.c_str(), content.size(),
  710                              kXMLTagBOMLessUTF16BE)) {
  711         encoding_to_use = "UTF-16BE";
  712       } else {
  713         // Try to find encoding declaration from the content.
  714         if (ContentTypeIsXML(content_type) ||
  715             STARTS_WITH(content.c_str(), content.size(), kXMLTag)) {
  716           encoding_to_use = GetXMLEncodingDecl(content);
  717         } else if (content_type && strcasecmp(content_type, "text/html") == 0) {
  718           encoding_to_use = GetHTMLCharset(content.c_str());
  719         }
  720 
  721         if (encoding_to_use.empty()) {
  722           encoding_to_use = "UTF-8";
  723         } else if (ToLower(encoding_to_use).find("utf") == 0 &&
  724                    (encoding_to_use.find("16") != std::string::npos ||
  725                     encoding_to_use.find("32") != std::string::npos)) {
  726           // UTF-16 and UTF-32 makes no sense here. Because if the content is
  727           // in UTF-16 or UTF-32 encoding, then it's impossible to find the
  728           // charset tag by parsing it as char string directly.
  729           // In this case, assuming UTF-8 will be the best choice, and will
  730           // fallback to ISO8859-1 if it's not UTF-8.
  731           encoding_to_use = "UTF-8";
  732         }
  733       }
  734     }
  735 
  736     result = ConvertStringToUTF8(content, encoding_to_use.c_str(),
  737                                  utf8_content);
  738     if (!result && encoding_fallback && *encoding_fallback) {
  739       encoding_to_use = encoding_fallback;
  740       result = ConvertStringToUTF8(content, encoding_fallback, utf8_content);
  741     }
  742     if (encoding)
  743       *encoding = result ? encoding_to_use : "";
  744     return result;
  745   }
  746 
  747   virtual bool ParseContentIntoDOM(const std::string &content,
  748                                    const StringMap *extra_entities,
  749                                    const char *filename,
  750                                    const char *content_type,
  751                                    const char *encoding_hint,
  752                                    const char *encoding_fallback,
  753                                    DOMDocumentInterface *domdoc,
  754                                    std::string *encoding,
  755                                    std::string *utf8_content) {
  756 #ifdef _DEBUG
  757     int original_ref_count = domdoc ? domdoc->GetRefCount() : 0;
  758 #endif
  759     bool result = true;
  760     xmlLineNumbersDefault(1);
  761     if (ContentTypeIsXML(content_type) ||
  762         // However, some XML documents is returned when Content-Type is
  763         // text/html or others, so detect from the contents.
  764         HasXMLDecl(content)) {
  765       ASSERT(!domdoc || !domdoc->HasChildNodes());
  766       xmlDoc *xmldoc = ParseXML(content, extra_entities, filename,
  767                                 encoding_hint, encoding_fallback,
  768                                 encoding, utf8_content);
  769       if (!xmldoc) {
  770         result = false;
  771       } else {
  772         if (!xmlDocGetRootElement(xmldoc)) {
  773           LOG("No root element in XML file: %s", filename);
  774           result = false;
  775         } else {
  776           ConvertChildrenIntoDOM(domdoc, domdoc,
  777                                  reinterpret_cast<xmlNode *>(xmldoc));
  778           domdoc->Normalize();
  779         }
  780         xmlFreeDoc(xmldoc);
  781       }
  782     } else {
  783       result = ConvertContentToUTF8(content, filename, content_type,
  784                                     encoding_hint, encoding_fallback,
  785                                     encoding, utf8_content);
  786     }
  787 #ifdef _DEBUG
  788     ASSERT(!domdoc || domdoc->GetRefCount() == original_ref_count);
  789 #endif
  790     return result;
  791   }
  792 
  793   virtual bool ParseXMLIntoXPathMap(const std::string &xml,
  794                                     const StringMap *extra_entities,
  795                                     const char *filename,
  796                                     const char *root_element_name,
  797                                     const char *encoding_hint,
  798                                     const char *encoding_fallback,
  799                                     StringMap *table) {
  800     xmlDoc *xmldoc = ParseXML(xml, extra_entities, filename, encoding_hint,
  801                               encoding_fallback, NULL, NULL);
  802     if (!xmldoc)
  803       return false;
  804 
  805     xmlNode *root = xmlDocGetRootElement(xmldoc);
  806     if (!root ||
  807         GadgetStrCmp(FromXmlCharPtr(root->name), root_element_name) != 0) {
  808       LOG("No valid root element %s in XML file: %s",
  809           root_element_name, filename);
  810       xmlFreeDoc(xmldoc);
  811       return false;
  812     }
  813 
  814     ConvertElementIntoXPathMap(root, "", table);
  815     xmlFreeDoc(xmldoc);
  816     return true;
  817   }
  818 
  819   virtual std::string EncodeXMLString(const char *src) {
  820     if (!src || !*src)
  821       return std::string();
  822 
  823     char *result = FromXmlCharPtr(xmlEncodeSpecialChars(NULL,
  824                                                         ToXmlCharPtr(src)));
  825     std::string result_str(result ? result : "");
  826     if (result)
  827       xmlFree(result);
  828     return result_str;
  829   }
  830 };
  831 
// The XMLParser instance managed by this extension: allocated on demand in
// Initialize() and deleted in Finalize().
static XMLParser *g_xml_parser = NULL;
  833 
  834 } // namespace libxml2
  835 } // namespace ggadget
  836 
// Rename the extension entry points defined below so that the exported
// symbols carry the libxml2_xml_parser_LTX_ prefix.
// NOTE(review): presumably this follows the libtool/libltdl module symbol
// naming convention -- confirm against the extension loader.
#define Initialize libxml2_xml_parser_LTX_Initialize
#define Finalize libxml2_xml_parser_LTX_Finalize
  839 
  840 extern "C" {
  841   bool Initialize() {
  842     LOGI("Initialize libxml2_xml_parser extension.");
  843 
  844     // Many files declared as GB2312 encoding contain chararacters outside
  845     // of standard GB2312 range. Tolerate this by using superset GB18030 or GBK.
  846     xmlCharEncodingHandler *handler = xmlFindCharEncodingHandler("GB18030");
  847     if (handler) {
  848       xmlAddEncodingAlias("GB18030", "GB2312");
  849       xmlCharEncCloseFunc(handler);
  850     } else {
  851       DLOG("libxml2 doesn't support GB18030, try GBK");
  852       handler = xmlFindCharEncodingHandler("GBK");
  853       if (handler) {
  854         xmlAddEncodingAlias("GBK", "GB2312");
  855         xmlCharEncCloseFunc(handler);
  856       }
  857     }
  858 
  859     if (!ggadget::libxml2::g_xml_parser)
  860       ggadget::libxml2::g_xml_parser = new ggadget::libxml2::XMLParser;
  861 
  862     return ggadget::SetXMLParser(ggadget::libxml2::g_xml_parser);
  863   }
  864 
  865   void Finalize() {
  866     LOGI("Finalize libxml2_xml_parser extension.");
  867     delete ggadget::libxml2::g_xml_parser;
  868   }
  869 }