"Fossies" - the Fresh Open Source Software Archive

Member "drupal-8.9.10/core/lib/Drupal/Component/Utility/Html.php" (26 Nov 2020, 18104 Bytes) of package /linux/www/drupal-8.9.10.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) PHP source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "Html.php", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 9.0.8_vs_9.1.0-rc1.

    1 <?php
    2 
    3 namespace Drupal\Component\Utility;
    4 
    5 /**
    6  * Provides DOMDocument helpers for parsing and serializing HTML strings.
    7  *
    8  * @ingroup utility
    9  */
   10 class Html {
   11 
   12   /**
   13    * An array of previously cleaned HTML classes.
   14    *
   15    * @var array
   16    */
   17   protected static $classes = [];
   18 
   19   /**
   20    * An array of the initial IDs used in one request.
   21    *
   22    * @var array
   23    */
   24   protected static $seenIdsInit;
   25 
   26   /**
   27    * An array of IDs, including incremented versions when an ID is duplicated.
   28    * @var array
   29    */
   30   protected static $seenIds;
   31 
   32   /**
   33    * Stores whether the current request was sent via AJAX.
   34    *
   35    * @var bool
   36    */
   37   protected static $isAjax = FALSE;
   38 
   39   /**
   40    * All attributes that may contain URIs.
   41    *
   42    * - The attributes 'code' and 'codebase' are omitted, because they only exist
   43    *   for the <applet> tag. The time of Java applets has passed.
   44    * - The attribute 'icon' is omitted, because no browser implements the
   45    *   <command> tag anymore.
   46    *  See https://developer.mozilla.org/en-US/docs/Web/HTML/Element/command.
   47    * - The 'manifest' attribute is omitted because it only exists for the <html>
   48    *   tag. That tag only makes sense in a HTML-served-as-HTML context, in which
   49    *   case relative URLs are guaranteed to work.
   50    *
   51    * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes
   52    * @see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
   53    *
   54    * @var string[]
   55    */
   56   protected static $uriAttributes = ['href', 'poster', 'src', 'cite', 'data', 'action', 'formaction', 'srcset', 'about'];
   57 
   58   /**
   59    * Prepares a string for use as a valid class name.
   60    *
   61    * Do not pass one string containing multiple classes as they will be
   62    * incorrectly concatenated with dashes, i.e. "one two" will become "one-two".
   63    *
   64    * @param mixed $class
   65    *   The class name to clean. It can be a string or anything that can be cast
   66    *   to string.
   67    *
   68    * @return string
   69    *   The cleaned class name.
   70    */
   71   public static function getClass($class) {
   72     $class = (string) $class;
   73     if (!isset(static::$classes[$class])) {
   74       static::$classes[$class] = static::cleanCssIdentifier(mb_strtolower($class));
   75     }
   76     return static::$classes[$class];
   77   }
   78 
   79   /**
   80    * Prepares a string for use as a CSS identifier (element, class, or ID name).
   81    *
   82    * Link below shows the syntax for valid CSS identifiers (including element
   83    * names, classes, and IDs in selectors).
   84    *
   85    * @see http://www.w3.org/TR/CSS21/syndata.html#characters
   86    *
   87    * @param string $identifier
   88    *   The identifier to clean.
   89    * @param array $filter
   90    *   An array of string replacements to use on the identifier.
   91    *
   92    * @return string
   93    *   The cleaned identifier.
   94    */
   95   public static function cleanCssIdentifier($identifier, array $filter = [
   96     ' ' => '-',
   97     '_' => '-',
   98     '/' => '-',
   99     '[' => '-',
  100     ']' => '',
  101   ]) {
  102     // We could also use strtr() here but its much slower than str_replace(). In
  103     // order to keep '__' to stay '__' we first replace it with a different
  104     // placeholder after checking that it is not defined as a filter.
  105     $double_underscore_replacements = 0;
  106     if (!isset($filter['__'])) {
  107       $identifier = str_replace('__', '##', $identifier, $double_underscore_replacements);
  108     }
  109     $identifier = str_replace(array_keys($filter), array_values($filter), $identifier);
  110     // Replace temporary placeholder '##' with '__' only if the original
  111     // $identifier contained '__'.
  112     if ($double_underscore_replacements > 0) {
  113       $identifier = str_replace('##', '__', $identifier);
  114     }
  115 
  116     // Valid characters in a CSS identifier are:
  117     // - the hyphen (U+002D)
  118     // - a-z (U+0030 - U+0039)
  119     // - A-Z (U+0041 - U+005A)
  120     // - the underscore (U+005F)
  121     // - 0-9 (U+0061 - U+007A)
  122     // - ISO 10646 characters U+00A1 and higher
  123     // We strip out any character not in the above list.
  124     $identifier = preg_replace('/[^\x{002D}\x{0030}-\x{0039}\x{0041}-\x{005A}\x{005F}\x{0061}-\x{007A}\x{00A1}-\x{FFFF}]/u', '', $identifier);
  125     // Identifiers cannot start with a digit, two hyphens, or a hyphen followed by a digit.
  126     $identifier = preg_replace([
  127       '/^[0-9]/',
  128       '/^(-[0-9])|^(--)/',
  129     ], ['_', '__'], $identifier);
  130     return $identifier;
  131   }
  132 
  133   /**
  134    * Sets if this request is an Ajax request.
  135    *
  136    * @param bool $is_ajax
  137    *   TRUE if this request is an Ajax request, FALSE otherwise.
  138    */
  139   public static function setIsAjax($is_ajax) {
  140     static::$isAjax = $is_ajax;
  141   }
  142 
  143   /**
  144    * Prepares a string for use as a valid HTML ID and guarantees uniqueness.
  145    *
  146    * This function ensures that each passed HTML ID value only exists once on
  147    * the page. By tracking the already returned ids, this function enables
  148    * forms, blocks, and other content to be output multiple times on the same
  149    * page, without breaking (X)HTML validation.
  150    *
  151    * For already existing IDs, a counter is appended to the ID string.
  152    * Therefore, JavaScript and CSS code should not rely on any value that was
  153    * generated by this function and instead should rely on manually added CSS
  154    * classes or similarly reliable constructs.
  155    *
  156    * Two consecutive hyphens separate the counter from the original ID. To
  157    * manage uniqueness across multiple Ajax requests on the same page, Ajax
  158    * requests POST an array of all IDs currently present on the page, which are
  159    * used to prime this function's cache upon first invocation.
  160    *
  161    * To allow reverse-parsing of IDs submitted via Ajax, any multiple
  162    * consecutive hyphens in the originally passed $id are replaced with a
  163    * single hyphen.
  164    *
  165    * @param string $id
  166    *   The ID to clean.
  167    *
  168    * @return string
  169    *   The cleaned ID.
  170    */
  171   public static function getUniqueId($id) {
  172     // If this is an Ajax request, then content returned by this page request
  173     // will be merged with content already on the base page. The HTML IDs must
  174     // be unique for the fully merged content. Therefore use unique IDs.
  175     if (static::$isAjax) {
  176       return static::getId($id) . '--' . Crypt::randomBytesBase64(8);
  177     }
  178 
  179     // @todo Remove all that code once we switch over to random IDs only,
  180     // see https://www.drupal.org/node/1090592.
  181     if (!isset(static::$seenIdsInit)) {
  182       static::$seenIdsInit = [];
  183     }
  184     if (!isset(static::$seenIds)) {
  185       static::$seenIds = static::$seenIdsInit;
  186     }
  187 
  188     $id = static::getId($id);
  189 
  190     // Ensure IDs are unique by appending a counter after the first occurrence.
  191     // The counter needs to be appended with a delimiter that does not exist in
  192     // the base ID. Requiring a unique delimiter helps ensure that we really do
  193     // return unique IDs and also helps us re-create the $seen_ids array during
  194     // Ajax requests.
  195     if (isset(static::$seenIds[$id])) {
  196       $id = $id . '--' . ++static::$seenIds[$id];
  197     }
  198     else {
  199       static::$seenIds[$id] = 1;
  200     }
  201     return $id;
  202   }
  203 
  204   /**
  205    * Prepares a string for use as a valid HTML ID.
  206    *
  207    * Only use this function when you want to intentionally skip the uniqueness
  208    * guarantee of self::getUniqueId().
  209    *
  210    * @param string $id
  211    *   The ID to clean.
  212    *
  213    * @return string
  214    *   The cleaned ID.
  215    *
  216    * @see self::getUniqueId()
  217    */
  218   public static function getId($id) {
  219     $id = str_replace([' ', '_', '[', ']'], ['-', '-', '-', ''], mb_strtolower($id));
  220 
  221     // As defined in http://www.w3.org/TR/html4/types.html#type-name, HTML IDs can
  222     // only contain letters, digits ([0-9]), hyphens ("-"), underscores ("_"),
  223     // colons (":"), and periods ("."). We strip out any character not in that
  224     // list. Note that the CSS spec doesn't allow colons or periods in identifiers
  225     // (http://www.w3.org/TR/CSS21/syndata.html#characters), so we strip those two
  226     // characters as well.
  227     $id = preg_replace('/[^A-Za-z0-9\-_]/', '', $id);
  228 
  229     // Removing multiple consecutive hyphens.
  230     $id = preg_replace('/\-+/', '-', $id);
  231     return $id;
  232   }
  233 
  234   /**
  235    * Resets the list of seen IDs.
  236    */
  237   public static function resetSeenIds() {
  238     static::$seenIds = NULL;
  239   }
  240 
  241   /**
  242    * Normalizes an HTML snippet.
  243    *
  244    * This function is essentially \DOMDocument::normalizeDocument(), but
  245    * operates on an HTML string instead of a \DOMDocument.
  246    *
  247    * @param string $html
  248    *   The HTML string to normalize.
  249    *
  250    * @return string
  251    *   The normalized HTML string.
  252    */
  253   public static function normalize($html) {
  254     $document = static::load($html);
  255     return static::serialize($document);
  256   }
  257 
  258   /**
  259    * Parses an HTML snippet and returns it as a DOM object.
  260    *
  261    * This function loads the body part of a partial (X)HTML document and returns
  262    * a full \DOMDocument object that represents this document.
  263    *
  264    * Use \Drupal\Component\Utility\Html::serialize() to serialize this
  265    * \DOMDocument back to a string.
  266    *
  267    * @param string $html
  268    *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
  269    *   import.
  270    *
  271    * @return \DOMDocument
  272    *   A \DOMDocument that represents the loaded (X)HTML snippet.
  273    */
  274   public static function load($html) {
  275     $document = <<<EOD
  276 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  277 <html xmlns="http://www.w3.org/1999/xhtml">
  278 <head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head>
  279 <body>!html</body>
  280 </html>
  281 EOD;
  282     // PHP's \DOMDocument serialization adds extra whitespace when the markup
  283     // of the wrapping document contains newlines, so ensure we remove all
  284     // newlines before injecting the actual HTML body to be processed.
  285     $document = strtr($document, ["\n" => '', '!html' => $html]);
  286 
  287     $dom = new \DOMDocument();
  288     // Ignore warnings during HTML soup loading.
  289     @$dom->loadHTML($document);
  290 
  291     return $dom;
  292   }
  293 
  294   /**
  295    * Converts the body of a \DOMDocument back to an HTML snippet.
  296    *
  297    * The function serializes the body part of a \DOMDocument back to an (X)HTML
  298    * snippet. The resulting (X)HTML snippet will be properly formatted to be
  299    * compatible with HTML user agents.
  300    *
  301    * @param \DOMDocument $document
  302    *   A \DOMDocument object to serialize, only the tags below the first <body>
  303    *   node will be converted.
  304    *
  305    * @return string
  306    *   A valid (X)HTML snippet, as a string.
  307    */
  308   public static function serialize(\DOMDocument $document) {
  309     $body_node = $document->getElementsByTagName('body')->item(0);
  310     $html = '';
  311 
  312     if ($body_node !== NULL) {
  313       foreach ($body_node->getElementsByTagName('script') as $node) {
  314         static::escapeCdataElement($node);
  315       }
  316       foreach ($body_node->getElementsByTagName('style') as $node) {
  317         static::escapeCdataElement($node, '/*', '*/');
  318       }
  319       foreach ($body_node->childNodes as $node) {
  320         $html .= $document->saveXML($node);
  321       }
  322     }
  323     return $html;
  324   }
  325 
  326   /**
  327    * Adds comments around a <!CDATA section in a \DOMNode.
  328    *
  329    * \DOMDocument::loadHTML() in \Drupal\Component\Utility\Html::load() makes
  330    * CDATA sections from the contents of inline script and style tags. This can
  331    * cause HTML4 browsers to throw exceptions.
  332    *
  333    * This function attempts to solve the problem by creating a
  334    * \DOMDocumentFragment to comment the CDATA tag.
  335    *
  336    * @param \DOMNode $node
  337    *   The element potentially containing a CDATA node.
  338    * @param string $comment_start
  339    *   (optional) A string to use as a comment start marker to escape the CDATA
  340    *   declaration. Defaults to '//'.
  341    * @param string $comment_end
  342    *   (optional) A string to use as a comment end marker to escape the CDATA
  343    *   declaration. Defaults to an empty string.
  344    */
  345   public static function escapeCdataElement(\DOMNode $node, $comment_start = '//', $comment_end = '') {
  346     foreach ($node->childNodes as $child_node) {
  347       if ($child_node instanceof \DOMCdataSection) {
  348         $embed_prefix = "\n<!--{$comment_start}--><![CDATA[{$comment_start} ><!--{$comment_end}\n";
  349         $embed_suffix = "\n{$comment_start}--><!]]>{$comment_end}\n";
  350 
  351         // Prevent invalid cdata escaping as this would throw a DOM error.
  352         // This is the same behavior as found in libxml2.
  353         // Related W3C standard: http://www.w3.org/TR/REC-xml/#dt-cdsection
  354         // Fix explanation: http://wikipedia.org/wiki/CDATA#Nesting
  355         $data = str_replace(']]>', ']]]]><![CDATA[>', $child_node->data);
  356 
  357         $fragment = $node->ownerDocument->createDocumentFragment();
  358         $fragment->appendXML($embed_prefix . $data . $embed_suffix);
  359         $node->appendChild($fragment);
  360         $node->removeChild($child_node);
  361       }
  362     }
  363   }
  364 
  365   /**
  366    * Decodes all HTML entities including numerical ones to regular UTF-8 bytes.
  367    *
  368    * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
  369    * "&lt;", not "<"). Be careful when using this function, as it will revert
  370    * previous sanitization efforts (&lt;script&gt; will become <script>).
  371    *
  372    * This method is not the opposite of Html::escape(). For example, this method
  373    * will convert "&eacute;" to "é", whereas Html::escape() will not convert "é"
  374    * to "&eacute;".
  375    *
  376    * @param string $text
  377    *   The text to decode entities in.
  378    *
  379    * @return string
  380    *   The input $text, with all HTML entities decoded once.
  381    *
  382    * @see html_entity_decode()
  383    * @see \Drupal\Component\Utility\Html::escape()
  384    */
  385   public static function decodeEntities($text) {
  386     return html_entity_decode($text, ENT_QUOTES, 'UTF-8');
  387   }
  388 
  389   /**
  390    * Escapes text by converting special characters to HTML entities.
  391    *
  392    * This method escapes HTML for sanitization purposes by replacing the
  393    * following special characters with their HTML entity equivalents:
  394    * - & (ampersand) becomes &amp;
  395    * - " (double quote) becomes &quot;
  396    * - ' (single quote) becomes &#039;
  397    * - < (less than) becomes &lt;
  398    * - > (greater than) becomes &gt;
  399    * Special characters that have already been escaped will be double-escaped
  400    * (for example, "&lt;" becomes "&amp;lt;"), and invalid UTF-8 encoding
  401    * will be converted to the Unicode replacement character ("�").
  402    *
  403    * This method is not the opposite of Html::decodeEntities(). For example,
  404    * this method will not encode "é" to "&eacute;", whereas
  405    * Html::decodeEntities() will convert all HTML entities to UTF-8 bytes,
  406    * including "&eacute;" and "&lt;" to "é" and "<".
  407    *
  408    * When constructing @link theme_render render arrays @endlink passing the output of Html::escape() to
  409    * '#markup' is not recommended. Use the '#plain_text' key instead and the
  410    * renderer will autoescape the text.
  411    *
  412    * @param string $text
  413    *   The input text.
  414    *
  415    * @return string
  416    *   The text with all HTML special characters converted.
  417    *
  418    * @see htmlspecialchars()
  419    * @see \Drupal\Component\Utility\Html::decodeEntities()
  420    *
  421    * @ingroup sanitization
  422    */
  423   public static function escape($text) {
  424     return htmlspecialchars($text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
  425   }
  426 
  427   /**
  428    * Converts all root-relative URLs to absolute URLs.
  429    *
  430    * Does not change any existing protocol-relative or absolute URLs. Does not
  431    * change other relative URLs because they would result in different absolute
  432    * URLs depending on the current path. For example: when the same content
  433    * containing such a relative URL (for example 'image.png'), is served from
  434    * its canonical URL (for example 'http://example.com/some-article') or from
  435    * a listing or feed (for example 'http://example.com/all-articles') their
  436    * "current path" differs, resulting in different absolute URLs:
  437    * 'http://example.com/some-article/image.png' versus
  438    * 'http://example.com/all-articles/image.png'. Only one can be correct.
  439    * Therefore relative URLs that are not root-relative cannot be safely
  440    * transformed and should generally be avoided.
  441    *
  442    * Necessary for HTML that is served outside of a website, for example, RSS
  443    * and e-mail.
  444    *
  445    * @param string $html
  446    *   The partial (X)HTML snippet to load. Invalid markup will be corrected on
  447    *   import.
  448    * @param string $scheme_and_host
  449    *   The root URL, which has a URI scheme, host and optional port.
  450    *
  451    * @return string
  452    *   The updated (X)HTML snippet.
  453    */
  454   public static function transformRootRelativeUrlsToAbsolute($html, $scheme_and_host) {
  455     assert(empty(array_diff(array_keys(parse_url($scheme_and_host)), ["scheme", "host", "port"])), '$scheme_and_host contains scheme, host and port at most.');
  456     assert(isset(parse_url($scheme_and_host)["scheme"]), '$scheme_and_host is absolute and hence has a scheme.');
  457     assert(isset(parse_url($scheme_and_host)["host"]), '$base_url is absolute and hence has a host.');
  458 
  459     $html_dom = Html::load($html);
  460     $xpath = new \DOMXpath($html_dom);
  461 
  462     // Update all root-relative URLs to absolute URLs in the given HTML.
  463     foreach (static::$uriAttributes as $attr) {
  464       foreach ($xpath->query("//*[starts-with(@$attr, '/') and not(starts-with(@$attr, '//'))]") as $node) {
  465         $node->setAttribute($attr, $scheme_and_host . $node->getAttribute($attr));
  466       }
  467       foreach ($xpath->query("//*[@srcset]") as $node) {
  468         // @see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
  469         // @see https://html.spec.whatwg.org/multipage/embedded-content.html#image-candidate-string
  470         $image_candidate_strings = explode(',', $node->getAttribute('srcset'));
  471         $image_candidate_strings = array_map('trim', $image_candidate_strings);
  472         for ($i = 0; $i < count($image_candidate_strings); $i++) {
  473           $image_candidate_string = $image_candidate_strings[$i];
  474           if ($image_candidate_string[0] === '/' && $image_candidate_string[1] !== '/') {
  475             $image_candidate_strings[$i] = $scheme_and_host . $image_candidate_string;
  476           }
  477         }
  478         $node->setAttribute('srcset', implode(', ', $image_candidate_strings));
  479       }
  480     }
  481     return Html::serialize($html_dom);
  482   }
  483 
  484 }