"Fossies" - the Fresh Open Source Software Archive

Member "googler-4.3.2/googler" (21 Jan 2021, 135035 Bytes) of package /linux/misc/googler-4.3.2.tar.gz:


As a special service, "Fossies" has tried to format the requested source page as HTML using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "googler": 4.3.1_vs_4.3.2.

    1 #!/usr/bin/env python3
    2 #
    3 # Copyright © 2008 Henri Hakkinen
    4 # Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
    5 #
    6 # This program is free software: you can redistribute it and/or modify
    7 # it under the terms of the GNU General Public License as published by
    8 # the Free Software Foundation, either version 3 of the License, or
    9 # (at your option) any later version.
   10 #
   11 # This program is distributed in the hope that it will be useful,
   12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14 # GNU General Public License for more details.
   15 #
   16 # You should have received a copy of the GNU General Public License
   17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
   18 
   19 import argparse
   20 import atexit
   21 import base64
   22 import collections
   23 import codecs
   24 import functools
   25 import gzip
   26 import html.entities
   27 import html.parser
   28 import http.client
   29 from http.client import HTTPSConnection
   30 import locale
   31 import logging
   32 import os
   33 import platform
   34 import shutil
   35 import signal
   36 import socket
   37 import ssl
   38 import subprocess
   39 from subprocess import Popen, PIPE, DEVNULL
   40 import sys
   41 import textwrap
   42 import unicodedata
   43 import urllib.parse
   44 import uuid
   45 import webbrowser
   46 
   47 # Python optional dependency compatibility layer
   48 try:
   49     import readline
   50 except ImportError:
   51     pass
   52 
   53 try:
   54     import setproctitle
   55     setproctitle.setproctitle('googler')
   56 except (ImportError, Exception):
   57     pass
   58 
   59 from typing import (
   60     Any,
   61     Dict,
   62     Generator,
   63     Iterable,
   64     Iterator,
   65     List,
   66     Match,
   67     Optional,
   68     Sequence,
   69     Tuple,
   70     Union,
   71     cast,
   72 )
   73 
   74 # Basic setup
   75 
   76 logging.basicConfig(format='[%(levelname)s] %(message)s')
   77 logger = logging.getLogger()
   78 
   79 
   80 def sigint_handler(signum, frame):
   81     print('\nInterrupted.', file=sys.stderr)
   82     sys.exit(1)
   83 
   84 try:
   85     signal.signal(signal.SIGINT, sigint_handler)
   86 except ValueError:
   87     # signal only works in main thread
   88     pass
   89 
   90 
   91 # Constants
   92 
   93 _VERSION_ = '4.3.2'
   94 _EPOCH_ = '20210115'
   95 
   96 COLORMAP = {k: '\x1b[%sm' % v for k, v in {
   97     'a': '30', 'b': '31', 'c': '32', 'd': '33',
   98     'e': '34', 'f': '35', 'g': '36', 'h': '37',
   99     'i': '90', 'j': '91', 'k': '92', 'l': '93',
  100     'm': '94', 'n': '95', 'o': '96', 'p': '97',
  101     'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
  102     'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
  103     'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
  104     'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
  105     'x': '0', 'X': '1', 'y': '7', 'Y': '7;1',
  106 }.items()}
  107 
  108 USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
  109 
  110 text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser']
  111 
  112 # Self-upgrade parameters
  113 #
  114 # Downstream packagers are recommended to turn off the entire self-upgrade
  115 # mechanism through
  116 #
  117 #     make disable-self-upgrade
  118 #
  119 # before running `make install'.
  120 
  121 ENABLE_SELF_UPGRADE_MECHANISM = True
  122 API_REPO_BASE = 'https://api.github.com/repos/jarun/googler'
  123 RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler'
  124 
  125 debugger = False
  126 
  127 
  128 # Monkeypatch textwrap for CJK wide characters.
  129 
  130 def monkeypatch_textwrap_for_cjk():
  131     try:
  132         if textwrap.wrap.patched:
  133             return
  134     except AttributeError:
  135         pass
  136     psl_textwrap_wrap = textwrap.wrap
  137 
  138     def textwrap_wrap(text, width=70, **kwargs):
  139         if width <= 2:
  140             width = 2
  141         # We first add a U+0000 after each East Asian Fullwidth or East
  142         # Asian Wide character, then fill to width - 1 (so that if a NUL
  143         # character ends up on a new line, we still have one last column
  144         # to spare for the preceding wide character). Finally we strip
  145         # all the NUL characters.
  146         #
  147         # East Asian Width: https://www.unicode.org/reports/tr11/
  148         return [
  149             line.replace('\0', '')
  150             for line in psl_textwrap_wrap(
  151                 ''.join(
  152                     ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch
  153                     for ch in unicodedata.normalize('NFC', text)
  154                 ),
  155                 width=width - 1,
  156                 **kwargs
  157             )
  158         ]
  159 
  160     def textwrap_fill(text, width=70, **kwargs):
  161         return '\n'.join(textwrap_wrap(text, width=width, **kwargs))
  162 
  163     textwrap.wrap = textwrap_wrap
  164     textwrap.fill = textwrap_fill
  165     textwrap.wrap.patched = True
  166     textwrap.fill.patched = True
  167 
  168 
  169 monkeypatch_textwrap_for_cjk()
  170 
  171 
  172 CoordinateType = Tuple[int, int]
  173 
  174 
  175 class TrackedTextwrap:
  176     """
  177     Implements a text wrapper that tracks the position of each source
  178     character, and can correctly insert zero-width sequences at given
  179     offsets of the source text.
  180 
  181     Wrapping result should be the same as that from PSL textwrap.wrap
  182     with default settings except expand_tabs=False.
  183     """
  184 
  185     def __init__(self, text: str, width: int):
  186         self._original = text
  187 
  188         # Do the job of replace_whitespace first so that we can easily
  189         # match text to wrapped lines later. Note that this operation
  190         # does not change text length or offsets.
  191         whitespace = "\t\n\v\f\r "
  192         whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
  193         text = text.translate(whitespace_trans)
  194 
  195         self._lines = textwrap.wrap(
  196             text, width, expand_tabs=False, replace_whitespace=False
  197         )
  198 
  199         # self._coords track the (row, column) coordinate of each source
  200         # character in the result text. It is indexed by offset in
  201         # source text.
  202         self._coords = []  # type: List[CoordinateType]
  203         offset = 0
  204         try:
  205             if not self._lines:
  206                 # Source text only has whitespaces. We add an empty line
  207                 # in order to produce meaningful coordinates.
  208                 self._lines = [""]
  209             for row, line in enumerate(self._lines):
  210                 assert text[offset : offset + len(line)] == line
  211                 col = 0
  212                 for _ in line:
  213                     self._coords.append((row, col))
  214                     offset += 1
  215                     col += 1
  216                 # All subsequent dropped whitespaces map to the last, imaginary column
  217                 # (the EOL character if you wish) of the current line.
  218                 while offset < len(text) and text[offset] == " ":
  219                     self._coords.append((row, col))
  220                     offset += 1
  221             # One past the final character (think of it as EOF) should
  222             # be treated as a valid offset.
  223             self._coords.append((row, col))
  224         except AssertionError:
  225             raise RuntimeError(
  226                 "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
  227                     offset, self._original
  228                 )
  229             )
  230 
  231     # seq should be a zero-width sequence, e.g., an ANSI escape sequence.
  232     # May raise IndexError if offset is out of bounds.
  233     def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
  234         row, col = self._coords[offset]
  235         line = self._lines[row]
  236         self._lines[row] = line[:col] + seq + line[col:]
  237 
  238         # Shift coordinates of all characters after the given character
  239         # on the same line.
  240         shift = len(seq)
  241         offset += 1
  242         while offset < len(self._coords) and self._coords[offset][0] == row:
  243             _, col = self._coords[offset]
  244             self._coords[offset] = (row, col + shift)
  245             offset += 1
  246 
  247     @property
  248     def original(self) -> str:
  249         return self._original
  250 
  251     @property
  252     def lines(self) -> List[str]:
  253         return self._lines
  254 
  255     @property
  256     def wrapped(self) -> str:
  257         return "\n".join(self._lines)
  258 
  259     # May raise IndexError if offset is out of bounds.
  260     def get_coordinate(self, offset: int) -> CoordinateType:
  261         return self._coords[offset]
  262 
  263 
  264 ### begin dim (DOM implementation with CSS support) ###
  265 ### https://github.com/zmwangx/dim/blob/master/dim.py ###
  266 
  267 import html
  268 import re
  269 from collections import OrderedDict
  270 from enum import Enum
  271 from html.parser import HTMLParser
  272 
  273 
  274 SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]
  275 
  276 
  277 class Node(object):
  278     """
  279     Represents a DOM node.
  280 
  281     Parts of JavaScript's DOM ``Node`` API and ``Element`` API are
  282     mirrored here, with extensions. In particular, ``querySelector`` and
  283     ``querySelectorAll`` are mirrored.
  284 
  285     Notable properties and methods: :meth:`attr()`, :attr:`classes`,
  286     :attr:`html`, :attr:`text`, :meth:`ancestors()`,
  287     :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`,
  288     :meth:`matched_by()`,
  289 
  290     Attributes:
  291         tag      (:class:`Optional`\\[:class:`str`])
  292         attrs    (:class:`Dict`\\[:class:`str`, :class:`str`])
  293         parent   (:class:`Optional`\\[:class:`Node`])
  294         children (:class:`List`\\[:class:`Node`])
  295     """
  296 
  297     # Meant to be reimplemented by subclasses.
  298     def __init__(self) -> None:
  299         self.tag = None  # type: Optional[str]
  300         self.attrs = {}  # type: Dict[str, str]
  301         self.parent = None  # type: Optional[Node]
  302         self.children = []  # type: List[Node]
  303 
  304         # Used in DOMBuilder.
  305         self._partial = False
  306         self._namespace = None  # type: Optional[str]
  307 
  308     # HTML representation of the node. Meant to be implemented by
  309     # subclasses.
  310     def __str__(self) -> str:  # pragma: no cover
  311         raise NotImplementedError
  312 
  313     def select(self, selector: SelectorGroupLike) -> Optional["Node"]:
  314         """DOM ``querySelector`` clone. Returns one match (if any)."""
  315         selector = self._normalize_selector(selector)
  316         for node in self._select_all(selector):
  317             return node
  318         return None
  319 
  320     def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]:
  321         """Alias of :meth:`select`."""
  322         return self.select(selector)
  323 
  324     def select_all(self, selector: SelectorGroupLike) -> List["Node"]:
  325         """DOM ``querySelectorAll`` clone. Returns all matches in a list."""
  326         selector = self._normalize_selector(selector)
  327         return list(self._select_all(selector))
  328 
  329     def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]:
  330         """Alias of :meth:`select_all`."""
  331         return self.select_all(selector)
  332 
  333     def matched_by(
  334         self, selector: SelectorGroupLike, root: Optional["Node"] = None
  335     ) -> bool:
  336         """
  337         Checks whether this node is matched by `selector`.
  338 
  339         See :meth:`SelectorGroup.matches()`.
  340         """
  341         selector = self._normalize_selector(selector)
  342         return selector.matches(self, root=root)
  343 
  344     @staticmethod
  345     def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup":
  346         if isinstance(selector, str):
  347             return SelectorGroup.from_str(selector)
  348         if isinstance(selector, SelectorGroup):
  349             return selector
  350         if isinstance(selector, Selector):
  351             return SelectorGroup([selector])
  352         raise ValueError("not a selector or group of selectors: %s" % repr(selector))
  353 
  354     def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]:
  355         for descendant in self.descendants():
  356             if selector.matches(descendant, root=self):
  357                 yield descendant
  358 
  359     def child_nodes(self) -> List["Node"]:
  360         return self.children
  361 
  362     def first_child(self) -> Optional["Node"]:
  363         if self.children:
  364             return self.children[0]
  365         else:
  366             return None
  367 
  368     def first_element_child(self) -> Optional["Node"]:
  369         for child in self.children:
  370             if isinstance(child, ElementNode):
  371                 return child
  372         return None
  373 
  374     def last_child(self) -> Optional["Node"]:
  375         if self.children:
  376             return self.children[-1]
  377         else:
  378             return None
  379 
  380     def last_element_child(self) -> Optional["Node"]:
  381         for child in reversed(self.children):
  382             if isinstance(child, ElementNode):
  383                 return child
  384         return None
  385 
  386     def next_sibling(self) -> Optional["Node"]:
  387         """.. note:: Not O(1), use with caution."""
  388         next_siblings = self.next_siblings()
  389         if next_siblings:
  390             return next_siblings[0]
  391         else:
  392             return None
  393 
  394     def next_siblings(self) -> List["Node"]:
  395         parent = self.parent
  396         if not parent:
  397             return []
  398         try:
  399             index = parent.children.index(self)
  400             return parent.children[index + 1 :]
  401         except ValueError:  # pragma: no cover
  402             raise ValueError("node is not found in children of its parent")
  403 
  404     def next_element_sibling(self) -> Optional["ElementNode"]:
  405         """.. note:: Not O(1), use with caution."""
  406         for sibling in self.next_siblings():
  407             if isinstance(sibling, ElementNode):
  408                 return sibling
  409         return None
  410 
  411     def previous_sibling(self) -> Optional["Node"]:
  412         """.. note:: Not O(1), use with caution."""
  413         previous_siblings = self.previous_siblings()
  414         if previous_siblings:
  415             return previous_siblings[0]
  416         else:
  417             return None
  418 
  419     def previous_siblings(self) -> List["Node"]:
  420         """
  421         Compared to the natural DOM order, the order of returned nodes
  422         are reversed. That is, the adjacent sibling (if any) is the
  423         first in the returned list.
  424         """
  425         parent = self.parent
  426         if not parent:
  427             return []
  428         try:
  429             index = parent.children.index(self)
  430             if index > 0:
  431                 return parent.children[index - 1 :: -1]
  432             else:
  433                 return []
  434         except ValueError:  # pragma: no cover
  435             raise ValueError("node is not found in children of its parent")
  436 
  437     def previous_element_sibling(self) -> Optional["ElementNode"]:
  438         """.. note:: Not O(1), use with caution."""
  439         for sibling in self.previous_siblings():
  440             if isinstance(sibling, ElementNode):
  441                 return sibling
  442         return None
  443 
  444     def ancestors(
  445         self, *, root: Optional["Node"] = None
  446     ) -> Generator["Node", None, None]:
  447         """
  448         Ancestors are generated in reverse order of depth, stopping at
  449         `root`.
  450 
  451         A :class:`RuntimeException` is raised if `root` is not in the
  452         ancestral chain.
  453         """
  454         if self is root:
  455             return
  456         ancestor = self.parent
  457         while ancestor is not root:
  458             if ancestor is None:
  459                 raise RuntimeError("provided root node not found in ancestral chain")
  460             yield ancestor
  461             ancestor = ancestor.parent
  462         if root:
  463             yield root
  464 
  465     def descendants(self) -> Generator["Node", None, None]:
  466         """Descendants are generated in depth-first order."""
  467         for child in self.children:
  468             yield child
  469             yield from child.descendants()
  470 
  471     def attr(self, attr: str) -> Optional[str]:
  472         """Returns the attribute if it exists on the node, otherwise ``None``."""
  473         return self.attrs.get(attr)
  474 
  475     @property
  476     def html(self) -> str:
  477         """
  478         HTML representation of the node.
  479 
  480         (For a :class:`TextNode`, :meth:`html` returns the escaped version of the
  481         text.
  482         """
  483         return str(self)
  484 
  485     def outer_html(self) -> str:
  486         """Alias of :attr:`html`."""
  487         return self.html
  488 
  489     def inner_html(self) -> str:
  490         """HTML representation of the node's children."""
  491         return "".join(child.html for child in self.children)
  492 
  493     @property
  494     def text(self) -> str:  # pragma: no cover
  495         """This property is expected to be implemented by subclasses."""
  496         raise NotImplementedError
  497 
  498     def text_content(self) -> str:
  499         """Alias of :attr:`text`."""
  500         return self.text
  501 
  502     @property
  503     def classes(self) -> List[str]:
  504         return self.attrs.get("class", "").split()
  505 
  506     def class_list(self) -> List[str]:
  507         return self.classes
  508 
  509 
  510 class ElementNode(Node):
  511     """
  512     Represents an element node.
  513 
  514     Note that tag and attribute names are case-insensitive; attribute
  515     values are case-sensitive.
  516     """
  517 
  518     def __init__(
  519         self,
  520         tag: str,
  521         attrs: Iterable[Tuple[str, Optional[str]]],
  522         *,
  523         parent: Optional["Node"] = None,
  524         children: Optional[Sequence["Node"]] = None
  525     ) -> None:
  526         Node.__init__(self)
  527         self.tag = tag.lower()  # type: str
  528         self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
  529         self.parent = parent
  530         self.children = list(children or [])
  531 
  532     def __repr__(self) -> str:
  533         s = "<" + self.tag
  534         if self.attrs:
  535             s += " attrs=%s" % repr(list(self.attrs.items()))
  536         if self.children:
  537             s += " children=%s" % repr(self.children)
  538         s += ">"
  539         return s
  540 
  541     # https://ipython.readthedocs.io/en/stable/api/generated/IPython.lib.pretty.html
  542     def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
  543         if cycle:
  544             raise RuntimeError("cycle detected in DOM tree")
  545         p.text("<\x1b[1m%s\x1b[0m" % self.tag)
  546         if self.attrs:
  547             p.text(" attrs=%s" % repr(list(self.attrs.items())))
  548         if self.children:
  549             p.text(" children=[")
  550             if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
  551                 p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
  552             else:
  553                 with p.indent(2):
  554                     for child in self.children:
  555                         p.break_()
  556                         if hasattr(child, "_repr_pretty_"):
  557                             child._repr_pretty_(p, False)  # type: ignore
  558                         else:
  559                             p.text("\x1b[4m%s\x1b[0m" % repr(child))
  560                         p.text(",")
  561                 p.break_()
  562             p.text("]")
  563         p.text(">")
  564 
  565     def __str__(self) -> str:
  566         """HTML representation of the node."""
  567         s = "<" + self.tag
  568         for attr, val in self.attrs.items():
  569             s += ' %s="%s"' % (attr, html.escape(val))
  570         if self.children:
  571             s += ">"
  572             s += "".join(str(child) for child in self.children)
  573             s += "</%s>" % self.tag
  574         else:
  575             if _tag_is_void(self.tag):
  576                 s += "/>"
  577             else:
  578                 s += "></%s>" % self.tag
  579         return s
  580 
  581     @property
  582     def text(self) -> str:
  583         """The concatenation of all descendant text nodes."""
  584         return "".join(child.text for child in self.children)
  585 
  586 
  587 class TextNode(str, Node):
  588     """
  589     Represents a text node.
  590 
  591     Subclasses :class:`Node` and :class:`str`.
  592     """
  593 
  594     def __new__(cls, text: str) -> "TextNode":
  595         s = str.__new__(cls, text)  # type: ignore
  596         s.parent = None
  597         return s  # type: ignore
  598 
  599     def __init__(self, text: str) -> None:
  600         Node.__init__(self)
  601 
  602     def __repr__(self) -> str:
  603         return "<%s>" % str.__repr__(self)
  604 
  605     # HTML-escaped form of the text node. use text() for unescaped
  606     # version.
  607     def __str__(self) -> str:
  608         return html.escape(self)
  609 
  610     def __eq__(self, other: object) -> bool:
  611         """
  612         Two text nodes are equal if and only if they are the same node.
  613 
  614         For string comparison, use :attr:`text`.
  615         """
  616         return self is other
  617 
  618     def __ne__(self, other: object) -> bool:
  619         """
  620         Two text nodes are non-equal if they are not the same node.
  621 
  622         For string comparison, use :attr:`text`.
  623         """
  624         return self is not other
  625 
  626     @property
  627     def text(self) -> str:
  628         return str.__str__(self)
  629 
  630 
  631 class DOMBuilderException(Exception):
  632     """
  633     Exception raised when :class:`DOMBuilder` detects a bad state.
  634 
  635     Attributes:
  636         pos (:class:`Tuple`\\[:class:`int`, :class:`int`]):
  637             Line number and offset in HTML input.
  638         why (:class:`str`):
  639             Reason of the exception.
  640     """
  641 
  642     def __init__(self, pos: Tuple[int, int], why: str) -> None:
  643         self.pos = pos
  644         self.why = why
  645 
  646     def __str__(self) -> str:  # pragma: no cover
  647         return "DOM builder aborted at %d:%d: %s" % (self.pos[0], self.pos[1], self.why)
  648 
  649 
  650 class DOMBuilder(HTMLParser):
  651     """
  652     HTML parser / DOM builder.
  653 
  654     Subclasses :class:`html.parser.HTMLParser`.
  655 
  656     Consume HTML and builds a :class:`Node` tree. Once finished, use
  657     :attr:`root` to access the root of the tree.
  658 
  659     This parser cannot parse malformed HTML with tag mismatch.
  660     """
  661 
  662     def __init__(self) -> None:
  663         super().__init__(convert_charrefs=True)
  664         # _stack is the stack for nodes. Each node is pushed to the
  665         # stack when its start tag is processed, and remains on the
  666         # stack until its parent node is completed (end tag processed),
  667         # at which point the node is attached to the parent node as a
  668         # child and popped from the stack.
  669         self._stack = []  # type: List[Node]
  670         # _namespace_stack is another stack tracking the parsing
  671         # context, which is generally the default namespace (None) but
  672         # changes when parsing foreign objects (e.g. 'svg' when parsing
  673         # an <svg>). The top element is always the current parsing
  674         # context, so popping works differently from _stack: an element
  675         # is popped as soon as the corresponding end tag is processed.
  676         self._namespace_stack = [None]  # type: List[Optional[str]]
  677 
  678     def handle_starttag(
  679         self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
  680     ) -> None:
  681         node = ElementNode(tag, attrs)
  682         node._partial = True
  683         self._stack.append(node)
  684         namespace = (
  685             tag.lower()
  686             if _tag_encloses_foreign_namespace(tag)
  687             else self._namespace_stack[-1]  # Inherit parent namespace
  688         )
  689         node._namespace = namespace
  690         self._namespace_stack.append(namespace)
  691         # For void elements (not in a foreign context), immediately
  692         # invoke the end tag handler (see handle_startendtag()).
  693         if not namespace and _tag_is_void(tag):
  694             self.handle_endtag(tag)
  695 
  696     def handle_endtag(self, tag: str) -> None:
  697         tag = tag.lower()
  698         children = []
  699         while self._stack and not self._stack[-1]._partial:
  700             children.append(self._stack.pop())
  701         if not self._stack:
  702             raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(tag))
  703         parent = self._stack[-1]
  704         if parent.tag != tag:
  705             raise DOMBuilderException(
  706                 self.getpos(),
  707                 "expecting end tag %s, got %s" % (repr(parent.tag), repr(tag)),
  708             )
  709         parent.children = list(reversed(children))
  710         parent._partial = False
  711         for child in children:
  712             child.parent = parent
  713         self._namespace_stack.pop()
  714 
  715     # Make parser behavior for explicitly and implicitly void elements
  716     # (e.g., <hr> vs <hr/>) consistent. The former triggers
  717     # handle_starttag only, whereas the latter triggers
  718     # handle_startendtag (which by default triggers both handle_starttag
  719     # and handle_endtag). See https://bugs.python.org/issue25258.
  720     #
  721     # An exception is foreign elements, which aren't considered void
  722     # elements but can be explicitly marked as self-closing according to
  723     # the HTML spec (e.g. <path/> is valid but <path> is not).
  724     # Therefore, both handle_starttag and handle_endtag must be called,
  725     # and handle_endtag should not be triggered from within
  726     # handle_starttag in that case.
  727     #
  728     # Note that for simplicity we do not check whether the foreign
  729     # element in question is allowed to be self-closing by spec. (The
  730     # SVG spec unfortunately doesn't provide a readily available list of
  731     # such elements.)
  732     #
  733     # https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
  734     def handle_startendtag(
  735         self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
  736     ) -> None:
  737         if self._namespace_stack[-1] or _tag_encloses_foreign_namespace(tag):
  738             self.handle_starttag(tag, attrs)
  739             self.handle_endtag(tag)
  740         else:
  741             self.handle_starttag(tag, attrs)
  742 
  743     def handle_data(self, text: str) -> None:
  744         if not self._stack:
  745             # Ignore text nodes before the first tag.
  746             return
  747         self._stack.append(TextNode(text))
  748 
  749     @property
  750     def root(self) -> "Node":
  751         """
  752         Finishes processing and returns the root node.
  753 
  754         Raises :class:`DOMBuilderException` if there is no root tag or
  755         root tag is not closed yet.
  756         """
  757         if not self._stack:
  758             raise DOMBuilderException(self.getpos(), "no root tag")
  759         if self._stack[0]._partial:
  760             raise DOMBuilderException(self.getpos(), "root tag not closed yet")
  761         return self._stack[0]
  762 
  763 
  764 def parse_html(html: str, *, ParserClass: type = DOMBuilder) -> "Node":
  765     """
  766     Parses HTML string, builds DOM, and returns root node.
  767 
  768     The parser may raise :class:`DOMBuilderException`.
  769 
  770     Args:
  771         html: input HTML string
  772         ParserClass: :class:`DOMBuilder` or a subclass
  773 
  774     Returns:
  775         Root note of the parsed tree. If the HTML string contains
  776         multiple top-level elements, only the first is returned and the
  777         rest are lost.
  778     """
  779     builder = ParserClass()  # type: DOMBuilder
  780     builder.feed(html)
  781     builder.close()
  782     return builder.root
  783 
  784 
  785 class SelectorParserException(Exception):
  786     """
  787     Exception raised when the selector parser fails to parse an input.
  788 
  789     Attributes:
  790         s (:class:`str`):
  791             The input string to be parsed.
  792         cursor (:class:`int`):
  793             Cursor position where the failure occurred.
  794         why (:class:`str`):
  795             Reason of the failure.
  796     """
  797 
  798     def __init__(self, s: str, cursor: int, why: str) -> None:
  799         self.s = s
  800         self.cursor = cursor
  801         self.why = why
  802 
  803     def __str__(self) -> str:  # pragma: no cover
  804         return "selector parser aborted at character %d of %s: %s" % (
  805             self.cursor,
  806             repr(self.s),
  807             self.why,
  808         )
  809 
  810 
  811 class SelectorGroup:
  812     """
  813     Represents a group of CSS selectors.
  814 
  815     A group of CSS selectors is simply a comma-separated list of
  816     selectors. [#]_ See :class:`Selector` documentation for the scope of
  817     support.
  818 
  819     Typically, a :class:`SelectorGroup` is constructed from a string
  820     (e.g., ``th.center, td.center``) using the factory function
  821     :meth:`from_str`.
  822 
  823     .. [#] https://www.w3.org/TR/selectors-3/#grouping
  824     """
  825 
  826     def __init__(self, selectors: Iterable["Selector"]) -> None:
  827         self._selectors = list(selectors)
  828 
  829     def __repr__(self) -> str:
  830         return "<SelectorGroup %s>" % repr(str(self))
  831 
  832     def __str__(self) -> str:
  833         return ", ".join(str(selector) for selector in self._selectors)
  834 
  835     def __len__(self) -> int:
  836         return len(self._selectors)
  837 
  838     def __getitem__(self, index: int) -> "Selector":
  839         return self._selectors[index]
  840 
  841     def __iter__(self) -> Iterator["Selector"]:
  842         return iter(self._selectors)
  843 
  844     @classmethod
  845     def from_str(cls, s: str) -> "SelectorGroup":
  846         """
  847         Parses input string into a group of selectors.
  848 
  849         :class:`SelectorParserException` is raised on invalid input. See
  850         :class:`Selector` documentation for the scope of support.
  851 
  852         Args:
  853             s: input string
  854 
  855         Returns:
  856             Parsed group of selectors.
  857         """
  858         i = 0
  859         selectors = []
  860         while i < len(s):
  861             selector, i = Selector.from_str(s, i)
  862             selectors.append(selector)
  863         if not selectors:
  864             raise SelectorParserException(s, i, "selector group is empty")
  865         return cls(selectors)
  866 
  867     def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
  868         """
  869         Decides whether the group of selectors matches `node`.
  870 
  871         The group of selectors matches `node` as long as one of the
  872         selectors matches `node`.
  873 
  874         If `root` is provided and child and/or descendant combinators
  875         are involved, parent/ancestor lookup terminates at `root`.
  876         """
  877         return any(selector.matches(node, root=root) for selector in self)
  878 
  879 
  880 class Selector:
  881     """
  882     Represents a CSS selector.
  883 
  884     Recall that a CSS selector is a chain of one or more *sequences of
  885     simple selectors* separated by *combinators*. [#selectors-3]_ This
  886     concept is represented as a cons list of sequences of simple
  887     selectors (in right to left order). This class in fact holds a
  888     single sequence, with an optional combinator and reference to the
  889     previous sequence.
  890 
  891     For instance, ``main#main p.important.definition >
  892     a.term[id][href]`` would be parsed into (schematically) the
  893     following structure::
  894 
  895         ">" tag='a' classes=('term') attrs=([id], [href]) ~>
  896         " " tag='p' classes=('important', 'definition') ~>
  897         tag='main' id='main'
  898 
  899     Each line is held in a separate instance of :class:`Selector`,
  900     linked together by the :attr:`previous` attribute.
  901 
  902     Supported grammar (from selectors level 3 [#selectors-3]_):
  903 
  904     - Type selectors;
  905     - Universal selectors;
  906     - Class selectors;
  907     - ID selectors;
  908     - Attribute selectors;
  909     - Combinators.
  910 
  911     Unsupported grammar:
  912 
  913     - Pseudo-classes;
  914     - Pseudo-elements;
  915     - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any
  916       selector.
  917 
  918     Rationale:
  919 
  920     - Pseudo-classes have too many variants, a few of which even
  921       complete with an admittedly not-so-complex minilanguage. These add
  922       up to a lot of code.
  923     - Pseudo-elements are useless outside rendering contexts, hence out of
  924       scope.
  925     - Namespace support is too niche to be worth the parsing headache.
  926       *Using namespace prefixes may confuse the parser!*
  927 
  928     Note that the parser only loosely follows the spec and priotizes
  929     ease of parsing (which includes readability and *writability* of
  930     regexes), so some invalid selectors may be accepted (in fact, false
  931     positives abound, but accepting valid inputs is a much more
  932     important goal than rejecting invalid inputs for this library), and
  933     some valid selectors may be rejected (but as long as you stick to
  934     the scope outlined above and common sense you should be fine; the
  935     false negatives shouldn't be used by actual human beings anyway).
  936 
  937     In particular, whitespace character is simplified to ``\\s`` (ASCII
  938     mode) despite CSS spec not counting U+000B (VT) as whitespace,
  939     identifiers are simplified to ``[\\w-]+`` (ASCII mode), and strings
  940     (attribute selector values can be either identifiers or strings)
  941     allow escaped quotes (i.e., ``\\'`` inside single-quoted strings and
  942     ``\\"`` inside double-quoted strings) but everything else is
  943     interpreted literally. The exact specs for CSS identifiers and
  944     strings can be found at [#]_.
  945 
  946     Certain selectors and combinators may be implemented in the parser
  947     but not implemented in matching and/or selection APIs.
  948 
  949     .. [#selectors-3] https://www.w3.org/TR/selectors-3/
  950     .. [#] https://www.w3.org/TR/CSS21/syndata.html
  951 
  952     Attributes:
  953         tag (:class:`Optional`\\[:class:`str`]):
  954             Type selector.
  955         classes (:class:`List`\\[:class:`str`]):
  956             Class selectors.
  957         id (:class:`Optional`\\[:class:`str`]):
  958             ID selector.
  959         attrs (:class:`List`\\[:class:`AttributeSelector`]):
  960             Attribute selectors.
  961         combinator (:class:`Optional`\\[:class:`Combinator`]):
  962             Combinator with the previous sequence of simple selectors in
  963             chain.
  964         previous (:class:`Optional`\\[:class:`Selector`]):
  965             Reference to the previous sequence of simple selectors in
  966             chain.
  967 
  968     """
  969 
  970     def __init__(
  971         self,
  972         *,
  973         tag: Optional[str] = None,
  974         classes: Optional[Sequence[str]] = None,
  975         id: Optional[str] = None,
  976         attrs: Optional[Sequence["AttributeSelector"]] = None,
  977         combinator: Optional["Combinator"] = None,
  978         previous: Optional["Selector"] = None
  979     ) -> None:
  980         self.tag = tag.lower() if tag else None
  981         self.classes = list(classes or [])
  982         self.id = id
  983         self.attrs = list(attrs or [])
  984         self.combinator = combinator
  985         self.previous = previous
  986 
  987     def __repr__(self) -> str:
  988         return "<Selector %s>" % repr(str(self))
  989 
  990     def __str__(self) -> str:
  991         sequences = []
  992         delimiters = []
  993         seq = self
  994         while True:
  995             sequences.append(seq._sequence_str_())
  996             if seq.previous:
  997                 if seq.combinator == Combinator.DESCENDANT:
  998                     delimiters.append(" ")
  999                 elif seq.combinator == Combinator.CHILD:
 1000                     delimiters.append(" > ")
 1001                 elif seq.combinator == Combinator.NEXT_SIBLING:
 1002                     delimiters.append(" + ")
 1003                 elif seq.combinator == Combinator.SUBSEQUENT_SIBLING:
 1004                     delimiters.append(" ~ ")
 1005                 else:  # pragma: no cover
 1006                     raise RuntimeError(
 1007                         "unimplemented combinator: %s" % repr(self.combinator)
 1008                     )
 1009                 seq = seq.previous
 1010             else:
 1011                 delimiters.append("")
 1012                 break
 1013         return "".join(
 1014             delimiter + sequence
 1015             for delimiter, sequence in zip(reversed(delimiters), reversed(sequences))
 1016         )
 1017 
 1018     # Format a single sequence of simple selectors, without combinator.
 1019     def _sequence_str_(self) -> str:
 1020         s = ""
 1021         if self.tag:
 1022             s += self.tag
 1023         if self.classes:
 1024             s += "".join(".%s" % class_ for class_ in self.classes)
 1025         if self.id:
 1026             s += "#%s" % self.id
 1027         if self.attrs:
 1028             s += "".join(str(attr) for attr in self.attrs)
 1029         return s if s else "*"
 1030 
 1031     @classmethod
 1032     def from_str(cls, s: str, cursor: int = 0) -> Tuple["Selector", int]:
 1033         """
 1034         Parses input string into selector.
 1035 
 1036         This factory function only parses out one selector (up to a
 1037         comma or EOS), so partial consumption is allowed --- an optional
 1038         `cursor` is taken as input (0 by default) and the moved cursor
 1039         (either after the comma or at EOS) is returned as part of the
 1040         output.
 1041 
 1042         :class:`SelectorParserException` is raised on invalid input. See
 1043         :class:`Selector` documentation for the scope of support.
 1044 
 1045         If you need to completely consume a string representing
 1046         (potentially) a group of selectors, use
 1047         :meth:`SelectorGroup.from_str()`.
 1048 
 1049         Args:
 1050             s:      input string
 1051             cursor: initial cursor position on `s`
 1052 
 1053         Returns:
 1054             A tuple containing the parsed selector and the moved the
 1055             cursor (either after a comma-delimiter, or at EOS).
 1056         """
 1057         # Simple selectors.
 1058         TYPE_SEL = re.compile(r"[\w-]+", re.A)
 1059         UNIVERSAL_SEL = re.compile(r"\*")
 1060         ATTR_SEL = re.compile(
 1061             r"""\[
 1062             \s*(?P<attr>[\w-]+)\s*
 1063             (
 1064                 (?P<op>[~|^$*]?=)\s*
 1065                 (
 1066                     (?P<val_identifier>[\w-]+)|
 1067                     (?P<val_string>
 1068                         (?P<quote>['"])
 1069                         (?P<val_string_inner>.*?)
 1070                         (?<!\\)(?P=quote)
 1071                     )
 1072                 )\s*
 1073             )?
 1074             \]""",
 1075             re.A | re.X,
 1076         )
 1077         CLASS_SEL = re.compile(r"\.([\w-]+)", re.A)
 1078         ID_SEL = re.compile(r"#([\w-]+)", re.A)
 1079         PSEUDO_CLASS_SEL = re.compile(r":[\w-]+(\([^)]+\))?", re.A)
 1080         PSEUDO_ELEM_SEL = re.compile(r"::[\w-]+", re.A)
 1081 
 1082         # Combinators
 1083         DESCENDANT_COM = re.compile(r"\s+")
 1084         CHILD_COM = re.compile(r"\s*>\s*")
 1085         NEXT_SIB_COM = re.compile(r"\s*\+\s*")
 1086         SUB_SIB_COM = re.compile(r"\s*~\s*")
 1087 
 1088         # Misc
 1089         WHITESPACE = re.compile(r"\s*")
 1090         END_OF_SELECTOR = re.compile(r"\s*($|,)")
 1091 
 1092         tag = None
 1093         classes = []
 1094         id = None
 1095         attrs = []
 1096         combinator = None
 1097 
 1098         selector = None
 1099         previous_combinator = None
 1100 
 1101         i = cursor
 1102 
 1103         # Skip leading whitespace
 1104         m = WHITESPACE.match(s, i)
 1105         if m:
 1106             i = m.end()
 1107 
 1108         while i < len(s):
 1109             # Parse one simple selector.
 1110             #
 1111             # PEP 572 (assignment expressions; the one that burned Guido
 1112             # so much that he resigned as BDFL) would have been nice; it
 1113             # would have saved us from all the regex match
 1114             # reassignments, and worse still, the casts, since mypy
 1115             # complains about getting Optional[Match[str]] instead of
 1116             # Match[str].
 1117             if TYPE_SEL.match(s, i):
 1118                 if tag:
 1119                     raise SelectorParserException(s, i, "multiple type selectors found")
 1120                 m = cast(Match[str], TYPE_SEL.match(s, i))
 1121                 tag = m.group()
 1122             elif UNIVERSAL_SEL.match(s, i):
 1123                 m = cast(Match[str], UNIVERSAL_SEL.match(s, i))
 1124             elif ATTR_SEL.match(s, i):
 1125                 m = cast(Match[str], ATTR_SEL.match(s, i))
 1126 
 1127                 attr = m.group("attr")
 1128                 op = m.group("op")
 1129                 val_identifier = m.group("val_identifier")
 1130                 quote = m.group("quote")
 1131                 val_string_inner = m.group("val_string_inner")
 1132                 if val_identifier is not None:
 1133                     val = val_identifier
 1134                 elif val_string_inner is not None:
 1135                     val = val_string_inner.replace("\\" + quote, quote)
 1136                 else:
 1137                     val = None
 1138 
 1139                 if op is None:
 1140                     type = AttributeSelectorType.BARE
 1141                 elif op == "=":
 1142                     type = AttributeSelectorType.EQUAL
 1143                 elif op == "~=":
 1144                     type = AttributeSelectorType.TILDE
 1145                 elif op == "|=":
 1146                     type = AttributeSelectorType.PIPE
 1147                 elif op == "^=":
 1148                     type = AttributeSelectorType.CARET
 1149                 elif op == "$=":
 1150                     type = AttributeSelectorType.DOLLAR
 1151                 elif op == "*=":
 1152                     type = AttributeSelectorType.ASTERISK
 1153                 else:  # pragma: no cover
 1154                     raise SelectorParserException(
 1155                         s,
 1156                         i,
 1157                         "unrecognized operator %s in attribute selector" % repr(op),
 1158                     )
 1159 
 1160                 attrs.append(AttributeSelector(attr, val, type))
 1161             elif CLASS_SEL.match(s, i):
 1162                 m = cast(Match[str], CLASS_SEL.match(s, i))
 1163                 classes.append(m.group(1))
 1164             elif ID_SEL.match(s, i):
 1165                 if id:
 1166                     raise SelectorParserException(s, i, "multiple id selectors found")
 1167                 m = cast(Match[str], ID_SEL.match(s, i))
 1168                 id = m.group(1)
 1169             elif PSEUDO_CLASS_SEL.match(s, i):
 1170                 raise SelectorParserException(s, i, "pseudo-classes not supported")
 1171             elif PSEUDO_ELEM_SEL.match(s, i):
 1172                 raise SelectorParserException(s, i, "pseudo-elements not supported")
 1173             else:
 1174                 raise SelectorParserException(
 1175                     s, i, "expecting simple selector, found none"
 1176                 )
 1177             i = m.end()
 1178 
 1179             # Try to parse a combinator, or end the selector.
 1180             if CHILD_COM.match(s, i):
 1181                 m = cast(Match[str], CHILD_COM.match(s, i))
 1182                 combinator = Combinator.CHILD
 1183             elif NEXT_SIB_COM.match(s, i):
 1184                 m = cast(Match[str], NEXT_SIB_COM.match(s, i))
 1185                 combinator = Combinator.NEXT_SIBLING
 1186             elif SUB_SIB_COM.match(s, i):
 1187                 m = cast(Match[str], SUB_SIB_COM.match(s, i))
 1188                 combinator = Combinator.SUBSEQUENT_SIBLING
 1189             elif END_OF_SELECTOR.match(s, i):
 1190                 m = cast(Match[str], END_OF_SELECTOR.match(s, i))
 1191                 combinator = None
 1192             # Need to parse descendant combinator at the very end
 1193             # because it could be a prefix to all previous cases.
 1194             elif DESCENDANT_COM.match(s, i):
 1195                 m = cast(Match[str], DESCENDANT_COM.match(s, i))
 1196                 combinator = Combinator.DESCENDANT
 1197             else:
 1198                 continue
 1199             i = m.end()
 1200 
 1201             if combinator and i == len(s):
 1202                 raise SelectorParserException(s, i, "unexpected end at combinator")
 1203 
 1204             selector = cls(
 1205                 tag=tag,
 1206                 classes=classes,
 1207                 id=id,
 1208                 attrs=attrs,
 1209                 combinator=previous_combinator,
 1210                 previous=selector,
 1211             )
 1212             previous_combinator = combinator
 1213 
 1214             # End of selector.
 1215             if combinator is None:
 1216                 break
 1217 
 1218             tag = None
 1219             classes = []
 1220             id = None
 1221             attrs = []
 1222             combinator = None
 1223 
 1224         if not selector:
 1225             raise SelectorParserException(s, i, "selector is empty")
 1226 
 1227         return selector, i
 1228 
 1229     def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
 1230         """
 1231         Decides whether the selector matches `node`.
 1232 
 1233         Each sequence of simple selectors in the selector's chain must
 1234         be matched for a positive.
 1235 
 1236         If `root` is provided and child and/or descendant combinators
 1237         are involved, parent/ancestor lookup terminates at `root`.
 1238         """
 1239         if self.tag:
 1240             if not node.tag or node.tag != self.tag:
 1241                 return False
 1242         if self.id:
 1243             if node.attrs.get("id") != self.id:
 1244                 return False
 1245         if self.classes:
 1246             classes = node.classes
 1247             for class_ in self.classes:
 1248                 if class_ not in classes:
 1249                     return False
 1250         if self.attrs:
 1251             for attr_selector in self.attrs:
 1252                 if not attr_selector.matches(node):
 1253                     return False
 1254 
 1255         if not self.previous:
 1256             return True
 1257 
 1258         if self.combinator == Combinator.DESCENDANT:
 1259             return any(
 1260                 self.previous.matches(ancestor, root=root)
 1261                 for ancestor in node.ancestors()
 1262             )
 1263         elif self.combinator == Combinator.CHILD:
 1264             if node is root or node.parent is None:
 1265                 return False
 1266             else:
 1267                 return self.previous.matches(node.parent)
 1268         elif self.combinator == Combinator.NEXT_SIBLING:
 1269             sibling = node.previous_element_sibling()
 1270             if not sibling:
 1271                 return False
 1272             else:
 1273                 return self.previous.matches(sibling)
 1274         elif self.combinator == Combinator.SUBSEQUENT_SIBLING:
 1275             return any(
 1276                 self.previous.matches(sibling, root=root)
 1277                 for sibling in node.previous_siblings()
 1278                 if isinstance(sibling, ElementNode)
 1279             )
 1280         else:  # pragma: no cover
 1281             raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator))
 1282 
 1283 
 1284 class AttributeSelector:
 1285     """
 1286     Represents an attribute selector.
 1287 
 1288     Attributes:
 1289         attr (:class:`str`)
 1290         val  (:class:`Optional`\\[:class:`str`])
 1291         type (:class:`AttributeSelectorType`)
 1292     """
 1293 
 1294     def __init__(
 1295         self, attr: str, val: Optional[str], type: "AttributeSelectorType"
 1296     ) -> None:
 1297         self.attr = attr.lower()
 1298         self.val = val
 1299         self.type = type
 1300 
 1301     def __repr__(self) -> str:
 1302         return "<AttributeSelector %s>" % repr(str(self))
 1303 
 1304     def __str__(self) -> str:
 1305         if self.type == AttributeSelectorType.BARE:
 1306             fmt = "[{attr}{val:.0}]"
 1307         elif self.type == AttributeSelectorType.EQUAL:
 1308             fmt = "[{attr}={val}]"
 1309         elif self.type == AttributeSelectorType.TILDE:
 1310             fmt = "[{attr}~={val}]"
 1311         elif self.type == AttributeSelectorType.PIPE:
 1312             fmt = "[{attr}|={val}]"
 1313         elif self.type == AttributeSelectorType.CARET:
 1314             fmt = "[{attr}^={val}]"
 1315         elif self.type == AttributeSelectorType.DOLLAR:
 1316             fmt = "[{attr}$={val}]"
 1317         elif self.type == AttributeSelectorType.ASTERISK:
 1318             fmt = "[{attr}*={val}]"
 1319         return fmt.format(attr=self.attr, val=repr(self.val))
 1320 
 1321     def matches(self, node: "Node") -> bool:
 1322         val = node.attrs.get(self.attr)
 1323         if val is None:
 1324             return False
 1325         if self.type == AttributeSelectorType.BARE:
 1326             return True
 1327         elif self.type == AttributeSelectorType.EQUAL:
 1328             return val == self.val
 1329         elif self.type == AttributeSelectorType.TILDE:
 1330             return self.val in val.split()
 1331         elif self.type == AttributeSelectorType.PIPE:
 1332             return val == self.val or val.startswith("%s-" % self.val)
 1333         elif self.type == AttributeSelectorType.CARET:
 1334             return bool(self.val and val.startswith(self.val))
 1335         elif self.type == AttributeSelectorType.DOLLAR:
 1336             return bool(self.val and val.endswith(self.val))
 1337         elif self.type == AttributeSelectorType.ASTERISK:
 1338             return bool(self.val and self.val in val)
 1339         else:  # pragma: no cover
 1340             raise RuntimeError("unimplemented attribute selector: %s" % repr(self.type))
 1341 
 1342 
 1343 # Enum: basis for poor man's algebraic data type.
class AttributeSelectorType(Enum):
    """
    Attribute selector types.

    Members correspond to the following forms of attribute selector:

    - :attr:`BARE`: ``[attr]``;
    - :attr:`EQUAL`: ``[attr=val]``;
    - :attr:`TILDE`: ``[attr~=val]``;
    - :attr:`PIPE`: ``[attr|=val]``;
    - :attr:`CARET`: ``[attr^=val]``;
    - :attr:`DOLLAR`: ``[attr$=val]``;
    - :attr:`ASTERISK`: ``[attr*=val]``.

    How each form is evaluated is implemented in
    :meth:`AttributeSelector.matches`.
    """

    # [attr] -- attribute is present
    BARE = 1
    # [attr=val] -- attribute value equals val
    EQUAL = 2
    # [attr~=val] -- val is one of the whitespace-separated words
    TILDE = 3
    # [attr|=val] -- attribute value is val or starts with "val-"
    PIPE = 4
    # [attr^=val] -- attribute value starts with val
    CARET = 5
    # [attr$=val] -- attribute value ends with val
    DOLLAR = 6
    # [attr*=val] -- attribute value contains val
    ASTERISK = 7
 1373 
 1374 
class Combinator(Enum):
    """
    Combinator types.

    Members correspond to the following combinators:

    - :attr:`DESCENDANT`: ``A B``;
    - :attr:`CHILD`: ``A > B``;
    - :attr:`NEXT_SIBLING`: ``A + B``;
    - :attr:`SUBSEQUENT_SIBLING`: ``A ~ B``.

    A :class:`Selector` stores the combinator that links it to its
    :attr:`Selector.previous` sequence.
    """

    # ' ' -- B has A among its ancestors
    DESCENDANT = 1
    # > -- B's parent is A
    CHILD = 2
    # + -- B's previous element sibling is A
    NEXT_SIBLING = 3
    # ~ -- A is one of B's previous element siblings
    SUBSEQUENT_SIBLING = 4
 1395 
 1396 
 1397 def _tag_is_void(tag: str) -> bool:
 1398     """
 1399     Checks whether the tag corresponds to a void element.
 1400 
 1401     https://www.w3.org/TR/html5/syntax.html#void-elements
 1402     https://html.spec.whatwg.org/multipage/syntax.html#void-elements
 1403     """
 1404     return tag.lower() in (
 1405         "area",
 1406         "base",
 1407         "br",
 1408         "col",
 1409         "embed",
 1410         "hr",
 1411         "img",
 1412         "input",
 1413         "link",
 1414         "meta",
 1415         "param",
 1416         "source",
 1417         "track",
 1418         "wbr",
 1419     )
 1420 
 1421 
 1422 def _tag_encloses_foreign_namespace(tag: str) -> bool:
 1423     """
 1424     Checks whether the tag encloses a foreign namespace (MathML or SVG).
 1425 
 1426     https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
 1427     """
 1428     return tag.lower() in ("math", "svg")
 1429 
 1430 
 1431 ### end dim ###
 1432 
 1433 
 1434 # Global helper functions
 1435 
 1436 def open_url(url):
 1437     """Open an URL in the user's default web browser.
 1438 
 1439     The string attribute ``open_url.url_handler`` can be used to open URLs
 1440     in a custom CLI script or utility. A subprocess is spawned with url as
 1441     the parameter in this case instead of the usual webbrowser.open() call.
 1442 
 1443     Whether the browser's output (both stdout and stderr) are suppressed
 1444     depends on the boolean attribute ``open_url.suppress_browser_output``.
 1445     If the attribute is not set upon a call, set it to a default value,
 1446     which means False if BROWSER is set to a known text-based browser --
 1447     elinks, links, lynx, w3m or 'www-browser'; or True otherwise.
 1448 
 1449     The string attribute ``open_url.override_text_browser`` can be used to
 1450     ignore env var BROWSER as well as some known text-based browsers and
 1451     attempt to open url in a GUI browser available.
 1452     Note: If a GUI browser is indeed found, this option ignores the program
 1453           option `show-browser-logs`
 1454     """
 1455     logger.debug('Opening %s', url)
 1456 
 1457     # Custom URL handler gets max priority
 1458     if hasattr(open_url, 'url_handler'):
 1459         subprocess.run([open_url.url_handler, url])
 1460         return
 1461 
 1462     browser = webbrowser.get()
 1463     if open_url.override_text_browser:
 1464         browser_output = open_url.suppress_browser_output
 1465         for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
 1466             browser = webbrowser.get(name)
 1467             logger.debug(browser)
 1468 
 1469             # Found a GUI browser, suppress browser output
 1470             open_url.suppress_browser_output = True
 1471             break
 1472 
 1473     if open_url.suppress_browser_output:
 1474         _stderr = os.dup(2)
 1475         os.close(2)
 1476         _stdout = os.dup(1)
 1477         # Patch for GUI browsers on WSL
 1478         if "microsoft" not in platform.uname()[3].lower():
 1479             os.close(1)
 1480         fd = os.open(os.devnull, os.O_RDWR)
 1481         os.dup2(fd, 2)
 1482         os.dup2(fd, 1)
 1483     try:
 1484         browser.open(url, new=2)
 1485     finally:
 1486         if open_url.suppress_browser_output:
 1487             os.close(fd)
 1488             os.dup2(_stderr, 2)
 1489             os.dup2(_stdout, 1)
 1490 
 1491     if open_url.override_text_browser:
 1492         open_url.suppress_browser_output = browser_output
 1493 
 1494 
 1495 def printerr(msg):
 1496     """Print message, verbatim, to stderr.
 1497 
 1498     ``msg`` could be any stringifiable value.
 1499     """
 1500     print(msg, file=sys.stderr)
 1501 
 1502 
 1503 def unwrap(text):
 1504     """Unwrap text."""
 1505     lines = text.split('\n')
 1506     result = ''
 1507     for i in range(len(lines) - 1):
 1508         result += lines[i]
 1509         if not lines[i]:
 1510             # Paragraph break
 1511             result += '\n\n'
 1512         elif lines[i + 1]:
 1513             # Next line is not paragraph break, add space
 1514             result += ' '
 1515     # Handle last line
 1516     result += lines[-1] if lines[-1] else '\n'
 1517     return result
 1518 
 1519 
 1520 def check_stdout_encoding():
 1521     """Make sure stdout encoding is utf-8.
 1522 
 1523     If not, print error message and instructions, then exit with
 1524     status 1.
 1525 
 1526     This function is a no-op on win32 because encoding on win32 is
 1527     messy, and let's just hope for the best. /s
 1528     """
 1529     if sys.platform == 'win32':
 1530         return
 1531 
 1532     # Use codecs.lookup to resolve text encoding alias
 1533     encoding = codecs.lookup(sys.stdout.encoding).name
 1534     if encoding != 'utf-8':
 1535         locale_lang, locale_encoding = locale.getlocale()
 1536         if locale_lang is None:
 1537             locale_lang = '<unknown>'
 1538         if locale_encoding is None:
 1539             locale_encoding = '<unknown>'
 1540         ioencoding = os.getenv('PYTHONIOENCODING', 'not set')
 1541         sys.stderr.write(unwrap(textwrap.dedent("""\
 1542         stdout encoding '{encoding}' detected. googler requires utf-8 to
 1543         work properly. The wrong encoding may be due to a non-UTF-8
 1544         locale or an improper PYTHONIOENCODING. (For the record, your
 1545         locale language is {locale_lang} and locale encoding is
 1546         {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.)
 1547 
 1548         Please set a UTF-8 locale (e.g., en_US.UTF-8) or set
 1549         PYTHONIOENCODING to utf-8.
 1550         """.format(
 1551             encoding=encoding,
 1552             locale_lang=locale_lang,
 1553             locale_encoding=locale_encoding,
 1554             ioencoding=ioencoding,
 1555         ))))
 1556         sys.exit(1)
 1557 
 1558 
 1559 def time_it(description=None):
 1560     def decorator(func):
 1561         @functools.wraps(func)
 1562         def wrapped(*args, **kwargs):
 1563             # Only profile in debug mode.
 1564             if not logger.isEnabledFor(logging.DEBUG):
 1565                 return func(*args, **kwargs)
 1566 
 1567             import time
 1568             mark = time.perf_counter()
 1569             ret = func(*args, **kwargs)
 1570             duration = time.perf_counter() - mark
 1571             logger.debug('%s completed in \x1b[33m%.3fs\x1b[0m', description or func.__name__, duration)
 1572             return ret
 1573 
 1574         return wrapped
 1575 
 1576     return decorator
 1577 
 1578 
 1579 # Classes
 1580 
 1581 class HardenedHTTPSConnection(HTTPSConnection):
 1582     """Overrides HTTPSConnection.connect to specify TLS version
 1583 
 1584     NOTE: TLS 1.2 is supported from Python 3.4
 1585     """
 1586 
 1587     def __init__(self, host, address_family=0, **kwargs):
 1588         HTTPSConnection.__init__(self, host, **kwargs)
 1589         self.address_family = address_family
 1590 
    def connect(self, notweak=False):
        """Connect to the host, wrapping the socket in TLS 1.2+ if possible.

        A socket is first obtained from ``create_socket_connection()``.
        Unless ``notweak`` is true, Linux-only socket options are applied
        and an SSL context restricted to TLS 1.2 or newer is used to wrap
        the socket. In every other case (tunnelled connection, ``notweak``
        set, or no suitable ``ssl`` protocol constant) the stock
        ``HTTPSConnection.connect`` is called instead.

        Args:
            notweak: when true, skip both the socket options and the
                hand-built TLS context and go straight to the fallback.
        """
        sock = self.create_socket_connection()

        # Optimizations not available on OS X
        if not notweak and sys.platform.startswith('linux'):
            try:
                sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
            except OSError:
                # Doesn't work on Windows' Linux subsystem (#179)
                logger.debug('setsockopt failed')

        if getattr(self, '_tunnel_host', None):
            # Tunnelled connection: keep the raw socket and fall through
            # to the fallback below.
            self.sock = sock
        elif not notweak:
            # Try to use TLS 1.2
            ssl_context = None
            if hasattr(ssl, 'PROTOCOL_TLS'):
                # Since Python 3.5.3
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
                if hasattr(ssl_context, "minimum_version"):
                    # Python 3.7 with OpenSSL 1.1.0g or later
                    ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
                else:
                    # No minimum_version attribute: rule out every older
                    # protocol one by one instead.
                    ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
                # Since Python 3.4
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
            if ssl_context:
                # NOTE(review): no server_hostname is passed and the
                # context is built bare, so presumably neither SNI nor
                # certificate/hostname verification happens here -- confirm
                # this is intended.
                self.sock = ssl_context.wrap_socket(sock)
                return

        # Fallback
        # NOTE(review): the stock connect() presumably opens its own
        # socket, which would leave `sock` created above unused -- verify.
        HTTPSConnection.connect(self)
 1627 
 1628     # Adapted from socket.create_connection.
 1629     # https://github.com/python/cpython/blob/bce4ddafdd188cc6deb1584728b67b9e149ca6a4/Lib/socket.py#L771-L813
 1630     def create_socket_connection(self):
 1631         err = None
 1632         results = socket.getaddrinfo(self.host, self.port, self.address_family, socket.SOCK_STREAM)
 1633         # Prefer IPv4 if address family isn't explicitly specified.
 1634         if self.address_family == 0:
 1635             results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2)
 1636         for af, socktype, proto, canonname, sa in results:
 1637             sock = None
 1638             try:
 1639                 sock = socket.socket(af, socktype, proto)
 1640                 if self.timeout is not None:
 1641                     sock.settimeout(self.timeout)
 1642                 if self.source_address:
 1643                     sock.bind(self.source_address)
 1644                 sock.connect(sa)
 1645                 # Break explicitly a reference cycle
 1646                 err = None
 1647                 self.address_family = af
 1648                 logger.debug('Opened socket to %s:%d',
 1649                              sa[0] if af == socket.AF_INET else ('[%s]' % sa[0]),
 1650                              sa[1])
 1651                 return sock
 1652 
 1653             except socket.error as _:
 1654                 err = _
 1655                 if sock is not None:
 1656                     sock.close()
 1657 
 1658         if err is not None:
 1659             try:
 1660                 raise err
 1661             finally:
 1662                 # Break explicitly a reference cycle
 1663                 err = None
 1664         else:
 1665             raise socket.error("getaddrinfo returns an empty list")
 1666 
 1667 
 1668 class GoogleUrl(object):
 1669     """
 1670     This class constructs the Google Search/News URL.
 1671 
 1672     This class is modelled on urllib.parse.ParseResult for familiarity,
 1673     which means it supports reading of all six attributes -- scheme,
 1674     netloc, path, params, query, fragment -- of
 1675     urllib.parse.ParseResult, as well as the geturl() method.
 1676 
 1677     However, the attributes (properties) and methods listed below should
 1678     be the preferred methods of access to this class.
 1679 
 1680     Parameters
 1681     ----------
 1682     opts : dict or argparse.Namespace, optional
 1683         See the ``opts`` parameter of `update`.
 1684 
 1685     Other Parameters
 1686     ----------------
 1687     See "Other Parameters" of `update`.
 1688 
 1689     Attributes
 1690     ----------
 1691     hostname : str
 1692         Read-write property.
 1693     keywords : str or list of strs
 1694         Read-write property.
 1695     news : bool
 1696         Read-only property.
 1697     videos : bool
 1698         Read-only property.
 1699     url : str
 1700         Read-only property.
 1701 
 1702     Methods
 1703     -------
 1704     full()
 1705     relative()
 1706     update(opts=None, **kwargs)
 1707     set_queries(**kwargs)
 1708     unset_queries(*args)
 1709     next_page()
 1710     prev_page()
 1711     first_page()
 1712 
 1713     """
 1714 
 1715     def __init__(self, opts=None, **kwargs):
 1716         self.scheme = 'https'
 1717         # self.netloc is a calculated property
 1718         self.path = '/search'
 1719         self.params = ''
 1720         # self.query is a calculated property
 1721         self.fragment = ''
 1722 
 1723         self._tld = None
 1724         self._num = 10
 1725         self._start = 0
 1726         self._keywords = []
 1727         self._sites = None
 1728         self._exclude = None
 1729 
 1730         self._query_dict = {
 1731             'ie': 'UTF-8',
 1732             'oe': 'UTF-8',
 1733             #'gbv': '1',  # control the presence of javascript on the page, 1=no js, 2=js
 1734             'sei': base64.encodebytes(uuid.uuid4().bytes).decode("ascii").rstrip('=\n').replace('/', '_'),
 1735         }
 1736 
 1737         # In preloaded HTML parsing mode, set keywords to something so
 1738         # that we are not tripped up by require_keywords.
 1739         if opts.html_file and not opts.keywords:
 1740             opts.keywords = ['<debug>']
 1741 
 1742         self.update(opts, **kwargs)
 1743 
 1744     def __str__(self):
 1745         return self.url
 1746 
 1747     @property
 1748     def url(self):
 1749         """The full Google URL you want."""
 1750         return self.full()
 1751 
 1752     @property
 1753     def hostname(self):
 1754         """The hostname."""
 1755         return self.netloc
 1756 
 1757     @hostname.setter
 1758     def hostname(self, hostname):
 1759         self.netloc = hostname
 1760 
 1761     @property
 1762     def keywords(self):
 1763         """The keywords, either a str or a list of strs."""
 1764         return self._keywords
 1765 
 1766     @keywords.setter
 1767     def keywords(self, keywords):
 1768         self._keywords = keywords
 1769 
 1770     @property
 1771     def news(self):
 1772         """Whether the URL is for Google News."""
 1773         return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws'
 1774 
 1775     @property
 1776     def videos(self):
 1777         """Whether the URL is for Google Videos."""
 1778         return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'vid'
 1779 
 1780     def full(self):
 1781         """Return the full URL.
 1782 
 1783         Returns
 1784         -------
 1785         str
 1786 
 1787         """
 1788         url = (self.scheme + ':') if self.scheme else ''
 1789         url += '//' + self.netloc + self.relative()
 1790         return url
 1791 
 1792     def relative(self):
 1793         """Return the relative URL (without scheme and authority).
 1794 
 1795         Authority (see RFC 3986 section 3.2), or netloc in the
 1796         terminology of urllib.parse, basically means the hostname
 1797         here. The relative URL is good for making HTTP(S) requests to a
 1798         known host.
 1799 
 1800         Returns
 1801         -------
 1802         str
 1803 
 1804         """
 1805         rel = self.path
 1806         if self.params:
 1807             rel += ';' + self.params
 1808         if self.query:
 1809             rel += '?' + self.query
 1810         if self.fragment:
 1811             rel += '#' + self.fragment
 1812         return rel
 1813 
 1814     def update(self, opts=None, **kwargs):
 1815         """Update the URL with the given options.
 1816 
 1817         Parameters
 1818         ----------
 1819         opts : dict or argparse.Namespace, optional
 1820             Carries options that affect the Google Search/News URL. The
 1821             list of currently recognized option keys with expected value
 1822             types:
 1823 
 1824                 duration: str (GooglerArgumentParser.is_duration)
 1825                 exact: bool
 1826                 keywords: str or list of strs
 1827                 lang: str
 1828                 news: bool
 1829                 videos: bool
 1830                 num: int
 1831                 site: str
 1832                 start: int
 1833                 tld: str
 1834                 unfilter: bool
 1835 
 1836         Other Parameters
 1837         ----------------
 1838         kwargs
 1839             The `kwargs` dict extends `opts`, that is, options can be
 1840             specified either way, in `opts` or as individual keyword
 1841             arguments.
 1842 
 1843         """
 1844 
 1845         if opts is None:
 1846             opts = {}
 1847         if hasattr(opts, '__dict__'):
 1848             opts = opts.__dict__
 1849         opts.update(kwargs)
 1850 
 1851         qd = self._query_dict
 1852         if opts.get('duration'):
 1853             qd['tbs'] = 'qdr:%s' % opts['duration']
 1854         if 'exact' in opts:
 1855             if opts['exact']:
 1856                 qd['nfpr'] = 1
 1857             else:
 1858                 qd.pop('nfpr', None)
 1859         if opts.get('from') or opts.get('to'):
 1860             cd_min = opts.get('from') or ''
 1861             cd_max = opts.get('to') or ''
 1862             qd['tbs'] = 'cdr:1,cd_min:%s,cd_max:%s' % (cd_min, cd_max)
 1863         if 'keywords' in opts:
 1864             self._keywords = opts['keywords']
 1865         if 'lang' in opts and opts['lang']:
 1866             qd['hl'] = opts['lang']
 1867         if 'geoloc' in opts and opts['geoloc']:
 1868             qd['gl'] = opts['geoloc']
 1869         if 'news' in opts and opts['news']:
 1870             qd['tbm'] = 'nws'
 1871         elif 'videos' in opts and opts['videos']:
 1872             qd['tbm'] = 'vid'
 1873         else:
 1874             qd.pop('tbm', None)
 1875         if 'num' in opts:
 1876             self._num = opts['num']
 1877         if 'sites' in opts:
 1878             self._sites = opts['sites']
 1879         if 'exclude' in opts:
 1880             self._exclude = opts['exclude']
 1881         if 'start' in opts:
 1882             self._start = opts['start']
 1883         if 'tld' in opts:
 1884             self._tld = opts['tld']
 1885         if 'unfilter' in opts and opts['unfilter']:
 1886             qd['filter'] = 0
 1887 
 1888     def set_queries(self, **kwargs):
 1889         """Forcefully set queries outside the normal `update` mechanism.
 1890 
 1891         Other Parameters
 1892         ----------------
 1893         kwargs
 1894             Arbitrary key value pairs to be set in the query string. All
 1895             keys and values should be stringifiable.
 1896 
 1897             Note that certain keys, e.g., ``q``, have their values
 1898             constructed on the fly, so setting those has no actual
 1899             effect.
 1900 
 1901         """
 1902         for k, v in kwargs.items():
 1903             self._query_dict[k] = v
 1904 
 1905     def unset_queries(self, *args):
 1906         """Forcefully unset queries outside the normal `update` mechanism.
 1907 
 1908         Other Parameters
 1909         ----------------
 1910         args
 1911             Arbitrary keys to be unset. No exception is raised if a key
 1912             does not exist in the first place.
 1913 
 1914             Note that certain keys, e.g., ``q``, are always included in
 1915             the resulting URL, so unsetting those has no actual effect.
 1916 
 1917         """
 1918         for k in args:
 1919             self._query_dict.pop(k, None)
 1920 
 1921     def next_page(self):
 1922         """Navigate to the next page."""
 1923         self._start += self._num
 1924 
 1925     def prev_page(self):
 1926         """Navigate to the previous page.
 1927 
 1928         Raises
 1929         ------
 1930         ValueError
 1931             If already at the first page (``start=0`` in the current
 1932             query string).
 1933 
 1934         """
 1935         if self._start == 0:
 1936             raise ValueError('Already at the first page.')
 1937         self._start = (self._start - self._num) if self._start > self._num else 0
 1938 
 1939     def first_page(self):
 1940         """Navigate to the first page.
 1941 
 1942         Raises
 1943         ------
 1944         ValueError
 1945             If already at the first page (``start=0`` in the current
 1946             query string).
 1947 
 1948         """
 1949         if self._start == 0:
 1950             raise ValueError('Already at the first page.')
 1951         self._start = 0
 1952 
 1953     # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains
 1954     # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
 1955     TLD_TO_DOMAIN_MAP = {
 1956         'ac': 'google.ac',      'ad': 'google.ad',      'ae': 'google.ae',
 1957         'af': 'google.com.af',  'ag': 'google.com.ag',  'ai': 'google.com.ai',
 1958         'al': 'google.al',      'am': 'google.am',      'ao': 'google.co.ao',
 1959         'ar': 'google.com.ar',  'as': 'google.as',      'at': 'google.at',
 1960         'au': 'google.com.au',  'az': 'google.az',      'ba': 'google.ba',
 1961         'bd': 'google.com.bd',  'be': 'google.be',      'bf': 'google.bf',
 1962         'bg': 'google.bg',      'bh': 'google.com.bh',  'bi': 'google.bi',
 1963         'bj': 'google.bj',      'bn': 'google.com.bn',  'bo': 'google.com.bo',
 1964         'br': 'google.com.br',  'bs': 'google.bs',      'bt': 'google.bt',
 1965         'bw': 'google.co.bw',   'by': 'google.by',      'bz': 'google.com.bz',
 1966         'ca': 'google.ca',      'cat': 'google.cat',    'cc': 'google.cc',
 1967         'cd': 'google.cd',      'cf': 'google.cf',      'cg': 'google.cg',
 1968         'ch': 'google.ch',      'ci': 'google.ci',      'ck': 'google.co.ck',
 1969         'cl': 'google.cl',      'cm': 'google.cm',      'cn': 'google.cn',
 1970         'co': 'google.com.co',  'cr': 'google.co.cr',   'cu': 'google.com.cu',
 1971         'cv': 'google.cv',      'cy': 'google.com.cy',  'cz': 'google.cz',
 1972         'de': 'google.de',      'dj': 'google.dj',      'dk': 'google.dk',
 1973         'dm': 'google.dm',      'do': 'google.com.do',  'dz': 'google.dz',
 1974         'ec': 'google.com.ec',  'ee': 'google.ee',      'eg': 'google.com.eg',
 1975         'es': 'google.es',      'et': 'google.com.et',  'fi': 'google.fi',
 1976         'fj': 'google.com.fj',  'fm': 'google.fm',      'fr': 'google.fr',
 1977         'ga': 'google.ga',      'ge': 'google.ge',      'gf': 'google.gf',
 1978         'gg': 'google.gg',      'gh': 'google.com.gh',  'gi': 'google.com.gi',
 1979         'gl': 'google.gl',      'gm': 'google.gm',      'gp': 'google.gp',
 1980         'gr': 'google.gr',      'gt': 'google.com.gt',  'gy': 'google.gy',
 1981         'hk': 'google.com.hk',  'hn': 'google.hn',      'hr': 'google.hr',
 1982         'ht': 'google.ht',      'hu': 'google.hu',      'id': 'google.co.id',
 1983         'ie': 'google.ie',      'il': 'google.co.il',   'im': 'google.im',
 1984         'in': 'google.co.in',   'io': 'google.io',      'iq': 'google.iq',
 1985         'is': 'google.is',      'it': 'google.it',      'je': 'google.je',
 1986         'jm': 'google.com.jm',  'jo': 'google.jo',      'jp': 'google.co.jp',
 1987         'ke': 'google.co.ke',   'kg': 'google.kg',      'kh': 'google.com.kh',
 1988         'ki': 'google.ki',      'kr': 'google.co.kr',   'kw': 'google.com.kw',
 1989         'kz': 'google.kz',      'la': 'google.la',      'lb': 'google.com.lb',
 1990         'lc': 'google.com.lc',  'li': 'google.li',      'lk': 'google.lk',
 1991         'ls': 'google.co.ls',   'lt': 'google.lt',      'lu': 'google.lu',
 1992         'lv': 'google.lv',      'ly': 'google.com.ly',  'ma': 'google.co.ma',
 1993         'md': 'google.md',      'me': 'google.me',      'mg': 'google.mg',
 1994         'mk': 'google.mk',      'ml': 'google.ml',      'mm': 'google.com.mm',
 1995         'mn': 'google.mn',      'ms': 'google.ms',      'mt': 'google.com.mt',
 1996         'mu': 'google.mu',      'mv': 'google.mv',      'mw': 'google.mw',
 1997         'mx': 'google.com.mx',  'my': 'google.com.my',  'mz': 'google.co.mz',
 1998         'na': 'google.com.na',  'ne': 'google.ne',      'nf': 'google.com.nf',
 1999         'ng': 'google.com.ng',  'ni': 'google.com.ni',  'nl': 'google.nl',
 2000         'no': 'google.no',      'np': 'google.com.np',  'nr': 'google.nr',
 2001         'nu': 'google.nu',      'nz': 'google.co.nz',   'om': 'google.com.om',
 2002         'pa': 'google.com.pa',  'pe': 'google.com.pe',  'pg': 'google.com.pg',
 2003         'ph': 'google.com.ph',  'pk': 'google.com.pk',  'pl': 'google.pl',
 2004         'pn': 'google.co.pn',   'pr': 'google.com.pr',  'ps': 'google.ps',
 2005         'pt': 'google.pt',      'py': 'google.com.py',  'qa': 'google.com.qa',
 2006         'ro': 'google.ro',      'rs': 'google.rs',      'ru': 'google.ru',
 2007         'rw': 'google.rw',      'sa': 'google.com.sa',  'sb': 'google.com.sb',
 2008         'sc': 'google.sc',      'se': 'google.se',      'sg': 'google.com.sg',
 2009         'sh': 'google.sh',      'si': 'google.si',      'sk': 'google.sk',
 2010         'sl': 'google.com.sl',  'sm': 'google.sm',      'sn': 'google.sn',
 2011         'so': 'google.so',      'sr': 'google.sr',      'st': 'google.st',
 2012         'sv': 'google.com.sv',  'td': 'google.td',      'tg': 'google.tg',
 2013         'th': 'google.co.th',   'tj': 'google.com.tj',  'tk': 'google.tk',
 2014         'tl': 'google.tl',      'tm': 'google.tm',      'tn': 'google.tn',
 2015         'to': 'google.to',      'tr': 'google.com.tr',  'tt': 'google.tt',
 2016         'tw': 'google.com.tw',  'tz': 'google.co.tz',   'ua': 'google.com.ua',
 2017         'ug': 'google.co.ug',   'uk': 'google.co.uk',   'uy': 'google.com.uy',
 2018         'uz': 'google.co.uz',   'vc': 'google.com.vc',  've': 'google.co.ve',
 2019         'vg': 'google.vg',      'vi': 'google.co.vi',   'vn': 'google.com.vn',
 2020         'vu': 'google.vu',      'ws': 'google.ws',      'za': 'google.co.za',
 2021         'zm': 'google.co.zm',   'zw': 'google.co.zw',
 2022     }
 2023 
 2024     @property
 2025     def netloc(self):
 2026         """The hostname."""
 2027         try:
 2028             return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld]
 2029         except KeyError:
 2030             return 'www.google.com'
 2031 
 2032     @property
 2033     def query(self):
 2034         """The query string."""
 2035         qd = {}
 2036         qd.update(self._query_dict)
 2037         if self._num != 10:  # Skip sending the default
 2038             qd['num'] = self._num
 2039         if self._start:  # Skip sending the default
 2040             qd['start'] = self._start
 2041 
 2042         # Construct the q query
 2043         q = ''
 2044         keywords = self._keywords
 2045         sites = self._sites
 2046         exclude = self._exclude
 2047         if keywords:
 2048             if isinstance(keywords, list):
 2049                 q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
 2050             else:
 2051                 q += urllib.parse.quote_plus(keywords)
 2052         if sites:
 2053             q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites)
 2054         if exclude:
 2055             q += ''.join('+-site:' + urllib.parse.quote_plus(e) for e in exclude)
 2056         qd['q'] = q
 2057         return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys()))
 2058 
 2059 
 2060 class GoogleConnectionError(Exception):
 2061     pass
 2062 
 2063 
 2064 class GoogleConnection(object):
 2065     """
 2066     This class facilitates connecting to and fetching from Google.
 2067 
 2068     Parameters
 2069     ----------
 2070     See http.client.HTTPSConnection for documentation of the
 2071     parameters.
 2072 
 2073     Raises
 2074     ------
 2075     GoogleConnectionError
 2076 
 2077     Attributes
 2078     ----------
 2079     host : str
 2080         The currently connected host. Read-only property. Use
 2081         `new_connection` to change host.
 2082 
 2083     Methods
 2084     -------
 2085     new_connection(host=None, port=None, timeout=45)
 2086     renew_connection(timeout=45)
 2087     fetch_page(url)
 2088     close()
 2089 
 2090     """
 2091 
 2092     def __init__(self, host, port=None, address_family=0, timeout=45, proxy=None, notweak=False):
 2093         self._host = None
 2094         self._port = None
 2095         self._address_family = address_family
 2096         self._proxy = proxy
 2097         self._notweak = notweak
 2098         self._conn = None
 2099         self.new_connection(host, port=port, timeout=timeout)
 2100         self.cookie = ''
 2101 
 2102     @property
 2103     def host(self):
 2104         """The host currently connected to."""
 2105         return self._host
 2106 
 2107     @time_it()
 2108     def new_connection(self, host=None, port=None, timeout=45):
 2109         """Close the current connection (if any) and establish a new one.
 2110 
 2111         Parameters
 2112         ----------
 2113         See http.client.HTTPSConnection for documentation of the
 2114         parameters. Renew the connection (i.e., reuse the current host
 2115         and port) if host is None or empty.
 2116 
 2117         Raises
 2118         ------
 2119         GoogleConnectionError
 2120 
 2121         """
 2122         if self._conn:
 2123             self._conn.close()
 2124 
 2125         if not host:
 2126             host = self._host
 2127             port = self._port
 2128         self._host = host
 2129         self._port = port
 2130         host_display = host + (':%d' % port if port else '')
 2131 
 2132         proxy = self._proxy
 2133 
 2134         if proxy:
 2135             proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)
 2136 
 2137             logger.debug('Connecting to proxy server %s', proxy_host_port)
 2138             self._conn = HardenedHTTPSConnection(proxy_host_port,
 2139                                                  address_family=self._address_family, timeout=timeout)
 2140 
 2141             logger.debug('Tunnelling to host %s' % host_display)
 2142             connect_headers = {}
 2143             if proxy_user_passwd:
 2144                 connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode(
 2145                     proxy_user_passwd.encode('utf-8')
 2146                 ).decode('utf-8')
 2147             self._conn.set_tunnel(host, port=port, headers=connect_headers)
 2148 
 2149             try:
 2150                 self._conn.connect(self._notweak)
 2151             except Exception as e:
 2152                 msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
 2153                 raise GoogleConnectionError(msg)
 2154         else:
 2155             logger.debug('Connecting to new host %s', host_display)
 2156             self._conn = HardenedHTTPSConnection(host, port=port,
 2157                                                  address_family=self._address_family, timeout=timeout)
 2158             try:
 2159                 self._conn.connect(self._notweak)
 2160             except Exception as e:
 2161                 msg = 'Failed to connect to %s: %s.' % (host_display, e)
 2162                 raise GoogleConnectionError(msg)
 2163 
 2164     def renew_connection(self, timeout=45):
 2165         """Renew current connection.
 2166 
 2167         Equivalent to ``new_connection(timeout=timeout)``.
 2168 
 2169         """
 2170         self.new_connection(timeout=timeout)
 2171 
 2172     @time_it()
 2173     def fetch_page(self, url):
 2174         """Fetch a URL.
 2175 
 2176         Allows one reconnection and multiple redirections before failing
 2177         and raising GoogleConnectionError.
 2178 
 2179         Parameters
 2180         ----------
 2181         url : str
 2182             The URL to fetch, relative to the host.
 2183 
 2184         Raises
 2185         ------
 2186         GoogleConnectionError
 2187             When not getting HTTP 200 even after the allowed one
 2188             reconnection and/or one redirection, or when Google is
 2189             blocking query due to unusual activity.
 2190 
 2191         Returns
 2192         -------
 2193         str
 2194             Response payload, gunzipped (if applicable) and decoded (in UTF-8).
 2195 
 2196         """
 2197         try:
 2198             self._raw_get(url)
 2199         except (http.client.HTTPException, OSError) as e:
 2200             logger.debug('Got exception: %s.', e)
 2201             logger.debug('Attempting to reconnect...')
 2202             self.renew_connection()
 2203             try:
 2204                 self._raw_get(url)
 2205             except http.client.HTTPException as e:
 2206                 logger.debug('Got exception: %s.', e)
 2207                 raise GoogleConnectionError("Failed to get '%s'." % url)
 2208 
 2209         resp = self._resp
 2210         redirect_counter = 0
 2211         while resp.status != 200 and redirect_counter < 3:
 2212             if resp.status in {301, 302, 303, 307, 308}:
 2213                 redirection_url = resp.getheader('location', '')
 2214                 if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url:
 2215                     msg = "Connection blocked due to unusual activity.\n"
 2216                     if self._conn.address_family == socket.AF_INET6:
 2217                         msg += textwrap.dedent("""\
 2218                         You are connecting over IPv6 which is likely the problem. Try to make
 2219                         sure the machine has a working IPv4 network interface configured.
 2220                         See also the -4, --ipv4 option of googler.\n""")
 2221                     msg += textwrap.dedent("""\
 2222                     THIS IS NOT A BUG, please do NOT report it as a bug unless you have specific
 2223                     information that may lead to the development of a workaround.
 2224                     You IP address is temporarily or permanently blocked by Google and requires
 2225                     reCAPTCHA-solving to use the service, which googler is not capable of.
 2226                     Possible causes include issuing too many queries in a short time frame, or
 2227                     operating from a shared / low reputation IP with a history of abuse.
 2228                     Please do NOT use googler for automated scraping.""")
 2229                     msg = " ".join(msg.splitlines())
 2230                     raise GoogleConnectionError(msg)
 2231                 self._redirect(redirection_url)
 2232                 resp = self._resp
 2233                 redirect_counter += 1
 2234             else:
 2235                 break
 2236 
 2237         if resp.status != 200:
 2238             raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason))
 2239 
 2240         payload = resp.read()
 2241         try:
 2242             return gzip.decompress(payload).decode('utf-8')
 2243         except OSError:
 2244             # Not gzipped
 2245             return payload.decode('utf-8')
 2246 
 2247     def _redirect(self, url):
 2248         """Redirect to and fetch a new URL.
 2249 
 2250         Like `_raw_get`, the response is stored in ``self._resp``. A new
 2251         connection is made if redirecting to a different host.
 2252 
 2253         Parameters
 2254         ----------
 2255         url : str
 2256             If absolute and points to a different host, make a new
 2257             connection.
 2258 
 2259         Raises
 2260         ------
 2261         GoogleConnectionError
 2262 
 2263         """
 2264         logger.debug('Redirecting to URL %s', url)
 2265         segments = urllib.parse.urlparse(url)
 2266 
 2267         host = segments.netloc
 2268         if host != self._host:
 2269             self.new_connection(host)
 2270 
 2271         relurl = urllib.parse.urlunparse(('', '') + segments[2:])
 2272         try:
 2273             self._raw_get(relurl)
 2274         except http.client.HTTPException as e:
 2275             logger.debug('Got exception: %s.', e)
 2276             raise GoogleConnectionError("Failed to get '%s'." % url)
 2277 
 2278     def _raw_get(self, url):
 2279         """Make a raw HTTP GET request.
 2280 
 2281         No status check (which implies no redirection). Response can be
 2282         accessed from ``self._resp``.
 2283 
 2284         Parameters
 2285         ----------
 2286         url : str
 2287             URL relative to the host, used in the GET request.
 2288 
 2289         Raises
 2290         ------
 2291         http.client.HTTPException
 2292 
 2293         """
 2294         logger.debug('Fetching URL %s', url)
 2295         self._conn.request('GET', url, None, {
 2296             'Accept': 'text/html',
 2297             'Accept-Encoding': 'gzip',
 2298             'User-Agent': USER_AGENT,
 2299             'Cookie': self.cookie,
 2300             'Connection': 'keep-alive',
 2301             'DNT': '1',
 2302         })
 2303         self._resp = self._conn.getresponse()
 2304         if self.cookie == '':
 2305             complete_cookie = self._resp.getheader('Set-Cookie')
 2306             # Cookie won't be available if already blocked
 2307             if complete_cookie is not None:
 2308                 self.cookie = complete_cookie[:complete_cookie.find(';')]
 2309                 logger.debug('Cookie: %s' % self.cookie)
 2310 
 2311     def close(self):
 2312         """Close the connection (if one is active)."""
 2313         if self._conn:
 2314             self._conn.close()
 2315 
 2316 
 2317 class GoogleParser(object):
 2318 
 2319     def __init__(self, html, *, news=False, videos=False):
 2320         self.news = news
 2321         self.videos = videos
 2322         self.autocorrected = False
 2323         self.showing_results_for = None
 2324         self.filtered = False
 2325         self.results = []
 2326         self.parse(html)
 2327 
 2328     @time_it()
 2329     def parse(self, html):
 2330         tree = parse_html(html)
 2331 
 2332         if debugger:
 2333             printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m')
 2334             printerr('')
 2335             try:
 2336                 import IPython
 2337                 IPython.embed()
 2338             except ImportError:
 2339                 import pdb
 2340                 pdb.set_trace()
 2341 
 2342         # cw is short for collapse_whitespace.
 2343         cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s
 2344 
 2345         index = 0
 2346         for div_g in tree.select_all('div.g'):
 2347             if div_g.select('.hp-xpdbox'):
 2348                 # Skip smart cards.
 2349                 continue
 2350             try:
 2351                 if div_g.select('.st'):
 2352                     # Old class structure, stopped working some time in
 2353                     # September 2020, but kept just in case.
 2354                     h3 = div_g.select('div.r h3')
 2355                     if h3:
 2356                         title = h3.text
 2357                         a = h3.parent
 2358                     else:
 2359                         h3 = div_g.select('h3.r')
 2360                         a = h3.select('a')
 2361                         title = a.text
 2362                         mime = div_g.select('.mime')
 2363                         if mime:
 2364                             title = mime.text + ' ' + title
 2365                     abstract_node = div_g.select('.st')
 2366                     metadata_node = div_g.select('.f')
 2367                 else:
 2368                     # Current structure as of October 2020.
 2369                     # Note that a filetype tag (e.g. PDF) is now pretty
 2370                     # damn hard to parse with confidence (that it'll
 2371                     # survive the slighest further change), so we don't.
 2372 
 2373                     # As of January 15th 2021, the html class is not rc anymore, it's tF2Cxc.
 2374                     # This approach is not very resilient to changes by Google, but it works for now.
 2375                     # title_node, details_node, *_ = div_g.select_all('div.rc > div')
 2376                     title_node, details_node, *_ = div_g.select_all('div.tF2Cxc > div')
 2377                     if 'yuRUbf' not in title_node.classes:
 2378                         logger.debug('unexpected title node class(es): expected %r, got %r',
 2379                                      'yuRUbf', ' '.join(title_node.classes))
 2380                     if 'IsZvec' not in details_node.classes:
 2381                         logger.debug('unexpected details node class(es): expected %r, got %r',
 2382                                      'IsZvec', ' '.join(details_node.classes))
 2383                     a = title_node.select('a')
 2384                     h3 = a.select('h3')
 2385                     title = h3.text
 2386                     abstract_node = details_node.select('span')
 2387                     metadata_node = details_node.select('.f, span ~ div')
 2388                 url = self.unwrap_link(a.attr('href'))
 2389                 matched_keywords = []
 2390                 abstract = ''
 2391                 # BFS descendant nodes. Necessary to locate matches (b,
 2392                 # em) while skipping metadata (.f).
 2393                 abstract_nodes = collections.deque([abstract_node])
 2394                 while abstract_nodes:
 2395                     node = abstract_nodes.popleft()
 2396                     if 'f' in node.classes:
 2397                         # .f is handled as metadata instead.
 2398                         continue
 2399                     if node.tag in ['b', 'em']:
 2400                         matched_keywords.append({'phrase': node.text, 'offset': len(abstract)})
 2401                         abstract += node.text
 2402                         continue
 2403                     if not node.children:
 2404                         abstract += node.text
 2405                         continue
 2406                     for child in node.children:
 2407                         abstract_nodes.append(child)
 2408                 metadata = None
 2409                 try:
 2410                     # Sometimes there are multiple metadata fields
 2411                     # associated with a single entry, e.g. "Released",
 2412                     # "Producer(s)", "Genre", etc. for a song (sample
 2413                     # query: "never gonna give you up"). These need to
 2414                     # be delimited when displayed.
 2415                     metadata_fields = metadata_node.select_all('div > div.wFMWsc')
 2416                     if metadata_fields:
 2417                         metadata = ' | '.join(field.text for field in metadata_fields)
 2418                     elif not metadata_node.select('a') and not metadata_node.select('g-expandable-container'):
 2419                         metadata = metadata_node.text
 2420                     if metadata:
 2421                         metadata = (
 2422                             metadata
 2423                             .replace('\u200e', '')
 2424                             .replace(' - ', ', ')
 2425                             .replace(' \u2014 ', ', ')
 2426                             .strip().rstrip(',')
 2427                         )
 2428                 except AttributeError:
 2429                     pass
 2430             except (AttributeError, ValueError):
 2431                 continue
 2432             sitelinks = []
 2433             for td in div_g.select_all('td'):
 2434                 try:
 2435                     a = td.select('a')
 2436                     sl_title = a.text
 2437                     sl_url = self.unwrap_link(a.attr('href'))
 2438                     sl_abstract = td.select('div.s.st, div.s .st').text
 2439                     sitelink = Sitelink(cw(sl_title), sl_url, cw(sl_abstract))
 2440                     if sitelink not in sitelinks:
 2441                         sitelinks.append(sitelink)
 2442                 except (AttributeError, ValueError):
 2443                     continue
 2444             # cw cannot be applied to abstract here since it may screw
 2445             # up offsets of matches. Instead, each relevant node's text
 2446             # is whitespace-collapsed before being appended to abstract.
 2447             # We then hope for the best.
 2448             result = Result(index + 1, cw(title), url, abstract,
 2449                             metadata=cw(metadata), sitelinks=sitelinks, matches=matched_keywords)
 2450             if result not in self.results:
 2451                 self.results.append(result)
 2452                 index += 1
 2453 
 2454         if not self.results:
 2455             for card in tree.select_all('g-card'):
 2456                 a = card.select('a[href]')
 2457                 if not a:
 2458                     continue
 2459                 url = self.unwrap_link(a.attr('href'))
 2460                 text_nodes = []
 2461                 for node in a.descendants():
 2462                     if isinstance(node, TextNode) and node.strip():
 2463                         text_nodes.append(node.text)
 2464                 if len(text_nodes) != 4:
 2465                     continue
 2466                 publisher, title, abstract, publishing_time = text_nodes
 2467                 metadata = '%s, %s' % (publisher, publishing_time)
 2468                 index += 1
 2469                 self.results.append(Result(index, cw(title), url, cw(abstract), metadata=cw(metadata)))
 2470 
 2471         # Showing results for ...
 2472         # Search instead for ...
 2473         spell_orig = tree.select("span.spell_orig")
 2474         if spell_orig:
 2475             showing_results_for_link = next(
 2476                 filter(lambda el: el.tag == "a", spell_orig.previous_siblings()), None
 2477             )
 2478             if showing_results_for_link:
 2479                 self.autocorrected = True
 2480                 self.showing_results_for = showing_results_for_link.text
 2481 
 2482         # No results found for ...
 2483         # Results for ...:
 2484         alt_query_infobox = tree.select('#topstuff')
 2485         if alt_query_infobox:
 2486             bolds = alt_query_infobox.select_all('div b')
 2487             if len(bolds) == 2:
 2488                 self.showing_results_for = bolds[1].text
 2489 
 2490         # In order to show you the most relevant results, we have
 2491         # omitted some entries very similar to the N already displayed.
 2492         # ...
 2493         self.filtered = tree.select('p#ofr') is not None
 2494 
 2495     # Unwraps /url?q=http://...&sa=...
 2496     # TODO: don't unwrap if URL isn't in this form.
 2497     @staticmethod
 2498     def unwrap_link(link):
 2499         qs = urllib.parse.urlparse(link).query
 2500         try:
 2501             url = urllib.parse.parse_qs(qs)['q'][0]
 2502         except KeyError:
 2503             return link
 2504         else:
 2505             if "://" in url:
 2506                 return url
 2507             else:
 2508                 # Google's internal services link, e.g.,
 2509                 # /search?q=google&..., which cannot be unwrapped into
 2510                 # an actual URL.
 2511                 raise ValueError(link)
 2512 
 2513 
 2514 class Sitelink(object):
 2515     """Container for a sitelink."""
 2516 
 2517     def __init__(self, title, url, abstract):
 2518         self.title = title
 2519         self.url = url
 2520         self.abstract = abstract
 2521         self.index = ''
 2522 
 2523     def __eq__(self, other):
 2524         return (
 2525             self.title == other.title and
 2526             self.url == other.url and
 2527             self.abstract == other.abstract
 2528         )
 2529 
 2530     def __hash__(self):
 2531         return hash((self.title, self.url, self.abstract))
 2532 
 2533 
# Color scheme: one string per output element (index, title, url, metadata,
# abstract, prompt) that is printed in front of that element, plus ``reset``,
# which is printed after it. Presumably these are terminal escape sequences.
Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset')
 2535 
 2536 
 2537 class Result(object):
 2538     """
 2539     Container for one search result, with output helpers.
 2540 
 2541     Parameters
 2542     ----------
 2543     index : int or str
 2544     title : str
 2545     url : str
 2546     abstract : str
 2547     metadata : str, optional
 2548         Only applicable to Google News results, with publisher name and
 2549         publishing time.
 2550     sitelinks : list, optional
 2551         List of ``SiteLink`` objects.
 2552 
 2553     Attributes
 2554     ----------
 2555     index : str
 2556     title : str
 2557     url : str
 2558     abstract : str
 2559     metadata : str or None
 2560     sitelinks : list
 2561     matches : list
 2562 
 2563     Class Variables
 2564     ---------------
 2565     colors : str
 2566 
 2567     Methods
 2568     -------
 2569     print()
 2570     jsonizable_object()
 2571     urltable()
 2572 
 2573     """
 2574 
 2575     # Class variables
 2576     colors = None
 2577     urlexpand = True
 2578 
 2579     def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None):
 2580         index = str(index)
 2581         self.index = index
 2582         self.title = title
 2583         self.url = url
 2584         self.abstract = abstract
 2585         self.metadata = metadata
 2586         self.sitelinks = [] if sitelinks is None else sitelinks
 2587         self.matches = [] if matches is None else matches
 2588 
 2589         self._urltable = {index: url}
 2590         subindex = 'a'
 2591         for sitelink in self.sitelinks:
 2592             fullindex = index + subindex
 2593             sitelink.index = fullindex
 2594             self._urltable[fullindex] = sitelink.url
 2595             subindex = chr(ord(subindex) + 1)
 2596 
 2597     def __eq__(self, other):
 2598         return (
 2599             self.title == other.title and
 2600             self.url == other.url and
 2601             self.abstract == other.abstract and
 2602             self.metadata == other.metadata and
 2603             self.sitelinks == other.sitelinks and
 2604             self.matches == other.matches
 2605         )
 2606 
 2607     def __hash__(self):
 2608         sitelinks_hashable = tuple(self.sitelinks) if self.sitelinks is not None else None
 2609         matches_hashable = tuple(self.matches) if self.matches is not None else None
 2610         return hash(self.title, self.url, self.abstract, self.metadata, sitelinks_hashable, matches_hashable)
 2611 
 2612     def _print_title_and_url(self, index, title, url, indent=0):
 2613         colors = self.colors
 2614 
 2615         if not self.urlexpand:
 2616             url = '[' + urllib.parse.urlparse(url).netloc + ']'
 2617 
 2618         if colors:
 2619             # Adjust index to print result index clearly
 2620             print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
 2621             if not self.urlexpand:
 2622                 print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
 2623             else:
 2624                 print(' ' + colors.title + title + colors.reset)
 2625                 print(' ' * (indent + 5) + colors.url + url + colors.reset)
 2626         else:
 2627             if self.urlexpand:
 2628                 print(' %s%-3s %s' % (' ' * indent, index + '.', title))
 2629                 print(' %s%s' % (' ' * (indent + 4), url))
 2630             else:
 2631                 print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))
 2632 
 2633     def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
 2634         colors = self.colors
 2635         try:
 2636             columns, _ = os.get_terminal_size()
 2637         except OSError:
 2638             columns = 0
 2639 
 2640         if metadata:
 2641             if colors:
 2642                 print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
 2643             else:
 2644                 print(' ' * (indent + 5) + metadata)
 2645 
 2646         if abstract:
 2647             fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
 2648             wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
 2649             if colors:
 2650                 # Highlight matches.
 2651                 for match in matches or []:
 2652                     offset = match['offset']
 2653                     span = len(match['phrase'])
 2654                     wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
 2655                     wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)
 2656 
 2657             if colors:
 2658                 print(colors.abstract, end='')
 2659             for line in wrapped_abstract.lines:
 2660                 print('%s%s' % (' ' * (indent + 5), line))
 2661             if colors:
 2662                 print(colors.reset, end='')
 2663 
 2664         print('')
 2665 
 2666     def print(self):
 2667         """Print the result entry."""
 2668         self._print_title_and_url(self.index, self.title, self.url)
 2669         self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
 2670 
 2671         for sitelink in self.sitelinks:
 2672             self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
 2673             self._print_metadata_and_abstract(sitelink.abstract, indent=4)
 2674 
 2675     def jsonizable_object(self):
 2676         """Return a JSON-serializable dict representing the result entry."""
 2677         obj = {
 2678             'title': self.title,
 2679             'url': self.url,
 2680             'abstract': self.abstract
 2681         }
 2682         if self.metadata:
 2683             obj['metadata'] = self.metadata
 2684         if self.sitelinks:
 2685             obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
 2686         if self.matches:
 2687             obj['matches'] = self.matches
 2688         return obj
 2689 
 2690     def urltable(self):
 2691         """Return a index-to-URL table for the current result.
 2692 
 2693         Normally, the table contains only a single entry, but when the result
 2694         contains sitelinks, all sitelinks are included in this table.
 2695 
 2696         Returns
 2697         -------
 2698         dict
 2699             A dict mapping indices (strs) to URLs (also strs). Indices of
 2700             sitelinks are the original index appended by lowercase letters a,
 2701             b, c, etc.
 2702 
 2703         """
 2704         return self._urltable
 2705 
 2706     @staticmethod
 2707     def collapse_whitespace(s):
 2708         return re.sub(r'[ \t\n\r]+', ' ', s)
 2709 
 2710 
 2711 class GooglerCmdException(Exception):
 2712     pass
 2713 
 2714 
 2715 class NoKeywordsException(GooglerCmdException):
 2716     pass
 2717 
 2718 
 2719 def require_keywords(method):
 2720     # Require keywords to be set before we run a GooglerCmd method. If
 2721     # no keywords have been set, raise a NoKeywordsException.
 2722     @functools.wraps(method)
 2723     def enforced_method(self, *args, **kwargs):
 2724         if not self.keywords:
 2725             raise NoKeywordsException('No keywords.')
 2726         method(self, *args, **kwargs)
 2727 
 2728     return enforced_method
 2729 
 2730 
 2731 def no_argument(method):
 2732     # Normalize a do_* method of GooglerCmd that takes no argument to
 2733     # one that takes an arg, but issue a warning when an nonempty
 2734     # argument is given.
 2735     @functools.wraps(method)
 2736     def enforced_method(self, arg):
 2737         if arg:
 2738             method_name = arg.__name__
 2739             command_name = method_name[3:] if method_name.startswith('do_') else method_name
 2740             logger.warning("Argument to the '%s' command ignored.", command_name)
 2741         method(self)
 2742 
 2743     return enforced_method
 2744 
 2745 
 2746 class GooglerCmd(object):
 2747     """
 2748     Command line interpreter and executor class for googler.
 2749 
 2750     Inspired by PSL cmd.Cmd.
 2751 
 2752     Parameters
 2753     ----------
 2754     opts : argparse.Namespace
 2755         Options and/or arguments.
 2756 
 2757     Attributes
 2758     ----------
 2759     options : argparse.Namespace
 2760         Options that are currently in effect. Read-only attribute.
    keywords : str or list of strs
 2762         Current keywords. Read-only attribute
 2763 
 2764     Methods
 2765     -------
 2766     fetch()
 2767     display_results(prelude='\n', json_output=False)
 2768     fetch_and_display(prelude='\n', json_output=False, interactive=True)
 2769     read_next_command()
 2770     help()
 2771     cmdloop()
 2772     """
 2773 
    # Class variables
    # Color scheme for the messages printed by this class; a falsy value
    # selects the plain-text variants.
    colors = None
    # Matches one or more digits, optionally followed by the literal text
    # "a-z". NOTE(review): "(a-z)" is a group, not a character class;
    # "[a-z]" (a sitelink letter) was presumably intended. Confirm against
    # the users of this pattern before changing it.
    re_url_index = re.compile(r"\d+(a-z)?")
 2777 
 2778     def __init__(self, opts):
 2779         super().__init__()
 2780 
 2781         self._opts = opts
 2782 
 2783         self._google_url = GoogleUrl(opts)
 2784 
 2785         if opts.html_file:
 2786             # Preloaded HTML parsing mode, do not initialize connection.
 2787             self._preload_from_file = opts.html_file
 2788             self._conn = None
 2789         else:
 2790             self._preload_from_file = None
 2791             proxy = opts.proxy if hasattr(opts, 'proxy') else None
 2792             self._conn = GoogleConnection(self._google_url.hostname,
 2793                                         address_family=opts.address_family,
 2794                                         proxy=proxy,
 2795                                         notweak=opts.notweak)
 2796             atexit.register(self._conn.close)
 2797 
 2798         self.results = []
 2799         self._autocorrected = None
 2800         self._showing_results_for = None
 2801         self._results_filtered = False
 2802         self._urltable = {}
 2803 
 2804         self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False
 2805 
 2806         self.no_results_instructions_shown = False
 2807 
 2808     @property
 2809     def options(self):
 2810         """Current options."""
 2811         return self._opts
 2812 
 2813     @property
 2814     def keywords(self):
 2815         """Current keywords."""
 2816         return self._google_url.keywords
 2817 
 2818     @require_keywords
 2819     def fetch(self):
 2820         """Fetch a page and parse for results.
 2821 
 2822         Results are stored in ``self.results``.
 2823 
 2824         Raises
 2825         ------
 2826         GoogleConnectionError
 2827 
 2828         See Also
 2829         --------
 2830         fetch_and_display
 2831 
 2832         """
 2833         # This method also sets self._results_filtered and
 2834         # self._urltable.
 2835         if self._preload_from_file:
 2836             with open(self._preload_from_file, encoding='utf-8') as fp:
 2837                 page = fp.read()
 2838         else:
 2839             page = self._conn.fetch_page(self._google_url.relative())
 2840             if logger.isEnabledFor(logging.DEBUG):
 2841                 import tempfile
 2842                 fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
 2843                 os.close(fd)
 2844                 with open(tmpfile, 'w', encoding='utf-8') as fp:
 2845                     fp.write(page)
 2846                 logger.debug("Response body written to '%s'.", tmpfile)
 2847 
 2848         parser = GoogleParser(page, news=self._google_url.news, videos=self._google_url.videos)
 2849 
 2850         self.results = parser.results
 2851         self._autocorrected = parser.autocorrected
 2852         self._showing_results_for = parser.showing_results_for
 2853         self._results_filtered = parser.filtered
 2854         self._urltable = {}
 2855         for r in self.results:
 2856             self._urltable.update(r.urltable())
 2857 
 2858     def warn_no_results(self):
 2859         printerr('No results.')
 2860         if self.no_results_instructions_shown:
 2861             return
 2862 
 2863         try:
 2864             import json
 2865             import urllib.error
 2866             import urllib.request
 2867             info_json_url = '%s/master/info.json' % RAW_DOWNLOAD_REPO_BASE
 2868             logger.debug('Fetching %s for project status...', info_json_url)
 2869             try:
 2870                 with urllib.request.urlopen(info_json_url, timeout=5) as response:
 2871                     try:
 2872                         info = json.load(response)
 2873                     except Exception:
 2874                         logger.error('Failed to decode project status from %s', info_json_url)
 2875                         raise RuntimeError
 2876             except urllib.error.HTTPError as e:
 2877                 logger.error('Failed to fetch project status from %s: HTTP %d', info_json_url, e.code)
 2878                 raise RuntimeError
 2879             epoch = info.get('epoch')
 2880             if epoch > _EPOCH_:
 2881                 printerr('Your version of googler is broken due to Google-side changes.')
 2882                 tracking_issue = info.get('tracking_issue')
 2883                 fixed_on_master = info.get('fixed_on_master')
 2884                 fixed_in_release = info.get('fixed_in_release')
 2885                 if fixed_in_release:
 2886                     printerr('A new version, %s, has been released to address the changes.' % fixed_in_release)
 2887                     printerr('Please upgrade to the latest version.')
 2888                 elif fixed_on_master:
 2889                     printerr('The fix has been pushed to master, pending a release.')
 2890                     printerr('Please download the master version https://git.io/googler or wait for a release.')
 2891                 else:
 2892                     printerr('The issue is tracked at https://github.com/jarun/googler/issues/%s.' % tracking_issue)
 2893                 return
 2894         except RuntimeError:
 2895             pass
 2896 
 2897         printerr('If you believe this is a bug, please review '
 2898                  'https://git.io/googler-no-results before submitting a bug report.')
 2899         self.no_results_instructions_shown = True
 2900 
 2901     @require_keywords
 2902     def display_results(self, prelude='\n', json_output=False):
 2903         """Display results stored in ``self.results``.
 2904 
 2905         Parameters
 2906         ----------
 2907         See `fetch_and_display`.
 2908 
 2909         """
 2910         if json_output:
 2911             # JSON output
 2912             import json
 2913             results_object = [r.jsonizable_object() for r in self.results]
 2914             print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))
 2915         else:
 2916             # Regular output
 2917             if not self.results:
 2918                 self.warn_no_results()
 2919             else:
 2920                 sys.stderr.write(prelude)
 2921                 for r in self.results:
 2922                     r.print()
 2923 
 2924     @require_keywords
 2925     def showing_results_for_alert(self, interactive=True):
 2926         colors = self.colors
 2927         if self._showing_results_for:
 2928             if colors:
 2929                 # Underline the query
 2930                 actual_query = '\x1b[4m' + self._showing_results_for + '\x1b[24m'
 2931             else:
 2932                 actual_query = self._showing_results_for
 2933             if self._autocorrected:
 2934                 if interactive:
 2935                     info = 'Showing results for %s; enter "x" for an exact search.' % actual_query
 2936                 else:
 2937                     info = 'Showing results for %s; use -x, --exact for an exact search.' % actual_query
 2938             else:
 2939                 info = 'No results found; showing results for %s.' % actual_query
 2940             if interactive:
 2941                 printerr('')
 2942             if colors:
 2943                 printerr(colors.prompt + info + colors.reset)
 2944             else:
 2945                 printerr('** ' + info)
 2946 
 2947     @require_keywords
 2948     def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
 2949         """Fetch a page and display results.
 2950 
 2951         Results are stored in ``self.results``.
 2952 
 2953         Parameters
 2954         ----------
 2955         prelude : str, optional
 2956             A string that is written to stderr before showing actual results,
 2957             usually serving as a separator. Default is an empty line.
 2958         json_output : bool, optional
 2959             Whether to dump results in JSON format. Default is False.
 2960         interactive : bool, optional
 2961             Whether to show contextual instructions, when e.g. Google
 2962             has filtered the results. Default is True.
 2963 
 2964         Raises
 2965         ------
 2966         GoogleConnectionError
 2967 
 2968         See Also
 2969         --------
 2970         fetch
 2971         display_results
 2972 
 2973         """
 2974         self.fetch()
 2975         self.showing_results_for_alert()
 2976         self.display_results(prelude=prelude, json_output=json_output)
 2977         if self._results_filtered:
 2978             colors = self.colors
 2979             info = 'Enter "unfilter" to show similar results Google omitted.'
 2980             if colors:
 2981                 printerr(colors.prompt + info + colors.reset)
 2982             else:
 2983                 printerr('** ' + info)
 2984             printerr('')
 2985 
 2986     def read_next_command(self):
 2987         """Show omniprompt and read user command line.
 2988 
 2989         Command line is always stripped, and each consecutive group of
 2990         whitespace is replaced with a single space character. If the
 2991         command line is empty after stripping, when ignore it and keep
 2992         reading. Exit with status 0 if we get EOF or an empty line
 2993         (pre-strip, that is, a raw <enter>) twice in a row.
 2994 
 2995         The new command line (non-empty) is stored in ``self.cmd``.
 2996 
 2997         """
 2998         colors = self.colors
 2999         message = 'googler (? for help)'
 3000         prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ')
 3001         enter_count = 0
 3002         while True:
 3003             try:
 3004                 cmd = input(prompt)
 3005             except EOFError:
 3006                 sys.exit(0)
 3007 
 3008             if not cmd:
 3009                 enter_count += 1
 3010                 if enter_count == 2:
 3011                     # Double <enter>
 3012                     sys.exit(0)
 3013             else:
 3014                 enter_count = 0
 3015 
 3016             cmd = ' '.join(cmd.split())
 3017             if cmd:
 3018                 self.cmd = cmd
 3019                 break
 3020 
 3021     @staticmethod
 3022     def help():
 3023         GooglerArgumentParser.print_omniprompt_help(sys.stderr)
 3024         printerr('')
 3025 
 3026     @require_keywords
 3027     @no_argument
 3028     def do_first(self):
 3029         try:
 3030             self._google_url.first_page()
 3031         except ValueError as e:
 3032             print(e, file=sys.stderr)
 3033             return
 3034 
 3035         self.fetch_and_display()
 3036 
 3037     def do_google(self, arg):
 3038         # Update keywords and reconstruct URL
 3039         self._opts.keywords = arg
 3040         self._google_url = GoogleUrl(self._opts)
 3041         self.fetch_and_display()
 3042 
 3043     @require_keywords
 3044     @no_argument
 3045     def do_next(self):
 3046         # If > 5 results are being fetched each time,
 3047         # block next when no parsed results in current fetch
 3048         if not self.results and self._google_url._num > 5:
 3049             printerr('No results.')
 3050         else:
 3051             self._google_url.next_page()
 3052             self.fetch_and_display()
 3053 
 3054     @require_keywords
 3055     def do_open(self, *args):
 3056         if not args:
 3057             open_url(self._google_url.full())
 3058             return
 3059 
 3060         for nav in args:
 3061             if nav == 'a':
 3062                 for key, value in sorted(self._urltable.items()):
 3063                     open_url(self._urltable[key])
 3064             elif nav in self._urltable:
 3065                 open_url(self._urltable[nav])
 3066             elif '-' in nav:
 3067                 try:
 3068                     vals = [int(x) for x in nav.split('-')]
 3069                     if (len(vals) != 2):
 3070                         printerr('Invalid range %s.' % nav)
 3071                         continue
 3072 
 3073                     if vals[0] > vals[1]:
 3074                         vals[0], vals[1] = vals[1], vals[0]
 3075 
 3076                     for _id in range(vals[0], vals[1] + 1):
 3077                         if str(_id) in self._urltable:
 3078                             open_url(self._urltable[str(_id)])
 3079                         else:
 3080                             printerr('Invalid index %s.' % _id)
 3081                 except ValueError:
 3082                     printerr('Invalid range %s.' % nav)
 3083             else:
 3084                 printerr('Invalid index %s.' % nav)
 3085 
 3086     @require_keywords
 3087     @no_argument
 3088     def do_previous(self):
 3089         try:
 3090             self._google_url.prev_page()
 3091         except ValueError as e:
 3092             print(e, file=sys.stderr)
 3093             return
 3094 
 3095         self.fetch_and_display()
 3096 
 3097     @require_keywords
 3098     @no_argument
 3099     def do_exact(self):
 3100         # Reset start to 0 when exact is applied.
 3101         self._google_url.update(start=0, exact=True)
 3102         self.fetch_and_display()
 3103 
 3104     @require_keywords
 3105     @no_argument
 3106     def do_unfilter(self):
 3107         # Reset start to 0 when unfilter is applied.
 3108         self._google_url.update(start=0)
 3109         self._google_url.set_queries(filter=0)
 3110         self.fetch_and_display()
 3111 
 3112     def copy_url(self, idx):
 3113         try:
 3114             try:
 3115                 content = self._urltable[idx].encode('utf-8')
 3116             except KeyError:
 3117                 printerr('Invalid index.')
 3118                 return
 3119 
 3120             # try copying the url to clipboard using native utilities
 3121             copier_params = []
 3122             if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
 3123                 if shutil.which('xsel') is not None:
 3124                     copier_params = ['xsel', '-b', '-i']
 3125                 elif shutil.which('xclip') is not None:
 3126                     copier_params = ['xclip', '-selection', 'clipboard']
 3127                 elif shutil.which('wl-copy') is not None:
 3128                     copier_params = ['wl-copy']
 3129                 elif shutil.which('termux-clipboard-set') is not None:
 3130                     copier_params = ['termux-clipboard-set']
 3131             elif sys.platform == 'darwin':
 3132                 copier_params = ['pbcopy']
 3133             elif sys.platform == 'win32':
 3134                 copier_params = ['clip']
 3135 
 3136             if copier_params:
 3137                 Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL).communicate(content)
 3138                 return
 3139 
 3140             # If native clipboard utilities are absent, try to use terminal multiplexers
 3141             # tmux
 3142             if os.getenv('TMUX_PANE'):
 3143                 copier_params = ['tmux', 'set-buffer']
 3144                 Popen(copier_params + [content], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
 3145                 return
 3146 
 3147             # GNU Screen paste buffer
 3148             if os.getenv('STY'):
 3149                 import tempfile
 3150                 copier_params = ['screen', '-X', 'readbuf', '-e', 'utf8']
 3151                 tmpfd, tmppath = tempfile.mkstemp()
 3152                 try:
 3153                     with os.fdopen(tmpfd, 'wb') as fp:
 3154                         fp.write(content)
 3155                     copier_params.append(tmppath)
 3156                     Popen(copier_params, stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
 3157                 finally:
 3158                     os.unlink(tmppath)
 3159                 return
 3160 
 3161             printerr('failed to locate suitable clipboard utility')
 3162         except Exception:
 3163             raise NoKeywordsException
 3164 
 3165     def cmdloop(self):
 3166         """Run REPL."""
 3167         if self.keywords:
 3168             self.fetch_and_display()
 3169         else:
 3170             printerr('Please initiate a query.')
 3171 
 3172         while True:
 3173             self.read_next_command()
 3174             # TODO: Automatic dispatcher
 3175             #
 3176             # We can't write a dispatcher for now because that could
 3177             # change behaviour of the prompt. However, we have already
 3178             # laid a lot of ground work for the dispatcher, e.g., the
 3179             # `no_argument' decorator.
 3180             try:
 3181                 cmd = self.cmd
 3182                 if cmd == 'f':
 3183                     self.do_first('')
 3184                 elif cmd.startswith('g '):
 3185                     self.do_google(cmd[2:])
 3186                 elif cmd == 'n':
 3187                     self.do_next('')
 3188                 elif cmd == 'o':
 3189                     self.do_open()
 3190                 elif cmd.startswith('o '):
 3191                     self.do_open(*cmd[2:].split())
 3192                 elif cmd.startswith('O '):
 3193                     open_url.override_text_browser = True
 3194                     self.do_open(*cmd[2:].split())
 3195                     open_url.override_text_browser = False
 3196                 elif cmd == 'p':
 3197                     self.do_previous('')
 3198                 elif cmd == 'q':
 3199                     break
 3200                 elif cmd == 'x':
 3201                     self.do_exact('')
 3202                 elif cmd == 'unfilter':
 3203                     self.do_unfilter('')
 3204                 elif cmd == '?':
 3205                     self.help()
 3206                 elif cmd in self._urltable:
 3207                     open_url(self._urltable[cmd])
 3208                 elif self.keywords and cmd.isdigit() and int(cmd) < 100:
 3209                     printerr('Index out of bound. To search for the number, use g.')
 3210                 elif cmd == 'u':
 3211                     Result.urlexpand = not Result.urlexpand
 3212                     self.display_results()
 3213                 elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]):
 3214                     self.copy_url(cmd[2:])
 3215                 else:
 3216                     self.do_google(cmd)
 3217             except NoKeywordsException:
 3218                 printerr('Initiate a query first.')
 3219 
 3220 
 3221 class GooglerArgumentParser(argparse.ArgumentParser):
 3222     """Custom argument parser for googler."""
 3223 
 3224     # Print omniprompt help
 3225     @staticmethod
 3226     def print_omniprompt_help(file=None):
 3227         file = sys.stderr if file is None else file
 3228         file.write(textwrap.dedent("""
 3229         omniprompt keys:
 3230           n, p                  fetch the next or previous set of search results
 3231           index                 open the result corresponding to index in browser
 3232           f                     jump to the first page
 3233           o [index|range|a ...] open space-separated result indices, numeric ranges
 3234                                 (sitelinks unsupported in ranges), or all, in browser
 3235                                 open the current search in browser, if no arguments
 3236           O [index|range|a ...] like key 'o', but try to open in a GUI browser
 3237           g keywords            new Google search for 'keywords' with original options
 3238                                 should be used to search omniprompt keys and indices
 3239           c index               copy url to clipboard
 3240           u                     toggle url expansion
 3241           q, ^D, double Enter   exit googler
 3242           ?                     show omniprompt help
 3243           *                     other inputs issue a new search with original options
 3244         """))
 3245 
 3246     # Print information on googler
 3247     @staticmethod
 3248     def print_general_info(file=None):
 3249         file = sys.stderr if file is None else file
 3250         file.write(textwrap.dedent("""
 3251         Version %s
 3252         Copyright © 2008 Henri Hakkinen
 3253         Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
 3254         Zhiming Wang <zmwangx@gmail.com>
 3255         License: GPLv3
 3256         Webpage: https://github.com/jarun/googler
 3257         """ % _VERSION_))
 3258 
 3259     # Augment print_help to print more than synopsis and options
 3260     def print_help(self, file=None):
 3261         super().print_help(file)
 3262         self.print_omniprompt_help(file)
 3263         self.print_general_info(file)
 3264 
 3265     # Automatically print full help text on error
 3266     def error(self, message):
 3267         sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
 3268         self.print_help(sys.stderr)
 3269         self.exit(2)
 3270 
 3271     # Type guards
 3272     @staticmethod
 3273     def positive_int(arg):
 3274         """Try to convert a string into a positive integer."""
 3275         try:
 3276             n = int(arg)
 3277             assert n > 0
 3278             return n
 3279         except (ValueError, AssertionError):
 3280             raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
 3281 
 3282     @staticmethod
 3283     def nonnegative_int(arg):
 3284         """Try to convert a string into a nonnegative integer."""
 3285         try:
 3286             n = int(arg)
 3287             assert n >= 0
 3288             return n
 3289         except (ValueError, AssertionError):
 3290             raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
 3291 
 3292     @staticmethod
 3293     def is_duration(arg):
 3294         """Check if a string is a valid duration accepted by Google.
 3295 
 3296         A valid duration is of the form dNUM, where d is a single letter h
 3297         (hour), d (day), w (week), m (month), or y (year), and NUM is a
 3298         non-negative integer.
 3299         """
 3300         try:
 3301             if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0:
 3302                 raise ValueError
 3303         except (TypeError, IndexError, ValueError):
 3304             raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
 3305         return arg
 3306 
 3307     @staticmethod
 3308     def is_date(arg):
 3309         """Check if a string is a valid date/month/year accepted by Google."""
 3310         if re.match(r'^(\d+/){0,2}\d+$', arg):
 3311             return arg
 3312         else:
 3313             raise argparse.ArgumentTypeError('%s is not a valid date/month/year; '
 3314                                              'use the American date format with slashes')
 3315 
 3316     @staticmethod
 3317     def is_colorstr(arg):
 3318         """Check if a string is a valid color string."""
 3319         try:
 3320             assert len(arg) == 6
 3321             for c in arg:
 3322                 assert c in COLORMAP
 3323         except AssertionError:
 3324             raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
 3325         return arg
 3326 
 3327 
 3328 # Self-upgrade mechanism
 3329 
 3330 def system_is_windows():
 3331     """Checks if the underlying system is Windows (Cygwin included)."""
 3332     return sys.platform in {'win32', 'cygwin'}
 3333 
 3334 
 3335 def get_latest_ref(include_git=False):
 3336     """Helper for download_latest_googler."""
 3337     import urllib.request
 3338 
 3339     if include_git:
 3340         # Get SHA of latest commit on master
 3341         request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
 3342                                          headers={'Accept': 'application/vnd.github.v3.sha'})
 3343         response = urllib.request.urlopen(request)
 3344         if response.status != 200:
 3345             raise http.client.HTTPException(response.reason)
 3346         return response.read().decode('utf-8')
 3347     else:
 3348         # Get name of latest tag
 3349         request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
 3350                                          headers={'Accept': 'application/vnd.github.v3+json'})
 3351         response = urllib.request.urlopen(request)
 3352         if response.status != 200:
 3353             raise http.client.HTTPException(response.reason)
 3354         import json
 3355         return json.loads(response.read().decode('utf-8'))[0]['tag_name']
 3356 
 3357 
 3358 def download_latest_googler(include_git=False):
 3359     """Download latest googler to a temp file.
 3360 
 3361     By default, the latest released version is downloaded, but if
 3362     `include_git` is specified, then the latest git master is downloaded
 3363     instead.
 3364 
 3365     Parameters
 3366     ----------
 3367     include_git : bool, optional
 3368         Download from git master. Default is False.
 3369 
 3370     Returns
 3371     -------
 3372     (git_ref, path): tuple
 3373          A tuple containing the git reference (either name of the latest
 3374          tag or SHA of the latest commit) and path to the downloaded
 3375          file.
 3376 
 3377     """
 3378     # Download googler to a tempfile
 3379     git_ref = get_latest_ref(include_git=include_git)
 3380     googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
 3381     printerr('Downloading %s' % googler_download_url)
 3382     request = urllib.request.Request(googler_download_url,
 3383                                      headers={'Accept-Encoding': 'gzip'})
 3384     import tempfile
 3385     fd, path = tempfile.mkstemp()
 3386     atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
 3387     os.close(fd)
 3388     with open(path, 'wb') as fp:
 3389         with urllib.request.urlopen(request) as response:
 3390             if response.status != 200:
 3391                 raise http.client.HTTPException(response.reason)
 3392             payload = response.read()
 3393             try:
 3394                 fp.write(gzip.decompress(payload))
 3395             except OSError:
 3396                 fp.write(payload)
 3397     return git_ref, path
 3398 
 3399 
 3400 def self_replace(path):
 3401     """Replace the current script with a specified file.
 3402 
 3403     Both paths (the specified path and path to the current script) are
 3404     resolved to absolute, symlink-free paths. Upon replacement, the
 3405     owner and mode signatures of the current script are preserved. The
 3406     caller needs to have the necessary permissions.
 3407 
 3408     Replacement won't happen if the specified file is the same
 3409     (content-wise) as the current script.
 3410 
 3411     Parameters
 3412     ----------
 3413     path : str
 3414         Path to the replacement file.
 3415 
 3416     Returns
 3417     -------
 3418     bool
 3419         True if replaced, False if skipped (specified file is the same
 3420         as the current script).
 3421 
 3422     """
 3423     if system_is_windows():
 3424         raise NotImplementedError('Self upgrade not supported on Windows.')
 3425 
 3426     import filecmp
 3427     import shutil
 3428 
 3429     path = os.path.realpath(path)
 3430     self_path = os.path.realpath(__file__)
 3431 
 3432     if filecmp.cmp(path, self_path):
 3433         return False
 3434 
 3435     self_stat = os.stat(self_path)
 3436     os.chown(path, self_stat.st_uid, self_stat.st_gid)
 3437     os.chmod(path, self_stat.st_mode)
 3438 
 3439     shutil.move(path, self_path)
 3440     return True
 3441 
 3442 
 3443 def self_upgrade(include_git=False):
 3444     """Perform in-place self-upgrade.
 3445 
 3446     Parameters
 3447     ----------
 3448     include_git : bool, optional
 3449         See `download_latest_googler`. Default is False.
 3450 
 3451     """
 3452     git_ref, path = download_latest_googler(include_git=include_git)
 3453     if self_replace(path):
 3454         printerr('Upgraded to %s.' % git_ref)
 3455     else:
 3456         printerr('Already up to date.')
 3457 
 3458 
 3459 def check_new_version():
 3460     try:
 3461         from distutils.version import StrictVersion as Version
 3462     except ImportError:
 3463         # distutils not available (thanks distros), use a concise poor
 3464         # man's version parser.
 3465         class Version(tuple):
 3466             def __new__(cls, version_str):
 3467                 def parseint(s):
 3468                     try:
 3469                         return int(s)
 3470                     except ValueError:
 3471                         return 0
 3472                 return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')])
 3473 
 3474     import pathlib
 3475     import tempfile
 3476     import time
 3477     cache = pathlib.Path(tempfile.gettempdir()) / 'googler-latest-version'
 3478     latest_version_str = None
 3479     # Try to load latest version string from cached location, if it
 3480     # exists and is fresh enough.
 3481     try:
 3482         if cache.is_file() and time.time() - cache.stat().st_mtime < 86400:
 3483             latest_version_str = cache.read_text().strip()
 3484     except OSError:
 3485         pass
 3486     if not latest_version_str:
 3487         try:
 3488             latest_version_str = get_latest_ref().lstrip('v')
 3489             cache.write_text(latest_version_str)
 3490         except Exception:
 3491             pass
 3492     if not latest_version_str:
 3493         return
 3494     # Try to fetch latest version string from GitHub.
 3495     try:
 3496         current_version = Version(_VERSION_)
 3497         latest_version = Version(latest_version_str)
 3498     except ValueError:
 3499         return
 3500     if latest_version > current_version:
 3501         print('\x1b[33;1mThe latest release of googler is v%s, please upgrade.\x1b[0m'
 3502               % latest_version_str,
 3503               file=sys.stderr)
 3504 
 3505 
 3506 # Miscellaneous functions
 3507 
 3508 def python_version():
 3509     return '%d.%d.%d' % sys.version_info[:3]
 3510 
 3511 
 3512 def https_proxy_from_environment():
 3513     return os.getenv('https_proxy')
 3514 
 3515 
 3516 def parse_proxy_spec(proxyspec):
 3517     if '://' in proxyspec:
 3518         pos = proxyspec.find('://')
 3519         scheme = proxyspec[:pos]
 3520         proxyspec = proxyspec[pos+3:]
 3521         if scheme.lower() != 'http':
 3522             # Only support HTTP proxies.
 3523             #
 3524             # In particular, we don't support HTTPS proxies since we
 3525             # only speak plain HTTP to the proxy server, so don't give
 3526             # users a false sense of security.
 3527             raise NotImplementedError('Unsupported proxy scheme %s.' % scheme)
 3528 
 3529     if '@' in proxyspec:
 3530         pos = proxyspec.find('@')
 3531         user_passwd = urllib.parse.unquote(proxyspec[:pos])
 3532         # Remove trailing '/' if any
 3533         host_port = proxyspec[pos+1:].rstrip('/')
 3534     else:
 3535         user_passwd = None
 3536         host_port = proxyspec.rstrip('/')
 3537 
 3538     if ':' not in host_port:
 3539         # Use port 1080 as default, following curl.
 3540         host_port += ':1080'
 3541 
 3542     return user_passwd, host_port
 3543 
 3544 
 3545 def set_win_console_mode():
 3546     # VT100 control sequences are supported on Windows 10 Anniversary Update and later.
 3547     # https://docs.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences
 3548     # https://docs.microsoft.com/en-us/windows/console/setconsolemode
 3549     if platform.release() == '10':
 3550         STD_OUTPUT_HANDLE = -11
 3551         STD_ERROR_HANDLE = -12
 3552         ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
 3553         try:
 3554             from ctypes import windll, wintypes, byref
 3555             kernel32 = windll.kernel32
 3556             for nhandle in (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE):
 3557                 handle = kernel32.GetStdHandle(nhandle)
 3558                 old_mode = wintypes.DWORD()
 3559                 if not kernel32.GetConsoleMode(handle, byref(old_mode)):
 3560                     raise RuntimeError('GetConsoleMode failed')
 3561                 new_mode = old_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING
 3562                 if not kernel32.SetConsoleMode(handle, new_mode):
 3563                     raise RuntimeError('SetConsoleMode failed')
 3564             # Note: No need to restore at exit. SetConsoleMode seems to
 3565             # be limited to the calling process.
 3566         except Exception:
 3567             pass
 3568 
 3569 
 3570 # Query autocompleter
 3571 
 3572 # This function is largely experimental and could raise any exception;
 3573 # you should be prepared to catch anything. When it works though, it
 3574 # returns a list of strings the prefix could autocomplete to (however,
 3575 # it is not guaranteed that they start with the specified prefix; for
 3576 # instance, they won't if the specified prefix ends in a punctuation
 3577 # mark.)
 3578 def completer_fetch_completions(prefix):
 3579     import html
 3580     import json
 3581     import re
 3582     import urllib.request
 3583 
 3584     # One can pass the 'hl' query param to specify the language. We
 3585     # ignore that for now.
 3586     api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
 3587                urllib.parse.quote(prefix, safe=''))
 3588     # A timeout of 3 seconds seems to be overly generous already.
 3589     resp = urllib.request.urlopen(api_url, timeout=3)
 3590     charset = resp.headers.get_content_charset()
 3591     logger.debug('Completions charset: %s', charset)
 3592     respobj = json.loads(resp.read().decode(charset))
 3593 
 3594     # The response object, once parsed as JSON, should look like
 3595     #
 3596     # ['git',
 3597     #  [['git<b>hub</b>', 0],
 3598     #   ['git', 0],
 3599     #   ['git<b>lab</b>', 0],
 3600     #   ['git<b> stash</b>', 0]],
 3601     #  {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
 3602     #
 3603     # Note the each result entry need not have two members; e.g., for
 3604     # 'gi', there is an entry ['gi<b>f</b>', 0, [131]].
 3605     HTML_TAG = re.compile(r'<[^>]+>')
 3606     return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
 3607 
 3608 
 3609 def completer_run(prefix):
 3610     if prefix:
 3611         completions = completer_fetch_completions(prefix)
 3612         if completions:
 3613             print('\n'.join(completions))
 3614     sys.exit(0)
 3615 
 3616 
 3617 def parse_args(args=None, namespace=None):
 3618     """Parse googler arguments/options.
 3619 
 3620     Parameters
 3621     ----------
 3622     args : list, optional
 3623         Arguments to parse. Default is ``sys.argv``.
 3624     namespace : argparse.Namespace
 3625         Namespace to write to. Default is a new namespace.
 3626 
 3627     Returns
 3628     -------
 3629     argparse.Namespace
 3630         Namespace with parsed arguments / options.
 3631 
 3632     """
 3633 
 3634     colorstr_env = os.getenv('GOOGLER_COLORS')
 3635 
 3636     argparser = GooglerArgumentParser(description='Google from the command-line.')
 3637     addarg = argparser.add_argument
 3638     addarg('-s', '--start', type=argparser.nonnegative_int, default=0,
 3639            metavar='N', help='start at the Nth result')
 3640     addarg('-n', '--count', dest='num', type=argparser.positive_int,
 3641            default=10, metavar='N', help='show N results (default 10)')
 3642     addarg('-N', '--news', action='store_true',
 3643            help='show results from news section')
 3644     addarg('-V', '--videos', action='store_true',
 3645            help='show results from videos section')
 3646     addarg('-c', '--tld', metavar='TLD',
 3647            help="""country-specific search with top-level domain .TLD, e.g., 'in'
 3648            for India""")
 3649     addarg('-l', '--lang', metavar='LANG', help='display in language LANG')
 3650     addarg('-g', '--geoloc', metavar='CC',
 3651            help="""country-specific geolocation search with country code CC, e.g.
 3652            'in' for India. Country codes are the same as top-level domains""")
 3653     addarg('-x', '--exact', action='store_true',
 3654            help='disable automatic spelling correction')
 3655     addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'],
 3656            const='always', default='auto',
 3657            help="""whether to colorize output; defaults to 'auto', which enables
 3658            color when stdout is a tty device; using --colorize without an argument
 3659            is equivalent to --colorize=always""")
 3660     addarg('-C', '--nocolor', action='store_true',
 3661            help='equivalent to --colorize=never')
 3662     addarg('--colors', dest='colorstr', type=argparser.is_colorstr,
 3663            default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
 3664            help='set output colors (see man page for details)')
 3665     addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
 3666            help='open the first result in web browser and exit')
 3667     addarg('-t', '--time', dest='duration', type=argparser.is_duration,
 3668            metavar='dN', help='time limit search '
 3669            '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]')
 3670     addarg('--from', type=argparser.is_date,
 3671            help="""starting date/month/year of date range; must use American date
 3672            format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in
 3673            conjunction with --to, and overrides -t, --time""")
 3674     addarg('--to', type=argparser.is_date,
 3675            help='ending date/month/year of date range; see --from')
 3676     addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
 3677            help='search a site using Google')
 3678     addarg('-e', '--exclude', dest='exclude', action='append', metavar='SITE',
 3679            help='exclude site from results')
 3680     addarg('--unfilter', action='store_true', help='do not omit similar results')
 3681     addarg('-p', '--proxy', default=https_proxy_from_environment(),
 3682            help="""tunnel traffic through an HTTP proxy;
 3683            PROXY is of the form [http://][user:password@]proxyhost[:port]""")
 3684     addarg('--noua', action='store_true', help=argparse.SUPPRESS)
 3685     addarg('--notweak', action='store_true',
 3686            help='disable TCP optimizations and forced TLS 1.2')
 3687     addarg('--json', action='store_true',
 3688            help='output in JSON format; implies --noprompt')
 3689     addarg('--url-handler', metavar='UTIL',
 3690            help='custom script or cli utility to open results')
 3691     addarg('--show-browser-logs', action='store_true',
 3692            help='do not suppress browser output (stdout and stderr)')
 3693     addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
 3694            help='search and exit, do not prompt')
 3695     addarg('-4', '--ipv4', action='store_const', dest='address_family',
 3696            const=socket.AF_INET, default=0,
 3697            help="""only connect over IPv4
 3698            (by default, IPv4 is preferred but IPv6 is used as a fallback)""")
 3699     addarg('-6', '--ipv6', action='store_const', dest='address_family',
 3700            const=socket.AF_INET6, default=0,
 3701            help='only connect over IPv6')
 3702     addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
 3703     if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
 3704         addarg('-u', '--upgrade', action='store_true',
 3705                help='perform in-place self-upgrade')
 3706         addarg('--include-git', action='store_true',
 3707                help='when used with --upgrade, get latest git master')
 3708     addarg('-v', '--version', action='version', version=_VERSION_)
 3709     addarg('-d', '--debug', action='store_true', help='enable debugging')
 3710     # Hidden option for interacting with DOM in an IPython/pdb shell
 3711     addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS)
 3712     # Hidden option for parsing dumped HTML
 3713     addarg('--parse', dest='html_file', help=argparse.SUPPRESS)
 3714     addarg('--complete', help=argparse.SUPPRESS)
 3715 
 3716     parsed = argparser.parse_args(args, namespace)
 3717     if parsed.nocolor:
 3718         parsed.colorize = 'never'
 3719 
 3720     return parsed
 3721 
 3722 
 3723 def main():
 3724     try:
 3725         opts = parse_args()
 3726 
 3727         # Set logging level
 3728         if opts.debug:
 3729             logger.setLevel(logging.DEBUG)
 3730             logger.debug('googler version %s', _VERSION_)
 3731             logger.debug('Python version %s', python_version())
 3732             logger.debug('Platform: %s', platform.platform())
 3733             check_new_version()
 3734 
 3735         if opts.debugger:
 3736             global debugger
 3737             debugger = True
 3738 
 3739         # Handle query completer
 3740         if opts.complete is not None:
 3741             completer_run(opts.complete)
 3742 
 3743         # Handle self-upgrade
 3744         if hasattr(opts, 'upgrade') and opts.upgrade:
 3745             self_upgrade(include_git=opts.include_git)
 3746             sys.exit(0)
 3747 
 3748         check_stdout_encoding()
 3749 
 3750         if opts.keywords:
 3751             try:
 3752                 # Add cmdline args to readline history
 3753                 readline.add_history(' '.join(opts.keywords))
 3754             except Exception:
 3755                 pass
 3756 
 3757         # Set colors
 3758         if opts.colorize == 'always':
 3759             colorize = True
 3760         elif opts.colorize == 'auto':
 3761             colorize = sys.stdout.isatty()
 3762         else:  # opts.colorize == 'never'
 3763             colorize = False
 3764 
 3765         if colorize:
 3766             colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x'])
 3767         else:
 3768             colors = None
 3769         Result.colors = colors
 3770         Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False
 3771         GooglerCmd.colors = colors
 3772 
 3773         # Try to enable ANSI color support in cmd or PowerShell on Windows 10
 3774         if sys.platform == 'win32' and sys.stdout.isatty() and colorize:
 3775             set_win_console_mode()
 3776 
 3777         if opts.url_handler is not None:
 3778             open_url.url_handler = opts.url_handler
 3779         else:
 3780             # Set text browser override to False
 3781             open_url.override_text_browser = False
 3782 
 3783             # Handle browser output suppression
 3784             if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers):
 3785                 open_url.suppress_browser_output = False
 3786             else:
 3787                 open_url.suppress_browser_output = True
 3788 
 3789         if opts.noua:
 3790             logger.warning('--noua option has been deprecated and has no effect (see #284)')
 3791 
 3792         repl = GooglerCmd(opts)
 3793 
 3794         # Non-interactive mode
 3795         if opts.json or opts.lucky or opts.noninteractive or opts.html_file:
 3796             repl.fetch()
 3797             if opts.lucky:
 3798                 if repl.results:
 3799                     open_url(repl.results[0].url)
 3800                 else:
 3801                     print('No results.', file=sys.stderr)
 3802             else:
 3803                 repl.showing_results_for_alert(interactive=False)
 3804                 repl.display_results(json_output=opts.json)
 3805             sys.exit(0)
 3806 
 3807         # Interactive mode
 3808         repl.cmdloop()
 3809     except Exception as e:
 3810         # With debugging on, let the exception through for a traceback;
 3811         # otherwise, only print the exception error message.
 3812         if logger.isEnabledFor(logging.DEBUG):
 3813             raise
 3814         else:
 3815             logger.error(e)
 3816             sys.exit(1)
 3817 
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()