"Fossies" - the Fresh Open Source Software Archive

Member "googler-4.1/googler" (30 Apr 2020, 119404 Bytes) of package /linux/misc/googler-4.1.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "googler": 4.0_vs_4.1.

    1 #!/usr/bin/env python3
    2 #
    3 # Copyright © 2008 Henri Hakkinen
    4 # Copyright © 2015-2020 Arun Prakash Jana <engineerarun@gmail.com>
    5 #
    6 # This program is free software: you can redistribute it and/or modify
    7 # it under the terms of the GNU General Public License as published by
    8 # the Free Software Foundation, either version 3 of the License, or
    9 # (at your option) any later version.
   10 #
   11 # This program is distributed in the hope that it will be useful,
   12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   14 # GNU General Public License for more details.
   15 #
   16 # You should have received a copy of the GNU General Public License
   17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
   18 
   19 import argparse
   20 import atexit
   21 import base64
   22 import collections
   23 import codecs
   24 import functools
   25 import gzip
   26 import html.entities
   27 import html.parser
   28 import http.client
   29 from http.client import HTTPSConnection
   30 import locale
   31 import logging
   32 import os
   33 import platform
   34 import shutil
   35 import signal
   36 import socket
   37 import ssl
   38 from subprocess import Popen, PIPE, DEVNULL
   39 import sys
   40 import textwrap
   41 import unicodedata
   42 import urllib.parse
   43 import uuid
   44 import webbrowser
   45 
   46 # Python optional dependency compatibility layer
   47 try:
   48     import readline
   49 except ImportError:
   50     pass
   51 
   52 try:
   53     import setproctitle
   54     setproctitle.setproctitle('googler')
   55 except (ImportError, Exception):
   56     pass
   57 
   58 from typing import (
   59     Any,
   60     Dict,
   61     Generator,
   62     Iterable,
   63     Iterator,
   64     List,
   65     Match,
   66     Optional,
   67     Sequence,
   68     Tuple,
   69     Union,
   70     cast,
   71 )
   72 
   73 # Basic setup
   74 
# Configure the root logger to emit "[LEVEL] message" lines (to stderr by
# default); module code below logs through this shared `logger`.
logging.basicConfig(format='[%(levelname)s] %(message)s')
logger = logging.getLogger()
   77 
   78 
   79 def sigint_handler(signum, frame):
   80     print('\nInterrupted.', file=sys.stderr)
   81     sys.exit(1)
   82 
# Install the SIGINT handler so Ctrl-C aborts with a short message instead
# of a KeyboardInterrupt traceback.
try:
    signal.signal(signal.SIGINT, sigint_handler)
except ValueError:
    # signal only works in main thread
    pass
   88 
   89 
# Constants

_VERSION_ = '4.1'

# Single-letter keys mapped to ANSI SGR escape sequences: lowercase a-p are
# plain foreground colors (SGR 30-37, 90-97), uppercase A-P the same colors
# with bold (";1"), x/X reset/bold, y/Y reverse video (SGR 7).
# NOTE(review): presumably consumed by a color-scheme CLI option elsewhere
# in the file — usage not visible in this chunk.
COLORMAP = {k: '\x1b[%sm' % v for k, v in {
    'a': '30', 'b': '31', 'c': '32', 'd': '33',
    'e': '34', 'f': '35', 'g': '36', 'h': '37',
    'i': '90', 'j': '91', 'k': '92', 'l': '93',
    'm': '94', 'n': '95', 'o': '96', 'p': '97',
    'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
    'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
    'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
    'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
    'x': '0', 'X': '1', 'y': '7', 'Y': '7;1',
}.items()}

# Desktop Chrome user-agent string sent with HTTP requests.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

# Names of text-mode browsers; presumably used to adjust how a browser is
# spawned — confirm against the browser-launch code elsewhere in the file.
text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser']

# Self-upgrade parameters
#
# Downstream packagers are recommended to turn off the entire self-upgrade
# mechanism through
#
#     make disable-self-upgrade
#
# before running `make install'.

ENABLE_SELF_UPGRADE_MECHANISM = True
API_REPO_BASE = 'https://api.github.com/repos/jarun/googler'
RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler'

# Global debug flag; presumably toggled by a CLI option elsewhere — confirm.
debugger = False
  124 
  125 
  126 # Monkeypatch textwrap for CJK wide characters.
  127 
  128 def monkeypatch_textwrap_for_cjk():
  129     try:
  130         if textwrap.wrap.patched:
  131             return
  132     except AttributeError:
  133         pass
  134     psl_textwrap_wrap = textwrap.wrap
  135 
  136     def textwrap_wrap(text, width=70, **kwargs):
  137         if width <= 2:
  138             width = 2
  139         # We first add a U+0000 after each East Asian Fullwidth or East
  140         # Asian Wide character, then fill to width - 1 (so that if a NUL
  141         # character ends up on a new line, we still have one last column
  142         # to spare for the preceding wide character). Finally we strip
  143         # all the NUL characters.
  144         #
  145         # East Asian Width: https://www.unicode.org/reports/tr11/
  146         return [
  147             line.replace('\0', '')
  148             for line in psl_textwrap_wrap(
  149                 ''.join(
  150                     ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch
  151                     for ch in unicodedata.normalize('NFC', text)
  152                 ),
  153                 width=width - 1,
  154                 **kwargs
  155             )
  156         ]
  157 
  158     def textwrap_fill(text, width=70, **kwargs):
  159         return '\n'.join(textwrap_wrap(text, width=width, **kwargs))
  160 
  161     textwrap.wrap = textwrap_wrap
  162     textwrap.fill = textwrap_fill
  163     textwrap.wrap.patched = True
  164     textwrap.fill.patched = True
  165 
  166 
  167 monkeypatch_textwrap_for_cjk()
  168 
  169 
# (row, column) position within wrapped output text; used by TrackedTextwrap.
CoordinateType = Tuple[int, int]
  171 
  172 
class TrackedTextwrap:
    """
    Implements a text wrapper that tracks the position of each source
    character, and can correctly insert zero-width sequences at given
    offsets of the source text.

    Wrapping result should be the same as that from PSL textwrap.wrap
    with default settings except expand_tabs=False.
    """

    def __init__(self, text: str, width: int):
        """
        Args:
            text: source text to wrap.
            width: maximum line width (same semantics as textwrap.wrap).
        """
        self._original = text

        # Do the job of replace_whitespace first so that we can easily
        # match text to wrapped lines later. Note that this operation
        # does not change text length or offsets.
        whitespace = "\t\n\v\f\r "
        whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
        text = text.translate(whitespace_trans)

        self._lines = textwrap.wrap(
            text, width, expand_tabs=False, replace_whitespace=False
        )

        # self._coords track the (row, column) coordinate of each source
        # character in the result text. It is indexed by offset in
        # source text.
        self._coords = []  # type: List[CoordinateType]
        offset = 0
        try:
            if not self._lines:
                # Source text only has whitespaces. We add an empty line
                # in order to produce meaningful coordinates.
                self._lines = [""]
            for row, line in enumerate(self._lines):
                # Each wrapped line must reappear verbatim at the current
                # offset of the whitespace-normalized source text.
                assert text[offset : offset + len(line)] == line
                col = 0
                for _ in line:
                    self._coords.append((row, col))
                    offset += 1
                    col += 1
                # All subsequent dropped whitespaces map to the last, imaginary column
                # (the EOL character if you wish) of the current line.
                while offset < len(text) and text[offset] == " ":
                    self._coords.append((row, col))
                    offset += 1
            # One past the final character (think of it as EOF) should
            # be treated as a valid offset.
            # NOTE: row and col deliberately leak out of the loop above;
            # the loop runs at least once because self._lines is non-empty.
            self._coords.append((row, col))
        except AssertionError:
            raise RuntimeError(
                "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
                    offset, self._original
                )
            )

    # seq should be a zero-width sequence, e.g., an ANSI escape sequence.
    # May raise IndexError if offset is out of bounds.
    def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
        row, col = self._coords[offset]
        line = self._lines[row]
        self._lines[row] = line[:col] + seq + line[col:]

        # Shift coordinates of all characters after the given character
        # on the same line.
        shift = len(seq)
        offset += 1
        while offset < len(self._coords) and self._coords[offset][0] == row:
            _, col = self._coords[offset]
            self._coords[offset] = (row, col + shift)
            offset += 1

    @property
    def original(self) -> str:
        """The source text as passed in (before whitespace normalization)."""
        return self._original

    @property
    def lines(self) -> List[str]:
        """Wrapped lines; mutated in place by insert_zero_width_sequence()."""
        return self._lines

    @property
    def wrapped(self) -> str:
        """The wrapped text as a single newline-joined string."""
        return "\n".join(self._lines)

    # May raise IndexError if offset is out of bounds.
    def get_coordinate(self, offset: int) -> CoordinateType:
        return self._coords[offset]
  260 
  261 
  262 ### begin dim (DOM implementation with CSS support) ###
  263 ### https://github.com/zmwangx/dim/blob/master/dim.py ###
  264 
  265 import html
  266 import re
  267 import textwrap
  268 from collections import OrderedDict
  269 from enum import Enum
  270 from html.parser import HTMLParser
  271 
  272 
# Anything accepted where a selector is expected: a CSS selector string,
# a parsed SelectorGroup, or a single Selector.
SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]
  274 
  275 
  276 class Node(object):
  277     """
  278     Represents a DOM node.
  279 
  280     Parts of JavaScript's DOM ``Node`` API and ``Element`` API are
  281     mirrored here, with extensions. In particular, ``querySelector`` and
  282     ``querySelectorAll`` are mirrored.
  283 
  284     Notable properties and methods: :meth:`attr()`, :attr:`classes`,
  285     :attr:`html`, :attr:`text`, :meth:`ancestors()`,
  286     :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`,
  287     :meth:`matched_by()`,
  288 
  289     Attributes:
  290         tag      (:class:`Optional`\\[:class:`str`])
  291         attrs    (:class:`Dict`\\[:class:`str`, :class:`str`])
  292         parent   (:class:`Optional`\\[:class:`Node`])
  293         children (:class:`List`\\[:class:`Node`])
  294     """
  295 
  296     # Meant to be reimplemented by subclasses.
  297     def __init__(self) -> None:
  298         self.tag = None  # type: Optional[str]
  299         self.attrs = {}  # type: Dict[str, str]
  300         self.parent = None  # type: Optional[Node]
  301         self.children = []  # type: List[Node]
  302 
  303         # Used in DOMBuilder.
  304         self._partial = False
  305 
  306     # HTML representation of the node. Meant to be implemented by
  307     # subclasses.
  308     def __str__(self) -> str:  # pragma: no cover
  309         raise NotImplementedError
  310 
  311     def select(self, selector: SelectorGroupLike) -> Optional["Node"]:
  312         """DOM ``querySelector`` clone. Returns one match (if any)."""
  313         selector = self._normalize_selector(selector)
  314         for node in self._select_all(selector):
  315             return node
  316         return None
  317 
  318     def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]:
  319         """Alias of :meth:`select`."""
  320         return self.select(selector)
  321 
  322     def select_all(self, selector: SelectorGroupLike) -> List["Node"]:
  323         """DOM ``querySelectorAll`` clone. Returns all matches in a list."""
  324         selector = self._normalize_selector(selector)
  325         return list(self._select_all(selector))
  326 
  327     def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]:
  328         """Alias of :meth:`select_all`."""
  329         return self.select_all(selector)
  330 
  331     def matched_by(
  332         self, selector: SelectorGroupLike, root: Optional["Node"] = None
  333     ) -> bool:
  334         """
  335         Checks whether this node is matched by `selector`.
  336 
  337         See :meth:`SelectorGroup.matches()`.
  338         """
  339         selector = self._normalize_selector(selector)
  340         return selector.matches(self, root=root)
  341 
  342     @staticmethod
  343     def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup":
  344         if isinstance(selector, str):
  345             return SelectorGroup.from_str(selector)
  346         if isinstance(selector, SelectorGroup):
  347             return selector
  348         if isinstance(selector, Selector):
  349             return SelectorGroup([selector])
  350         raise ValueError("not a selector or group of selectors: %s" % repr(selector))
  351 
  352     def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]:
  353         for descendant in self.descendants():
  354             if selector.matches(descendant, root=self):
  355                 yield descendant
  356 
  357     def child_nodes(self) -> List["Node"]:
  358         return self.children
  359 
  360     def first_child(self) -> Optional["Node"]:
  361         if self.children:
  362             return self.children[0]
  363         else:
  364             return None
  365 
  366     def first_element_child(self) -> Optional["Node"]:
  367         for child in self.children:
  368             if isinstance(child, ElementNode):
  369                 return child
  370         return None
  371 
  372     def last_child(self) -> Optional["Node"]:
  373         if self.children:
  374             return self.children[-1]
  375         else:
  376             return None
  377 
  378     def last_element_child(self) -> Optional["Node"]:
  379         for child in reversed(self.children):
  380             if isinstance(child, ElementNode):
  381                 return child
  382         return None
  383 
  384     def next_sibling(self) -> Optional["Node"]:
  385         """.. note:: Not O(1), use with caution."""
  386         next_siblings = self.next_siblings()
  387         if next_siblings:
  388             return next_siblings[0]
  389         else:
  390             return None
  391 
  392     def next_siblings(self) -> List["Node"]:
  393         parent = self.parent
  394         if not parent:
  395             return []
  396         try:
  397             index = parent.children.index(self)
  398             return parent.children[index + 1 :]
  399         except ValueError:  # pragma: no cover
  400             raise ValueError("node is not found in children of its parent")
  401 
  402     def next_element_sibling(self) -> Optional["ElementNode"]:
  403         """.. note:: Not O(1), use with caution."""
  404         for sibling in self.next_siblings():
  405             if isinstance(sibling, ElementNode):
  406                 return sibling
  407         return None
  408 
  409     def previous_sibling(self) -> Optional["Node"]:
  410         """.. note:: Not O(1), use with caution."""
  411         previous_siblings = self.previous_siblings()
  412         if previous_siblings:
  413             return previous_siblings[0]
  414         else:
  415             return None
  416 
  417     def previous_siblings(self) -> List["Node"]:
  418         """
  419         Compared to the natural DOM order, the order of returned nodes
  420         are reversed. That is, the adjacent sibling (if any) is the
  421         first in the returned list.
  422         """
  423         parent = self.parent
  424         if not parent:
  425             return []
  426         try:
  427             index = parent.children.index(self)
  428             if index > 0:
  429                 return parent.children[index - 1 :: -1]
  430             else:
  431                 return []
  432         except ValueError:  # pragma: no cover
  433             raise ValueError("node is not found in children of its parent")
  434 
  435     def previous_element_sibling(self) -> Optional["ElementNode"]:
  436         """.. note:: Not O(1), use with caution."""
  437         for sibling in self.previous_siblings():
  438             if isinstance(sibling, ElementNode):
  439                 return sibling
  440         return None
  441 
  442     def ancestors(
  443         self, *, root: Optional["Node"] = None
  444     ) -> Generator["Node", None, None]:
  445         """
  446         Ancestors are generated in reverse order of depth, stopping at
  447         `root`.
  448 
  449         A :class:`RuntimeException` is raised if `root` is not in the
  450         ancestral chain.
  451         """
  452         if self is root:
  453             return
  454         ancestor = self.parent
  455         while ancestor is not root:
  456             if ancestor is None:
  457                 raise RuntimeError("provided root node not found in ancestral chain")
  458             yield ancestor
  459             ancestor = ancestor.parent
  460         if root:
  461             yield root
  462 
  463     def descendants(self) -> Generator["Node", None, None]:
  464         """Descendants are generated in depth-first order."""
  465         for child in self.children:
  466             yield child
  467             yield from child.descendants()
  468 
  469     def attr(self, attr: str) -> Optional[str]:
  470         """Returns the attribute if it exists on the node, otherwise ``None``."""
  471         return self.attrs.get(attr)
  472 
  473     @property
  474     def html(self) -> str:
  475         """
  476         HTML representation of the node.
  477 
  478         (For a :class:`TextNode`, :meth:`html` returns the escaped version of the
  479         text.
  480         """
  481         return str(self)
  482 
  483     def outer_html(self) -> str:
  484         """Alias of :attr:`html`."""
  485         return self.html
  486 
  487     def inner_html(self) -> str:
  488         """HTML representation of the node's children."""
  489         return "".join(child.html for child in self.children)
  490 
  491     @property
  492     def text(self) -> str:  # pragma: no cover
  493         """This property is expected to be implemented by subclasses."""
  494         raise NotImplementedError
  495 
  496     def text_content(self) -> str:
  497         """Alias of :attr:`text`."""
  498         return self.text
  499 
  500     @property
  501     def classes(self) -> List[str]:
  502         return self.attrs.get("class", "").split()
  503 
  504     def class_list(self) -> List[str]:
  505         return self.classes
  506 
  507 
  508 class ElementNode(Node):
  509     """
  510     Represents an element node.
  511 
  512     Note that tag and attribute names are case-insensitive; attribute
  513     values are case-sensitive.
  514     """
  515 
  516     def __init__(
  517         self,
  518         tag: str,
  519         attrs: Iterable[Tuple[str, Optional[str]]],
  520         *,
  521         parent: Optional["Node"] = None,
  522         children: Optional[Sequence["Node"]] = None
  523     ) -> None:
  524         Node.__init__(self)
  525         self.tag = tag.lower()  # type: str
  526         self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
  527         self.parent = parent
  528         self.children = list(children or [])
  529 
  530     def __repr__(self) -> str:
  531         s = "<" + self.tag
  532         if self.attrs:
  533             s += " attrs=%s" % repr(list(self.attrs.items()))
  534         if self.children:
  535             s += " children=%s" % repr(self.children)
  536         s += ">"
  537         return s
  538 
  539     # https://ipython.org/ipython-doc/3/api/generated/IPython.lib.pretty.html
  540     def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
  541         if cycle:
  542             raise RuntimeError("cycle detected in DOM tree")
  543         p.text("<\x1b[1m%s\x1b[0m" % self.tag)
  544         if self.attrs:
  545             p.text(" attrs=%s" % repr(list(self.attrs.items())))
  546         if self.children:
  547             p.text(" children=[")
  548             if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
  549                 p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
  550             else:
  551                 with p.indent(2):
  552                     for child in self.children:
  553                         p.break_()
  554                         if hasattr(child, "_repr_pretty_"):
  555                             child._repr_pretty_(p, False)  # type: ignore
  556                         else:
  557                             p.text("\x1b[4m%s\x1b[0m" % repr(child))
  558                         p.text(",")
  559                 p.break_()
  560             p.text("]")
  561         p.text(">")
  562 
  563     def __str__(self) -> str:
  564         """HTML representation of the node."""
  565         s = "<" + self.tag
  566         for attr, val in self.attrs.items():
  567             s += ' %s="%s"' % (attr, html.escape(val))
  568         if self.children:
  569             s += ">"
  570             s += "".join(str(child) for child in self.children)
  571             s += "</%s>" % self.tag
  572         else:
  573             if _tag_is_void(self.tag):
  574                 s += "/>"
  575             else:
  576                 s += "></%s>" % self.tag
  577         return s
  578 
  579     @property
  580     def text(self) -> str:
  581         """The concatenation of all descendant text nodes."""
  582         return "".join(child.text for child in self.children)
  583 
  584 
  585 class TextNode(str, Node):
  586     """
  587     Represents a text node.
  588 
  589     Subclasses :class:`Node` and :class:`str`.
  590     """
  591 
  592     def __new__(cls, text: str) -> "TextNode":
  593         s = str.__new__(cls, text)  # type: ignore
  594         s.parent = None
  595         return s  # type: ignore
  596 
  597     def __init__(self, text: str) -> None:
  598         Node.__init__(self)
  599 
  600     def __repr__(self) -> str:
  601         return "<%s>" % str.__repr__(self)
  602 
  603     # HTML-escaped form of the text node. use text() for unescaped
  604     # version.
  605     def __str__(self) -> str:
  606         return html.escape(self)
  607 
  608     def __eq__(self, other: object) -> bool:
  609         """
  610         Two text nodes are equal if and only if they are the same node.
  611 
  612         For string comparision, use :attr:`text`.
  613         """
  614         return self is other
  615 
  616     def __ne__(self, other: object) -> bool:
  617         """
  618         Two text nodes are non-equal if they are not the same node.
  619 
  620         For string comparision, use :attr:`text`.
  621         """
  622         return self is not other
  623 
  624     @property
  625     def text(self) -> str:
  626         return str.__str__(self)
  627 
  628 
  629 class DOMBuilderException(Exception):
  630     """
  631     Exception raised when :class:`DOMBuilder` detects a bad state.
  632 
  633     Attributes:
  634         pos (:class:`Tuple`\\[:class:`int`, :class:`int`]):
  635             Line number and offset in HTML input.
  636         why (:class:`str`):
  637             Reason of the exception.
  638     """
  639 
  640     def __init__(self, pos: Tuple[int, int], why: str) -> None:
  641         self.pos = pos
  642         self.why = why
  643 
  644     def __str__(self) -> str:  # pragma: no cover
  645         return "DOM builder aborted at %d:%d: %s" % (self.pos[0], self.pos[1], self.why)
  646 
  647 
  648 class DOMBuilder(HTMLParser):
  649     """
  650     HTML parser / DOM builder.
  651 
  652     Subclasses :class:`html.parser.HTMLParser`.
  653 
  654     Consume HTML and builds a :class:`Node` tree. Once finished, use
  655     :attr:`root` to access the root of the tree.
  656 
  657     This parser cannot parse malformed HTML with tag mismatch.
  658     """
  659 
  660     def __init__(self) -> None:
  661         super().__init__(convert_charrefs=True)
  662         self._stack = []  # type: List[Node]
  663 
  664     def handle_starttag(
  665         self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
  666     ) -> None:
  667         node = ElementNode(tag, attrs)
  668         node._partial = True
  669         self._stack.append(node)
  670         # For void elements, immediately invoke the end tag handler (see
  671         # handle_startendtag()).
  672         if _tag_is_void(tag):
  673             self.handle_endtag(tag)
  674 
  675     def handle_endtag(self, tag: str) -> None:
  676         tag = tag.lower()
  677         children = []
  678         while self._stack and not self._stack[-1]._partial:
  679             children.append(self._stack.pop())
  680         if not self._stack:
  681             raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(tag))
  682         parent = self._stack[-1]
  683         if parent.tag != tag:
  684             raise DOMBuilderException(
  685                 self.getpos(),
  686                 "expecting end tag %s, got %s" % (repr(parent.tag), repr(tag)),
  687             )
  688         parent.children = list(reversed(children))
  689         parent._partial = False
  690         for child in children:
  691             child.parent = parent
  692 
  693     # Make parser behavior for explicitly and implicitly void elements
  694     # (e.g., <hr> vs <hr/>) consistent. The former triggers
  695     # handle_starttag only, whereas the latter triggers
  696     # handle_startendtag (which by default triggers both handle_starttag
  697     # and handle_endtag). See https://www.bugs.python.org/issue25258.
  698     def handle_startendtag(
  699         self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
  700     ) -> None:
  701         self.handle_starttag(tag, attrs)
  702 
  703     def handle_data(self, text: str) -> None:
  704         if not self._stack:
  705             # Ignore text nodes before the first tag.
  706             return
  707         self._stack.append(TextNode(text))
  708 
  709     @property
  710     def root(self) -> "Node":
  711         """
  712         Finishes processing and returns the root node.
  713 
  714         Raises :class:`DOMBuilderException` if there is no root tag or
  715         root tag is not closed yet.
  716         """
  717         if not self._stack:
  718             raise DOMBuilderException(self.getpos(), "no root tag")
  719         if self._stack[0]._partial:
  720             raise DOMBuilderException(self.getpos(), "root tag not closed yet")
  721         return self._stack[0]
  722 
  723 
  724 def parse_html(html: str, *, ParserClass: type = DOMBuilder) -> "Node":
  725     """
  726     Parses HTML string, builds DOM, and returns root node.
  727 
  728     The parser may raise :class:`DOMBuilderException`.
  729 
  730     Args:
  731         html: input HTML string
  732         ParserClass: :class:`DOMBuilder` or a subclass
  733 
  734     Returns:
  735         Root note of the parsed tree. If the HTML string contains
  736         multiple top-level elements, only the first is returned and the
  737         rest are lost.
  738     """
  739     builder = ParserClass()  # type: DOMBuilder
  740     builder.feed(html)
  741     builder.close()
  742     return builder.root
  743 
  744 
  745 class SelectorParserException(Exception):
  746     """
  747     Exception raised when the selector parser fails to parse an input.
  748 
  749     Attributes:
  750         s (:class:`str`):
  751             The input string to be parsed.
  752         cursor (:class:`int`):
  753             Cursor position where the failure occurred.
  754         why (:class:`str`):
  755             Reason of the failure.
  756     """
  757 
  758     def __init__(self, s: str, cursor: int, why: str) -> None:
  759         self.s = s
  760         self.cursor = cursor
  761         self.why = why
  762 
  763     def __str__(self) -> str:  # pragma: no cover
  764         return "selector parser aborted at character %d of %s: %s" % (
  765             self.cursor,
  766             repr(self.s),
  767             self.why,
  768         )
  769 
  770 
  771 class SelectorGroup:
  772     """
  773     Represents a group of CSS selectors.
  774 
  775     A group of CSS selectors is simply a comma-separated list of
  776     selectors. [#]_ See :class:`Selector` documentation for the scope of
  777     support.
  778 
  779     Typically, a :class:`SelectorGroup` is constructed from a string
  780     (e.g., ``th.center, td.center``) using the factory function
  781     :meth:`from_str`.
  782 
  783     .. [#] https://www.w3.org/TR/selectors-3/#grouping
  784     """
  785 
  786     def __init__(self, selectors: Iterable["Selector"]) -> None:
  787         self._selectors = list(selectors)
  788 
  789     def __repr__(self) -> str:
  790         return "<SelectorGroup %s>" % repr(str(self))
  791 
  792     def __str__(self) -> str:
  793         return ", ".join(str(selector) for selector in self._selectors)
  794 
  795     def __len__(self) -> int:
  796         return len(self._selectors)
  797 
  798     def __getitem__(self, index: int) -> "Selector":
  799         return self._selectors[index]
  800 
  801     def __iter__(self) -> Iterator["Selector"]:
  802         return iter(self._selectors)
  803 
  804     @classmethod
  805     def from_str(cls, s: str) -> "SelectorGroup":
  806         """
  807         Parses input string into a group of selectors.
  808 
  809         :class:`SelectorParserException` is raised on invalid input. See
  810         :class:`Selector` documentation for the scope of support.
  811 
  812         Args:
  813             s: input string
  814 
  815         Returns:
  816             Parsed group of selectors.
  817         """
  818         i = 0
  819         selectors = []
  820         while i < len(s):
  821             selector, i = Selector.from_str(s, i)
  822             selectors.append(selector)
  823         if not selectors:
  824             raise SelectorParserException(s, i, "selector group is empty")
  825         return cls(selectors)
  826 
  827     def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
  828         """
  829         Decides whether the group of selectors matches `node`.
  830 
  831         The group of selectors matches `node` as long as one of the
  832         selectors matches `node`.
  833 
  834         If `root` is provided and child and/or descendant combinators
  835         are involved, parent/ancestor lookup terminates at `root`.
  836         """
  837         return any(selector.matches(node, root=root) for selector in self)
  838 
  839 
  840 class Selector:
  841     """
  842     Represents a CSS selector.
  843 
  844     Recall that a CSS selector is a chain of one or more *sequences of
  845     simple selectors* separated by *combinators*. [#selectors-3]_ This
  846     concept is represented as a cons list of sequences of simple
  847     selectors (in right to left order). This class in fact holds a
  848     single sequence, with an optional combinator and reference to the
  849     previous sequence.
  850 
  851     For instance, ``main#main p.important.definition >
  852     a.term[id][href]`` would be parsed into (schematically) the
  853     following structure::
  854 
  855         ">" tag='a' classes=('term') attrs=([id], [href]) ~>
  856         " " tag='p' classes=('important', 'definition') ~>
  857         tag='main' id='main'
  858 
  859     Each line is held in a separate instance of :class:`Selector`,
  860     linked together by the :attr:`previous` attribute.
  861 
  862     Supported grammar (from selectors level 3 [#selectors-3]_):
  863 
  864     - Type selectors;
  865     - Universal selectors;
  866     - Class selectors;
  867     - ID selectors;
  868     - Attribute selectors;
  869     - Combinators.
  870 
  871     Unsupported grammar:
  872 
  873     - Pseudo-classes;
  874     - Pseudo-elements;
  875     - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any
  876       selector.
  877 
  878     Rationale:
  879 
  880     - Pseudo-classes have too many variants, a few of which even
  881       complete with an admittedly not-so-complex minilanguage. These add
  882       up to a lot of code.
  883     - Pseudo-elements are useless outside rendering contexts, hence out of
  884       scope.
  885     - Namespace support is too niche to be worth the parsing headache.
  886       *Using namespace prefixes may confuse the parser!*
  887 
  888     Note that the parser only loosely follows the spec and priotizes
  889     ease of parsing (which includes readability and *writability* of
  890     regexes), so some invalid selectors may be accepted (in fact, false
  891     positives abound, but accepting valid inputs is a much more
  892     important goal than rejecting invalid inputs for this library), and
  893     some valid selectors may be rejected (but as long as you stick to
  894     the scope outlined above and common sense you should be fine; the
  895     false negatives shouldn't be used by actual human beings anyway).
  896 
  897     In particular, whitespace character is simplified to ``\\s`` (ASCII
  898     mode) despite CSS spec not counting U+000B (VT) as whitespace,
  899     identifiers are simplified to ``[\\w-]+`` (ASCII mode), and strings
  900     (attribute selector values can be either identifiers or strings)
  901     allow escaped quotes (i.e., ``\\'`` inside single-quoted strings and
  902     ``\\"`` inside double-quoted strings) but everything else is
  903     interpreted literally. The exact specs for CSS identifiers and
  904     strings can be found at [#]_.
  905 
  906     Certain selectors and combinators may be implemented in the parser
  907     but not implemented in matching and/or selection APIs.
  908 
  909     .. [#selectors-3] https://www.w3.org/TR/selectors-3/
  910     .. [#] https://www.w3.org/TR/CSS21/syndata.html
  911 
  912     Attributes:
  913         tag (:class:`Optional`\\[:class:`str`]):
  914             Type selector.
  915         classes (:class:`List`\\[:class:`str`]):
  916             Class selectors.
  917         id (:class:`Optional`\\[:class:`str`]):
  918             ID selector.
  919         attrs (:class:`List`\\[:class:`AttributeSelector`]):
  920             Attribute selectors.
  921         combinator (:class:`Optional`\\[:class:`Combinator`]):
  922             Combinator with the previous sequence of simple selectors in
  923             chain.
  924         previous (:class:`Optional`\\[:class:`Selector`]):
  925             Reference to the previous sequence of simple selectors in
  926             chain.
  927 
  928     """
  929 
  930     def __init__(
  931         self,
  932         *,
  933         tag: Optional[str] = None,
  934         classes: Optional[Sequence[str]] = None,
  935         id: Optional[str] = None,
  936         attrs: Optional[Sequence["AttributeSelector"]] = None,
  937         combinator: Optional["Combinator"] = None,
  938         previous: Optional["Selector"] = None
  939     ) -> None:
  940         self.tag = tag.lower() if tag else None
  941         self.classes = list(classes or [])
  942         self.id = id
  943         self.attrs = list(attrs or [])
  944         self.combinator = combinator
  945         self.previous = previous
  946 
  947     def __repr__(self) -> str:
  948         return "<Selector %s>" % repr(str(self))
  949 
  950     def __str__(self) -> str:
  951         sequences = []
  952         delimiters = []
  953         seq = self
  954         while True:
  955             sequences.append(seq._sequence_str_())
  956             if seq.previous:
  957                 if seq.combinator == Combinator.DESCENDANT:
  958                     delimiters.append(" ")
  959                 elif seq.combinator == Combinator.CHILD:
  960                     delimiters.append(" > ")
  961                 elif seq.combinator == Combinator.NEXT_SIBLING:
  962                     delimiters.append(" + ")
  963                 elif seq.combinator == Combinator.SUBSEQUENT_SIBLING:
  964                     delimiters.append(" ~ ")
  965                 else:  # pragma: no cover
  966                     raise RuntimeError(
  967                         "unimplemented combinator: %s" % repr(self.combinator)
  968                     )
  969                 seq = seq.previous
  970             else:
  971                 delimiters.append("")
  972                 break
  973         return "".join(
  974             delimiter + sequence
  975             for delimiter, sequence in zip(reversed(delimiters), reversed(sequences))
  976         )
  977 
  978     # Format a single sequence of simple selectors, without combinator.
  979     def _sequence_str_(self) -> str:
  980         s = ""
  981         if self.tag:
  982             s += self.tag
  983         if self.classes:
  984             s += "".join(".%s" % class_ for class_ in self.classes)
  985         if self.id:
  986             s += "#%s" % self.id
  987         if self.attrs:
  988             s += "".join(str(attr) for attr in self.attrs)
  989         return s if s else "*"
  990 
  991     @classmethod
  992     def from_str(cls, s: str, cursor: int = 0) -> Tuple["Selector", int]:
  993         """
  994         Parses input string into selector.
  995 
  996         This factory function only parses out one selector (up to a
  997         comma or EOS), so partial consumption is allowed --- an optional
  998         `cursor` is taken as input (0 by default) and the moved cursor
  999         (either after the comma or at EOS) is returned as part of the
 1000         output.
 1001 
 1002         :class:`SelectorParserException` is raised on invalid input. See
 1003         :class:`Selector` documentation for the scope of support.
 1004 
 1005         If you need to completely consume a string representing
 1006         (potentially) a group of selectors, use
 1007         :meth:`SelectorGroup.from_str()`.
 1008 
 1009         Args:
 1010             s:      input string
 1011             cursor: initial cursor position on `s`
 1012 
 1013         Returns:
 1014             A tuple containing the parsed selector and the moved the
 1015             cursor (either after a comma-delimiter, or at EOS).
 1016         """
 1017         # Simple selectors.
 1018         TYPE_SEL = re.compile(r"[\w-]+", re.A)
 1019         UNIVERSAL_SEL = re.compile(r"\*")
 1020         ATTR_SEL = re.compile(
 1021             r"""\[
 1022             \s*(?P<attr>[\w-]+)\s*
 1023             (
 1024                 (?P<op>[~|^$*]?=)\s*
 1025                 (
 1026                     (?P<val_identifier>[\w-]+)|
 1027                     (?P<val_string>
 1028                         (?P<quote>['"])
 1029                         (?P<val_string_inner>.*?)
 1030                         (?<!\\)(?P=quote)
 1031                     )
 1032                 )\s*
 1033             )?
 1034             \]""",
 1035             re.A | re.X,
 1036         )
 1037         CLASS_SEL = re.compile(r"\.([\w-]+)", re.A)
 1038         ID_SEL = re.compile(r"#([\w-]+)", re.A)
 1039         PSEUDO_CLASS_SEL = re.compile(r":[\w-]+(\([^)]+\))?", re.A)
 1040         PSEUDO_ELEM_SEL = re.compile(r"::[\w-]+", re.A)
 1041 
 1042         # Combinators
 1043         DESCENDANT_COM = re.compile(r"\s+")
 1044         CHILD_COM = re.compile(r"\s*>\s*")
 1045         NEXT_SIB_COM = re.compile(r"\s*\+\s*")
 1046         SUB_SIB_COM = re.compile(r"\s*~\s*")
 1047 
 1048         # Misc
 1049         WHITESPACE = re.compile(r"\s*")
 1050         END_OF_SELECTOR = re.compile(r"\s*($|,)")
 1051 
 1052         tag = None
 1053         classes = []
 1054         id = None
 1055         attrs = []
 1056         combinator = None
 1057 
 1058         selector = None
 1059         previous_combinator = None
 1060 
 1061         i = cursor
 1062 
 1063         # Skip leading whitespace
 1064         m = WHITESPACE.match(s, i)
 1065         if m:
 1066             i = m.end()
 1067 
 1068         while i < len(s):
 1069             # Parse one simple selector.
 1070             #
 1071             # PEP 572 (assignment expressions; the one that burned Guido
 1072             # so much that he resigned as BDFL) would have been nice; it
 1073             # would have saved us from all the regex match
 1074             # reassignments, and worse still, the casts, since mypy
 1075             # complains about getting Optional[Match[str]] instead of
 1076             # Match[str].
 1077             if TYPE_SEL.match(s, i):
 1078                 if tag:
 1079                     raise SelectorParserException(s, i, "multiple type selectors found")
 1080                 m = cast(Match[str], TYPE_SEL.match(s, i))
 1081                 tag = m.group()
 1082             elif UNIVERSAL_SEL.match(s, i):
 1083                 m = cast(Match[str], UNIVERSAL_SEL.match(s, i))
 1084             elif ATTR_SEL.match(s, i):
 1085                 m = cast(Match[str], ATTR_SEL.match(s, i))
 1086 
 1087                 attr = m.group("attr")
 1088                 op = m.group("op")
 1089                 val_identifier = m.group("val_identifier")
 1090                 quote = m.group("quote")
 1091                 val_string_inner = m.group("val_string_inner")
 1092                 if val_identifier is not None:
 1093                     val = val_identifier
 1094                 elif val_string_inner is not None:
 1095                     val = val_string_inner.replace("\\" + quote, quote)
 1096                 else:
 1097                     val = None
 1098 
 1099                 if op is None:
 1100                     type = AttributeSelectorType.BARE
 1101                 elif op == "=":
 1102                     type = AttributeSelectorType.EQUAL
 1103                 elif op == "~=":
 1104                     type = AttributeSelectorType.TILDE
 1105                 elif op == "|=":
 1106                     type = AttributeSelectorType.PIPE
 1107                 elif op == "^=":
 1108                     type = AttributeSelectorType.CARET
 1109                 elif op == "$=":
 1110                     type = AttributeSelectorType.DOLLAR
 1111                 elif op == "*=":
 1112                     type = AttributeSelectorType.ASTERISK
 1113                 else:  # pragma: no cover
 1114                     raise SelectorParserException(
 1115                         s,
 1116                         i,
 1117                         "unrecognized operator %s in attribute selector" % repr(op),
 1118                     )
 1119 
 1120                 attrs.append(AttributeSelector(attr, val, type))
 1121             elif CLASS_SEL.match(s, i):
 1122                 m = cast(Match[str], CLASS_SEL.match(s, i))
 1123                 classes.append(m.group(1))
 1124             elif ID_SEL.match(s, i):
 1125                 if id:
 1126                     raise SelectorParserException(s, i, "multiple id selectors found")
 1127                 m = cast(Match[str], ID_SEL.match(s, i))
 1128                 id = m.group(1)
 1129             elif PSEUDO_CLASS_SEL.match(s, i):
 1130                 raise SelectorParserException(s, i, "pseudo-classes not supported")
 1131             elif PSEUDO_ELEM_SEL.match(s, i):
 1132                 raise SelectorParserException(s, i, "pseudo-elements not supported")
 1133             else:
 1134                 raise SelectorParserException(
 1135                     s, i, "expecting simple selector, found none"
 1136                 )
 1137             i = m.end()
 1138 
 1139             # Try to parse a combinator, or end the selector.
 1140             if CHILD_COM.match(s, i):
 1141                 m = cast(Match[str], CHILD_COM.match(s, i))
 1142                 combinator = Combinator.CHILD
 1143             elif NEXT_SIB_COM.match(s, i):
 1144                 m = cast(Match[str], NEXT_SIB_COM.match(s, i))
 1145                 combinator = Combinator.NEXT_SIBLING
 1146             elif SUB_SIB_COM.match(s, i):
 1147                 m = cast(Match[str], SUB_SIB_COM.match(s, i))
 1148                 combinator = Combinator.SUBSEQUENT_SIBLING
 1149             elif END_OF_SELECTOR.match(s, i):
 1150                 m = cast(Match[str], END_OF_SELECTOR.match(s, i))
 1151                 combinator = None
 1152             # Need to parse descendant combinator at the very end
 1153             # because it could be a prefix to all previous cases.
 1154             elif DESCENDANT_COM.match(s, i):
 1155                 m = cast(Match[str], DESCENDANT_COM.match(s, i))
 1156                 combinator = Combinator.DESCENDANT
 1157             else:
 1158                 continue
 1159             i = m.end()
 1160 
 1161             if combinator and i == len(s):
 1162                 raise SelectorParserException(s, i, "unexpected end at combinator")
 1163 
 1164             selector = cls(
 1165                 tag=tag,
 1166                 classes=classes,
 1167                 id=id,
 1168                 attrs=attrs,
 1169                 combinator=previous_combinator,
 1170                 previous=selector,
 1171             )
 1172             previous_combinator = combinator
 1173 
 1174             # End of selector.
 1175             if combinator is None:
 1176                 break
 1177 
 1178             tag = None
 1179             classes = []
 1180             id = None
 1181             attrs = []
 1182             combinator = None
 1183 
 1184         if not selector:
 1185             raise SelectorParserException(s, i, "selector is empty")
 1186 
 1187         return selector, i
 1188 
 1189     def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
 1190         """
 1191         Decides whether the selector matches `node`.
 1192 
 1193         Each sequence of simple selectors in the selector's chain must
 1194         be matched for a positive.
 1195 
 1196         If `root` is provided and child and/or descendant combinators
 1197         are involved, parent/ancestor lookup terminates at `root`.
 1198         """
 1199         if self.tag:
 1200             if not node.tag or node.tag != self.tag:
 1201                 return False
 1202         if self.id:
 1203             if node.attrs.get("id") != self.id:
 1204                 return False
 1205         if self.classes:
 1206             classes = node.classes
 1207             for class_ in self.classes:
 1208                 if class_ not in classes:
 1209                     return False
 1210         if self.attrs:
 1211             for attr_selector in self.attrs:
 1212                 if not attr_selector.matches(node):
 1213                     return False
 1214 
 1215         if not self.previous:
 1216             return True
 1217 
 1218         if self.combinator == Combinator.DESCENDANT:
 1219             return any(
 1220                 self.previous.matches(ancestor, root=root)
 1221                 for ancestor in node.ancestors()
 1222             )
 1223         elif self.combinator == Combinator.CHILD:
 1224             if node is root or node.parent is None:
 1225                 return False
 1226             else:
 1227                 return self.previous.matches(node.parent)
 1228         elif self.combinator == Combinator.NEXT_SIBLING:
 1229             sibling = node.previous_element_sibling()
 1230             if not sibling:
 1231                 return False
 1232             else:
 1233                 return self.previous.matches(sibling)
 1234         elif self.combinator == Combinator.SUBSEQUENT_SIBLING:
 1235             return any(
 1236                 self.previous.matches(sibling, root=root)
 1237                 for sibling in node.previous_siblings()
 1238                 if isinstance(sibling, ElementNode)
 1239             )
 1240         else:  # pragma: no cover
 1241             raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator))
 1242 
 1243 
 1244 class AttributeSelector:
 1245     """
 1246     Represents an attribute selector.
 1247 
 1248     Attributes:
 1249         attr (:class:`str`)
 1250         val  (:class:`Optional`\\[:class:`str`])
 1251         type (:class:`AttributeSelectorType`)
 1252     """
 1253 
 1254     def __init__(
 1255         self, attr: str, val: Optional[str], type: "AttributeSelectorType"
 1256     ) -> None:
 1257         self.attr = attr.lower()
 1258         self.val = val
 1259         self.type = type
 1260 
 1261     def __repr__(self) -> str:
 1262         return "<AttributeSelector %s>" % repr(str(self))
 1263 
 1264     def __str__(self) -> str:
 1265         if self.type == AttributeSelectorType.BARE:
 1266             fmt = "[{attr}{val:.0}]"
 1267         elif self.type == AttributeSelectorType.EQUAL:
 1268             fmt = "[{attr}={val}]"
 1269         elif self.type == AttributeSelectorType.TILDE:
 1270             fmt = "[{attr}~={val}]"
 1271         elif self.type == AttributeSelectorType.PIPE:
 1272             fmt = "[{attr}|={val}]"
 1273         elif self.type == AttributeSelectorType.CARET:
 1274             fmt = "[{attr}^={val}]"
 1275         elif self.type == AttributeSelectorType.DOLLAR:
 1276             fmt = "[{attr}$={val}]"
 1277         elif self.type == AttributeSelectorType.ASTERISK:
 1278             fmt = "[{attr}*={val}]"
 1279         return fmt.format(attr=self.attr, val=repr(self.val))
 1280 
 1281     def matches(self, node: "Node") -> bool:
 1282         val = node.attrs.get(self.attr)
 1283         if val is None:
 1284             return False
 1285         if self.type == AttributeSelectorType.BARE:
 1286             return True
 1287         elif self.type == AttributeSelectorType.EQUAL:
 1288             return val == self.val
 1289         elif self.type == AttributeSelectorType.TILDE:
 1290             return self.val in val.split()
 1291         elif self.type == AttributeSelectorType.PIPE:
 1292             return val == self.val or val.startswith("%s-" % self.val)
 1293         elif self.type == AttributeSelectorType.CARET:
 1294             return bool(self.val and val.startswith(self.val))
 1295         elif self.type == AttributeSelectorType.DOLLAR:
 1296             return bool(self.val and val.endswith(self.val))
 1297         elif self.type == AttributeSelectorType.ASTERISK:
 1298             return bool(self.val and self.val in val)
 1299         else:  # pragma: no cover
 1300             raise RuntimeError("unimplemented attribute selector: %s" % repr(self.type))
 1301 
 1302 
 1303 # Enum: basis for poor man's algebraic data type.
 1304 class AttributeSelectorType(Enum):
 1305     """
 1306     Attribute selector types.
 1307 
 1308     Members correspond to the following forms of attribute selector:
 1309 
 1310     - :attr:`BARE`: ``[attr]``;
 1311     - :attr:`EQUAL`: ``[attr=val]``;
 1312     - :attr:`TILDE`: ``[attr~=val]``;
 1313     - :attr:`PIPE`: ``[attr|=val]``;
 1314     - :attr:`CARET`: ``[attr^=val]``;
 1315     - :attr:`DOLLAR`: ``[attr$=val]``;
 1316     - :attr:`ASTERISK`: ``[attr*=val]``.
 1317     """
 1318 
 1319     # [attr]
 1320     BARE = 1
 1321     # [attr=val]
 1322     EQUAL = 2
 1323     # [attr~=val]
 1324     TILDE = 3
 1325     # [attr|=val]
 1326     PIPE = 4
 1327     # [attr^=val]
 1328     CARET = 5
 1329     # [attr$=val]
 1330     DOLLAR = 6
 1331     # [attr*=val]
 1332     ASTERISK = 7
 1333 
 1334 
 1335 class Combinator(Enum):
 1336     """
 1337     Combinator types.
 1338 
 1339     Members correspond to the following combinators:
 1340 
 1341     - :attr:`DESCENDANT`: ``A B``;
 1342     - :attr:`CHILD`: ``A > B``;
 1343     - :attr:`NEXT_SIBLING`: ``A + B``;
 1344     - :attr:`SUBSEQUENT_SIBLING`: ``A ~ B``.
 1345     """
 1346 
 1347     # ' '
 1348     DESCENDANT = 1
 1349     # >
 1350     CHILD = 2
 1351     # +
 1352     NEXT_SIBLING = 3
 1353     # ~
 1354     SUBSEQUENT_SIBLING = 4
 1355 
 1356 
 1357 def _tag_is_void(tag: str) -> bool:
 1358     """
 1359     Checks whether the tag corresponds to a void element.
 1360 
 1361     https://www.w3.org/TR/html5/syntax.html#void-elements
 1362     https://html.spec.whatwg.org/multipage/syntax.html#void-elements
 1363     """
 1364     return tag.lower() in (
 1365         "area",
 1366         "base",
 1367         "br",
 1368         "col",
 1369         "embed",
 1370         "hr",
 1371         "img",
 1372         "input",
 1373         "link",
 1374         "meta",
 1375         "param",
 1376         "source",
 1377         "track",
 1378         "wbr",
 1379     )
 1380 
 1381 ### end dim ###
 1382 
 1383 
 1384 # Global helper functions
 1385 
 1386 def open_url(url):
 1387     """Open an URL in the user's default web browser.
 1388 
 1389     The string attribute ``open_url.url_handler`` can be used to open URLs
 1390     in a custom CLI script or utility. A subprocess is spawned with url as
 1391     the parameter in this case instead of the usual webbrowser.open() call.
 1392 
 1393     Whether the browser's output (both stdout and stderr) are suppressed
 1394     depends on the boolean attribute ``open_url.suppress_browser_output``.
 1395     If the attribute is not set upon a call, set it to a default value,
 1396     which means False if BROWSER is set to a known text-based browser --
 1397     elinks, links, lynx, w3m or 'www-browser'; or True otherwise.
 1398 
 1399     The string attribute ``open_url.override_text_browser`` can be used to
 1400     ignore env var BROWSER as well as some known text-based browsers and
 1401     attempt to open url in a GUI browser available.
 1402     Note: If a GUI browser is indeed found, this option ignores the program
 1403           option `show-browser-logs`
 1404     """
 1405     logger.debug('Opening %s', url)
 1406 
 1407     # Custom URL handler gets max priority
 1408     if hasattr(open_url, 'url_handler'):
 1409         p = Popen([open_url.url_handler, url], stdin=PIPE)
 1410         p.communicate()
 1411         return
 1412 
 1413     browser = webbrowser.get()
 1414     if open_url.override_text_browser:
 1415         browser_output = open_url.suppress_browser_output
 1416         for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
 1417             browser = webbrowser.get(name)
 1418             logger.debug(browser)
 1419 
 1420             # Found a GUI browser, suppress browser output
 1421             open_url.suppress_browser_output = True
 1422             break
 1423 
 1424     if open_url.suppress_browser_output:
 1425         _stderr = os.dup(2)
 1426         os.close(2)
 1427         _stdout = os.dup(1)
 1428         os.close(1)
 1429         fd = os.open(os.devnull, os.O_RDWR)
 1430         os.dup2(fd, 2)
 1431         os.dup2(fd, 1)
 1432     try:
 1433         browser.open(url, new=2)
 1434     finally:
 1435         if open_url.suppress_browser_output:
 1436             os.close(fd)
 1437             os.dup2(_stderr, 2)
 1438             os.dup2(_stdout, 1)
 1439 
 1440     if open_url.override_text_browser:
 1441         open_url.suppress_browser_output = browser_output
 1442 
 1443 
 1444 def printerr(msg):
 1445     """Print message, verbatim, to stderr.
 1446 
 1447     ``msg`` could be any stringifiable value.
 1448     """
 1449     print(msg, file=sys.stderr)
 1450 
 1451 
 1452 def unwrap(text):
 1453     """Unwrap text."""
 1454     lines = text.split('\n')
 1455     result = ''
 1456     for i in range(len(lines) - 1):
 1457         result += lines[i]
 1458         if not lines[i]:
 1459             # Paragraph break
 1460             result += '\n\n'
 1461         elif lines[i + 1]:
 1462             # Next line is not paragraph break, add space
 1463             result += ' '
 1464     # Handle last line
 1465     result += lines[-1] if lines[-1] else '\n'
 1466     return result
 1467 
 1468 
 1469 def check_stdout_encoding():
 1470     """Make sure stdout encoding is utf-8.
 1471 
 1472     If not, print error message and instructions, then exit with
 1473     status 1.
 1474 
 1475     This function is a no-op on win32 because encoding on win32 is
 1476     messy, and let's just hope for the best. /s
 1477     """
 1478     if sys.platform == 'win32':
 1479         return
 1480 
 1481     # Use codecs.lookup to resolve text encoding alias
 1482     encoding = codecs.lookup(sys.stdout.encoding).name
 1483     if encoding != 'utf-8':
 1484         locale_lang, locale_encoding = locale.getlocale()
 1485         if locale_lang is None:
 1486             locale_lang = '<unknown>'
 1487         if locale_encoding is None:
 1488             locale_encoding = '<unknown>'
 1489         ioencoding = os.getenv('PYTHONIOENCODING', 'not set')
 1490         sys.stderr.write(unwrap(textwrap.dedent("""\
 1491         stdout encoding '{encoding}' detected. googler requires utf-8 to
 1492         work properly. The wrong encoding may be due to a non-UTF-8
 1493         locale or an improper PYTHONIOENCODING. (For the record, your
 1494         locale language is {locale_lang} and locale encoding is
 1495         {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.)
 1496 
 1497         Please set a UTF-8 locale (e.g., en_US.UTF-8) or set
 1498         PYTHONIOENCODING to utf-8.
 1499         """.format(
 1500             encoding=encoding,
 1501             locale_lang=locale_lang,
 1502             locale_encoding=locale_encoding,
 1503             ioencoding=ioencoding,
 1504         ))))
 1505         sys.exit(1)
 1506 
 1507 
 1508 # Classes
 1509 
class HardenedHTTPSConnection(HTTPSConnection):
    """Overrides HTTPSConnection.connect to specify TLS version

    NOTE: TLS 1.2 is supported from Python 3.4
    """

    def __init__(self, host, **kwargs):
        # No extra state; defer entirely to the stock constructor.
        HTTPSConnection.__init__(self, host, **kwargs)

    def connect(self, notweak=False):
        # Establish the raw TCP connection ourselves so we can tweak
        # socket options and control the TLS handshake below.
        sock = socket.create_connection((self.host, self.port),
                                        self.timeout, self.source_address)

        # Optimizations not available on OS X
        if not notweak and sys.platform.startswith('linux'):
            try:
                # Latency/throughput tweaks: defer accept, quick ACKs,
                # and a 512 KiB receive buffer. Best-effort only.
                sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
            except OSError:
                # Doesn't work on Windows' Linux subsystem (#179)
                logger.debug('setsockopt failed')

        if getattr(self, '_tunnel_host', None):
            # Proxy tunneling in play: hand the raw socket over and let
            # the fallback path below finish the setup.
            self.sock = sock
        elif not notweak:
            # Try to use TLS 1.2
            ssl_context = None
            if hasattr(ssl, 'PROTOCOL_TLS'):
                # Since Python 3.5.3
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
                if hasattr(ssl_context, "minimum_version"):
                    # Python 3.7 with OpenSSL 1.1.0g or later
                    ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
                else:
                    # Older Pythons: forbid everything below TLS 1.2
                    # via protocol-exclusion option flags instead.
                    ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
                # Since Python 3.4
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
            if ssl_context:
                # NOTE(review): wrap_socket() is called without
                # server_hostname, and this ad-hoc context keeps the
                # SSLContext defaults -- so this path appears to send
                # no SNI and perform no certificate/hostname
                # verification; confirm whether that is intentional.
                self.sock = ssl_context.wrap_socket(sock)
                return

        # Fallback
        # Reached when no suitable SSLContext could be built, when
        # notweak is set, or when tunneling through a proxy; the stock
        # implementation completes the connection.
        HTTPSConnection.connect(self)
 1556 
 1557 
 1558 class GoogleUrl(object):
 1559     """
 1560     This class constructs the Google Search/News URL.
 1561 
 1562     This class is modelled on urllib.parse.ParseResult for familiarity,
 1563     which means it supports reading of all six attributes -- scheme,
 1564     netloc, path, params, query, fragment -- of
 1565     urllib.parse.ParseResult, as well as the geturl() method.
 1566 
 1567     However, the attributes (properties) and methods listed below should
 1568     be the preferred methods of access to this class.
 1569 
 1570     Parameters
 1571     ----------
 1572     opts : dict or argparse.Namespace, optional
 1573         See the ``opts`` parameter of `update`.
 1574 
 1575     Other Parameters
 1576     ----------------
 1577     See "Other Parameters" of `update`.
 1578 
 1579     Attributes
 1580     ----------
 1581     hostname : str
 1582         Read-write property.
 1583     keywords : str or list of strs
 1584         Read-write property.
 1585     news : bool
 1586         Read-only property.
 1587     videos : bool
 1588         Read-only property.
 1589     url : str
 1590         Read-only property.
 1591 
 1592     Methods
 1593     -------
 1594     full()
 1595     relative()
 1596     update(opts=None, **kwargs)
 1597     set_queries(**kwargs)
 1598     unset_queries(*args)
 1599     next_page()
 1600     prev_page()
 1601     first_page()
 1602 
 1603     """
 1604 
 1605     def __init__(self, opts=None, **kwargs):
 1606         self.scheme = 'https'
 1607         # self.netloc is a calculated property
 1608         self.path = '/search'
 1609         self.params = ''
 1610         # self.query is a calculated property
 1611         self.fragment = ''
 1612 
 1613         self._tld = None
 1614         self._num = 10
 1615         self._start = 0
 1616         self._keywords = []
 1617         self._sites = None
 1618 
 1619         self._query_dict = {
 1620             'ie': 'UTF-8',
 1621             'oe': 'UTF-8',
 1622             #'gbv': '1',  # control the presence of javascript on the page, 1=no js, 2=js
 1623             'sei': base64.encodebytes(uuid.uuid1().bytes).decode("ascii").rstrip('=\n').replace('/', '_'),
 1624         }
 1625         self.update(opts, **kwargs)
 1626 
 1627     def __str__(self):
 1628         return self.url
 1629 
 1630     @property
 1631     def url(self):
 1632         """The full Google URL you want."""
 1633         return self.full()
 1634 
 1635     @property
 1636     def hostname(self):
 1637         """The hostname."""
 1638         return self.netloc
 1639 
 1640     @hostname.setter
 1641     def hostname(self, hostname):
 1642         self.netloc = hostname
 1643 
 1644     @property
 1645     def keywords(self):
 1646         """The keywords, either a str or a list of strs."""
 1647         return self._keywords
 1648 
 1649     @keywords.setter
 1650     def keywords(self, keywords):
 1651         self._keywords = keywords
 1652 
 1653     @property
 1654     def news(self):
 1655         """Whether the URL is for Google News."""
 1656         return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws'
 1657 
 1658     @property
 1659     def videos(self):
 1660         """Whether the URL is for Google Videos."""
 1661         return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'vid'
 1662 
 1663     def full(self):
 1664         """Return the full URL.
 1665 
 1666         Returns
 1667         -------
 1668         str
 1669 
 1670         """
 1671         url = (self.scheme + ':') if self.scheme else ''
 1672         url += '//' + self.netloc + self.relative()
 1673         return url
 1674 
 1675     def relative(self):
 1676         """Return the relative URL (without scheme and authority).
 1677 
 1678         Authority (see RFC 3986 section 3.2), or netloc in the
 1679         terminology of urllib.parse, basically means the hostname
 1680         here. The relative URL is good for making HTTP(S) requests to a
 1681         known host.
 1682 
 1683         Returns
 1684         -------
 1685         str
 1686 
 1687         """
 1688         rel = self.path
 1689         if self.params:
 1690             rel += ';' + self.params
 1691         if self.query:
 1692             rel += '?' + self.query
 1693         if self.fragment:
 1694             rel += '#' + self.fragment
 1695         return rel
 1696 
    def update(self, opts=None, **kwargs):
        """Update the URL with the given options.

        Parameters
        ----------
        opts : dict or argparse.Namespace, optional
            Carries options that affect the Google Search/News URL. The
            list of currently recognized option keys with expected value
            types:

                duration: str (GooglerArgumentParser.is_duration)
                exact: bool
                from: str
                to: str
                keywords: str or list of strs
                lang: str
                news: bool
                videos: bool
                num: int
                sites: list of strs
                start: int
                tld: str
                unfilter: bool

        Other Parameters
        ----------------
        kwargs
            The `kwargs` dict extends `opts`, that is, options can be
            specified either way, in `opts` or as individual keyword
            arguments.

        """

        if opts is None:
            opts = {}
        if hasattr(opts, '__dict__'):
            # Accept an argparse.Namespace (or anything dict-like) transparently.
            opts = opts.__dict__
        opts.update(kwargs)

        qd = self._query_dict
        if opts.get('duration'):
            qd['tbs'] = 'qdr:%s' % opts['duration']
        if 'exact' in opts:
            if opts['exact']:
                qd['nfpr'] = 1
            else:
                qd.pop('nfpr', None)
        # NOTE: a from/to date range overwrites any 'tbs' value set by
        # 'duration' above — the two are mutually exclusive.
        if opts.get('from') or opts.get('to'):
            cd_min = opts.get('from') or ''
            cd_max = opts.get('to') or ''
            qd['tbs'] = 'cdr:1,cd_min:%s,cd_max:%s' % (cd_min, cd_max)
        if 'keywords' in opts:
            self._keywords = opts['keywords']
        if 'lang' in opts and opts['lang']:
            qd['hl'] = opts['lang']
        # news and videos are mutually exclusive; absence of both clears tbm.
        if 'news' in opts and opts['news']:
            qd['tbm'] = 'nws'
        elif 'videos' in opts and opts['videos']:
            qd['tbm'] = 'vid'
        else:
            qd.pop('tbm', None)
        if 'num' in opts:
            self._num = opts['num']
        if 'sites' in opts:
            self._sites = opts['sites']
        if 'start' in opts:
            self._start = opts['start']
        if 'tld' in opts:
            self._tld = opts['tld']
        if 'unfilter' in opts and opts['unfilter']:
            qd['filter'] = 0
 1766 
 1767     def set_queries(self, **kwargs):
 1768         """Forcefully set queries outside the normal `update` mechanism.
 1769 
 1770         Other Parameters
 1771         ----------------
 1772         kwargs
 1773             Arbitrary key value pairs to be set in the query string. All
 1774             keys and values should be stringifiable.
 1775 
 1776             Note that certain keys, e.g., ``q``, have their values
 1777             constructed on the fly, so setting those has no actual
 1778             effect.
 1779 
 1780         """
 1781         for k, v in kwargs.items():
 1782             self._query_dict[k] = v
 1783 
 1784     def unset_queries(self, *args):
 1785         """Forcefully unset queries outside the normal `update` mechanism.
 1786 
 1787         Other Parameters
 1788         ----------------
 1789         args
 1790             Arbitrary keys to be unset. No exception is raised if a key
 1791             does not exist in the first place.
 1792 
 1793             Note that certain keys, e.g., ``q``, are always included in
 1794             the resulting URL, so unsetting those has no actual effect.
 1795 
 1796         """
 1797         for k in args:
 1798             self._query_dict.pop(k, None)
 1799 
 1800     def next_page(self):
 1801         """Navigate to the next page."""
 1802         self._start += self._num
 1803 
 1804     def prev_page(self):
 1805         """Navigate to the previous page.
 1806 
 1807         Raises
 1808         ------
 1809         ValueError
 1810             If already at the first page (``start=0`` in the current
 1811             query string).
 1812 
 1813         """
 1814         if self._start == 0:
 1815             raise ValueError('Already at the first page.')
 1816         self._start = (self._start - self._num) if self._start > self._num else 0
 1817 
 1818     def first_page(self):
 1819         """Navigate to the first page.
 1820 
 1821         Raises
 1822         ------
 1823         ValueError
 1824             If already at the first page (``start=0`` in the current
 1825             query string).
 1826 
 1827         """
 1828         if self._start == 0:
 1829             raise ValueError('Already at the first page.')
 1830         self._start = 0
 1831 
    # Country TLD -> Google domain; consumed by the `netloc` property,
    # which falls back to www.google.com for unknown TLDs.
    # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains
    # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
    TLD_TO_DOMAIN_MAP = {
        'ac': 'google.ac',      'ad': 'google.ad',      'ae': 'google.ae',
        'af': 'google.com.af',  'ag': 'google.com.ag',  'ai': 'google.com.ai',
        'al': 'google.al',      'am': 'google.am',      'ao': 'google.co.ao',
        'ar': 'google.com.ar',  'as': 'google.as',      'at': 'google.at',
        'au': 'google.com.au',  'az': 'google.az',      'ba': 'google.ba',
        'bd': 'google.com.bd',  'be': 'google.be',      'bf': 'google.bf',
        'bg': 'google.bg',      'bh': 'google.com.bh',  'bi': 'google.bi',
        'bj': 'google.bj',      'bn': 'google.com.bn',  'bo': 'google.com.bo',
        'br': 'google.com.br',  'bs': 'google.bs',      'bt': 'google.bt',
        'bw': 'google.co.bw',   'by': 'google.by',      'bz': 'google.com.bz',
        'ca': 'google.ca',      'cat': 'google.cat',    'cc': 'google.cc',
        'cd': 'google.cd',      'cf': 'google.cf',      'cg': 'google.cg',
        'ch': 'google.ch',      'ci': 'google.ci',      'ck': 'google.co.ck',
        'cl': 'google.cl',      'cm': 'google.cm',      'cn': 'google.cn',
        'co': 'google.com.co',  'cr': 'google.co.cr',   'cu': 'google.com.cu',
        'cv': 'google.cv',      'cy': 'google.com.cy',  'cz': 'google.cz',
        'de': 'google.de',      'dj': 'google.dj',      'dk': 'google.dk',
        'dm': 'google.dm',      'do': 'google.com.do',  'dz': 'google.dz',
        'ec': 'google.com.ec',  'ee': 'google.ee',      'eg': 'google.com.eg',
        'es': 'google.es',      'et': 'google.com.et',  'fi': 'google.fi',
        'fj': 'google.com.fj',  'fm': 'google.fm',      'fr': 'google.fr',
        'ga': 'google.ga',      'ge': 'google.ge',      'gf': 'google.gf',
        'gg': 'google.gg',      'gh': 'google.com.gh',  'gi': 'google.com.gi',
        'gl': 'google.gl',      'gm': 'google.gm',      'gp': 'google.gp',
        'gr': 'google.gr',      'gt': 'google.com.gt',  'gy': 'google.gy',
        'hk': 'google.com.hk',  'hn': 'google.hn',      'hr': 'google.hr',
        'ht': 'google.ht',      'hu': 'google.hu',      'id': 'google.co.id',
        'ie': 'google.ie',      'il': 'google.co.il',   'im': 'google.im',
        'in': 'google.co.in',   'io': 'google.io',      'iq': 'google.iq',
        'is': 'google.is',      'it': 'google.it',      'je': 'google.je',
        'jm': 'google.com.jm',  'jo': 'google.jo',      'jp': 'google.co.jp',
        'ke': 'google.co.ke',   'kg': 'google.kg',      'kh': 'google.com.kh',
        'ki': 'google.ki',      'kr': 'google.co.kr',   'kw': 'google.com.kw',
        'kz': 'google.kz',      'la': 'google.la',      'lb': 'google.com.lb',
        'lc': 'google.com.lc',  'li': 'google.li',      'lk': 'google.lk',
        'ls': 'google.co.ls',   'lt': 'google.lt',      'lu': 'google.lu',
        'lv': 'google.lv',      'ly': 'google.com.ly',  'ma': 'google.co.ma',
        'md': 'google.md',      'me': 'google.me',      'mg': 'google.mg',
        'mk': 'google.mk',      'ml': 'google.ml',      'mm': 'google.com.mm',
        'mn': 'google.mn',      'ms': 'google.ms',      'mt': 'google.com.mt',
        'mu': 'google.mu',      'mv': 'google.mv',      'mw': 'google.mw',
        'mx': 'google.com.mx',  'my': 'google.com.my',  'mz': 'google.co.mz',
        'na': 'google.com.na',  'ne': 'google.ne',      'nf': 'google.com.nf',
        'ng': 'google.com.ng',  'ni': 'google.com.ni',  'nl': 'google.nl',
        'no': 'google.no',      'np': 'google.com.np',  'nr': 'google.nr',
        'nu': 'google.nu',      'nz': 'google.co.nz',   'om': 'google.com.om',
        'pa': 'google.com.pa',  'pe': 'google.com.pe',  'pg': 'google.com.pg',
        'ph': 'google.com.ph',  'pk': 'google.com.pk',  'pl': 'google.pl',
        'pn': 'google.co.pn',   'pr': 'google.com.pr',  'ps': 'google.ps',
        'pt': 'google.pt',      'py': 'google.com.py',  'qa': 'google.com.qa',
        'ro': 'google.ro',      'rs': 'google.rs',      'ru': 'google.ru',
        'rw': 'google.rw',      'sa': 'google.com.sa',  'sb': 'google.com.sb',
        'sc': 'google.sc',      'se': 'google.se',      'sg': 'google.com.sg',
        'sh': 'google.sh',      'si': 'google.si',      'sk': 'google.sk',
        'sl': 'google.com.sl',  'sm': 'google.sm',      'sn': 'google.sn',
        'so': 'google.so',      'sr': 'google.sr',      'st': 'google.st',
        'sv': 'google.com.sv',  'td': 'google.td',      'tg': 'google.tg',
        'th': 'google.co.th',   'tj': 'google.com.tj',  'tk': 'google.tk',
        'tl': 'google.tl',      'tm': 'google.tm',      'tn': 'google.tn',
        'to': 'google.to',      'tr': 'google.com.tr',  'tt': 'google.tt',
        'tw': 'google.com.tw',  'tz': 'google.co.tz',   'ua': 'google.com.ua',
        'ug': 'google.co.ug',   'uk': 'google.co.uk',   'uy': 'google.com.uy',
        'uz': 'google.co.uz',   'vc': 'google.com.vc',  've': 'google.co.ve',
        'vg': 'google.vg',      'vi': 'google.co.vi',   'vn': 'google.com.vn',
        'vu': 'google.vu',      'ws': 'google.ws',      'za': 'google.co.za',
        'zm': 'google.co.zm',   'zw': 'google.co.zw',
    }
 1902 
 1903     @property
 1904     def netloc(self):
 1905         """The hostname."""
 1906         try:
 1907             return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld]
 1908         except KeyError:
 1909             return 'www.google.com'
 1910 
 1911     @property
 1912     def query(self):
 1913         """The query string."""
 1914         qd = {}
 1915         qd.update(self._query_dict)
 1916         if self._num != 10:  # Skip sending the default
 1917             qd['num'] = self._num
 1918         if self._start:  # Skip sending the default
 1919             qd['start'] = self._start
 1920 
 1921         # Construct the q query
 1922         q = ''
 1923         keywords = self._keywords
 1924         sites = self._sites
 1925         if keywords:
 1926             if isinstance(keywords, list):
 1927                 q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
 1928             else:
 1929                 q += urllib.parse.quote_plus(keywords)
 1930         if sites:
 1931             q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites)
 1932         qd['q'] = q
 1933 
 1934         return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys()))
 1935 
 1936 
class GoogleConnectionError(Exception):
    """Raised when connecting to or fetching from Google fails."""
    pass
 1939 
 1940 
 1941 class GoogleConnection(object):
 1942     """
 1943     This class facilitates connecting to and fetching from Google.
 1944 
 1945     Parameters
 1946     ----------
 1947     See http.client.HTTPSConnection for documentation of the
 1948     parameters.
 1949 
 1950     Raises
 1951     ------
 1952     GoogleConnectionError
 1953 
 1954     Attributes
 1955     ----------
 1956     host : str
 1957         The currently connected host. Read-only property. Use
 1958         `new_connection` to change host.
 1959 
 1960     Methods
 1961     -------
 1962     new_connection(host=None, port=None, timeout=45)
 1963     renew_connection(timeout=45)
 1964     fetch_page(url)
 1965     close()
 1966 
 1967     """
 1968 
 1969     def __init__(self, host, port=None, timeout=45, proxy=None, notweak=False):
 1970         self._host = None
 1971         self._port = None
 1972         self._proxy = proxy
 1973         self._notweak = notweak
 1974         self._conn = None
 1975         self.new_connection(host, port=port, timeout=timeout)
 1976         self.cookie = ''
 1977 
 1978     @property
 1979     def host(self):
 1980         """The host currently connected to."""
 1981         return self._host
 1982 
 1983     def new_connection(self, host=None, port=None, timeout=45):
 1984         """Close the current connection (if any) and establish a new one.
 1985 
 1986         Parameters
 1987         ----------
 1988         See http.client.HTTPSConnection for documentation of the
 1989         parameters. Renew the connection (i.e., reuse the current host
 1990         and port) if host is None or empty.
 1991 
 1992         Raises
 1993         ------
 1994         GoogleConnectionError
 1995 
 1996         """
 1997         if self._conn:
 1998             self._conn.close()
 1999 
 2000         if not host:
 2001             host = self._host
 2002             port = self._port
 2003         self._host = host
 2004         self._port = port
 2005         host_display = host + (':%d' % port if port else '')
 2006 
 2007         proxy = self._proxy
 2008 
 2009         if proxy:
 2010             proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)
 2011 
 2012             logger.debug('Connecting to proxy server %s', proxy_host_port)
 2013             self._conn = HardenedHTTPSConnection(proxy_host_port, timeout=timeout)
 2014 
 2015             logger.debug('Tunnelling to host %s' % host_display)
 2016             connect_headers = {}
 2017             if proxy_user_passwd:
 2018                 connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode(
 2019                     proxy_user_passwd.encode('utf-8')
 2020                 ).decode('utf-8')
 2021             self._conn.set_tunnel(host, port=port, headers=connect_headers)
 2022 
 2023             try:
 2024                 self._conn.connect(self._notweak)
 2025             except Exception as e:
 2026                 msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
 2027                 raise GoogleConnectionError(msg)
 2028         else:
 2029             logger.debug('Connecting to new host %s', host_display)
 2030             self._conn = HardenedHTTPSConnection(host, port=port, timeout=timeout)
 2031             try:
 2032                 self._conn.connect(self._notweak)
 2033             except Exception as e:
 2034                 msg = 'Failed to connect to %s: %s.' % (host_display, e)
 2035                 raise GoogleConnectionError(msg)
 2036 
 2037     def renew_connection(self, timeout=45):
 2038         """Renew current connection.
 2039 
 2040         Equivalent to ``new_connection(timeout=timeout)``.
 2041 
 2042         """
 2043         self.new_connection(timeout=timeout)
 2044 
 2045     def fetch_page(self, url):
 2046         """Fetch a URL.
 2047 
 2048         Allows one reconnection and multiple redirections before failing
 2049         and raising GoogleConnectionError.
 2050 
 2051         Parameters
 2052         ----------
 2053         url : str
 2054             The URL to fetch, relative to the host.
 2055 
 2056         Raises
 2057         ------
 2058         GoogleConnectionError
 2059             When not getting HTTP 200 even after the allowed one
 2060             reconnection and/or one redirection, or when Google is
 2061             blocking query due to unusual activity.
 2062 
 2063         Returns
 2064         -------
 2065         str
 2066             Response payload, gunzipped (if applicable) and decoded (in UTF-8).
 2067 
 2068         """
 2069         try:
 2070             self._raw_get(url)
 2071         except (http.client.HTTPException, OSError) as e:
 2072             logger.debug('Got exception: %s.', e)
 2073             logger.debug('Attempting to reconnect...')
 2074             self.renew_connection()
 2075             try:
 2076                 self._raw_get(url)
 2077             except http.client.HTTPException as e:
 2078                 logger.debug('Got exception: %s.', e)
 2079                 raise GoogleConnectionError("Failed to get '%s'." % url)
 2080 
 2081         resp = self._resp
 2082         redirect_counter = 0
 2083         while resp.status != 200 and redirect_counter < 3:
 2084             if resp.status in {301, 302, 303, 307, 308}:
 2085                 redirection_url = resp.getheader('location', '')
 2086                 if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url:
 2087                     msg = textwrap.dedent("""\
 2088                     Connection blocked due to unusual activity.
 2089                     THIS IS NOT A BUG, please do NOT report it as a bug unless you have specific
 2090                     information that may lead to the development of a workaround.
 2091                     You IP address is temporarily or permanently blocked by Google and requires
 2092                     reCAPTCHA-solving to use the service, which googler is not capable of.
 2093                     Possible causes include issuing too many queries in a short time frame, or
 2094                     operating from a shared / low reputation IP with a history of abuse.
 2095                     Please do NOT use googler for automated scraping.""")
 2096                     msg = " ".join(msg.splitlines())
 2097                     raise GoogleConnectionError(msg)
 2098                 self._redirect(redirection_url)
 2099                 resp = self._resp
 2100                 redirect_counter += 1
 2101             else:
 2102                 break
 2103 
 2104         if resp.status != 200:
 2105             raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason))
 2106 
 2107         payload = resp.read()
 2108         try:
 2109             return gzip.decompress(payload).decode('utf-8')
 2110         except OSError:
 2111             # Not gzipped
 2112             return payload.decode('utf-8')
 2113 
 2114     def _redirect(self, url):
 2115         """Redirect to and fetch a new URL.
 2116 
 2117         Like `_raw_get`, the response is stored in ``self._resp``. A new
 2118         connection is made if redirecting to a different host.
 2119 
 2120         Parameters
 2121         ----------
 2122         url : str
 2123             If absolute and points to a different host, make a new
 2124             connection.
 2125 
 2126         Raises
 2127         ------
 2128         GoogleConnectionError
 2129 
 2130         """
 2131         logger.debug('Redirecting to URL %s', url)
 2132         segments = urllib.parse.urlparse(url)
 2133 
 2134         host = segments.netloc
 2135         if host != self._host:
 2136             self.new_connection(host)
 2137 
 2138         relurl = urllib.parse.urlunparse(('', '') + segments[2:])
 2139         try:
 2140             self._raw_get(relurl)
 2141         except http.client.HTTPException as e:
 2142             logger.debug('Got exception: %s.', e)
 2143             raise GoogleConnectionError("Failed to get '%s'." % url)
 2144 
 2145     def _raw_get(self, url):
 2146         """Make a raw HTTP GET request.
 2147 
 2148         No status check (which implies no redirection). Response can be
 2149         accessed from ``self._resp``.
 2150 
 2151         Parameters
 2152         ----------
 2153         url : str
 2154             URL relative to the host, used in the GET request.
 2155 
 2156         Raises
 2157         ------
 2158         http.client.HTTPException
 2159 
 2160         """
 2161         logger.debug('Fetching URL %s', url)
 2162         self._conn.request('GET', url, None, {
 2163             'Accept': 'text/html',
 2164             'Accept-Encoding': 'gzip',
 2165             'User-Agent': USER_AGENT,
 2166             'Cookie': self.cookie,
 2167             'Connection': 'keep-alive',
 2168             'DNT': '1',
 2169         })
 2170         self._resp = self._conn.getresponse()
 2171         if self.cookie == '':
 2172             complete_cookie = self._resp.getheader('Set-Cookie')
 2173             # Cookie won't be available if already blocked
 2174             if complete_cookie is not None:
 2175                 self.cookie = complete_cookie[:complete_cookie.find(';')]
 2176                 logger.debug('Cookie: %s' % self.cookie)
 2177 
 2178     def close(self):
 2179         """Close the connection (if one is active)."""
 2180         if self._conn:
 2181             self._conn.close()
 2182 
 2183 
class GoogleParser(object):
    """Parse a Google result page (HTML) into a list of `Result` objects.

    Results are accumulated in ``self.results``; autocorrection and
    result-filtering hints are exposed via ``autocorrected``,
    ``showing_results_for`` and ``filtered``.
    """

    def __init__(self, html, *, news=False, videos=False):
        self.news = news
        self.videos = videos
        self.autocorrected = False
        self.showing_results_for = None
        self.filtered = False
        self.results = []
        self.parse(html)

    def parse(self, html):
        # Build a DOM tree from the raw HTML (parse_html is defined
        # elsewhere in this file).
        tree = parse_html(html)

        # In debugger mode, drop into an interactive shell for DOM inspection.
        if debugger:
            printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m')
            printerr('')
            try:
                import IPython
                IPython.embed()
            except ImportError:
                import pdb
                pdb.set_trace()

        index = 0
        # Regular results live in div.g containers.
        for div_g in tree.select_all('div.g'):
            if div_g.select('.hp-xpdbox'):
                # Skip smart cards.
                continue
            try:
                # Newer layout: title in div.r h3, link on the parent <a>.
                h3 = div_g.select('div.r h3')
                if h3:
                    title = h3.text
                    url = self.unwrap_link(h3.parent.attr('href'))
                else:
                    # Older layout: h3.r wraps the <a> directly.
                    h3 = div_g.select('h3.r')
                    a = h3.select('a')
                    title = a.text
                    mime = div_g.select('.mime')
                    if mime:
                        # Prepend the MIME label (e.g. '[PDF]') to the title.
                        title = mime.text + ' ' + title
                    url = self.unwrap_link(a.attr('href'))
                matched_keywords = []
                abstract = ''
                # Walk the abstract container, recording bolded matches
                # with their character offsets for later highlighting.
                for childnode in div_g.select('.st').children:
                    if 'f' in childnode.classes:
                        # .f is handled as metadata instead.
                        continue
                    if childnode.tag == 'b' and childnode.text != '...':
                        matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)})
                    abstract = abstract + childnode.text.replace('\n', '')
                try:
                    # Normalize metadata: strip LTR marks, unify separators.
                    metadata = div_g.select('.f').text
                    metadata = metadata.replace('\u200e', '').replace(' - ', ', ').strip().rstrip(',')
                except AttributeError:
                    metadata = None
            except (AttributeError, ValueError):
                # Malformed result block; skip it entirely.
                continue
            # Collect sitelinks (sub-results rendered in a table).
            sitelinks = []
            for td in div_g.select_all('td'):
                try:
                    a = td.select('a')
                    sl_title = a.text
                    sl_url = self.unwrap_link(a.attr('href'))
                    sl_abstract = td.select('div.s.st').text
                    sitelinks.append(Sitelink(sl_title, sl_url, sl_abstract))
                except (AttributeError, ValueError):
                    continue
            index += 1
            self.results.append(Result(index, title, url, abstract,
                                       metadata=metadata, sitelinks=sitelinks, matches=matched_keywords))

        # Fallback layout: some pages render results as g-card elements.
        if not self.results:
            for card in tree.select_all('g-card'):
                a = card.select('a[href]')
                if not a:
                    continue
                url = self.unwrap_link(a.attr('href'))
                text_nodes = []
                for node in a.descendants():
                    if isinstance(node, TextNode) and node.strip():
                        text_nodes.append(node.text)
                # Expect exactly publisher/title/abstract/time text nodes.
                if len(text_nodes) != 4:
                    continue
                publisher, title, abstract, publishing_time = text_nodes
                metadata = '%s, %s' % (publisher, publishing_time)
                index += 1
                self.results.append(Result(index, title, url, abstract, metadata=metadata))

        # Showing results for ...
        # Search instead for ...
        spell_orig = tree.select("span.spell_orig")
        if spell_orig:
            showing_results_for_link = next(
                filter(lambda el: el.tag == "a", spell_orig.previous_siblings()), None
            )
            if showing_results_for_link:
                self.autocorrected = True
                self.showing_results_for = showing_results_for_link.text

        # No results found for ...
        # Results for ...:
        alt_query_infobox = tree.select('#topstuff')
        if alt_query_infobox:
            bolds = alt_query_infobox.select_all('div b')
            if len(bolds) == 2:
                self.showing_results_for = bolds[1].text

        # In order to show you the most relevant results, we have
        # omitted some entries very similar to the N already displayed.
        # ...
        self.filtered = tree.select('p#ofr') is not None

    # Unwraps /url?q=http://...&sa=...
    # TODO: don't unwrap if URL isn't in this form.
    @staticmethod
    def unwrap_link(link):
        """Extract the target URL from Google's /url?q=... redirect link.

        Returns `link` unchanged when there is no q parameter; raises
        ValueError when q is present but not an absolute URL (i.e., an
        internal Google link that cannot be unwrapped).
        """
        qs = urllib.parse.urlparse(link).query
        try:
            url = urllib.parse.parse_qs(qs)['q'][0]
        except KeyError:
            return link
        else:
            if "://" in url:
                return url
            else:
                # Google's internal services link, e.g.,
                # /search?q=google&..., which cannot be unwrapped into
                # an actual URL.
                raise ValueError(link)
 2314 
 2315 
 2316 class Sitelink(object):
 2317     """Container for a sitelink."""
 2318 
 2319     def __init__(self, title, url, abstract):
 2320         self.title = title
 2321         self.url = url
 2322         self.abstract = abstract
 2323         self.index = ''
 2324 
 2325 
 2326 Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset')
 2327 
 2328 
 2329 class Result(object):
 2330     """
 2331     Container for one search result, with output helpers.
 2332 
 2333     Parameters
 2334     ----------
 2335     index : int or str
 2336     title : str
 2337     url : str
 2338     abstract : str
 2339     metadata : str, optional
 2340         Only applicable to Google News results, with publisher name and
 2341         publishing time.
 2342     sitelinks : list, optional
 2343         List of ``SiteLink`` objects.
 2344 
 2345     Attributes
 2346     ----------
 2347     index : str
 2348     title : str
 2349     url : str
 2350     abstract : str
 2351     metadata : str or None
 2352     sitelinks : list
 2353     matches : list
 2354 
 2355     Class Variables
 2356     ---------------
 2357     colors : str
 2358 
 2359     Methods
 2360     -------
 2361     print()
 2362     jsonizable_object()
 2363     urltable()
 2364 
 2365     """
 2366 
    # Class variables
    # colors: a Colors namedtuple used by the _print_* helpers, or None
    # to disable colored output.
    colors = None
    # urlexpand: when False, URLs are collapsed to '[netloc]' on output.
    urlexpand = True
 2370 
 2371     def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None):
 2372         index = str(index)
 2373         self.index = index
 2374         self.title = title
 2375         self.url = url
 2376         self.abstract = abstract
 2377         self.metadata = metadata
 2378         self.sitelinks = [] if sitelinks is None else sitelinks
 2379         self.matches = [] if matches is None else matches
 2380 
 2381         self._urltable = {index: url}
 2382         subindex = 'a'
 2383         for sitelink in self.sitelinks:
 2384             fullindex = index + subindex
 2385             sitelink.index = fullindex
 2386             self._urltable[fullindex] = sitelink.url
 2387             subindex = chr(ord(subindex) + 1)
 2388 
    def _print_title_and_url(self, index, title, url, indent=0):
        """Print one index/title/URL group, honoring color settings.

        When `urlexpand` is False the URL is collapsed to '[netloc]' and
        shown inline after the title instead of on its own line.
        """
        colors = self.colors

        if not self.urlexpand:
            url = '[' + urllib.parse.urlparse(url).netloc + ']'

        if colors:
            # Adjust index to print result index clearly
            print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
            if not self.urlexpand:
                print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
            else:
                print(' ' + colors.title + title + colors.reset)
                print(' ' * (indent + 5) + colors.url + url + colors.reset)
        else:
            if self.urlexpand:
                print(' %s%-3s %s' % (' ' * indent, index + '.', title))
                print(' %s%s' % (' ' * (indent + 4), url))
            else:
                print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))
 2409 
    def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
        """Print the metadata line (if any) followed by the wrapped abstract.

        Parameters
        ----------
        abstract : str
            Abstract text, wrapped to the terminal width before printing.
        metadata : str, optional
            Single line printed above the abstract (e.g. news source/date).
        matches : list, optional
            Dicts with 'offset' and 'phrase' keys; each phrase is wrapped
            in ANSI bold escape sequences when colors are enabled.
        indent : int, optional
            Extra leading indentation (used for sitelink entries).

        """
        colors = self.colors
        try:
            columns, _ = os.get_terminal_size()
        except OSError:
            # Not attached to a terminal (e.g. output piped); signal
            # "width unknown" so no wrapping happens below.
            columns = 0

        if metadata:
            if colors:
                print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
            else:
                print(' ' * (indent + 5) + metadata)

        # Wrap to the remaining terminal width; if the terminal is too
        # narrow or the width is unknown, keep the abstract on one line.
        fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
        wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
        if colors:
            # Highlight matches.
            for match in matches or []:
                offset = match['offset']
                span = len(match['phrase'])
                # Bold on at the phrase start, bold off right after it;
                # TrackedTextwrap keeps offsets valid across wrapping.
                wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
                wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)

        if colors:
            print(colors.abstract, end='')
        for line in wrapped_abstract.lines:
            print('%s%s' % (' ' * (indent + 5), line))
        if colors:
            print(colors.reset, end='')
        print('')
 2440 
 2441     def print(self):
 2442         """Print the result entry."""
 2443         self._print_title_and_url(self.index, self.title, self.url)
 2444         self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
 2445 
 2446         for sitelink in self.sitelinks:
 2447             self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
 2448             self._print_metadata_and_abstract(sitelink.abstract, indent=4)
 2449 
 2450     def jsonizable_object(self):
 2451         """Return a JSON-serializable dict representing the result entry."""
 2452         obj = {
 2453             'title': self.title,
 2454             'url': self.url,
 2455             'abstract': self.abstract
 2456         }
 2457         if self.metadata:
 2458             obj['metadata'] = self.metadata
 2459         if self.sitelinks:
 2460             obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
 2461         if self.matches:
 2462             obj['matches'] = self.matches
 2463         return obj
 2464 
 2465     def urltable(self):
 2466         """Return a index-to-URL table for the current result.
 2467 
 2468         Normally, the table contains only a single entry, but when the result
 2469         contains sitelinks, all sitelinks are included in this table.
 2470 
 2471         Returns
 2472         -------
 2473         dict
 2474             A dict mapping indices (strs) to URLs (also strs). Indices of
 2475             sitelinks are the original index appended by lowercase letters a,
 2476             b, c, etc.
 2477 
 2478         """
 2479         return self._urltable
 2480 
 2481 
class GooglerCmdException(Exception):
    # Base class for exceptions raised by GooglerCmd operations.
    pass
 2484 
 2485 
class NoKeywordsException(GooglerCmdException):
    # Raised when a command that requires search keywords runs before
    # any query has been initiated (see the require_keywords decorator).
    pass
 2488 
 2489 
 2490 def require_keywords(method):
 2491     # Require keywords to be set before we run a GooglerCmd method. If
 2492     # no keywords have been set, raise a NoKeywordsException.
 2493     @functools.wraps(method)
 2494     def enforced_method(self, *args, **kwargs):
 2495         if not self.keywords:
 2496             raise NoKeywordsException('No keywords.')
 2497         method(self, *args, **kwargs)
 2498 
 2499     return enforced_method
 2500 
 2501 
 2502 def no_argument(method):
 2503     # Normalize a do_* method of GooglerCmd that takes no argument to
 2504     # one that takes an arg, but issue a warning when an nonempty
 2505     # argument is given.
 2506     @functools.wraps(method)
 2507     def enforced_method(self, arg):
 2508         if arg:
 2509             method_name = arg.__name__
 2510             command_name = method_name[3:] if method_name.startswith('do_') else method_name
 2511             logger.warning("Argument to the '%s' command ignored.", command_name)
 2512         method(self)
 2513 
 2514     return enforced_method
 2515 
 2516 
 2517 class GooglerCmd(object):
 2518     """
 2519     Command line interpreter and executor class for googler.
 2520 
 2521     Inspired by PSL cmd.Cmd.
 2522 
 2523     Parameters
 2524     ----------
 2525     opts : argparse.Namespace
 2526         Options and/or arguments.
 2527 
 2528     Attributes
 2529     ----------
 2530     options : argparse.Namespace
 2531         Options that are currently in effect. Read-only attribute.
 2532     keywords : str or list or strs
 2533         Current keywords. Read-only attribute
 2534 
 2535     Methods
 2536     -------
 2537     fetch()
 2538     display_results(prelude='\n', json_output=False)
 2539     fetch_and_display(prelude='\n', json_output=False, interactive=True)
 2540     read_next_command()
 2541     help()
 2542     cmdloop()
 2543     """
 2544 
 2545     # Class variables
 2546     colors = None
 2547     re_url_index = re.compile(r"\d+(a-z)?")
 2548 
 2549     def __init__(self, opts):
 2550         super().__init__()
 2551 
 2552         self._opts = opts
 2553 
 2554         self._google_url = GoogleUrl(opts)
 2555         proxy = opts.proxy if hasattr(opts, 'proxy') else None
 2556         self._conn = GoogleConnection(self._google_url.hostname, proxy=proxy,
 2557                                       notweak=opts.notweak)
 2558         atexit.register(self._conn.close)
 2559 
 2560         self.results = []
 2561         self._autocorrected = None
 2562         self._showing_results_for = None
 2563         self._results_filtered = False
 2564         self._urltable = {}
 2565 
 2566         self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False
 2567 
 2568         self.no_results_instructions_shown = False
 2569 
 2570     @property
 2571     def options(self):
 2572         """Current options."""
 2573         return self._opts
 2574 
 2575     @property
 2576     def keywords(self):
 2577         """Current keywords."""
 2578         return self._google_url.keywords
 2579 
 2580     @require_keywords
 2581     def fetch(self):
 2582         """Fetch a page and parse for results.
 2583 
 2584         Results are stored in ``self.results``.
 2585 
 2586         Raises
 2587         ------
 2588         GoogleConnectionError
 2589 
 2590         See Also
 2591         --------
 2592         fetch_and_display
 2593 
 2594         """
 2595         # This method also sets self._results_filtered and
 2596         # self._urltable.
 2597         page = self._conn.fetch_page(self._google_url.relative())
 2598 
 2599         if logger.isEnabledFor(logging.DEBUG):
 2600             import tempfile
 2601             fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
 2602             os.close(fd)
 2603             with open(tmpfile, 'w', encoding='utf-8') as fp:
 2604                 fp.write(page)
 2605             logger.debug("Response body written to '%s'.", tmpfile)
 2606 
 2607         parser = GoogleParser(page, news=self._google_url.news, videos=self._google_url.videos)
 2608 
 2609         self.results = parser.results
 2610         self._autocorrected = parser.autocorrected
 2611         self._showing_results_for = parser.showing_results_for
 2612         self._results_filtered = parser.filtered
 2613         self._urltable = {}
 2614         for r in self.results:
 2615             self._urltable.update(r.urltable())
 2616 
 2617     def warn_no_results(self):
 2618         printerr('No results.')
 2619         if not self.no_results_instructions_shown:
 2620             printerr('If you believe this is a bug, please review '
 2621                      'https://git.io/googler-no-results before submitting a bug report.')
 2622             self.no_results_instructions_shown = True
 2623 
 2624     @require_keywords
 2625     def display_results(self, prelude='\n', json_output=False):
 2626         """Display results stored in ``self.results``.
 2627 
 2628         Parameters
 2629         ----------
 2630         See `fetch_and_display`.
 2631 
 2632         """
 2633         if json_output:
 2634             # JSON output
 2635             import json
 2636             results_object = [r.jsonizable_object() for r in self.results]
 2637             print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))
 2638         else:
 2639             # Regular output
 2640             if not self.results:
 2641                 self.warn_no_results()
 2642             else:
 2643                 sys.stderr.write(prelude)
 2644                 for r in self.results:
 2645                     r.print()
 2646 
 2647     @require_keywords
 2648     def showing_results_for_alert(self, interactive=True):
 2649         colors = self.colors
 2650         if self._showing_results_for:
 2651             if colors:
 2652                 # Underline the query
 2653                 actual_query = '\x1b[4m' + self._showing_results_for + '\x1b[24m'
 2654             else:
 2655                 actual_query = self._showing_results_for
 2656             if self._autocorrected:
 2657                 if interactive:
 2658                     info = 'Showing results for %s; enter "x" for an exact search.' % actual_query
 2659                 else:
 2660                     info = 'Showing results for %s; use -x, --exact for an exact search.' % actual_query
 2661             else:
 2662                 info = 'No results found; showing results for %s.' % actual_query
 2663             if interactive:
 2664                 printerr('')
 2665             if colors:
 2666                 printerr(colors.prompt + info + colors.reset)
 2667             else:
 2668                 printerr('** ' + info)
 2669 
 2670     @require_keywords
 2671     def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
 2672         """Fetch a page and display results.
 2673 
 2674         Results are stored in ``self.results``.
 2675 
 2676         Parameters
 2677         ----------
 2678         prelude : str, optional
 2679             A string that is written to stderr before showing actual results,
 2680             usually serving as a separator. Default is an empty line.
 2681         json_output : bool, optional
 2682             Whether to dump results in JSON format. Default is False.
 2683         interactive : bool, optional
 2684             Whether to show contextual instructions, when e.g. Google
 2685             has filtered the results. Default is True.
 2686 
 2687         Raises
 2688         ------
 2689         GoogleConnectionError
 2690 
 2691         See Also
 2692         --------
 2693         fetch
 2694         display_results
 2695 
 2696         """
 2697         self.fetch()
 2698         self.showing_results_for_alert()
 2699         self.display_results(prelude=prelude, json_output=json_output)
 2700         if self._results_filtered:
 2701             colors = self.colors
 2702             info = 'Enter "unfilter" to show similar results Google omitted.'
 2703             if colors:
 2704                 printerr(colors.prompt + info + colors.reset)
 2705             else:
 2706                 printerr('** ' + info)
 2707             printerr('')
 2708 
 2709     def read_next_command(self):
 2710         """Show omniprompt and read user command line.
 2711 
 2712         Command line is always stripped, and each consecutive group of
 2713         whitespace is replaced with a single space character. If the
 2714         command line is empty after stripping, when ignore it and keep
 2715         reading. Exit with status 0 if we get EOF or an empty line
 2716         (pre-strip, that is, a raw <enter>) twice in a row.
 2717 
 2718         The new command line (non-empty) is stored in ``self.cmd``.
 2719 
 2720         """
 2721         colors = self.colors
 2722         message = 'googler (? for help)'
 2723         prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ')
 2724         enter_count = 0
 2725         while True:
 2726             try:
 2727                 cmd = input(prompt)
 2728             except EOFError:
 2729                 sys.exit(0)
 2730 
 2731             if not cmd:
 2732                 enter_count += 1
 2733                 if enter_count == 2:
 2734                     # Double <enter>
 2735                     sys.exit(0)
 2736             else:
 2737                 enter_count = 0
 2738 
 2739             cmd = ' '.join(cmd.split())
 2740             if cmd:
 2741                 self.cmd = cmd
 2742                 break
 2743 
 2744     @staticmethod
 2745     def help():
 2746         GooglerArgumentParser.print_omniprompt_help(sys.stderr)
 2747         printerr('')
 2748 
 2749     @require_keywords
 2750     @no_argument
 2751     def do_first(self):
 2752         try:
 2753             self._google_url.first_page()
 2754         except ValueError as e:
 2755             print(e, file=sys.stderr)
 2756             return
 2757 
 2758         self.fetch_and_display()
 2759 
 2760     def do_google(self, arg):
 2761         # Update keywords and reconstruct URL
 2762         self._opts.keywords = arg
 2763         self._google_url = GoogleUrl(self._opts)
 2764         self.fetch_and_display()
 2765 
 2766     @require_keywords
 2767     @no_argument
 2768     def do_next(self):
 2769         # If > 5 results are being fetched each time,
 2770         # block next when no parsed results in current fetch
 2771         if not self.results and self._google_url._num > 5:
 2772             printerr('No results.')
 2773         else:
 2774             self._google_url.next_page()
 2775             self.fetch_and_display()
 2776 
 2777     @require_keywords
 2778     def do_open(self, *args):
 2779         if not args:
 2780             open_url(self._google_url.full())
 2781             return
 2782 
 2783         for nav in args:
 2784             if nav == 'a':
 2785                 for key, value in sorted(self._urltable.items()):
 2786                     open_url(self._urltable[key])
 2787             elif nav in self._urltable:
 2788                 open_url(self._urltable[nav])
 2789             elif '-' in nav:
 2790                 try:
 2791                     vals = [int(x) for x in nav.split('-')]
 2792                     if (len(vals) != 2):
 2793                         printerr('Invalid range %s.' % nav)
 2794                         continue
 2795 
 2796                     if vals[0] > vals[1]:
 2797                         vals[0], vals[1] = vals[1], vals[0]
 2798 
 2799                     for _id in range(vals[0], vals[1] + 1):
 2800                         if str(_id) in self._urltable:
 2801                             open_url(self._urltable[str(_id)])
 2802                         else:
 2803                             printerr('Invalid index %s.' % _id)
 2804                 except ValueError:
 2805                     printerr('Invalid range %s.' % nav)
 2806             else:
 2807                 printerr('Invalid index %s.' % nav)
 2808 
 2809     @require_keywords
 2810     @no_argument
 2811     def do_previous(self):
 2812         try:
 2813             self._google_url.prev_page()
 2814         except ValueError as e:
 2815             print(e, file=sys.stderr)
 2816             return
 2817 
 2818         self.fetch_and_display()
 2819 
 2820     @require_keywords
 2821     @no_argument
 2822     def do_exact(self):
 2823         # Reset start to 0 when exact is applied.
 2824         self._google_url.update(start=0, exact=True)
 2825         self.fetch_and_display()
 2826 
 2827     @require_keywords
 2828     @no_argument
 2829     def do_unfilter(self):
 2830         # Reset start to 0 when unfilter is applied.
 2831         self._google_url.update(start=0)
 2832         self._google_url.set_queries(filter=0)
 2833         self.fetch_and_display()
 2834 
 2835     def copy_url(self, idx):
 2836         try:
 2837             try:
 2838                 content = self._urltable[idx].encode('utf-8')
 2839             except KeyError:
 2840                 printerr('Invalid index.')
 2841                 return
 2842 
 2843             # try copying the url to clipboard using native utilities
 2844             copier_params = []
 2845             if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
 2846                 if shutil.which('xsel') is not None:
 2847                     copier_params = ['xsel', '-b', '-i']
 2848                 elif shutil.which('xclip') is not None:
 2849                     copier_params = ['xclip', '-selection', 'clipboard']
 2850                 elif shutil.which('termux-clipboard-set') is not None:
 2851                     copier_params = ['termux-clipboard-set']
 2852             elif sys.platform == 'darwin':
 2853                 copier_params = ['pbcopy']
 2854             elif sys.platform == 'win32':
 2855                 copier_params = ['clip']
 2856 
 2857             if copier_params:
 2858                 Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL).communicate(content)
 2859                 return
 2860 
 2861             # If native clipboard utilities are absent, try to use terminal multiplexers
 2862             # tmux
 2863             if os.getenv('TMUX_PANE'):
 2864                 copier_params = ['tmux', 'set-buffer']
 2865                 Popen(copier_params + [content], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
 2866                 return
 2867 
 2868             # GNU Screen paste buffer
 2869             if os.getenv('STY'):
 2870                 import tempfile
 2871                 copier_params = ['screen', '-X', 'readbuf', '-e', 'utf8']
 2872                 tmpfd, tmppath = tempfile.mkstemp()
 2873                 try:
 2874                     with os.fdopen(tmpfd, 'wb') as fp:
 2875                         fp.write(content)
 2876                     copier_params.append(tmppath)
 2877                     Popen(copier_params, stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
 2878                 finally:
 2879                     os.unlink(tmppath)
 2880                 return
 2881 
 2882             printerr('failed to locate suitable clipboard utility')
 2883         except Exception:
 2884             raise NoKeywordsException
 2885 
 2886     def cmdloop(self):
 2887         """Run REPL."""
 2888         if self.keywords:
 2889             self.fetch_and_display()
 2890         else:
 2891             printerr('Please initiate a query.')
 2892 
 2893         while True:
 2894             self.read_next_command()
 2895             # TODO: Automatic dispatcher
 2896             #
 2897             # We can't write a dispatcher for now because that could
 2898             # change behaviour of the prompt. However, we have already
 2899             # laid a lot of ground work for the dispatcher, e.g., the
 2900             # `no_argument' decorator.
 2901             try:
 2902                 cmd = self.cmd
 2903                 if cmd == 'f':
 2904                     self.do_first('')
 2905                 elif cmd.startswith('g '):
 2906                     self.do_google(cmd[2:])
 2907                 elif cmd == 'n':
 2908                     self.do_next('')
 2909                 elif cmd == 'o':
 2910                     self.do_open()
 2911                 elif cmd.startswith('o '):
 2912                     self.do_open(*cmd[2:].split())
 2913                 elif cmd.startswith('O '):
 2914                     open_url.override_text_browser = True
 2915                     self.do_open(*cmd[2:].split())
 2916                     open_url.override_text_browser = False
 2917                 elif cmd == 'p':
 2918                     self.do_previous('')
 2919                 elif cmd == 'q':
 2920                     break
 2921                 elif cmd == 'x':
 2922                     self.do_exact('')
 2923                 elif cmd == 'unfilter':
 2924                     self.do_unfilter('')
 2925                 elif cmd == '?':
 2926                     self.help()
 2927                 elif cmd in self._urltable:
 2928                     open_url(self._urltable[cmd])
 2929                 elif self.keywords and cmd.isdigit() and int(cmd) < 100:
 2930                     printerr('Index out of bound. To search for the number, use g.')
 2931                 elif cmd == 'u':
 2932                     Result.urlexpand = not Result.urlexpand
 2933                     self.display_results()
 2934                 elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]):
 2935                     self.copy_url(cmd[2:])
 2936                 else:
 2937                     self.do_google(cmd)
 2938             except NoKeywordsException:
 2939                 printerr('Initiate a query first.')
 2940 
 2941 
 2942 class GooglerArgumentParser(argparse.ArgumentParser):
 2943     """Custom argument parser for googler."""
 2944 
 2945     # Print omniprompt help
 2946     @staticmethod
 2947     def print_omniprompt_help(file=None):
 2948         file = sys.stderr if file is None else file
 2949         file.write(textwrap.dedent("""
 2950         omniprompt keys:
 2951           n, p                  fetch the next or previous set of search results
 2952           index                 open the result corresponding to index in browser
 2953           f                     jump to the first page
 2954           o [index|range|a ...] open space-separated result indices, numeric ranges
 2955                                 (sitelinks unsupported in ranges), or all, in browser
 2956                                 open the current search in browser, if no arguments
 2957           O [index|range|a ...] like key 'o', but try to open in a GUI browser
 2958           g keywords            new Google search for 'keywords' with original options
 2959                                 should be used to search omniprompt keys and indices
 2960           c index               copy url to clipboard
 2961           u                     toggle url expansion
 2962           q, ^D, double Enter   exit googler
 2963           ?                     show omniprompt help
 2964           *                     other inputs issue a new search with original options
 2965         """))
 2966 
 2967     # Print information on googler
 2968     @staticmethod
 2969     def print_general_info(file=None):
 2970         file = sys.stderr if file is None else file
 2971         file.write(textwrap.dedent("""
 2972         Version %s
 2973         Copyright © 2008 Henri Hakkinen
 2974         Copyright © 2015-2020 Arun Prakash Jana <engineerarun@gmail.com>
 2975         Zhiming Wang <zmwangx@gmail.com>
 2976         License: GPLv3
 2977         Webpage: https://github.com/jarun/googler
 2978         """ % _VERSION_))
 2979 
 2980     # Augment print_help to print more than synopsis and options
 2981     def print_help(self, file=None):
 2982         super().print_help(file)
 2983         self.print_omniprompt_help(file)
 2984         self.print_general_info(file)
 2985 
 2986     # Automatically print full help text on error
 2987     def error(self, message):
 2988         sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
 2989         self.print_help(sys.stderr)
 2990         self.exit(2)
 2991 
 2992     # Type guards
 2993     @staticmethod
 2994     def positive_int(arg):
 2995         """Try to convert a string into a positive integer."""
 2996         try:
 2997             n = int(arg)
 2998             assert n > 0
 2999             return n
 3000         except (ValueError, AssertionError):
 3001             raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
 3002 
 3003     @staticmethod
 3004     def nonnegative_int(arg):
 3005         """Try to convert a string into a nonnegative integer."""
 3006         try:
 3007             n = int(arg)
 3008             assert n >= 0
 3009             return n
 3010         except (ValueError, AssertionError):
 3011             raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
 3012 
 3013     @staticmethod
 3014     def is_duration(arg):
 3015         """Check if a string is a valid duration accepted by Google.
 3016 
 3017         A valid duration is of the form dNUM, where d is a single letter h
 3018         (hour), d (day), w (week), m (month), or y (year), and NUM is a
 3019         non-negative integer.
 3020         """
 3021         try:
 3022             if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0:
 3023                 raise ValueError
 3024         except (TypeError, IndexError, ValueError):
 3025             raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
 3026         return arg
 3027 
 3028     @staticmethod
 3029     def is_date(arg):
 3030         """Check if a string is a valid date/month/year accepted by Google."""
 3031         if re.match(r'^(\d+/){0,2}\d+$', arg):
 3032             return arg
 3033         else:
 3034             raise argparse.ArgumentTypeError('%s is not a valid date/month/year; '
 3035                                              'use the American date format with slashes')
 3036 
 3037     @staticmethod
 3038     def is_colorstr(arg):
 3039         """Check if a string is a valid color string."""
 3040         try:
 3041             assert len(arg) == 6
 3042             for c in arg:
 3043                 assert c in COLORMAP
 3044         except AssertionError:
 3045             raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
 3046         return arg
 3047 
 3048 
 3049 # Self-upgrade mechanism
 3050 
 3051 def system_is_windows():
 3052     """Checks if the underlying system is Windows (Cygwin included)."""
 3053     return sys.platform in {'win32', 'cygwin'}
 3054 
 3055 
def download_latest_googler(include_git=False):
    """Download latest googler to a temp file.

    By default, the latest released version is downloaded, but if
    `include_git` is specified, then the latest git master is downloaded
    instead.

    Parameters
    ----------
    include_git : bool, optional
        Download from git master. Default is False.

    Returns
    -------
    (git_ref, path): tuple
         A tuple containing the git reference (either name of the latest
         tag or SHA of the latest commit) and path to the downloaded
         file.

    """
    import urllib.request

    if include_git:
        # Get SHA of latest commit on master
        request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
                                         headers={'Accept': 'application/vnd.github.v3.sha'})
        response = urllib.request.urlopen(request)
        # NOTE(review): urlopen raises HTTPError for most non-2xx
        # responses already, so these status checks are defensive.
        if response.status != 200:
            raise http.client.HTTPException(response.reason)
        git_ref = response.read().decode('utf-8')
    else:
        # Get name of latest tag
        request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
                                         headers={'Accept': 'application/vnd.github.v3+json'})
        response = urllib.request.urlopen(request)
        if response.status != 200:
            raise http.client.HTTPException(response.reason)
        import json
        git_ref = json.loads(response.read().decode('utf-8'))[0]['tag_name']

    # Download googler to a tempfile
    googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
    printerr('Downloading %s' % googler_download_url)
    request = urllib.request.Request(googler_download_url,
                                     headers={'Accept-Encoding': 'gzip'})
    import tempfile
    fd, path = tempfile.mkstemp()
    # Best-effort cleanup at exit; the file may already have been moved
    # into place (e.g. by self_replace), hence the existence check.
    atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
    os.close(fd)
    with open(path, 'wb') as fp:
        with urllib.request.urlopen(request) as response:
            if response.status != 200:
                raise http.client.HTTPException(response.reason)
            payload = response.read()
            try:
                # The server may honor Accept-Encoding and return gzipped data.
                fp.write(gzip.decompress(payload))
            except OSError:
                # Not gzip data (identity encoding); write the payload as is.
                fp.write(payload)
    return git_ref, path
 3115 
 3116 
 3117 def self_replace(path):
 3118     """Replace the current script with a specified file.
 3119 
 3120     Both paths (the specified path and path to the current script) are
 3121     resolved to absolute, symlink-free paths. Upon replacement, the
 3122     owner and mode signatures of the current script are preserved. The
 3123     caller needs to have the necessary permissions.
 3124 
 3125     Replacement won't happen if the specified file is the same
 3126     (content-wise) as the current script.
 3127 
 3128     Parameters
 3129     ----------
 3130     path : str
 3131         Path to the replacement file.
 3132 
 3133     Returns
 3134     -------
 3135     bool
 3136         True if replaced, False if skipped (specified file is the same
 3137         as the current script).
 3138 
 3139     """
 3140     if system_is_windows():
 3141         raise NotImplementedError('Self upgrade not supported on Windows.')
 3142 
 3143     import filecmp
 3144     import shutil
 3145 
 3146     path = os.path.realpath(path)
 3147     self_path = os.path.realpath(__file__)
 3148 
 3149     if filecmp.cmp(path, self_path):
 3150         return False
 3151 
 3152     self_stat = os.stat(self_path)
 3153     os.chown(path, self_stat.st_uid, self_stat.st_gid)
 3154     os.chmod(path, self_stat.st_mode)
 3155 
 3156     shutil.move(path, self_path)
 3157     return True
 3158 
 3159 
 3160 def self_upgrade(include_git=False):
 3161     """Perform in-place self-upgrade.
 3162 
 3163     Parameters
 3164     ----------
 3165     include_git : bool, optional
 3166         See `download_latest_googler`. Default is False.
 3167 
 3168     """
 3169     git_ref, path = download_latest_googler(include_git=include_git)
 3170     if self_replace(path):
 3171         printerr('Upgraded to %s.' % git_ref)
 3172     else:
 3173         printerr('Already up to date.')
 3174 
 3175 
 3176 # Miscellaneous functions
 3177 
 3178 def python_version():
 3179     return '%d.%d.%d' % sys.version_info[:3]
 3180 
 3181 
 3182 def https_proxy_from_environment():
 3183     return os.getenv('https_proxy')
 3184 
 3185 
 3186 def parse_proxy_spec(proxyspec):
 3187     if '://' in proxyspec:
 3188         pos = proxyspec.find('://')
 3189         scheme = proxyspec[:pos]
 3190         proxyspec = proxyspec[pos+3:]
 3191         if scheme.lower() != 'http':
 3192             # Only support HTTP proxies.
 3193             #
 3194             # In particular, we don't support HTTPS proxies since we
 3195             # only speak plain HTTP to the proxy server, so don't give
 3196             # users a false sense of security.
 3197             raise NotImplementedError('Unsupported proxy scheme %s.' % scheme)
 3198 
 3199     if '@' in proxyspec:
 3200         pos = proxyspec.find('@')
 3201         user_passwd = urllib.parse.unquote(proxyspec[:pos])
 3202         # Remove trailing '/' if any
 3203         host_port = proxyspec[pos+1:].rstrip('/')
 3204     else:
 3205         user_passwd = None
 3206         host_port = proxyspec.rstrip('/')
 3207 
 3208     if ':' not in host_port:
 3209         # Use port 1080 as default, following curl.
 3210         host_port += ':1080'
 3211 
 3212     return user_passwd, host_port
 3213 
 3214 
 3215 def set_win_console_mode():
 3216     # VT100 control sequences are supported on Windows 10 Anniversary Update and later.
 3217     # https://docs.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences
 3218     # https://docs.microsoft.com/en-us/windows/console/setconsolemode
 3219     if platform.release() == '10':
 3220         STD_OUTPUT_HANDLE = -11
 3221         STD_ERROR_HANDLE = -12
 3222         ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
 3223         try:
 3224             from ctypes import windll, wintypes, byref
 3225             kernel32 = windll.kernel32
 3226             for nhandle in (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE):
 3227                 handle = kernel32.GetStdHandle(nhandle)
 3228                 old_mode = wintypes.DWORD()
 3229                 if not kernel32.GetConsoleMode(handle, byref(old_mode)):
 3230                     raise RuntimeError('GetConsoleMode failed')
 3231                 new_mode = old_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING
 3232                 if not kernel32.SetConsoleMode(handle, new_mode):
 3233                     raise RuntimeError('SetConsoleMode failed')
 3234             # Note: No need to restore at exit. SetConsoleMode seems to
 3235             # be limited to the calling process.
 3236         except Exception:
 3237             pass
 3238 
 3239 
 3240 # Query autocompleter
 3241 
 3242 # This function is largely experimental and could raise any exception;
 3243 # you should be prepared to catch anything. When it works though, it
 3244 # returns a list of strings the prefix could autocomplete to (however,
 3245 # it is not guaranteed that they start with the specified prefix; for
 3246 # instance, they won't if the specified prefix ends in a punctuation
 3247 # mark.)
 3248 def completer_fetch_completions(prefix):
 3249     import html
 3250     import json
 3251     import re
 3252     import urllib.request
 3253 
 3254     # One can pass the 'hl' query param to specify the language. We
 3255     # ignore that for now.
 3256     api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
 3257                urllib.parse.quote(prefix, safe=''))
 3258     # A timeout of 3 seconds seems to be overly generous already.
 3259     resp = urllib.request.urlopen(api_url, timeout=3)
 3260     charset = resp.headers.get_content_charset()
 3261     logger.debug('Completions charset: %s', charset)
 3262     respobj = json.loads(resp.read().decode(charset))
 3263 
 3264     # The response object, once parsed as JSON, should look like
 3265     #
 3266     # ['git',
 3267     #  [['git<b>hub</b>', 0],
 3268     #   ['git', 0],
 3269     #   ['git<b>lab</b>', 0],
 3270     #   ['git<b> stash</b>', 0]],
 3271     #  {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
 3272     #
 3273     # Note the each result entry need not have two members; e.g., for
 3274     # 'gi', there is an entry ['gi<b>f</b>', 0, [131]].
 3275     HTML_TAG = re.compile(r'<[^>]+>')
 3276     return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
 3277 
 3278 
 3279 def completer_run(prefix):
 3280     if prefix:
 3281         completions = completer_fetch_completions(prefix)
 3282         if completions:
 3283             print('\n'.join(completions))
 3284     sys.exit(0)
 3285 
 3286 
def parse_args(args=None, namespace=None):
    """Parse googler arguments/options.

    Parameters
    ----------
    args : list, optional
        Arguments to parse. Default is ``sys.argv``.
    namespace : argparse.Namespace
        Namespace to write to. Default is a new namespace.

    Returns
    -------
    argparse.Namespace
        Namespace with parsed arguments / options.

    """

    # GOOGLER_COLORS lets users override the default color string.
    colorstr_env = os.getenv('GOOGLER_COLORS')

    argparser = GooglerArgumentParser(description='Google from the command-line.')
    addarg = argparser.add_argument
    addarg('-s', '--start', type=argparser.nonnegative_int, default=0,
           metavar='N', help='start at the Nth result')
    addarg('-n', '--count', dest='num', type=argparser.positive_int,
           default=10, metavar='N', help='show N results (default 10)')
    addarg('-N', '--news', action='store_true',
           help='show results from news section')
    addarg('-V', '--videos', action='store_true',
           help='show results from videos section')
    addarg('-c', '--tld', metavar='TLD',
           help="""country-specific search with top-level domain .TLD, e.g., 'in'
           for India""")
    addarg('-l', '--lang', metavar='LANG', help='display in language LANG')
    addarg('-x', '--exact', action='store_true',
           help='disable automatic spelling correction')
    # --colorize with no argument means 'always' (const); absent means 'auto'.
    addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'],
           const='always', default='auto',
           help="""whether to colorize output; defaults to 'auto', which enables
           color when stdout is a tty device; using --colorize without an argument
           is equivalent to --colorize=always""")
    addarg('-C', '--nocolor', action='store_true',
           help='equivalent to --colorize=never')
    addarg('--colors', dest='colorstr', type=argparser.is_colorstr,
           default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
           help='set output colors (see man page for details)')
    addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
           help='open the first result in web browser and exit')
    addarg('-t', '--time', dest='duration', type=argparser.is_duration,
           metavar='dN', help='time limit search '
           '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]')
    addarg('--from', type=argparser.is_date,
           help="""starting date/month/year of date range; must use American date
           format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in
           conjuction with --to, and overrides -t, --time""")
    addarg('--to', type=argparser.is_date,
           help='ending date/month/year of date range; see --from')
    addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
           help='search a site using Google')
    addarg('--unfilter', action='store_true', help='do not omit similar results')
    # Default proxy comes from the https_proxy environment variable.
    addarg('-p', '--proxy', default=https_proxy_from_environment(),
           help="""tunnel traffic through an HTTP proxy;
           PROXY is of the form [http://][user:password@]proxyhost[:port]""")
    addarg('--noua', action='store_true', help='legacy option (no effect)')
    addarg('--notweak', action='store_true',
           help='disable TCP optimizations and forced TLS 1.2')
    addarg('--json', action='store_true',
           help='output in JSON format; implies --noprompt')
    addarg('--url-handler', metavar='UTIL',
           help='custom script or cli utility to open results')
    addarg('--show-browser-logs', action='store_true',
           help='do not suppress browser output (stdout and stderr)')
    addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
           help='search and exit, do not prompt')
    addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
    # Self-upgrade options only make sense where the mechanism is supported.
    if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
        addarg('-u', '--upgrade', action='store_true',
               help='perform in-place self-upgrade')
        addarg('--include-git', action='store_true',
               help='when used with --upgrade, get latest git master')
    addarg('-v', '--version', action='version', version=_VERSION_)
    addarg('-d', '--debug', action='store_true', help='enable debugging')
    # Hidden option for interacting with DOM in an IPython/pdb shell
    addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS)
    addarg('--complete', help=argparse.SUPPRESS)

    parsed = argparser.parse_args(args, namespace)
    # -C/--nocolor wins over whatever --colorize was set to.
    if parsed.nocolor:
        parsed.colorize = 'never'

    return parsed
 3377 
 3378 
def main():
    """Program entry point: parse options, configure output, run a search.

    Dispatches to the query completer or the self-upgrade path when
    requested; otherwise runs either a one-shot (non-interactive) search
    or the interactive REPL. Exits the process via sys.exit in the
    non-interactive paths.
    """
    try:
        opts = parse_args()

        # Set logging level
        if opts.debug:
            logger.setLevel(logging.DEBUG)
            logger.debug('googler version %s', _VERSION_)
            logger.debug('Python version %s', python_version())

        if opts.debugger:
            global debugger
            debugger = True

        # Handle query completer
        if opts.complete is not None:
            completer_run(opts.complete)

        # Handle self-upgrade
        if hasattr(opts, 'upgrade') and opts.upgrade:
            self_upgrade(include_git=opts.include_git)
            sys.exit(0)

        check_stdout_encoding()

        if opts.keywords:
            try:
                # Add cmdline args to readline history
                readline.add_history(' '.join(opts.keywords))
            except Exception:
                # Best effort only; readline may be absent or broken.
                pass

        # Set colors
        if opts.colorize == 'always':
            colorize = True
        elif opts.colorize == 'auto':
            colorize = sys.stdout.isatty()
        else:  # opts.colorize == 'never'
            colorize = False

        if colorize:
            colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x'])
        else:
            colors = None
        Result.colors = colors
        Result.urlexpand = True if os.getenv('DISABLE_URL_EXPANSION') is None else False
        GooglerCmd.colors = colors

        # Try to enable ANSI color support in cmd or PowerShell on Windows 10
        if sys.platform == 'win32' and sys.stdout.isatty() and colorize:
            set_win_console_mode()

        if opts.url_handler is not None:
            open_url.url_handler = opts.url_handler
        else:
            # Set text browser override to False
            open_url.override_text_browser = False

            # Handle browser output suppression
            if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers):
                open_url.suppress_browser_output = False
            else:
                open_url.suppress_browser_output = True

        if opts.noua:
            logger.warning('--noua option has been deprecated and has no effect (see #284)')

        repl = GooglerCmd(opts)

        # --json, --lucky and --noprompt all imply a single fetch and exit.
        if opts.json or opts.lucky or opts.noninteractive:
            # Non-interactive mode
            repl.fetch()
            if opts.lucky:
                if repl.results:
                    open_url(repl.results[0].url)
                else:
                    print('No results.', file=sys.stderr)
            else:
                repl.showing_results_for_alert(interactive=False)
                repl.display_results(json_output=opts.json)
            sys.exit(0)
        else:
            # Interactive mode
            repl.cmdloop()
    except Exception as e:
        # With debugging on, let the exception through for a traceback;
        # otherwise, only print the exception error message.
        if logger.isEnabledFor(logging.DEBUG):
            raise
        else:
            logger.error(e)
            sys.exit(1)
 3471 
# Script entry point: run main() only when executed directly, not on import.
if __name__ == '__main__':
    main()