"Fossies" - the Fresh Open Source Software Archive 
Member "googler-4.3.2/googler" (21 Jan 2021, 135035 Bytes) of package /linux/misc/googler-4.3.2.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style:
standard) with prefixed line numbers.
Alternatively you can here
view or
download the uninterpreted source code file.
See also the latest
Fossies "Diffs" side-by-side code changes report for "googler":
4.3.1_vs_4.3.2.
#!/usr/bin/env python3
#
# Copyright © 2008 Henri Hakkinen
# Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse
import atexit
import base64
import collections
import codecs
import functools
import gzip
import html.entities
import html.parser
import http.client
from http.client import HTTPSConnection
import locale
import logging
import os
import platform
import shutil
import signal
import socket
import ssl
import subprocess
from subprocess import Popen, PIPE, DEVNULL
import sys
import textwrap
import unicodedata
import urllib.parse
import uuid
import webbrowser

# Python optional dependency compatibility layer
try:
    import readline
except ImportError:
    pass

try:
    import setproctitle
    setproctitle.setproctitle('googler')
except (ImportError, Exception):
    pass

from typing import (
    Any,
    Dict,
    Generator,
    Iterable,
    Iterator,
    List,
    Match,
    Optional,
    Sequence,
    Tuple,
    Union,
    cast,
)

# Basic setup

logging.basicConfig(format='[%(levelname)s] %(message)s')
logger = logging.getLogger()


def sigint_handler(signum, frame):
    print('\nInterrupted.', file=sys.stderr)
    sys.exit(1)

try:
    signal.signal(signal.SIGINT, sigint_handler)
except ValueError:
    # signal only works in main thread
    pass


# Constants

_VERSION_ = '4.3.2'
_EPOCH_ = '20210115'

COLORMAP = {k: '\x1b[%sm' % v for k, v in {
    'a': '30', 'b': '31', 'c': '32', 'd': '33',
    'e': '34', 'f': '35', 'g': '36', 'h': '37',
    'i': '90', 'j': '91', 'k': '92', 'l': '93',
    'm': '94', 'n': '95', 'o': '96', 'p': '97',
    'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
    'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
    'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
    'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
    'x': '0', 'X': '1', 'y': '7', 'Y': '7;1',
}.items()}

USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

text_browsers = ['elinks', 'links', 'lynx', 'w3m', 'www-browser']

# Self-upgrade parameters
#
# Downstream packagers are recommended to turn off the entire self-upgrade
# mechanism through
#
#     make disable-self-upgrade
#
# before running `make install'.

ENABLE_SELF_UPGRADE_MECHANISM = True
API_REPO_BASE = 'https://api.github.com/repos/jarun/googler'
RAW_DOWNLOAD_REPO_BASE = 'https://raw.githubusercontent.com/jarun/googler'

debugger = False


# Monkeypatch textwrap for CJK wide characters.

def monkeypatch_textwrap_for_cjk():
    try:
        if textwrap.wrap.patched:
            return
    except AttributeError:
        pass
    psl_textwrap_wrap = textwrap.wrap

    def textwrap_wrap(text, width=70, **kwargs):
        if width <= 2:
            width = 2
        # We first add a U+0000 after each East Asian Fullwidth or East
        # Asian Wide character, then fill to width - 1 (so that if a NUL
        # character ends up on a new line, we still have one last column
        # to spare for the preceding wide character). Finally we strip
        # all the NUL characters.
        #
        # East Asian Width: https://www.unicode.org/reports/tr11/
        return [
            line.replace('\0', '')
            for line in psl_textwrap_wrap(
                ''.join(
                    ch + '\0' if unicodedata.east_asian_width(ch) in ('F', 'W') else ch
                    for ch in unicodedata.normalize('NFC', text)
                ),
                width=width - 1,
                **kwargs
            )
        ]

    def textwrap_fill(text, width=70, **kwargs):
        return '\n'.join(textwrap_wrap(text, width=width, **kwargs))

    textwrap.wrap = textwrap_wrap
    textwrap.fill = textwrap_fill
    textwrap.wrap.patched = True
    textwrap.fill.patched = True


monkeypatch_textwrap_for_cjk()
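
# Illustrative sketch (not part of upstream googler): with the patch above
# in effect, each East Asian wide character is budgeted two terminal
# columns, so wrapped lines fit the given width. For example:
#
#   >>> textwrap.wrap('日本語', width=4)  # each ideograph spans 2 columns
#   ['日本', '語']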


CoordinateType = Tuple[int, int]


class TrackedTextwrap:
    """
    Implements a text wrapper that tracks the position of each source
    character, and can correctly insert zero-width sequences at given
    offsets of the source text.

    Wrapping result should be the same as that from PSL textwrap.wrap
    with default settings except expand_tabs=False.
    """

    def __init__(self, text: str, width: int):
        self._original = text

        # Do the job of replace_whitespace first so that we can easily
        # match text to wrapped lines later. Note that this operation
        # does not change text length or offsets.
        whitespace = "\t\n\v\f\r "
        whitespace_trans = str.maketrans(whitespace, " " * len(whitespace))
        text = text.translate(whitespace_trans)

        self._lines = textwrap.wrap(
            text, width, expand_tabs=False, replace_whitespace=False
        )

        # self._coords track the (row, column) coordinate of each source
        # character in the result text. It is indexed by offset in
        # source text.
        self._coords = []  # type: List[CoordinateType]
        offset = 0
        try:
            if not self._lines:
                # Source text only has whitespaces. We add an empty line
                # in order to produce meaningful coordinates.
                self._lines = [""]
            for row, line in enumerate(self._lines):
                assert text[offset : offset + len(line)] == line
                col = 0
                for _ in line:
                    self._coords.append((row, col))
                    offset += 1
                    col += 1
                # All subsequent dropped whitespaces map to the last, imaginary column
                # (the EOL character if you wish) of the current line.
                while offset < len(text) and text[offset] == " ":
                    self._coords.append((row, col))
                    offset += 1
            # One past the final character (think of it as EOF) should
            # be treated as a valid offset.
            self._coords.append((row, col))
        except AssertionError:
            raise RuntimeError(
                "TrackedTextwrap: the impossible happened at offset {} of text {!r}".format(
                    offset, self._original
                )
            )

    # seq should be a zero-width sequence, e.g., an ANSI escape sequence.
    # May raise IndexError if offset is out of bounds.
    def insert_zero_width_sequence(self, seq: str, offset: int) -> None:
        row, col = self._coords[offset]
        line = self._lines[row]
        self._lines[row] = line[:col] + seq + line[col:]

        # Shift coordinates of all characters after the given character
        # on the same line.
        shift = len(seq)
        offset += 1
        while offset < len(self._coords) and self._coords[offset][0] == row:
            _, col = self._coords[offset]
            self._coords[offset] = (row, col + shift)
            offset += 1

    @property
    def original(self) -> str:
        return self._original

    @property
    def lines(self) -> List[str]:
        return self._lines

    @property
    def wrapped(self) -> str:
        return "\n".join(self._lines)

    # May raise IndexError if offset is out of bounds.
    def get_coordinate(self, offset: int) -> CoordinateType:
        return self._coords[offset]
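

# Illustrative sketch (not part of upstream googler): wrap a string, then
# colorize the second word in place; the width passed here leaves one
# column in reserve because of the CJK monkeypatch above.
#
#   >>> tw = TrackedTextwrap('hello world foo', 6)
#   >>> tw.lines
#   ['hello', 'world', 'foo']
#   >>> tw.get_coordinate(6)  # source offset of 'w' -> (row, col)
#   (1, 0)
#   >>> tw.insert_zero_width_sequence('\x1b[1m', 6)
#   >>> tw.lines[1]
#   '\x1b[1mworld'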


### begin dim (DOM implementation with CSS support) ###
### https://github.com/zmwangx/dim/blob/master/dim.py ###

import html
import re
from collections import OrderedDict
from enum import Enum
from html.parser import HTMLParser


SelectorGroupLike = Union[str, "SelectorGroup", "Selector"]


class Node(object):
    """
    Represents a DOM node.

    Parts of JavaScript's DOM ``Node`` API and ``Element`` API are
    mirrored here, with extensions. In particular, ``querySelector`` and
    ``querySelectorAll`` are mirrored.

    Notable properties and methods: :meth:`attr()`, :attr:`classes`,
    :attr:`html`, :attr:`text`, :meth:`ancestors()`,
    :meth:`descendants()`, :meth:`select()`, :meth:`select_all()`,
    :meth:`matched_by()`.

    Attributes:
        tag (:class:`Optional`\\[:class:`str`])
        attrs (:class:`Dict`\\[:class:`str`, :class:`str`])
        parent (:class:`Optional`\\[:class:`Node`])
        children (:class:`List`\\[:class:`Node`])
    """

    # Meant to be reimplemented by subclasses.
    def __init__(self) -> None:
        self.tag = None  # type: Optional[str]
        self.attrs = {}  # type: Dict[str, str]
        self.parent = None  # type: Optional[Node]
        self.children = []  # type: List[Node]

        # Used in DOMBuilder.
        self._partial = False
        self._namespace = None  # type: Optional[str]

    # HTML representation of the node. Meant to be implemented by
    # subclasses.
    def __str__(self) -> str:  # pragma: no cover
        raise NotImplementedError

    def select(self, selector: SelectorGroupLike) -> Optional["Node"]:
        """DOM ``querySelector`` clone. Returns one match (if any)."""
        selector = self._normalize_selector(selector)
        for node in self._select_all(selector):
            return node
        return None

    def query_selector(self, selector: SelectorGroupLike) -> Optional["Node"]:
        """Alias of :meth:`select`."""
        return self.select(selector)

    def select_all(self, selector: SelectorGroupLike) -> List["Node"]:
        """DOM ``querySelectorAll`` clone. Returns all matches in a list."""
        selector = self._normalize_selector(selector)
        return list(self._select_all(selector))

    def query_selector_all(self, selector: SelectorGroupLike) -> List["Node"]:
        """Alias of :meth:`select_all`."""
        return self.select_all(selector)

    def matched_by(
        self, selector: SelectorGroupLike, root: Optional["Node"] = None
    ) -> bool:
        """
        Checks whether this node is matched by `selector`.

        See :meth:`SelectorGroup.matches()`.
        """
        selector = self._normalize_selector(selector)
        return selector.matches(self, root=root)

    @staticmethod
    def _normalize_selector(selector: SelectorGroupLike) -> "SelectorGroup":
        if isinstance(selector, str):
            return SelectorGroup.from_str(selector)
        if isinstance(selector, SelectorGroup):
            return selector
        if isinstance(selector, Selector):
            return SelectorGroup([selector])
        raise ValueError("not a selector or group of selectors: %s" % repr(selector))

    def _select_all(self, selector: "SelectorGroup") -> Generator["Node", None, None]:
        for descendant in self.descendants():
            if selector.matches(descendant, root=self):
                yield descendant

    def child_nodes(self) -> List["Node"]:
        return self.children

    def first_child(self) -> Optional["Node"]:
        if self.children:
            return self.children[0]
        else:
            return None

    def first_element_child(self) -> Optional["Node"]:
        for child in self.children:
            if isinstance(child, ElementNode):
                return child
        return None

    def last_child(self) -> Optional["Node"]:
        if self.children:
            return self.children[-1]
        else:
            return None

    def last_element_child(self) -> Optional["Node"]:
        for child in reversed(self.children):
            if isinstance(child, ElementNode):
                return child
        return None

    def next_sibling(self) -> Optional["Node"]:
        """.. note:: Not O(1), use with caution."""
        next_siblings = self.next_siblings()
        if next_siblings:
            return next_siblings[0]
        else:
            return None

    def next_siblings(self) -> List["Node"]:
        parent = self.parent
        if not parent:
            return []
        try:
            index = parent.children.index(self)
            return parent.children[index + 1 :]
        except ValueError:  # pragma: no cover
            raise ValueError("node is not found in children of its parent")

    def next_element_sibling(self) -> Optional["ElementNode"]:
        """.. note:: Not O(1), use with caution."""
        for sibling in self.next_siblings():
            if isinstance(sibling, ElementNode):
                return sibling
        return None

    def previous_sibling(self) -> Optional["Node"]:
        """.. note:: Not O(1), use with caution."""
        previous_siblings = self.previous_siblings()
        if previous_siblings:
            return previous_siblings[0]
        else:
            return None

    def previous_siblings(self) -> List["Node"]:
        """
        Compared to the natural DOM order, the order of returned nodes
        is reversed. That is, the adjacent sibling (if any) is the
        first in the returned list.
        """
        parent = self.parent
        if not parent:
            return []
        try:
            index = parent.children.index(self)
            if index > 0:
                return parent.children[index - 1 :: -1]
            else:
                return []
        except ValueError:  # pragma: no cover
            raise ValueError("node is not found in children of its parent")

    def previous_element_sibling(self) -> Optional["ElementNode"]:
        """.. note:: Not O(1), use with caution."""
        for sibling in self.previous_siblings():
            if isinstance(sibling, ElementNode):
                return sibling
        return None

    def ancestors(
        self, *, root: Optional["Node"] = None
    ) -> Generator["Node", None, None]:
        """
        Ancestors are generated in reverse order of depth, stopping at
        `root`.

        A :class:`RuntimeError` is raised if `root` is not in the
        ancestral chain.
        """
        if self is root:
            return
        ancestor = self.parent
        while ancestor is not root:
            if ancestor is None:
                raise RuntimeError("provided root node not found in ancestral chain")
            yield ancestor
            ancestor = ancestor.parent
        if root:
            yield root

    def descendants(self) -> Generator["Node", None, None]:
        """Descendants are generated in depth-first order."""
        for child in self.children:
            yield child
            yield from child.descendants()

    def attr(self, attr: str) -> Optional[str]:
        """Returns the attribute if it exists on the node, otherwise ``None``."""
        return self.attrs.get(attr)

    @property
    def html(self) -> str:
        """
        HTML representation of the node.

        (For a :class:`TextNode`, :attr:`html` returns the escaped
        version of the text.)
        """
        return str(self)

    def outer_html(self) -> str:
        """Alias of :attr:`html`."""
        return self.html

    def inner_html(self) -> str:
        """HTML representation of the node's children."""
        return "".join(child.html for child in self.children)

    @property
    def text(self) -> str:  # pragma: no cover
        """This property is expected to be implemented by subclasses."""
        raise NotImplementedError

    def text_content(self) -> str:
        """Alias of :attr:`text`."""
        return self.text

    @property
    def classes(self) -> List[str]:
        return self.attrs.get("class", "").split()

    def class_list(self) -> List[str]:
        return self.classes


class ElementNode(Node):
    """
    Represents an element node.

    Note that tag and attribute names are case-insensitive; attribute
    values are case-sensitive.
    """

    def __init__(
        self,
        tag: str,
        attrs: Iterable[Tuple[str, Optional[str]]],
        *,
        parent: Optional["Node"] = None,
        children: Optional[Sequence["Node"]] = None
    ) -> None:
        Node.__init__(self)
        self.tag = tag.lower()  # type: str
        self.attrs = OrderedDict((attr.lower(), val or "") for attr, val in attrs)
        self.parent = parent
        self.children = list(children or [])

    def __repr__(self) -> str:
        s = "<" + self.tag
        if self.attrs:
            s += " attrs=%s" % repr(list(self.attrs.items()))
        if self.children:
            s += " children=%s" % repr(self.children)
        s += ">"
        return s

    # https://ipython.readthedocs.io/en/stable/api/generated/IPython.lib.pretty.html
    def _repr_pretty_(self, p: Any, cycle: bool) -> None:  # pragma: no cover
        if cycle:
            raise RuntimeError("cycle detected in DOM tree")
        p.text("<\x1b[1m%s\x1b[0m" % self.tag)
        if self.attrs:
            p.text(" attrs=%s" % repr(list(self.attrs.items())))
        if self.children:
            p.text(" children=[")
            if len(self.children) == 1 and isinstance(self.first_child(), TextNode):
                p.text("\x1b[4m%s\x1b[0m" % repr(self.first_child()))
            else:
                with p.indent(2):
                    for child in self.children:
                        p.break_()
                        if hasattr(child, "_repr_pretty_"):
                            child._repr_pretty_(p, False)  # type: ignore
                        else:
                            p.text("\x1b[4m%s\x1b[0m" % repr(child))
                        p.text(",")
                p.break_()
            p.text("]")
        p.text(">")

    def __str__(self) -> str:
        """HTML representation of the node."""
        s = "<" + self.tag
        for attr, val in self.attrs.items():
            s += ' %s="%s"' % (attr, html.escape(val))
        if self.children:
            s += ">"
            s += "".join(str(child) for child in self.children)
            s += "</%s>" % self.tag
        else:
            if _tag_is_void(self.tag):
                s += "/>"
            else:
                s += "></%s>" % self.tag
        return s

    @property
    def text(self) -> str:
        """The concatenation of all descendant text nodes."""
        return "".join(child.text for child in self.children)


class TextNode(str, Node):
    """
    Represents a text node.

    Subclasses :class:`Node` and :class:`str`.
    """

    def __new__(cls, text: str) -> "TextNode":
        s = str.__new__(cls, text)  # type: ignore
        s.parent = None
        return s  # type: ignore

    def __init__(self, text: str) -> None:
        Node.__init__(self)

    def __repr__(self) -> str:
        return "<%s>" % str.__repr__(self)

    # HTML-escaped form of the text node. Use text() for the unescaped
    # version.
    def __str__(self) -> str:
        return html.escape(self)

    def __eq__(self, other: object) -> bool:
        """
        Two text nodes are equal if and only if they are the same node.

        For string comparison, use :attr:`text`.
        """
        return self is other

    def __ne__(self, other: object) -> bool:
        """
        Two text nodes are non-equal if they are not the same node.

        For string comparison, use :attr:`text`.
        """
        return self is not other

    @property
    def text(self) -> str:
        return str.__str__(self)


class DOMBuilderException(Exception):
    """
    Exception raised when :class:`DOMBuilder` detects a bad state.

    Attributes:
        pos (:class:`Tuple`\\[:class:`int`, :class:`int`]):
            Line number and offset in HTML input.
        why (:class:`str`):
            Reason of the exception.
    """

    def __init__(self, pos: Tuple[int, int], why: str) -> None:
        self.pos = pos
        self.why = why

    def __str__(self) -> str:  # pragma: no cover
        return "DOM builder aborted at %d:%d: %s" % (self.pos[0], self.pos[1], self.why)


class DOMBuilder(HTMLParser):
    """
    HTML parser / DOM builder.

    Subclasses :class:`html.parser.HTMLParser`.

    Consumes HTML and builds a :class:`Node` tree. Once finished, use
    :attr:`root` to access the root of the tree.

    This parser cannot parse malformed HTML with tag mismatch.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # _stack is the stack for nodes. Each node is pushed to the
        # stack when its start tag is processed, and remains on the
        # stack until its parent node is completed (end tag processed),
        # at which point the node is attached to the parent node as a
        # child and popped from the stack.
        self._stack = []  # type: List[Node]
        # _namespace_stack is another stack tracking the parsing
        # context, which is generally the default namespace (None) but
        # changes when parsing foreign objects (e.g. 'svg' when parsing
        # an <svg>). The top element is always the current parsing
        # context, so popping works differently from _stack: an element
        # is popped as soon as the corresponding end tag is processed.
        self._namespace_stack = [None]  # type: List[Optional[str]]

    def handle_starttag(
        self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
    ) -> None:
        node = ElementNode(tag, attrs)
        node._partial = True
        self._stack.append(node)
        namespace = (
            tag.lower()
            if _tag_encloses_foreign_namespace(tag)
            else self._namespace_stack[-1]  # Inherit parent namespace
        )
        node._namespace = namespace
        self._namespace_stack.append(namespace)
        # For void elements (not in a foreign context), immediately
        # invoke the end tag handler (see handle_startendtag()).
        if not namespace and _tag_is_void(tag):
            self.handle_endtag(tag)

    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        children = []
        while self._stack and not self._stack[-1]._partial:
            children.append(self._stack.pop())
        if not self._stack:
            raise DOMBuilderException(self.getpos(), "extra end tag: %s" % repr(tag))
        parent = self._stack[-1]
        if parent.tag != tag:
            raise DOMBuilderException(
                self.getpos(),
                "expecting end tag %s, got %s" % (repr(parent.tag), repr(tag)),
            )
        parent.children = list(reversed(children))
        parent._partial = False
        for child in children:
            child.parent = parent
        self._namespace_stack.pop()

    # Make parser behavior for explicitly and implicitly void elements
    # (e.g., <hr> vs <hr/>) consistent. The former triggers
    # handle_starttag only, whereas the latter triggers
    # handle_startendtag (which by default triggers both handle_starttag
    # and handle_endtag). See https://bugs.python.org/issue25258.
    #
    # An exception is foreign elements, which aren't considered void
    # elements but can be explicitly marked as self-closing according to
    # the HTML spec (e.g. <path/> is valid but <path> is not).
    # Therefore, both handle_starttag and handle_endtag must be called,
    # and handle_endtag should not be triggered from within
    # handle_starttag in that case.
    #
    # Note that for simplicity we do not check whether the foreign
    # element in question is allowed to be self-closing by spec. (The
    # SVG spec unfortunately doesn't provide a readily available list of
    # such elements.)
    #
    # https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
    def handle_startendtag(
        self, tag: str, attrs: Sequence[Tuple[str, Optional[str]]]
    ) -> None:
        if self._namespace_stack[-1] or _tag_encloses_foreign_namespace(tag):
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)
        else:
            self.handle_starttag(tag, attrs)

    def handle_data(self, text: str) -> None:
        if not self._stack:
            # Ignore text nodes before the first tag.
            return
        self._stack.append(TextNode(text))

    @property
    def root(self) -> "Node":
        """
        Finishes processing and returns the root node.

        Raises :class:`DOMBuilderException` if there is no root tag or
        the root tag is not closed yet.
        """
        if not self._stack:
            raise DOMBuilderException(self.getpos(), "no root tag")
        if self._stack[0]._partial:
            raise DOMBuilderException(self.getpos(), "root tag not closed yet")
        return self._stack[0]


def parse_html(html: str, *, ParserClass: type = DOMBuilder) -> "Node":
    """
    Parses HTML string, builds DOM, and returns root node.

    The parser may raise :class:`DOMBuilderException`.

    Args:
        html: input HTML string
        ParserClass: :class:`DOMBuilder` or a subclass

    Returns:
        Root node of the parsed tree. If the HTML string contains
        multiple top-level elements, only the first is returned and the
        rest are lost.
    """
    builder = ParserClass()  # type: DOMBuilder
    builder.feed(html)
    builder.close()
    return builder.root
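
# Illustrative sketch (not part of upstream googler): build a tree with
# parse_html() and query it with CSS selectors, dim-style.
#
#   >>> root = parse_html('<div class="item"><a href="/x">link</a></div>')
#   >>> root.select('div.item > a[href]').attr('href')
#   '/x'
#   >>> root.select('a').text
#   'link'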


class SelectorParserException(Exception):
    """
    Exception raised when the selector parser fails to parse an input.

    Attributes:
        s (:class:`str`):
            The input string to be parsed.
        cursor (:class:`int`):
            Cursor position where the failure occurred.
        why (:class:`str`):
            Reason of the failure.
    """

    def __init__(self, s: str, cursor: int, why: str) -> None:
        self.s = s
        self.cursor = cursor
        self.why = why

    def __str__(self) -> str:  # pragma: no cover
        return "selector parser aborted at character %d of %s: %s" % (
            self.cursor,
            repr(self.s),
            self.why,
        )


class SelectorGroup:
    """
    Represents a group of CSS selectors.

    A group of CSS selectors is simply a comma-separated list of
    selectors. [#]_ See :class:`Selector` documentation for the scope of
    support.

    Typically, a :class:`SelectorGroup` is constructed from a string
    (e.g., ``th.center, td.center``) using the factory function
    :meth:`from_str`.

    .. [#] https://www.w3.org/TR/selectors-3/#grouping
    """

    def __init__(self, selectors: Iterable["Selector"]) -> None:
        self._selectors = list(selectors)

    def __repr__(self) -> str:
        return "<SelectorGroup %s>" % repr(str(self))

    def __str__(self) -> str:
        return ", ".join(str(selector) for selector in self._selectors)

    def __len__(self) -> int:
        return len(self._selectors)

    def __getitem__(self, index: int) -> "Selector":
        return self._selectors[index]

    def __iter__(self) -> Iterator["Selector"]:
        return iter(self._selectors)

    @classmethod
    def from_str(cls, s: str) -> "SelectorGroup":
        """
        Parses input string into a group of selectors.

        :class:`SelectorParserException` is raised on invalid input. See
        :class:`Selector` documentation for the scope of support.

        Args:
            s: input string

        Returns:
            Parsed group of selectors.
        """
        i = 0
        selectors = []
        while i < len(s):
            selector, i = Selector.from_str(s, i)
            selectors.append(selector)
        if not selectors:
            raise SelectorParserException(s, i, "selector group is empty")
        return cls(selectors)

    def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
        """
        Decides whether the group of selectors matches `node`.

        The group of selectors matches `node` as long as one of the
        selectors matches `node`.

        If `root` is provided and child and/or descendant combinators
        are involved, parent/ancestor lookup terminates at `root`.
        """
        return any(selector.matches(node, root=root) for selector in self)
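
# Illustrative sketch (not part of upstream googler): a comma-separated
# group matches a node as soon as any member selector matches.
#
#   >>> group = SelectorGroup.from_str('th.center, td.center')
#   >>> len(group)
#   2
#   >>> str(group)
#   'th.center, td.center'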


class Selector:
    """
    Represents a CSS selector.

    Recall that a CSS selector is a chain of one or more *sequences of
    simple selectors* separated by *combinators*. [#selectors-3]_ This
    concept is represented as a cons list of sequences of simple
    selectors (in right to left order). This class in fact holds a
    single sequence, with an optional combinator and reference to the
    previous sequence.

    For instance, ``main#main p.important.definition >
    a.term[id][href]`` would be parsed into (schematically) the
    following structure::

        ">" tag='a' classes=('term') attrs=([id], [href]) ~>
        " " tag='p' classes=('important', 'definition') ~>
        tag='main' id='main'

    Each line is held in a separate instance of :class:`Selector`,
    linked together by the :attr:`previous` attribute.

    Supported grammar (from selectors level 3 [#selectors-3]_):

    - Type selectors;
    - Universal selectors;
    - Class selectors;
    - ID selectors;
    - Attribute selectors;
    - Combinators.

    Unsupported grammar:

    - Pseudo-classes;
    - Pseudo-elements;
    - Namespace prefixes (``ns|``, ``*|``, ``|``) in any part of any
      selector.

    Rationale:

    - Pseudo-classes have too many variants, a few of which even come
      complete with an admittedly not-so-complex minilanguage. These add
      up to a lot of code.
    - Pseudo-elements are useless outside rendering contexts, hence out
      of scope.
    - Namespace support is too niche to be worth the parsing headache.
      *Using namespace prefixes may confuse the parser!*

    Note that the parser only loosely follows the spec and prioritizes
    ease of parsing (which includes readability and *writability* of
    regexes), so some invalid selectors may be accepted (in fact, false
    positives abound, but accepting valid inputs is a much more
    important goal than rejecting invalid inputs for this library), and
    some valid selectors may be rejected (but as long as you stick to
    the scope outlined above and common sense you should be fine; the
    false negatives shouldn't be used by actual human beings anyway).

    In particular, the whitespace character class is simplified to
    ``\\s`` (ASCII mode) despite the CSS spec not counting U+000B (VT)
    as whitespace, identifiers are simplified to ``[\\w-]+`` (ASCII
    mode), and strings (attribute selector values can be either
    identifiers or strings) allow escaped quotes (i.e., ``\\'`` inside
    single-quoted strings and ``\\"`` inside double-quoted strings) but
    everything else is interpreted literally. The exact specs for CSS
    identifiers and strings can be found at [#]_.

    Certain selectors and combinators may be implemented in the parser
    but not implemented in matching and/or selection APIs.

    .. [#selectors-3] https://www.w3.org/TR/selectors-3/
    .. [#] https://www.w3.org/TR/CSS21/syndata.html

    Attributes:
        tag (:class:`Optional`\\[:class:`str`]):
            Type selector.
        classes (:class:`List`\\[:class:`str`]):
            Class selectors.
        id (:class:`Optional`\\[:class:`str`]):
            ID selector.
        attrs (:class:`List`\\[:class:`AttributeSelector`]):
            Attribute selectors.
        combinator (:class:`Optional`\\[:class:`Combinator`]):
            Combinator with the previous sequence of simple selectors in
            chain.
        previous (:class:`Optional`\\[:class:`Selector`]):
            Reference to the previous sequence of simple selectors in
            chain.
    """

    def __init__(
        self,
        *,
        tag: Optional[str] = None,
        classes: Optional[Sequence[str]] = None,
        id: Optional[str] = None,
        attrs: Optional[Sequence["AttributeSelector"]] = None,
        combinator: Optional["Combinator"] = None,
        previous: Optional["Selector"] = None
    ) -> None:
        self.tag = tag.lower() if tag else None
        self.classes = list(classes or [])
        self.id = id
        self.attrs = list(attrs or [])
        self.combinator = combinator
        self.previous = previous

    def __repr__(self) -> str:
        return "<Selector %s>" % repr(str(self))

    def __str__(self) -> str:
        sequences = []
        delimiters = []
        seq = self
        while True:
            sequences.append(seq._sequence_str_())
            if seq.previous:
                if seq.combinator == Combinator.DESCENDANT:
                    delimiters.append(" ")
                elif seq.combinator == Combinator.CHILD:
                    delimiters.append(" > ")
                elif seq.combinator == Combinator.NEXT_SIBLING:
                    delimiters.append(" + ")
                elif seq.combinator == Combinator.SUBSEQUENT_SIBLING:
                    delimiters.append(" ~ ")
                else:  # pragma: no cover
                    raise RuntimeError(
                        "unimplemented combinator: %s" % repr(self.combinator)
                    )
                seq = seq.previous
            else:
                delimiters.append("")
                break
        return "".join(
            delimiter + sequence
            for delimiter, sequence in zip(reversed(delimiters), reversed(sequences))
        )

    # Format a single sequence of simple selectors, without combinator.
    def _sequence_str_(self) -> str:
        s = ""
        if self.tag:
            s += self.tag
        if self.classes:
            s += "".join(".%s" % class_ for class_ in self.classes)
        if self.id:
            s += "#%s" % self.id
        if self.attrs:
            s += "".join(str(attr) for attr in self.attrs)
        return s if s else "*"

    @classmethod
    def from_str(cls, s: str, cursor: int = 0) -> Tuple["Selector", int]:
        """
        Parses input string into selector.

        This factory function only parses out one selector (up to a
        comma or EOS), so partial consumption is allowed --- an optional
        `cursor` is taken as input (0 by default) and the moved cursor
        (either after the comma or at EOS) is returned as part of the
        output.

        :class:`SelectorParserException` is raised on invalid input. See
        :class:`Selector` documentation for the scope of support.

        If you need to completely consume a string representing
        (potentially) a group of selectors, use
        :meth:`SelectorGroup.from_str()`.

        Args:
            s: input string
            cursor: initial cursor position on `s`

        Returns:
            A tuple containing the parsed selector and the moved
            cursor (either after a comma-delimiter, or at EOS).
        """
        # Simple selectors.
        TYPE_SEL = re.compile(r"[\w-]+", re.A)
        UNIVERSAL_SEL = re.compile(r"\*")
        ATTR_SEL = re.compile(
            r"""\[
            \s*(?P<attr>[\w-]+)\s*
            (
                (?P<op>[~|^$*]?=)\s*
                (
                    (?P<val_identifier>[\w-]+)|
                    (?P<val_string>
                        (?P<quote>['"])
                        (?P<val_string_inner>.*?)
                        (?<!\\)(?P=quote)
                    )
                )\s*
            )?
            \]""",
            re.A | re.X,
        )
        CLASS_SEL = re.compile(r"\.([\w-]+)", re.A)
        ID_SEL = re.compile(r"#([\w-]+)", re.A)
        PSEUDO_CLASS_SEL = re.compile(r":[\w-]+(\([^)]+\))?", re.A)
        PSEUDO_ELEM_SEL = re.compile(r"::[\w-]+", re.A)

        # Combinators
        DESCENDANT_COM = re.compile(r"\s+")
        CHILD_COM = re.compile(r"\s*>\s*")
        NEXT_SIB_COM = re.compile(r"\s*\+\s*")
        SUB_SIB_COM = re.compile(r"\s*~\s*")

        # Misc
        WHITESPACE = re.compile(r"\s*")
        END_OF_SELECTOR = re.compile(r"\s*($|,)")

        tag = None
        classes = []
        id = None
        attrs = []
        combinator = None

        selector = None
        previous_combinator = None

        i = cursor

        # Skip leading whitespace
        m = WHITESPACE.match(s, i)
        if m:
            i = m.end()

        while i < len(s):
            # Parse one simple selector.
            #
            # PEP 572 (assignment expressions; the one that burned Guido
            # so much that he resigned as BDFL) would have been nice; it
            # would have saved us from all the regex match
            # reassignments, and worse still, the casts, since mypy
            # complains about getting Optional[Match[str]] instead of
            # Match[str].
            if TYPE_SEL.match(s, i):
                if tag:
                    raise SelectorParserException(s, i, "multiple type selectors found")
                m = cast(Match[str], TYPE_SEL.match(s, i))
                tag = m.group()
            elif UNIVERSAL_SEL.match(s, i):
                m = cast(Match[str], UNIVERSAL_SEL.match(s, i))
            elif ATTR_SEL.match(s, i):
                m = cast(Match[str], ATTR_SEL.match(s, i))

                attr = m.group("attr")
                op = m.group("op")
                val_identifier = m.group("val_identifier")
                quote = m.group("quote")
                val_string_inner = m.group("val_string_inner")
                if val_identifier is not None:
                    val = val_identifier
                elif val_string_inner is not None:
                    val = val_string_inner.replace("\\" + quote, quote)
                else:
                    val = None

                if op is None:
                    type = AttributeSelectorType.BARE
                elif op == "=":
                    type = AttributeSelectorType.EQUAL
                elif op == "~=":
                    type = AttributeSelectorType.TILDE
                elif op == "|=":
                    type = AttributeSelectorType.PIPE
                elif op == "^=":
                    type = AttributeSelectorType.CARET
                elif op == "$=":
                    type = AttributeSelectorType.DOLLAR
                elif op == "*=":
                    type = AttributeSelectorType.ASTERISK
                else:  # pragma: no cover
                    raise SelectorParserException(
                        s,
                        i,
                        "unrecognized operator %s in attribute selector" % repr(op),
                    )

                attrs.append(AttributeSelector(attr, val, type))
            elif CLASS_SEL.match(s, i):
                m = cast(Match[str], CLASS_SEL.match(s, i))
                classes.append(m.group(1))
            elif ID_SEL.match(s, i):
                if id:
                    raise SelectorParserException(s, i, "multiple id selectors found")
                m = cast(Match[str], ID_SEL.match(s, i))
                id = m.group(1)
            elif PSEUDO_CLASS_SEL.match(s, i):
                raise SelectorParserException(s, i, "pseudo-classes not supported")
            elif PSEUDO_ELEM_SEL.match(s, i):
                raise SelectorParserException(s, i, "pseudo-elements not supported")
            else:
                raise SelectorParserException(
                    s, i, "expecting simple selector, found none"
                )
            i = m.end()

            # Try to parse a combinator, or end the selector.
            if CHILD_COM.match(s, i):
                m = cast(Match[str], CHILD_COM.match(s, i))
                combinator = Combinator.CHILD
            elif NEXT_SIB_COM.match(s, i):
                m = cast(Match[str], NEXT_SIB_COM.match(s, i))
                combinator = Combinator.NEXT_SIBLING
            elif SUB_SIB_COM.match(s, i):
                m = cast(Match[str], SUB_SIB_COM.match(s, i))
                combinator = Combinator.SUBSEQUENT_SIBLING
            elif END_OF_SELECTOR.match(s, i):
                m = cast(Match[str], END_OF_SELECTOR.match(s, i))
                combinator = None
            # Need to parse descendant combinator at the very end
            # because it could be a prefix to all previous cases.
            elif DESCENDANT_COM.match(s, i):
                m = cast(Match[str], DESCENDANT_COM.match(s, i))
                combinator = Combinator.DESCENDANT
            else:
                continue
            i = m.end()

            if combinator and i == len(s):
                raise SelectorParserException(s, i, "unexpected end at combinator")

            selector = cls(
                tag=tag,
                classes=classes,
                id=id,
                attrs=attrs,
                combinator=previous_combinator,
                previous=selector,
            )
            previous_combinator = combinator

            # End of selector.
            if combinator is None:
                break

            tag = None
            classes = []
            id = None
            attrs = []
            combinator = None

        if not selector:
            raise SelectorParserException(s, i, "selector is empty")

        return selector, i

    def matches(self, node: "Node", root: Optional["Node"] = None) -> bool:
        """
        Decides whether the selector matches `node`.

        Each sequence of simple selectors in the selector's chain must
        be matched for a positive.

        If `root` is provided and child and/or descendant combinators
        are involved, parent/ancestor lookup terminates at `root`.
        """
        if self.tag:
            if not node.tag or node.tag != self.tag:
                return False
        if self.id:
            if node.attrs.get("id") != self.id:
                return False
        if self.classes:
            classes = node.classes
            for class_ in self.classes:
                if class_ not in classes:
                    return False
        if self.attrs:
            for attr_selector in self.attrs:
                if not attr_selector.matches(node):
                    return False

        if not self.previous:
            return True

        if self.combinator == Combinator.DESCENDANT:
            return any(
                self.previous.matches(ancestor, root=root)
                for ancestor in node.ancestors()
            )
        elif self.combinator == Combinator.CHILD:
            if node is root or node.parent is None:
                return False
            else:
                return self.previous.matches(node.parent)
        elif self.combinator == Combinator.NEXT_SIBLING:
            sibling = node.previous_element_sibling()
            if not sibling:
                return False
            else:
                return self.previous.matches(sibling)
        elif self.combinator == Combinator.SUBSEQUENT_SIBLING:
            return any(
                self.previous.matches(sibling, root=root)
                for sibling in node.previous_siblings()
                if isinstance(sibling, ElementNode)
            )
        else:  # pragma: no cover
            raise RuntimeError("unimplemented combinator: %s" % repr(self.combinator))
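
# Illustrative sketch (not part of upstream googler): selectors round-trip
# through str(), and sibling combinators are checked right to left.
#
#   >>> sel, _ = Selector.from_str('main#main > a.term[href]')
#   >>> str(sel)
#   'main#main > a.term[href]'
#   >>> root = parse_html('<ul><li class="a">x</li><li class="b">y</li></ul>')
#   >>> root.select('li.b').matched_by('li.a + li.b')
#   True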


class AttributeSelector:
    """
    Represents an attribute selector.

    Attributes:
        attr (:class:`str`)
        val (:class:`Optional`\\[:class:`str`])
        type (:class:`AttributeSelectorType`)
    """

    def __init__(
        self, attr: str, val: Optional[str], type: "AttributeSelectorType"
    ) -> None:
        self.attr = attr.lower()
        self.val = val
        self.type = type

    def __repr__(self) -> str:
        return "<AttributeSelector %s>" % repr(str(self))

    def __str__(self) -> str:
        if self.type == AttributeSelectorType.BARE:
            fmt = "[{attr}{val:.0}]"
        elif self.type == AttributeSelectorType.EQUAL:
            fmt = "[{attr}={val}]"
        elif self.type == AttributeSelectorType.TILDE:
            fmt = "[{attr}~={val}]"
        elif self.type == AttributeSelectorType.PIPE:
            fmt = "[{attr}|={val}]"
        elif self.type == AttributeSelectorType.CARET:
            fmt = "[{attr}^={val}]"
        elif self.type == AttributeSelectorType.DOLLAR:
            fmt = "[{attr}$={val}]"
        elif self.type == AttributeSelectorType.ASTERISK:
            fmt = "[{attr}*={val}]"
        return fmt.format(attr=self.attr, val=repr(self.val))

    def matches(self, node: "Node") -> bool:
        val = node.attrs.get(self.attr)
        if val is None:
            return False
        if self.type == AttributeSelectorType.BARE:
            return True
        elif self.type == AttributeSelectorType.EQUAL:
            return val == self.val
        elif self.type == AttributeSelectorType.TILDE:
            return self.val in val.split()
        elif self.type == AttributeSelectorType.PIPE:
            return val == self.val or val.startswith("%s-" % self.val)
        elif self.type == AttributeSelectorType.CARET:
            return bool(self.val and val.startswith(self.val))
        elif self.type == AttributeSelectorType.DOLLAR:
            return bool(self.val and val.endswith(self.val))
        elif self.type == AttributeSelectorType.ASTERISK:
            return bool(self.val and self.val in val)
        else:  # pragma: no cover
            raise RuntimeError("unimplemented attribute selector: %s" % repr(self.type))


# Enum: basis for poor man's algebraic data type.
class AttributeSelectorType(Enum):
    """
    Attribute selector types.

    Members correspond to the following forms of attribute selector:

    - :attr:`BARE`: ``[attr]``;
    - :attr:`EQUAL`: ``[attr=val]``;
    - :attr:`TILDE`: ``[attr~=val]``;
    - :attr:`PIPE`: ``[attr|=val]``;
    - :attr:`CARET`: ``[attr^=val]``;
    - :attr:`DOLLAR`: ``[attr$=val]``;
    - :attr:`ASTERISK`: ``[attr*=val]``.
    """

    # [attr]
    BARE = 1
    # [attr=val]
    EQUAL = 2
    # [attr~=val]
    TILDE = 3
    # [attr|=val]
    PIPE = 4
    # [attr^=val]
    CARET = 5
    # [attr$=val]
    DOLLAR = 6
    # [attr*=val]
    ASTERISK = 7


class Combinator(Enum):
    """
    Combinator types.

    Members correspond to the following combinators:

    - :attr:`DESCENDANT`: ``A B``;
    - :attr:`CHILD`: ``A > B``;
    - :attr:`NEXT_SIBLING`: ``A + B``;
    - :attr:`SUBSEQUENT_SIBLING`: ``A ~ B``.
    """

    # ' '
    DESCENDANT = 1
    # >
    CHILD = 2
    # +
    NEXT_SIBLING = 3
    # ~
    SUBSEQUENT_SIBLING = 4


def _tag_is_void(tag: str) -> bool:
    """
    Checks whether the tag corresponds to a void element.

    https://www.w3.org/TR/html5/syntax.html#void-elements
    https://html.spec.whatwg.org/multipage/syntax.html#void-elements
    """
    return tag.lower() in (
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    )


def _tag_encloses_foreign_namespace(tag: str) -> bool:
    """
    Checks whether the tag encloses a foreign namespace (MathML or SVG).

    https://html.spec.whatwg.org/multipage/syntax.html#foreign-elements
    """
    return tag.lower() in ("math", "svg")


### end dim ###


# Global helper functions

def open_url(url):
    """Open a URL in the user's default web browser.

    The string attribute ``open_url.url_handler`` can be used to open URLs
    in a custom CLI script or utility. A subprocess is spawned with url as
    the parameter in this case instead of the usual webbrowser.open() call.

    Whether the browser's output (both stdout and stderr) is suppressed
    depends on the boolean attribute ``open_url.suppress_browser_output``.
    If the attribute is not set upon a call, set it to a default value,
    which means False if BROWSER is set to a known text-based browser --
    elinks, links, lynx, w3m or 'www-browser'; or True otherwise.

    The string attribute ``open_url.override_text_browser`` can be used to
    ignore the env var BROWSER as well as some known text-based browsers
    and attempt to open the URL in an available GUI browser.
    Note: If a GUI browser is indeed found, this option ignores the program
    option `show-browser-logs`.
    """
    logger.debug('Opening %s', url)

    # Custom URL handler gets max priority
    if hasattr(open_url, 'url_handler'):
        subprocess.run([open_url.url_handler, url])
        return

    browser = webbrowser.get()
    if open_url.override_text_browser:
        browser_output = open_url.suppress_browser_output
        for name in [b for b in webbrowser._tryorder if b not in text_browsers]:
            browser = webbrowser.get(name)
            logger.debug(browser)

            # Found a GUI browser, suppress browser output
            open_url.suppress_browser_output = True
            break

    if open_url.suppress_browser_output:
        _stderr = os.dup(2)
        os.close(2)
        _stdout = os.dup(1)
        # Patch for GUI browsers on WSL
        if "microsoft" not in platform.uname()[3].lower():
            os.close(1)
        fd = os.open(os.devnull, os.O_RDWR)
        os.dup2(fd, 2)
        os.dup2(fd, 1)
    try:
        browser.open(url, new=2)
    finally:
        if open_url.suppress_browser_output:
            os.close(fd)
            os.dup2(_stderr, 2)
            os.dup2(_stdout, 1)

    if open_url.override_text_browser:
        open_url.suppress_browser_output = browser_output
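
# Illustrative sketch (not part of upstream googler): route every URL
# through a custom handler instead of webbrowser.open(); the handler path
# below is hypothetical.
#
#   open_url.url_handler = '/usr/local/bin/url-logger'  # hypothetical script
#   open_url('https://example.com')  # spawns: /usr/local/bin/url-logger <url>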


def printerr(msg):
    """Print message, verbatim, to stderr.

    ``msg`` could be any stringifiable value.
    """
    print(msg, file=sys.stderr)


def unwrap(text):
    """Unwrap text."""
    lines = text.split('\n')
    result = ''
    for i in range(len(lines) - 1):
        result += lines[i]
        if not lines[i]:
            # Paragraph break
            result += '\n\n'
        elif lines[i + 1]:
            # Next line is not paragraph break, add space
            result += ' '
    # Handle last line
    result += lines[-1] if lines[-1] else '\n'
    return result
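
# Illustrative sketch (not part of upstream googler): hard line breaks
# inside a paragraph become spaces, while blank lines remain paragraph
# breaks.
#
#   >>> unwrap('foo\nbar\n\nbaz')
#   'foo bar\n\nbaz'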


def check_stdout_encoding():
    """Make sure stdout encoding is utf-8.

    If not, print error message and instructions, then exit with
    status 1.

    This function is a no-op on win32 because encoding on win32 is
    messy, and let's just hope for the best. /s
    """
    if sys.platform == 'win32':
        return

    # Use codecs.lookup to resolve text encoding alias
    encoding = codecs.lookup(sys.stdout.encoding).name
    if encoding != 'utf-8':
        locale_lang, locale_encoding = locale.getlocale()
        if locale_lang is None:
            locale_lang = '<unknown>'
        if locale_encoding is None:
            locale_encoding = '<unknown>'
        ioencoding = os.getenv('PYTHONIOENCODING', 'not set')
        sys.stderr.write(unwrap(textwrap.dedent("""\
        stdout encoding '{encoding}' detected. googler requires utf-8 to
        work properly. The wrong encoding may be due to a non-UTF-8
        locale or an improper PYTHONIOENCODING. (For the record, your
        locale language is {locale_lang} and locale encoding is
        {locale_encoding}; your PYTHONIOENCODING is {ioencoding}.)

        Please set a UTF-8 locale (e.g., en_US.UTF-8) or set
        PYTHONIOENCODING to utf-8.
        """.format(
            encoding=encoding,
            locale_lang=locale_lang,
            locale_encoding=locale_encoding,
            ioencoding=ioencoding,
        ))))
        sys.exit(1)


def time_it(description=None):
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # Only profile in debug mode.
            if not logger.isEnabledFor(logging.DEBUG):
                return func(*args, **kwargs)

            import time
            mark = time.perf_counter()
            ret = func(*args, **kwargs)
            duration = time.perf_counter() - mark
            logger.debug('%s completed in \x1b[33m%.3fs\x1b[0m', description or func.__name__, duration)
            return ret

        return wrapped

    return decorator
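
# Illustrative sketch (not part of upstream googler): the timer only fires
# when the root logger is at DEBUG level (googler's --debug flag).
#
#   @time_it('fetch')
#   def fetch():
#       ...  # body elided
#
# Calling fetch() in debug mode logs something like
# "[DEBUG] fetch completed in 0.123s" (duration made up, color escape
# codes omitted).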


# Classes

class HardenedHTTPSConnection(HTTPSConnection):
    """Overrides HTTPSConnection.connect to specify TLS version

    NOTE: TLS 1.2 is supported from Python 3.4
    """

    def __init__(self, host, address_family=0, **kwargs):
        HTTPSConnection.__init__(self, host, **kwargs)
        self.address_family = address_family

    def connect(self, notweak=False):
        sock = self.create_socket_connection()

        # Optimizations not available on OS X
        if not notweak and sys.platform.startswith('linux'):
            try:
                sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
                sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
                sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
            except OSError:
                # Doesn't work on Windows' Linux subsystem (#179)
                logger.debug('setsockopt failed')

        if getattr(self, '_tunnel_host', None):
            self.sock = sock
        elif not notweak:
            # Try to use TLS 1.2
            ssl_context = None
            if hasattr(ssl, 'PROTOCOL_TLS'):
                # Since Python 3.5.3
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS)
                if hasattr(ssl_context, "minimum_version"):
                    # Python 3.7 with OpenSSL 1.1.0g or later
                    ssl_context.minimum_version = ssl.TLSVersion.TLSv1_2
                else:
                    ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
            elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
                # Since Python 3.4
                ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
            if ssl_context:
                self.sock = ssl_context.wrap_socket(sock)
                return

        # Fallback
        HTTPSConnection.connect(self)

    # Adapted from socket.create_connection.
    # https://github.com/python/cpython/blob/bce4ddafdd188cc6deb1584728b67b9e149ca6a4/Lib/socket.py#L771-L813
    def create_socket_connection(self):
        err = None
        results = socket.getaddrinfo(self.host, self.port, self.address_family, socket.SOCK_STREAM)
        # Prefer IPv4 if address family isn't explicitly specified.
        if self.address_family == 0:
            results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2)
        for af, socktype, proto, canonname, sa in results:
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if self.timeout is not None:
                    sock.settimeout(self.timeout)
                if self.source_address:
                    sock.bind(self.source_address)
                sock.connect(sa)
                # Break explicitly a reference cycle
                err = None
                self.address_family = af
                logger.debug('Opened socket to %s:%d',
                             sa[0] if af == socket.AF_INET else ('[%s]' % sa[0]),
                             sa[1])
                return sock

            except socket.error as _:
                err = _
                if sock is not None:
                    sock.close()

        if err is not None:
            try:
                raise err
            finally:
                # Break explicitly a reference cycle
                err = None
        else:
            raise socket.error("getaddrinfo returns an empty list")
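
# Illustrative sketch (not part of upstream googler): the class is a
# drop-in http.client.HTTPSConnection that prefers IPv4 and insists on
# TLS 1.2+ where the local ssl module allows.
#
#   conn = HardenedHTTPSConnection('www.google.com', timeout=10)
#   conn.request('GET', '/', headers={'User-Agent': USER_AGENT})
#   resp = conn.getresponse()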
1667
1668 class GoogleUrl(object):
1669 """
1670 This class constructs the Google Search/News URL.
1671
1672 This class is modelled on urllib.parse.ParseResult for familiarity,
1673 which means it supports reading of all six attributes -- scheme,
1674 netloc, path, params, query, fragment -- of
1675 urllib.parse.ParseResult, as well as the geturl() method.
1676
1677 However, the attributes (properties) and methods listed below should
1678 be the preferred methods of access to this class.
1679
1680 Parameters
1681 ----------
1682 opts : dict or argparse.Namespace, optional
1683 See the ``opts`` parameter of `update`.
1684
1685 Other Parameters
1686 ----------------
1687 See "Other Parameters" of `update`.
1688
1689 Attributes
1690 ----------
1691 hostname : str
1692 Read-write property.
1693 keywords : str or list of strs
1694 Read-write property.
1695 news : bool
1696 Read-only property.
1697 videos : bool
1698 Read-only property.
1699 url : str
1700 Read-only property.
1701
1702 Methods
1703 -------
1704 full()
1705 relative()
1706 update(opts=None, **kwargs)
1707 set_queries(**kwargs)
1708 unset_queries(*args)
1709 next_page()
1710 prev_page()
1711 first_page()
1712
1713 """
1714
1715 def __init__(self, opts=None, **kwargs):
1716 self.scheme = 'https'
1717 # self.netloc is a calculated property
1718 self.path = '/search'
1719 self.params = ''
1720 # self.query is a calculated property
1721 self.fragment = ''
1722
1723 self._tld = None
1724 self._num = 10
1725 self._start = 0
1726 self._keywords = []
1727 self._sites = None
1728 self._exclude = None
1729
1730 self._query_dict = {
1731 'ie': 'UTF-8',
1732 'oe': 'UTF-8',
1733 #'gbv': '1', # control the presence of javascript on the page, 1=no js, 2=js
1734 'sei': base64.encodebytes(uuid.uuid4().bytes).decode("ascii").rstrip('=\n').replace('/', '_'),
1735 }
1736
1737 # In preloaded HTML parsing mode, set keywords to something so
1738 # that we are not tripped up by require_keywords.
1739 if opts.html_file and not opts.keywords:
1740 opts.keywords = ['<debug>']
1741
1742 self.update(opts, **kwargs)
1743
1744 def __str__(self):
1745 return self.url
1746
1747 @property
1748 def url(self):
1749 """The full Google URL you want."""
1750 return self.full()
1751
1752 @property
1753 def hostname(self):
1754 """The hostname."""
1755 return self.netloc
1756
1757 @hostname.setter
1758 def hostname(self, hostname):
1759 self.netloc = hostname
1760
1761 @property
1762 def keywords(self):
1763 """The keywords, either a str or a list of strs."""
1764 return self._keywords
1765
1766 @keywords.setter
1767 def keywords(self, keywords):
1768 self._keywords = keywords
1769
1770 @property
1771 def news(self):
1772 """Whether the URL is for Google News."""
1773 return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'nws'
1774
1775 @property
1776 def videos(self):
1777 """Whether the URL is for Google Videos."""
1778 return 'tbm' in self._query_dict and self._query_dict['tbm'] == 'vid'
1779
1780 def full(self):
1781 """Return the full URL.
1782
1783 Returns
1784 -------
1785 str
1786
1787 """
1788 url = (self.scheme + ':') if self.scheme else ''
1789 url += '//' + self.netloc + self.relative()
1790 return url
1791
1792 def relative(self):
1793 """Return the relative URL (without scheme and authority).
1794
1795 Authority (see RFC 3986 section 3.2), or netloc in the
1796 terminology of urllib.parse, basically means the hostname
1797 here. The relative URL is good for making HTTP(S) requests to a
1798 known host.
1799
1800 Returns
1801 -------
1802 str
1803
1804 """
1805 rel = self.path
1806 if self.params:
1807 rel += ';' + self.params
1808 if self.query:
1809 rel += '?' + self.query
1810 if self.fragment:
1811 rel += '#' + self.fragment
1812 return rel
1813
1814 def update(self, opts=None, **kwargs):
1815 """Update the URL with the given options.
1816
1817 Parameters
1818 ----------
1819 opts : dict or argparse.Namespace, optional
1820 Carries options that affect the Google Search/News URL. The
1821 list of currently recognized option keys with expected value
1822 types:
1823
1824 duration: str (GooglerArgumentParser.is_duration)
1825 exact: bool
1826 keywords: str or list of strs
1827 lang: str
1828 news: bool
1829 videos: bool
1830 num: int
1831 site: str
1832 start: int
1833 tld: str
1834 unfilter: bool
1835
1836 Other Parameters
1837 ----------------
1838 kwargs
1839 The `kwargs` dict extends `opts`, that is, options can be
1840 specified either way, in `opts` or as individual keyword
1841 arguments.
1842
1843 """
1844
1845 if opts is None:
1846 opts = {}
1847 if hasattr(opts, '__dict__'):
1848 opts = opts.__dict__
1849 opts.update(kwargs)
1850
1851 qd = self._query_dict
1852 if opts.get('duration'):
1853 qd['tbs'] = 'qdr:%s' % opts['duration']
1854 if 'exact' in opts:
1855 if opts['exact']:
1856 qd['nfpr'] = 1
1857 else:
1858 qd.pop('nfpr', None)
1859 if opts.get('from') or opts.get('to'):
1860 cd_min = opts.get('from') or ''
1861 cd_max = opts.get('to') or ''
1862 qd['tbs'] = 'cdr:1,cd_min:%s,cd_max:%s' % (cd_min, cd_max)
1863 if 'keywords' in opts:
1864 self._keywords = opts['keywords']
1865 if 'lang' in opts and opts['lang']:
1866 qd['hl'] = opts['lang']
1867 if 'geoloc' in opts and opts['geoloc']:
1868 qd['gl'] = opts['geoloc']
1869 if 'news' in opts and opts['news']:
1870 qd['tbm'] = 'nws'
1871 elif 'videos' in opts and opts['videos']:
1872 qd['tbm'] = 'vid'
1873 else:
1874 qd.pop('tbm', None)
1875 if 'num' in opts:
1876 self._num = opts['num']
1877 if 'sites' in opts:
1878 self._sites = opts['sites']
1879 if 'exclude' in opts:
1880 self._exclude = opts['exclude']
1881 if 'start' in opts:
1882 self._start = opts['start']
1883 if 'tld' in opts:
1884 self._tld = opts['tld']
1885 if 'unfilter' in opts and opts['unfilter']:
1886 qd['filter'] = 0
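# Typical use (hypothetical values): url.update(news=True, num=25)
# switches the URL to the Google News vertical and requests 25
# results per page.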
1887
1888 def set_queries(self, **kwargs):
1889 """Forcefully set queries outside the normal `update` mechanism.
1890
1891 Other Parameters
1892 ----------------
1893 kwargs
1894 Arbitrary key value pairs to be set in the query string. All
1895 keys and values should be stringifiable.
1896
1897 Note that certain keys, e.g., ``q``, have their values
1898 constructed on the fly, so setting those has no actual
1899 effect.
1900
1901 """
1902 for k, v in kwargs.items():
1903 self._query_dict[k] = v
1904
1905 def unset_queries(self, *args):
1906 """Forcefully unset queries outside the normal `update` mechanism.
1907
1908 Other Parameters
1909 ----------------
1910 args
1911 Arbitrary keys to be unset. No exception is raised if a key
1912 does not exist in the first place.
1913
1914 Note that certain keys, e.g., ``q``, are always included in
1915 the resulting URL, so unsetting those has no actual effect.
1916
1917 """
1918 for k in args:
1919 self._query_dict.pop(k, None)
1920
1921 def next_page(self):
1922 """Navigate to the next page."""
1923 self._start += self._num
1924
1925 def prev_page(self):
1926 """Navigate to the previous page.
1927
1928 Raises
1929 ------
1930 ValueError
1931 If already at the first page (``start=0`` in the current
1932 query string).
1933
1934 """
1935 if self._start == 0:
1936 raise ValueError('Already at the first page.')
1937 self._start = (self._start - self._num) if self._start > self._num else 0
1938
1939 def first_page(self):
1940 """Navigate to the first page.
1941
1942 Raises
1943 ------
1944 ValueError
1945 If already at the first page (``start=0`` in the current
1946 query string).
1947
1948 """
1949 if self._start == 0:
1950 raise ValueError('Already at the first page.')
1951 self._start = 0
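# Paging example (hypothetical state): with num=10 and start=25,
# next_page() moves start to 35; prev_page() steps it back to 15,
# then 5, then clamps at 0; first_page() rewinds straight to 0.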
1952
1953 # Data source: https://web.archive.org/web/20170615200243/https://en.wikipedia.org/wiki/List_of_Google_domains
1954 # Scraper script: https://gist.github.com/zmwangx/b976e83c14552fe18b71
1955 TLD_TO_DOMAIN_MAP = {
1956 'ac': 'google.ac', 'ad': 'google.ad', 'ae': 'google.ae',
1957 'af': 'google.com.af', 'ag': 'google.com.ag', 'ai': 'google.com.ai',
1958 'al': 'google.al', 'am': 'google.am', 'ao': 'google.co.ao',
1959 'ar': 'google.com.ar', 'as': 'google.as', 'at': 'google.at',
1960 'au': 'google.com.au', 'az': 'google.az', 'ba': 'google.ba',
1961 'bd': 'google.com.bd', 'be': 'google.be', 'bf': 'google.bf',
1962 'bg': 'google.bg', 'bh': 'google.com.bh', 'bi': 'google.bi',
1963 'bj': 'google.bj', 'bn': 'google.com.bn', 'bo': 'google.com.bo',
1964 'br': 'google.com.br', 'bs': 'google.bs', 'bt': 'google.bt',
1965 'bw': 'google.co.bw', 'by': 'google.by', 'bz': 'google.com.bz',
1966 'ca': 'google.ca', 'cat': 'google.cat', 'cc': 'google.cc',
1967 'cd': 'google.cd', 'cf': 'google.cf', 'cg': 'google.cg',
1968 'ch': 'google.ch', 'ci': 'google.ci', 'ck': 'google.co.ck',
1969 'cl': 'google.cl', 'cm': 'google.cm', 'cn': 'google.cn',
1970 'co': 'google.com.co', 'cr': 'google.co.cr', 'cu': 'google.com.cu',
1971 'cv': 'google.cv', 'cy': 'google.com.cy', 'cz': 'google.cz',
1972 'de': 'google.de', 'dj': 'google.dj', 'dk': 'google.dk',
1973 'dm': 'google.dm', 'do': 'google.com.do', 'dz': 'google.dz',
1974 'ec': 'google.com.ec', 'ee': 'google.ee', 'eg': 'google.com.eg',
1975 'es': 'google.es', 'et': 'google.com.et', 'fi': 'google.fi',
1976 'fj': 'google.com.fj', 'fm': 'google.fm', 'fr': 'google.fr',
1977 'ga': 'google.ga', 'ge': 'google.ge', 'gf': 'google.gf',
1978 'gg': 'google.gg', 'gh': 'google.com.gh', 'gi': 'google.com.gi',
1979 'gl': 'google.gl', 'gm': 'google.gm', 'gp': 'google.gp',
1980 'gr': 'google.gr', 'gt': 'google.com.gt', 'gy': 'google.gy',
1981 'hk': 'google.com.hk', 'hn': 'google.hn', 'hr': 'google.hr',
1982 'ht': 'google.ht', 'hu': 'google.hu', 'id': 'google.co.id',
1983 'ie': 'google.ie', 'il': 'google.co.il', 'im': 'google.im',
1984 'in': 'google.co.in', 'io': 'google.io', 'iq': 'google.iq',
1985 'is': 'google.is', 'it': 'google.it', 'je': 'google.je',
1986 'jm': 'google.com.jm', 'jo': 'google.jo', 'jp': 'google.co.jp',
1987 'ke': 'google.co.ke', 'kg': 'google.kg', 'kh': 'google.com.kh',
1988 'ki': 'google.ki', 'kr': 'google.co.kr', 'kw': 'google.com.kw',
1989 'kz': 'google.kz', 'la': 'google.la', 'lb': 'google.com.lb',
1990 'lc': 'google.com.lc', 'li': 'google.li', 'lk': 'google.lk',
1991 'ls': 'google.co.ls', 'lt': 'google.lt', 'lu': 'google.lu',
1992 'lv': 'google.lv', 'ly': 'google.com.ly', 'ma': 'google.co.ma',
1993 'md': 'google.md', 'me': 'google.me', 'mg': 'google.mg',
1994 'mk': 'google.mk', 'ml': 'google.ml', 'mm': 'google.com.mm',
1995 'mn': 'google.mn', 'ms': 'google.ms', 'mt': 'google.com.mt',
1996 'mu': 'google.mu', 'mv': 'google.mv', 'mw': 'google.mw',
1997 'mx': 'google.com.mx', 'my': 'google.com.my', 'mz': 'google.co.mz',
1998 'na': 'google.com.na', 'ne': 'google.ne', 'nf': 'google.com.nf',
1999 'ng': 'google.com.ng', 'ni': 'google.com.ni', 'nl': 'google.nl',
2000 'no': 'google.no', 'np': 'google.com.np', 'nr': 'google.nr',
2001 'nu': 'google.nu', 'nz': 'google.co.nz', 'om': 'google.com.om',
2002 'pa': 'google.com.pa', 'pe': 'google.com.pe', 'pg': 'google.com.pg',
2003 'ph': 'google.com.ph', 'pk': 'google.com.pk', 'pl': 'google.pl',
2004 'pn': 'google.co.pn', 'pr': 'google.com.pr', 'ps': 'google.ps',
2005 'pt': 'google.pt', 'py': 'google.com.py', 'qa': 'google.com.qa',
2006 'ro': 'google.ro', 'rs': 'google.rs', 'ru': 'google.ru',
2007 'rw': 'google.rw', 'sa': 'google.com.sa', 'sb': 'google.com.sb',
2008 'sc': 'google.sc', 'se': 'google.se', 'sg': 'google.com.sg',
2009 'sh': 'google.sh', 'si': 'google.si', 'sk': 'google.sk',
2010 'sl': 'google.com.sl', 'sm': 'google.sm', 'sn': 'google.sn',
2011 'so': 'google.so', 'sr': 'google.sr', 'st': 'google.st',
2012 'sv': 'google.com.sv', 'td': 'google.td', 'tg': 'google.tg',
2013 'th': 'google.co.th', 'tj': 'google.com.tj', 'tk': 'google.tk',
2014 'tl': 'google.tl', 'tm': 'google.tm', 'tn': 'google.tn',
2015 'to': 'google.to', 'tr': 'google.com.tr', 'tt': 'google.tt',
2016 'tw': 'google.com.tw', 'tz': 'google.co.tz', 'ua': 'google.com.ua',
2017 'ug': 'google.co.ug', 'uk': 'google.co.uk', 'uy': 'google.com.uy',
2018 'uz': 'google.co.uz', 'vc': 'google.com.vc', 've': 'google.co.ve',
2019 'vg': 'google.vg', 'vi': 'google.co.vi', 'vn': 'google.com.vn',
2020 'vu': 'google.vu', 'ws': 'google.ws', 'za': 'google.co.za',
2021 'zm': 'google.co.zm', 'zw': 'google.co.zw',
2022 }
2023
2024 @property
2025 def netloc(self):
2026 """The hostname."""
2027 try:
2028 return 'www.' + self.TLD_TO_DOMAIN_MAP[self._tld]
2029 except KeyError:
2030 return 'www.google.com'
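# e.g., with tld='in' this resolves to 'www.google.co.in'; an unset
# or unrecognized TLD falls back to 'www.google.com'.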
2031
2032 @property
2033 def query(self):
2034 """The query string."""
2035 qd = {}
2036 qd.update(self._query_dict)
2037 if self._num != 10: # Skip sending the default
2038 qd['num'] = self._num
2039 if self._start: # Skip sending the default
2040 qd['start'] = self._start
2041
2042 # Construct the q query
2043 q = ''
2044 keywords = self._keywords
2045 sites = self._sites
2046 exclude = self._exclude
2047 if keywords:
2048 if isinstance(keywords, list):
2049 q += '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
2050 else:
2051 q += urllib.parse.quote_plus(keywords)
2052 if sites:
2053 q += '+OR'.join('+site:' + urllib.parse.quote_plus(site) for site in sites)
2054 if exclude:
2055 q += ''.join('+-site:' + urllib.parse.quote_plus(e) for e in exclude)
2056 qd['q'] = q
2057 return '&'.join('%s=%s' % (k, qd[k]) for k in sorted(qd.keys()))
2058
2059
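# An illustrative sketch (not used by googler itself) of how a 'q' query
# parameter like the one assembled in GoogleUrl.query above is built; the
# keywords, sites and excluded sites below are hypothetical sample values.
def _example_build_q(keywords, sites=None, exclude=None):
    # Keywords are quoted individually and joined with '+'.
    q = '+'.join(urllib.parse.quote_plus(kw) for kw in keywords)
    # Site restrictions are OR'ed together; exclusions are simply appended.
    if sites:
        q += '+OR'.join('+site:' + urllib.parse.quote_plus(s) for s in sites)
    if exclude:
        q += ''.join('+-site:' + urllib.parse.quote_plus(e) for e in exclude)
    return q
# _example_build_q(['hello', 'world'], sites=['example.com'])
# == 'hello+world+site:example.com'
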
2060 class GoogleConnectionError(Exception):
2061 pass
2062
2063
2064 class GoogleConnection(object):
2065 """
2066 This class facilitates connecting to and fetching from Google.
2067
2068 Parameters
2069 ----------
2070 See http.client.HTTPSConnection for documentation of the
2071 parameters.
2072
2073 Raises
2074 ------
2075 GoogleConnectionError
2076
2077 Attributes
2078 ----------
2079 host : str
2080 The currently connected host. Read-only property. Use
2081 `new_connection` to change host.
2082
2083 Methods
2084 -------
2085 new_connection(host=None, port=None, timeout=45)
2086 renew_connection(timeout=45)
2087 fetch_page(url)
2088 close()
2089
2090 """
2091
2092 def __init__(self, host, port=None, address_family=0, timeout=45, proxy=None, notweak=False):
2093 self._host = None
2094 self._port = None
2095 self._address_family = address_family
2096 self._proxy = proxy
2097 self._notweak = notweak
2098 self._conn = None
2099 self.new_connection(host, port=port, timeout=timeout)
2100 self.cookie = ''
2101
2102 @property
2103 def host(self):
2104 """The host currently connected to."""
2105 return self._host
2106
2107 @time_it()
2108 def new_connection(self, host=None, port=None, timeout=45):
2109 """Close the current connection (if any) and establish a new one.
2110
2111 Parameters
2112 ----------
2113 See http.client.HTTPSConnection for documentation of the
2114 parameters. Renew the connection (i.e., reuse the current host
2115 and port) if host is None or empty.
2116
2117 Raises
2118 ------
2119 GoogleConnectionError
2120
2121 """
2122 if self._conn:
2123 self._conn.close()
2124
2125 if not host:
2126 host = self._host
2127 port = self._port
2128 self._host = host
2129 self._port = port
2130 host_display = host + (':%d' % port if port else '')
2131
2132 proxy = self._proxy
2133
2134 if proxy:
2135 proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)
2136
2137 logger.debug('Connecting to proxy server %s', proxy_host_port)
2138 self._conn = HardenedHTTPSConnection(proxy_host_port,
2139 address_family=self._address_family, timeout=timeout)
2140
2141 logger.debug('Tunnelling to host %s', host_display)
2142 connect_headers = {}
2143 if proxy_user_passwd:
2144 connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64encode(
2145 proxy_user_passwd.encode('utf-8')
2146 ).decode('utf-8')
2147 self._conn.set_tunnel(host, port=port, headers=connect_headers)
2148
2149 try:
2150 self._conn.connect(self._notweak)
2151 except Exception as e:
2152 msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
2153 raise GoogleConnectionError(msg)
2154 else:
2155 logger.debug('Connecting to new host %s', host_display)
2156 self._conn = HardenedHTTPSConnection(host, port=port,
2157 address_family=self._address_family, timeout=timeout)
2158 try:
2159 self._conn.connect(self._notweak)
2160 except Exception as e:
2161 msg = 'Failed to connect to %s: %s.' % (host_display, e)
2162 raise GoogleConnectionError(msg)
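# When a proxy is in use, the connection above starts with an HTTP
# CONNECT tunnel; e.g. (hypothetical credentials), 'user:pass' goes
# out as 'Proxy-Authorization: Basic dXNlcjpwYXNz'.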
2163
2164 def renew_connection(self, timeout=45):
2165 """Renew current connection.
2166
2167 Equivalent to ``new_connection(timeout=timeout)``.
2168
2169 """
2170 self.new_connection(timeout=timeout)
2171
2172 @time_it()
2173 def fetch_page(self, url):
2174 """Fetch a URL.
2175
2176 Allows one reconnection and multiple redirections before failing
2177 and raising GoogleConnectionError.
2178
2179 Parameters
2180 ----------
2181 url : str
2182 The URL to fetch, relative to the host.
2183
2184 Raises
2185 ------
2186 GoogleConnectionError
2187 When not getting HTTP 200 even after the allowed one
2188 reconnection and/or up to three redirections, or when Google is
2189 blocking the query due to unusual activity.
2190
2191 Returns
2192 -------
2193 str
2194 Response payload, gunzipped (if applicable) and decoded (in UTF-8).
2195
2196 """
2197 try:
2198 self._raw_get(url)
2199 except (http.client.HTTPException, OSError) as e:
2200 logger.debug('Got exception: %s.', e)
2201 logger.debug('Attempting to reconnect...')
2202 self.renew_connection()
2203 try:
2204 self._raw_get(url)
2205 except http.client.HTTPException as e:
2206 logger.debug('Got exception: %s.', e)
2207 raise GoogleConnectionError("Failed to get '%s'." % url)
2208
2209 resp = self._resp
2210 redirect_counter = 0
2211 while resp.status != 200 and redirect_counter < 3:
2212 if resp.status in {301, 302, 303, 307, 308}:
2213 redirection_url = resp.getheader('location', '')
2214 if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' in redirection_url:
2215 msg = "Connection blocked due to unusual activity.\n"
2216 if self._conn.address_family == socket.AF_INET6:
2217 msg += textwrap.dedent("""\
2218 You are connecting over IPv6 which is likely the problem. Try to make
2219 sure the machine has a working IPv4 network interface configured.
2220 See also the -4, --ipv4 option of googler.\n""")
2221 msg += textwrap.dedent("""\
2222 THIS IS NOT A BUG, please do NOT report it as a bug unless you have specific
2223 information that may lead to the development of a workaround.
2224 Your IP address is temporarily or permanently blocked by Google and requires
2225 reCAPTCHA-solving to use the service, which googler is not capable of.
2226 Possible causes include issuing too many queries in a short time frame, or
2227 operating from a shared / low reputation IP with a history of abuse.
2228 Please do NOT use googler for automated scraping.""")
2230 raise GoogleConnectionError(msg)
2231 self._redirect(redirection_url)
2232 resp = self._resp
2233 redirect_counter += 1
2234 else:
2235 break
2236
2237 if resp.status != 200:
2238 raise GoogleConnectionError('Got HTTP %d: %s' % (resp.status, resp.reason))
2239
2240 payload = resp.read()
2241 try:
2242 return gzip.decompress(payload).decode('utf-8')
2243 except OSError:
2244 # Not gzipped
2245 return payload.decode('utf-8')
2246
2247 def _redirect(self, url):
2248 """Redirect to and fetch a new URL.
2249
2250 Like `_raw_get`, the response is stored in ``self._resp``. A new
2251 connection is made if redirecting to a different host.
2252
2253 Parameters
2254 ----------
2255 url : str
2256 If absolute and points to a different host, make a new
2257 connection.
2258
2259 Raises
2260 ------
2261 GoogleConnectionError
2262
2263 """
2264 logger.debug('Redirecting to URL %s', url)
2265 segments = urllib.parse.urlparse(url)
2266
2267 host = segments.netloc
2268 if host != self._host:
2269 self.new_connection(host)
2270
2271 relurl = urllib.parse.urlunparse(('', '') + segments[2:])
2272 try:
2273 self._raw_get(relurl)
2274 except http.client.HTTPException as e:
2275 logger.debug('Got exception: %s.', e)
2276 raise GoogleConnectionError("Failed to get '%s'." % url)
2277
2278 def _raw_get(self, url):
2279 """Make a raw HTTP GET request.
2280
2281 No status check (which implies no redirection). Response can be
2282 accessed from ``self._resp``.
2283
2284 Parameters
2285 ----------
2286 url : str
2287 URL relative to the host, used in the GET request.
2288
2289 Raises
2290 ------
2291 http.client.HTTPException
2292
2293 """
2294 logger.debug('Fetching URL %s', url)
2295 self._conn.request('GET', url, None, {
2296 'Accept': 'text/html',
2297 'Accept-Encoding': 'gzip',
2298 'User-Agent': USER_AGENT,
2299 'Cookie': self.cookie,
2300 'Connection': 'keep-alive',
2301 'DNT': '1',
2302 })
2303 self._resp = self._conn.getresponse()
2304 if self.cookie == '':
2305 complete_cookie = self._resp.getheader('Set-Cookie')
2306 # Cookie won't be available if already blocked
2307 if complete_cookie is not None:
2308 self.cookie = complete_cookie[:complete_cookie.find(';')]
2309 logger.debug('Cookie: %s', self.cookie)
2310
2311 def close(self):
2312 """Close the connection (if one is active)."""
2313 if self._conn:
2314 self._conn.close()
2315
2316
2317 class GoogleParser(object):
2318
2319 def __init__(self, html, *, news=False, videos=False):
2320 self.news = news
2321 self.videos = videos
2322 self.autocorrected = False
2323 self.showing_results_for = None
2324 self.filtered = False
2325 self.results = []
2326 self.parse(html)
2327
2328 @time_it()
2329 def parse(self, html):
2330 tree = parse_html(html)
2331
2332 if debugger:
2333 printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m variable.\x1b[0m')
2334 printerr('')
2335 try:
2336 import IPython
2337 IPython.embed()
2338 except ImportError:
2339 import pdb
2340 pdb.set_trace()
2341
2342 # cw is short for collapse_whitespace.
2343 cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s
2344
2345 index = 0
2346 for div_g in tree.select_all('div.g'):
2347 if div_g.select('.hp-xpdbox'):
2348 # Skip smart cards.
2349 continue
2350 try:
2351 if div_g.select('.st'):
2352 # Old class structure, stopped working some time in
2353 # September 2020, but kept just in case.
2354 h3 = div_g.select('div.r h3')
2355 if h3:
2356 title = h3.text
2357 a = h3.parent
2358 else:
2359 h3 = div_g.select('h3.r')
2360 a = h3.select('a')
2361 title = a.text
2362 mime = div_g.select('.mime')
2363 if mime:
2364 title = mime.text + ' ' + title
2365 abstract_node = div_g.select('.st')
2366 metadata_node = div_g.select('.f')
2367 else:
2368 # Current structure as of October 2020.
2369 # Note that a filetype tag (e.g. PDF) is now pretty
2370 # damn hard to parse with confidence (that it'll
2371 # survive the slightest further change), so we don't.
2372
2373 # As of January 15th 2021, the html class is not rc anymore, it's tF2Cxc.
2374 # This approach is not very resilient to changes by Google, but it works for now.
2375 # title_node, details_node, *_ = div_g.select_all('div.rc > div')
2376 title_node, details_node, *_ = div_g.select_all('div.tF2Cxc > div')
2377 if 'yuRUbf' not in title_node.classes:
2378 logger.debug('unexpected title node class(es): expected %r, got %r',
2379 'yuRUbf', ' '.join(title_node.classes))
2380 if 'IsZvec' not in details_node.classes:
2381 logger.debug('unexpected details node class(es): expected %r, got %r',
2382 'IsZvec', ' '.join(details_node.classes))
2383 a = title_node.select('a')
2384 h3 = a.select('h3')
2385 title = h3.text
2386 abstract_node = details_node.select('span')
2387 metadata_node = details_node.select('.f, span ~ div')
2388 url = self.unwrap_link(a.attr('href'))
2389 matched_keywords = []
2390 abstract = ''
2391 # BFS descendant nodes. Necessary to locate matches (b,
2392 # em) while skipping metadata (.f).
2393 abstract_nodes = collections.deque([abstract_node])
2394 while abstract_nodes:
2395 node = abstract_nodes.popleft()
2396 if 'f' in node.classes:
2397 # .f is handled as metadata instead.
2398 continue
2399 if node.tag in ['b', 'em']:
2400 matched_keywords.append({'phrase': node.text, 'offset': len(abstract)})
2401 abstract += node.text
2402 continue
2403 if not node.children:
2404 abstract += node.text
2405 continue
2406 for child in node.children:
2407 abstract_nodes.append(child)
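# e.g., an abstract node for '<em>foo</em> bar' yields the abstract
# 'foo bar' with matches [{'phrase': 'foo', 'offset': 0}].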
2408 metadata = None
2409 try:
2410 # Sometimes there are multiple metadata fields
2411 # associated with a single entry, e.g. "Released",
2412 # "Producer(s)", "Genre", etc. for a song (sample
2413 # query: "never gonna give you up"). These need to
2414 # be delimited when displayed.
2415 metadata_fields = metadata_node.select_all('div > div.wFMWsc')
2416 if metadata_fields:
2417 metadata = ' | '.join(field.text for field in metadata_fields)
2418 elif not metadata_node.select('a') and not metadata_node.select('g-expandable-container'):
2419 metadata = metadata_node.text
2420 if metadata:
2421 metadata = (
2422 metadata
2423 .replace('\u200e', '')
2424 .replace(' - ', ', ')
2425 .replace(' \u2014 ', ', ')
2426 .strip().rstrip(',')
2427 )
2428 except AttributeError:
2429 pass
2430 except (AttributeError, ValueError):
2431 continue
2432 sitelinks = []
2433 for td in div_g.select_all('td'):
2434 try:
2435 a = td.select('a')
2436 sl_title = a.text
2437 sl_url = self.unwrap_link(a.attr('href'))
2438 sl_abstract = td.select('div.s.st, div.s .st').text
2439 sitelink = Sitelink(cw(sl_title), sl_url, cw(sl_abstract))
2440 if sitelink not in sitelinks:
2441 sitelinks.append(sitelink)
2442 except (AttributeError, ValueError):
2443 continue
2444 # cw cannot be applied to abstract here since it may screw
2445 # up offsets of matches. Instead, each relevant node's text
2446 # is whitespace-collapsed before being appended to abstract.
2447 # We then hope for the best.
2448 result = Result(index + 1, cw(title), url, abstract,
2449 metadata=cw(metadata), sitelinks=sitelinks, matches=matched_keywords)
2450 if result not in self.results:
2451 self.results.append(result)
2452 index += 1
2453
2454 if not self.results:
2455 for card in tree.select_all('g-card'):
2456 a = card.select('a[href]')
2457 if not a:
2458 continue
2459 url = self.unwrap_link(a.attr('href'))
2460 text_nodes = []
2461 for node in a.descendants():
2462 if isinstance(node, TextNode) and node.strip():
2463 text_nodes.append(node.text)
2464 if len(text_nodes) != 4:
2465 continue
2466 publisher, title, abstract, publishing_time = text_nodes
2467 metadata = '%s, %s' % (publisher, publishing_time)
2468 index += 1
2469 self.results.append(Result(index, cw(title), url, cw(abstract), metadata=cw(metadata)))
2470
2471 # Showing results for ...
2472 # Search instead for ...
2473 spell_orig = tree.select("span.spell_orig")
2474 if spell_orig:
2475 showing_results_for_link = next(
2476 filter(lambda el: el.tag == "a", spell_orig.previous_siblings()), None
2477 )
2478 if showing_results_for_link:
2479 self.autocorrected = True
2480 self.showing_results_for = showing_results_for_link.text
2481
2482 # No results found for ...
2483 # Results for ...:
2484 alt_query_infobox = tree.select('#topstuff')
2485 if alt_query_infobox:
2486 bolds = alt_query_infobox.select_all('div b')
2487 if len(bolds) == 2:
2488 self.showing_results_for = bolds[1].text
2489
2490 # In order to show you the most relevant results, we have
2491 # omitted some entries very similar to the N already displayed.
2492 # ...
2493 self.filtered = tree.select('p#ofr') is not None
2494
2495 # Unwraps /url?q=http://...&sa=...
2496 # TODO: don't unwrap if URL isn't in this form.
2497 @staticmethod
2498 def unwrap_link(link):
2499 qs = urllib.parse.urlparse(link).query
2500 try:
2501 url = urllib.parse.parse_qs(qs)['q'][0]
2502 except KeyError:
2503 return link
2504 else:
2505 if "://" in url:
2506 return url
2507 else:
2508 # Google's internal services link, e.g.,
2509 # /search?q=google&..., which cannot be unwrapped into
2510 # an actual URL.
2511 raise ValueError(link)
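# For instance (hypothetical links):
#   unwrap_link('/url?q=https://example.com/&sa=U') -> 'https://example.com/'
#   unwrap_link('/search?q=google') raises ValueError (not an actual URL)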
2512
2513
2514 class Sitelink(object):
2515 """Container for a sitelink."""
2516
2517 def __init__(self, title, url, abstract):
2518 self.title = title
2519 self.url = url
2520 self.abstract = abstract
2521 self.index = ''
2522
2523 def __eq__(self, other):
2524 return (
2525 self.title == other.title and
2526 self.url == other.url and
2527 self.abstract == other.abstract
2528 )
2529
2530 def __hash__(self):
2531 return hash((self.title, self.url, self.abstract))
2532
2533
2534 Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract, prompt, reset')
2535
2536
2537 class Result(object):
2538 """
2539 Container for one search result, with output helpers.
2540
2541 Parameters
2542 ----------
2543 index : int or str
2544 title : str
2545 url : str
2546 abstract : str
2547 metadata : str, optional
2548 Auxiliary information, e.g., publisher name and publishing time
2549 for Google News results.
2550 sitelinks : list, optional
2551 List of ``Sitelink`` objects.
2552
2553 Attributes
2554 ----------
2555 index : str
2556 title : str
2557 url : str
2558 abstract : str
2559 metadata : str or None
2560 sitelinks : list
2561 matches : list
2562
2563 Class Variables
2564 ---------------
2565 colors : Colors or None
urlexpand : bool
2566
2567 Methods
2568 -------
2569 print()
2570 jsonizable_object()
2571 urltable()
2572
2573 """
2574
2575 # Class variables
2576 colors = None
2577 urlexpand = True
2578
2579 def __init__(self, index, title, url, abstract, metadata=None, sitelinks=None, matches=None):
2580 index = str(index)
2581 self.index = index
2582 self.title = title
2583 self.url = url
2584 self.abstract = abstract
2585 self.metadata = metadata
2586 self.sitelinks = [] if sitelinks is None else sitelinks
2587 self.matches = [] if matches is None else matches
2588
2589 self._urltable = {index: url}
2590 subindex = 'a'
2591 for sitelink in self.sitelinks:
2592 fullindex = index + subindex
2593 sitelink.index = fullindex
2594 self._urltable[fullindex] = sitelink.url
2595 subindex = chr(ord(subindex) + 1)
2596
2597 def __eq__(self, other):
2598 return (
2599 self.title == other.title and
2600 self.url == other.url and
2601 self.abstract == other.abstract and
2602 self.metadata == other.metadata and
2603 self.sitelinks == other.sitelinks and
2604 self.matches == other.matches
2605 )
2606
2607 def __hash__(self):
2608 sitelinks_hashable = tuple(self.sitelinks) if self.sitelinks is not None else None
2609 matches_hashable = tuple(self.matches) if self.matches is not None else None
2610 return hash((self.title, self.url, self.abstract, self.metadata, sitelinks_hashable, matches_hashable))
2611
2612 def _print_title_and_url(self, index, title, url, indent=0):
2613 colors = self.colors
2614
2615 if not self.urlexpand:
2616 url = '[' + urllib.parse.urlparse(url).netloc + ']'
2617
2618 if colors:
2619 # Adjust index to print result index clearly
2620 print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colors.reset), end='')
2621 if not self.urlexpand:
2622 print(' ' + colors.title + title + colors.reset + ' ' + colors.url + url + colors.reset)
2623 else:
2624 print(' ' + colors.title + title + colors.reset)
2625 print(' ' * (indent + 5) + colors.url + url + colors.reset)
2626 else:
2627 if self.urlexpand:
2628 print(' %s%-3s %s' % (' ' * indent, index + '.', title))
2629 print(' %s%s' % (' ' * (indent + 4), url))
2630 else:
2631 print(' %s%-3s %s %s' % (' ' * indent, index + '.', title, url))
2632
2633 def _print_metadata_and_abstract(self, abstract, metadata=None, matches=None, indent=0):
2634 colors = self.colors
2635 try:
2636 columns, _ = os.get_terminal_size()
2637 except OSError:
2638 columns = 0
2639
2640 if metadata:
2641 if colors:
2642 print(' ' * (indent + 5) + colors.metadata + metadata + colors.reset)
2643 else:
2644 print(' ' * (indent + 5) + metadata)
2645
2646 if abstract:
2647 fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
2648 wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
2649 if colors:
2650 # Highlight matches.
2651 for match in matches or []:
2652 offset = match['offset']
2653 span = len(match['phrase'])
2654 wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
2655 wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)
2656
2657 if colors:
2658 print(colors.abstract, end='')
2659 for line in wrapped_abstract.lines:
2660 print('%s%s' % (' ' * (indent + 5), line))
2661 if colors:
2662 print(colors.reset, end='')
2663
2664 print('')
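# For example, a match {'phrase': 'foo', 'offset': 4} brightens the
# three characters at (pre-wrap) offset 4 of the abstract by inserting
# '\x1b[1m' before them and '\x1b[0m' after.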
2665
2666 def print(self):
2667 """Print the result entry."""
2668 self._print_title_and_url(self.index, self.title, self.url)
2669 self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
2670
2671 for sitelink in self.sitelinks:
2672 self._print_title_and_url(sitelink.index, sitelink.title, sitelink.url, indent=4)
2673 self._print_metadata_and_abstract(sitelink.abstract, indent=4)
2674
2675 def jsonizable_object(self):
2676 """Return a JSON-serializable dict representing the result entry."""
2677 obj = {
2678 'title': self.title,
2679 'url': self.url,
2680 'abstract': self.abstract
2681 }
2682 if self.metadata:
2683 obj['metadata'] = self.metadata
2684 if self.sitelinks:
2685 obj['sitelinks'] = [sitelink.__dict__ for sitelink in self.sitelinks]
2686 if self.matches:
2687 obj['matches'] = self.matches
2688 return obj
2689
2690 def urltable(self):
2691 """Return a index-to-URL table for the current result.
2692
2693 Normally, the table contains only a single entry, but when the result
2694 contains sitelinks, all sitelinks are included in this table.
2695
2696 Returns
2697 -------
2698 dict
2699 A dict mapping indices (strs) to URLs (also strs). Indices of
2700 sitelinks are the original index with a lowercase letter a,
2701 b, c, etc. appended.
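
For example (hypothetical URLs), a result at index 2 with two
sitelinks maps as {'2': url, '2a': url_a, '2b': url_b}.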
2702
2703 """
2704 return self._urltable
2705
2706 @staticmethod
2707 def collapse_whitespace(s):
2708 return re.sub(r'[ \t\n\r]+', ' ', s)
2709
2710
2711 class GooglerCmdException(Exception):
2712 pass
2713
2714
2715 class NoKeywordsException(GooglerCmdException):
2716 pass
2717
2718
2719 def require_keywords(method):
2720 # Require keywords to be set before we run a GooglerCmd method. If
2721 # no keywords have been set, raise a NoKeywordsException.
2722 @functools.wraps(method)
2723 def enforced_method(self, *args, **kwargs):
2724 if not self.keywords:
2725 raise NoKeywordsException('No keywords.')
2726 method(self, *args, **kwargs)
2727
2728 return enforced_method
2729
2730
2731 def no_argument(method):
2732 # Normalize a do_* method of GooglerCmd that takes no argument to
2733 # one that takes an arg, but issue a warning when a nonempty
2734 # argument is given.
2735 @functools.wraps(method)
2736 def enforced_method(self, arg):
2737 if arg:
2738 method_name = method.__name__
2739 command_name = method_name[3:] if method_name.startswith('do_') else method_name
2740 logger.warning("Argument to the '%s' command ignored.", command_name)
2741 method(self)
2742
2743 return enforced_method
2744
2745
2746 class GooglerCmd(object):
2747 """
2748 Command line interpreter and executor class for googler.
2749
2750 Inspired by PSL cmd.Cmd.
2751
2752 Parameters
2753 ----------
2754 opts : argparse.Namespace
2755 Options and/or arguments.
2756
2757 Attributes
2758 ----------
2759 options : argparse.Namespace
2760 Options that are currently in effect. Read-only attribute.
2761 keywords : str or list of strs
2762 Current keywords. Read-only attribute.
2763
2764 Methods
2765 -------
2766 fetch()
2767 display_results(prelude='\n', json_output=False)
2768 fetch_and_display(prelude='\n', json_output=False, interactive=True)
2769 read_next_command()
2770 help()
2771 cmdloop()
2772 """
2773
2774 # Class variables
2775 colors = None
2776 re_url_index = re.compile(r"\d+[a-z]?")
2777
2778 def __init__(self, opts):
2779 super().__init__()
2780
2781 self._opts = opts
2782
2783 self._google_url = GoogleUrl(opts)
2784
2785 if opts.html_file:
2786 # Preloaded HTML parsing mode, do not initialize connection.
2787 self._preload_from_file = opts.html_file
2788 self._conn = None
2789 else:
2790 self._preload_from_file = None
2791 proxy = opts.proxy if hasattr(opts, 'proxy') else None
2792 self._conn = GoogleConnection(self._google_url.hostname,
2793 address_family=opts.address_family,
2794 proxy=proxy,
2795 notweak=opts.notweak)
2796 atexit.register(self._conn.close)
2797
2798 self.results = []
2799 self._autocorrected = None
2800 self._showing_results_for = None
2801 self._results_filtered = False
2802 self._urltable = {}
2803
2804 self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None else False
2805
2806 self.no_results_instructions_shown = False
2807
2808 @property
2809 def options(self):
2810 """Current options."""
2811 return self._opts
2812
2813 @property
2814 def keywords(self):
2815 """Current keywords."""
2816 return self._google_url.keywords
2817
2818 @require_keywords
2819 def fetch(self):
2820 """Fetch a page and parse for results.
2821
2822 Results are stored in ``self.results``.
2823
2824 Raises
2825 ------
2826 GoogleConnectionError
2827
2828 See Also
2829 --------
2830 fetch_and_display
2831
2832 """
2833 # This method also sets self._results_filtered and
2834 # self._urltable.
2835 if self._preload_from_file:
2836 with open(self._preload_from_file, encoding='utf-8') as fp:
2837 page = fp.read()
2838 else:
2839 page = self._conn.fetch_page(self._google_url.relative())
2840 if logger.isEnabledFor(logging.DEBUG):
2841 import tempfile
2842 fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
2843 os.close(fd)
2844 with open(tmpfile, 'w', encoding='utf-8') as fp:
2845 fp.write(page)
2846 logger.debug("Response body written to '%s'.", tmpfile)
2847
2848 parser = GoogleParser(page, news=self._google_url.news, videos=self._google_url.videos)
2849
2850 self.results = parser.results
2851 self._autocorrected = parser.autocorrected
2852 self._showing_results_for = parser.showing_results_for
2853 self._results_filtered = parser.filtered
2854 self._urltable = {}
2855 for r in self.results:
2856 self._urltable.update(r.urltable())
2857
2858 def warn_no_results(self):
2859 printerr('No results.')
2860 if self.no_results_instructions_shown:
2861 return
2862
2863 try:
2864 import json
2865 import urllib.error
2866 import urllib.request
2867 info_json_url = '%s/master/info.json' % RAW_DOWNLOAD_REPO_BASE
2868 logger.debug('Fetching %s for project status...', info_json_url)
2869 try:
2870 with urllib.request.urlopen(info_json_url, timeout=5) as response:
2871 try:
2872 info = json.load(response)
2873 except Exception:
2874 logger.error('Failed to decode project status from %s', info_json_url)
2875 raise RuntimeError
2876 except urllib.error.HTTPError as e:
2877 logger.error('Failed to fetch project status from %s: HTTP %d', info_json_url, e.code)
2878 raise RuntimeError
2879 epoch = info.get('epoch')
2880 if epoch > _EPOCH_:
2881 printerr('Your version of googler is broken due to Google-side changes.')
2882 tracking_issue = info.get('tracking_issue')
2883 fixed_on_master = info.get('fixed_on_master')
2884 fixed_in_release = info.get('fixed_in_release')
2885 if fixed_in_release:
2886 printerr('A new version, %s, has been released to address the changes.' % fixed_in_release)
2887 printerr('Please upgrade to the latest version.')
2888 elif fixed_on_master:
2889 printerr('The fix has been pushed to master, pending a release.')
2890 printerr('Please download the master version https://git.io/googler or wait for a release.')
2891 else:
2892 printerr('The issue is tracked at https://github.com/jarun/googler/issues/%s.' % tracking_issue)
2893 return
2894 except RuntimeError:
2895 pass
2896
2897 printerr('If you believe this is a bug, please review '
2898 'https://git.io/googler-no-results before submitting a bug report.')
2899 self.no_results_instructions_shown = True
2900
2901 @require_keywords
2902 def display_results(self, prelude='\n', json_output=False):
2903 """Display results stored in ``self.results``.
2904
2905 Parameters
2906 ----------
2907 See `fetch_and_display`.
2908
2909 """
2910 if json_output:
2911 # JSON output
2912 import json
2913 results_object = [r.jsonizable_object() for r in self.results]
2914 print(json.dumps(results_object, indent=2, sort_keys=True, ensure_ascii=False))
2915 else:
2916 # Regular output
2917 if not self.results:
2918 self.warn_no_results()
2919 else:
2920 sys.stderr.write(prelude)
2921 for r in self.results:
2922 r.print()
2923
2924 @require_keywords
2925 def showing_results_for_alert(self, interactive=True):
2926 colors = self.colors
2927 if self._showing_results_for:
2928 if colors:
2929 # Underline the query
2930 actual_query = '\x1b[4m' + self._showing_results_for + '\x1b[24m'
2931 else:
2932 actual_query = self._showing_results_for
2933 if self._autocorrected:
2934 if interactive:
2935 info = 'Showing results for %s; enter "x" for an exact search.' % actual_query
2936 else:
2937 info = 'Showing results for %s; use -x, --exact for an exact search.' % actual_query
2938 else:
2939 info = 'No results found; showing results for %s.' % actual_query
2940 if interactive:
2941 printerr('')
2942 if colors:
2943 printerr(colors.prompt + info + colors.reset)
2944 else:
2945 printerr('** ' + info)
2946
2947 @require_keywords
2948 def fetch_and_display(self, prelude='\n', json_output=False, interactive=True):
2949 """Fetch a page and display results.
2950
2951 Results are stored in ``self.results``.
2952
2953 Parameters
2954 ----------
2955 prelude : str, optional
2956 A string that is written to stderr before showing actual results,
2957 usually serving as a separator. Default is an empty line.
2958 json_output : bool, optional
2959 Whether to dump results in JSON format. Default is False.
2960 interactive : bool, optional
2961 Whether to show contextual instructions, when e.g. Google
2962 has filtered the results. Default is True.
2963
2964 Raises
2965 ------
2966 GoogleConnectionError
2967
2968 See Also
2969 --------
2970 fetch
2971 display_results
2972
2973 """
2974 self.fetch()
2975 self.showing_results_for_alert()
2976 self.display_results(prelude=prelude, json_output=json_output)
2977 if self._results_filtered:
2978 colors = self.colors
2979 info = 'Enter "unfilter" to show similar results Google omitted.'
2980 if colors:
2981 printerr(colors.prompt + info + colors.reset)
2982 else:
2983 printerr('** ' + info)
2984 printerr('')
2985
2986 def read_next_command(self):
2987 """Show omniprompt and read user command line.
2988
2989 Command line is always stripped, and each consecutive group of
2990 whitespace is replaced with a single space character. If the
2991 command line is empty after stripping, then ignore it and keep
2992 reading. Exit with status 0 if we get EOF or an empty line
2993 (pre-strip, that is, a raw <enter>) twice in a row.
2994
2995 The new command line (non-empty) is stored in ``self.cmd``.
2996
2997 """
2998 colors = self.colors
2999 message = 'googler (? for help)'
3000 prompt = (colors.prompt + message + colors.reset + ' ') if (colors and self.promptcolor) else (message + ': ')
3001 enter_count = 0
3002 while True:
3003 try:
3004 cmd = input(prompt)
3005 except EOFError:
3006 sys.exit(0)
3007
3008 if not cmd:
3009 enter_count += 1
3010 if enter_count == 2:
3011 # Double <enter>
3012 sys.exit(0)
3013 else:
3014 enter_count = 0
3015
3016 cmd = ' '.join(cmd.split())
3017 if cmd:
3018 self.cmd = cmd
3019 break
3020
3021 @staticmethod
3022 def help():
3023 GooglerArgumentParser.print_omniprompt_help(sys.stderr)
3024 printerr('')
3025
3026 @require_keywords
3027 @no_argument
3028 def do_first(self):
3029 try:
3030 self._google_url.first_page()
3031 except ValueError as e:
3032 print(e, file=sys.stderr)
3033 return
3034
3035 self.fetch_and_display()
3036
3037 def do_google(self, arg):
3038 # Update keywords and reconstruct URL
3039 self._opts.keywords = arg
3040 self._google_url = GoogleUrl(self._opts)
3041 self.fetch_and_display()
3042
3043 @require_keywords
3044 @no_argument
3045 def do_next(self):
3046 # If > 5 results are being fetched each time,
3047 # block next when no parsed results in current fetch
3048 if not self.results and self._google_url._num > 5:
3049 printerr('No results.')
3050 else:
3051 self._google_url.next_page()
3052 self.fetch_and_display()
3053
3054 @require_keywords
3055 def do_open(self, *args):
3056 if not args:
3057 open_url(self._google_url.full())
3058 return
3059
3060 for nav in args:
3061 if nav == 'a':
3062 for key in sorted(self._urltable):
3063 open_url(self._urltable[key])
3064 elif nav in self._urltable:
3065 open_url(self._urltable[nav])
3066 elif '-' in nav:
3067 try:
3068 vals = [int(x) for x in nav.split('-')]
3069 if (len(vals) != 2):
3070 printerr('Invalid range %s.' % nav)
3071 continue
3072
3073 if vals[0] > vals[1]:
3074 vals[0], vals[1] = vals[1], vals[0]
3075
3076 for _id in range(vals[0], vals[1] + 1):
3077 if str(_id) in self._urltable:
3078 open_url(self._urltable[str(_id)])
3079 else:
3080 printerr('Invalid index %s.' % _id)
3081 except ValueError:
3082 printerr('Invalid range %s.' % nav)
3083 else:
3084 printerr('Invalid index %s.' % nav)
3085
3086 @require_keywords
3087 @no_argument
3088 def do_previous(self):
3089 try:
3090 self._google_url.prev_page()
3091 except ValueError as e:
3092 print(e, file=sys.stderr)
3093 return
3094
3095 self.fetch_and_display()
3096
3097 @require_keywords
3098 @no_argument
3099 def do_exact(self):
3100 # Reset start to 0 when exact is applied.
3101 self._google_url.update(start=0, exact=True)
3102 self.fetch_and_display()
3103
3104 @require_keywords
3105 @no_argument
3106 def do_unfilter(self):
3107 # Reset start to 0 when unfilter is applied.
3108 self._google_url.update(start=0)
3109 self._google_url.set_queries(filter=0)
3110 self.fetch_and_display()
3111
3112 def copy_url(self, idx):
3113 try:
3114 try:
3115 content = self._urltable[idx].encode('utf-8')
3116 except KeyError:
3117 printerr('Invalid index.')
3118 return
3119
3120 # try copying the url to clipboard using native utilities
3121 copier_params = []
3122 if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
3123 if shutil.which('xsel') is not None:
3124 copier_params = ['xsel', '-b', '-i']
3125 elif shutil.which('xclip') is not None:
3126 copier_params = ['xclip', '-selection', 'clipboard']
3127 elif shutil.which('wl-copy') is not None:
3128 copier_params = ['wl-copy']
3129 elif shutil.which('termux-clipboard-set') is not None:
3130 copier_params = ['termux-clipboard-set']
3131 elif sys.platform == 'darwin':
3132 copier_params = ['pbcopy']
3133 elif sys.platform == 'win32':
3134 copier_params = ['clip']
3135
3136 if copier_params:
3137 Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL).communicate(content)
3138 return
3139
3140 # If native clipboard utilities are absent, try to use terminal multiplexers
3141 # tmux
3142 if os.getenv('TMUX_PANE'):
3143 copier_params = ['tmux', 'set-buffer']
3144 Popen(copier_params + [content], stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
3145 return
3146
3147 # GNU Screen paste buffer
3148 if os.getenv('STY'):
3149 import tempfile
3150 copier_params = ['screen', '-X', 'readbuf', '-e', 'utf8']
3151 tmpfd, tmppath = tempfile.mkstemp()
3152 try:
3153 with os.fdopen(tmpfd, 'wb') as fp:
3154 fp.write(content)
3155 copier_params.append(tmppath)
3156 Popen(copier_params, stdin=DEVNULL, stdout=DEVNULL, stderr=DEVNULL).communicate()
3157 finally:
3158 os.unlink(tmppath)
3159 return
3160
3161 printerr('Failed to locate a suitable clipboard utility.')
3162 except Exception:
3163 raise NoKeywordsException
3164
3165 def cmdloop(self):
3166 """Run REPL."""
3167 if self.keywords:
3168 self.fetch_and_display()
3169 else:
3170 printerr('Please initiate a query.')
3171
3172 while True:
3173 self.read_next_command()
3174 # TODO: Automatic dispatcher
3175 #
3176 # We can't write a dispatcher for now because that could
3177 # change behaviour of the prompt. However, we have already
3178 # laid a lot of ground work for the dispatcher, e.g., the
3179 # `no_argument' decorator.
3180 try:
3181 cmd = self.cmd
3182 if cmd == 'f':
3183 self.do_first('')
3184 elif cmd.startswith('g '):
3185 self.do_google(cmd[2:])
3186 elif cmd == 'n':
3187 self.do_next('')
3188 elif cmd == 'o':
3189 self.do_open()
3190 elif cmd.startswith('o '):
3191 self.do_open(*cmd[2:].split())
3192 elif cmd.startswith('O '):
3193 open_url.override_text_browser = True
3194 self.do_open(*cmd[2:].split())
3195 open_url.override_text_browser = False
3196 elif cmd == 'p':
3197 self.do_previous('')
3198 elif cmd == 'q':
3199 break
3200 elif cmd == 'x':
3201 self.do_exact('')
3202 elif cmd == 'unfilter':
3203 self.do_unfilter('')
3204 elif cmd == '?':
3205 self.help()
3206 elif cmd in self._urltable:
3207 open_url(self._urltable[cmd])
3208 elif self.keywords and cmd.isdigit() and int(cmd) < 100:
3209 printerr('Index out of bounds. To search for the number, use g.')
3210 elif cmd == 'u':
3211 Result.urlexpand = not Result.urlexpand
3212 self.display_results()
3213 elif cmd.startswith('c ') and self.re_url_index.match(cmd[2:]):
3214 self.copy_url(cmd[2:])
3215 else:
3216 self.do_google(cmd)
3217 except NoKeywordsException:
3218 printerr('Initiate a query first.')
3219
3220
3221 class GooglerArgumentParser(argparse.ArgumentParser):
3222 """Custom argument parser for googler."""
3223
3224 # Print omniprompt help
3225 @staticmethod
3226 def print_omniprompt_help(file=None):
3227 file = sys.stderr if file is None else file
3228 file.write(textwrap.dedent("""
3229 omniprompt keys:
3230 n, p fetch the next or previous set of search results
3231 index open the result corresponding to index in browser
3232 f jump to the first page
3233 o [index|range|a ...] open space-separated result indices, numeric ranges
3234 (sitelinks unsupported in ranges), or all, in browser
3235 open the current search in browser, if no arguments
3236 O [index|range|a ...] like key 'o', but try to open in a GUI browser
3237 g keywords new Google search for 'keywords' with original options
3238 should be used to search omniprompt keys and indices
3239 c index copy url to clipboard
3240 u toggle url expansion
3241 q, ^D, double Enter exit googler
3242 ? show omniprompt help
3243 * other inputs issue a new search with original options
3244 """))
3245
3246 # Print information on googler
3247 @staticmethod
3248 def print_general_info(file=None):
3249 file = sys.stderr if file is None else file
3250 file.write(textwrap.dedent("""
3251 Version %s
3252 Copyright © 2008 Henri Hakkinen
3253 Copyright © 2015-2021 Arun Prakash Jana <engineerarun@gmail.com>
3254 Zhiming Wang <zmwangx@gmail.com>
3255 License: GPLv3
3256 Webpage: https://github.com/jarun/googler
3257 """ % _VERSION_))
3258
3259 # Augment print_help to print more than synopsis and options
3260 def print_help(self, file=None):
3261 super().print_help(file)
3262 self.print_omniprompt_help(file)
3263 self.print_general_info(file)
3264
3265 # Automatically print full help text on error
3266 def error(self, message):
3267 sys.stderr.write('%s: error: %s\n\n' % (self.prog, message))
3268 self.print_help(sys.stderr)
3269 self.exit(2)
3270
3271 # Type guards
3272 @staticmethod
3273 def positive_int(arg):
3274 """Try to convert a string into a positive integer."""
3275 try:
3276 n = int(arg)
3277 assert n > 0
3278 return n
3279 except (ValueError, AssertionError):
3280 raise argparse.ArgumentTypeError('%s is not a positive integer' % arg)
3281
3282 @staticmethod
3283 def nonnegative_int(arg):
3284 """Try to convert a string into a nonnegative integer."""
3285 try:
3286 n = int(arg)
3287 assert n >= 0
3288 return n
3289 except (ValueError, AssertionError):
3290 raise argparse.ArgumentTypeError('%s is not a non-negative integer' % arg)
3291
3292 @staticmethod
3293 def is_duration(arg):
3294 """Check if a string is a valid duration accepted by Google.
3295
3296 A valid duration is of the form dNUM, where d is one of the letters
3297 h (hour), d (day), w (week), m (month), or y (year), and NUM is a
3298 non-negative integer.
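
For example, 'd5' (five days) and 'y1' (one year) are valid,
while '5d' and 'dx' are not.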
3299 """
3300 try:
3301 if arg[0] not in ('h', 'd', 'w', 'm', 'y') or int(arg[1:]) < 0:
3302 raise ValueError
3303 except (TypeError, IndexError, ValueError):
3304 raise argparse.ArgumentTypeError('%s is not a valid duration' % arg)
3305 return arg
3306
3307 @staticmethod
3308 def is_date(arg):
3309 """Check if a string is a valid date/month/year accepted by Google."""
3310 if re.match(r'^(\d+/){0,2}\d+$', arg):
3311 return arg
3312 else:
3313 raise argparse.ArgumentTypeError('%s is not a valid date/month/year; '
3314 'use the American date format with slashes' % arg)
3315
3316 @staticmethod
3317 def is_colorstr(arg):
3318 """Check if a string is a valid color string."""
3319 try:
3320 assert len(arg) == 6
3321 for c in arg:
3322 assert c in COLORMAP
3323 except AssertionError:
3324 raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
3325 return arg
3326
3327
3328 # Self-upgrade mechanism
3329
3330 def system_is_windows():
3331 """Checks if the underlying system is Windows (Cygwin included)."""
3332 return sys.platform in {'win32', 'cygwin'}
3333
3334
3335 def get_latest_ref(include_git=False):
3336 """Helper for download_latest_googler."""
3337 import urllib.request
3338
3339 if include_git:
3340 # Get SHA of latest commit on master
3341 request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
3342 headers={'Accept': 'application/vnd.github.v3.sha'})
3343 response = urllib.request.urlopen(request)
3344 if response.status != 200:
3345 raise http.client.HTTPException(response.reason)
3346 return response.read().decode('utf-8')
3347 else:
3348 # Get name of latest tag
3349 request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
3350 headers={'Accept': 'application/vnd.github.v3+json'})
3351 response = urllib.request.urlopen(request)
3352 if response.status != 200:
3353 raise http.client.HTTPException(response.reason)
3354 import json
3355 return json.loads(response.read().decode('utf-8'))[0]['tag_name']
3356
3357
3358 def download_latest_googler(include_git=False):
3359 """Download latest googler to a temp file.
3360
3361 By default, the latest released version is downloaded, but if
3362 `include_git` is specified, then the latest git master is downloaded
3363 instead.
3364
3365 Parameters
3366 ----------
3367 include_git : bool, optional
3368 Download from git master. Default is False.
3369
3370 Returns
3371 -------
3372 (git_ref, path): tuple
3373 A tuple containing the git reference (either name of the latest
3374 tag or SHA of the latest commit) and path to the downloaded
3375 file.
3376
3377 """
3378 # Download googler to a tempfile
3379 git_ref = get_latest_ref(include_git=include_git)
3380 googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
3381 printerr('Downloading %s' % googler_download_url)
3382 request = urllib.request.Request(googler_download_url,
3383 headers={'Accept-Encoding': 'gzip'})
3384 import tempfile
3385 fd, path = tempfile.mkstemp()
3386 atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
3387 os.close(fd)
3388 with open(path, 'wb') as fp:
3389 with urllib.request.urlopen(request) as response:
3390 if response.status != 200:
3391 raise http.client.HTTPException(response.reason)
3392 payload = response.read()
3393 try:
3394 fp.write(gzip.decompress(payload))
3395 except OSError:
3396 fp.write(payload)
3397 return git_ref, path
3398
3399
3400 def self_replace(path):
3401 """Replace the current script with a specified file.
3402
3403 Both paths (the specified path and path to the current script) are
3404 resolved to absolute, symlink-free paths. Upon replacement, the
3405 owner and mode signatures of the current script are preserved. The
3406 caller needs to have the necessary permissions.
3407
3408 Replacement won't happen if the specified file is the same
3409 (content-wise) as the current script.
3410
3411 Parameters
3412 ----------
3413 path : str
3414 Path to the replacement file.
3415
3416 Returns
3417 -------
3418 bool
3419 True if replaced, False if skipped (specified file is the same
3420 as the current script).
3421
3422 """
3423 if system_is_windows():
3424 raise NotImplementedError('Self upgrade not supported on Windows.')
3425
3426 import filecmp
3427 import shutil
3428
3429 path = os.path.realpath(path)
3430 self_path = os.path.realpath(__file__)
3431
3432 if filecmp.cmp(path, self_path):
3433 return False
3434
3435 self_stat = os.stat(self_path)
3436 os.chown(path, self_stat.st_uid, self_stat.st_gid)
3437 os.chmod(path, self_stat.st_mode)
3438
3439 shutil.move(path, self_path)
3440 return True
3441
3442
3443 def self_upgrade(include_git=False):
3444 """Perform in-place self-upgrade.
3445
3446 Parameters
3447 ----------
3448 include_git : bool, optional
3449 See `download_latest_googler`. Default is False.
3450
3451 """
3452 git_ref, path = download_latest_googler(include_git=include_git)
3453 if self_replace(path):
3454 printerr('Upgraded to %s.' % git_ref)
3455 else:
3456 printerr('Already up to date.')
3457
3458
3459 def check_new_version():
3460 try:
3461 from distutils.version import StrictVersion as Version
3462 except ImportError:
3463 # distutils not available (thanks distros), use a concise poor
3464 # man's version parser.
3465 class Version(tuple):
3466 def __new__(cls, version_str):
3467 def parseint(s):
3468 try:
3469 return int(s)
3470 except ValueError:
3471 return 0
3472 return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')])
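# Under this fallback, e.g., Version('4.3.2') compares as (4, 3, 2),
# and a non-numeric component such as '3b' in '4.3b' parses as 0,
# giving (4, 0).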
3473
3474 import pathlib
3475 import tempfile
3476 import time
3477 cache = pathlib.Path(tempfile.gettempdir()) / 'googler-latest-version'
3478 latest_version_str = None
3479 # Try to load latest version string from cached location, if it
3480 # exists and is fresh enough.
3481 try:
3482 if cache.is_file() and time.time() - cache.stat().st_mtime < 86400:
3483 latest_version_str = cache.read_text().strip()
3484 except OSError:
3485 pass
3486 if not latest_version_str:
3487 try:
3488 latest_version_str = get_latest_ref().lstrip('v')
3489 cache.write_text(latest_version_str)
3490 except Exception:
3491 pass
3492 if not latest_version_str:
3493 return
3494 # Parse and compare the current and latest version strings.
3495 try:
3496 current_version = Version(_VERSION_)
3497 latest_version = Version(latest_version_str)
3498 except ValueError:
3499 return
3500 if latest_version > current_version:
3501 print('\x1b[33;1mThe latest release of googler is v%s, please upgrade.\x1b[0m'
3502 % latest_version_str,
3503 file=sys.stderr)
3504
3505
3506 # Miscellaneous functions
3507
3508 def python_version():
3509 return '%d.%d.%d' % sys.version_info[:3]
3510
3511
3512 def https_proxy_from_environment():
3513 return os.getenv('https_proxy')
3514
3515
3516 def parse_proxy_spec(proxyspec):
3517 if '://' in proxyspec:
3518 pos = proxyspec.find('://')
3519 scheme = proxyspec[:pos]
3520 proxyspec = proxyspec[pos+3:]
3521 if scheme.lower() != 'http':
3522 # Only support HTTP proxies.
3523 #
3524 # In particular, we don't support HTTPS proxies since we
3525 # only speak plain HTTP to the proxy server, so don't give
3526 # users a false sense of security.
3527 raise NotImplementedError('Unsupported proxy scheme %s.' % scheme)
3528
3529 if '@' in proxyspec:
3530 pos = proxyspec.find('@')
3531 user_passwd = urllib.parse.unquote(proxyspec[:pos])
3532 # Remove trailing '/' if any
3533 host_port = proxyspec[pos+1:].rstrip('/')
3534 else:
3535 user_passwd = None
3536 host_port = proxyspec.rstrip('/')
3537
3538 if ':' not in host_port:
3539 # Use port 1080 as default, following curl.
3540 host_port += ':1080'
3541
3542 return user_passwd, host_port
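# Examples (hypothetical specs):
#   parse_proxy_spec('http://user:pass@10.0.0.1:8080/') -> ('user:pass', '10.0.0.1:8080')
#   parse_proxy_spec('10.0.0.1') -> (None, '10.0.0.1:1080'), defaulting to port 1080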
3543
3544
3545 def set_win_console_mode():
3546 # VT100 control sequences are supported on Windows 10 Anniversary Update and later.
3547 # https://docs.microsoft.com/en-us/windows/console/console-virtual-terminal-sequences
3548 # https://docs.microsoft.com/en-us/windows/console/setconsolemode
3549 if platform.release() == '10':
3550 STD_OUTPUT_HANDLE = -11
3551 STD_ERROR_HANDLE = -12
3552 ENABLE_VIRTUAL_TERMINAL_PROCESSING = 0x0004
3553 try:
3554 from ctypes import windll, wintypes, byref
3555 kernel32 = windll.kernel32
3556 for nhandle in (STD_OUTPUT_HANDLE, STD_ERROR_HANDLE):
3557 handle = kernel32.GetStdHandle(nhandle)
3558 old_mode = wintypes.DWORD()
3559 if not kernel32.GetConsoleMode(handle, byref(old_mode)):
3560 raise RuntimeError('GetConsoleMode failed')
3561 new_mode = old_mode.value | ENABLE_VIRTUAL_TERMINAL_PROCESSING
3562 if not kernel32.SetConsoleMode(handle, new_mode):
3563 raise RuntimeError('SetConsoleMode failed')
3564 # Note: No need to restore at exit. SetConsoleMode seems to
3565 # be limited to the calling process.
3566 except Exception:
3567 pass
3568
3569
3570 # Query autocompleter
3571
3572 # This function is largely experimental and could raise any exception;
3573 # you should be prepared to catch anything. When it works though, it
3574 # returns a list of strings the prefix could autocomplete to (however,
3575 # it is not guaranteed that they start with the specified prefix; for
3576 # instance, they won't if the specified prefix ends in a punctuation
3577 # mark.)
3578 def completer_fetch_completions(prefix):
3579 import html
3580 import json
3581 import re
3582 import urllib.request
3583
3584 # One can pass the 'hl' query param to specify the language. We
3585 # ignore that for now.
3586 api_url = ('https://www.google.com/complete/search?client=psy-ab&q=%s' %
3587 urllib.parse.quote(prefix, safe=''))
3588 # A timeout of 3 seconds seems to be overly generous already.
3589 resp = urllib.request.urlopen(api_url, timeout=3)
3590 charset = resp.headers.get_content_charset()
3591 logger.debug('Completions charset: %s', charset)
3592 respobj = json.loads(resp.read().decode(charset))
3593
3594 # The response object, once parsed as JSON, should look like
3595 #
3596 # ['git',
3597 # [['git<b>hub</b>', 0],
3598 # ['git', 0],
3599 # ['git<b>lab</b>', 0],
3600 # ['git<b> stash</b>', 0]],
3601 # {'q': 'oooAhRzoChqNmMbNaaDKXk1YY4k', 't': {'bpc': False, 'tlw': False}}]
3602 #
3603 # Note that each result entry need not have exactly two members;
3604 # e.g., for 'gi', there is an entry ['gi<b>f</b>', 0, [131]].
3605 HTML_TAG = re.compile(r'<[^>]+>')
3606 return [html.unescape(HTML_TAG.sub('', entry[0])) for entry in respobj[1]]
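
# Illustrative (assumed output; live suggestions vary):
#
#   completer_fetch_completions('git')
#       -> ['github', 'git', 'gitlab', 'git stash']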


def completer_run(prefix):
    if prefix:
        completions = completer_fetch_completions(prefix)
        if completions:
            print('\n'.join(completions))
    sys.exit(0)
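
# Illustrative usage (assumption): shell completion scripts can invoke
#
#   googler --complete 'git'
#
# (--complete is the hidden option registered in parse_args() below)
# and read one suggestion per line from stdout.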


def parse_args(args=None, namespace=None):
    """Parse googler arguments/options.

    Parameters
    ----------
    args : list, optional
        Arguments to parse. Default is ``sys.argv[1:]``.
    namespace : argparse.Namespace, optional
        Namespace to write to. Default is a new namespace.

    Returns
    -------
    argparse.Namespace
        Namespace with parsed arguments / options.

    """

    colorstr_env = os.getenv('GOOGLER_COLORS')

    argparser = GooglerArgumentParser(description='Google from the command-line.')
    addarg = argparser.add_argument
    addarg('-s', '--start', type=argparser.nonnegative_int, default=0,
           metavar='N', help='start at the Nth result')
    addarg('-n', '--count', dest='num', type=argparser.positive_int,
           default=10, metavar='N', help='show N results (default 10)')
    addarg('-N', '--news', action='store_true',
           help='show results from news section')
    addarg('-V', '--videos', action='store_true',
           help='show results from videos section')
    addarg('-c', '--tld', metavar='TLD',
           help="""country-specific search with top-level domain .TLD, e.g., 'in'
           for India""")
    addarg('-l', '--lang', metavar='LANG', help='display in language LANG')
    addarg('-g', '--geoloc', metavar='CC',
           help="""country-specific geolocation search with country code CC, e.g.
           'in' for India. Country codes are the same as top-level domains""")
    addarg('-x', '--exact', action='store_true',
           help='disable automatic spelling correction')
    addarg('--colorize', nargs='?', choices=['auto', 'always', 'never'],
           const='always', default='auto',
           help="""whether to colorize output; defaults to 'auto', which enables
           color when stdout is a tty device; using --colorize without an argument
           is equivalent to --colorize=always""")
    addarg('-C', '--nocolor', action='store_true',
           help='equivalent to --colorize=never')
    addarg('--colors', dest='colorstr', type=argparser.is_colorstr,
           default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
           help='set output colors (see man page for details)')
    addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
           help='open the first result in web browser and exit')
    addarg('-t', '--time', dest='duration', type=argparser.is_duration,
           metavar='dN', help='time limit search '
           '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)]')
    addarg('--from', type=argparser.is_date,
           help="""starting date/month/year of date range; must use American date
           format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in
           conjunction with --to, and overrides -t, --time""")
    addarg('--to', type=argparser.is_date,
           help='ending date/month/year of date range; see --from')
    addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
           help='search a site using Google')
    addarg('-e', '--exclude', dest='exclude', action='append', metavar='SITE',
           help='exclude site from results')
    addarg('--unfilter', action='store_true', help='do not omit similar results')
    addarg('-p', '--proxy', default=https_proxy_from_environment(),
           help="""tunnel traffic through an HTTP proxy;
           PROXY is of the form [http://][user:password@]proxyhost[:port]""")
    addarg('--noua', action='store_true', help=argparse.SUPPRESS)
    addarg('--notweak', action='store_true',
           help='disable TCP optimizations and forced TLS 1.2')
    addarg('--json', action='store_true',
           help='output in JSON format; implies --noprompt')
    addarg('--url-handler', metavar='UTIL',
           help='custom script or cli utility to open results')
    addarg('--show-browser-logs', action='store_true',
           help='do not suppress browser output (stdout and stderr)')
    addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
           help='search and exit, do not prompt')
    addarg('-4', '--ipv4', action='store_const', dest='address_family',
           const=socket.AF_INET, default=0,
           help="""only connect over IPv4
           (by default, IPv4 is preferred but IPv6 is used as a fallback)""")
    addarg('-6', '--ipv6', action='store_const', dest='address_family',
           const=socket.AF_INET6, default=0,
           help='only connect over IPv6')
    addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
    if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
        addarg('-u', '--upgrade', action='store_true',
               help='perform in-place self-upgrade')
        addarg('--include-git', action='store_true',
               help='when used with --upgrade, get latest git master')
    addarg('-v', '--version', action='version', version=_VERSION_)
    addarg('-d', '--debug', action='store_true', help='enable debugging')
    # Hidden option for interacting with DOM in an IPython/pdb shell
    addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS)
    # Hidden option for parsing dumped HTML
    addarg('--parse', dest='html_file', help=argparse.SUPPRESS)
    addarg('--complete', help=argparse.SUPPRESS)

    parsed = argparser.parse_args(args, namespace)
    if parsed.nocolor:
        parsed.colorize = 'never'

    return parsed
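
# Illustrative (assumed): parsing a typical command line,
#
#   opts = parse_args(['-n', '5', '--np', 'hello', 'world'])
#
# yields opts.num == 5, opts.noninteractive == True and
# opts.keywords == ['hello', 'world'].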


def main():
    try:
        opts = parse_args()

        # Set logging level
        if opts.debug:
            logger.setLevel(logging.DEBUG)
            logger.debug('googler version %s', _VERSION_)
            logger.debug('Python version %s', python_version())
            logger.debug('Platform: %s', platform.platform())
            check_new_version()

        if opts.debugger:
            global debugger
            debugger = True

        # Handle query completer
        if opts.complete is not None:
            completer_run(opts.complete)

        # Handle self-upgrade
        if hasattr(opts, 'upgrade') and opts.upgrade:
            self_upgrade(include_git=opts.include_git)
            sys.exit(0)

        check_stdout_encoding()

        if opts.keywords:
            try:
                # Add cmdline args to readline history
                readline.add_history(' '.join(opts.keywords))
            except Exception:
                pass

        # Set colors
        if opts.colorize == 'always':
            colorize = True
        elif opts.colorize == 'auto':
            colorize = sys.stdout.isatty()
        else:  # opts.colorize == 'never'
            colorize = False

        if colorize:
            colors = Colors(*[COLORMAP[c] for c in opts.colorstr], reset=COLORMAP['x'])
        else:
            colors = None
        Result.colors = colors
        Result.urlexpand = os.getenv('DISABLE_URL_EXPANSION') is None
        GooglerCmd.colors = colors
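
        # Illustrative note (derived from COLORMAP near the top of this
        # file): the default color string 'GKlgxy' expands to the SGR
        # codes 36;1, 92;1, 93, 36, 0 and 7, i.e. bold cyan, bold bright
        # green, bright yellow, cyan, reset, and reverse video.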

        # Try to enable ANSI color support in cmd or PowerShell on Windows 10
        if sys.platform == 'win32' and sys.stdout.isatty() and colorize:
            set_win_console_mode()

        if opts.url_handler is not None:
            open_url.url_handler = opts.url_handler
        else:
            # Set text browser override to False
            open_url.override_text_browser = False

        # Handle browser output suppression
        if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers):
            open_url.suppress_browser_output = False
        else:
            open_url.suppress_browser_output = True

        if opts.noua:
            logger.warning('--noua option has been deprecated and has no effect (see #284)')

        repl = GooglerCmd(opts)

        # Non-interactive mode
        if opts.json or opts.lucky or opts.noninteractive or opts.html_file:
            repl.fetch()
            if opts.lucky:
                if repl.results:
                    open_url(repl.results[0].url)
                else:
                    print('No results.', file=sys.stderr)
            else:
                repl.showing_results_for_alert(interactive=False)
                repl.display_results(json_output=opts.json)
            sys.exit(0)

        # Interactive mode
        repl.cmdloop()
    except Exception as e:
        # With debugging on, let the exception through for a traceback;
        # otherwise, only print the exception error message.
        if logger.isEnabledFor(logging.DEBUG):
            raise
        else:
            logger.error(e)
        sys.exit(1)


if __name__ == '__main__':
    main()