"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "googler" between
googler-4.1.tar.gz and googler-4.2.tar.gz

About: googler is a command line tool to search Google (Web & News) from the terminal (requires Python).

googler  (googler-4.1):googler  (googler-4.2)
skipping to change at line 90 skipping to change at line 90
sys.exit(1) sys.exit(1)
try: try:
signal.signal(signal.SIGINT, sigint_handler) signal.signal(signal.SIGINT, sigint_handler)
except ValueError: except ValueError:
# signal only works in main thread # signal only works in main thread
pass pass
# Constants # Constants
_VERSION_ = '4.1' _VERSION_ = '4.2'
COLORMAP = {k: '\x1b[%sm' % v for k, v in { COLORMAP = {k: '\x1b[%sm' % v for k, v in {
'a': '30', 'b': '31', 'c': '32', 'd': '33', 'a': '30', 'b': '31', 'c': '32', 'd': '33',
'e': '34', 'f': '35', 'g': '36', 'h': '37', 'e': '34', 'f': '35', 'g': '36', 'h': '37',
'i': '90', 'j': '91', 'k': '92', 'l': '93', 'i': '90', 'j': '91', 'k': '92', 'l': '93',
'm': '94', 'n': '95', 'o': '96', 'p': '97', 'm': '94', 'n': '95', 'o': '96', 'p': '97',
'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1', 'A': '30;1', 'B': '31;1', 'C': '32;1', 'D': '33;1',
'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1', 'E': '34;1', 'F': '35;1', 'G': '36;1', 'H': '37;1',
'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1', 'I': '90;1', 'J': '91;1', 'K': '92;1', 'L': '93;1',
'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1', 'M': '94;1', 'N': '95;1', 'O': '96;1', 'P': '97;1',
skipping to change at line 601 skipping to change at line 601
# HTML-escaped form of the text node. use text() for unescaped # HTML-escaped form of the text node. use text() for unescaped
# version. # version.
def __str__(self) -> str: def __str__(self) -> str:
return html.escape(self) return html.escape(self)
def __eq__(self, other: object) -> bool: def __eq__(self, other: object) -> bool:
""" """
Two text nodes are equal if and only if they are the same node. Two text nodes are equal if and only if they are the same node.
For string comparision, use :attr:`text`. For string comparison, use :attr:`text`.
""" """
return self is other return self is other
def __ne__(self, other: object) -> bool: def __ne__(self, other: object) -> bool:
""" """
Two text nodes are non-equal if they are not the same node. Two text nodes are non-equal if they are not the same node.
For string comparision, use :attr:`text`. For string comparison, use :attr:`text`.
""" """
return self is not other return self is not other
@property @property
def text(self) -> str: def text(self) -> str:
return str.__str__(self) return str.__str__(self)
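The hunk above relies on a str subclass whose __str__ returns the HTML-escaped form while a separate text property exposes the raw characters. A minimal, self-contained sketch of that pattern (EscapingText is an illustrative stand-in, not googler's actual TextNode):

    import html

    class EscapingText(str):
        # Illustrative stand-in for the TextNode pattern above.
        def __str__(self):
            # Escaped form, suitable for re-serializing into HTML.
            return html.escape(self)

        @property
        def text(self):
            # Raw, unescaped characters; bypasses the overridden __str__.
            return str.__str__(self)

    node = EscapingText('Tom & Jerry <3')
    print(str(node))   # Tom &amp; Jerry &lt;3
    print(node.text)   # Tom & Jerry <3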
class DOMBuilderException(Exception): class DOMBuilderException(Exception):
""" """
Exception raised when :class:`DOMBuilder` detects a bad state. Exception raised when :class:`DOMBuilder` detects a bad state.
skipping to change at line 1406 skipping to change at line 1406
logger.debug(browser) logger.debug(browser)
# Found a GUI browser, suppress browser output # Found a GUI browser, suppress browser output
open_url.suppress_browser_output = True open_url.suppress_browser_output = True
break break
if open_url.suppress_browser_output: if open_url.suppress_browser_output:
_stderr = os.dup(2) _stderr = os.dup(2)
os.close(2) os.close(2)
_stdout = os.dup(1) _stdout = os.dup(1)
os.close(1) # Patch for GUI browsers on WSL
if "microsoft" not in platform.uname()[3].lower():
os.close(1)
fd = os.open(os.devnull, os.O_RDWR) fd = os.open(os.devnull, os.O_RDWR)
os.dup2(fd, 2) os.dup2(fd, 2)
os.dup2(fd, 1) os.dup2(fd, 1)
try: try:
browser.open(url, new=2) browser.open(url, new=2)
finally: finally:
if open_url.suppress_browser_output: if open_url.suppress_browser_output:
os.close(fd) os.close(fd)
os.dup2(_stderr, 2) os.dup2(_stderr, 2)
os.dup2(_stdout, 1) os.dup2(_stdout, 1)
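The new branch above keeps stdout open when running under WSL, where closing fd 1 breaks launching Windows GUI browsers. A hedged sketch of the detection it uses (platform.uname()[3] is the kernel version string, which contains "microsoft" on WSL):

    import platform

    def running_on_wsl():
        # WSL kernels report "microsoft" in the version field of uname();
        # index 3 of platform.uname() is that version string.
        return 'microsoft' in platform.uname()[3].lower()

    if running_on_wsl():
        print('WSL detected: leave stdout (fd 1) open for GUI browsers')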
skipping to change at line 1490 skipping to change at line 1492
sys.exit(1) sys.exit(1)
# Classes # Classes
class HardenedHTTPSConnection(HTTPSConnection): class HardenedHTTPSConnection(HTTPSConnection):
"""Overrides HTTPSConnection.connect to specify TLS version """Overrides HTTPSConnection.connect to specify TLS version
NOTE: TLS 1.2 is supported from Python 3.4 NOTE: TLS 1.2 is supported from Python 3.4
""" """
def __init__(self, host, **kwargs): def __init__(self, host, address_family=0, **kwargs):
HTTPSConnection.__init__(self, host, **kwargs) HTTPSConnection.__init__(self, host, **kwargs)
self.address_family = address_family
def connect(self, notweak=False): def connect(self, notweak=False):
sock = socket.create_connection((self.host, self.port), sock = self.create_socket_connection()
self.timeout, self.source_address)
# Optimizations not available on OS X # Optimizations not available on OS X
if not notweak and sys.platform.startswith('linux'): if not notweak and sys.platform.startswith('linux'):
try: try:
sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1) sock.setsockopt(socket.SOL_TCP, socket.TCP_DEFER_ACCEPT, 1)
sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1) sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_QUICKACK, 1)
sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288) sock.setsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF, 524288)
except OSError: except OSError:
# Doesn't work on Windows' Linux subsystem (#179) # Doesn't work on Windows' Linux subsystem (#179)
logger.debug('setsockopt failed') logger.debug('setsockopt failed')
skipping to change at line 1531 skipping to change at line 1533
elif hasattr(ssl, 'PROTOCOL_TLSv1_2'): elif hasattr(ssl, 'PROTOCOL_TLSv1_2'):
# Since Python 3.4 # Since Python 3.4
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2) ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
if ssl_context: if ssl_context:
self.sock = ssl_context.wrap_socket(sock) self.sock = ssl_context.wrap_socket(sock)
return return
# Fallback # Fallback
HTTPSConnection.connect(self) HTTPSConnection.connect(self)
# Adapted from socket.create_connection.
# https://github.com/python/cpython/blob/bce4ddafdd188cc6deb1584728b67b9e149ca6a4/Lib/socket.py#L771-L813
def create_socket_connection(self):
err = None
results = socket.getaddrinfo(self.host, self.port, self.address_family,
socket.SOCK_STREAM)
# Prefer IPv4 if address family isn't explicitly specified.
if self.address_family == 0:
results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2)
for af, socktype, proto, canonname, sa in results:
sock = None
try:
sock = socket.socket(af, socktype, proto)
if self.timeout is not None:
sock.settimeout(self.timeout)
if self.source_address:
sock.bind(self.source_address)
sock.connect(sa)
# Break explicitly a reference cycle
err = None
self.address_family = af
logger.debug('Opened socket to %s:%d',
sa[0] if af == socket.AF_INET else ('[%s]' % sa[0]),
sa[1])
return sock
except socket.error as _:
err = _
if sock is not None:
sock.close()
if err is not None:
try:
raise err
finally:
# Break explicitly a reference cycle
err = None
else:
raise socket.error("getaddrinfo returns an empty list")
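A self-contained sketch of the address-selection logic introduced in create_socket_connection above: getaddrinfo results are stable-sorted so IPv4 entries come first unless a specific family was requested (needs network access to run; the host is just an example):

    import socket

    def ordered_addrinfo(host, port, address_family=0):
        results = socket.getaddrinfo(host, port, address_family, socket.SOCK_STREAM)
        if address_family == 0:
            # Prefer IPv4: AF_INET entries sort ahead of everything else.
            results = sorted(results, key=lambda res: 1 if res[0] == socket.AF_INET else 2)
        return results

    for af, socktype, proto, canonname, sa in ordered_addrinfo('www.google.com', 443):
        print(af, sa)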
class GoogleUrl(object): class GoogleUrl(object):
""" """
This class constructs the Google Search/News URL. This class constructs the Google Search/News URL.
This class is modelled on urllib.parse.ParseResult for familiarity, This class is modelled on urllib.parse.ParseResult for familiarity,
which means it supports reading of all six attributes -- scheme, which means it supports reading of all six attributes -- scheme,
netloc, path, params, query, fragment -- of netloc, path, params, query, fragment -- of
urllib.parse.ParseResult, as well as the geturl() method. urllib.parse.ParseResult, as well as the geturl() method.
However, the attributes (properties) and methods listed below should However, the attributes (properties) and methods listed below should
skipping to change at line 1598 skipping to change at line 1639
self._start = 0 self._start = 0
self._keywords = [] self._keywords = []
self._sites = None self._sites = None
self._query_dict = { self._query_dict = {
'ie': 'UTF-8', 'ie': 'UTF-8',
'oe': 'UTF-8', 'oe': 'UTF-8',
#'gbv': '1', # control the presence of javascript on the page, 1=no js, 2=js #'gbv': '1', # control the presence of javascript on the page, 1=no js, 2=js
'sei': base64.encodebytes(uuid.uuid1().bytes).decode("ascii").rstrip ('=\n').replace('/', '_'), 'sei': base64.encodebytes(uuid.uuid1().bytes).decode("ascii").rstrip ('=\n').replace('/', '_'),
} }
# In preloaded HTML parsing mode, set keywords to something so
# that we are not tripped up by require_keywords.
if opts.html_file and not opts.keywords:
opts.keywords = ['<debug>']
self.update(opts, **kwargs) self.update(opts, **kwargs)
def __str__(self): def __str__(self):
return self.url return self.url
@property @property
def url(self): def url(self):
"""The full Google URL you want.""" """The full Google URL you want."""
return self.full() return self.full()
skipping to change at line 1940 skipping to change at line 1987
Methods Methods
------- -------
new_connection(host=None, port=None, timeout=45) new_connection(host=None, port=None, timeout=45)
renew_connection(timeout=45) renew_connection(timeout=45)
fetch_page(url) fetch_page(url)
close() close()
""" """
def __init__(self, host, port=None, timeout=45, proxy=None, notweak=False): def __init__(self, host, port=None, address_family=0, timeout=45, proxy=None , notweak=False):
self._host = None self._host = None
self._port = None self._port = None
self._address_family = address_family
self._proxy = proxy self._proxy = proxy
self._notweak = notweak self._notweak = notweak
self._conn = None self._conn = None
self.new_connection(host, port=port, timeout=timeout) self.new_connection(host, port=port, timeout=timeout)
self.cookie = '' self.cookie = ''
@property @property
def host(self): def host(self):
"""The host currently connected to.""" """The host currently connected to."""
return self._host return self._host
skipping to change at line 1984 skipping to change at line 2032
self._host = host self._host = host
self._port = port self._port = port
host_display = host + (':%d' % port if port else '') host_display = host + (':%d' % port if port else '')
proxy = self._proxy proxy = self._proxy
if proxy: if proxy:
proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy) proxy_user_passwd, proxy_host_port = parse_proxy_spec(proxy)
logger.debug('Connecting to proxy server %s', proxy_host_port) logger.debug('Connecting to proxy server %s', proxy_host_port)
self._conn = HardenedHTTPSConnection(proxy_host_port, timeout=timeout) self._conn = HardenedHTTPSConnection(proxy_host_port,
 address_family=self._address_family, timeout=timeout)
logger.debug('Tunnelling to host %s' % host_display) logger.debug('Tunnelling to host %s' % host_display)
connect_headers = {} connect_headers = {}
if proxy_user_passwd: if proxy_user_passwd:
connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64 encode( connect_headers['Proxy-Authorization'] = 'Basic %s' % base64.b64 encode(
proxy_user_passwd.encode('utf-8') proxy_user_passwd.encode('utf-8')
).decode('utf-8') ).decode('utf-8')
self._conn.set_tunnel(host, port=port, headers=connect_headers) self._conn.set_tunnel(host, port=port, headers=connect_headers)
try: try:
self._conn.connect(self._notweak) self._conn.connect(self._notweak)
except Exception as e: except Exception as e:
msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e) msg = 'Failed to connect to proxy server %s: %s.' % (proxy, e)
raise GoogleConnectionError(msg) raise GoogleConnectionError(msg)
else: else:
logger.debug('Connecting to new host %s', host_display) logger.debug('Connecting to new host %s', host_display)
self._conn = HardenedHTTPSConnection(host, port=port, timeout=timeout) self._conn = HardenedHTTPSConnection(host, port=port,
 address_family=self._address_family, timeout=timeout)
try: try:
self._conn.connect(self._notweak) self._conn.connect(self._notweak)
except Exception as e: except Exception as e:
msg = 'Failed to connect to %s: %s.' % (host_display, e) msg = 'Failed to connect to %s: %s.' % (host_display, e)
raise GoogleConnectionError(msg) raise GoogleConnectionError(msg)
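For the proxy branch above, the connection is first made to the proxy and then tunnelled to the target host with CONNECT. A hedged sketch using only the standard library (the proxy host and credentials are placeholders):

    import base64
    from http.client import HTTPSConnection

    conn = HTTPSConnection('proxy.example.com:3128', timeout=45)
    headers = {'Proxy-Authorization': 'Basic %s' %
               base64.b64encode('user:password'.encode('utf-8')).decode('utf-8')}
    conn.set_tunnel('www.google.com', port=443, headers=headers)
    # conn.connect() would now issue the CONNECT request through the proxy;
    # not called here because the proxy address is a placeholder.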
def renew_connection(self, timeout=45): def renew_connection(self, timeout=45):
"""Renew current connection. """Renew current connection.
Equivalent to ``new_connection(timeout=timeout)``. Equivalent to ``new_connection(timeout=timeout)``.
skipping to change at line 2058 skipping to change at line 2108
except http.client.HTTPException as e: except http.client.HTTPException as e:
logger.debug('Got exception: %s.', e) logger.debug('Got exception: %s.', e)
raise GoogleConnectionError("Failed to get '%s'." % url) raise GoogleConnectionError("Failed to get '%s'." % url)
resp = self._resp resp = self._resp
redirect_counter = 0 redirect_counter = 0
while resp.status != 200 and redirect_counter < 3: while resp.status != 200 and redirect_counter < 3:
if resp.status in {301, 302, 303, 307, 308}: if resp.status in {301, 302, 303, 307, 308}:
redirection_url = resp.getheader('location', '') redirection_url = resp.getheader('location', '')
if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' i n redirection_url: if 'sorry/IndexRedirect?' in redirection_url or 'sorry/index?' i n redirection_url:
msg = textwrap.dedent("""\ msg = "Connection blocked due to unusual activity.\n"
Connection blocked due to unusual activity. if self._conn.address_family == socket.AF_INET6:
 msg += textwrap.dedent("""\
 You are connecting over IPv6 which is likely the problem. Try to make
 sure the machine has a working IPv4 network interface configured.
 See also the -4, --ipv4 option of googler.\n""")
 msg += textwrap.dedent("""\
THIS IS NOT A BUG, please do NOT report it as a bug unless y ou have specific THIS IS NOT A BUG, please do NOT report it as a bug unless y ou have specific
information that may lead to the development of a workaround . information that may lead to the development of a workaround .
You IP address is temporarily or permanently blocked by Goog le and requires You IP address is temporarily or permanently blocked by Goog le and requires
reCAPTCHA-solving to use the service, which googler is not c apable of. reCAPTCHA-solving to use the service, which googler is not c apable of.
Possible causes include issuing too many queries in a short time frame, or Possible causes include issuing too many queries in a short time frame, or
operating from a shared / low reputation IP with a history o f abuse. operating from a shared / low reputation IP with a history o f abuse.
Please do NOT use googler for automated scraping.""") Please do NOT use googler for automated scraping.""")
msg = " ".join(msg.splitlines()) msg = " ".join(msg.splitlines())
raise GoogleConnectionError(msg) raise GoogleConnectionError(msg)
self._redirect(redirection_url) self._redirect(redirection_url)
skipping to change at line 2178 skipping to change at line 2233
if debugger: if debugger:
printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m var iable.\x1b[0m') printerr('\x1b[1mInspect the DOM through the \x1b[4mtree\x1b[24m var iable.\x1b[0m')
printerr('') printerr('')
try: try:
import IPython import IPython
IPython.embed() IPython.embed()
except ImportError: except ImportError:
import pdb import pdb
pdb.set_trace() pdb.set_trace()
# cw is short for collapse_whitespace.
cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s
index = 0 index = 0
for div_g in tree.select_all('div.g'): for div_g in tree.select_all('div.g'):
if div_g.select('.hp-xpdbox'): if div_g.select('.hp-xpdbox'):
# Skip smart cards. # Skip smart cards.
continue continue
try: try:
h3 = div_g.select('div.r h3') h3 = div_g.select('div.r h3')
if h3: if h3:
title = h3.text title = h3.text
url = self.unwrap_link(h3.parent.attr('href')) url = self.unwrap_link(h3.parent.attr('href'))
skipping to change at line 2202 skipping to change at line 2260
mime = div_g.select('.mime') mime = div_g.select('.mime')
if mime: if mime:
title = mime.text + ' ' + title title = mime.text + ' ' + title
url = self.unwrap_link(a.attr('href')) url = self.unwrap_link(a.attr('href'))
matched_keywords = [] matched_keywords = []
abstract = '' abstract = ''
for childnode in div_g.select('.st').children: for childnode in div_g.select('.st').children:
if 'f' in childnode.classes: if 'f' in childnode.classes:
# .f is handled as metadata instead. # .f is handled as metadata instead.
continue continue
if childnode.tag == 'b' and childnode.text != '...': childnode_text = cw(childnode.text)
matched_keywords.append({'phrase': childnode.text, 'offset': len(abstract)}) if childnode.tag in ['b', 'em'] and childnode_text != '...':
abstract = abstract + childnode.text.replace('\n', '') matched_keywords.append({'phrase': childnode_text, 'offset': len(abstract)})
 abstract = abstract + childnode_text
try: try:
metadata = div_g.select('.f').text metadata = div_g.select('.f').text
metadata = metadata.replace('\u200e', '').replace(' - ', ', ').strip().rstrip(',') metadata = metadata.replace('\u200e', '').replace(' - ', ', ').strip().rstrip(',')
except AttributeError: except AttributeError:
metadata = None metadata = None
except (AttributeError, ValueError): except (AttributeError, ValueError):
continue continue
sitelinks = [] sitelinks = []
for td in div_g.select_all('td'): for td in div_g.select_all('td'):
try: try:
a = td.select('a') a = td.select('a')
sl_title = a.text sl_title = a.text
sl_url = self.unwrap_link(a.attr('href')) sl_url = self.unwrap_link(a.attr('href'))
sl_abstract = td.select('div.s.st').text sl_abstract = td.select('div.s.st, div.s .st').text
sitelinks.append(Sitelink(sl_title, sl_url, sl_abstract)) sitelink = Sitelink(cw(sl_title), sl_url, cw(sl_abstract))
if sitelink not in sitelinks:
sitelinks.append(sitelink)
except (AttributeError, ValueError): except (AttributeError, ValueError):
continue continue
index += 1 # cw cannot be applied to abstract here since it may screw
self.results.append(Result(index, title, url, abstract, # up offsets of matches. Instead, each relevant node's text
 metadata=metadata, sitelinks=sitelinks, matches=matched_keywords)) # is whitespace-collapsed before being appended to abstract.
 # We then hope for the best.
 result = Result(index + 1, cw(title), url, abstract,
 metadata=cw(metadata), sitelinks=sitelinks, matches=matched_keywords)
 if result not in self.results:
 self.results.append(result)
 index += 1
if not self.results: if not self.results:
for card in tree.select_all('g-card'): for card in tree.select_all('g-card'):
a = card.select('a[href]') a = card.select('a[href]')
if not a: if not a:
continue continue
url = self.unwrap_link(a.attr('href')) url = self.unwrap_link(a.attr('href'))
text_nodes = [] text_nodes = []
for node in a.descendants(): for node in a.descendants():
if isinstance(node, TextNode) and node.strip(): if isinstance(node, TextNode) and node.strip():
text_nodes.append(node.text) text_nodes.append(node.text)
if len(text_nodes) != 4: if len(text_nodes) != 4:
continue continue
publisher, title, abstract, publishing_time = text_nodes publisher, title, abstract, publishing_time = text_nodes
metadata = '%s, %s' % (publisher, publishing_time) metadata = '%s, %s' % (publisher, publishing_time)
index += 1 index += 1
self.results.append(Result(index, title, url, abstract, metadata =metadata)) self.results.append(Result(index, cw(title), url, cw(abstract), metadata=cw(metadata)))
# Showing results for ... # Showing results for ...
# Search instead for ... # Search instead for ...
spell_orig = tree.select("span.spell_orig") spell_orig = tree.select("span.spell_orig")
if spell_orig: if spell_orig:
showing_results_for_link = next( showing_results_for_link = next(
filter(lambda el: el.tag == "a", spell_orig.previous_siblings()) , None filter(lambda el: el.tag == "a", spell_orig.previous_siblings()) , None
) )
if showing_results_for_link: if showing_results_for_link:
self.autocorrected = True self.autocorrected = True
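The comment in the hunk above notes that collapsing whitespace on the finished abstract would invalidate the recorded match offsets. A small self-contained illustration of why each fragment is collapsed before it is appended (the fragment values are made up):

    import re

    cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s

    fragments = ['foo \n bar ', 'match', ' baz\t']
    abstract, matches = '', []
    for frag in fragments:
        frag = cw(frag)                      # collapse before appending
        if frag == 'match':
            matches.append({'phrase': frag, 'offset': len(abstract)})
        abstract += frag

    for m in matches:
        # Offsets recorded this way still point at the phrase afterwards.
        assert abstract[m['offset']:m['offset'] + len(m['phrase'])] == m['phrase']
    print(repr(abstract), matches)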
skipping to change at line 2294 skipping to change at line 2361
class Sitelink(object): class Sitelink(object):
"""Container for a sitelink.""" """Container for a sitelink."""
def __init__(self, title, url, abstract): def __init__(self, title, url, abstract):
self.title = title self.title = title
self.url = url self.url = url
self.abstract = abstract self.abstract = abstract
self.index = '' self.index = ''
def __eq__(self, other):
return (
self.title == other.title and
self.url == other.url and
self.abstract == other.abstract
)
def __hash__(self):
return hash((self.title, self.url, self.abstract))
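The value-based __eq__ and __hash__ added above let the parser drop duplicate sitelinks with a plain membership test (if sitelink not in sitelinks). A self-contained usage sketch with a minimal stand-in class mirroring the compared fields:

    class Sitelink:
        # Minimal stand-in mirroring the fields compared by __eq__ above.
        def __init__(self, title, url, abstract):
            self.title, self.url, self.abstract = title, url, abstract

        def __eq__(self, other):
            return (self.title, self.url, self.abstract) == \
                   (other.title, other.url, other.abstract)

        def __hash__(self):
            return hash((self.title, self.url, self.abstract))

    a = Sitelink('Downloads', 'https://example.com/dl', 'Get the software.')
    b = Sitelink('Downloads', 'https://example.com/dl', 'Get the software.')
    dedup = []
    for sl in (a, b):
        if sl not in dedup:      # membership uses __eq__, as in the parser loop
            dedup.append(sl)
    print(len(dedup))            # -> 1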
Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract , prompt, reset') Colors = collections.namedtuple('Colors', 'index, title, url, metadata, abstract , prompt, reset')
class Result(object): class Result(object):
""" """
Container for one search result, with output helpers. Container for one search result, with output helpers.
Parameters Parameters
---------- ----------
index : int or str index : int or str
title : str title : str
skipping to change at line 2356 skipping to change at line 2433
self.matches = [] if matches is None else matches self.matches = [] if matches is None else matches
self._urltable = {index: url} self._urltable = {index: url}
subindex = 'a' subindex = 'a'
for sitelink in self.sitelinks: for sitelink in self.sitelinks:
fullindex = index + subindex fullindex = index + subindex
sitelink.index = fullindex sitelink.index = fullindex
self._urltable[fullindex] = sitelink.url self._urltable[fullindex] = sitelink.url
subindex = chr(ord(subindex) + 1) subindex = chr(ord(subindex) + 1)
def __eq__(self, other):
return (
self.title == other.title and
self.url == other.url and
self.abstract == other.abstract and
self.metadata == other.metadata and
self.sitelinks == other.sitelinks and
self.matches == other.matches
)
def __hash__(self):
sitelinks_hashable = tuple(sitelinks) if sitelinks is not None else None
matches_hashable = tuple(matches) if matches is not None else None
return hash(self.title, self.url, self.abstract, self.metadata, self.sitelinks, self.matches)
def _print_title_and_url(self, index, title, url, indent=0): def _print_title_and_url(self, index, title, url, indent=0):
colors = self.colors colors = self.colors
if not self.urlexpand: if not self.urlexpand:
url = '[' + urllib.parse.urlparse(url).netloc + ']' url = '[' + urllib.parse.urlparse(url).netloc + ']'
if colors: if colors:
# Adjust index to print result index clearly # Adjust index to print result index clearly
print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colo rs.reset), end='') print(" %s%s%-3s%s" % (' ' * indent, colors.index, index + '.', colo rs.reset), end='')
if not self.urlexpand: if not self.urlexpand:
skipping to change at line 2390 skipping to change at line 2482
columns, _ = os.get_terminal_size() columns, _ = os.get_terminal_size()
except OSError: except OSError:
columns = 0 columns = 0
if metadata: if metadata:
if colors: if colors:
print(' ' * (indent + 5) + colors.metadata + metadata + colors.r eset) print(' ' * (indent + 5) + colors.metadata + metadata + colors.r eset)
else: else:
print(' ' * (indent + 5) + metadata) print(' ' * (indent + 5) + metadata)
fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract) if abstract:
wrapped_abstract = TrackedTextwrap(abstract, fillwidth) fillwidth = (columns - (indent + 6)) if columns > indent + 6 else len(abstract)
if colors: wrapped_abstract = TrackedTextwrap(abstract, fillwidth)
# Highlight matches. if colors:
for match in matches or []: # Highlight matches.
offset = match['offset'] for match in matches or []:
span = len(match['phrase']) offset = match['offset']
wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset) span = len(match['phrase'])
wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span) wrapped_abstract.insert_zero_width_sequence('\x1b[1m', offset)
if colors: wrapped_abstract.insert_zero_width_sequence('\x1b[0m', offset + span)
print(colors.abstract, end='') if colors:
for line in wrapped_abstract.lines: print(colors.abstract, end='')
print('%s%s' % (' ' * (indent + 5), line)) for line in wrapped_abstract.lines:
if colors: print('%s%s' % (' ' * (indent + 5), line))
print(colors.reset, end='') if colors:
 print(colors.reset, end='')
print('') print('')
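The highlighting above splices zero-width ANSI sequences into the wrapped abstract at each recorded offset (TrackedTextwrap keeps the offsets valid across line wrapping). A simplified sketch that ignores wrapping and bolds a phrase directly in a flat string (sample values are made up):

    abstract = 'foo bar match baz'
    matches = [{'phrase': 'match', 'offset': 8}]

    out = abstract
    # Insert from the rightmost match first so earlier offsets stay valid.
    for m in sorted(matches, key=lambda m: m['offset'], reverse=True):
        start, end = m['offset'], m['offset'] + len(m['phrase'])
        out = out[:start] + '\x1b[1m' + out[start:end] + '\x1b[0m' + out[end:]
    print(out)   # "match" prints in bold on ANSI terminals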
def print(self): def print(self):
"""Print the result entry.""" """Print the result entry."""
self._print_title_and_url(self.index, self.title, self.url) self._print_title_and_url(self.index, self.title, self.url)
self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches) self._print_metadata_and_abstract(self.abstract, metadata=self.metadata, matches=self.matches)
for sitelink in self.sitelinks: for sitelink in self.sitelinks:
self._print_title_and_url(sitelink.index, sitelink.title, sitelink.u rl, indent=4) self._print_title_and_url(sitelink.index, sitelink.title, sitelink.u rl, indent=4)
self._print_metadata_and_abstract(sitelink.abstract, indent=4) self._print_metadata_and_abstract(sitelink.abstract, indent=4)
skipping to change at line 2448 skipping to change at line 2542
Returns Returns
------- -------
dict dict
A dict mapping indices (strs) to URLs (also strs). Indices of A dict mapping indices (strs) to URLs (also strs). Indices of
sitelinks are the original index appended by lowercase letters a, sitelinks are the original index appended by lowercase letters a,
b, c, etc. b, c, etc.
""" """
return self._urltable return self._urltable
@staticmethod
def collapse_whitespace(s):
return re.sub(r'[ \t\n\r]+', ' ', s)
class GooglerCmdException(Exception): class GooglerCmdException(Exception):
pass pass
class NoKeywordsException(GooglerCmdException): class NoKeywordsException(GooglerCmdException):
pass pass
def require_keywords(method): def require_keywords(method):
# Require keywords to be set before we run a GooglerCmd method. If # Require keywords to be set before we run a GooglerCmd method. If
# no keywords have been set, raise a NoKeywordsException. # no keywords have been set, raise a NoKeywordsException.
@functools.wraps(method) @functools.wraps(method)
skipping to change at line 2517 skipping to change at line 2615
# Class variables # Class variables
colors = None colors = None
re_url_index = re.compile(r"\d+(a-z)?") re_url_index = re.compile(r"\d+(a-z)?")
def __init__(self, opts): def __init__(self, opts):
super().__init__() super().__init__()
self._opts = opts self._opts = opts
self._google_url = GoogleUrl(opts) self._google_url = GoogleUrl(opts)
proxy = opts.proxy if hasattr(opts, 'proxy') else None
self._conn = GoogleConnection(self._google_url.hostname, proxy=proxy, if opts.html_file:
notweak=opts.notweak) # Preloaded HTML parsing mode, do not initialize connection.
atexit.register(self._conn.close) self._preload_from_file = opts.html_file
self._conn = None
else:
self._preload_from_file = None
proxy = opts.proxy if hasattr(opts, 'proxy') else None
self._conn = GoogleConnection(self._google_url.hostname,
address_family=opts.address_family,
proxy=proxy,
notweak=opts.notweak)
atexit.register(self._conn.close)
self.results = [] self.results = []
self._autocorrected = None self._autocorrected = None
self._showing_results_for = None self._showing_results_for = None
self._results_filtered = False self._results_filtered = False
self._urltable = {} self._urltable = {}
self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None els e False self.promptcolor = True if os.getenv('DISABLE_PROMPT_COLOR') is None els e False
self.no_results_instructions_shown = False self.no_results_instructions_shown = False
skipping to change at line 2559 skipping to change at line 2666
------ ------
GoogleConnectionError GoogleConnectionError
See Also See Also
-------- --------
fetch_and_display fetch_and_display
""" """
# This method also sets self._results_filtered and # This method also sets self._results_filtered and
# self._urltable. # self._urltable.
page = self._conn.fetch_page(self._google_url.relative()) if self._preload_from_file:
 with open(self._preload_from_file, encoding='utf-8') as fp:
if logger.isEnabledFor(logging.DEBUG): page = fp.read()
import tempfile else:
fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html') page = self._conn.fetch_page(self._google_url.relative())
os.close(fd) if logger.isEnabledFor(logging.DEBUG):
with open(tmpfile, 'w', encoding='utf-8') as fp: import tempfile
fp.write(page) fd, tmpfile = tempfile.mkstemp(prefix='googler-response-', suffix='.html')
logger.debug("Response body written to '%s'.", tmpfile) os.close(fd)
 with open(tmpfile, 'w', encoding='utf-8') as fp:
 fp.write(page)
 logger.debug("Response body written to '%s'.", tmpfile)
parser = GoogleParser(page, news=self._google_url.news, videos=self._goo gle_url.videos) parser = GoogleParser(page, news=self._google_url.news, videos=self._goo gle_url.videos)
self.results = parser.results self.results = parser.results
self._autocorrected = parser.autocorrected self._autocorrected = parser.autocorrected
self._showing_results_for = parser.showing_results_for self._showing_results_for = parser.showing_results_for
self._results_filtered = parser.filtered self._results_filtered = parser.filtered
self._urltable = {} self._urltable = {}
for r in self.results: for r in self.results:
self._urltable.update(r.urltable()) self._urltable.update(r.urltable())
skipping to change at line 2812 skipping to change at line 2922
printerr('Invalid index.') printerr('Invalid index.')
return return
# try copying the url to clipboard using native utilities # try copying the url to clipboard using native utilities
copier_params = [] copier_params = []
if sys.platform.startswith(('linux', 'freebsd', 'openbsd')): if sys.platform.startswith(('linux', 'freebsd', 'openbsd')):
if shutil.which('xsel') is not None: if shutil.which('xsel') is not None:
copier_params = ['xsel', '-b', '-i'] copier_params = ['xsel', '-b', '-i']
elif shutil.which('xclip') is not None: elif shutil.which('xclip') is not None:
copier_params = ['xclip', '-selection', 'clipboard'] copier_params = ['xclip', '-selection', 'clipboard']
elif shutil.which('wl-copy') is not None:
copier_params = ['wl-copy']
elif shutil.which('termux-clipboard-set') is not None: elif shutil.which('termux-clipboard-set') is not None:
copier_params = ['termux-clipboard-set'] copier_params = ['termux-clipboard-set']
elif sys.platform == 'darwin': elif sys.platform == 'darwin':
copier_params = ['pbcopy'] copier_params = ['pbcopy']
elif sys.platform == 'win32': elif sys.platform == 'win32':
copier_params = ['clip'] copier_params = ['clip']
if copier_params: if copier_params:
Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL) .communicate(content) Popen(copier_params, stdin=PIPE, stdout=DEVNULL, stderr=DEVNULL) .communicate(content)
return return
skipping to change at line 3015 skipping to change at line 3127
except AssertionError: except AssertionError:
raise argparse.ArgumentTypeError('%s is not a valid color string' % arg) raise argparse.ArgumentTypeError('%s is not a valid color string' % arg)
return arg return arg
# Self-upgrade mechanism # Self-upgrade mechanism
def system_is_windows(): def system_is_windows():
"""Checks if the underlying system is Windows (Cygwin included).""" """Checks if the underlying system is Windows (Cygwin included)."""
return sys.platform in {'win32', 'cygwin'} return sys.platform in {'win32', 'cygwin'}
def get_latest_ref(include_git=False):
"""Helper for download_latest_googler."""
import urllib.request
if include_git:
# Get SHA of latest commit on master
request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
headers={'Accept': 'application/vnd.github.v3.sha'})
response = urllib.request.urlopen(request)
if response.status != 200:
raise http.client.HTTPException(response.reason)
return response.read().decode('utf-8')
else:
# Get name of latest tag
request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
headers={'Accept': 'application/vnd.github.v3+json'})
response = urllib.request.urlopen(request)
if response.status != 200:
raise http.client.HTTPException(response.reason)
import json
return json.loads(response.read().decode('utf-8'))[0]['tag_name']
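The new get_latest_ref helper above queries the GitHub API. A hedged, self-contained sketch of the release-tag path (needs network access; the repository URL reflects googler's upstream, and API_REPO_BASE in the script is assumed to point there):

    import json
    import urllib.request

    api_repo_base = 'https://api.github.com/repos/jarun/googler'  # assumed value
    request = urllib.request.Request(
        '%s/releases?per_page=1' % api_repo_base,
        headers={'Accept': 'application/vnd.github.v3+json'})
    with urllib.request.urlopen(request) as response:
        tag = json.loads(response.read().decode('utf-8'))[0]['tag_name']
    print(tag)   # e.g. 'v4.2'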
def download_latest_googler(include_git=False): def download_latest_googler(include_git=False):
"""Download latest googler to a temp file. """Download latest googler to a temp file.
By default, the latest released version is downloaded, but if By default, the latest released version is downloaded, but if
`include_git` is specified, then the latest git master is downloaded `include_git` is specified, then the latest git master is downloaded
instead. instead.
Parameters Parameters
---------- ----------
include_git : bool, optional include_git : bool, optional
Download from git master. Default is False. Download from git master. Default is False.
Returns Returns
------- -------
(git_ref, path): tuple (git_ref, path): tuple
A tuple containing the git reference (either name of the latest A tuple containing the git reference (either name of the latest
tag or SHA of the latest commit) and path to the downloaded tag or SHA of the latest commit) and path to the downloaded
file. file.
""" """
import urllib.request
if include_git:
# Get SHA of latest commit on master
request = urllib.request.Request('%s/commits/master' % API_REPO_BASE,
headers={'Accept': 'application/vnd.github.v3.sha'})
response = urllib.request.urlopen(request)
if response.status != 200:
raise http.client.HTTPException(response.reason)
git_ref = response.read().decode('utf-8')
else:
# Get name of latest tag
request = urllib.request.Request('%s/releases?per_page=1' % API_REPO_BASE,
headers={'Accept': 'application/vnd.github.v3+json'})
response = urllib.request.urlopen(request)
if response.status != 200:
raise http.client.HTTPException(response.reason)
import json
git_ref = json.loads(response.read().decode('utf-8'))[0]['tag_name']
# Download googler to a tempfile # Download googler to a tempfile
git_ref = get_latest_ref(include_git=include_git)
googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref) googler_download_url = '%s/%s/googler' % (RAW_DOWNLOAD_REPO_BASE, git_ref)
printerr('Downloading %s' % googler_download_url) printerr('Downloading %s' % googler_download_url)
request = urllib.request.Request(googler_download_url, request = urllib.request.Request(googler_download_url,
headers={'Accept-Encoding': 'gzip'}) headers={'Accept-Encoding': 'gzip'})
import tempfile import tempfile
fd, path = tempfile.mkstemp() fd, path = tempfile.mkstemp()
atexit.register(lambda: os.remove(path) if os.path.exists(path) else None) atexit.register(lambda: os.remove(path) if os.path.exists(path) else None)
os.close(fd) os.close(fd)
with open(path, 'wb') as fp: with open(path, 'wb') as fp:
with urllib.request.urlopen(request) as response: with urllib.request.urlopen(request) as response:
skipping to change at line 3132 skipping to change at line 3247
include_git : bool, optional include_git : bool, optional
See `download_latest_googler`. Default is False. See `download_latest_googler`. Default is False.
""" """
git_ref, path = download_latest_googler(include_git=include_git) git_ref, path = download_latest_googler(include_git=include_git)
if self_replace(path): if self_replace(path):
printerr('Upgraded to %s.' % git_ref) printerr('Upgraded to %s.' % git_ref)
else: else:
printerr('Already up to date.') printerr('Already up to date.')
def check_new_version():
try:
from distutils.version import StrictVersion as Version
except ImportError:
# distutils not available (thanks distros), use a concise poor
# man's version parser.
class Version(tuple):
def __new__(cls, version_str):
def parseint(s):
try:
return int(s)
except ValueError:
return 0
return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')])
import pathlib
import tempfile
import time
cache = pathlib.Path(tempfile.gettempdir()) / 'googler-latest-version'
latest_version_str = None
# Try to load latest version string from cached location, if it
# exists and is fresh enough.
try:
if cache.is_file() and time.time() - cache.stat().st_mtime < 86400:
latest_version_str = cache.read_text().strip()
except OSError:
pass
if not latest_version_str:
try:
latest_version_str = get_latest_ref().lstrip('v')
cache.write_text(latest_version_str)
except Exception:
pass
if not latest_version_str:
return
# Try to fetch latest version string from GitHub.
try:
current_version = Version(_VERSION_)
latest_version = Version(latest_version_str)
except ValueError:
return
if latest_version > current_version:
print('\x1b[33;1mThe latest release of googler is v%s, please upgrade.\x1b[0m' % latest_version_str,
file=sys.stderr)
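The distutils-free fallback above reduces version strings to integer tuples so ordinary tuple comparison orders them numerically. A short self-contained check:

    class Version(tuple):
        def __new__(cls, version_str):
            def parseint(s):
                try:
                    return int(s)
                except ValueError:
                    return 0
            return tuple.__new__(cls, [parseint(s) for s in version_str.split('.')])

    print(Version('4.2') > Version('4.1'))    # True
    print(Version('4.10') > Version('4.9'))   # True -- numeric, not lexicographic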
# Miscellaneous functions # Miscellaneous functions
def python_version(): def python_version():
return '%d.%d.%d' % sys.version_info[:3] return '%d.%d.%d' % sys.version_info[:3]
def https_proxy_from_environment(): def https_proxy_from_environment():
return os.getenv('https_proxy') return os.getenv('https_proxy')
def parse_proxy_spec(proxyspec): def parse_proxy_spec(proxyspec):
if '://' in proxyspec: if '://' in proxyspec:
skipping to change at line 3290 skipping to change at line 3451
default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS', default=colorstr_env if colorstr_env else 'GKlgxy', metavar='COLORS',
help='set output colors (see man page for details)') help='set output colors (see man page for details)')
addarg('-j', '--first', '--lucky', dest='lucky', action='store_true', addarg('-j', '--first', '--lucky', dest='lucky', action='store_true',
help='open the first result in web browser and exit') help='open the first result in web browser and exit')
addarg('-t', '--time', dest='duration', type=argparser.is_duration, addarg('-t', '--time', dest='duration', type=argparser.is_duration,
metavar='dN', help='time limit search ' metavar='dN', help='time limit search '
'[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)] ') '[h5 (5 hrs), d5 (5 days), w5 (5 weeks), m5 (5 months), y5 (5 years)] ')
addarg('--from', type=argparser.is_date, addarg('--from', type=argparser.is_date,
help="""starting date/month/year of date range; must use American dat e help="""starting date/month/year of date range; must use American dat e
format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in format with slashes, e.g., 2/24/2020, 2/2020, 2020; can be used in
conjuction with --to, and overrides -t, --time""") conjunction with --to, and overrides -t, --time""")
addarg('--to', type=argparser.is_date, addarg('--to', type=argparser.is_date,
help='ending date/month/year of date range; see --from') help='ending date/month/year of date range; see --from')
addarg('-w', '--site', dest='sites', action='append', metavar='SITE', addarg('-w', '--site', dest='sites', action='append', metavar='SITE',
help='search a site using Google') help='search a site using Google')
addarg('--unfilter', action='store_true', help='do not omit similar results' ) addarg('--unfilter', action='store_true', help='do not omit similar results' )
addarg('-p', '--proxy', default=https_proxy_from_environment(), addarg('-p', '--proxy', default=https_proxy_from_environment(),
help="""tunnel traffic through an HTTP proxy; help="""tunnel traffic through an HTTP proxy;
PROXY is of the form [http://][user:password@]proxyhost[:port]""") PROXY is of the form [http://][user:password@]proxyhost[:port]""")
addarg('--noua', action='store_true', help='legacy option (no effect)') addarg('--noua', action='store_true', help=argparse.SUPPRESS)
addarg('--notweak', action='store_true', addarg('--notweak', action='store_true',
help='disable TCP optimizations and forced TLS 1.2') help='disable TCP optimizations and forced TLS 1.2')
addarg('--json', action='store_true', addarg('--json', action='store_true',
help='output in JSON format; implies --noprompt') help='output in JSON format; implies --noprompt')
addarg('--url-handler', metavar='UTIL', addarg('--url-handler', metavar='UTIL',
help='custom script or cli utility to open results') help='custom script or cli utility to open results')
addarg('--show-browser-logs', action='store_true', addarg('--show-browser-logs', action='store_true',
help='do not suppress browser output (stdout and stderr)') help='do not suppress browser output (stdout and stderr)')
addarg('--np', '--noprompt', dest='noninteractive', action='store_true', addarg('--np', '--noprompt', dest='noninteractive', action='store_true',
help='search and exit, do not prompt') help='search and exit, do not prompt')
addarg('-4', '--ipv4', action='store_const', dest='address_family',
const=socket.AF_INET, default=0,
help="""only connect over IPv4
(by default, IPv4 is preferred but IPv6 is used as a fallback)""")
addarg('-6', '--ipv6', action='store_const', dest='address_family',
const=socket.AF_INET6, default=0,
help='only connect over IPv6')
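The -4/-6 flags above both write a socket address-family constant into the same destination, with 0 (any family, IPv4 preferred) as the default. A self-contained sketch of the same argparse pattern:

    import argparse
    import socket

    parser = argparse.ArgumentParser()
    parser.add_argument('-4', '--ipv4', action='store_const', dest='address_family',
                        const=socket.AF_INET, default=0)
    parser.add_argument('-6', '--ipv6', action='store_const', dest='address_family',
                        const=socket.AF_INET6, default=0)

    print(parser.parse_args([]).address_family)        # 0: prefer IPv4, fall back to IPv6
    print(parser.parse_args(['-6']).address_family)    # AddressFamily.AF_INET6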
addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords') addarg('keywords', nargs='*', metavar='KEYWORD', help='search keywords')
if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows(): if ENABLE_SELF_UPGRADE_MECHANISM and not system_is_windows():
addarg('-u', '--upgrade', action='store_true', addarg('-u', '--upgrade', action='store_true',
help='perform in-place self-upgrade') help='perform in-place self-upgrade')
addarg('--include-git', action='store_true', addarg('--include-git', action='store_true',
help='when used with --upgrade, get latest git master') help='when used with --upgrade, get latest git master')
addarg('-v', '--version', action='version', version=_VERSION_) addarg('-v', '--version', action='version', version=_VERSION_)
addarg('-d', '--debug', action='store_true', help='enable debugging') addarg('-d', '--debug', action='store_true', help='enable debugging')
# Hidden option for interacting with DOM in an IPython/pdb shell # Hidden option for interacting with DOM in an IPython/pdb shell
addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS) addarg('-D', '--debugger', action='store_true', help=argparse.SUPPRESS)
# Hidden option for parsing dumped HTML
addarg('--parse', dest='html_file', help=argparse.SUPPRESS)
addarg('--complete', help=argparse.SUPPRESS) addarg('--complete', help=argparse.SUPPRESS)
parsed = argparser.parse_args(args, namespace) parsed = argparser.parse_args(args, namespace)
if parsed.nocolor: if parsed.nocolor:
parsed.colorize = 'never' parsed.colorize = 'never'
return parsed return parsed
def main(): def main():
try: try:
opts = parse_args() opts = parse_args()
# Set logging level # Set logging level
if opts.debug: if opts.debug:
logger.setLevel(logging.DEBUG) logger.setLevel(logging.DEBUG)
logger.debug('googler version %s', _VERSION_) logger.debug('googler version %s', _VERSION_)
logger.debug('Python version %s', python_version()) logger.debug('Python version %s', python_version())
logger.debug('Platform: %s', platform.platform())
check_new_version()
if opts.debugger: if opts.debugger:
global debugger global debugger
debugger = True debugger = True
# Handle query completer # Handle query completer
if opts.complete is not None: if opts.complete is not None:
completer_run(opts.complete) completer_run(opts.complete)
# Handle self-upgrade # Handle self-upgrade
skipping to change at line 3397 skipping to change at line 3569
if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers) : if opts.show_browser_logs or (os.getenv('BROWSER') in text_browsers) :
open_url.suppress_browser_output = False open_url.suppress_browser_output = False
else: else:
open_url.suppress_browser_output = True open_url.suppress_browser_output = True
if opts.noua: if opts.noua:
logger.warning('--noua option has been deprecated and has no effect (see #284)') logger.warning('--noua option has been deprecated and has no effect (see #284)')
repl = GooglerCmd(opts) repl = GooglerCmd(opts)
if opts.json or opts.lucky or opts.noninteractive: # Non-interactive mode
# Non-interactive mode if opts.json or opts.lucky or opts.noninteractive or opts.html_file:
repl.fetch() repl.fetch()
if opts.lucky: if opts.lucky:
if repl.results: if repl.results:
open_url(repl.results[0].url) open_url(repl.results[0].url)
else: else:
print('No results.', file=sys.stderr) print('No results.', file=sys.stderr)
else: else:
repl.showing_results_for_alert(interactive=False) repl.showing_results_for_alert(interactive=False)
repl.display_results(json_output=opts.json) repl.display_results(json_output=opts.json)
sys.exit(0) sys.exit(0)
else:
# Interactive mode # Interactive mode
repl.cmdloop() repl.cmdloop()
except Exception as e: except Exception as e:
# With debugging on, let the exception through for a traceback; # With debugging on, let the exception through for a traceback;
# otherwise, only print the exception error message. # otherwise, only print the exception error message.
if logger.isEnabledFor(logging.DEBUG): if logger.isEnabledFor(logging.DEBUG):
raise raise
else: else:
logger.error(e) logger.error(e)
sys.exit(1) sys.exit(1)
if __name__ == '__main__': if __name__ == '__main__':
 End of changes. 38 change blocks. 
86 lines changed or deleted 268 lines changed or added
