"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "benchsuite/benchsuite" between
ripgrep-12.1.1.tar.gz and ripgrep-13.0.0.tar.gz

About: ripgrep ("rg") is a command line search tool, written in Rust, that tries to combine the usability of "ag" (an "ack" clone) with the raw speed of GNU "grep".

benchsuite (ripgrep-12.1.1) : benchsuite (ripgrep-13.0.0)
skipping to change at line 26 skipping to change at line 26
import sys import sys
import time import time
# Some constants for identifying the corpora we use to run tests. # Some constants for identifying the corpora we use to run tests.
# We establish two very different kinds of corpora: a small number of large # We establish two very different kinds of corpora: a small number of large
# files and a large number of small files. These are vastly different use cases # files and a large number of small files. These are vastly different use cases
# not only because of their performance characteristics, but also the # not only because of their performance characteristics, but also the
# strategies used to increase the relevance of results returned. # strategies used to increase the relevance of results returned.
SUBTITLES_DIR = 'subtitles' SUBTITLES_DIR = 'subtitles'
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en' SUBTITLES_EN_NAME = 'en.txt'
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en' SUBTITLES_EN_NAME_SAMPLE = 'en.sample.txt'
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa # SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru' SUBTITLES_EN_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz' # noqa
SUBTITLES_RU_NAME = 'ru.txt'
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa # SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa
SUBTITLES_RU_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ru.txt.gz' # noqa
LINUX_DIR = 'linux' LINUX_DIR = 'linux'
LINUX_CLONE = 'git://github.com/BurntSushi/linux' LINUX_CLONE = 'git://github.com/BurntSushi/linux'
# Grep takes locale settings from the environment. There is a *substantial* # Grep takes locale settings from the environment. There is a *substantial*
# performance impact for enabling Unicode, so we need to handle this explicitly # performance impact for enabling Unicode, so we need to handle this explicitly
# in our benchmarks. # in our benchmarks.
GREP_ASCII = {'LC_ALL': 'C'} GREP_ASCII = {'LC_ALL': 'C'}
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'} GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'}
skipping to change at line 57 skipping to change at line 59
'--binary-skip', '--binary-skip',
'--exclude-files', '.*', '--exclude-files', '.*',
'--exclude-files', '*.pdf', '--exclude-files', '*.pdf',
] ]
def bench_linux_literal_default(suite_dir): def bench_linux_literal_default(suite_dir):
''' '''
Benchmark the speed of a literal using *default* settings. Benchmark the speed of a literal using *default* settings.
This is a purposefully unfair benchmark for use in performance This is a purposefully unfair benchmark for use in performance
analysis, but it is pedagogically useful to demonstrate how analysis, but it is pedagogically useful to demonstrate how default
default behaviors differ. behaviors differ. For example, ugrep and grep don't do any smart
filtering by default, so they will invariably search more files
than ripgrep, ag or git grep.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME' pat = 'PM_RESUME'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', pat]), mkcmd('rg', ['rg', pat]),
mkcmd('ag', ['ag', pat]), mkcmd('ag', ['ag', pat]),
# ucg reports the exact same matches as ag and rg even though it
# doesn't read gitignore files. Instead, it has a file whitelist
# that happens to match up exactly with the gitignores for this search.
mkcmd('ucg', ['ucg', pat]),
# I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the
# default, but I'd guess it to be on most desktop systems. # default, but I'd guess it to be on most desktop systems.
mkcmd('pt', ['pt', pat]), mkcmd('git grep', ['git', 'grep', pat], env=GREP_UNICODE),
# sift reports an extra line here for a binary file matched. mkcmd('ugrep', ['ugrep', '-r', pat, './']),
mkcmd('sift', ['sift', pat]), mkcmd('grep', ['grep', '-r', pat, './'], env=GREP_UNICODE),
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}),
]) ])
def bench_linux_literal(suite_dir): def bench_linux_literal(suite_dir):
''' '''
Benchmark the speed of a literal, attempting to be fair. Benchmark the speed of a literal, attempting to be fair.
This tries to use the minimum set of options available in all tools This tries to use the minimum set of options available in all tools
to test how fast they are. For example, it makes sure there is to test how fast they are. For example, it makes sure there is
no case insensitive matching and that line numbers are computed no case insensitive matching and that line numbers are computed
(because some tools don't permit disabling line numbers). (because some tools don't permit disabling line numbers).
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME' pat = 'PM_RESUME'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]), mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]), mkcmd('ag (mmap)', ['ag', '-s', pat]),
mkcmd('pt (ignore)', ['pt', pat]), mkcmd('git grep', [
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd('git grep (ignore)', [
'git', 'grep', '-I', '-n', pat, 'git', 'grep', '-I', '-n', pat,
], env={'LC_ALL': 'C'}), ], env={'LC_ALL': 'C'}),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), mkcmd('ugrep', [
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', pat, './',
])
]) ])
def bench_linux_literal_casei(suite_dir): def bench_linux_literal_casei(suite_dir):
''' '''
Benchmark the speed of a case insensitive literal search. Benchmark the speed of a case insensitive literal search.
This is like the linux_literal benchmark, except we ask the This is like the linux_literal benchmark, except we ask the
search tools to do case insensitive search. search tools to do case insensitive search.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME' pat = 'PM_RESUME'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]), mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]),
mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]), mkcmd('ag (mmap)', ['ag', '-i', pat]),
mkcmd('pt (ignore)', ['pt', '-i', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]),
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here, # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here,
# since that is certainly what ripgrep is doing, but this is for an # since that is certainly what ripgrep is doing, but this is for an
# ASCII literal, so we should give `git grep` all the opportunity to # ASCII literal, so we should give `git grep` all the opportunity to
# do its best. # do its best.
mkcmd('git grep (ignore)', [ mkcmd('git grep', [
'git', 'grep', '-I', '-n', '-i', pat, 'git', 'grep', '-I', '-n', '-i', pat,
], env={'LC_ALL': 'C'}), ], env={'LC_ALL': 'C'}),
mkcmd('rg (whitelist)', [ mkcmd('ugrep', [
'rg', '-n', '-i', '--no-ignore', '-tall', pat, 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
]), '-n', '-i', pat, './',
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), ])
]) ])
def bench_linux_re_literal_suffix(suite_dir): def bench_linux_re_literal_suffix(suite_dir):
''' '''
Benchmark the speed of a literal inside a regex. Benchmark the speed of a literal inside a regex.
This, for example, inhibits a prefix byte optimization used
inside of Go's regex engine (relevant for sift and pt).
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = '[A-Z]+_RESUME' pat = '[A-Z]+_RESUME'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]), mkcmd('ag', ['ag', '-s', pat]),
mkcmd('pt (ignore)', ['pt', '-e', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), mkcmd('ugrep', [
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', pat, './',
])
]) ])
def bench_linux_word(suite_dir): def bench_linux_word(suite_dir):
''' '''
Benchmark use of the -w ("match word") flag in each tool. Benchmark use of the -w ("match word") flag in each tool.
sift has a lot of trouble with this because it forces it into Go's
regex engine by surrounding the pattern with \b assertions.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'PM_RESUME' pat = 'PM_RESUME'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]), mkcmd('rg', ['rg', '-n', '-w', pat]),
mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]), mkcmd('ag', ['ag', '-s', '-w', pat]),
mkcmd('pt (ignore)', ['pt', '-w', pat]),
mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', '-w', pat], ['git', 'grep', '-E', '-I', '-n', '-w', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', [ mkcmd('ugrep', [
'rg', '-n', '-w', '--no-ignore', '-tall', pat, 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
]), '-n', '-w', pat, './',
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]), ])
]) ])
def bench_linux_unicode_greek(suite_dir): def bench_linux_unicode_greek(suite_dir):
''' '''
Benchmark matching of a Unicode category. Benchmark matching of a Unicode category.
Only three tools (ripgrep, sift and pt) support this. We omit
pt because it is too slow.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\p{Greek}' pat = r'\p{Greek}'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('pt', ['pt', '-e', pat]), mkcmd('ugrep', [
mkcmd('sift', SIFT + ['-n', '--git', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', pat, './',
])
]) ])
def bench_linux_unicode_greek_casei(suite_dir): def bench_linux_unicode_greek_casei(suite_dir):
''' '''
Benchmark matching of a Unicode category, case insensitively. Benchmark matching of a Unicode category, case insensitively.
Only ripgrep gets this right (and it's still fast). Only ripgrep gets this right (and it's still fast).
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\p{Greek}' pat = r'\p{Greek}'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg', ['rg', '-n', '-i', pat]), mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('pt', ['pt', '-i', '-e', pat]), mkcmd('ugrep', [
mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', '-i', pat, './',
])
]) ])
def bench_linux_unicode_word(suite_dir): def bench_linux_unicode_word(suite_dir):
''' '''
Benchmark Unicode aware \w character class. Benchmark Unicode aware \\w character class.
Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get
this right. Everything else uses the standard ASCII interpretation this right. Everything else uses the standard ASCII interpretation
of \w. of \\w.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\wAh' pat = r'\wAh'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), mkcmd('ag (ASCII)', ['ag', '-s', pat]),
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'}, env={'LC_ALL': 'en_US.UTF-8'},
), ),
mkcmd( mkcmd(
'git grep (ignore) (ASCII)', 'git grep (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), mkcmd('ugrep', [
mkcmd('rg (whitelist) (ASCII)', [ 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, '-n', pat, './',
]),
mkcmd('ugrep (ASCII)', [
'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', '-U', pat, './',
]), ]),
mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]),
]) ])
def bench_linux_no_literal(suite_dir): def bench_linux_no_literal(suite_dir):
''' '''
Benchmark a regex that defeats all literal optimizations. Benchmark a regex that defeats all literal optimizations.
Most search patterns have some kind of literal in them, which Most search patterns have some kind of literal in them, which
typically permits searches to take some shortcuts. Therefore, the typically permits searches to take some shortcuts. Therefore, the
applicability of this benchmark is somewhat suspicious, but the applicability of this benchmark is somewhat suspicious, but the
suite wouldn't feel complete without it. suite wouldn't feel complete without it.
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]),
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), mkcmd('ag (ASCII)', ['ag', '-s', pat]),
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]),
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'en_US.UTF-8'}, env={'LC_ALL': 'en_US.UTF-8'},
), ),
mkcmd( mkcmd(
'git grep (ignore) (ASCII)', 'git grep (ASCII)',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), mkcmd('ugrep', [
mkcmd('rg (whitelist) (ASCII)', [ 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, '-n', pat, './',
]),
mkcmd('ugrep (ASCII)', [
'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', '-U', pat, './',
]), ]),
mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]),
]) ])
def bench_linux_alternates(suite_dir): def bench_linux_alternates(suite_dir):
''' '''
Benchmark a small alternation of literals. Benchmark a small alternation of literals.
sift doesn't make the cut. It's more than 10x slower than the next sift doesn't make the cut. It's more than 10x slower than the next
fastest result. The slowdown is likely because the Go regexp engine fastest result. The slowdown is likely because the Go regexp engine
doesn't do any literal optimizations for this case (there is no doesn't do any literal optimizations for this case (there is no
common leading byte). common leading byte).
''' '''
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', pat]), mkcmd('rg', ['rg', '-n', pat]),
mkcmd('ag (ignore)', ['ag', '-s', pat]), mkcmd('ag', ['ag', '-s', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', pat], ['git', 'grep', '-E', '-I', '-n', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]), mkcmd('ugrep', [
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', pat, './',
])
]) ])
def bench_linux_alternates_casei(suite_dir): def bench_linux_alternates_casei(suite_dir):
'Benchmark a small alternation of literals case insensitively.' 'Benchmark a small alternation of literals case insensitively.'
require(suite_dir, 'linux') require(suite_dir, 'linux')
cwd = path.join(suite_dir, LINUX_DIR) cwd = path.join(suite_dir, LINUX_DIR)
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT'
def mkcmd(*args, **kwargs): def mkcmd(*args, **kwargs):
kwargs['cwd'] = cwd kwargs['cwd'] = cwd
return Command(*args, **kwargs) return Command(*args, **kwargs)
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), mkcmd('rg', ['rg', '-n', '-i', pat]),
mkcmd('ag (ignore)', ['ag', '-i', pat]), mkcmd('ag', ['ag', '-i', pat]),
mkcmd( mkcmd(
'git grep (ignore)', 'git grep',
['git', 'grep', '-E', '-I', '-n', '-i', pat], ['git', 'grep', '-E', '-I', '-n', '-i', pat],
env={'LC_ALL': 'C'}, env={'LC_ALL': 'C'},
), ),
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]), mkcmd('ugrep', [
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I',
'-n', '-i', pat, './',
])
]) ])
def bench_subtitles_en_literal(suite_dir): def bench_subtitles_en_literal(suite_dir):
''' '''
Benchmark the speed of an ASCII string literal. Benchmark the speed of an ASCII string literal.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes' pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', pat, en]), Command('rg', ['rg', pat, en]),
Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]), Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]),
Command('pt', ['pt', '-N', pat, en]), Command('grep', ['grep', pat, en], env=GREP_ASCII),
Command('sift', ['sift', pat, en]),
Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, en]), Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]), Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII),
Command('pt (lines)', ['pt', pat, en]), Command('ugrep (lines)', ['ugrep', '-n', pat, en])
Command('sift (lines)', ['sift', '-n', pat, en]),
Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII),
]) ])
def bench_subtitles_en_literal_casei(suite_dir): def bench_subtitles_en_literal_casei(suite_dir):
''' '''
Benchmark the speed of a Unicode-y string case insensitively. Benchmark the speed of a Unicode-y string case insensitively.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes' pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, en]), Command('rg', ['rg', '-i', pat, en]),
Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE), Command('grep', ['grep', '-i', pat, en], env=GREP_UNICODE),
Command('grep (ASCII)', [ Command('grep (ASCII)', ['grep', '-E', '-i', pat, en], env=GREP_ASCII),
'grep', '-E', '-ai', pat, en,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, en]), Command('rg (lines)', ['rg', '-n', '-i', pat, en]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]), Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]), Command('ugrep (lines)', ['ugrep', '-n', '-i', pat, en])
]) ])
def bench_subtitles_en_literal_word(suite_dir): def bench_subtitles_en_literal_word(suite_dir):
''' '''
Benchmark the speed of finding a literal inside word boundaries. Benchmark the speed of finding a literal inside word boundaries.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = 'Sherlock Holmes' pat = 'Sherlock Holmes'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [ Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en, 'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en,
]), ]),
Command('ag (ASCII)', ['ag', '-sw', pat, en]), Command('ag (ASCII)', ['ag', '-sw', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', ['grep', '-nw', pat, en], env=GREP_ASCII),
Command('grep (ASCII)', [ Command('ugrep (ASCII)', ['ugrep', '-nw', pat, en]),
'grep', '-anw', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', '-nw', pat, en]), Command('rg', ['rg', '-nw', pat, en]),
Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE), Command('grep', ['grep', '-nw', pat, en], env=GREP_UNICODE),
]) ])
def bench_subtitles_en_alternate(suite_dir): def bench_subtitles_en_alternate(suite_dir):
''' '''
Benchmark the speed of a set of alternate literals. Benchmark the speed of a set of alternate literals.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([ pat = '|'.join([
'Sherlock Holmes', 'Sherlock Holmes',
'John Watson', 'John Watson',
'Irene Adler', 'Irene Adler',
'Inspector Lestrade', 'Inspector Lestrade',
'Professor Moriarty', 'Professor Moriarty',
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, en]), Command('rg (lines)', ['rg', '-n', pat, en]),
Command('ag (lines)', ['ag', '-s', pat, en]), Command('ag (lines)', ['ag', '-s', pat, en]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), Command('grep (lines)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
Command('grep (lines)', [ Command('ugrep (lines)', ['ugrep', '-n', pat, en]),
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
Command('rg', ['rg', pat, en]), Command('rg', ['rg', pat, en]),
Command('grep', [ Command('grep', ['grep', '-E', pat, en], env=GREP_ASCII),
'grep', '-E', '-a', pat, en,
], env=GREP_ASCII),
]) ])
def bench_subtitles_en_alternate_casei(suite_dir): def bench_subtitles_en_alternate_casei(suite_dir):
''' '''
Benchmark the speed of a set of alternate literals. Benchmark the speed of a set of alternate literals.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = '|'.join([ pat = '|'.join([
'Sherlock Holmes', 'Sherlock Holmes',
'John Watson', 'John Watson',
'Irene Adler', 'Irene Adler',
'Inspector Lestrade', 'Inspector Lestrade',
'Professor Moriarty', 'Professor Moriarty',
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]), Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]),
Command('ucg (ASCII)', ['ucg', '-i', pat, en]),
Command('grep (ASCII)', [ Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, en, 'grep', '-E', '-ni', pat, en,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, en]),
Command('rg', ['rg', '-n', '-i', pat, en]), Command('rg', ['rg', '-n', '-i', pat, en]),
Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE), Command('grep', ['grep', '-E', '-ni', pat, en], env=GREP_UNICODE),
]) ])
def bench_subtitles_en_surrounding_words(suite_dir): def bench_subtitles_en_surrounding_words(suite_dir):
''' '''
Benchmark a more complex regex with an inner literal. Benchmark a more complex regex with an inner literal.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w+\s+Holmes\s+\w+' pat = r'\w+\s+Holmes\s+\w+'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]), Command('rg', ['rg', '-n', pat, en]),
Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE), Command('grep', ['grep', '-E', '-n', pat, en], env=GREP_UNICODE),
Command('ugrep', ['ugrep', '-n', pat, en]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]), Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
Command('grep (ASCII)', [ Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en])
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
]) ])
def bench_subtitles_en_no_literal(suite_dir): def bench_subtitles_en_no_literal(suite_dir):
''' '''
Benchmark the speed of a regex with no literals. Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer, on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes I killed it after it had already been running for two minutes
and showed no signs of finishing soon. and showed no signs of finishing soon.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, en]), Command('rg', ['rg', '-n', pat, en]),
Command('ugrep', ['ugrep', '-n', pat, en]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]),
Command('ag (ASCII)', ['ag', '-s', pat, en]), Command('ag (ASCII)', ['ag', '-s', pat, en]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII),
Command('grep (ASCII)', [ Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en])
'grep', '-E', '-an', pat, en,
], env=GREP_ASCII),
]) ])
def bench_subtitles_ru_literal(suite_dir): def bench_subtitles_ru_literal(suite_dir):
''' '''
Benchmark the speed of a Unicode-y string literal. Benchmark the speed of a Unicode-y string literal.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', pat, ru]), Command('rg', ['rg', pat, ru]),
Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]), Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]),
Command('pt', ['pt', '-N', pat, ru]), Command('grep', ['grep', pat, ru], env=GREP_ASCII),
Command('sift', ['sift', pat, ru]),
Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', pat, ru]), Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (lines)', ['grep', '-n', pat, ru], env=GREP_ASCII),
Command('pt (lines)', ['pt', pat, ru]), Command('ugrep (lines)', ['ugrep', '-n', pat, ru])
Command('sift (lines)', ['sift', '-n', pat, ru]),
Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII),
]) ])
def bench_subtitles_ru_literal_casei(suite_dir): def bench_subtitles_ru_literal_casei(suite_dir):
''' '''
Benchmark the speed of a Unicode-y string case insensitively. Benchmark the speed of a Unicode-y string case insensitively.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-i', pat, ru]), Command('rg', ['rg', '-i', pat, ru]),
Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE), Command('grep', ['grep', '-i', pat, ru], env=GREP_UNICODE),
Command('grep (ASCII)', [ Command('grep (ASCII)', ['grep', '-E', '-i', pat, ru], env=GREP_ASCII),
'grep', '-E', '-ai', pat, ru,
], env=GREP_ASCII),
Command('rg (lines)', ['rg', '-n', '-i', pat, ru]), Command('rg (lines)', ['rg', '-n', '-i', pat, ru]),
Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]), Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]),
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]), Command('ugrep (lines) (ASCII)', ['ugrep', '-n', '-i', pat, ru])
]) ])
def bench_subtitles_ru_literal_word(suite_dir): def bench_subtitles_ru_literal_word(suite_dir):
''' '''
Benchmark the speed of finding a literal inside word boundaries. Benchmark the speed of finding a literal inside word boundaries.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = 'Шерлок Холмс' # Sherlock Holmes pat = 'Шерлок Холмс' # Sherlock Holmes
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg (ASCII)', [ Command('rg (ASCII)', [
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru, # You might think we'd use \b here for word boundaries, but both
# GNU grep and ripgrep implement -w with the formulation below.
# Since we can't use Unicode in a pattern and disable Unicode word
# boundaries, we just hand-jam this ourselves.
'rg', '-n', r'(?-u:^|\W)' + pat + r'(?-u:$|\W)', ru,
]), ]),
Command('ag (ASCII)', ['ag', '-sw', pat, ru]), Command('ag (ASCII)', ['ag', '-sw', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]),
Command('grep (ASCII)', [ Command('grep (ASCII)', [
'grep', '-anw', pat, ru, 'grep', '-nw', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('ugrep (ASCII)', ['ugrep', '-nw', pat, ru]),
Command('rg', ['rg', '-nw', pat, ru]), Command('rg', ['rg', '-nw', pat, ru]),
Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE), Command('grep', ['grep', '-nw', pat, ru], env=GREP_UNICODE),
]) ])
def bench_subtitles_ru_alternate(suite_dir): def bench_subtitles_ru_alternate(suite_dir):
''' '''
Benchmark the speed of a set of alternate literals. Benchmark the speed of a set of alternate literals.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = '|'.join([ pat = '|'.join([
'Шерлок Холмс', # Sherlock Holmes 'Шерлок Холмс', # Sherlock Holmes
'Джон Уотсон', # John Watson 'Джон Уотсон', # John Watson
'Ирен Адлер', # Irene Adler 'Ирен Адлер', # Irene Adler
'инспектор Лестрейд', # Inspector Lestrade 'инспектор Лестрейд', # Inspector Lestrade
'профессор Мориарти', # Professor Moriarty 'профессор Мориарти', # Professor Moriarty
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg (lines)', ['rg', '-n', pat, ru]), Command('rg (lines)', ['rg', '-n', pat, ru]),
Command('ag (lines)', ['ag', '-s', pat, ru]), Command('ag (lines)', ['ag', '-s', pat, ru]),
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (lines)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
Command('grep (lines)', [ Command('ugrep (lines)', ['ugrep', '-n', pat, ru]),
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
Command('rg', ['rg', pat, ru]), Command('rg', ['rg', pat, ru]),
Command('grep', [ Command('grep', ['grep', '-E', pat, ru], env=GREP_ASCII),
'grep', '-E', '-a', pat, ru,
], env=GREP_ASCII),
]) ])
def bench_subtitles_ru_alternate_casei(suite_dir): def bench_subtitles_ru_alternate_casei(suite_dir):
''' '''
Benchmark the speed of a set of alternate literals. Benchmark the speed of a set of alternate literals.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = '|'.join([ pat = '|'.join([
'Шерлок Холмс', # Sherlock Holmes 'Шерлок Холмс', # Sherlock Holmes
'Джон Уотсон', # John Watson 'Джон Уотсон', # John Watson
'Ирен Адлер', # Irene Adler 'Ирен Адлер', # Irene Adler
'инспектор Лестрейд', # Inspector Lestrade 'инспектор Лестрейд', # Inspector Lestrade
'профессор Мориарти', # Professor Moriarty 'профессор Мориарти', # Professor Moriarty
]) ])
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]), Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]),
Command('ucg (ASCII)', ['ucg', '-i', pat, ru]),
Command('grep (ASCII)', [ Command('grep (ASCII)', [
'grep', '-E', '-ani', pat, ru, 'grep', '-E', '-ni', pat, ru,
], env=GREP_ASCII), ], env=GREP_ASCII),
Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, ru]),
Command('rg', ['rg', '-n', '-i', pat, ru]), Command('rg', ['rg', '-n', '-i', pat, ru]),
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE), Command('grep', ['grep', '-E', '-ni', pat, ru], env=GREP_UNICODE),
]) ])
def bench_subtitles_ru_surrounding_words(suite_dir): def bench_subtitles_ru_surrounding_words(suite_dir):
''' '''
Benchmark a more complex regex with an inner literal. Benchmark a more complex regex with an inner literal.
''' '''
require(suite_dir, 'subtitles-en') require(suite_dir, 'subtitles-en')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w+\s+Холмс\s+\w+' pat = r'\w+\s+Холмс\s+\w+'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg', ['rg', '-n', pat, ru]),
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE), Command('grep', ['grep', '-E', '-n', pat, ru], env=GREP_UNICODE),
Command('ugrep', ['ugrep', '-n', pat, ru]),
Command('ag (ASCII)', ['ag', '-s', pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
Command('grep (ASCII)', [ Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]),
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
]) ])
def bench_subtitles_ru_no_literal(suite_dir): def bench_subtitles_ru_no_literal(suite_dir):
''' '''
Benchmark the speed of a regex with no literals. Benchmark the speed of a regex with no literals.
Note that we don't even try to run grep with Unicode support Note that we don't even try to run grep with Unicode support
on this one. While it should eventually get the right answer, on this one. While it should eventually get the right answer,
I killed it after it had already been running for two minutes I killed it after it had already been running for two minutes
and showed no signs of finishing soon. and showed no signs of finishing soon.
''' '''
require(suite_dir, 'subtitles-ru') require(suite_dir, 'subtitles-ru')
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME)
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}'
return Benchmark(pattern=pat, commands=[ return Benchmark(pattern=pat, commands=[
Command('rg', ['rg', '-n', pat, ru]), Command('rg', ['rg', '-n', pat, ru]),
Command('ugrep', ['ugrep', '-n', pat, ru]),
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]), Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]),
Command('ag (ASCII)', ['ag', '-s', pat, ru]), Command('ag (ASCII)', ['ag', '-s', pat, ru]),
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII),
Command('grep (ASCII)', [ Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru])
'grep', '-E', '-an', pat, ru,
], env=GREP_ASCII),
]) ])
class MissingDependencies(Exception): class MissingDependencies(Exception):
''' '''
A missing dependency exception. A missing dependency exception.
This exception occurs when running a benchmark that requires a This exception occurs when running a benchmark that requires a
particular corpus that isn't available. particular corpus that isn't available.
:ivar list(str) missing_names: :ivar list(str) missing_names:
skipping to change at line 731 skipping to change at line 705
''' '''
A single benchmark corresponding to a grouping of commands. A single benchmark corresponding to a grouping of commands.
The main purpose of a benchmark is to compare the performance The main purpose of a benchmark is to compare the performance
characteristics of a group of commands. characteristics of a group of commands.
''' '''
def __init__(self, name=None, pattern=None, commands=None, def __init__(self, name=None, pattern=None, commands=None,
warmup_count=1, count=3, line_count=True, warmup_count=1, count=3, line_count=True,
allow_missing_commands=False, allow_missing_commands=False,
disabled_cmds=None): disabled_cmds=None, order=0):
''' '''
Create a single benchmark. Create a single benchmark.
A single benchmark is composed of a set of commands that are A single benchmark is composed of a set of commands that are
benchmarked and compared against one another. A benchmark may benchmarked and compared against one another. A benchmark may
have multiple commands that use the same search tool (but have multiple commands that use the same search tool (but
probably should have something differentiating them). probably should have something differentiating them).
The grouping of commands is a purely human driven process. The grouping of commands is a purely human driven process.
skipping to change at line 767 skipping to change at line 741
:param int count: :param int count:
The number of samples to collect from each command. The number of samples to collect from each command.
:param bool line_count: :param bool line_count:
When set, the lines of each search are counted and included When set, the lines of each search are counted and included
in the samples produced. in the samples produced.
:param bool allow_missing_commands: :param bool allow_missing_commands:
When set, if a command is missing, then the benchmark When set, if a command is missing, then the benchmark
will simply skip it. will simply skip it.
:param list(str) disabled_cmds: :param list(str) disabled_cmds:
A list of commands to skip. A list of commands to skip.
:param int order:
An integer indicating the sequence number of this benchmark.
''' '''
self.name = name self.name = name
self.pattern = pattern self.pattern = pattern
self.commands = commands or [] self.commands = commands or []
self.warmup_count = warmup_count self.warmup_count = warmup_count
self.count = count self.count = count
self.line_count = line_count self.line_count = line_count
self.allow_missing_commands = allow_missing_commands self.allow_missing_commands = allow_missing_commands
self.disabled_cmds = set(disabled_cmds or []) self.disabled_cmds = set(disabled_cmds or [])
self.order = order
def raise_if_missing(self): def raise_if_missing(self):
''' '''
Raises a MissingCommands exception if applicable. Raises a MissingCommands exception if applicable.
A MissingCommands exception is raised when the following A MissingCommands exception is raised when the following
criteria are met: 1) allow_missing_commands is False, and 2) at criteria are met: 1) allow_missing_commands is False, and 2) at
least one command in this benchmark could not be found on this least one command in this benchmark could not be found on this
system. system.
''' '''
skipping to change at line 868 skipping to change at line 845
Benchmark results consist of a set of samples, where each sample Benchmark results consist of a set of samples, where each sample
corresponds to a single run of a single command in the benchmark. corresponds to a single run of a single command in the benchmark.
Various statistics can be computed from these samples such as mean Various statistics can be computed from these samples such as mean
and standard deviation. and standard deviation.
''' '''
def __init__(self, benchmark): def __init__(self, benchmark):
''' '''
Create a new set of results, initially empty. Create a new set of results, initially empty.
:param Benchmarl benchmark: :param Benchmark benchmark:
The benchmark that produced these results. The benchmark that produced these results.
''' '''
self.benchmark = benchmark self.benchmark = benchmark
self.samples = [] self.samples = []
def add(self, cmd, duration, line_count=None): def add(self, cmd, duration, line_count=None):
''' '''
Add a new sample to this result set. Add a new sample to this result set.
:param Command cmd: :param Command cmd:
skipping to change at line 1055 skipping to change at line 1032
os.makedirs(subtitle_dir) os.makedirs(subtitle_dir)
if not os.path.exists(en_path): if not os.path.exists(en_path):
if not os.path.exists(en_path_gz): if not os.path.exists(en_path_gz):
run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir) run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir)
run_cmd(['gunzip', en_path_gz]) run_cmd(['gunzip', en_path_gz])
if not os.path.exists(en_path_sample): if not os.path.exists(en_path_sample):
# Get a sample roughly the same size as the Russian corpus so that # Get a sample roughly the same size as the Russian corpus so that
# benchmarks finish in a reasonable time. # benchmarks finish in a reasonable time.
with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f: with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f:
run_cmd( run_cmd(
['head', '-n', '32722372', en_path], ['head', '-n', '55000000', en_path],
cwd=subtitle_dir, stdout=f) cwd=subtitle_dir, stdout=f)
def has_subtitles_en(suite_dir): def has_subtitles_en(suite_dir):
'Returns true if English subtitles have been downloaded.' 'Returns true if English subtitles have been downloaded.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)) return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE))
def download_subtitles_ru(suite_dir): def download_subtitles_ru(suite_dir):
'Download and decompress Russian subtitles.' 'Download and decompress Russian subtitles.'
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) subtitle_dir = path.join(suite_dir, SUBTITLES_DIR)
skipping to change at line 1125 skipping to change at line 1102
:param str suite_dir: :param str suite_dir:
The directory containing corpora. The directory containing corpora.
:param str filter_pat: :param str filter_pat:
A single regular expression that is used to filter benchmarks A single regular expression that is used to filter benchmarks
by their name. When not specified, all benchmarks are run. by their name. When not specified, all benchmarks are run.
:returns: :returns:
An iterable over all runnable benchmarks. If a benchmark An iterable over all runnable benchmarks. If a benchmark
requires corpora that are missing, then a log message is requires corpora that are missing, then a log message is
emitted to stderr and it is not yielded. emitted to stderr and it is not yielded.
''' '''
for fun in sorted(globals()): benchmarks = []
if not fun.startswith('bench_'): for global_name in globals():
if not global_name.startswith('bench_'):
continue continue
name = re.sub('^bench_', '', fun) name = re.sub('^bench_', '', global_name)
if filter_pat is not None and not re.search(filter_pat, name): if filter_pat is not None and not re.search(filter_pat, name):
continue continue
try: try:
benchmark = globals()[fun](suite_dir) fun = globals()[global_name]
benchmark = fun(suite_dir)
benchmark.name = name benchmark.name = name
benchmark.warmup_count = warmup_iter benchmark.warmup_count = warmup_iter
benchmark.count = bench_iter benchmark.count = bench_iter
benchmark.allow_missing_commands = allow_missing_commands benchmark.allow_missing_commands = allow_missing_commands
benchmark.disabled_cmds = disabled_cmds benchmark.disabled_cmds = disabled_cmds
benchmark.order = fun.__code__.co_firstlineno
benchmark.raise_if_missing() benchmark.raise_if_missing()
except MissingDependencies as e: except MissingDependencies as e:
eprint( eprint(
'missing: %s, skipping benchmark %s (try running with: %s)' % ( 'missing: %s, skipping benchmark %s (try running with: %s)' % (
', '.join(e.missing_names), ', '.join(e.missing_names),
name, name,
' '.join(['--download %s' % n for n in e.missing_names]), ' '.join(['--download %s' % n for n in e.missing_names]),
)) ))
continue continue
except MissingCommands as e: except MissingCommands as e:
fmt = 'missing commands: %s, skipping benchmark %s ' \ fmt = 'missing commands: %s, skipping benchmark %s ' \
'(run with --allow-missing to run incomplete benchmarks)' '(run with --allow-missing to run incomplete benchmarks)'
eprint(fmt % (', '.join(e.missing_names), name)) eprint(fmt % (', '.join(e.missing_names), name))
continue continue
yield benchmark benchmarks.append(benchmark)
return sorted(benchmarks, key=lambda b: b.order)
def main(): def main():
download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru'] download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru']
p = argparse.ArgumentParser('Command line search tool benchmark suite.') p = argparse.ArgumentParser('Command line search tool benchmark suite.')
p.add_argument( p.add_argument(
'--dir', metavar='PATH', default=os.getcwd(), '--dir', metavar='PATH', default=os.getcwd(),
help='The directory in which to download data and perform searches.') help='The directory in which to download data and perform searches.')
p.add_argument( p.add_argument(
'--download', metavar='CORPUS', action='append', '--download', metavar='CORPUS', action='append',
choices=download_choices, choices=download_choices,
 End of changes. 85 change blocks. 
177 lines changed or deleted 160 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)