benchsuite (ripgrep-12.1.1) | : | benchsuite (ripgrep-13.0.0) | ||
---|---|---|---|---|
skipping to change at line 26 | skipping to change at line 26 | |||
import sys | import sys | |||
import time | import time | |||
# Some constants for identifying the corpora we use to run tests. | # Some constants for identifying the corpora we use to run tests. | |||
# We establish two very different kinds of corpora: a small number of large | # We establish two very different kinds of corpora: a small number of large | |||
# files and a large number of small files. These are vastly different use cases | # files and a large number of small files. These are vastly different use cases | |||
# not only because of their performance characteristics, but also the | # not only because of their performance characteristics, but also the | |||
# strategies used to increase the relevance of results returned. | # strategies used to increase the relevance of results returned. | |||
SUBTITLES_DIR = 'subtitles' | SUBTITLES_DIR = 'subtitles' | |||
SUBTITLES_EN_NAME = 'OpenSubtitles2016.raw.en' | SUBTITLES_EN_NAME = 'en.txt' | |||
SUBTITLES_EN_NAME_SAMPLE = 'OpenSubtitles2016.raw.sample.en' | SUBTITLES_EN_NAME_SAMPLE = 'en.sample.txt' | |||
SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME | SUBTITLES_EN_NAME_GZ = '%s.gz' % SUBTITLES_EN_NAME | |||
SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa | # SUBTITLES_EN_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.en.gz' # noqa | |||
SUBTITLES_RU_NAME = 'OpenSubtitles2016.raw.ru' | SUBTITLES_EN_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/en.txt.gz' # noqa | |||
SUBTITLES_RU_NAME = 'ru.txt' | ||||
SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME | SUBTITLES_RU_NAME_GZ = '%s.gz' % SUBTITLES_RU_NAME | |||
SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa | # SUBTITLES_RU_URL = 'http://opus.lingfil.uu.se/OpenSubtitles2016/mono/OpenSubtitles2016.raw.ru.gz' # noqa | |||
SUBTITLES_RU_URL = 'https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/ru.txt.gz' # noqa | ||||
LINUX_DIR = 'linux' | LINUX_DIR = 'linux' | |||
LINUX_CLONE = 'git://github.com/BurntSushi/linux' | LINUX_CLONE = 'git://github.com/BurntSushi/linux' | |||
# Grep takes locale settings from the environment. There is a *substantial* | # Grep takes locale settings from the environment. There is a *substantial* | |||
# performance impact for enabling Unicode, so we need to handle this explicitly | # performance impact for enabling Unicode, so we need to handle this explicitly | |||
# in our benchmarks. | # in our benchmarks. | |||
GREP_ASCII = {'LC_ALL': 'C'} | GREP_ASCII = {'LC_ALL': 'C'} | |||
GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'} | GREP_UNICODE = {'LC_ALL': 'en_US.UTF-8'} | |||
skipping to change at line 57 | skipping to change at line 59 | |||
'--binary-skip', | '--binary-skip', | |||
'--exclude-files', '.*', | '--exclude-files', '.*', | |||
'--exclude-files', '*.pdf', | '--exclude-files', '*.pdf', | |||
] | ] | |||
def bench_linux_literal_default(suite_dir): | def bench_linux_literal_default(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a literal using *default* settings. | Benchmark the speed of a literal using *default* settings. | |||
This is a purposefully unfair benchmark for use in performance | This is a purposefully unfair benchmark for use in performance | |||
analysis, but it is pedagogically useful to demonstrate how | analysis, but it is pedagogically useful to demonstrate how default | |||
default behaviors differ. | behaviors differ. For example, ugrep and grep don't do any smart | |||
filtering by default, so they will invariably search more files | ||||
than ripgrep, ag or git grep. | ||||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'PM_RESUME' | pat = 'PM_RESUME' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg', ['rg', pat]), | mkcmd('rg', ['rg', pat]), | |||
mkcmd('ag', ['ag', pat]), | mkcmd('ag', ['ag', pat]), | |||
# ucg reports the exact same matches as ag and rg even though it | ||||
# doesn't read gitignore files. Instead, it has a file whitelist | ||||
# that happens to match up exactly with the gitignores for this search. | ||||
mkcmd('ucg', ['ucg', pat]), | ||||
# I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the | # I guess setting LC_ALL=en_US.UTF-8 probably isn't necessarily the | |||
# default, but I'd guess it to be on most desktop systems. | # default, but I'd guess it to be on most desktop systems. | |||
mkcmd('pt', ['pt', pat]), | mkcmd('git grep', ['git', 'grep', pat], env=GREP_UNICODE), | |||
# sift reports an extra line here for a binary file matched. | mkcmd('ugrep', ['ugrep', '-r', pat, './']), | |||
mkcmd('sift', ['sift', pat]), | mkcmd('grep', ['grep', '-r', pat, './'], env=GREP_UNICODE), | |||
mkcmd('git grep', ['git', 'grep', pat], env={'LC_ALL': 'en_US.UTF-8'}), | ||||
]) | ]) | |||
def bench_linux_literal(suite_dir): | def bench_linux_literal(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a literal, attempting to be fair. | Benchmark the speed of a literal, attempting to be fair. | |||
This tries to use the minimum set of options available in all tools | This tries to use the minimum set of options available in all tools | |||
to test how fast they are. For example, it makes sure there is | to test how fast they are. For example, it makes sure there is | |||
no case insensitive matching and that line numbers are computed | no case insensitive matching and that line numbers are computed | |||
(because some tools don't permit disabling line numbers). | (because some tools don't permit disabling line numbers). | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'PM_RESUME' | pat = 'PM_RESUME' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '--mmap', pat]), | mkcmd('rg (mmap)', ['rg', '-n', '--mmap', pat]), | |||
mkcmd('ag (ignore) (mmap)', ['ag', '-s', pat]), | mkcmd('ag (mmap)', ['ag', '-s', pat]), | |||
mkcmd('pt (ignore)', ['pt', pat]), | mkcmd('git grep', [ | |||
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]), | ||||
mkcmd('git grep (ignore)', [ | ||||
'git', 'grep', '-I', '-n', pat, | 'git', 'grep', '-I', '-n', pat, | |||
], env={'LC_ALL': 'C'}), | ], env={'LC_ALL': 'C'}), | |||
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), | mkcmd('ugrep', [ | |||
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_linux_literal_casei(suite_dir): | def bench_linux_literal_casei(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a case insensitive literal search. | Benchmark the speed of a case insensitive literal search. | |||
This is like the linux_literal benchmark, except we ask the | This is like the linux_literal benchmark, except we ask the | |||
search tools to do case insensitive search. | search tools to do case insensitive search. | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'PM_RESUME' | pat = 'PM_RESUME' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), | mkcmd('rg', ['rg', '-n', '-i', pat]), | |||
mkcmd('rg (ignore) (mmap)', ['rg', '-n', '-i', '--mmap', pat]), | mkcmd('rg (mmap)', ['rg', '-n', '-i', '--mmap', pat]), | |||
mkcmd('ag (ignore) (mmap)', ['ag', '-i', pat]), | mkcmd('ag (mmap)', ['ag', '-i', pat]), | |||
mkcmd('pt (ignore)', ['pt', '-i', pat]), | ||||
mkcmd('sift (ignore)', SIFT + ['-n', '-i', '--git', pat]), | ||||
# It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here, | # It'd technically be more appropriate to set LC_ALL=en_US.UTF-8 here, | |||
# since that is certainly what ripgrep is doing, but this is for an | # since that is certainly what ripgrep is doing, but this is for an | |||
# ASCII literal, so we should give `git grep` all the opportunity to | # ASCII literal, so we should give `git grep` all the opportunity to | |||
# do its best. | # do its best. | |||
mkcmd('git grep (ignore)', [ | mkcmd('git grep', [ | |||
'git', 'grep', '-I', '-n', '-i', pat, | 'git', 'grep', '-I', '-n', '-i', pat, | |||
], env={'LC_ALL': 'C'}), | ], env={'LC_ALL': 'C'}), | |||
mkcmd('rg (whitelist)', [ | mkcmd('ugrep', [ | |||
'rg', '-n', '-i', '--no-ignore', '-tall', pat, | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
]), | '-n', '-i', pat, './', | |||
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), | ]) | |||
]) | ]) | |||
def bench_linux_re_literal_suffix(suite_dir): | def bench_linux_re_literal_suffix(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a literal inside a regex. | Benchmark the speed of a literal inside a regex. | |||
This, for example, inhibits a prefix byte optimization used | ||||
inside of Go's regex engine (relevant for sift and pt). | ||||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = '[A-Z]+_RESUME' | pat = '[A-Z]+_RESUME' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('ag (ignore)', ['ag', '-s', pat]), | mkcmd('ag', ['ag', '-s', pat]), | |||
mkcmd('pt (ignore)', ['pt', '-e', pat]), | ||||
mkcmd('sift (ignore)', SIFT + ['-n', '--git', pat]), | ||||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), | mkcmd('ugrep', [ | |||
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_linux_word(suite_dir): | def bench_linux_word(suite_dir): | |||
''' | ''' | |||
Benchmark use of the -w ("match word") flag in each tool. | Benchmark use of the -w ("match word") flag in each tool. | |||
sift has a lot of trouble with this because it forces it into Go's | ||||
regex engine by surrounding the pattern with \b assertions. | ||||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'PM_RESUME' | pat = 'PM_RESUME' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', '-w', pat]), | mkcmd('rg', ['rg', '-n', '-w', pat]), | |||
mkcmd('ag (ignore)', ['ag', '-s', '-w', pat]), | mkcmd('ag', ['ag', '-s', '-w', pat]), | |||
mkcmd('pt (ignore)', ['pt', '-w', pat]), | ||||
mkcmd('sift (ignore)', SIFT + ['-n', '-w', '--git', pat]), | ||||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', '-w', pat], | ['git', 'grep', '-E', '-I', '-n', '-w', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', [ | mkcmd('ugrep', [ | |||
'rg', '-n', '-w', '--no-ignore', '-tall', pat, | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
]), | '-n', '-w', pat, './', | |||
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', '-w', pat]), | ]) | |||
]) | ]) | |||
def bench_linux_unicode_greek(suite_dir): | def bench_linux_unicode_greek(suite_dir): | |||
''' | ''' | |||
Benchmark matching of a Unicode category. | Benchmark matching of a Unicode category. | |||
Only three tools (ripgrep, sift and pt) support this. We omit | ||||
pt because it is too slow. | ||||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = r'\p{Greek}' | pat = r'\p{Greek}' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('pt', ['pt', '-e', pat]), | mkcmd('ugrep', [ | |||
mkcmd('sift', SIFT + ['-n', '--git', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_linux_unicode_greek_casei(suite_dir): | def bench_linux_unicode_greek_casei(suite_dir): | |||
''' | ''' | |||
Benchmark matching of a Unicode category, case insensitively. | Benchmark matching of a Unicode category, case insensitively. | |||
Only ripgrep gets this right (and it's still fast). | Only ripgrep gets this right (and it's still fast). | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = r'\p{Greek}' | pat = r'\p{Greek}' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg', ['rg', '-n', '-i', pat]), | mkcmd('rg', ['rg', '-n', '-i', pat]), | |||
mkcmd('pt', ['pt', '-i', '-e', pat]), | mkcmd('ugrep', [ | |||
mkcmd('sift', SIFT + ['-n', '-i', '--git', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', '-i', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_linux_unicode_word(suite_dir): | def bench_linux_unicode_word(suite_dir): | |||
''' | ''' | |||
Benchmark Unicode aware \w character class. | Benchmark Unicode aware \\w character class. | |||
Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get | Only ripgrep and git-grep (with LC_ALL=en_US.UTF-8) actually get | |||
this right. Everything else uses the standard ASCII interpretation | this right. Everything else uses the standard ASCII interpretation | |||
of \w. | of \\w. | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = r'\wAh' | pat = r'\wAh' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), | mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]), | |||
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), | mkcmd('ag (ASCII)', ['ag', '-s', pat]), | |||
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]), | ||||
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]), | ||||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'en_US.UTF-8'}, | env={'LC_ALL': 'en_US.UTF-8'}, | |||
), | ), | |||
mkcmd( | mkcmd( | |||
'git grep (ignore) (ASCII)', | 'git grep (ASCII)', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), | mkcmd('ugrep', [ | |||
mkcmd('rg (whitelist) (ASCII)', [ | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, | '-n', pat, './', | |||
]), | ||||
mkcmd('ugrep (ASCII)', [ | ||||
'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | ||||
'-n', '-U', pat, './', | ||||
]), | ]), | |||
mkcmd('ucg (ASCII)', ['ucg', '--nosmart-case', pat]), | ||||
]) | ]) | |||
def bench_linux_no_literal(suite_dir): | def bench_linux_no_literal(suite_dir): | |||
''' | ''' | |||
Benchmark a regex that defeats all literal optimizations. | Benchmark a regex that defeats all literal optimizations. | |||
Most search patterns have some kind of literal in them, which | Most search patterns have some kind of literal in them, which | |||
typically permits searches to take some shortcuts. Therefore, the | typically permits searches to take some shortcuts. Therefore, the | |||
applicability of this benchmark is somewhat suspicious, but the | applicability of this benchmark is somewhat suspicious, but the | |||
suite wouldn't feel complete without it. | suite wouldn't feel complete without it. | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('rg (ignore) (ASCII)', ['rg', '-n', '(?-u)' + pat]), | mkcmd('rg (ASCII)', ['rg', '-n', '(?-u)' + pat]), | |||
mkcmd('ag (ignore) (ASCII)', ['ag', '-s', pat]), | mkcmd('ag (ASCII)', ['ag', '-s', pat]), | |||
mkcmd('pt (ignore) (ASCII)', ['pt', '-e', pat]), | ||||
mkcmd('sift (ignore) (ASCII)', SIFT + ['-n', '--git', pat]), | ||||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'en_US.UTF-8'}, | env={'LC_ALL': 'en_US.UTF-8'}, | |||
), | ), | |||
mkcmd( | mkcmd( | |||
'git grep (ignore) (ASCII)', | 'git grep (ASCII)', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', ['rg', '-n', '--no-ignore', '-tall', pat]), | mkcmd('ugrep', [ | |||
mkcmd('rg (whitelist) (ASCII)', [ | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'rg', '-n', '--no-ignore', '-tall', '(?-u)' + pat, | '-n', pat, './', | |||
]), | ||||
mkcmd('ugrep (ASCII)', [ | ||||
'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | ||||
'-n', '-U', pat, './', | ||||
]), | ]), | |||
mkcmd('ucg (whitelist) (ASCII)', ['ucg', '--nosmart-case', pat]), | ||||
]) | ]) | |||
def bench_linux_alternates(suite_dir): | def bench_linux_alternates(suite_dir): | |||
''' | ''' | |||
Benchmark a small alternation of literals. | Benchmark a small alternation of literals. | |||
sift doesn't make the cut. It's more than 10x slower than the next | sift doesn't make the cut. It's more than 10x slower than the next | |||
fastest result. The slowdown is likely because the Go regexp engine | fastest result. The slowdown is likely because the Go regexp engine | |||
doesn't do any literal optimizations for this case (there is no | doesn't do any literal optimizations for this case (there is no | |||
common leading byte). | common leading byte). | |||
''' | ''' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' | pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', pat]), | mkcmd('rg', ['rg', '-n', pat]), | |||
mkcmd('ag (ignore)', ['ag', '-s', pat]), | mkcmd('ag', ['ag', '-s', pat]), | |||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', pat], | ['git', 'grep', '-E', '-I', '-n', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', pat]), | mkcmd('ugrep', [ | |||
mkcmd('ucg (whitelist)', ['ucg', '--nosmart-case', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_linux_alternates_casei(suite_dir): | def bench_linux_alternates_casei(suite_dir): | |||
'Benchmark a small alternation of literals case insensitively.' | 'Benchmark a small alternation of literals case insensitively.' | |||
require(suite_dir, 'linux') | require(suite_dir, 'linux') | |||
cwd = path.join(suite_dir, LINUX_DIR) | cwd = path.join(suite_dir, LINUX_DIR) | |||
pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' | pat = 'ERR_SYS|PME_TURN_OFF|LINK_REQ_RST|CFG_BME_EVT' | |||
def mkcmd(*args, **kwargs): | def mkcmd(*args, **kwargs): | |||
kwargs['cwd'] = cwd | kwargs['cwd'] = cwd | |||
return Command(*args, **kwargs) | return Command(*args, **kwargs) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
mkcmd('rg (ignore)', ['rg', '-n', '-i', pat]), | mkcmd('rg', ['rg', '-n', '-i', pat]), | |||
mkcmd('ag (ignore)', ['ag', '-i', pat]), | mkcmd('ag', ['ag', '-i', pat]), | |||
mkcmd( | mkcmd( | |||
'git grep (ignore)', | 'git grep', | |||
['git', 'grep', '-E', '-I', '-n', '-i', pat], | ['git', 'grep', '-E', '-I', '-n', '-i', pat], | |||
env={'LC_ALL': 'C'}, | env={'LC_ALL': 'C'}, | |||
), | ), | |||
mkcmd('rg (whitelist)', ['rg', '--no-ignore', '-n', '-i', pat]), | mkcmd('ugrep', [ | |||
mkcmd('ucg (whitelist)', ['ucg', '-i', pat]), | 'ugrep', '-r', '--ignore-files', '--no-hidden', '-I', | |||
'-n', '-i', pat, './', | ||||
]) | ||||
]) | ]) | |||
def bench_subtitles_en_literal(suite_dir): | def bench_subtitles_en_literal(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of an ASCII string literal. | Benchmark the speed of an ASCII string literal. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = 'Sherlock Holmes' | pat = 'Sherlock Holmes' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', pat, en]), | Command('rg', ['rg', pat, en]), | |||
Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]), | Command('rg (no mmap)', ['rg', '--no-mmap', pat, en]), | |||
Command('pt', ['pt', '-N', pat, en]), | Command('grep', ['grep', pat, en], env=GREP_ASCII), | |||
Command('sift', ['sift', pat, en]), | ||||
Command('grep', ['grep', '-a', pat, en], env=GREP_ASCII), | ||||
Command('rg (lines)', ['rg', '-n', pat, en]), | Command('rg (lines)', ['rg', '-n', pat, en]), | |||
Command('ag (lines)', ['ag', '-s', pat, en]), | Command('ag (lines)', ['ag', '-s', pat, en]), | |||
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), | Command('grep (lines)', ['grep', '-n', pat, en], env=GREP_ASCII), | |||
Command('pt (lines)', ['pt', pat, en]), | Command('ugrep (lines)', ['ugrep', '-n', pat, en]) | |||
Command('sift (lines)', ['sift', '-n', pat, en]), | ||||
Command('grep (lines)', ['grep', '-an', pat, en], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_en_literal_casei(suite_dir): | def bench_subtitles_en_literal_casei(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a Unicode-y string case insensitively. | Benchmark the speed of a Unicode-y string case insensitively. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = 'Sherlock Holmes' | pat = 'Sherlock Holmes' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-i', pat, en]), | Command('rg', ['rg', '-i', pat, en]), | |||
Command('grep', ['grep', '-ai', pat, en], env=GREP_UNICODE), | Command('grep', ['grep', '-i', pat, en], env=GREP_UNICODE), | |||
Command('grep (ASCII)', [ | Command('grep (ASCII)', ['grep', '-E', '-i', pat, en], env=GREP_ASCII), | |||
'grep', '-E', '-ai', pat, en, | ||||
], env=GREP_ASCII), | ||||
Command('rg (lines)', ['rg', '-n', '-i', pat, en]), | Command('rg (lines)', ['rg', '-n', '-i', pat, en]), | |||
Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]), | Command('ag (lines) (ASCII)', ['ag', '-i', pat, en]), | |||
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, en]), | Command('ugrep (lines)', ['ugrep', '-n', '-i', pat, en]) | |||
]) | ]) | |||
def bench_subtitles_en_literal_word(suite_dir): | def bench_subtitles_en_literal_word(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of finding a literal inside word boundaries. | Benchmark the speed of finding a literal inside word boundaries. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = 'Sherlock Holmes' | pat = 'Sherlock Holmes' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg (ASCII)', [ | Command('rg (ASCII)', [ | |||
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en, | 'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', en, | |||
]), | ]), | |||
Command('ag (ASCII)', ['ag', '-sw', pat, en]), | Command('ag (ASCII)', ['ag', '-sw', pat, en]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), | Command('grep (ASCII)', ['grep', '-nw', pat, en], env=GREP_ASCII), | |||
Command('grep (ASCII)', [ | Command('ugrep (ASCII)', ['ugrep', '-nw', pat, en]), | |||
'grep', '-anw', pat, en, | ||||
], env=GREP_ASCII), | ||||
Command('rg', ['rg', '-nw', pat, en]), | Command('rg', ['rg', '-nw', pat, en]), | |||
Command('grep', ['grep', '-anw', pat, en], env=GREP_UNICODE), | Command('grep', ['grep', '-nw', pat, en], env=GREP_UNICODE), | |||
]) | ]) | |||
def bench_subtitles_en_alternate(suite_dir): | def bench_subtitles_en_alternate(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a set of alternate literals. | Benchmark the speed of a set of alternate literals. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = '|'.join([ | pat = '|'.join([ | |||
'Sherlock Holmes', | 'Sherlock Holmes', | |||
'John Watson', | 'John Watson', | |||
'Irene Adler', | 'Irene Adler', | |||
'Inspector Lestrade', | 'Inspector Lestrade', | |||
'Professor Moriarty', | 'Professor Moriarty', | |||
]) | ]) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg (lines)', ['rg', '-n', pat, en]), | Command('rg (lines)', ['rg', '-n', pat, en]), | |||
Command('ag (lines)', ['ag', '-s', pat, en]), | Command('ag (lines)', ['ag', '-s', pat, en]), | |||
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, en]), | Command('grep (lines)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII), | |||
Command('grep (lines)', [ | Command('ugrep (lines)', ['ugrep', '-n', pat, en]), | |||
'grep', '-E', '-an', pat, en, | ||||
], env=GREP_ASCII), | ||||
Command('rg', ['rg', pat, en]), | Command('rg', ['rg', pat, en]), | |||
Command('grep', [ | Command('grep', ['grep', '-E', pat, en], env=GREP_ASCII), | |||
'grep', '-E', '-a', pat, en, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_en_alternate_casei(suite_dir): | def bench_subtitles_en_alternate_casei(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a set of alternate literals. | Benchmark the speed of a set of alternate literals. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = '|'.join([ | pat = '|'.join([ | |||
'Sherlock Holmes', | 'Sherlock Holmes', | |||
'John Watson', | 'John Watson', | |||
'Irene Adler', | 'Irene Adler', | |||
'Inspector Lestrade', | 'Inspector Lestrade', | |||
'Professor Moriarty', | 'Professor Moriarty', | |||
]) | ]) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]), | Command('ag (ASCII)', ['ag', '-s', '-i', pat, en]), | |||
Command('ucg (ASCII)', ['ucg', '-i', pat, en]), | ||||
Command('grep (ASCII)', [ | Command('grep (ASCII)', [ | |||
'grep', '-E', '-ani', pat, en, | 'grep', '-E', '-ni', pat, en, | |||
], env=GREP_ASCII), | ], env=GREP_ASCII), | |||
Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, en]), | ||||
Command('rg', ['rg', '-n', '-i', pat, en]), | Command('rg', ['rg', '-n', '-i', pat, en]), | |||
Command('grep', ['grep', '-E', '-ani', pat, en], env=GREP_UNICODE), | Command('grep', ['grep', '-E', '-ni', pat, en], env=GREP_UNICODE), | |||
]) | ]) | |||
def bench_subtitles_en_surrounding_words(suite_dir): | def bench_subtitles_en_surrounding_words(suite_dir): | |||
''' | ''' | |||
Benchmark a more complex regex with an inner literal. | Benchmark a more complex regex with an inner literal. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = r'\w+\s+Holmes\s+\w+' | pat = r'\w+\s+Holmes\s+\w+' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-n', pat, en]), | Command('rg', ['rg', '-n', pat, en]), | |||
Command('grep', ['grep', '-E', '-an', pat, en], env=GREP_UNICODE), | Command('grep', ['grep', '-E', '-n', pat, en], env=GREP_UNICODE), | |||
Command('ugrep', ['ugrep', '-n', pat, en]), | ||||
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), | Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), | |||
Command('ag (ASCII)', ['ag', '-s', pat, en]), | Command('ag (ASCII)', ['ag', '-s', pat, en]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), | Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII), | |||
Command('grep (ASCII)', [ | Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en]) | |||
'grep', '-E', '-an', pat, en, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_en_no_literal(suite_dir): | def bench_subtitles_en_no_literal(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a regex with no literals. | Benchmark the speed of a regex with no literals. | |||
Note that we don't even try to run grep with Unicode support | Note that we don't even try to run grep with Unicode support | |||
on this one. While it should eventually get the right answer, | on this one. While it should eventually get the right answer, | |||
I killed it after it had already been running for two minutes | I killed it after it had already been running for two minutes | |||
and showed no signs of finishing soon. | and showed no signs of finishing soon. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | en = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_EN_NAME_SAMPLE) | |||
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-n', pat, en]), | Command('rg', ['rg', '-n', pat, en]), | |||
Command('ugrep', ['ugrep', '-n', pat, en]), | ||||
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), | Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, en]), | |||
Command('ag (ASCII)', ['ag', '-s', pat, en]), | Command('ag (ASCII)', ['ag', '-s', pat, en]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, en]), | Command('grep (ASCII)', ['grep', '-E', '-n', pat, en], env=GREP_ASCII), | |||
Command('grep (ASCII)', [ | Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, en]) | |||
'grep', '-E', '-an', pat, en, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_ru_literal(suite_dir): | def bench_subtitles_ru_literal(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a Unicode-y string literal. | Benchmark the speed of a Unicode-y string literal. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = 'Шерлок Холмс' # Sherlock Holmes | pat = 'Шерлок Холмс' # Sherlock Holmes | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', pat, ru]), | Command('rg', ['rg', pat, ru]), | |||
Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]), | Command('rg (no mmap)', ['rg', '--no-mmap', pat, ru]), | |||
Command('pt', ['pt', '-N', pat, ru]), | Command('grep', ['grep', pat, ru], env=GREP_ASCII), | |||
Command('sift', ['sift', pat, ru]), | ||||
Command('grep', ['grep', '-a', pat, ru], env=GREP_ASCII), | ||||
Command('rg (lines)', ['rg', '-n', pat, ru]), | Command('rg (lines)', ['rg', '-n', pat, ru]), | |||
Command('ag (lines)', ['ag', '-s', pat, ru]), | Command('ag (lines)', ['ag', '-s', pat, ru]), | |||
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), | Command('grep (lines)', ['grep', '-n', pat, ru], env=GREP_ASCII), | |||
Command('pt (lines)', ['pt', pat, ru]), | Command('ugrep (lines)', ['ugrep', '-n', pat, ru]) | |||
Command('sift (lines)', ['sift', '-n', pat, ru]), | ||||
Command('grep (lines)', ['grep', '-an', pat, ru], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_ru_literal_casei(suite_dir): | def bench_subtitles_ru_literal_casei(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a Unicode-y string case insensitively. | Benchmark the speed of a Unicode-y string case insensitively. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = 'Шерлок Холмс' # Sherlock Holmes | pat = 'Шерлок Холмс' # Sherlock Holmes | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-i', pat, ru]), | Command('rg', ['rg', '-i', pat, ru]), | |||
Command('grep', ['grep', '-ai', pat, ru], env=GREP_UNICODE), | Command('grep', ['grep', '-i', pat, ru], env=GREP_UNICODE), | |||
Command('grep (ASCII)', [ | Command('grep (ASCII)', ['grep', '-E', '-i', pat, ru], env=GREP_ASCII), | |||
'grep', '-E', '-ai', pat, ru, | ||||
], env=GREP_ASCII), | ||||
Command('rg (lines)', ['rg', '-n', '-i', pat, ru]), | Command('rg (lines)', ['rg', '-n', '-i', pat, ru]), | |||
Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]), | Command('ag (lines) (ASCII)', ['ag', '-i', pat, ru]), | |||
Command('ucg (lines) (ASCII)', ['ucg', '-i', pat, ru]), | Command('ugrep (lines) (ASCII)', ['ugrep', '-n', '-i', pat, ru]) | |||
]) | ]) | |||
def bench_subtitles_ru_literal_word(suite_dir): | def bench_subtitles_ru_literal_word(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of finding a literal inside word boundaries. | Benchmark the speed of finding a literal inside word boundaries. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = 'Шерлок Холмс' # Sherlock Holmes | pat = 'Шерлок Холмс' # Sherlock Holmes | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg (ASCII)', [ | Command('rg (ASCII)', [ | |||
'rg', '-n', r'(?-u:\b)' + pat + r'(?-u:\b)', ru, | # You might think we'd use \b here for word boundaries, but both | |||
# GNU grep and ripgrep implement -w with the formulation below. | ||||
# Since we can't use Unicode in a pattern and disable Unicode word | ||||
# boundaries, we just hand-jam this ourselves. | ||||
'rg', '-n', r'(?-u:^|\W)' + pat + r'(?-u:$|\W)', ru, | ||||
]), | ]), | |||
Command('ag (ASCII)', ['ag', '-sw', pat, ru]), | Command('ag (ASCII)', ['ag', '-sw', pat, ru]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), | ||||
Command('grep (ASCII)', [ | Command('grep (ASCII)', [ | |||
'grep', '-anw', pat, ru, | 'grep', '-nw', pat, ru, | |||
], env=GREP_ASCII), | ], env=GREP_ASCII), | |||
Command('ugrep (ASCII)', ['ugrep', '-nw', pat, ru]), | ||||
Command('rg', ['rg', '-nw', pat, ru]), | Command('rg', ['rg', '-nw', pat, ru]), | |||
Command('grep', ['grep', '-anw', pat, ru], env=GREP_UNICODE), | Command('grep', ['grep', '-nw', pat, ru], env=GREP_UNICODE), | |||
]) | ]) | |||
def bench_subtitles_ru_alternate(suite_dir): | def bench_subtitles_ru_alternate(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a set of alternate literals. | Benchmark the speed of a set of alternate literals. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = '|'.join([ | pat = '|'.join([ | |||
'Шерлок Холмс', # Sherlock Holmes | 'Шерлок Холмс', # Sherlock Holmes | |||
'Джон Уотсон', # John Watson | 'Джон Уотсон', # John Watson | |||
'Ирен Адлер', # Irene Adler | 'Ирен Адлер', # Irene Adler | |||
'инспектор Лестрейд', # Inspector Lestrade | 'инспектор Лестрейд', # Inspector Lestrade | |||
'профессор Мориарти', # Professor Moriarty | 'профессор Мориарти', # Professor Moriarty | |||
]) | ]) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg (lines)', ['rg', '-n', pat, ru]), | Command('rg (lines)', ['rg', '-n', pat, ru]), | |||
Command('ag (lines)', ['ag', '-s', pat, ru]), | Command('ag (lines)', ['ag', '-s', pat, ru]), | |||
Command('ucg (lines)', ['ucg', '--nosmart-case', pat, ru]), | Command('grep (lines)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), | |||
Command('grep (lines)', [ | Command('ugrep (lines)', ['ugrep', '-n', pat, ru]), | |||
'grep', '-E', '-an', pat, ru, | ||||
], env=GREP_ASCII), | ||||
Command('rg', ['rg', pat, ru]), | Command('rg', ['rg', pat, ru]), | |||
Command('grep', [ | Command('grep', ['grep', '-E', pat, ru], env=GREP_ASCII), | |||
'grep', '-E', '-a', pat, ru, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_ru_alternate_casei(suite_dir): | def bench_subtitles_ru_alternate_casei(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a set of alternate literals. | Benchmark the speed of a set of alternate literals. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = '|'.join([ | pat = '|'.join([ | |||
'Шерлок Холмс', # Sherlock Holmes | 'Шерлок Холмс', # Sherlock Holmes | |||
'Джон Уотсон', # John Watson | 'Джон Уотсон', # John Watson | |||
'Ирен Адлер', # Irene Adler | 'Ирен Адлер', # Irene Adler | |||
'инспектор Лестрейд', # Inspector Lestrade | 'инспектор Лестрейд', # Inspector Lestrade | |||
'профессор Мориарти', # Professor Moriarty | 'профессор Мориарти', # Professor Moriarty | |||
]) | ]) | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]), | Command('ag (ASCII)', ['ag', '-s', '-i', pat, ru]), | |||
Command('ucg (ASCII)', ['ucg', '-i', pat, ru]), | ||||
Command('grep (ASCII)', [ | Command('grep (ASCII)', [ | |||
'grep', '-E', '-ani', pat, ru, | 'grep', '-E', '-ni', pat, ru, | |||
], env=GREP_ASCII), | ], env=GREP_ASCII), | |||
Command('ugrep (ASCII)', ['ugrep', '-n', '-i', pat, ru]), | ||||
Command('rg', ['rg', '-n', '-i', pat, ru]), | Command('rg', ['rg', '-n', '-i', pat, ru]), | |||
Command('grep', ['grep', '-E', '-ani', pat, ru], env=GREP_UNICODE), | Command('grep', ['grep', '-E', '-ni', pat, ru], env=GREP_UNICODE), | |||
]) | ]) | |||
def bench_subtitles_ru_surrounding_words(suite_dir): | def bench_subtitles_ru_surrounding_words(suite_dir): | |||
''' | ''' | |||
Benchmark a more complex regex with an inner literal. | Benchmark a more complex regex with an inner literal. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-en') | require(suite_dir, 'subtitles-en') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = r'\w+\s+Холмс\s+\w+' | pat = r'\w+\s+Холмс\s+\w+' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-n', pat, ru]), | Command('rg', ['rg', '-n', pat, ru]), | |||
Command('grep', ['grep', '-E', '-an', pat, ru], env=GREP_UNICODE), | Command('grep', ['grep', '-E', '-n', pat, ru], env=GREP_UNICODE), | |||
Command('ugrep', ['ugrep', '-n', pat, ru]), | ||||
Command('ag (ASCII)', ['ag', '-s', pat, ru]), | Command('ag (ASCII)', ['ag', '-s', pat, ru]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), | Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), | |||
Command('grep (ASCII)', [ | Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]), | |||
'grep', '-E', '-an', pat, ru, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
def bench_subtitles_ru_no_literal(suite_dir): | def bench_subtitles_ru_no_literal(suite_dir): | |||
''' | ''' | |||
Benchmark the speed of a regex with no literals. | Benchmark the speed of a regex with no literals. | |||
Note that we don't even try to run grep with Unicode support | Note that we don't even try to run grep with Unicode support | |||
on this one. While it should eventually get the right answer, | on this one. While it should eventually get the right answer, | |||
I killed it after it had already been running for two minutes | I killed it after it had already been running for two minutes | |||
and showed no signs of finishing soon. | and showed no signs of finishing soon. | |||
''' | ''' | |||
require(suite_dir, 'subtitles-ru') | require(suite_dir, 'subtitles-ru') | |||
ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | ru = path.join(suite_dir, SUBTITLES_DIR, SUBTITLES_RU_NAME) | |||
pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | pat = r'\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}\s+\w{5}' | |||
return Benchmark(pattern=pat, commands=[ | return Benchmark(pattern=pat, commands=[ | |||
Command('rg', ['rg', '-n', pat, ru]), | Command('rg', ['rg', '-n', pat, ru]), | |||
Command('ugrep', ['ugrep', '-n', pat, ru]), | ||||
Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]), | Command('rg (ASCII)', ['rg', '-n', '(?-u)' + pat, ru]), | |||
Command('ag (ASCII)', ['ag', '-s', pat, ru]), | Command('ag (ASCII)', ['ag', '-s', pat, ru]), | |||
Command('ucg (ASCII)', ['ucg', '--nosmart-case', pat, ru]), | Command('grep (ASCII)', ['grep', '-E', '-n', pat, ru], env=GREP_ASCII), | |||
Command('grep (ASCII)', [ | Command('ugrep (ASCII)', ['ugrep', '-n', '-U', pat, ru]) | |||
'grep', '-E', '-an', pat, ru, | ||||
], env=GREP_ASCII), | ||||
]) | ]) | |||
class MissingDependencies(Exception): | class MissingDependencies(Exception): | |||
''' | ''' | |||
A missing dependency exception. | A missing dependency exception. | |||
This exception occurs when running a benchmark that requires a | This exception occurs when running a benchmark that requires a | |||
particular corpus that isn't available. | particular corpus that isn't available. | |||
:ivar list(str) missing_names: | :ivar list(str) missing_names: | |||
skipping to change at line 731 | skipping to change at line 705 | |||
''' | ''' | |||
A single benchmark corresponding to a grouping of commands. | A single benchmark corresponding to a grouping of commands. | |||
The main purpose of a benchmark is to compare the performance | The main purpose of a benchmark is to compare the performance | |||
characteristics of a group of commands. | characteristics of a group of commands. | |||
''' | ''' | |||
def __init__(self, name=None, pattern=None, commands=None, | def __init__(self, name=None, pattern=None, commands=None, | |||
warmup_count=1, count=3, line_count=True, | warmup_count=1, count=3, line_count=True, | |||
allow_missing_commands=False, | allow_missing_commands=False, | |||
disabled_cmds=None): | disabled_cmds=None, order=0): | |||
''' | ''' | |||
Create a single benchmark. | Create a single benchmark. | |||
A single benchmark is composed of a set of commands that are | A single benchmark is composed of a set of commands that are | |||
benchmarked and compared against one another. A benchmark may | benchmarked and compared against one another. A benchmark may | |||
have multiple commands that use the same search tool (but | have multiple commands that use the same search tool (but | |||
probably should have something differentiating them). | probably should have something differentiating them). | |||
The grouping of commands is a purely human driven process. | The grouping of commands is a purely human driven process. | |||
skipping to change at line 767 | skipping to change at line 741 | |||
:param int count: | :param int count: | |||
The number of samples to collect from each command. | The number of samples to collect from each command. | |||
:param bool line_count: | :param bool line_count: | |||
When set, the lines of each search are counted and included | When set, the lines of each search are counted and included | |||
in the samples produced. | in the samples produced. | |||
:param bool allow_missing_commands: | :param bool allow_missing_commands: | |||
When set, if a command is missing, then the benchmark | When set, if a command is missing, then the benchmark | |||
will simply skip it. | will simply skip it. | |||
:param list(str) disabled_cmds: | :param list(str) disabled_cmds: | |||
A list of commands to skip. | A list of commands to skip. | |||
:param int order: | ||||
An integer indicating the sequence number of this benchmark. | ||||
''' | ''' | |||
self.name = name | self.name = name | |||
self.pattern = pattern | self.pattern = pattern | |||
self.commands = commands or [] | self.commands = commands or [] | |||
self.warmup_count = warmup_count | self.warmup_count = warmup_count | |||
self.count = count | self.count = count | |||
self.line_count = line_count | self.line_count = line_count | |||
self.allow_missing_commands = allow_missing_commands | self.allow_missing_commands = allow_missing_commands | |||
self.disabled_cmds = set(disabled_cmds or []) | self.disabled_cmds = set(disabled_cmds or []) | |||
self.order = order | ||||
def raise_if_missing(self): | def raise_if_missing(self): | |||
''' | ''' | |||
Raises a MissingCommands exception if applicable. | Raises a MissingCommands exception if applicable. | |||
A MissingCommands exception is raised when the following | A MissingCommands exception is raised when the following | |||
criteria are met: 1) allow_missing_commands is False, and 2) at | criteria are met: 1) allow_missing_commands is False, and 2) at | |||
least one command in this benchmark could not be found on this | least one command in this benchmark could not be found on this | |||
system. | system. | |||
''' | ''' | |||
skipping to change at line 868 | skipping to change at line 845 | |||
Benchmark results consist of a set of samples, where each sample | Benchmark results consist of a set of samples, where each sample | |||
corresponds to a single run of a single command in the benchmark. | corresponds to a single run of a single command in the benchmark. | |||
Various statistics can be computed from these samples such as mean | Various statistics can be computed from these samples such as mean | |||
and standard deviation. | and standard deviation. | |||
''' | ''' | |||
def __init__(self, benchmark): | def __init__(self, benchmark): | |||
''' | ''' | |||
Create a new set of results, initially empty. | Create a new set of results, initially empty. | |||
:param Benchmarl benchmark: | :param Benchmark benchmark: | |||
The benchmark that produced these results. | The benchmark that produced these results. | |||
''' | ''' | |||
self.benchmark = benchmark | self.benchmark = benchmark | |||
self.samples = [] | self.samples = [] | |||
def add(self, cmd, duration, line_count=None): | def add(self, cmd, duration, line_count=None): | |||
''' | ''' | |||
Add a new sample to this result set. | Add a new sample to this result set. | |||
:param Command cmd: | :param Command cmd: | |||
skipping to change at line 1055 | skipping to change at line 1032 | |||
os.makedirs(subtitle_dir) | os.makedirs(subtitle_dir) | |||
if not os.path.exists(en_path): | if not os.path.exists(en_path): | |||
if not os.path.exists(en_path_gz): | if not os.path.exists(en_path_gz): | |||
run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir) | run_cmd(['curl', '-LO', SUBTITLES_EN_URL], cwd=subtitle_dir) | |||
run_cmd(['gunzip', en_path_gz]) | run_cmd(['gunzip', en_path_gz]) | |||
if not os.path.exists(en_path_sample): | if not os.path.exists(en_path_sample): | |||
# Get a sample roughly the same size as the Russian corpus so that | # Get a sample roughly the same size as the Russian corpus so that | |||
# benchmarks finish in a reasonable time. | # benchmarks finish in a reasonable time. | |||
with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f: | with open(path.join(subtitle_dir, en_path_sample), 'wb+') as f: | |||
run_cmd( | run_cmd( | |||
['head', '-n', '32722372', en_path], | ['head', '-n', '55000000', en_path], | |||
cwd=subtitle_dir, stdout=f) | cwd=subtitle_dir, stdout=f) | |||
def has_subtitles_en(suite_dir): | def has_subtitles_en(suite_dir): | |||
'Returns true if English subtitles have been downloaded.' | 'Returns true if English subtitles have been downloaded.' | |||
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) | subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) | |||
return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)) | return path.exists(path.join(subtitle_dir, SUBTITLES_EN_NAME_SAMPLE)) | |||
def download_subtitles_ru(suite_dir): | def download_subtitles_ru(suite_dir): | |||
'Download and decompress Russian subtitles.' | 'Download and decompress Russian subtitles.' | |||
subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) | subtitle_dir = path.join(suite_dir, SUBTITLES_DIR) | |||
skipping to change at line 1125 | skipping to change at line 1102 | |||
:param str suite_dir: | :param str suite_dir: | |||
The directory containing corpora. | The directory containing corpora. | |||
:param str filter_pat: | :param str filter_pat: | |||
A single regular expression that is used to filter benchmarks | A single regular expression that is used to filter benchmarks | |||
by their name. When not specified, all benchmarks are run. | by their name. When not specified, all benchmarks are run. | |||
:returns: | :returns: | |||
An iterable over all runnable benchmarks. If a benchmark | An iterable over all runnable benchmarks. If a benchmark | |||
requires corpora that are missing, then a log message is | requires corpora that are missing, then a log message is | |||
emitted to stderr and it is not yielded. | emitted to stderr and it is not yielded. | |||
''' | ''' | |||
for fun in sorted(globals()): | benchmarks = [] | |||
if not fun.startswith('bench_'): | for global_name in globals(): | |||
if not global_name.startswith('bench_'): | ||||
continue | continue | |||
name = re.sub('^bench_', '', fun) | name = re.sub('^bench_', '', global_name) | |||
if filter_pat is not None and not re.search(filter_pat, name): | if filter_pat is not None and not re.search(filter_pat, name): | |||
continue | continue | |||
try: | try: | |||
benchmark = globals()[fun](suite_dir) | fun = globals()[global_name] | |||
benchmark = fun(suite_dir) | ||||
benchmark.name = name | benchmark.name = name | |||
benchmark.warmup_count = warmup_iter | benchmark.warmup_count = warmup_iter | |||
benchmark.count = bench_iter | benchmark.count = bench_iter | |||
benchmark.allow_missing_commands = allow_missing_commands | benchmark.allow_missing_commands = allow_missing_commands | |||
benchmark.disabled_cmds = disabled_cmds | benchmark.disabled_cmds = disabled_cmds | |||
benchmark.order = fun.__code__.co_firstlineno | ||||
benchmark.raise_if_missing() | benchmark.raise_if_missing() | |||
except MissingDependencies as e: | except MissingDependencies as e: | |||
eprint( | eprint( | |||
'missing: %s, skipping benchmark %s (try running with: %s)' % ( | 'missing: %s, skipping benchmark %s (try running with: %s)' % ( | |||
', '.join(e.missing_names), | ', '.join(e.missing_names), | |||
name, | name, | |||
' '.join(['--download %s' % n for n in e.missing_names]), | ' '.join(['--download %s' % n for n in e.missing_names]), | |||
)) | )) | |||
continue | continue | |||
except MissingCommands as e: | except MissingCommands as e: | |||
fmt = 'missing commands: %s, skipping benchmark %s ' \ | fmt = 'missing commands: %s, skipping benchmark %s ' \ | |||
'(run with --allow-missing to run incomplete benchmarks)' | '(run with --allow-missing to run incomplete benchmarks)' | |||
eprint(fmt % (', '.join(e.missing_names), name)) | eprint(fmt % (', '.join(e.missing_names), name)) | |||
continue | continue | |||
yield benchmark | benchmarks.append(benchmark) | |||
return sorted(benchmarks, key=lambda b: b.order) | ||||
def main(): | def main(): | |||
download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru'] | download_choices = ['all', 'linux', 'subtitles-en', 'subtitles-ru'] | |||
p = argparse.ArgumentParser('Command line search tool benchmark suite.') | p = argparse.ArgumentParser('Command line search tool benchmark suite.') | |||
p.add_argument( | p.add_argument( | |||
'--dir', metavar='PATH', default=os.getcwd(), | '--dir', metavar='PATH', default=os.getcwd(), | |||
help='The directory in which to download data and perform searches.') | help='The directory in which to download data and perform searches.') | |||
p.add_argument( | p.add_argument( | |||
'--download', metavar='CORPUS', action='append', | '--download', metavar='CORPUS', action='append', | |||
choices=download_choices, | choices=download_choices, | |||
End of changes. 85 change blocks. | ||||
177 lines changed or deleted | 160 lines changed or added |