"Fossies" - the Fresh Open Source Software Archive

Member "codespell-2.0.0/codespell_lib/tests/test_dictionary.py" (23 Nov 2020, 11348 Bytes) of package /linux/misc/codespell-2.0.0.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) Python syntax highlighting (style: standard) with prefixed line numbers. Alternatively, you can view or download the uninterpreted source code file here. See also the latest Fossies "Diffs" side-by-side code changes report for "test_dictionary.py": 1.17.1_vs_2.0.0.

    1 # -*- coding: utf-8 -*-
    2 
    3 import glob
    4 import os.path as op
    5 import os
    6 import re
    7 import warnings
    8 
    9 import pytest
   10 
   11 from codespell_lib._codespell import _builtin_dictionaries
   12 from codespell_lib._codespell import supported_languages
   13 
   14 spellers = dict()
   15 
   16 try:
   17     import aspell
   18     for lang in supported_languages:
   19         spellers[lang] = aspell.Speller('lang', lang)
   20 except Exception as exp:  # probably ImportError, but maybe also language
   21     if os.getenv('REQUIRE_ASPELL', 'false').lower() == 'true':
   22         raise RuntimeError(
   23             'Cannot run complete tests without aspell when '
   24             'REQUIRE_ASPELL=true. Got error during import:\n%s'
   25             % (exp,))
   26     else:
   27         warnings.warn(
   28             'aspell not found, but not required, skipping aspell tests. Got '
   29             'error during import:\n%s' % (exp,))
   30 
   31 ws = re.compile(r'.*\s.*')  # whitespace
   32 comma = re.compile(r'.*,.*')  # comma
   33 
   34 global_err_dicts = dict()
   35 global_pairs = set()
   36 
   37 # Filename, should be seen as errors in aspell or not
   38 _data_dir = op.join(op.dirname(__file__), '..', 'data')
   39 _fnames_in_aspell = [
   40     (op.join(_data_dir, 'dictionary%s.txt' % d[2]), d[3:5], d[5:7])
   41     for d in _builtin_dictionaries]
   42 fname_params = pytest.mark.parametrize('fname, in_aspell, in_dictionary', _fnames_in_aspell)  # noqa: E501
   43 
   44 
   45 def test_dictionaries_exist():
   46     """Test consistency of dictionaries."""
   47     doc_fnames = set(op.basename(f[0]) for f in _fnames_in_aspell)
   48     got_fnames = set(op.basename(f)
   49                      for f in glob.glob(op.join(_data_dir, '*.txt')))
   50     assert doc_fnames == got_fnames
   51 
   52 
   53 @fname_params
   54 def test_dictionary_formatting(fname, in_aspell, in_dictionary):
   55     """Test that all dictionary entries are valid."""
   56     errors = list()
   57     with open(fname, 'rb') as fid:
   58         for line in fid:
   59             err, rep = line.decode('utf-8').split('->')
   60             err = err.lower()
   61             rep = rep.rstrip('\n')
   62             try:
   63                 _check_err_rep(err, rep, in_aspell, fname, in_dictionary)
   64             except AssertionError as exp:
   65                 errors.append(str(exp).split('\n')[0])
   66     if len(errors):
   67         raise AssertionError('\n' + '\n'.join(errors))
   68 
   69 
   70 def _check_aspell(phrase, msg, in_aspell, fname, languages):
   71     if not spellers:  # if no spellcheckers exist
   72         return  # cannot check
   73     if in_aspell is None:
   74         return  # don't check
   75     if ' ' in phrase:
   76         for word in phrase.split():
   77             _check_aspell(word, msg, in_aspell, fname, languages)
   78         return  # stop normal checking as we've done each word above
   79     this_in_aspell = any(spellers[lang].check(phrase.encode(
   80         spellers[lang].ConfigKeys()['encoding'][1])) for lang in languages)
   81     end = 'be in aspell dictionaries (%s) for dictionary %s' % (
   82         ', '.join(languages), fname)
   83     if in_aspell:  # should be an error in aspell
   84         assert this_in_aspell, '%s should %s' % (msg, end)
   85     else:  # shouldn't be
   86         assert not this_in_aspell, '%s should not %s' % (msg, end)
   87 
   88 
   89 def _check_err_rep(err, rep, in_aspell, fname, languages):
   90     assert ws.match(err) is None, 'error %r has whitespace' % err
   91     assert comma.match(err) is None, 'error %r has a comma' % err
   92     assert len(rep) > 0, ('error %s: correction %r must be non-empty'
   93                           % (err, rep))
   94     assert not re.match(r'^\s.*', rep), ('error %s: correction %r '
   95                                          'cannot start with whitespace'
   96                                          % (err, rep))
   97     _check_aspell(err, 'error %r' % (err,), in_aspell[0], fname, languages[0])
   98     prefix = 'error %s: correction %r' % (err, rep)
   99     for (r, msg) in [
  100             (r'^,',
  101              '%s starts with a comma'),
  102             (r'\s,',
  103              '%s contains a whitespace character followed by a comma'),
  104             (r',\s\s',
  105              '%s contains a comma followed by multiple whitespace characters'),
  106             (r',[^ ]',
  107              '%s contains a comma *not* followed by a space'),
  108             (r'\s+$',
  109              '%s has a trailing space'),
  110             (r'^[^,]*,\s*$',
  111              '%s has a single entry but contains a trailing comma')]:
  112         assert not re.search(r, rep), (msg % (prefix,))
  113     del msg
  114     if rep.count(','):
  115         assert rep.endswith(','), ('error %s: multiple corrections must end '
  116                                    'with trailing ","' % (err,))
  117     reps = [r.strip() for r in rep.split(',')]
  118     reps = [r for r in reps if len(r)]
  119     for r in reps:
  120         assert err != r.lower(), ('error %r corrects to itself amongst others'
  121                                   % (err,))
  122         _check_aspell(
  123             r, 'error %s: correction %r' % (err, r),
  124             in_aspell[1], fname, languages[1])
  125 
  126     # aspell dictionary is case sensitive, so pass the original case into there
  127     # we could ignore the case, but that would miss things like days of the
  128     # week which we want to be correct
  129     reps = [r.lower() for r in reps]
  130     assert len(set(reps)) == len(reps), ('error %s: corrections "%s" are not '
  131                                          '(lower-case) unique' % (err, rep))
  132 
  133 
  134 @pytest.mark.parametrize('err, rep, match', [
  135     ('a a', 'bar', 'has whitespace'),
  136     ('a,a', 'bar', 'has a comma'),
  137     ('a', '', 'non-empty'),
  138     ('a', ' bar', 'start with whitespace'),
  139     ('a', ',bar', 'starts with a comma'),
  140     ('a', 'bar,bat', '.*not.*followed by a space'),
  141     ('a', 'bar ', 'trailing space'),
  142     ('a', 'b ,ar', 'contains a whitespace.*followed by a comma'),
  143     ('a', 'bar,', 'single entry.*comma'),
  144     ('a', 'bar, bat', 'must end with trailing ","'),
  145     ('a', 'a, bar,', 'corrects to itself amongst others'),
  146     ('a', 'a', 'corrects to itself'),
  147     ('a', 'bar, Bar,', 'unique'),
  148 ])
  149 def test_error_checking(err, rep, match):
  150     """Test that our error checking works."""
  151     with pytest.raises(AssertionError, match=match):
  152         _check_err_rep(err, rep, (None, None), 'dummy',
  153                        (supported_languages, supported_languages))
  154 
  155 
  156 @pytest.mark.skipif(not spellers, reason='requires aspell-en')
  157 @pytest.mark.parametrize('err, rep, err_aspell, rep_aspell, match', [
  158     # This doesn't raise any exceptions, so skip for now:
  159     # pytest.param('a', 'uvw, bar,', None, None, 'should be in aspell'),
  160     ('abcdef', 'uvwxyz, bar,', True, None, 'should be in aspell'),
  161     ('a', 'uvwxyz, bar,', False, None, 'should not be in aspell'),
  162     ('a', 'abcdef, uvwxyz,', None, True, 'should be in aspell'),
  163     ('abcdef', 'uvwxyz, bar,', True, True, 'should be in aspell'),
  164     ('abcdef', 'uvwxyz, bar,', False, True, 'should be in aspell'),
  165     ('a', 'bar, back,', None, False, 'should not be in aspell'),
  166     ('a', 'bar, back, Wednesday,', None, False, 'should not be in aspell'),
  167     ('abcdef', 'ghijkl, uvwxyz,', True, False, 'should be in aspell'),
  168     ('abcdef', 'uvwxyz, bar,', False, False, 'should not be in aspell'),
  169     # Multi-word corrections
  170     # One multi-word, both parts
  171     ('a', 'abcdef uvwxyz', None, True, 'should be in aspell'),
  172     ('a', 'bar back', None, False, 'should not be in aspell'),
  173     ('a', 'bar back Wednesday', None, False, 'should not be in aspell'),
  174     # Second multi-word, both parts
  175     ('a', 'bar back, abcdef uvwxyz, bar,', None, True, 'should be in aspell'),
  176     ('a', 'abcdef uvwxyz, bar back, ghijkl,', None, False, 'should not be in aspell'),  # noqa: E501
  177     # One multi-word, second part
  178     ('a', 'bar abcdef', None, True, 'should be in aspell'),
  179     ('a', 'abcdef back', None, False, 'should not be in aspell'),
  180 ])
  181 def test_error_checking_in_aspell(err, rep, err_aspell, rep_aspell, match):
  182     """Test that our error checking works with aspell."""
  183     with pytest.raises(AssertionError, match=match):
  184         _check_err_rep(
  185             err, rep, (err_aspell, rep_aspell), 'dummy',
  186             (supported_languages, supported_languages))
  187 
  188 
  189 # allow some duplicates, like "m-i-n-i-m-i-s-e", or "c-a-l-c-u-l-a-t-a-b-l-e"
  190 allowed_dups = {
  191     ('dictionary.txt', 'dictionary_en-GB_to_en-US.txt'),
  192     ('dictionary.txt', 'dictionary_rare.txt'),
  193     ('dictionary.txt', 'dictionary_usage.txt'),
  194     ('dictionary_rare.txt', 'dictionary_usage.txt'),
  195 }
  196 
  197 
  198 @fname_params
  199 @pytest.mark.dependency(name='dictionary loop')
  200 def test_dictionary_looping(fname, in_aspell, in_dictionary):
  201     """Test that all dictionary entries are valid."""
  202     this_err_dict = dict()
  203     short_fname = op.basename(fname)
  204     with open(fname, 'rb') as fid:
  205         for line in fid:
  206             err, rep = line.decode('utf-8').split('->')
  207             err = err.lower()
  208             assert err not in this_err_dict, \
  209                 'error %r already exists in %s' % (err, short_fname)
  210             rep = rep.rstrip('\n')
  211             reps = [r.strip() for r in rep.lower().split(',')]
  212             reps = [r for r in reps if len(r)]
  213             this_err_dict[err] = reps
  214     # 1. check the dict against itself (diagonal)
  215     for err in this_err_dict:
  216         for r in this_err_dict[err]:
  217             assert r not in this_err_dict, \
  218                 ('error %s: correction %s is an error itself in the same '
  219                  'dictionary file %s' % (err, r, short_fname))
  220     pair = (short_fname, short_fname)
  221     assert pair not in global_pairs
  222     global_pairs.add(pair)
  223     for other_fname, other_err_dict in global_err_dicts.items():
  224         # error duplication (eventually maybe we should just merge?)
  225         for err in this_err_dict:
  226             assert err not in other_err_dict, \
  227                 ('error %r in dictionary %s already exists in dictionary '
  228                  '%s' % (err, short_fname, other_fname))
  229         # 2. check corrections in this dict against other dicts (upper)
  230         pair = (short_fname, other_fname)
  231         if pair not in allowed_dups:
  232             for err in this_err_dict:
  233                 assert err not in other_err_dict, \
  234                     ('error %r in dictionary %s already exists in dictionary '
  235                      '%s' % (err, short_fname, other_fname))
  236                 for r in this_err_dict[err]:
  237                     assert r not in other_err_dict, \
  238                         ('error %s: correction %s from dictionary %s is an '
  239                          'error itself in dictionary %s'
  240                          % (err, r, short_fname, other_fname))
  241         assert pair not in global_pairs
  242         global_pairs.add(pair)
  243         # 3. check corrections in other dicts against this dict (lower)
  244         pair = (other_fname, short_fname)
  245         if pair not in allowed_dups:
  246             for err in other_err_dict:
  247                 for r in other_err_dict[err]:
  248                     assert r not in this_err_dict, \
  249                         ('error %s: correction %s from dictionary %s is an '
  250                          'error itself in dictionary %s'
  251                          % (err, r, other_fname, short_fname))
  252         assert pair not in global_pairs
  253         global_pairs.add(pair)
  254     global_err_dicts[short_fname] = this_err_dict
  255 
  256 
  257 @pytest.mark.dependency(depends=['dictionary loop'])
  258 def test_ran_all():
  259     """Test that all pairwise tests ran."""
  260     for f1, _, _ in _fnames_in_aspell:
  261         f1 = op.basename(f1)
  262         for f2, _, _ in _fnames_in_aspell:
  263             f2 = op.basename(f2)
  264             assert (f1, f2) in global_pairs
  265     assert len(global_pairs) == len(_fnames_in_aspell) ** 2