"Fossies" - the Fresh Open Source Software Archive

Member "aspell-0.60.8/modules/speller/default/language.cpp" (8 Oct 2019, 25060 Bytes) of package /linux/misc/aspell-0.60.8.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "language.cpp", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 0.60.7_vs_0.60.8.

    1 // Copyright 2000 by Kevin Atkinson under the terms of the LGPL
    2 
    3 #include "settings.h"
    4 
    5 #include <vector>
    6 #include <assert.h>
    7 
    8 #include <iostream.hpp>
    9 
   10 #include "asc_ctype.hpp"
   11 #include "clone_ptr-t.hpp"
   12 #include "config.hpp"
   13 #include "enumeration.hpp"
   14 #include "errors.hpp"
   15 #include "file_data_util.hpp"
   16 #include "fstream.hpp"
   17 #include "language.hpp"
   18 #include "string.hpp"
   19 #include "cache-t.hpp"
   20 #include "getdata.hpp"
   21 #include "file_util.hpp"
   22 
   23 #ifdef ENABLE_NLS
   24 #  include <langinfo.h>
   25 #endif
   26 
   27 #include "gettext.h"
   28 
   29 namespace aspeller {
   30 
   31   static const char TO_CHAR_TYPE[256] = {
   32     // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F 
   33     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0
   34     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1
   35     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2
   36     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3
   37     0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0, // 4
   38     0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // 5
   39     0, 4, 0, 0, 3, 0, 0, 0, 2, 0, 0, 0, 6, 5, 0, 0, // 6
   40     0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, // 7
   41     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
   42     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
   43     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
   44     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
   45     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // C
   46     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // D
   47     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // E
   48     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // F
   49   };
   50 
   51   static const int FOR_CONFIG = 1;
   52 
   53   static const KeyInfo lang_config_keys[] = {
   54     {"charset",             KeyInfoString, "iso-8859-1", ""}
   55     , {"data-encoding",       KeyInfoString, "<charset>", ""}
   56     , {"name",                KeyInfoString, "", ""}
   57     , {"run-together",        KeyInfoBool,   "", "", 0, FOR_CONFIG}
   58     , {"run-together-limit",  KeyInfoInt,    "", "", 0, FOR_CONFIG}
   59     , {"run-together-min",    KeyInfoInt,    "", "", 0, FOR_CONFIG}
   60     , {"soundslike",          KeyInfoString, "none", ""}
   61     , {"special",             KeyInfoString, "", ""}
   62     , {"ignore-accents" ,     KeyInfoBool, "", "", 0, FOR_CONFIG}
   63     , {"invisible-soundslike",KeyInfoBool, "", "", 0, FOR_CONFIG}
   64     , {"keyboard",            KeyInfoString, "standard", "", 0, FOR_CONFIG} 
   65     , {"affix",               KeyInfoString, "none", ""}
   66     , {"affix-compress",      KeyInfoBool, "false", "", 0, FOR_CONFIG}
   67     , {"partially-expand",    KeyInfoBool, "false", "", 0, FOR_CONFIG}
   68     , {"affix-char",          KeyInfoString, "/", "", 0, FOR_CONFIG}
   69     , {"flag-char",           KeyInfoString, ":", "", 0, FOR_CONFIG}
   70     , {"repl-table",          KeyInfoString, "none", ""}
   71     , {"sug-split-char",      KeyInfoList, "", "", 0, FOR_CONFIG}
   72     , {"store-as",            KeyInfoString, "", ""}
   73     , {"try",                 KeyInfoString, "", ""}
   74     , {"normalize",           KeyInfoBool, "false", "", 0, FOR_CONFIG}
   75     , {"norm-required",       KeyInfoBool, "false", "", 0, FOR_CONFIG}
   76     , {"norm-form",           KeyInfoString, "nfc", "", 0, FOR_CONFIG}
   77   };
   78 
   79   static GlobalCache<Language> language_cache("language");
   80 
   81   PosibErr<void> Language::setup(const String & lang, const Config * config)
   82   {
   83     //
   84     // get_lang_info
   85     //
   86 
   87     String dir1,dir2,path;
   88 
   89     fill_data_dir(config, dir1, dir2);
   90     dir_ = find_file(path,dir1,dir2,lang,".dat");
   91 
   92     lang_config_ = 
   93       new Config("speller-lang",
   94                  lang_config_keys, 
   95                  lang_config_keys + sizeof(lang_config_keys)/sizeof(KeyInfo));
   96     Config & data = *lang_config_;
   97 
   98     {
   99       PosibErrBase pe = data.read_in_file(path);
  100       if (pe.has_err(cant_read_file)) {
  101     String mesg = pe.get_err()->mesg;
  102     mesg[0] = asc_tolower(mesg[0]);
  103     mesg = _("This is probably because: ") + mesg;
  104     return make_err(unknown_language, lang, mesg);
  105       } else if (pe.has_err())
  106     return pe;
  107     }
  108 
  109     if (!data.have("name"))
  110       return make_err(bad_file_format, path, _("The required field \"name\" is missing."));
  111 
  112     String buf;
  113     name_          = data.retrieve("name");
  114     charset_       = fix_encoding_str(data.retrieve("charset"), buf);
  115     charmap_       = charset_;
  116 
  117     ConfigConvKey d_enc = data.retrieve_value("data-encoding");
  118     d_enc.fix_encoding_str();
  119     data_encoding_ = d_enc.val;
  120 
  121     DataPair d;
  122 
  123     //
  124     // read header of cset data file
  125     //
  126   
  127     FStream char_data;
  128     String char_data_name;
  129     find_file(char_data_name,dir1,dir2,charset_,".cset");
  130     RET_ON_ERR(char_data.open(char_data_name, "r"));
  131     
  132     String temp;
  133     char * p;
  134     do {
  135       p = get_nb_line(char_data, temp);
  136       if (*p == '=') {
  137         ++p;
  138         while (asc_isspace(*p)) ++p;
  139         charmap_ = p;
  140       }
  141     } while (*p != '/');
  142 
  143     //
  144     // fill in tables
  145     //
  146 
  147     for (unsigned int i = 0; i != 256; ++i) {
  148       p = get_nb_line(char_data, temp);
  149       if (!p || strtoul(p, &p, 16) != i) 
  150         return make_err(bad_file_format, char_data_name);
  151       to_uni_[i] = strtol(p, &p, 16);
  152       while (asc_isspace(*p)) ++p;
  153       char_type_[i] = static_cast<CharType>(TO_CHAR_TYPE[to_uchar(*p++)]);
  154       while (asc_isspace(*p)) ++p;
  155       ++p; // display, ignored for now
  156       CharInfo inf = char_type_[i] >= Letter ? LETTER : 0;
  157       to_upper_[i] = static_cast<char>(strtol(p, &p, 16));
  158       inf |= to_uchar(to_upper_[i]) == i ? UPPER : 0;
  159       to_lower_[i] = static_cast<char>(strtol(p, &p, 16));
  160       inf |= to_uchar(to_lower_[i]) == i ? LOWER : 0;
  161       to_title_[i] = static_cast<char>(strtol(p, &p, 16));
  162       inf |= to_uchar(to_title_[i]) == i ? TITLE : 0;
  163       to_plain_[i] = static_cast<char>(strtol(p, &p, 16));
  164       inf |= to_uchar(to_plain_[i]) == i ? PLAIN : 0;
  165       inf |= to_uchar(to_plain_[i]) == 0 ? PLAIN : 0;
  166       sl_first_[i] = static_cast<char>(strtol(p, &p, 16));
  167       sl_rest_[i]  = static_cast<char>(strtol(p, &p, 16));
  168       char_info_[i] = inf;
  169     }
  170 
  171     for (unsigned int i = 0; i != 256; ++i) {
  172       de_accent_[i] = to_plain_[i] == 0 ? to_uchar(i) : to_plain_[i];
  173     }
  174 
  175     to_plain_[0] = 0x10; // to make things slightly easier
  176     to_plain_[1] = 0x10;
  177 
  178     for (unsigned int i = 0; i != 256; ++i) {
  179       to_stripped_[i] = to_plain_[(unsigned char)to_lower_[i]];
  180     }
  181     
  182     char_data.close();
  183 
  184     if (data.have("store-as"))
  185       buf = data.retrieve("store-as");
  186     else if (data.retrieve_bool("affix-compress"))
  187       buf = "lower";
  188     else
  189       buf = "stripped";
  190     char * clean_is;
  191     if (buf == "stripped") {
  192       store_as_ = Stripped;
  193       clean_is = to_stripped_;
  194     } else {
  195       store_as_ = Lower;
  196       clean_is = to_lower_;
  197     }
  198 
  199     for (unsigned i = 0; i != 256; ++i) {
  200       to_clean_[i] = char_type_[i] > NonLetter ? clean_is[i] : 0;
  201       if ((unsigned char)to_clean_[i] == i) char_info_[i] |= CLEAN;
  202     }
  203 
  204     to_clean_[0x00] = 0x10; // to make things slightly easier
  205     to_clean_[0x10] = 0x10;
  206 
  207     clean_chars_   = get_clean_chars(*this);
  208 
  209     //
  210     // determine which mapping to use
  211     //
  212 
  213     if (charmap_ != charset_) {
  214       if (file_exists(dir1 + charset_ + ".cmap") || 
  215           file_exists(dir2 + charset_ + ".cmap"))
  216       {
  217         charmap_ = charset_;
  218       } else if (data_encoding_ == charset_) {
  219         data_encoding_ = charmap_;
  220       }
  221     }
  222       
  223     //
  224     // set up conversions
  225     //
  226     {
  227 #ifdef ENABLE_NLS
  228       const char * tmp = 0;
  229       tmp = bind_textdomain_codeset("aspell", 0);
  230 #ifdef HAVE_LANGINFO_CODESET
  231       if (!tmp) tmp = nl_langinfo(CODESET);
  232 #endif
  233       if (ascii_encoding(*config, tmp)) tmp = 0;
  234       if (tmp)
  235         RET_ON_ERR(mesg_conv_.setup(*config, charmap_, fix_encoding_str(tmp, buf), NormTo));
  236       else 
  237 #endif
  238         RET_ON_ERR(mesg_conv_.setup(*config, charmap_, data_encoding_, NormTo));
  239       // no need to check for errors here since we know charmap_ is a
  240       // supported encoding
  241       RET_ON_ERR(to_utf8_.setup(*config, charmap_, "utf-8", NormTo));
  242       RET_ON_ERR(from_utf8_.setup(*config, "utf-8", charmap_, NormFrom));
  243     }
  244     
  245     Conv iconv;
  246     RET_ON_ERR(iconv.setup(*config, data_encoding_, charmap_, NormFrom));
  247 
  248     //
  249     // set up special
  250     //
  251 
  252     init(data.retrieve("special"), d, buf);
  253     while (split(d)) {
  254       char c = iconv(d.key)[0];
  255       split(d);
  256       special_[to_uchar(c)] = 
  257         SpecialChar (d.key[0] == '*',d.key[1] == '*', d.key[2] == '*');
  258     }
  259 
  260     //
  261     // prep phonetic code
  262     //
  263 
  264     {
  265       PosibErr<Soundslike *> pe = new_soundslike(data.retrieve("soundslike"),
  266                                                  iconv,
  267                                                this);
  268       if (pe.has_err()) return pe;
  269       soundslike_.reset(pe.data);
  270     }
  271     soundslike_chars_ = soundslike_->soundslike_chars();
  272 
  273     have_soundslike_ = strcmp(soundslike_->name(), "none") != 0;
  274 
  275     //
  276     // prep affix code
  277     //
  278     {
  279       PosibErr<AffixMgr *> pe = new_affix_mgr(data.retrieve("affix"), iconv, this);
  280       if (pe.has_err()) return pe;
  281       affix_.reset(pe.data);
  282     }
  283 
  284     //
  285     // fill repl tables (if any)
  286     //
  287 
  288     String repl = data.retrieve("repl-table");
  289     have_repl_ = false;
  290     if (repl != "none") {
  291 
  292       String repl_file;
  293       FStream REPL;
  294       find_file(repl_file, dir1, dir2, repl, "_repl", ".dat");
  295       RET_ON_ERR(REPL.open(repl_file, "r"));
  296       
  297       size_t num_repl = 0;
  298       while (getdata_pair(REPL, d, buf)) {
  299         ::to_lower(d.key);
  300         if (d.key == "rep") {
  301           num_repl = atoi(d.value); // FIXME make this more robust
  302           break;
  303         }
  304       }
  305 
  306       if (num_repl > 0)
  307         have_repl_ = true;
  308 
  309       for (size_t i = 0; i != num_repl; ++i) {
  310         bool res = getdata_pair(REPL, d, buf);
  311         assert(res); // FIXME
  312         ::to_lower(d.key);
  313         assert(d.key == "rep"); // FIXME
  314         split(d);
  315         SuggestRepl rep;
  316         rep.substr = buf_.dup(iconv(d.key));
  317         if (check_if_valid(*this, rep.substr).get_err()) 
  318           continue; // FIXME: This should probably be an error, but
  319                     // this may cause problems with compatibility with
  320                     // Myspell as these entries may make sense for
  321                     // Myspell (but obviously not for Aspell)
  322         to_clean((char *)rep.substr, rep.substr);
  323         rep.repl   = buf_.dup(iconv(d.value));
  324         if (check_if_valid(*this, rep.repl).get_err()) 
  325           continue; // FIXME: Ditto
  326         to_clean((char *)rep.repl, rep.repl);
  327         if (strcmp(rep.substr, rep.repl) == 0 || rep.substr[0] == '\0')
  328           continue; // FIXME: Ditto
  329         repls_.push_back(rep);
  330       }
  331 
  332     }
  333     return no_err;
  334   }
  335 
  336   PosibErr<void> Language::set_lang_defaults(Config & config) const
  337   {
  338     config.replace_internal("actual-lang", name());
  339     RET_ON_ERR(config.lang_config_merge(*lang_config_, FOR_CONFIG, data_encoding_));
  340     return no_err;
  341   }
  342 
  343   WordInfo Language::get_word_info(ParmStr str) const
  344   {
  345     CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
  346     const char * p = str;
  347     while (*p && (first = char_info(*p++), all &= first, !(first & LETTER)));
  348     while (*p) all &= char_info(*p++);
  349     WordInfo res;
  350     if      (all & LOWER)   res = AllLower;
  351     else if (all & UPPER)   res = AllUpper;
  352     else if (first & TITLE) res = FirstUpper;
  353     else                    res = Other;
  354     if (all & PLAIN)  res |= ALL_PLAIN;
  355     if (all & CLEAN)  res |= ALL_CLEAN;
  356     return res;
  357   }
  358   
  359   CasePattern Language::case_pattern(ParmStr str) const  
  360   {
  361     CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
  362     const char * p = str;
  363     while (*p && (first = char_info(*p++), all &= first, !(first & LETTER)));
  364     while (*p) all &= char_info(*p++);
  365     if      (all & LOWER)   return AllLower;
  366     else if (all & UPPER)   return AllUpper;
  367     else if (first & TITLE) return FirstUpper;
  368     else                    return Other;
  369   }
  370 
  371   CasePattern Language::case_pattern(const char * str, unsigned size) const  
  372   {
  373     CharInfo first = CHAR_INFO_ALL, all = CHAR_INFO_ALL;
  374     const char * p = str;
  375     const char * end = p + size;
  376     while (p < end && (first = char_info(*p++), all &= first, !(first & LETTER)));
  377     while (p < end) all &= char_info(*p++);
  378     if      (all & LOWER)   return AllLower;
  379     else if (all & UPPER)   return AllUpper;
  380     else if (first & TITLE) return FirstUpper;
  381     else                    return Other;
  382   }
  383   
  384   void Language::fix_case(CasePattern case_pattern,
  385                           char * res, const char * str) const 
  386   {
  387     if (!str[0]) return;
  388     if (case_pattern == AllUpper) {
  389       to_upper(res,str);
  390     } if (case_pattern == FirstUpper && is_lower(str[0])) {
  391       *res = to_title(str[0]);
  392       if (res == str) return;
  393       res++;
  394       str++;
  395       while (*str) *res++ = *str++;
  396       *res = '\0';
  397     } else {
  398       if (res == str) return;
  399       while (*str) *res++ = *str++;
  400       *res = '\0';
  401     }
  402   }
  403 
  404   const char * Language::fix_case(CasePattern case_pattern, const char * str,
  405                                   String & buf) const 
  406   {
  407     if (!str[0]) return str;
  408     if (case_pattern == AllUpper) {
  409       to_upper(buf,str);
  410       return buf.str();
  411     } if (case_pattern == FirstUpper && is_lower(str[0])) {
  412       buf.clear();
  413       buf += to_title(str[0]);
  414       str++;
  415       while (*str) buf += *str++;
  416       return buf.str();
  417     } else {
  418       return str;
  419     }
  420   }
  421 
  422   WordAff * Language::fake_expand(ParmStr word, ParmStr aff, 
  423                                   ObjStack & buf) const 
  424   {
  425     WordAff * cur = (WordAff *)buf.alloc_bottom(sizeof(WordAff));
  426     cur->word = buf.dup(word);
  427     cur->aff = (unsigned char *)buf.dup("");
  428     cur->next = 0;
  429     return cur;
  430   }
  431 
  432   CompoundWord Language::split_word(const char * word, unsigned len,
  433                                     bool camel_case) const
  434   {
  435     if (!camel_case || len <= 1)
  436       return CompoundWord(word, word + len);
  437     // len >= 2
  438     if (is_upper(word[0])) {
  439       if (is_lower(word[1])) {
  440         unsigned i = 2;
  441         while (i < len && is_lower(word[i]))
  442           ++i;
  443         return CompoundWord(word, word + i, word + len);
  444       }
  445       if (is_upper(word[1])) {
  446         unsigned i = 2;
  447         while (i < len && is_upper(word[i]))
  448           ++i;
  449         if (i == len)
  450           return CompoundWord(word, word + len);
  451         // The first upper case letter is assumed to be part of the next word
  452         return CompoundWord(word, word + i - 1, word + len);
  453       }
  454     } else if (is_lower(word[0])) {
  455       unsigned i = 1;
  456       while (i < len && is_lower(word[i]))
  457         ++i;
  458       return CompoundWord(word, word + i, word + len);
  459     }
  460     // this should't happen but just in case...
  461     return CompoundWord(word, word + len);
  462   }
  463   
  464   bool SensitiveCompare::operator() (const char * word0, 
  465                      const char * inlist0) const
  466   {
  467     assert(*word0 && *inlist0);
  468   try_again:
  469     const char * word = word0;
  470     const char * inlist = inlist0;
  471 
  472     if (!case_insensitive) {
  473       
  474       if (begin) {
  475         if (*word == *inlist || *word == lang->to_title(*inlist)) ++word, ++inlist;
  476         else                                                      goto try_upper;
  477       }
  478       while (*word && *inlist && *word == *inlist) ++word, ++inlist;
  479       if (*inlist) goto try_upper;
  480       if (end && lang->special(*word).end) ++word;
  481       if (*word) goto try_upper;
  482       return true;
  483     try_upper:
  484       word = word0;
  485       inlist = inlist0;
  486       while (*word && *inlist && *word == lang->to_upper(*inlist)) ++word, ++inlist;
  487       if (*inlist) goto fail;
  488       if (end && lang->special(*word).end) ++word;
  489       if (*word) goto fail;
  490       
  491     } else { // case_insensitive
  492       
  493       while (*word && *inlist && 
  494              lang->to_upper(*word) == lang->to_upper(*inlist)) ++word, ++inlist;
  495       if (*inlist) goto fail;
  496       if (end && lang->special(*word).end) ++word;
  497       if (*word) goto fail;
  498       
  499     }
  500     return true;
  501 
  502   fail:
  503     if (begin && lang->special(*word0).begin) {++word0; goto try_again;}
  504     return false;
  505   }
  506 
  507   static PosibErrBase invalid_word_e(const Language & l,
  508                                      ParmStr word,
  509                                      const char * msg,
  510                                      char chr = 0)
  511   {
  512     char m[200];
  513     if (chr) {
  514       // the "char *" cast is needed due to an incorrect "snprintf"
  515       //   declaration on some platforms.
  516       snprintf(m, 200, (char *)msg, MsgConv(l)(chr), l.to_uni(chr));
  517       msg = m;
  518     }
  519     return make_err(invalid_word, MsgConv(l)(word), msg);
  520   }
  521 
  522   PosibErr<void> check_if_sane(const Language & l, ParmStr word) {
  523     if (*word == '\0') 
  524       return invalid_word_e(l, word, _("Empty string."));
  525     return no_err;
  526   }
  527 
  528   PosibErr<void> check_if_valid(const Language & l, ParmStr word) {
  529     RET_ON_ERR(check_if_sane(l, word));
  530     const char * i = word;
  531     if (!l.is_alpha(*i)) {
  532       if (!l.special(*i).begin)
  533         return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the beginning of a word."), *i);
  534       else if (!l.is_alpha(*(i+1)))
  535         return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
  536       else if (!*(i+1))
  537         return invalid_word_e(l, word, _("Does not contain any alphabetic characters."));
  538     }
  539     for (;*(i+1) != '\0'; ++i) { 
  540       if (!l.is_alpha(*i)) {
  541         if (!l.special(*i).middle)
  542           return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear in the middle of a word."), *i);
  543         else if (!l.is_alpha(*(i+1)))
  544           return invalid_word_e(l, word, _("The character '%s' (U+%02X) must be followed by an alphabetic character."), *i);
  545       }
  546     }
  547     if (!l.is_alpha(*i)) {
  548       if (*i == '\r')
  549         return invalid_word_e(l, word, _("The character '\\r' (U+0D) may not appear at the end of a word. " 
  550                                          "This probably means means that the file is using MS-DOS EOL instead of Unix EOL."), *i);
  551       if (!l.special(*i).end)
  552         return invalid_word_e(l, word, _("The character '%s' (U+%02X) may not appear at the end of a word."), *i);
  553     }
  554     return no_err;
  555   }
  556 
  557   PosibErr<void> validate_affix(const Language & l, ParmStr word, ParmStr aff)
  558   {
  559     for (const char * a = aff; *a; ++a) {
  560       CheckAffixRes res = l.affix()->check_affix(word, *a);
  561       if (res == InvalidAffix)
  562         return make_err(invalid_affix, MsgConv(l)(*a), MsgConv(l)(word));
  563       else if (res == InapplicableAffix)
  564         return make_err(inapplicable_affix, MsgConv(l)(*a), MsgConv(l)(word));
  565     }
  566     return no_err;
  567   }
  568 
  569   CleanAffix::CleanAffix(const Language * lang0, OStream * log0)
  570     : lang(lang0), log(log0), msgconv1(lang0), msgconv2(lang0)
  571   {
  572   }
  573   
  574   char * CleanAffix::operator()(ParmStr word, char * aff)
  575   {
  576     char * r = aff;
  577     for (const char * a = aff; *a; ++a) {
  578       CheckAffixRes res = lang->affix()->check_affix(word, *a);
  579       if (res == ValidAffix) {
  580         *r = *a;
  581         ++r;
  582       } else if (log) {
  583         const char * msg = res == InvalidAffix 
  584           ? _("Warning: Removing invalid affix '%s' from word %s.\n")
  585           : _("Warning: Removing inapplicable affix '%s' from word %s.\n");
  586         log->printf(msg, msgconv1(*a), msgconv2(word));
  587       }
  588     }
  589     *r = '\0';
  590     return r;
  591   }
  592 
  593   String get_stripped_chars(const Language & lang) {
  594     bool chars_set[256] = {0};
  595     String     chars_list;
  596     for (int i = 0; i != 256; ++i) 
  597     {
  598       char c = static_cast<char>(i);
  599     if (lang.is_alpha(c) || lang.special(c).any)
  600       chars_set[static_cast<unsigned char>(lang.to_stripped(c))] = true;
  601     }
  602     for (int i = 1; i != 256; ++i) 
  603     {
  604       if (chars_set[i]) 
  605     chars_list += static_cast<char>(i);
  606     }
  607     return chars_list;
  608   }
  609 
  610   String get_clean_chars(const Language & lang) {
  611     bool chars_set[256] = {0};
  612     String     chars_list;
  613     for (int i = 0; i != 256; ++i) 
  614     {
  615       char c = static_cast<char>(i);
  616       if (lang.is_alpha(c) || lang.special(c).any) 
  617         chars_set[static_cast<unsigned char>(lang.to_clean(c))] = true;
  618     }
  619     for (int i = 1; i != 256; ++i) 
  620     {
  621       if (chars_set[i]) {
  622     chars_list += static_cast<char>(i);
  623       }
  624     }
  625     return chars_list;
  626   }
  627 
  628   PosibErr<Language *> new_language(const Config & config, ParmStr lang)
  629   {
  630     if (!lang)
  631       return get_cache_data(&language_cache, &config, config.retrieve("lang"));
  632     else
  633       return get_cache_data(&language_cache, &config, lang);
  634   }
  635 
  636   PosibErr<void> open_affix_file(const Config & c, FStream & f)
  637   {
  638     String lang = c.retrieve("lang");
  639 
  640     String dir1,dir2,path;
  641     fill_data_dir(&c, dir1, dir2);
  642     String dir = find_file(path,dir1,dir2,lang,".dat");
  643 
  644     String file;
  645     file += dir;
  646     file += '/';
  647     file += lang;
  648     file += "_affix.dat";
  649     
  650     RET_ON_ERR(f.open(file,"r"));
  651 
  652     return no_err;
  653   }
  654 
  655   bool find_language(Config & c)
  656   {
  657     String l_data = c.retrieve("lang");
  658     char * l = l_data.mstr();
  659 
  660     String dir1,dir2,path;
  661     fill_data_dir(&c, dir1, dir2);
  662 
  663     char * s = l + strlen(l);
  664 
  665     while (s > l) {
  666       find_file(path,dir1,dir2,l,".dat");
  667       if (file_exists(path)) {
  668         c.replace_internal("actual-lang", l);
  669         return true;
  670       }
  671       while (s > l && !(*s == '-' || *s == '_')) --s;
  672       *s = '\0';
  673     }
  674     return false;
  675   }
  676 
  677   WordListIterator::WordListIterator(StringEnumeration * in0,
  678                                    const Language * lang0,
  679                                    OStream * log0)
  680     : in(in0), lang(lang0), log(log0), val(), str(0), str_end(0),
  681       clean_affix(lang0, log0) {}
  682 
  683   PosibErr<void>  WordListIterator::init(Config & config)
  684   {
  685     if (!config.have("norm-strict"))
  686       config.replace("norm-strict", "true");
  687     have_affix = lang->have_affix();
  688     validate_words = config.retrieve_bool("validate-words");
  689     validate_affixes = config.retrieve_bool("validate-affixes");
  690     clean_words = config.retrieve_bool("clean-words");
  691     skip_invalid_words = config.retrieve_bool("skip-invalid-words");
  692     clean_affixes = config.retrieve_bool("clean-affixes");
  693     if (config.have("encoding")) {
  694       ConfigConvKey enc = config.retrieve_value("encoding");
  695       RET_ON_ERR(iconv.setup(config, enc, lang->charmap(),NormFrom));
  696     } else {
  697       RET_ON_ERR(iconv.setup(config, lang->data_encoding(), lang->charmap(), NormFrom));
  698     }
  699     return no_err;
  700   }
  701 
  702   PosibErr<void> WordListIterator::init_plain(Config & config)
  703   {
  704     if (!config.have("norm-strict"))
  705       config.replace("norm-strict", "true");
  706     have_affix = false;
  707     validate_words = config.retrieve_bool("validate-words");
  708     clean_words = true;
  709     if (config.have("clean-words"))
  710       clean_words = config.retrieve_bool("clean-words");
  711     skip_invalid_words = true;
  712     RET_ON_ERR(iconv.setup(config, "utf-8", lang->charmap(),NormFrom));
  713     return no_err;
  714   }
  715  
  716   PosibErr<bool> WordListIterator::adv() 
  717   {
  718   loop:
  719     if (!str) {
  720       orig = in->next();
  721       if (!orig) return false;
  722       if (!*orig) goto loop;
  723       PosibErr<const char *> pe = iconv(orig);
  724       if (pe.has_err()) {
  725         if (!skip_invalid_words) return pe;
  726         if (log) log->printf(_("Warning: %s Skipping string.\n"), pe.get_err()->mesg);
  727         else pe.ignore_err();
  728         goto loop;
  729       }
  730       if (pe.data == orig) {
  731         data = orig;
  732         data.ensure_null_end();
  733         str = data.pbegin();
  734         str_end = data.pend();
  735       } else {
  736         str = iconv.buf.pbegin();
  737         str_end = iconv.buf.pend();
  738       }
  739       char * aff = str_end;
  740       char * aff_end = str_end;
  741       if (have_affix) {
  742         aff = strchr(str, '/');
  743         if (aff == 0) {
  744           aff = str_end;
  745         } else {
  746           *aff = '\0';
  747           str_end = aff;
  748           ++aff;
  749         }
  750         if (validate_affixes) {
  751           if (clean_affixes)
  752             aff_end = clean_affix(str, aff);
  753           else
  754             RET_ON_ERR(validate_affix(*lang, str, aff));
  755         }
  756       }
  757       val.aff.str = aff;
  758       val.aff.size = aff_end - aff;
  759       if (!*aff && validate_words && clean_words) {
  760         char * s = str;
  761         while (s < str_end && !lang->is_alpha(*s) && !lang->special(*s).begin)
  762           *s++ = '\0';
  763         char * s2 = str_end - 1;
  764         while (s2 >= str && *s2 && !lang->is_alpha(*s2) && !lang->special(*s2).end)
  765           *s2-- = '\0';
  766       }
  767     }
  768     while (str < str_end) 
  769     {
  770       if (!*str) {++str; continue;}
  771 
  772       PosibErrBase pe2 = validate_words ? check_if_valid(*lang, str) : no_err;
  773 
  774       val.word.str = str;
  775       val.word.size = strlen(str);
  776       str += val.word.size + 1;
  777 
  778       if (!pe2.has_err() && val.word.size + (*val.aff ? val.aff.size + 1 : 0) > 240)
  779         pe2 = make_err(invalid_word, MsgConv(lang)(val.word),
  780                        _("The total length is larger than 240 characters."));
  781 
  782       if (!pe2.has_err()) return true;
  783       if (!skip_invalid_words) return pe2;
  784       if (log) log->printf(_("Warning: %s Skipping word.\n"), pe2.get_err()->mesg);
  785       else pe2.ignore_err();
  786     } 
  787     str = 0;
  788     goto loop;
  789   }
  790 }