"Fossies" - the Fresh Open Source Software Archive

Member "recode-3.7.12/tables.py" (15 Feb 2022, 48406 Bytes) of package /linux/misc/recode-3.7.12.tar.gz:


As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) Python source code syntax highlighting (style: standard) with prefixed line numbers. Alternatively you can here view or download the uninterpreted source code file. For more information about "tables.py" see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 3.7.11_vs_3.7.12.

    1 #!/usr/bin/python3
    2 # -*- coding: utf-8 -*-
    3 # Automatically derive Recode table files from various sources.
    4 # Copyright © 1993, 1994, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
    5 # François Pinard <pinard@iro.umontreal.ca>, 1993.
    6 
    7 # This program is free software; you can redistribute it and/or modify
    8 # it under the terms of the GNU General Public License as published by
    9 # the Free Software Foundation; either version 3, or (at your option)
   10 # any later version.
   11 
   12 # This program is distributed in the hope that it will be useful,
   13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
   14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   15 # GNU General Public License for more details.
   16 
   17 # You should have received a copy of the GNU General Public License
   18 # along with this program; if not, see <https://www.gnu.org/licenses/>.
   19 
   20 """\
   21 `tables.py' derives Recode table files from various sources.
   22 
   23 Usage: python tables.py [OPTION]... DATA-FILE...
   24 
   25 Output selection:
   26   -e   Produce C source file for explode data (explode.c)
   27   -i   Produce C source file for iconv charsets (iconvdecl.h)
   28   -m   Produce C inclusion file for short RFC 1345 mnemonics (rfc1345.h)
   29   -n   Produce C inclusion file for character names (charname.h)
   30   -p   Produce C source files for strip data (strip-pool.c and strip-data.c)
   31   -t   Produce Texinfo inclusion file for RFC 1345 (rfc1345.texi)
   32 
   33 Modality options:
   34   -C DIRECTORY   Change to DIRECTORY prior to processing
   35   -F             Produce French versions for -n, -s or -t
   36   -v             Increase verbosity
   37 
   38 DATA-FILEs may be rfc1345.txt, mnemonic[.,]ds, Unicode maps, or .def files
   39 from Keld's chset* packages.  The digesting order is usually important.
   40 When `-F' and `-n' are used, process Alain's tables.
   41 """
   42 
   43 import re, sys, io
   44 
# Character constants (special UCS-2 code point values).
# U+FFFD, named REPLACEMENT CHARACTER here.
REPLACEMENT_CHARACTER = 0xFFFD
# U+FFFF, named NOT A CHARACTER here.
NOT_A_CHARACTER = 0xFFFF
   48 
   49 # Main driver.
   50 
   51 class Main:
   52     directory = None
   53     charnames = None
   54     explodes = None
   55     iconv = None
   56     mnemonics = None
   57     strips = None
   58     verbose = False
   59 
   60     def main(self, *arguments):
   61         if not arguments:
   62             sys.stdout.write(__doc__)
   63             return
   64         import getopt
   65         French_option = False
   66         options, arguments = getopt.getopt(arguments, 'C:Feimnptv')
   67         for option, value in options:
   68             if option == '-C':
   69                 self.directory = value
   70             elif option == '-F':
   71                 French_option = True
   72             elif option == '-e':
   73                 if not self.explodes:
   74                     self.explodes = Explodes()
   75                 self.explodes.do_sources = True
   76             elif option == '-i':
   77                 if not self.iconv:
   78                     self.iconv = Iconv()
   79                 self.iconv.do_sources = True
   80             elif option == '-m':
   81                 if not self.mnemonics:
   82                     self.mnemonics = Mnemonics()
   83                 self.mnemonics.do_sources = True
   84             elif option == '-n':
   85                 if not self.charnames:
   86                     self.charnames = Charnames()
   87                 self.charnames.do_sources = True
   88             elif option == '-p':
   89                 if not self.strips:
   90                     self.strips = Strips()
   91                 self.strips.do_sources = True
   92             elif option == '-t':
   93                 if not self.strips:
   94                     self.strips = Strips()
   95                 self.strips.do_texinfo = True
   96             elif option == '-v':
   97                 self.verbose = True
   98 
   99         # Read all data tables.
  100         if self.directory:
  101             import os
  102             os.chdir(self.directory)
  103         if self.iconv:
  104             self.iconv.digest()
  105         for name in arguments:
  106             input = Input(name)
  107             while True:
  108                 line = input.readline()
  109                 if not line:
  110                     break
  111                 if line[0] == '\n':
  112                     continue
  113                 if line[:2] == '/*':
  114                     while line.find('*/') < 0:
  115                         line = input.readline()
  116                     continue
  117                 if input.begins('#    Name:'):
  118                     if not self.strips:
  119                         self.strips = Strips()
  120                     self.strips.digest_unimap(input)
  121                     break
  122                 if line[0] == '#':
  123                     continue
  124                 if input.begins('escape_char'):
  125                     if not self.mnemonics:
  126                         self.mnemonics = Mnemonics()
  127                     self.mnemonics.digest_mnemonics_ds(input)
  128                     break
  129                 if input.match('Network Working Group +K\. Simonsen$'):
  130                     if (self.charnames
  131                             and self.charnames.do_sources
  132                             and not French_option):
  133                         while not input.begins(
  134                             '   3rd field is the long descriptive'):
  135                             line = input.readline()
  136                         if not self.mnemonics:
  137                             self.mnemonics = Mnemonics()
  138                         self.mnemonics.digest_rfc1345(input)
  139                     if self.explodes or self.strips:
  140                         while line != '5.  CHARSET TABLES\n':
  141                             line = input.readline()
  142                         if not self.strips:
  143                             self.strips = Strips()
  144                         self.strips.digest_rfc1345(input)
  145                     break
  146                 if input.begins('@@\t'):
  147                     if self.charnames.do_sources and French_option:
  148                         self.charnames.digest_french(input)
  149                     break
  150                 if line == '&referenceset\n':
  151                     while line != '\n':
  152                         line = input.readline()
  153                     if not self.strips:
  154                         self.strips = Strips()
  155                     if not self.mnemonics:
  156                         self.mnemonics = Mnemonics()
  157                     self.strips.digest_rfc1345(input)
  158                     break
  159                 if line in ('   Repertoire according to ISO/IEC 10646-1:1993\n',
  160                             '   Control characters\n',
  161                             '   Private use\n'):
  162                     while line not in ('   Plane 000\n',
  163                                        '   plane 000\n'):
  164                         line = input.readline()
  165                     if not self.mnemonics:
  166                         self.mnemonics = Mnemonics()
  167                     self.mnemonics.digest_iso10646_def(input)
  168                     break
  169                 input.die("Data file with unknown contents")
  170         for instance in (self.explodes,
  171                          self.strips,
  172                          self.charnames,
  173                          self.iconv,
  174                          self.mnemonics):
  175             if instance:
  176                 instance.complete(French_option)
  177 
# Single shared driver instance; other classes in this file read
# `run.verbose' and `run.charnames' through it.
run = Main()
# Bound-method alias for the entry point.
main = run.main
  180 
  181 class Options:
  182 
  183     def __init__(self):
  184         self.do_sources = False
  185         self.do_texinfo = False
  186 
  187 # Charnames.
  188 
  189 class Charnames(Options):
  190     SOURCES = 'charname.h'
  191 
  192     # Name of character, given its numerical value.
  193     charname_map = {}
  194 
  195     # Maximum printable length of a character name.
  196     max_length = 0
  197 
  198     # Frequency of each word, then its crypt code.
  199     code_map = {}
  200 
  201     def digest_french(self, input):
  202         self.preset_french()
  203         fold_table = list(range(256))
  204         for before, after in zip(
  205                 u'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÂÇÈÉÊÎÏÑÔÖÛ'.encode('ISO-8859-1'),
  206                 u'abcdefghijklmnopqrstuvwxyzàâçèéêîïñôöû'.encode('ISO-8859-1')):
  207             fold_table[before] = after
  208         folding = ''.join(map(chr, fold_table))
  209         ignorables = (
  210                 u'<commande>'.encode('ISO-8859-1'),
  211                 u'<réservé>'.encode('ISO-8859-1'),
  212                 u'<pas un caractère>'.encode('ISO-8859-1'))
  213         while True:
  214             line = input.readline()
  215             if not line:
  216                 break
  217             if input.begins('@@\t'):
  218                 continue
  219             # Pour éliminer la fin de ligne.
  220             line = line.rstrip()
  221             input.line = line
  222             match = input.match('([0-9A-F]{4})\t([^(]+)( \\(.*\\))?( \\*)?$')
  223             if match:
  224                 ucs = int(match.group(1), 16)
  225                 text = match.group(2).translate(folding)
  226                 if text in ignorables:
  227                     continue
  228                 self.declare(ucs, re.sub(r' +\*$', '', text, 1))
  229             else:
  230                 input.warn("Unrecognised line")
  231 
  232     def preset_french(self):
  233         self.max_length = 0
  234         ucs = 0x0000
  235         for text in (
  236             u"nul (nul)",                                        # 0000
  237             u"début d'en-tête (soh)",                            # 0001
  238             u"début de texte (stx)",                             # 0002
  239             u"fin de texte (etx)",                               # 0003
  240             u"fin de transmission (eot)",                        # 0004
  241             u"demande (enq)",                                    # 0005
  242             u"accusé de réception positif (ack)",                # 0006
  243             u"sonnerie (bel)",                                   # 0007
  244             u"espace arrière (bs)",                              # 0008
  245             u"tabulation horizontale (ht)",                      # 0009
  246             u"interligne (lf)",                                  # 000A
  247             u"tabulation verticale (vt)",                        # 000B
  248             u"page suivante (ff)",                               # 000C
  249             u"retour de chariot (cr)",                           # 000D
  250             u"hors code (so)",                                   # 000E
  251             u"en code (si)",                                     # 000F
  252             u"échappement transmission (dle)",                   # 0010
  253             u"commande d'appareil un (dc1)",                     # 0011
  254             u"commande d'appareil deux (dc2)",                   # 0012
  255             u"commande d'appareil trois (dc3)",                  # 0013
  256             u"commande d'appareil quatre (dc4)",                 # 0014
  257             u"accusé de réception négatif (nak)",                # 0015
  258             u"synchronisation (syn)",                            # 0016
  259             u"fin de transmission de bloc (etb)",                # 0017
  260             u"annulation (can)",                                 # 0018
  261             u"fin de support (em)",                              # 0019
  262             u"caractère de substitution (sub)",                  # 001A
  263             u"échappement (esc)",                                # 001B
  264             u"séparateur de fichier (fs)",                       # 001C
  265             u"séparateur de groupe (gs)",                        # 001D
  266             u"séparateur d'article (rs)",                        # 001E
  267             u"séparateur de sous-article (us)",                  # 001F
  268             ):
  269             self.declare(ucs, text)
  270             ucs += 1
  271         ucs = 0x007F
  272         for text in (
  273             u"suppression (del)",                                # 007F
  274             u"caractère de bourre (pad)",                        # 0080
  275             u"octet supérieur prédéfini (hop)",                  # 0081
  276             u"arrêt permis ici (bph)",                           # 0082
  277             u"aucun arrêt ici (nbh)",                            # 0083
  278             u"index (ind)",                                      # 0084
  279             u"à la ligne (nel)",                                 # 0085
  280             u"début de zone sélectionnée (ssa)",                 # 0086
  281             u"fin de zone sélectionnée (esa)",                   # 0087
  282             u"arrêt de tabulateur horizontal (hts)",             # 0088
  283             u"tabulateur horizontal avec justification (htj)",   # 0089
  284             u"arrêt de tabulateur vertical (vts)",               # 008A
  285             u"interligne partiel vers <= bas (pld)",             # 008B
  286             u"interligne partiel vers <= haut (plu)",            # 008C
  287             u"index inversé (ri)",                               # 008D
  288             u"remplacement unique deux (ss2)",                   # 008E
  289             u"remplacement unique trois (ss3)",                  # 008F
  290             u"chaîne de commande d'appareil (dcs)",              # 0090
  291             u"usage privé un (pu1)",                             # 0091
  292             u"usage privé deux (pu2)",                           # 0092
  293             u"mise en mode transmission (sts)",                  # 0093
  294             u"annulation du caractère précédent (cch)",          # 0094
  295             u"message en attente (mw)",                          # 0095
  296             u"début de zone protégée (sga)",                     # 0096
  297             u"fin de zone protégée (ega)",                       # 0097
  298             u"début de chaîne (sos)",                            # 0098
  299             u"introducteur de caractère graphique unique (sgci)",# 0099
  300             u"introducteur de caractère unique (sci)",           # 009A
  301             u"introducteur de séquence de commande (csi)",       # 009B
  302             u"fin de chaîne (st)",                               # 009C
  303             u"commande de système d'exploitation (osc)",         # 009D
  304             u"message privé (pm)",                               # 009E
  305             u"commande de progiciel (apc)",                      # 009F
  306             ):
  307             self.declare(ucs, text)
  308             ucs += 1
  309 
  310     def declare(self, ucs, text):
  311         self.charname_map[ucs] = text
  312         if len(text) > self.max_length:
  313             self.max_length = len(text)
  314         for word in text.split():
  315             self.code_map[word] = self.code_map.get(word, 0) + 1
  316 
  317     def presort_word(self, word):
  318         return -self.code_map[word], word
  319 
  320     # Write a compressed list of character names.
  321     def complete(self, french):
  322         if not self.do_sources:
  323             return
  324         if french:
  325             write = Output('fr-%s' % self.SOURCES).write
  326         else:
  327             write = Output(self.SOURCES).write
  328         # Establish a mild compression scheme.  Words word[:singles]
  329         # will be represented by a single byte running from 1 to
  330         # singles.  All remaining words will be represented by two
  331         # bytes, the first one running slowly from singles+1 to 255,
  332         # the second cycling faster from 1 to 255.
  333         if run.verbose:
  334             sys.stdout.write('  sorting words...')
  335         pairs = list(map(self.presort_word, self.code_map.keys()))
  336         pairs.sort()
  337         words = list(map(lambda pair: pair[1], pairs))
  338         pairs = None
  339         if run.verbose:
  340             sys.stdout.write(' %d of them\n' % len(words))
  341         count = len(words)
  342         singles = (255 * 255 - count) // 254
  343         # Transmit a few values for further usage by the C code.
  344         if run.verbose:
  345             sys.stdout.write('  sorting names...')
  346         ucs2_table = list(self.charname_map.keys())
  347         ucs2_table.sort()
  348         if run.verbose:
  349             sys.stdout.write(' %d of them\n' % len(ucs2_table))
  350         write('\n'
  351               '#define NUMBER_OF_SINGLES %d\n'
  352               '#define MAX_CHARNAME_LENGTH %d\n'
  353               '#define NUMBER_OF_CHARNAMES %d\n'
  354               % (singles, self.max_length, len(ucs2_table)))
  355         # Establish a mild compression scheme (one or two bytes per word).
  356         sys.stdout.write("  writing words\n")
  357         write('\n'
  358               'static const char *const word[%d] =\n'
  359               '  {\n'
  360               % count)
  361         char1 = 1
  362         char2 = 1
  363         for counter in range(singles):
  364             word = words[counter]
  365             write('    %-28s/* \\%0.3o */\n'
  366                   % ('"%s",' % re.sub('"', r'\"', word), char1))
  367             self.code_map[words[counter]] = char1
  368             char1 += 1
  369         for counter in range(singles, count):
  370             word = words[counter]
  371             write('    %-28s/* \\%0.3o\\%0.3o */\n'
  372                   % ('"%s",' % re.sub('"', r'\"', word, 1), char1, char2))
  373             self.code_map[words[counter]] = 256 * char1 + char2
  374             if char2 == 255:
  375                 char1 += 1
  376                 char2 = 1
  377             else:
  378                 char2 += 1
  379         write('  };\n')
  380         sys.stdout.write("  writing names\n")
  381         write('\n'
  382               'struct charname\n'
  383               '  {\n'
  384               '    recode_ucs2 code;\n'
  385               '    const char *crypted;\n'
  386               '  };\n'
  387               '\n'
  388               'static const struct charname charname[NUMBER_OF_CHARNAMES] =\n'
  389               '  {\n')
  390         for ucs2 in ucs2_table:
  391             write('    {0x%04X, "' % ucs2)
  392             for word in self.charname_map[ucs2].split():
  393                 if word in self.code_map:
  394                     code = self.code_map[word]
  395                     if code < 256:
  396                         write('\\%0.3o' % code)
  397                     else:
  398                         write('\\%0.3o\\%0.3o' % (code // 256, code % 256))
  399                 else:
  400                     sys.stdout.write('??? %s\n' % word)
  401             write('"},\n')
  402         write('  };\n')
  403 
  404 # Explodes.
  405 
class Explodes(Options):
    """Writer for the C source file named by SOURCES (`explode.c')."""
    SOURCES = 'explode.c'

    def __init__(self):
        """Open the output file and write its two #include lines."""
        Options.__init__(self)
        # Table fragments will be produced while reading data tables.
        self.write = Output(self.SOURCES).write
        write = self.write
        write('\n'
              '#include "config.h"\n'
              '#include "common.h"\n')

    def complete(self, french):
        """Write the `module_explodes' and `delmodule_explodes' functions.

        Does nothing unless sources were requested.  FRENCH is not used
        here.
        """
        if not self.do_sources:
            return
        # Print the collectable initialization function.
        sys.stdout.write("Completing %s\n" % self.SOURCES)
        write = self.write
        write('\n'
              'bool\n'
              'module_explodes (struct recode_outer *outer)\n'
              '{\n')
        count = 0
        # NOTE(review): nothing visible in this class ever sets
        # `self.declare_charset' -- confirm where it is meant to come from.
        while self.declare_charset:
            write('  if (!declare_explode_data (outer, &data_%d, "%s"))\n'
                  '    return false;\n'
                  % (count, self.declare_charset[0]))
            del self.declare_charset[0]
            count += 1
        write('\n')
        # NOTE(review): `declare_alias' is a bare name here, neither a
        # local nor an attribute of self; unless a module-level variable
        # of that name exists elsewhere, this raises NameError -- confirm.
        while declare_alias:
            write('  if (!declare_alias (outer, "%s", "%s"))\n'
                  '    return false;\n'
                  % declare_alias[0])
            del declare_alias[0]
        write('\n'
              '  return true;\n'
              '}\n'
              '\n'
              'void\n'
              'delmodule_explodes (_GL_UNUSED struct recode_outer *outer)\n'
              '{\n'
              '}\n')
  449 
  450 # Iconv.
  451 
  452 class Iconv(Options):
  453     SOURCES = 'iconvdecl.h'
  454 
  455     data = []
  456 
  457     def digest(self):
  458         canonical = {}
  459         for charset in ('Georgian-Academy', 'Georgian-PS', 'MuleLao-1',
  460                         'Macintosh', 'MacArabic', 'MacCentralEurope',
  461                         'MacCroatian', 'MacCyrillic', 'MacGreek', 'MacHebrew',
  462                         'MacIceland', 'MacRoman', 'MacRomania', 'MacThai',
  463                         'MacTurkish', 'MacUkraine'):
  464             canonical[charset.upper()] = charset
  465 
  466         # Read in the encodings.def file.
  467         sys.stdout.write("Reading from `iconv -l'\n")
  468         libc = None
  469         import os
  470         names = []
  471         for line in os.popen('iconv -l'):
  472             if libc is None:
  473                 libc = len(line.split('/')) == 3
  474             if libc:
  475                 first, second, empty = line.split('/')
  476                 assert empty == '\n', repr(line)
  477                 name = second or first
  478                 if name not in names:
  479                     names.append(name)
  480                     self.data.append((name, ()))
  481             else:
  482                 aliases = []
  483                 for alias in line.split():
  484                     if alias in canonical:
  485                         alias = canonical[alias]
  486                     aliases.append(alias)
  487                 self.data.append((aliases[0], aliases[1:]))
  488 
  489     def complete(self, french):
  490         def write_charset(format, charset):
  491             write(format % charset)
  492             write(format % (charset + "-translit"))
  493         if not self.do_sources:
  494             return
  495         write = Output(self.SOURCES).write
  496         count = 1
  497         for charset, aliases in self.data:
  498             versions = 2 # Normal, //TRANSLIT
  499             count = count + (versions + len(aliases)) * versions
  500         write('\n'
  501               "/* This is derived from Bruno Haible's `libiconv' package.  */"
  502               '\n'
  503               'static const char *iconv_name_list[%d] =\n'
  504               '  {\n'
  505               % count)
  506         for charset, aliases in self.data:
  507             if aliases:
  508                 write_charset('    "%s",\n', charset)
  509                 for alias in aliases[:-1]:
  510                     write_charset('\t"%s",\n', alias)
  511                 write_charset('\t"%s", NULL,\n', aliases[-1])
  512             else:
  513                 write_charset('    "%s", NULL,\n', charset)
  514         write('    NULL\n'
  515               '  };\n')
  516 
  517 # Mnemonics.
  518 
  519 class Mnemonics(Options):
  520     SOURCES = 'rfc1345.h'
  521 
  522     # Ignore any mnemonic whose length is greater than MAX_MNEMONIC_LENGTH.
  523     MAX_MNEMONIC_LENGTH = 3
  524 
  525     # Numeric value of a character, given its mnemonic.
  526     ucs2_map = {}
  527 
  528     table_length = 0
  529     mnemonic_map = {}
  530 
  531     # Read in a mnemonics file.
  532     def digest_mnemonics_ds(self, input):
  533         while input.readline():
  534             match = input.match('<([^ \t\n]+)>\t<U(....)>')
  535             if match:
  536                 mnemonic = re.sub('/(.)', r'\1', match.group(1))
  537                 ucs2 = int(match.group(2), 16)
  538                 self.declare(mnemonic, ucs2, input.warn)
  539 
  540     # Read in Keld's list of 10646 characters.
  541     def digest_iso10646_def(self, input):
  542         while True:
  543             line = input.readline()
  544             if not line:
  545                 break
  546             if line == '\n':
  547                 continue
  548             if len(line) == 3:
  549                 continue
  550             if input.begins('   \.\.\.'):
  551                 continue
  552             if line == '   Presentation forms\n':
  553                 continue
  554             if input.begins('   naming: first vertical '):
  555                 continue
  556             match = input.match('   row ([0-9][0-9][0-9])$')
  557             if match and int(match.group(1)) < 256:
  558                 row = int(match.group(1))
  559                 cell = 0
  560                 continue
  561             if line == '   cell 00\n':
  562                 cell = 0
  563                 continue
  564             match = input.match('   cell ([0-9][0-9][0-9])$')
  565             if match and int(match.group(1)) < 256:
  566                 cell = int(match.group(1))
  567                 continue
  568             if input.match('   [^ ]+'):
  569                 if not input.match('   [A-Z][A-Z][A-Z]'):
  570                     continue
  571             if input.match('   [^ ].*'):
  572                 if cell == 256:
  573                     input.warn("Over 256 cells in row %d", row)
  574                 cell += 1
  575                 continue
  576             match = (input.match('([^ ])  [^ ].*')
  577                      or input.match('([^ ][^ ]+) [^ ].*'))
  578             if match:
  579                 if cell == 256:
  580                     input.warn("Over 256 cells in row %d", row)
  581                 self.declare(match.group(1), 256*row + cell, input.warn)
  582                 cell += 1
  583                 continue
  584             input.warn("Unrecognised line")
  585 
  586     # Read the text of RFC 1345, saving all character names it declares.
  587     def digest_rfc1345(self, input):
  588         def read_line(input=input):
  589             skip = False
  590             while True:
  591                 line = input.readline()
  592                 if not line:
  593                     break
  594                 if input.begins('Simonsen'):
  595                     skip = True
  596                     continue
  597                 if skip:
  598                     if input.begins('RFC 1345'):
  599                         skip = False
  600                     continue
  601                 if input.begins('4.  CHARSETS'):
  602                     break
  603                 if line == '\n':
  604                     continue
  605                 if line[0] == ' ':
  606                     return line[:-1].lstrip()
  607             return None
  608         self.max_length = 0
  609         # Read the character descriptions.  Count words in charnames.
  610         line = read_line()
  611         while line:
  612             # Look ahead one line and merge it if it should.
  613             next = read_line()
  614             while next:
  615                 match = re.match('             *( .*)', next)
  616                 if not match:
  617                     break
  618                 line += match.group(1)
  619                 next = read_line()
  620             # Separate fields and save needed information.
  621             match = re.search('([^ ]+) +[0-9a-f]+ +(.*)', line)
  622             if match:
  623                 mnemo = match.group(1)
  624                 text = match.group(2).lower()
  625                 if mnemo in self.ucs2_map:
  626                     run.charnames.declare(self.ucs2_map[mnemo], text)
  627                 elif len(mnemo) <= self.MAX_MNEMONIC_LENGTH:
  628                     input.warn("No known UCS-2 code for `%s'", mnemo)
  629             elif not re.search(' +e000', line):
  630                 input.warn("Unrecognised line")
  631             line = next
  632 
  633     # Declare a correspondence between a mnemonic and an UCS-2 value.
  634     def declare(self, mnemonic, ucs2, warn):
  635         if len(mnemonic) > self.MAX_MNEMONIC_LENGTH:
  636             return
  637         if self.do_sources:
  638             if ucs2 in self.mnemonic_map:
  639                 if self.mnemonic_map[ucs2] != mnemonic:
  640                     warn("U+%04X `%s' known as `%s'",
  641                                ucs2, mnemonic, self.mnemonic_map[ucs2])
  642                     if len(mnemonic) < len(self.mnemonic_map[ucs2]):
  643                         self.mnemonic_map[ucs2] = mnemonic
  644             else:
  645                 self.mnemonic_map[ucs2] = mnemonic
  646                 self.table_length += 1
  647         if mnemonic in self.ucs2_map:
  648             if self.ucs2_map[mnemonic] != ucs2:
  649                 warn("`%s' U+%04X known as U+%04X",
  650                      mnemonic, ucs2, self.ucs2_map[mnemonic])
  651                 #FIXME: ??? cell = self.ucs2_map[mnemonic] - 256*row
  652         else:
  653             self.ucs2_map[mnemonic] = ucs2
  654 
  655     def complete(self, french):
  656         if self.do_sources:
  657             self.complete_sources()
  658 
  659     # Write an UCS-2 to RFC 1345 mnemonic table.
  660     def complete_sources(self):
  661         inverse_map = {}
  662         write = Output(self.SOURCES).write
  663         write('\n'
  664               '#define TABLE_LENGTH %d\n'
  665               '#define MAX_MNEMONIC_LENGTH %d\n'
  666               % (self.table_length, self.MAX_MNEMONIC_LENGTH))
  667         write('\n'
  668               'struct entry\n'
  669               '  {\n'
  670               '    recode_ucs2 code;\n'
  671               '    const char *rfc1345;\n'
  672               '  };\n'
  673               '\n'
  674               'static const struct entry table[TABLE_LENGTH] =\n'
  675               '  {\n')
  676         count = 0
  677         indices = list(self.mnemonic_map.keys())
  678         indices.sort()
  679         for ucs2 in indices:
  680             text = self.mnemonic_map[ucs2]
  681             inverse_map[text] = count
  682             write('    /* %4d */ {0x%04X, "%s"},\n'
  683                   % (count, ucs2, re.sub(r'([\"])', r'\\\1', text)))
  684             count += 1
  685         write('  };\n')
  686 
  687         write('\n'
  688               'static const unsigned short inverse[TABLE_LENGTH] =\n'
  689               '  {')
  690         count = 0
  691         keys = list(inverse_map.keys())
  692         keys.sort()
  693         for text in keys:
  694             if count % 10 == 0:
  695                 if count != 0:
  696                     write(',')
  697                 write('\n    /* %4d */ ' % count)
  698             else:
  699                 write(', ')
  700             write('%4d' % inverse_map[text])
  701             count += 1
  702         write('\n'
  703               '  };\n')
  704 
  705 # Global table of strips.
  706 
  707 class Strips(Options):
  708     POOL = 'strip-pool.c'
  709     DATA = 'strip-data.c'
  710     TEXINFO = 'rfc1345.texi'
  711 
  712     # Change STRIP_SIZE in `src/recode.h' if you change the value here.
  713     # See the accompanying documentation there, as needed.
  714     STRIP_SIZE = 8
  715 
  716     # Prepare the production of tables.
  717     pool_size = 0
  718     pool_refs = 0
  719     strip_map = {}
  720     strips = []
  721 
  722     # While digesting files.
  723     used_map = {}
  724     table = []
  725     declare_alias = []
  726     implied_surface = {}
  727 
  728     def __init__(self):
  729         Options.__init__(self)
  730         self.write_data = None
  731         self.aliases_map = {}
  732         self.remark_map = {}
  733         self.declare_charset = []
  734         # Prepare to read various tables.
  735         self.charset_ordinal = 0
  736         self.discard_charset = False
  737         self.alias_count = 0
  738         self.comment = ''
  739 
  740     def init_write_data(self):
  741         if self.do_sources and not self.write_data:
  742             # Table fragments will be produced while reading data tables.
  743             self.write_data = Output(self.DATA).write
  744             write = self.write_data
  745             write('\n'
  746                   '#include "config.h"\n'
  747                   '#include "common.h"\n'
  748                   '#include "decsteps.h"\n')
  749 
  750     # Read the text of RFC 1345, saving all charsets it declares.
  751     # UCS-2 mnemonics files should have been read in already.
  752     def digest_rfc1345(self, input):
  753         self.init_write_data()
  754         # Informal canonical order of presentation.
  755         CHARSET, REM, ALIAS, ESC, BITS, CODE = range(6)
  756         charset = None
  757         skip = False
  758         while True:
  759             line = input.readline()
  760             if not line:
  761                 break
  762             if input.begins('Simonsen'):
  763                 skip = True
  764                 continue
  765             if skip:
  766                 if input.begins('RFC 1345'):
  767                     skip = False
  768                 continue
  769             if line == '\n':
  770                 continue
  771             if line == 'ACKNOWLEDGEMENTS\n':
  772                 break
  773             line, count = re.subn('^  ?', '', line)
  774             if not count:
  775                 continue
  776             input.line = line
  777             # Recognize `&charset'.
  778             match = input.match('&charset (.*)')
  779             if match:
  780                 # Before beginning a new charset, process the previous one.
  781                 if charset:
  782                     self.charset_done(charset, remark, aliases)
  783                 charset = match.group(1)
  784                 # Prepare for processing a new charset: save the charset
  785                 # name for further declaration; announce this charset in
  786                 # the array initialization section; and initialize its
  787                 # processing.
  788                 if run.verbose:
  789                     sys.stdout.write("  %d) %s\n"
  790                                      % (self.charset_ordinal + 1, charset))
  791                 status = CHARSET
  792                 self.comment = '\n/* %s\n' % charset
  793                 hashname = re.sub('[^a-z0-9]', '', charset.lower())
  794                 if hashname in self.used_map:
  795                     input.warn("Duplicate of %s (discarded)",
  796                                self.used_map[hashname])
  797                     self.discard_charset = True
  798                     continue
  799                 self.used_map[hashname] = charset
  800                 self.alias_count = 0
  801                 self.table = [NOT_A_CHARACTER] * 256
  802                 codedim = 0
  803                 code = 0
  804                 aliases = []
  805                 remark = []
  806                 #FIXME:match = re.match('(CP|IBM|windows-)([0-9]+)$', charset)
  807                 match = re.match('(CP|IBM)([0-9]+)$', charset)
  808                 if match:
  809                     self.implied_surface[match.group(2)] = 'crlf'
  810                     self.implied_surface['CP' + match.group(2)] = 'crlf'
  811                     self.implied_surface['IBM' + match.group(2)] = 'crlf'
  812                     self.declare_alias.append((charset, charset))
  813                     self.alias_count += 1
  814                     continue
  815                 if charset in ('macintosh', 'macintosh_ce'):
  816                     self.implied_surface[charset] = 'cr'
  817                     self.declare_alias.append((charset, charset))
  818                     self.alias_count += 1
  819                     continue
  820                 continue
  821             # Recognize other `&' directives.
  822             match = input.match('&rem (.*)')
  823             if match and not input.begins('&rem &alias'):
  824                 # Keld now prefers `&rem' to be allowed everywhere.
  825                 #if status > REM:
  826                 #    input.warn("`&rem' out of sequence")
  827                 #status = REM;
  828                 if self.do_texinfo:
  829                     # Save remarks for Texinfo.
  830                     text = match.group(1)
  831                     remark.append(text)
  832                 continue
  833             match = input.match('(&rem )?&alias (.*)')
  834             if match:
  835                 if status > ALIAS:
  836                     input.warn("`&alias' out of sequence")
  837                 status = ALIAS
  838                 # Save synonymous charset names for later declarations.
  839                 alias = match.group(2)
  840                 if alias[-1] == ' ':
  841                     input.warn("Spurious trailing whitespace")
  842                     alias = alias.rstrip()
  843                 self.comment = self.comment + '   %s\n' % alias
  844                 hashname = re.sub('[^a-z0-9]', '', alias.lower())
  845                 if hashname in self.used_map:
  846                     if self.used_map[hashname] != charset:
  847                         input.warn("Duplicate of %s", self.used_map[hashname])
  848                         continue
  849                 else:
  850                     self.used_map[hashname] = charset
  851                 aliases.append(alias)
  852                 match = re.match('(CP|IBM)([0-9]+)$', alias)
  853                 if match:
  854                     self.implied_surface[match.group(2)] = 'crlf'
  855                     self.implied_surface['CP' + match.group(2)] = 'crlf'
  856                     self.implied_surface['IBM' + match.group(2)] = 'crlf'
  857                 elif alias in ('mac', 'macce'):
  858                     self.implied_surface[alias] = 'cr'
  859                 self.declare_alias.append((alias, charset))
  860                 self.alias_count += 1
  861                 continue
  862             if input.match('&g[0-4]esc'):
  863                 if status > ESC:
  864                     input.warn("`&esc' out of sequence")
  865                 status = ESC
  866                 continue
  867             match = input.match('&bits ([0-9]+)$')
  868             if match:
  869                 if status > BITS:
  870                     input.warn("`&bits' out of sequence")
  871                 status = BITS
  872                 if int(match.group(1)) > 8:
  873                     input.warn("`&bits %s' not accepted (charset discarded)",
  874                                match.group(1))
  875                     self.discard_charset = True
  876                 continue
  877             match = input.match('&code (.*)')
  878             if match:
  879                 if status > CODE:
  880                     input.warn("`&code' out of sequence")
  881                 status = CODE
  882                 # Save the code position.
  883                 code = int(match.group(1))
  884                 continue
  885             # Other lines cause the charset to be discarded.
  886             match = input.match('&([^ ]+)')
  887             if match:
  888                 if not self.discard_charset:
  889                     input.warn("`&%s' not accepted (charset discarded)",
  890                                match.group(1))
  891                     self.discard_charset = True
  892             if self.discard_charset:
  893                 continue
  894             # Save all other tokens into the double table.
  895             for token in line.split():
  896                 if token == '??':
  897                     self.table[code] = NOT_A_CHARACTER
  898                 elif token == '__':
  899                     self.table[code] = REPLACEMENT_CHARACTER
  900                 elif token in run.mnemonics.ucs2_map:
  901                     self.table[code] = run.mnemonics.ucs2_map[token]
  902                     if len(token) > codedim:
  903                         codedim = len(token)
  904                 else:
  905                     input.warn("Unknown mnemonic for code: %s", token)
  906                     self.table[code] = REPLACEMENT_CHARACTER
  907                 code += 1
  908         # Push the last charset out.
  909         self.charset_done(charset, remark, aliases)
  910 
  911     # Read a Unicode map, as found in ftp://ftp.unicode.com/MAPPINGS.
  912     def digest_unimap(self, input):
  913         self.init_write_data()
  914         line = input.line
  915         match = input.match('# +Name: +([^ ]+) to Unicode table$')
  916         if match:
  917             # Set comment.
  918             name = match.group(1).split()
  919             charset = name[0]
  920             del name[0]
  921             self.comment = '\n/* %s\n' % charset
  922             # Set charset.
  923             hashname = re.sub('[^a-z0-9]', '', charset.lower())
  924             if self.used_map[hashname]:
  925                 input.warn("`%s' duplicates `%s' (charset discarded)",
  926                            hashname, self.used_map[hashname])
  927                 self.discard_charset = True
  928                 return
  929             self.used_map[hashname] = charset
  930             # Prepare for read.
  931             self.alias_count = 0
  932             self.table = [NOT_A_CHARACTER] * 256
  933             codedim = 0
  934             code = 0
  935             aliases = []
  936             remark = []
  937         if self.discard_charset:
  938             return
  939         # Process aliases.
  940         for alias in name:
  941             self.comment = self.comment + '   %s\n' % alias
  942 
  943             hashname = re.sub('[^a-z0-9]', '', alias.lower())
  944             if self.used_map[hashname] and self.used_map[hashname] != charset:
  945                 input.warn("`%s' duplicates `%s'", hashname,
  946                            self.used_map[hashname])
  947                 continue
  948             self.used_map[hashname] = charset
  949 
  950             aliases.append(alias)
  951             self.declare_alias.append((alias, charset))
  952             self.alias_count += 1
  953         # Read table contents.
  954         while True:
  955             line = input.readline()
  956             if not line:
  957                 break
  958             if line == '\n':
  959                 continue
  960             if line[0] == '#':
  961                 continue
  962             if input.match('0x([0-9A-F]+)\t\t#UNDEFINED$'):
  963                 continue
  964             if input.search('\032'):
  965                 # Old MS-DOS C-z !!
  966                 break
  967             match = input.match('0x([0-9A-F]+)\t0x([0-9A-F]+)\t\#')
  968             if match:
  969                 self.table[int(match.group(1), 16)] = int(match.group(2), 16)
  970             else:
  971                 input.warn("Unrecognised input line")
  972         # Complete processing.
  973         self.charset_done(charset, remark, aliases)
  974 
  975     # Print all accumulated information for the charset.  If the
  976     # charset should be discarded, adjust tables.
  977     def charset_done(self, charset, remark, aliases):
  978         if self.discard_charset:
  979             while self.alias_count > 0:
  980                 del self.declare_alias[-1]
  981                 self.alias_count -= 1
  982             self.discard_charset = False
  983             self.comment = ''
  984         if not self.comment:
  985             return
  986         if self.do_texinfo:
  987             # Save the documentation.
  988             aliases.sort()
  989             self.aliases_map[charset] = aliases
  990             self.remark_map[charset] = remark
  991         if run.explodes:
  992             write = run.explodes.write
  993             # Make introductory C comments.
  994             write(self.comment)
  995             write('*/\n')
  996             # Make the table for this charset.
  997             write('\n'
  998                   'static const unsigned short data_%d[] =\n'
  999                   '  {\n'
 1000                   % self.charset_ordinal)
 1001             for code in range(256):
 1002                 if code != self.table[code]:
 1003                     write('    %3d, 0x%.4X, DONE,\n'
 1004                           % (code, self.table[code]))
 1005             write('    DONE\n'
 1006                   '  };\n')
 1007             # Register the table.
 1008             self.declare_charset.append(charset)
 1009         if self.do_sources:
 1010             write = self.write_data
 1011             # Make introductory C comments.
 1012             write(self.comment)
 1013             write('*/\n')
 1014             # Make the table for this charset.
 1015             write('\n'
 1016                   'static struct strip_data data_%d =\n'
 1017                   '  {\n'
 1018                   '    ucs2_data_pool,\n'
 1019                   '    {\n'
 1020                   % self.charset_ordinal)
 1021             count = 0
 1022             for code in range(0, 256, self.STRIP_SIZE):
 1023                 if count % 12 == 0:
 1024                     if count != 0:
 1025                         write(',\n')
 1026                     write('      ')
 1027                 else:
 1028                     write(', ')
 1029                 strip = self.table[code:code+self.STRIP_SIZE]
 1030                 write('%4d' % self.pool_index(strip))
 1031                 count += 1
 1032             write('\n'
 1033                   '    }\n'
 1034                   '  };\n')
 1035             # Register the table.
 1036             self.declare_charset.append(charset)
 1037         self.charset_ordinal += 1
 1038         self.comment = ''
 1039 
 1040     # Return the pool index for strip.  Add to the pool as required.
 1041     def pool_index(self, strip):
 1042 
 1043         def format(item):
 1044             return '%04X' % item
 1045 
 1046         self.pool_refs += 1
 1047         text = ''.join(map(format, strip))
 1048         if text not in self.strip_map:
 1049             self.strip_map[text] = self.pool_size
 1050             self.pool_size = self.pool_size + self.STRIP_SIZE
 1051             self.strips.append(text)
 1052         return self.strip_map[text]
 1053 
 1054     def complete(self, french):
 1055         if self.do_sources:
 1056             self.complete_sources()
 1057         if self.do_texinfo:
 1058             self.complete_texinfo(french)
 1059 
 1060     def complete_sources(self):
 1061         # Give memory statistics.
 1062         sys.stdout.write('Table memory = %d bytes (pool %d, refs %d)\n'
 1063                          % (self.pool_size * 2 + self.pool_refs * 2,
 1064                             self.pool_size * 2,
 1065                             self.pool_refs * 2))
 1066 
 1067         # Print the collectable initialization function.
 1068         sys.stdout.write("Completing %s\n" % self.DATA)
 1069         write = self.write_data
 1070         write('\n'
 1071               '_GL_ATTRIBUTE_CONST bool\n'
 1072               'module_strips (_GL_UNUSED struct recode_outer *outer)\n'
 1073               '{\n'
 1074               '  RECODE_ALIAS alias;\n'
 1075               '\n')
 1076         count = 0
 1077         while self.declare_charset:
 1078             write('  if (!declare_strip_data (outer, &data_%d, "%s"))\n'
 1079                   '    return false;\n'
 1080                   % (count, self.declare_charset[0]))
 1081             del self.declare_charset[0]
 1082             count += 1
 1083         write('\n')
 1084         while self.declare_alias:
 1085             alias, charset = self.declare_alias[0]
 1086             if alias in self.implied_surface:
 1087                 write('  if (alias = declare_alias (outer, "%s", "%s"),'
 1088                       ' !alias)\n'
 1089                       '    return false;\n'
 1090                       % self.declare_alias[0])
 1091                 write('  if (!declare_implied_surface (outer, alias,'
 1092                       ' outer->%s_surface))\n'
 1093                       '    return false;\n'
 1094                       % self.implied_surface[alias])
 1095             else:
 1096                 write('  if (!declare_alias (outer, "%s", "%s"))\n'
 1097                       '    return false;\n'
 1098                       % self.declare_alias[0])
 1099             del self.declare_alias[0]
 1100         write('\n'
 1101               '  return true;\n'
 1102               '}\n'
 1103               '\n'
 1104               'void\n'
 1105               'delmodule_strips (_GL_UNUSED struct recode_outer *outer)\n'
 1106               '{\n'
 1107               '}\n')
 1108 
 1109         # Write the pool file.
 1110         write = Output(self.POOL).write
 1111         write('\n'
 1112               '#include "config.h"\n'
 1113               '#include "common.h"\n'
 1114               '\n'
 1115               'const recode_ucs2 ucs2_data_pool[%d] =\n'
 1116               '  {'
 1117               % self.pool_size)
 1118         count = 0
 1119         for strip in self.strips:
 1120             for pos in range(0, self.STRIP_SIZE * 4, 4):
 1121                 if count % 8 == 0:
 1122                     if count != 0:
 1123                         write(',')
 1124                     write('\n    /* %4d */ ' % count)
 1125                 else:
 1126                     write(', ')
 1127                 write('0x' + strip[pos:pos+4])
 1128                 count += 1
 1129         write('\n'
 1130               '  };\n')
 1131 
 1132     def complete_texinfo(self, french):
 1133         if french:
 1134             write = Output('fr-%s' % self.TEXINFO, noheader=True).write
 1135         else:
 1136             write = Output(self.TEXINFO, noheader=True).write
 1137         charsets = list(self.remark_map.keys())
 1138         charsets.sort()
 1139         for charset in charsets:
 1140             write('\n'
 1141                   '@item %s\n'
 1142                   '@tindex %s@r{, aliases and source}\n'
 1143                   % (charset, re.sub(':([0-9]+)', r'(\1)', charset)))
 1144             aliases = self.aliases_map[charset]
 1145             if aliases:
 1146                 if len(aliases) == 1:
 1147                     if aliases[0]:      # FIXME: why empty sometimes?
 1148                         write('@tindex %s\n'
 1149                               '@code{%s} is an alias for this charset.\n'
 1150                               % (re.sub(':([0-9]+)', r'(\1)', aliases[0]),
 1151                                  aliases[0]))
 1152                 else:
 1153                     for alias in aliases:
 1154                         write('@tindex %s\n'
 1155                               % re.sub(':([0-9]+)', r'(\1)', alias))
 1156                     write('@code{%s} and @code{%s} are aliases'
 1157                           ' for this charset.\n'
 1158                           % ('}, @code{'.join(aliases[:-1]), aliases[-1]))
 1159             for line in self.remark_map[charset]:
 1160                 if line[0].islower():
 1161                     line = line[0].upper() + line[1:]
 1162                 write(line.replace('@', '@@'))
 1163                 if line[-1] != '.':
 1164                     write('.')
 1165                 write('\n')
 1166 
 1167 # Handling basic input and output.
 1168 
 1169 class Input:
 1170 
 1171     def __init__(self, name):
 1172         self.name = name
 1173         self.input = io.open(name, encoding='latin-1')
 1174         self.line_count = 0
 1175         sys.stdout.write("Reading %s\n" % name)
 1176 
 1177     def readline(self):
 1178         self.line = self.input.readline()
 1179         self.line_count += 1
 1180         if type(self.line) == bytes:
 1181             self.line = self.line.decode('utf-8')
 1182         return self.line
 1183 
 1184     def warn(self, format, *args):
 1185         if run.verbose:
 1186             sys.stdout.write('%s:%s: %s\n'
 1187                              % (self.name, self.line_count, format % args))
 1188 
 1189     def die(self, format, *args):
 1190         sys.stdout.write('%s:%s: %s\n'
 1191                          % (self.name, self.line_count, format % args))
 1192         raise 'Fatal'
 1193 
 1194     def begins(self, text):
 1195         return self.line[:len(text)] == text
 1196 
 1197     def match(self, pattern):
 1198         return re.match(pattern, self.line)
 1199 
 1200     def search(self, pattern):
 1201         return re.search(pattern, self.line)
 1202 
 1203 class Output:
 1204 
 1205     def __init__(self, name, noheader=False):
 1206         self.name = name
 1207         self.write = open(name, 'w', encoding='utf-8').write
 1208         sys.stdout.write("Writing %s\n" % name)
 1209         if not noheader:
 1210             self.write("""\
 1211 /* DO NOT MODIFY THIS FILE!  It was generated by `recode/tables.py'.  */
 1212 
 1213 /* Conversion of files between different charsets and surfaces.
 1214    Copyright © 1999 Free Software Foundation, Inc.
 1215    Contributed by François Pinard <pinard@iro.umontreal.ca>, 1993, 1997.
 1216 
 1217    This library is free software; you can redistribute it and/or
 1218    modify it under the terms of the GNU Lesser General Public License
 1219    as published by the Free Software Foundation; either version 3 of the
 1220    License, or (at your option) any later version.
 1221 
 1222    This library is distributed in the hope that it will be
 1223    useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 1224    of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 1225    Lesser General Public License for more details.
 1226 
 1227    You should have received a copy of the GNU Lesser General Public
 1228    License along with the Recode Library; see the file `COPYING.LIB'.
 1229    If not, see <https://www.gnu.org/licenses/>.
 1230    Suite 330, Boston, MA 02111-1307, USA.  */
 1231 """)
 1232 
# When run as a script, hand the command-line arguments (without the
# program name) over to main().
if __name__ == '__main__':
    main(*sys.argv[1:])