tables.py (recode-3.7.4) | : | tables.py (recode-3.7.5) | ||
---|---|---|---|---|
#!/usr/bin/python | #!/usr/bin/python3 | |||
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | |||
# Automatically derive Recode table files from various sources. | # Automatically derive Recode table files from various sources. | |||
# Copyright © 1993, 1994, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. | # Copyright © 1993, 1994, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. | |||
# François Pinard <pinard@iro.umontreal.ca>, 1993. | # François Pinard <pinard@iro.umontreal.ca>, 1993. | |||
# This program is free software; you can redistribute it and/or modify | # This program is free software; you can redistribute it and/or modify | |||
# it under the terms of the GNU General Public License as published by | # it under the terms of the GNU General Public License as published by | |||
# the Free Software Foundation; either version 3, or (at your option) | # the Free Software Foundation; either version 3, or (at your option) | |||
# any later version. | # any later version. | |||
skipping to change at line 43 | skipping to change at line 43 | |||
Modality options: | Modality options: | |||
-C DIRECTORY Change to DIRECTORY prior to processing | -C DIRECTORY Change to DIRECTORY prior to processing | |||
-F Produce French versions for -n, -s or -t | -F Produce French versions for -n, -s or -t | |||
-v Increase verbosity | -v Increase verbosity | |||
DATA-FILEs may be rfc1345.txt, mnemonic[.,]ds, Unicode maps, or .def files | DATA-FILEs may be rfc1345.txt, mnemonic[.,]ds, Unicode maps, or .def files | |||
from Keld's chset* packages. The digesting order is usually important. | from Keld's chset* packages. The digesting order is usually important. | |||
When `-F' and `-n' are used, process Alain's tables. | When `-F' and `-n' are used, process Alain's tables. | |||
""" | """ | |||
import re, sys | import re, sys, io | |||
# Character constants. | # Character constants. | |||
REPLACEMENT_CHARACTER = 0xFFFD | REPLACEMENT_CHARACTER = 0xFFFD | |||
NOT_A_CHARACTER = 0xFFFF | NOT_A_CHARACTER = 0xFFFF | |||
# Main driver. | # Main driver. | |||
class Main: | class Main: | |||
directory = None | directory = None | |||
charnames = None | charnames = None | |||
skipping to change at line 202 | skipping to change at line 202 | |||
charname_map = {} | charname_map = {} | |||
# Maximum printable length of a character name. | # Maximum printable length of a character name. | |||
max_length = 0 | max_length = 0 | |||
# Frequency of each word, then its crypt code. | # Frequency of each word, then its crypt code. | |||
code_map = {} | code_map = {} | |||
def digest_french(self, input): | def digest_french(self, input): | |||
self.preset_french() | self.preset_french() | |||
fold_table = range(256) | fold_table = list(range(256)) | |||
for before, after in map( | for before, after in zip( | |||
None, | ||||
u'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÂÇÈÉÊÎÏÑÔÖÛ'.encode('ISO-8859-1'), | u'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÂÇÈÉÊÎÏÑÔÖÛ'.encode('ISO-8859-1'), | |||
u'abcdefghijklmnopqrstuvwxyzàâçèéêîïñôöû'.encode('ISO-8859-1')): | u'abcdefghijklmnopqrstuvwxyzàâçèéêîïñôöû'.encode('ISO-8859-1')): | |||
fold_table[ord(before)] = ord(after) | fold_table[before] = after | |||
folding = ''.join(map(chr, fold_table)) | folding = ''.join(map(chr, fold_table)) | |||
ignorables = ( | ignorables = ( | |||
u'<commande>'.encode('ISO-8859-1'), | u'<commande>'.encode('ISO-8859-1'), | |||
u'<réservé>'.encode('ISO-8859-1'), | u'<réservé>'.encode('ISO-8859-1'), | |||
u'<pas un caractère>'.encode('ISO-8859-1')) | u'<pas un caractère>'.encode('ISO-8859-1')) | |||
while True: | while True: | |||
line = input.readline() | line = input.readline() | |||
if not line: | if not line: | |||
break | break | |||
if input.begins('@@\t'): | if input.begins('@@\t'): | |||
skipping to change at line 269 | skipping to change at line 268 | |||
u"fin de transmission de bloc (etb)", # 0017 | u"fin de transmission de bloc (etb)", # 0017 | |||
u"annulation (can)", # 0018 | u"annulation (can)", # 0018 | |||
u"fin de support (em)", # 0019 | u"fin de support (em)", # 0019 | |||
u"caractère de substitution (sub)", # 001A | u"caractère de substitution (sub)", # 001A | |||
u"échappement (esc)", # 001B | u"échappement (esc)", # 001B | |||
u"séparateur de fichier (fs)", # 001C | u"séparateur de fichier (fs)", # 001C | |||
u"séparateur de groupe (gs)", # 001D | u"séparateur de groupe (gs)", # 001D | |||
u"séparateur d'article (rs)", # 001E | u"séparateur d'article (rs)", # 001E | |||
u"séparateur de sous-article (us)", # 001F | u"séparateur de sous-article (us)", # 001F | |||
): | ): | |||
self.declare(ucs, text.encode('ISO-8859-1')) | self.declare(ucs, text) | |||
ucs += 1 | ucs += 1 | |||
ucs = 0x007F | ucs = 0x007F | |||
for text in ( | for text in ( | |||
u"suppression (del)", # 007F | u"suppression (del)", # 007F | |||
u"caractère de bourre (pad)", # 0080 | u"caractère de bourre (pad)", # 0080 | |||
u"octet supérieur prédéfini (hop)", # 0081 | u"octet supérieur prédéfini (hop)", # 0081 | |||
u"arrêt permis ici (bph)", # 0082 | u"arrêt permis ici (bph)", # 0082 | |||
u"aucun arrêt ici (nbh)", # 0083 | u"aucun arrêt ici (nbh)", # 0083 | |||
u"index (ind)", # 0084 | u"index (ind)", # 0084 | |||
u"à la ligne (nel)", # 0085 | u"à la ligne (nel)", # 0085 | |||
skipping to change at line 307 | skipping to change at line 306 | |||
u"fin de zone protégée (ega)", # 0097 | u"fin de zone protégée (ega)", # 0097 | |||
u"début de chaîne (sos)", # 0098 | u"début de chaîne (sos)", # 0098 | |||
u"introducteur de caractère graphique unique (sgci)",# 0099 | u"introducteur de caractère graphique unique (sgci)",# 0099 | |||
u"introducteur de caractère unique (sci)", # 009A | u"introducteur de caractère unique (sci)", # 009A | |||
u"introducteur de séquence de commande (csi)", # 009B | u"introducteur de séquence de commande (csi)", # 009B | |||
u"fin de chaîne (st)", # 009C | u"fin de chaîne (st)", # 009C | |||
u"commande de système d'exploitation (osc)", # 009D | u"commande de système d'exploitation (osc)", # 009D | |||
u"message privé (pm)", # 009E | u"message privé (pm)", # 009E | |||
u"commande de progiciel (apc)", # 009F | u"commande de progiciel (apc)", # 009F | |||
): | ): | |||
self.declare(ucs, text.encode('ISO-8859-1')) | self.declare(ucs, text) | |||
ucs += 1 | ucs += 1 | |||
def declare(self, ucs, text): | def declare(self, ucs, text): | |||
self.charname_map[ucs] = text | self.charname_map[ucs] = text | |||
if len(text) > self.max_length: | if len(text) > self.max_length: | |||
self.max_length = len(text) | self.max_length = len(text) | |||
for word in text.split(): | for word in text.split(): | |||
self.code_map[word] = self.code_map.get(word, 0) + 1 | self.code_map[word] = self.code_map.get(word, 0) + 1 | |||
def presort_word(self, word): | def presort_word(self, word): | |||
skipping to change at line 335 | skipping to change at line 334 | |||
write = Output('fr-%s' % self.SOURCES).write | write = Output('fr-%s' % self.SOURCES).write | |||
else: | else: | |||
write = Output(self.SOURCES).write | write = Output(self.SOURCES).write | |||
# Establish a mild compression scheme. Words word[:singles] | # Establish a mild compression scheme. Words word[:singles] | |||
# will be represented by a single byte running from 1 to | # will be represented by a single byte running from 1 to | |||
# singles. All remaining words will be represented by two | # singles. All remaining words will be represented by two | |||
# bytes, the first one running slowly from singles+1 to 255, | # bytes, the first one running slowly from singles+1 to 255, | |||
# the second cycling faster from 1 to 255. | # the second cycling faster from 1 to 255. | |||
if run.verbose: | if run.verbose: | |||
sys.stdout.write(' sorting words...') | sys.stdout.write(' sorting words...') | |||
pairs = map(self.presort_word, self.code_map.keys()) | pairs = list(map(self.presort_word, self.code_map.keys())) | |||
pairs.sort() | pairs.sort() | |||
words = map(lambda pair: pair[1], pairs) | words = list(map(lambda pair: pair[1], pairs)) | |||
pairs = None | pairs = None | |||
if run.verbose: | if run.verbose: | |||
sys.stdout.write(' %d of them\n' % len(words)) | sys.stdout.write(' %d of them\n' % len(words)) | |||
count = len(words) | count = len(words) | |||
singles = (255 * 255 - count) / 254 | singles = (255 * 255 - count) // 254 | |||
# Transmit a few values for further usage by the C code. | # Transmit a few values for further usage by the C code. | |||
if run.verbose: | if run.verbose: | |||
sys.stdout.write(' sorting names...') | sys.stdout.write(' sorting names...') | |||
ucs2_table = self.charname_map.keys() | ucs2_table = list(self.charname_map.keys()) | |||
ucs2_table.sort() | ucs2_table.sort() | |||
if run.verbose: | if run.verbose: | |||
sys.stdout.write(' %d of them\n' % len(ucs2_table)) | sys.stdout.write(' %d of them\n' % len(ucs2_table)) | |||
write('\n' | write('\n' | |||
'#define NUMBER_OF_SINGLES %d\n' | '#define NUMBER_OF_SINGLES %d\n' | |||
'#define MAX_CHARNAME_LENGTH %d\n' | '#define MAX_CHARNAME_LENGTH %d\n' | |||
'#define NUMBER_OF_CHARNAMES %d\n' | '#define NUMBER_OF_CHARNAMES %d\n' | |||
% (singles, self.max_length, len(ucs2_table))) | % (singles, self.max_length, len(ucs2_table))) | |||
# Establish a mild compression scheme (one or two bytes per word). | # Establish a mild compression scheme (one or two bytes per word). | |||
sys.stdout.write(" writing words\n") | sys.stdout.write(" writing words\n") | |||
skipping to change at line 398 | skipping to change at line 397 | |||
'static const struct charname charname[NUMBER_OF_CHARNAMES] =\n' | 'static const struct charname charname[NUMBER_OF_CHARNAMES] =\n' | |||
' {\n') | ' {\n') | |||
for ucs2 in ucs2_table: | for ucs2 in ucs2_table: | |||
write(' {0x%04X, "' % ucs2) | write(' {0x%04X, "' % ucs2) | |||
for word in self.charname_map[ucs2].split(): | for word in self.charname_map[ucs2].split(): | |||
if word in self.code_map: | if word in self.code_map: | |||
code = self.code_map[word] | code = self.code_map[word] | |||
if code < 256: | if code < 256: | |||
write('\\%0.3o' % code) | write('\\%0.3o' % code) | |||
else: | else: | |||
write('\\%0.3o\\%0.3o' % (code / 256, code % 256)) | write('\\%0.3o\\%0.3o' % (code // 256, code % 256)) | |||
else: | else: | |||
sys.stdout.write('??? %s\n' % word) | sys.stdout.write('??? %s\n' % word) | |||
write('"},\n') | write('"},\n') | |||
write(' };\n') | write(' };\n') | |||
# Explodes. | # Explodes. | |||
class Explodes(Options): | class Explodes(Options): | |||
SOURCES = 'explode.c' | SOURCES = 'explode.c' | |||
def __init__(self): | def __init__(self): | |||
skipping to change at line 665 | skipping to change at line 664 | |||
write('\n' | write('\n' | |||
'struct entry\n' | 'struct entry\n' | |||
' {\n' | ' {\n' | |||
' recode_ucs2 code;\n' | ' recode_ucs2 code;\n' | |||
' const char *rfc1345;\n' | ' const char *rfc1345;\n' | |||
' };\n' | ' };\n' | |||
'\n' | '\n' | |||
'static const struct entry table[TABLE_LENGTH] =\n' | 'static const struct entry table[TABLE_LENGTH] =\n' | |||
' {\n') | ' {\n') | |||
count = 0 | count = 0 | |||
indices = self.mnemonic_map.keys() | indices = list(self.mnemonic_map.keys()) | |||
indices.sort() | indices.sort() | |||
for ucs2 in indices: | for ucs2 in indices: | |||
text = self.mnemonic_map[ucs2] | text = self.mnemonic_map[ucs2] | |||
inverse_map[text] = count | inverse_map[text] = count | |||
write(' /* %4d */ {0x%04X, "%s"},\n' | write(' /* %4d */ {0x%04X, "%s"},\n' | |||
% (count, ucs2, re.sub(r'([\"])', r'\\\1', text))) | % (count, ucs2, re.sub(r'([\"])', r'\\\1', text))) | |||
count += 1 | count += 1 | |||
write(' };\n') | write(' };\n') | |||
write('\n' | write('\n' | |||
'static const unsigned short inverse[TABLE_LENGTH] =\n' | 'static const unsigned short inverse[TABLE_LENGTH] =\n' | |||
' {') | ' {') | |||
count = 0 | count = 0 | |||
keys = inverse_map.keys() | keys = list(inverse_map.keys()) | |||
keys.sort() | keys.sort() | |||
for text in keys: | for text in keys: | |||
if count % 10 == 0: | if count % 10 == 0: | |||
if count != 0: | if count != 0: | |||
write(',') | write(',') | |||
write('\n /* %4d */ ' % count) | write('\n /* %4d */ ' % count) | |||
else: | else: | |||
write(', ') | write(', ') | |||
write('%4d' % inverse_map[text]) | write('%4d' % inverse_map[text]) | |||
count += 1 | count += 1 | |||
skipping to change at line 1122 | skipping to change at line 1121 | |||
write('0x' + strip[pos:pos+4]) | write('0x' + strip[pos:pos+4]) | |||
count += 1 | count += 1 | |||
write('\n' | write('\n' | |||
' };\n') | ' };\n') | |||
def complete_texinfo(self, french): | def complete_texinfo(self, french): | |||
if french: | if french: | |||
write = Output('fr-%s' % self.TEXINFO, noheader=True).write | write = Output('fr-%s' % self.TEXINFO, noheader=True).write | |||
else: | else: | |||
write = Output(self.TEXINFO, noheader=True).write | write = Output(self.TEXINFO, noheader=True).write | |||
charsets = self.remark_map.keys() | charsets = list(self.remark_map.keys()) | |||
charsets.sort() | charsets.sort() | |||
for charset in charsets: | for charset in charsets: | |||
write('\n' | write('\n' | |||
'@item %s\n' | '@item %s\n' | |||
'@tindex %s@r{, aliases and source}\n' | '@tindex %s@r{, aliases and source}\n' | |||
% (charset, re.sub(':([0-9]+)', r'(\1)', charset))) | % (charset, re.sub(':([0-9]+)', r'(\1)', charset))) | |||
aliases = self.aliases_map[charset] | aliases = self.aliases_map[charset] | |||
if aliases: | if aliases: | |||
if len(aliases) == 1: | if len(aliases) == 1: | |||
if aliases[0]: # FIXME: why empty sometimes? | if aliases[0]: # FIXME: why empty sometimes? | |||
skipping to change at line 1157 | skipping to change at line 1156 | |||
write(line.replace('@', '@@')) | write(line.replace('@', '@@')) | |||
if line[-1] != '.': | if line[-1] != '.': | |||
write('.') | write('.') | |||
write('\n') | write('\n') | |||
# Handling basic input and output. | # Handling basic input and output. | |||
class Input: | class Input: | |||
def __init__(self, name): | def __init__(self, name): | |||
self.name = name | self.name = name | |||
self.input = file(name) | self.input = io.open(name, encoding='latin-1') | |||
self.line_count = 0 | self.line_count = 0 | |||
sys.stdout.write("Reading %s\n" % name) | sys.stdout.write("Reading %s\n" % name) | |||
def readline(self): | def readline(self): | |||
self.line = self.input.readline() | self.line = self.input.readline() | |||
self.line_count += 1 | self.line_count += 1 | |||
| | if type(self.line) == bytes: | |||
| | self.line = self.line.decode('utf-8') | |||
return self.line | return self.line | |||
def warn(self, format, *args): | def warn(self, format, *args): | |||
if run.verbose: | if run.verbose: | |||
sys.stdout.write('%s:%s: %s\n' | sys.stdout.write('%s:%s: %s\n' | |||
% (self.name, self.line_count, format % args)) | % (self.name, self.line_count, format % args)) | |||
def die(self, format, *args): | def die(self, format, *args): | |||
sys.stdout.write('%s:%s: %s\n' | sys.stdout.write('%s:%s: %s\n' | |||
% (self.name, self.line_count, format % args)) | % (self.name, self.line_count, format % args)) | |||
skipping to change at line 1189 | skipping to change at line 1190 | |||
def match(self, pattern): | def match(self, pattern): | |||
return re.match(pattern, self.line) | return re.match(pattern, self.line) | |||
def search(self, pattern): | def search(self, pattern): | |||
return re.search(pattern, self.line) | return re.search(pattern, self.line) | |||
class Output: | class Output: | |||
def __init__(self, name, noheader=False): | def __init__(self, name, noheader=False): | |||
self.name = name | self.name = name | |||
self.write = file(name, 'w').write | self.write = open(name, 'w', encoding='utf-8').write | |||
sys.stdout.write("Writing %s\n" % name) | sys.stdout.write("Writing %s\n" % name) | |||
if not noheader: | if not noheader: | |||
self.write("""\ | self.write("""\ | |||
/* DO NOT MODIFY THIS FILE! It was generated by `recode/tables.py'. */ | /* DO NOT MODIFY THIS FILE! It was generated by `recode/tables.py'. */ | |||
/* Conversion of files between different charsets and surfaces. | /* Conversion of files between different charsets and surfaces. | |||
Copyright © 1999 Free Software Foundation, Inc. | Copyright © 1999 Free Software Foundation, Inc. | |||
Contributed by François Pinard <pinard@iro.umontreal.ca>, 1993, 1997. | Contributed by François Pinard <pinard@iro.umontreal.ca>, 1993, 1997. | |||
This library is free software; you can redistribute it and/or | This library is free software; you can redistribute it and/or | |||
End of changes. 17 change blocks. | ||||
18 lines changed or deleted | 19 lines changed or added |