w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

gen-tag-table.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 """Generator of the mapping from OpenType tags to BCP 47 tags and vice
4 versa.
5 
6 It creates a ``const LangTag[]``, matching the tags from the OpenType
7 languages system tag list to the language subtags of the BCP 47 language
8 subtag registry, with some manual adjustments. The mappings are
9 supplemented with macrolanguages' sublanguages and retired codes'
10 replacements, according to BCP 47 and some manual additions where BCP 47
11 omits a retired code entirely.
12 
13 Also generated is a function, ``hb_ot_ambiguous_tag_to_language``,
14 intended for use by ``hb_ot_tag_to_language``. It maps OpenType tags
15 back to BCP 47 tags. Ambiguous OpenType tags (those that correspond to
16 multiple BCP 47 tags) are listed here, except when the alphabetically
17 first BCP 47 tag happens to be the chosen disambiguated tag. In that
18 case, the fallback behavior will choose the right tag anyway.
19 
20 usage: ./gen-tag-table.py languagetags language-subtag-registry
21 
22 Input files:
23 * https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags
24 * https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
25 """
26 
27 import collections
28 from html.parser import HTMLParser
def write (s):
    """Write ``s`` to standard output encoded as UTF-8.

    The text layer of ``sys.stdout`` is flushed before the bytes are
    written to the underlying binary buffer, so output produced earlier
    with ``print`` is not reordered after this text.

    Args:
        s (str): The text to write.
    """
    sys.stdout.flush ()
    sys.stdout.buffer.write (s.encode ('utf-8'))
32 import itertools
33 import re
34 import sys
35 import unicodedata
36 
# Exactly two input files are required; otherwise exit with the usage
# text from the module docstring.
if len (sys.argv) != 3:
    sys.exit (__doc__)
39 
from html import unescape
def html_unescape (parser, entity):
    """Decode an HTML character or entity reference.

    Args:
        parser: Unused; accepted so that callers can pass the parser
            instance.
        entity (str): The reference text, e.g. ``'&amp;'`` or
            ``'&#233;'``.

    Returns:
        The decoded text.
    """
    return unescape (entity)
43 
def expect (condition, message=None):
    """Raise ``AssertionError`` unless ``condition`` is truthy.

    Unlike the ``assert`` statement, this check is not removed when
    Python runs with ``-O``.

    Args:
        condition: The value that must be truthy.
        message (str): Optional text for the raised error. If ``None``,
            the error is raised without arguments.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    if not condition:
        if message is None:
            raise AssertionError
        raise AssertionError (message)
49 
# The empty string stands for the default language system (no OpenType tag).
DEFAULT_LANGUAGE_SYSTEM = ''

# Three-letter ISO 639-3 codes mapped to their two-letter ISO 639-1
# equivalents.
# from https://www-01.sil.org/iso639-3/iso-639-3.tab
ISO_639_3_TO_1 = {
    'aar': 'aa',
    'abk': 'ab',
    'afr': 'af',
    'aka': 'ak',
    'amh': 'am',
    'ara': 'ar',
    'arg': 'an',
    'asm': 'as',
    'ava': 'av',
    'ave': 'ae',
    'aym': 'ay',
    'aze': 'az',
    'bak': 'ba',
    'bam': 'bm',
    'bel': 'be',
    'ben': 'bn',
    'bis': 'bi',
    'bod': 'bo',
    'bos': 'bs',
    'bre': 'br',
    'bul': 'bg',
    'cat': 'ca',
    'ces': 'cs',
    'cha': 'ch',
    'che': 'ce',
    'chu': 'cu',
    'chv': 'cv',
    'cor': 'kw',
    'cos': 'co',
    'cre': 'cr',
    'cym': 'cy',
    'dan': 'da',
    'deu': 'de',
    'div': 'dv',
    'dzo': 'dz',
    'ell': 'el',
    'eng': 'en',
    'epo': 'eo',
    'est': 'et',
    'eus': 'eu',
    'ewe': 'ee',
    'fao': 'fo',
    'fas': 'fa',
    'fij': 'fj',
    'fin': 'fi',
    'fra': 'fr',
    'fry': 'fy',
    'ful': 'ff',
    'gla': 'gd',
    'gle': 'ga',
    'glg': 'gl',
    'glv': 'gv',
    'grn': 'gn',
    'guj': 'gu',
    'hat': 'ht',
    'hau': 'ha',
    'hbs': 'sh',
    'heb': 'he',
    'her': 'hz',
    'hin': 'hi',
    'hmo': 'ho',
    'hrv': 'hr',
    'hun': 'hu',
    'hye': 'hy',
    'ibo': 'ig',
    'ido': 'io',
    'iii': 'ii',
    'iku': 'iu',
    'ile': 'ie',
    'ina': 'ia',
    'ind': 'id',
    'ipk': 'ik',
    'isl': 'is',
    'ita': 'it',
    'jav': 'jv',
    'jpn': 'ja',
    'kal': 'kl',
    'kan': 'kn',
    'kas': 'ks',
    'kat': 'ka',
    'kau': 'kr',
    'kaz': 'kk',
    'khm': 'km',
    'kik': 'ki',
    'kin': 'rw',
    'kir': 'ky',
    'kom': 'kv',
    'kon': 'kg',
    'kor': 'ko',
    'kua': 'kj',
    'kur': 'ku',
    'lao': 'lo',
    'lat': 'la',
    'lav': 'lv',
    'lim': 'li',
    'lin': 'ln',
    'lit': 'lt',
    'ltz': 'lb',
    'lub': 'lu',
    'lug': 'lg',
    'mah': 'mh',
    'mal': 'ml',
    'mar': 'mr',
    'mkd': 'mk',
    'mlg': 'mg',
    'mlt': 'mt',
    'mol': 'mo',
    'mon': 'mn',
    'mri': 'mi',
    'msa': 'ms',
    'mya': 'my',
    'nau': 'na',
    'nav': 'nv',
    'nbl': 'nr',
    'nde': 'nd',
    'ndo': 'ng',
    'nep': 'ne',
    'nld': 'nl',
    'nno': 'nn',
    'nob': 'nb',
    'nor': 'no',
    'nya': 'ny',
    'oci': 'oc',
    'oji': 'oj',
    'ori': 'or',
    'orm': 'om',
    'oss': 'os',
    'pan': 'pa',
    'pli': 'pi',
    'pol': 'pl',
    'por': 'pt',
    'pus': 'ps',
    'que': 'qu',
    'roh': 'rm',
    'ron': 'ro',
    'run': 'rn',
    'rus': 'ru',
    'sag': 'sg',
    'san': 'sa',
    'sin': 'si',
    'slk': 'sk',
    'slv': 'sl',
    'sme': 'se',
    'smo': 'sm',
    'sna': 'sn',
    'snd': 'sd',
    'som': 'so',
    'sot': 'st',
    'spa': 'es',
    'sqi': 'sq',
    'srd': 'sc',
    'srp': 'sr',
    'ssw': 'ss',
    'sun': 'su',
    'swa': 'sw',
    'swe': 'sv',
    'tah': 'ty',
    'tam': 'ta',
    'tat': 'tt',
    'tel': 'te',
    'tgk': 'tg',
    'tgl': 'tl',
    'tha': 'th',
    'tir': 'ti',
    'ton': 'to',
    'tsn': 'tn',
    'tso': 'ts',
    'tuk': 'tk',
    'tur': 'tr',
    'twi': 'tw',
    'uig': 'ug',
    'ukr': 'uk',
    'urd': 'ur',
    'uzb': 'uz',
    'ven': 've',
    'vie': 'vi',
    'vol': 'vo',
    'wln': 'wa',
    'wol': 'wo',
    'xho': 'xh',
    'yid': 'yi',
    'yor': 'yo',
    'zha': 'za',
    'zho': 'zh',
    'zul': 'zu',
}
240 
class LanguageTag (object):
    """A BCP 47 language tag.

    Attributes:
        subtags (List[str]): The list of subtags in this tag.
        grandfathered (bool): Whether this tag is grandfathered. If
            ``True``, the entire lowercased tag is the ``language``
            and the other subtag fields are empty.
        language (str): The language subtag.
        script (str): The script subtag, or ``None`` if a
            non-grandfathered tag has none.
        region (str): The region subtag, or ``None`` if a
            non-grandfathered tag has none.
        variant (str): The variant subtag, or ``None`` if a
            non-grandfathered tag has none.

    Args:
        tag (str): A BCP 47 language tag.

    """
    def __init__ (self, tag):
        global bcp_47
        self.subtags = tag.lower ().split ('-')
        self.grandfathered = tag.lower () in bcp_47.grandfathered
        if self.grandfathered:
            self.language = tag.lower ()
            self.script = ''
            self.region = ''
            self.variant = ''
        else:
            self.language = self.subtags[0]
            # Subtags are classified by length and by whether they start
            # with a digit (``s[0] <= '9'``) or a letter (``s[0] > '9'``).
            self.script = self._find_first (lambda s: len (s) == 4 and s[0] > '9', self.subtags)
            self.region = self._find_first (lambda s: len (s) == 2 and s[0] > '9' or len (s) == 3 and s[0] <= '9', self.subtags[1:])
            self.variant = self._find_first (lambda s: len (s) > 4 or len (s) == 4 and s[0] <= '9', self.subtags)

    def __str__(self):
        return '-'.join(self.subtags)

    def __repr__ (self):
        return 'LanguageTag(%r)' % str(self)

    @staticmethod
    def _find_first (function, sequence):
        """Return the first item of ``sequence`` for which ``function``
        is truthy, or ``None`` if there is no such item.
        """
        try:
            return next (iter (filter (function, sequence)))
        except StopIteration:
            return None

    def is_complex (self):
        """Return whether this tag is too complex to represent as a
        ``LangTag`` in the generated code.

        Complex tags need to be handled in
        ``hb_ot_tags_from_complex_language``.

        Returns:
            Whether this tag is complex.
        """
        return not (len (self.subtags) == 1
            or self.grandfathered
            and len (self.subtags[1]) != 3
            and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])

    def get_group (self):
        """Return the group into which this tag should be categorized in
        ``hb_ot_tags_from_complex_language``.

        The group is the first letter of the tag, or ``'und'`` if this tag
        should not be matched in a ``switch`` statement in the generated
        code.

        Returns:
            This tag's group.
        """
        return ('und'
                if (self.language == 'und'
                    or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
                else self.language[0])
316 
class OpenTypeRegistryParser (HTMLParser):
    """A parser for the OpenType language system tag registry.

    Attributes:
        header (str): The "last updated" line of the registry.
        names (Mapping[str, str]): A map of language system tags to the
            names they are given in the registry.
        ranks (DefaultDict[str, int]): A map of language system tags to
            numbers. If a single BCP 47 tag corresponds to multiple
            OpenType tags, the tags are ordered in increasing order by
            rank. The rank is based on the number of BCP 47 tags
            associated with a tag, though it may be manually modified.
        to_bcp_47 (DefaultDict[str, AbstractSet[str]]): A map of
            OpenType language system tags to sets of BCP 47 tags.
        from_bcp_47 (DefaultDict[str, AbstractSet[str]]): ``to_bcp_47``
            inverted. Its values start as unsorted sets;
            ``sort_languages`` converts them to sorted lists.

    """
    def __init__ (self):
        HTMLParser.__init__ (self)
        self.header = ''
        self.names = {}
        self.ranks = collections.defaultdict (int)
        self.to_bcp_47 = collections.defaultdict (set)
        self.from_bcp_47 = collections.defaultdict (set)
        # Whether the parser is in a <td> element
        self._td = False
        # The text of the <td> elements of the current <tr> element.
        self._current_tr = []

    def handle_starttag (self, tag, attrs):
        if tag == 'meta':
            # The <meta name="updated_at"> tag is kept verbatim as the header.
            for attr, value in attrs:
                if attr == 'name' and value == 'updated_at':
                    self.header = self.get_starttag_text ()
                    break
        elif tag == 'td':
            self._td = True
            self._current_tr.append ('')
        elif tag == 'tr':
            self._current_tr = []

    def handle_endtag (self, tag):
        if tag == 'td':
            self._td = False
        elif tag == 'tr' and self._current_tr:
            # A data row: name, OpenType tag and (optionally) ISO codes.
            expect (2 <= len (self._current_tr) <= 3)
            name = self._current_tr[0].strip ()
            tag = self._current_tr[1].strip ("\t\n\v\f\r '")
            rank = 0
            if len (tag) > 4:
                expect (tag.endswith (' (deprecated)'), 'ill-formed OpenType tag: %s' % tag)
                name += ' (deprecated)'
                tag = tag.split (' ')[0]
                rank = 1
            self.names[tag] = re.sub (' languages$', '', name)
            # Two-cell rows are accepted above, so the ISO-code cell may be
            # missing as well as empty; either way there is nothing to map.
            if len (self._current_tr) < 3 or not self._current_tr[2]:
                return
            iso_codes = self._current_tr[2].strip ()
            self.to_bcp_47[tag].update (ISO_639_3_TO_1.get (code, code) for code in iso_codes.replace (' ', '').split (','))
            rank += 2 * len (self.to_bcp_47[tag])
            self.ranks[tag] = rank

    def handle_data (self, data):
        # Only text inside a <td> contributes to the current row.
        if self._td:
            self._current_tr[-1] += data

    def handle_charref (self, name):
        self.handle_data (html_unescape (self, '&#%s;' % name))

    def handle_entityref (self, name):
        self.handle_data (html_unescape (self, '&%s;' % name))

    def parse (self, filename):
        """Parse the OpenType language system tag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            self.feed (f.read ())
        expect (self.header)
        # Build the inverse mapping once the whole registry has been read.
        for tag, iso_codes in self.to_bcp_47.items ():
            for iso_code in iso_codes:
                self.from_bcp_47[iso_code].add (tag)

    def add_language (self, bcp_47_tag, ot_tag):
        """Add a language as if it were in the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag. If the tag is more than just
                a language subtag, and if the language subtag is a
                macrolanguage, then new languages are added corresponding
                to the macrolanguages' individual languages with the
                remainder of the tag appended.
            ot_tag (str): An OpenType language system tag.
        """
        global bcp_47
        self.to_bcp_47[ot_tag].add (bcp_47_tag)
        self.from_bcp_47[bcp_47_tag].add (ot_tag)
        if bcp_47_tag.lower () not in bcp_47.grandfathered:
            try:
                # Raises ValueError when the tag has no '-' (a bare language).
                [macrolanguage, suffix] = bcp_47_tag.split ('-', 1)
                if macrolanguage in bcp_47.macrolanguages:
                    s = set ()
                    for language in bcp_47.macrolanguages[macrolanguage]:
                        if language.lower () not in bcp_47.grandfathered:
                            s.add ('%s-%s' % (language, suffix))
                    bcp_47.macrolanguages['%s-%s' % (macrolanguage, suffix)] = s
            except ValueError:
                pass

    @staticmethod
    def _remove_language (tag_1, dict_1, dict_2):
        """Remove ``tag_1`` from ``dict_1`` and from every value set of
        ``dict_2`` that it mapped to, deleting sets that become empty.
        """
        for tag_2 in dict_1.pop (tag_1):
            dict_2[tag_2].remove (tag_1)
            if not dict_2[tag_2]:
                del dict_2[tag_2]

    def remove_language_ot (self, ot_tag):
        """Remove an OpenType tag from the registry.

        Args:
            ot_tag (str): An OpenType tag.
        """
        self._remove_language (ot_tag, self.to_bcp_47, self.from_bcp_47)

    def remove_language_bcp_47 (self, bcp_47_tag):
        """Remove a BCP 47 tag from the registry.

        Args:
            bcp_47_tag (str): A BCP 47 tag.
        """
        self._remove_language (bcp_47_tag, self.from_bcp_47, self.to_bcp_47)

    def inherit_from_macrolanguages (self):
        """Copy mappings from macrolanguages to individual languages.

        If a BCP 47 tag for an individual mapping has no OpenType
        mapping but its macrolanguage does, the mapping is copied to
        the individual language. For example, als (Tosk Albanian) has no
        explicit mapping, so it inherits from sq (Albanian) the mapping
        to SQI.

        If a BCP 47 tag for a macrolanguage has no OpenType mapping but
        all of its individual languages do and they all map to the same
        tags, the mapping is copied to the macrolanguage.
        """
        global bcp_47
        # Work from snapshots: add_language mutates both mappings.
        original_ot_from_bcp_47 = dict (self.from_bcp_47)
        for macrolanguage, languages in dict (bcp_47.macrolanguages).items ():
            ot_macrolanguages = set (original_ot_from_bcp_47.get (macrolanguage, set ()))
            if ot_macrolanguages:
                for ot_macrolanguage in ot_macrolanguages:
                    for language in languages:
                        self.add_language (language, ot_macrolanguage)
                        self.ranks[ot_macrolanguage] += 1
            else:
                # Intersect the OpenType tags of all individual languages.
                for language in languages:
                    if language in original_ot_from_bcp_47:
                        if ot_macrolanguages:
                            ml = original_ot_from_bcp_47[language]
                            if ml:
                                ot_macrolanguages &= ml
                            else:
                                pass
                        else:
                            ot_macrolanguages |= original_ot_from_bcp_47[language]
                    else:
                        ot_macrolanguages.clear ()
                    if not ot_macrolanguages:
                        break
                for ot_macrolanguage in ot_macrolanguages:
                    self.add_language (macrolanguage, ot_macrolanguage)

    def sort_languages (self):
        """Sort the values of ``from_bcp_47`` in ascending rank order."""
        for language, tags in self.from_bcp_47.items ():
            self.from_bcp_47[language] = sorted (tags,
                    key=lambda t: (self.ranks[t] + rank_delta (language, t), t))

ot = OpenTypeRegistryParser ()
500 
class BCP47Parser (object):
    """A parser for the BCP 47 subtag registry.

    Attributes:
        header (str): The "File-Date" line of the registry.
        names (Mapping[str, str]): A map of subtags to the names they
            are given in the registry. Each value is a
            ``'\\n'``-separated list of names.
        scopes (Mapping[str, str]): A map of language subtags to strings
            suffixed to language names, including suffixes to explain
            language scopes.
        macrolanguages (DefaultDict[str, AbstractSet[str]]): A map of
            language subtags to the sets of language subtags which
            inherit from them. See
            ``OpenTypeRegistryParser.inherit_from_macrolanguages``.
        prefixes (DefaultDict[str, AbstractSet[str]]): A map of variant
            subtags to their prefixes.
        grandfathered (AbstractSet[str]): The set of grandfathered tags,
            normalized to lowercase.

    """
    def __init__ (self):
        self.header = ''
        self.names = {}
        self.scopes = {}
        self.macrolanguages = collections.defaultdict (set)
        self.prefixes = collections.defaultdict (set)
        self.grandfathered = set ()

    def parse (self, filename):
        """Parse the BCP 47 subtag registry.

        Args:
            filename (str): The file name of the registry.
        """
        with open (filename, encoding='utf-8') as f:
            subtag_type = None
            subtag = None
            deprecated = False
            has_preferred_value = False
            line_buffer = ''
            # The extra '' makes the loop run once more so that the last
            # buffered field is processed.
            for line in itertools.chain (f, ['']):
                line = line.rstrip ()
                # A line starting with a space continues the previous field.
                if line.startswith (' '):
                    line_buffer += line[1:]
                    continue
                # Process the completed field; buffer the one just read.
                line, line_buffer = line_buffer, line
                if line.startswith ('Type: '):
                    subtag_type = line.split (' ')[1]
                    deprecated = False
                    has_preferred_value = False
                elif line.startswith ('Subtag: ') or line.startswith ('Tag: '):
                    subtag = line.split (' ')[1]
                    if subtag_type == 'grandfathered':
                        self.grandfathered.add (subtag.lower ())
                elif line.startswith ('Description: '):
                    description = line.split (' ', 1)[1].replace (' (individual language)', '')
                    description = re.sub (r' (\(family\)|\((individual |macro)language\)|languages)$', '',
                            description)
                    if subtag in self.names:
                        self.names[subtag] += '\n' + description
                    else:
                        self.names[subtag] = description
                elif subtag_type == 'language' or subtag_type == 'grandfathered':
                    if line.startswith ('Scope: '):
                        scope = line.split (' ')[1]
                        if scope == 'macrolanguage':
                            scope = ' [macrolanguage]'
                        elif scope == 'collection':
                            scope = ' [family]'
                        else:
                            continue
                        self.scopes[subtag] = scope
                    elif line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                        deprecated = True
                    elif deprecated and line.startswith ('Comments: see '):
                        # If a subtag is split into multiple replacement subtags,
                        # it essentially represents a macrolanguage.
                        for language in line.replace (',', '').split (' ')[2:]:
                            self._add_macrolanguage (subtag, language)
                    elif line.startswith ('Preferred-Value: '):
                        # If a subtag is deprecated in favor of a single replacement subtag,
                        # it is either a dialect or synonym of the preferred subtag. Either
                        # way, it is close enough to the truth to consider the replacement
                        # the macrolanguage of the deprecated language.
                        has_preferred_value = True
                        macrolanguage = line.split (' ')[1]
                        self._add_macrolanguage (macrolanguage, subtag)
                    elif not has_preferred_value and line.startswith ('Macrolanguage: '):
                        self._add_macrolanguage (line.split (' ')[1], subtag)
                elif subtag_type == 'variant':
                    if line.startswith ('Deprecated: '):
                        self.scopes[subtag] = ' (retired code)' + self.scopes.get (subtag, '')
                    elif line.startswith ('Prefix: '):
                        self.prefixes[subtag].add (line.split (' ')[1])
                elif line.startswith ('File-Date: '):
                    self.header = line
        expect (self.header)

    def _add_macrolanguage (self, macrolanguage, language):
        """Record ``language`` as inheriting from ``macrolanguage``.

        Consults the global ``ot`` registry: languages and macrolanguages
        without an OpenType mapping are folded into existing groups
        instead of starting new ones.
        """
        global ot
        if language not in ot.from_bcp_47:
            # Re-parent the sublanguages of an unmapped language.
            for l in self.macrolanguages.get (language, set ()):
                self._add_macrolanguage (macrolanguage, l)
        if macrolanguage not in ot.from_bcp_47:
            # An unmapped macrolanguage that is already a member of another
            # group: add the language to that group instead.
            for ls in list (self.macrolanguages.values ()):
                if macrolanguage in ls:
                    ls.add (language)
                    return
        self.macrolanguages[macrolanguage].add (language)

    def remove_extra_macrolanguages (self):
        """Make every language have at most one macrolanguage."""
        inverted = collections.defaultdict (list)
        for macrolanguage, languages in self.macrolanguages.items ():
            for language in languages:
                inverted[language].append (macrolanguage)
        for language, macrolanguages in inverted.items ():
            if len (macrolanguages) > 1:
                # The macrolanguage with the most members absorbs the others.
                macrolanguages.sort (key=lambda ml: len (self.macrolanguages[ml]))
                biggest_macrolanguage = macrolanguages.pop ()
                for macrolanguage in macrolanguages:
                    self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

    def _get_name_piece (self, subtag):
        """Return the first name of a subtag plus its scope suffix.

        Args:
            subtag (str): A BCP 47 subtag.

        Returns:
            The name form of ``subtag``.
        """
        return self.names[subtag].split ('\n')[0] + self.scopes.get (subtag, '')

    def get_name (self, lt):
        """Return the names of the subtags in a language tag.

        Args:
            lt (LanguageTag): A BCP 47 language tag.

        Returns:
            The name form of ``lt``.
        """
        name = self._get_name_piece (lt.language)
        # Script and region are looked up in title case and upper case,
        # respectively.
        if lt.script:
            name += '; ' + self._get_name_piece (lt.script.title ())
        if lt.region:
            name += '; ' + self._get_name_piece (lt.region.upper ())
        if lt.variant:
            name += '; ' + self._get_name_piece (lt.variant)
        return name
654 
bcp_47 = BCP47Parser ()

ot.parse (sys.argv[1])
bcp_47.parse (sys.argv[2])

# Manual adjustments applied on top of the two parsed registries.

ot.add_language ('ary', 'MOR')

ot.add_language ('ath', 'ATH')

ot.add_language ('bai', 'BML')

ot.ranks['BAL'] = ot.ranks['KAR'] + 1

ot.add_language ('ber', 'BBR')

ot.remove_language_ot ('PGR')
ot.add_language ('el-polyton', 'PGR')

bcp_47.macrolanguages['et'] = {'ekk'}

bcp_47.names['flm'] = 'Falam Chin'
bcp_47.scopes['flm'] = ' (retired code)'
bcp_47.macrolanguages['flm'] = {'cfm'}

ot.ranks['FNE'] = ot.ranks['TNE'] + 1

ot.add_language ('und-fonipa', 'IPPH')

ot.add_language ('und-fonnapa', 'APPH')

ot.remove_language_ot ('IRT')
ot.add_language ('ga-Latg', 'IRT')

ot.add_language ('hy-arevmda', 'HYE')

ot.remove_language_ot ('KGE')
ot.add_language ('und-Geok', 'KGE')

bcp_47.macrolanguages['id'] = {'in'}

bcp_47.macrolanguages['ijo'] = {'ijc'}

ot.add_language ('kht', 'KHN')
ot.names['KHN'] = ot.names['KHT'] + ' (Microsoft fonts)'
ot.ranks['KHN'] = ot.ranks['KHT'] + 1

ot.ranks['LCR'] = ot.ranks['MCR'] + 1

ot.names['MAL'] = 'Malayalam Traditional'
ot.ranks['MLR'] += 1

bcp_47.names['mhv'] = 'Arakanese'
bcp_47.scopes['mhv'] = ' (retired code)'

ot.add_language ('no', 'NOR')

ot.add_language ('oc-provenc', 'PRO')

ot.add_language ('qu', 'QUZ')
ot.add_language ('qub', 'QWH')
ot.add_language ('qud', 'QVI')
ot.add_language ('qug', 'QVI')
ot.add_language ('qul', 'QUH')
ot.add_language ('qup', 'QVI')
ot.add_language ('qur', 'QWH')
ot.add_language ('qus', 'QUH')
ot.add_language ('quw', 'QVI')
ot.add_language ('qux', 'QWH')
ot.add_language ('qva', 'QWH')
ot.add_language ('qvh', 'QWH')
ot.add_language ('qvj', 'QVI')
ot.add_language ('qvl', 'QWH')
ot.add_language ('qvm', 'QWH')
ot.add_language ('qvn', 'QWH')
ot.add_language ('qvo', 'QVI')
ot.add_language ('qvp', 'QWH')
ot.add_language ('qvw', 'QWH')
ot.add_language ('qvz', 'QVI')
ot.add_language ('qwa', 'QWH')
ot.add_language ('qws', 'QWH')
ot.add_language ('qxa', 'QWH')
ot.add_language ('qxc', 'QWH')
ot.add_language ('qxh', 'QWH')
ot.add_language ('qxl', 'QVI')
ot.add_language ('qxn', 'QWH')
ot.add_language ('qxo', 'QWH')
ot.add_language ('qxr', 'QVI')
ot.add_language ('qxt', 'QWH')
ot.add_language ('qxw', 'QWH')

bcp_47.macrolanguages['ro'].remove ('mo')
bcp_47.macrolanguages['ro-MD'].add ('mo')

ot.remove_language_ot ('SYRE')
ot.remove_language_ot ('SYRJ')
ot.remove_language_ot ('SYRN')
ot.add_language ('und-Syre', 'SYRE')
ot.add_language ('und-Syrj', 'SYRJ')
ot.add_language ('und-Syrn', 'SYRN')

bcp_47.names['xst'] = "Silt'e"
bcp_47.scopes['xst'] = ' (retired code)'
bcp_47.macrolanguages['xst'] = {'stv', 'wle'}

ot.add_language ('xwo', 'TOD')

ot.remove_language_ot ('ZHH')
ot.remove_language_ot ('ZHP')
ot.remove_language_ot ('ZHT')
ot.remove_language_ot ('ZHTM')
bcp_47.macrolanguages['zh'].remove ('lzh')
bcp_47.macrolanguages['zh'].remove ('yue')
ot.add_language ('zh-Hant-MO', 'ZHH')
ot.add_language ('zh-Hant-MO', 'ZHTM')
ot.add_language ('zh-Hant-HK', 'ZHH')
ot.add_language ('zh-Hans', 'ZHS')
ot.add_language ('zh-Hant', 'ZHT')
ot.add_language ('zh-HK', 'ZHH')
ot.add_language ('zh-MO', 'ZHH')
ot.add_language ('zh-MO', 'ZHTM')
ot.add_language ('zh-TW', 'ZHT')
ot.add_language ('lzh', 'ZHT')
ot.add_language ('lzh-Hans', 'ZHS')
ot.add_language ('yue', 'ZHH')
ot.add_language ('yue-Hans', 'ZHS')

bcp_47.macrolanguages['zom'] = {'yos'}
782 
def rank_delta (bcp_47, ot):
    """Return a delta to apply to a BCP 47 tag's rank.

    Most OpenType tags have a constant rank, but a few have ranks that
    depend on the BCP 47 tag.

    Args:
        bcp_47 (str): A BCP 47 tag.
        ot (str): An OpenType tag.

    Returns:
        A number to add to ``ot``'s rank when sorting ``bcp_47``'s
        OpenType equivalents.
    """
    if bcp_47 == 'ak' and ot == 'AKA':
        return -1
    if bcp_47 == 'tw' and ot == 'TWI':
        return -1
    return 0
802 
# OpenType tag -> BCP 47 tag chosen for it.
# NOTE(review): judging by the module docstring, these are the picks for
# ambiguous OpenType tags; the code that reads this table is outside this
# section -- confirm.
disambiguation = {
    'ALT': 'alt',
    'ARK': 'rki',
    'ATH': 'ath',
    'BHI': 'bhb',
    'BLN': 'bjt',
    'BTI': 'beb',
    'CCHN': 'cco',
    'CMR': 'swb',
    'CPP': 'crp',
    'CRR': 'crx',
    'DUJ': 'dwu',
    'ECR': 'crj',
    'HAL': 'cfm',
    'HND': 'hnd',
    'HYE': 'hyw',
    'KIS': 'kqs',
    'KUI': 'uki',
    'LRC': 'bqi',
    'NDB': 'nd',
    'NIS': 'njz',
    'PLG': 'pce',
    'PRO': 'pro',
    'QIN': 'bgr',
    'QUH': 'quh',
    'QVI': 'qvi',
    'QWH': 'qwh',
    'SIG': 'stv',
    'SRB': 'sr',
    'ZHH': 'zh-HK',
    'ZHS': 'zh-Hans',
    'ZHT': 'zh-Hant',
    'ZHTM': 'zh-MO',
}
837 
# Propagate mappings, collapse duplicate macrolanguages, then propagate
# again over the collapsed groups.
ot.inherit_from_macrolanguages ()
bcp_47.remove_extra_macrolanguages ()
ot.inherit_from_macrolanguages ()
# The default language system is given a rank above every other tag.
ot.names[DEFAULT_LANGUAGE_SYSTEM] = '*/'
ot.ranks[DEFAULT_LANGUAGE_SYSTEM] = max (ot.ranks.values ()) + 1
# A three-letter OpenType tag whose lowercase form is a named BCP 47
# subtag without any OpenType mapping is mapped to the default language
# system.
for tricky_ot_tag in filter (lambda tag: re.match ('[A-Z]{3}$', tag), ot.names):
    possible_bcp_47_tag = tricky_ot_tag.lower ()
    if possible_bcp_47_tag in bcp_47.names and not ot.from_bcp_47[possible_bcp_47_tag]:
        ot.add_language (possible_bcp_47_tag, DEFAULT_LANGUAGE_SYSTEM)
        bcp_47.macrolanguages[possible_bcp_47_tag] = set ()
ot.sort_languages ()
849 
# Emit the banner comment, the include guard and the opening of the
# ``ot_languages`` array.
print ('/* == Start of generated table == */')
print ('/*')
print (' * The following table is generated by running:')
print (' *')
print (' * %s languagetags language-subtag-registry' % sys.argv[0])
print (' *')
print (' * on files with these headers:')
print (' *')
print (' * %s' % ot.header.strip ())
print (' * %s' % bcp_47.header)
print (' */')
print ()
print ('#ifndef HB_OT_TAG_TABLE_HH')
print ('#define HB_OT_TAG_TABLE_HH')
print ()
print ('static const LangTag ot_languages[] = {')
866 
def hb_tag (tag):
    """Convert a tag to ``HB_TAG`` form.

    Args:
        tag (str): An OpenType tag.

    Returns:
        A snippet of C++ representing ``tag``.
    """
    if tag == DEFAULT_LANGUAGE_SYSTEM:
        # NOTE(review): the padding after the tab may have been collapsed
        # in this copy of the file; confirm against the upstream script.
        return 'HB_TAG_NONE\t '
    # Pad to four characters with spaces and truncate to four.
    return "HB_TAG('%s','%s','%s','%s')" % tuple (('%-4s' % tag)[:4])
879 
def get_variant_set (name):
    """Return a set of variant language names from a name.

    Names are split on newlines, parentheses and commas. Each piece has
    U+2019 replaced by an apostrophe, is decomposed (NFD), reduced to
    ASCII by dropping everything else, and stripped of surrounding
    whitespace.

    Args:
        name (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        A set of normalized language names, as ``bytes`` objects.
    """
    return {unicodedata.normalize ('NFD', n.replace ('\u2019', "'"))
            .encode ('ASCII', 'ignore')
            .strip ()
            for n in re.split ('[\n(),]', name) if n}
894 
def language_name_intersection (a, b):
    """Return the names in common between two language names.

    Args:
        a (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.
        b (str): A list of language names from the BCP 47 registry,
            joined on ``'\\n'``.

    Returns:
        The normalized language names shared by ``a`` and ``b``.
    """
    return get_variant_set (a).intersection (get_variant_set (b))
908 
def get_matching_language_name (intersection, candidates):
    """Return the first candidate sharing a normalized name with
    ``intersection``.

    Args:
        intersection: A set of normalized names, as returned by
            ``language_name_intersection``.
        candidates: An iterable of language names.

    Returns:
        The first element of ``candidates`` whose variant set is not
        disjoint from ``intersection``.

    Raises:
        StopIteration: If no candidate matches.
    """
    return next (c for c in candidates if not intersection.isdisjoint (get_variant_set (c)))
911 
def same_tag (bcp_47_tag, ot_tags):
    """Return whether a three-letter BCP 47 tag maps only to the OpenType
    tag that is its own uppercase spelling.

    Args:
        bcp_47_tag (str): A BCP 47 tag.
        ot_tags (Sequence[str]): The OpenType tags it maps to.

    Returns:
        ``True`` if ``bcp_47_tag`` has three characters and ``ot_tags``
        has exactly one element that equals it when lowercased.
    """
    return len (bcp_47_tag) == 3 and len (ot_tags) == 1 and bcp_47_tag == ot_tags[0].lower ()
914 
# Emit one ``LangTag`` row per (BCP 47 language, OpenType tag) pair.
# The default language system and multi-subtag tags are skipped here.
for language, tags in sorted (ot.from_bcp_47.items ()):
    if language == '' or '-' in language:
        continue
    # Rows for which ``same_tag`` holds are emitted commented out.
    commented_out = same_tag (language, tags)
    for tag in tags:
        print ('%s{\"%s\",\t%s},' % ('/*' if commented_out else ' ', language, hb_tag (tag)), end='')
        if commented_out:
            print ('*/', end='')
        print ('\t/* ', end='')
        bcp_47_name = bcp_47.names.get (language, '')
        bcp_47_name_candidates = bcp_47_name.split ('\n')
        ot_name = ot.names[tag]
        scope = bcp_47.scopes.get (language, '')
        if tag == DEFAULT_LANGUAGE_SYSTEM:
            write (f'{bcp_47_name_candidates[0]}{scope} != {ot.names[language.upper ()]}')
        else:
            intersection = language_name_intersection (bcp_47_name, ot_name)
            if not intersection:
                write ('%s%s -> %s' % (bcp_47_name_candidates[0], scope, ot_name))
            else:
                # Remember the matching name; print the longer of the two.
                name = get_matching_language_name (intersection, bcp_47_name_candidates)
                bcp_47.names[language] = name
                write ('%s%s' % (name if len (name) > len (ot_name) else ot_name, scope))
        print (' */')

print ('};')
print ()
942 
# Emit the documentation comment and signature of the generated
# ``hb_ot_tags_from_complex_language`` function.
print ('/**')
print (' * hb_ot_tags_from_complex_language:')
print (' * @lang_str: a BCP 47 language tag to convert.')
print (' * @limit: a pointer to the end of the substring of @lang_str to consider for')
print (' * conversion.')
print (' * @count: maximum number of language tags to retrieve (IN) and actual number of')
print (' * language tags retrieved (OUT). If no tags are retrieved, it is not modified.')
print (' * @tags: array of size at least @language_count to store the language tag')
print (' * results')
print (' *')
print (' * Converts a multi-subtag BCP 47 language tag to language tags.')
print (' *')
print (' * Return value: Whether any language systems were retrieved.')
print (' **/')
print ('static bool')
print ('hb_ot_tags_from_complex_language (const char *lang_str,')
print ('\t\t\t\t const char *limit,')
print ('\t\t\t\t unsigned int *count /* IN/OUT */,')
print ('\t\t\t\t hb_tag_t *tags /* OUT */)')
print ('{')
963 
def print_subtag_matches (subtag, new_line):
    """Print a ``subtag_matches`` call for ``subtag``, if it is non-empty.

    Nothing is printed for a falsy ``subtag``. No trailing newline is
    written.

    Args:
        subtag (str): A BCP 47 subtag, or a falsy value for none.
        new_line (bool): Whether to first print a line break followed by
            a tab and ``&& ``.
    """
    if subtag:
        if new_line:
            print ()
            print ('\t&& ', end='')
        print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
970 
# Group the complex tags by ``LanguageTag.get_group``. Tags are visited
# longest first, then alphabetically; because ``groupby`` only merges
# adjacent items, the runs are accumulated per group key.
complex_tags = collections.defaultdict (list)
for initial, group in itertools.groupby ((lt_tags for lt_tags in [
            (LanguageTag (language), tags)
            for language, tags in sorted (ot.from_bcp_47.items (),
                key=lambda i: (-len (i[0]), i[0]))
        ] if lt_tags[0].is_complex ()),
        key=lambda lt_tags: lt_tags[0].get_group ()):
    complex_tags[initial] += group
979 
# Entries grouped under 'und' are emitted as stand-alone `if` statements,
# ahead of the `switch` that handles every other group.
for lt, tags in complex_tags.get ('und', ()):
    if lt.variant in bcp_47.prefixes:
        expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
                '%s is not a valid prefix of %s' % (lt.language, lt.variant))
    print (' if (', end='')
    # The first condition printed stands alone; every further one must be
    # joined with `&&`, otherwise two calls would be emitted back to back
    # and the generated C would not compile.
    condition_started = False
    for subtag in (lt.script, lt.region, lt.variant):
        if subtag:
            print_subtag_matches (subtag, condition_started)
            condition_started = True
    print (')')
    print (' {')
    write (' /* %s */' % bcp_47.get_name (lt))
    print ()
    if len (tags) == 1:
        write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
        print ()
        print (' *count = 1;')
    else:
        # Declare the loop counter used by the emitted `for`, exactly as the
        # equivalent branch of the switch emission does.
        print (' unsigned int i;')
        print (' hb_tag_t possible_tags[] = {')
        for tag in tags:
            write (' %s, /* %s */' % (hb_tag (tag), ot.names[tag]))
            print ()
        print (' };')
        print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
        print (' tags[i] = possible_tags[i];')
        print (' *count = i;')
    print (' return true;')
    print (' }')
1010 
# Emit a C `switch` on the first character of the tag, with one `case` per
# group key other than 'und'. Inside each case, every tag of the group gets
# an `if` that fills in its OpenType tag(s) and returns true.
print (' switch (lang_str[0])', ' {', sep='\n')
for initial in sorted (complex_tags):
    if initial == 'und':
        continue
    print (" case '%s':" % initial)
    for lt, tags in complex_tags[initial]:
        print (' if (', end='')
        # Subtags folded into the leading string comparison are cleared so
        # that they are not tested a second time below.
        pending_script = lt.script
        pending_region = lt.region
        if lt.grandfathered:
            # The emitted comparison starts at lang_str[1], so the first
            # character of the language is left out of the literal.
            comparison = '0 == strcmp (&lang_str[1], "%s")' % lt.language[1:]
        else:
            literal = lt.language[1:] + '-'
            if pending_script:
                literal += pending_script
                pending_script = None
                if pending_region:
                    literal += '-' + pending_region
                    pending_region = None
            if literal.endswith ('-'):
                # Nothing was folded in: compare only the language prefix.
                comparison = '0 == strncmp (&lang_str[1], "%s", %i)' % (literal, len (literal))
            else:
                comparison = 'lang_matches (&lang_str[1], "%s")' % literal
        print (comparison, end='')
        for subtag in (pending_script, pending_region, lt.variant):
            print_subtag_matches (subtag, True)
        print (')', ' {', sep='\n')
        write (' /* %s */' % bcp_47.get_name (lt))
        print ()
        if len (tags) != 1:
            print (' unsigned int i;')
            print (' hb_tag_t possible_tags[] = {')
            for tag in tags:
                write ('\t%s, /* %s */' % (hb_tag (tag), ot.names[tag]))
                print ()
            print (' };')
            print (' for (i = 0; i < %s && i < *count; i++)' % len (tags))
            print ('\ttags[i] = possible_tags[i];')
            print (' *count = i;')
        else:
            write (' tags[0] = %s; /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
            print ()
            print (' *count = 1;')
        print (' return true;', ' }', sep='\n')
    print (' break;')
1059 
# Emit the closing lines of the switch and of the function body started
# earlier in the output, then the documentation comment, signature and
# opening `switch` of hb_ot_ambiguous_tag_to_language.
for line in (
    ' }',
    ' return false;',
    '}',
    '',
    '/**',
    ' * hb_ot_ambiguous_tag_to_language',
    ' * @tag: A language tag.',
    ' *',
    ' * Converts @tag to a BCP 47 language tag if it is ambiguous (it corresponds to',
    ' * many language tags) and the best tag is not the alphabetically first, or if',
    ' * the best tag consists of multiple subtags, or if the best tag does not appear',
    ' * in #ot_languages.',
    ' *',
    ' * Return value: The #hb_language_t corresponding to the BCP 47 language tag,',
    ' * or #HB_LANGUAGE_INVALID if @tag is not ambiguous.',
    ' **/',
    'static hb_language_t',
    'hb_ot_ambiguous_tag_to_language (hb_tag_t tag)',
    '{',
    ' switch (tag)',
    ' {',
):
    print (line)
1081 
def verify_disambiguation_dict ():
    """Verify and normalize ``disambiguation``.

    ``disambiguation`` is a map of ambiguous OpenType language system
    tags to the particular BCP 47 tags they correspond to. This function
    checks that all its keys really are ambiguous and that each key's
    value is valid for that key. It checks that no ambiguous tag is
    missing, except when it can figure out which BCP 47 tag is the best
    by itself.

    It modifies ``disambiguation`` to remove keys whose values are the
    same as those that the fallback would return anyway, and to add
    ambiguous keys whose disambiguations it determined automatically.

    The module-level ``ot``, ``bcp_47`` and ``disambiguation`` objects
    are used directly. ``disambiguation`` is changed in place and never
    rebound, so no ``global`` declaration is required.

    Raises:
        AssertionError: Verification failed.
    """
    for ot_tag, bcp_47_tags in ot.to_bcp_47.items ():
        if ot_tag == DEFAULT_LANGUAGE_SYSTEM:
            primary_tags = []
        else:
            # BCP 47 tags whose first OpenType tag is this one.
            primary_tags = [t for t in bcp_47_tags
                            if t not in bcp_47.grandfathered
                            and ot.from_bcp_47.get (t)[0] == ot_tag]
        if len (primary_tags) == 1:
            expect (ot_tag not in disambiguation, 'unnecessary disambiguation for OT tag: %s' % ot_tag)
            if '-' in primary_tags[0]:
                disambiguation[ot_tag] = primary_tags[0]
            else:
                # An entry is only recorded when the primary tag is not
                # already the alphabetically first candidate.
                first_tag = min (t for t in bcp_47_tags
                                 if t not in bcp_47.grandfathered
                                 and ot_tag in ot.from_bcp_47.get (t))
                if primary_tags[0] != first_tag:
                    disambiguation[ot_tag] = primary_tags[0]
        elif not primary_tags:
            expect (ot_tag not in disambiguation, 'There is no possible valid disambiguation for %s' % ot_tag)
        else:
            # Several candidates: try, in turn, the single macrolanguage,
            # the single family, then the single non-retired code.
            macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [macrolanguage]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if bcp_47.scopes.get (t) == ' [family]']
            if len (macrolanguages) != 1:
                macrolanguages = [t for t in primary_tags if 'retired code' not in bcp_47.scopes.get (t, '')]
            if len (macrolanguages) != 1:
                # No automatic choice is possible, so a manual entry must
                # exist and must name one of this tag's BCP 47 tags.
                expect (ot_tag in disambiguation, 'ambiguous OT tag: %s %s' % (ot_tag, str (macrolanguages)))
                expect (disambiguation[ot_tag] in bcp_47_tags,
                        '%s is not a valid disambiguation for %s' % (disambiguation[ot_tag], ot_tag))
            elif ot_tag not in disambiguation:
                disambiguation[ot_tag] = macrolanguages[0]
            different_bcp_47_tags = sorted (t for t in bcp_47_tags if not same_tag (t, ot.from_bcp_47.get (t)))
            if different_bcp_47_tags and disambiguation[ot_tag] == different_bcp_47_tags[0] and '-' not in disambiguation[ot_tag]:
                del disambiguation[ot_tag]
    for ot_tag in disambiguation:
        expect (ot_tag in ot.to_bcp_47, 'unknown OT tag: %s' % ot_tag)
1134 
# Check and normalize the disambiguation map, then emit one `case` per
# remaining entry, in sorted OpenType tag order. Each case returns the
# chosen BCP 47 tag.
verify_disambiguation_dict ()
for ot_tag in sorted (disambiguation):
    bcp_47_tag = disambiguation[ot_tag]
    write (' case %s: /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
    print ()
    write (' return hb_language_from_string (\"%s\", -1); /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
    print ()

# Emit the default case, close the switch and the function, and finish the
# generated header.
for line in (
    ' default:',
    ' return HB_LANGUAGE_INVALID;',
    ' }',
    '}',
    '',
    '#endif /* HB_OT_TAG_TABLE_HH */',
    '',
    '/* == End of generated table == */',
):
    print (line)
1151 
def _get_name_piece(self, subtag)
def _add_macrolanguage(self, macrolanguage, language)
def remove_extra_macrolanguages(self)
def parse(self, filename)
def _find_first(function, sequence)
def handle_starttag(self, tag, attrs)
def _remove_language(tag_1, dict_1, dict_2)
def add_language(self, bcp_47_tag, ot_tag)
def remove_language_bcp_47(self, bcp_47_tag)
#define join
Definition: ctangleboot.c:112
def verify_disambiguation_dict()
def print_subtag_matches(subtag, new_line)
def language_name_intersection(a, b)
def rank_delta(bcp_47, ot)
def get_variant_set(name)
def hb_tag(tag)
def expect(condition, message=None)
def html_unescape(parser, entity)
#define str(s)
Definition: sh6.c:399