w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

gen-indic-table.py
Go to the documentation of this file.
1 #!/usr/bin/env python3
2 
3 """usage: ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt
4 
5 Input files:
6 * https://unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt
7 * https://unicode.org/Public/UCD/latest/ucd/IndicPositionalCategory.txt
8 * https://unicode.org/Public/UCD/latest/ucd/Blocks.txt
9 """
10 
11 import sys
12 
# Exactly three input files must follow the script name; otherwise exit,
# passing the module docstring (the usage text) to sys.exit.
if len(sys.argv[1:]) != 3:
    sys.exit(__doc__)
15 
# Individual code points that always pass the block filter and are later
# moved out of the main table (NO-BREAK SPACE and DOTTED CIRCLE).
ALLOWED_SINGLES = [
    0x00A0,
    0x25CC,
]

# Block names (matched against the values parsed from the third input file)
# whose code points are kept in the generated table.
ALLOWED_BLOCKS = [
    "Basic Latin",
    "Latin-1 Supplement",
    "Devanagari",
    "Bengali",
    "Gurmukhi",
    "Gujarati",
    "Oriya",
    "Tamil",
    "Telugu",
    "Kannada",
    "Malayalam",
    "Sinhala",
    "Myanmar",
    "Khmer",
    "Vedic Extensions",
    "General Punctuation",
    "Superscripts and Subscripts",
    "Devanagari Extended",
    "Myanmar Extended-B",
    "Myanmar Extended-A",
]
39 
# Parse the three input files named on the command line.
#
#   headers[i]  first two lines of file i (echoed into the generated output)
#   data[i]     code point -> property value, for file i
#   values[i]   property value -> number of code points carrying it
#
# Each file is read inside a `with` block so its handle is closed as soon as
# the file has been consumed, even if parsing raises.
headers = []
data = []
values = []
for path in sys.argv[1:]:
    file_data = {}
    file_values = {}
    with open(path, encoding='utf-8') as f:
        headers.append([f.readline() for _ in range(2)])

        for line in f:
            # Strip a trailing '#' comment, if any.
            line = line.partition('#')[0]

            fields = [x.strip() for x in line.split(';')]
            if len(fields) == 1:
                # Blank or comment-only line: no ';' separated record.
                continue

            # First field is either a single code point or "first..last",
            # both in hexadecimal.
            bounds = fields[0].split('..')
            start = int(bounds[0], 16)
            if len(bounds) == 1:
                end = start
            else:
                end = int(bounds[1], 16)

            t = fields[1]

            for u in range(start, end + 1):
                file_data[u] = t
            file_values[t] = file_values.get(t, 0) + end - start + 1

    data.append(file_data)
    values.append(file_values)
69 
# Merge the per-file dicts into one: code point -> [value0, value1, value2],
# where a slot keeps its default when the corresponding file has no entry.
defaults = ('Other', 'Not_Applicable', 'No_Block')

# Count each default value once so it appears among the known values.
for index, default in enumerate(defaults):
    counts = values[index]
    counts[default] = counts.get(default, 0) + 1

merged = {}
for index, per_file in enumerate(data):
    from_third_file = index == 2
    for codepoint, value in per_file.items():
        if codepoint not in merged:
            # Entries from the third file only annotate code points that one
            # of the first two files already introduced.
            if from_third_file:
                continue
            merged[codepoint] = list(defaults)
        merged[codepoint][index] = value

# Keep only the explicitly allowed code points and blocks.
data = {
    codepoint: entry
    for codepoint, entry in merged.items()
    if codepoint in ALLOWED_SINGLES or entry[2] in ALLOWED_BLOCKS
}
del merged

# Move the outliers NO-BREAK SPACE and DOTTED CIRCLE out
singles = {}
for codepoint in ALLOWED_SINGLES:
    singles[codepoint] = data.pop(codepoint)
91 
# Opening banner of the generated file: how it was produced, followed by the
# first two lines of every input file.
for banner_line in (
    "/* == Start of generated table == */",
    "/*",
    " * The following table is generated by running:",
    " *",
    " * ./gen-indic-table.py IndicSyllabicCategory.txt IndicPositionalCategory.txt Blocks.txt",
    " *",
    " * on files with these headers:",
    " *",
):
    print(banner_line)

for file_header in headers:
    for header_line in file_header:
        print(" * %s" % (header_line.strip()))

# Close the comment and emit the includes; "" yields an empty output line.
for prologue_line in (
    " */",
    "",
    '#include "hb.hh"',
    "",
    '#ifndef HB_NO_OT_SHAPE',
    "",
    '#include "hb-ot-shape-complex-indic.hh"',
    "",
):
    print(prologue_line)
111 
# Shorten values
#
# Hand-picked short aliases, one mapping per property: index 0 is the
# syllabic category, index 1 the matra (positional) category.  They make the
# generated macros more readable and avoid duplicate aliases; any value not
# listed here gets an alias derived from its capital letters further down.
short = [{
    "Bindu": 'Bi',
    "Cantillation_Mark": 'Ca',
    "Joiner": 'ZWJ',
    "Non_Joiner": 'ZWNJ',
    "Number": 'Nd',
    "Visarga": 'Vs',
    "Vowel": 'Vo',
    "Vowel_Dependent": 'M',
    "Consonant_Prefixed": 'CPrf',
    "Other": 'x',
}, {
    "Not_Applicable": 'x',
}]

# Reverse maps (alias -> value), used to detect alias collisions.
all_shorts = [{}, {}]
for i in range(2):
    for v, s in short[i].items():
        all_shorts[i][s] = v

what = ["INDIC_SYLLABIC_CATEGORY", "INDIC_MATRA_CATEGORY"]
what_short = ["ISC", "IMC"]
print('#pragma GCC diagnostic push')
print('#pragma GCC diagnostic ignored "-Wunused-macros"')

# One (macro name, long name, code point count, property value) tuple per
# value of each property, in sorted value order.
cat_defs = []
for i in range(2):
    for v in sorted(values[i]):
        if v in short[i]:
            s = short[i][v]
        else:
            # Derive the alias from the capital letters of the value name,
            # ignoring the connective "_And_".
            v_no_and = v.replace('_And_', '_')
            s = ''.join(c for c in v_no_and if 'A' <= c <= 'Z')
            if s in all_shorts[i]:
                raise Exception("Duplicate short value alias", v, all_shorts[i][s])
            all_shorts[i][s] = v
            short[i][v] = s
        cat_defs.append((what_short[i] + '_' + s, what[i] + '_' + v.upper(), str(values[i][v]), v))

maxlen_s = max(len(c[0]) for c in cat_defs)
maxlen_l = max(len(c[1]) for c in cat_defs)
maxlen_n = max(len(c[2]) for c in cat_defs)
for prefix in what_short:
    print()
    for c in cat_defs:
        # Match on the macro prefix, not on a substring: an alias that merely
        # contains "ISC" or "IMC" must not be listed under the other group.
        if not c[0].startswith(prefix + '_'):
            continue
        print("#define %s %s /* %s chars; %s */" %
              (c[0].ljust(maxlen_s), c[1].ljust(maxlen_l), c[2].rjust(maxlen_n), c[3]))
print()
print('#pragma GCC diagnostic pop')
print()
print("#define _(S,M) INDIC_COMBINE_CATEGORIES (ISC_##S, IMC_##M)")
print()
print()
169 
# Running statistics, updated by print_block():
total = 0          # number of table slots printed so far
used = 0           # how many of those slots came from `data`
last_block = None  # most recent non-empty block name passed to print_block()


def print_block(block, start, end, data):
    """Print the table entries for code points start..end (inclusive).

    A heading comment with the block name is printed first when `block` is
    non-empty and differs from the previously printed block.  Entries are
    laid out eight per row, each row prefixed with its first code point in
    hex.  Code points missing from `data` are printed with the module-level
    `defaults`.  Both `start` and `end + 1` must be multiples of 8.

    Updates the module-level counters `total`, `used` and `last_block`.
    """
    global total, used, last_block

    if block and block != last_block:
        print()
        print()
        print(" /* %s */" % block)

    assert start % 8 == 0
    assert (end + 1) % 8 == 0

    present = 0
    for codepoint in range(start, end + 1):
        if codepoint % 8 == 0:
            # Start a new row, labelled with its first code point.
            print()
            print(" /* %04X */" % codepoint, end="")
        if codepoint in data:
            present += 1
            categories = data[codepoint]
        else:
            categories = defaults
        cell = "_(%s,%s)," % (short[0][categories[0]], short[1][categories[1]])
        print("%9s" % cell, end="")

    total += end - start + 1
    used += present
    if block:
        last_block = block
195 
# Emit the body of indic_table[] as a sequence of contiguous ranges.
#
#   starts / ends  first code point / one-past-last code point of each range
#   offset         number of table items in the ranges closed so far; it is
#                  printed as the index at which a new range begins
#   last           last code point already printed (negative sentinel before
#                  anything has been printed)
uu = sorted(data.keys())

last = -100000
offset = 0
starts = []
ends = []
print("static const INDIC_TABLE_ELEMENT_TYPE indic_table[] = {")
for u in uu:
    if u <= last:
        # Already covered by the span printed for an earlier code point.
        continue
    block = data[u][2]

    # Round down to a row of 8, then extend while consecutive code points
    # are present and belong to the same block.  The scan starts at
    # start + 1.  Membership is tested on the dict, which holds exactly the
    # keys of `uu` but answers in constant time.
    start = u // 8 * 8
    end = start + 1
    while end in data and block == data[end][2]:
        end += 1
    # Round up to the end of the row of 8.
    end = (end - 1) // 8 * 8 + 7

    if start != last + 1:
        if start - last <= 1 + 16 * 3:
            # Small gap: pad it with default entries and stay in the
            # current range.
            print_block(None, last + 1, start - 1, data)
        else:
            # Large gap: close the previous range (if there is one) and
            # open a new one, recording where it begins in the table.
            if last >= 0:
                ends.append(last + 1)
                offset += ends[-1] - starts[-1]
            print()
            print()
            print("#define indic_offset_0x%04xu %d" % (start, offset))
            starts.append(start)

    print_block(block, start, end, data)
    last = end

# Close the final range.
ends.append(last + 1)
offset += ends[-1] - starts[-1]
print()
print()
occupancy = used * 100. / total
page_bits = 12
print("}; /* Table items: %d; occupancy: %d%% */" % (offset, occupancy))
# Emit hb_indic_get_categories(): a switch on the page (u >> page_bits) with
# one case per page touched by a single code point or by a table range.
print()
print("INDIC_TABLE_ELEMENT_TYPE")
print("hb_indic_get_categories (hb_codepoint_t u)")
print("{")
print(" switch (u >> %d)" % page_bits)
print(" {")
pages = {cp >> page_bits for cp in starts + ends + list(singles.keys())}
for page in sorted(pages):
    print(" case 0x%0Xu:" % page)
    # Stand-alone code points that live on this page.
    for cp, cats in singles.items():
        if cp >> page_bits == page:
            print(" if (unlikely (u == 0x%04Xu)) return _(%s,%s);" % (cp, short[0][cats[0]], short[1][cats[1]]))
    # Table ranges whose start or (exclusive) end falls on this page.
    for range_start, range_end in zip(starts, ends):
        if page in (range_start >> page_bits, range_end >> page_bits):
            offset_macro = "indic_offset_0x%04xu" % range_start
            print(" if (hb_in_range<hb_codepoint_t> (u, 0x%04Xu, 0x%04Xu)) return indic_table[u - 0x%04Xu + %s];" % (range_start, range_end - 1, range_start, offset_macro))
    print(" break;")
    print("")
print(" default:")
print(" break;")
print(" }")
print(" return _(x,x);")
print("}")
print()

# Undefine every helper macro that was emitted for the table.
print("#undef _")
for index in range(2):
    print()
    for value in sorted(values[index]):
        print("#undef %s_%s" % (what_short[index], short[index][value]))
print()
print('#endif')
print()
print("/* == End of generated table == */")

# Maintain at least 30% occupancy in the table
if not occupancy >= 30:
    raise Exception("Table too sparse, please investigate: ", occupancy)
def print_block(block, start, end, data)