w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

TextOutputDev.cc
Go to the documentation of this file.
1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18 // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21 // Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23 // Copyright (C) 2008, 2010-2012, 2014-2021 Albert Astals Cid <aacid@kde.org>
24 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
25 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29 // Copyright (C) 2010 Marek Kasik <mkasik@redhat.com>
30 // Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31 // Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32 // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33 // Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34 // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35 // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37 // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38 // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42 // Copyright (C) 2018-2021 Nelson Benítez León <nbenitezl@gmail.com>
43 // Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
45 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46 //
47 // To see a description of the changes please see the Changelog file that
48 // came with your tarball or type make ChangeLog if you are building from git
49 //
50 //========================================================================
51 
52 #include <config.h>
53 
54 #include <cstdio>
55 #include <cstdlib>
56 #include <cstddef>
57 #include <cmath>
58 #include <cfloat>
59 #include <cctype>
60 #include <algorithm>
61 #ifdef _WIN32
62 # include <fcntl.h> // for O_BINARY
63 # include <io.h> // for setmode
64 #endif
65 #include "goo/gfile.h"
66 #include "goo/gmem.h"
67 #include "goo/GooString.h"
68 #include "poppler-config.h"
69 #include "Error.h"
70 #include "GlobalParams.h"
71 #include "UnicodeMap.h"
72 #include "UnicodeTypeTable.h"
73 #include "Link.h"
74 #include "TextOutputDev.h"
75 #include "Page.h"
76 #include "Annot.h"
77 #include "UTF.h"
78 
79 //------------------------------------------------------------------------
80 // parameters
81 //------------------------------------------------------------------------
82 
83 // Each bucket in a text pool includes baselines within a range of
84 // this many points.
85 #define textPoolStep 4
86 
87 // Inter-character space width which will cause addChar to start a new
88 // word.
89 #define minWordBreakSpace 0.1
90 
91 // Negative inter-character space width, i.e., overlap, which will
92 // cause addChar to start a new word.
93 #define minDupBreakOverlap 0.2
94 
95 // Max distance between baselines of two lines within a block, as a
96 // fraction of the font size.
97 #define maxLineSpacingDelta 1.5
98 
99 // Max difference in primary font sizes on two lines in the same
100 // block. Delta1 is used when examining new lines above and below the
101 // current block; delta2 is used when examining text that overlaps the
102 // current block; delta3 is used when examining text to the left and
103 // right of the current block.
104 #define maxBlockFontSizeDelta1 0.05
105 #define maxBlockFontSizeDelta2 0.6
106 #define maxBlockFontSizeDelta3 0.2
107 
108 // Max difference in font sizes inside a word.
109 #define maxWordFontSizeDelta 0.05
110 
111 // Maximum distance between baselines of two words on the same line,
112 // e.g., distance between subscript or superscript and the primary
113 // baseline, as a fraction of the font size.
114 #define maxIntraLineDelta 0.5
115 
116 // Minimum inter-word spacing, as a fraction of the font size. (Only
117 // used for raw ordering.)
118 #define minWordSpacing 0.15
119 
120 // Maximum inter-word spacing, as a fraction of the font size.
121 #define maxWordSpacing 1.5
122 
123 // Maximum horizontal spacing which will allow a word to be pulled
124 // into a block.
125 #define minColSpacing1 0.3
126 
127 // Minimum spacing between columns, as a fraction of the font size.
128 #define minColSpacing2 1.0
129 
130 // Maximum vertical spacing between blocks within a flow, as a
131 // multiple of the font size.
132 #define maxBlockSpacing 2.5
133 
134 // Minimum spacing between characters within a word, as a fraction of
135 // the font size.
136 #define minCharSpacing -0.5
137 
138 // Maximum spacing between characters within a word, as a fraction of
139 // the font size, when there is no obvious extra-wide character
140 // spacing.
141 #define maxCharSpacing 0.03
142 
143 // When extra-wide character spacing is detected, the inter-character
144 // space threshold is set to the minimum inter-character space
145 // multiplied by this constant.
146 #define maxWideCharSpacingMul 1.3
147 
148 // Upper limit on spacing between characters in a word.
149 #define maxWideCharSpacing 0.4
150 
151 // Max difference in primary,secondary coordinates (as a fraction of
152 // the font size) allowed for duplicated text (fake boldface, drop
153 // shadows) which is to be discarded.
154 #define dupMaxPriDelta 0.1
155 #define dupMaxSecDelta 0.2
156 
157 // Max width of underlines (in points).
158 #define maxUnderlineWidth 3
159 
160 // Min distance between baseline and underline (in points).
161 //~ this should be font-size-dependent
162 #define minUnderlineGap -2
163 
164 // Max distance between baseline and underline (in points).
165 //~ this should be font-size-dependent
166 #define maxUnderlineGap 4
167 
168 // Max horizontal distance between edge of word and start of underline
169 // (in points).
170 //~ this should be font-size-dependent
171 #define underlineSlack 1
172 
173 // Max distance between edge of text and edge of link border
174 #define hyperlinkSlack 2
175 
176 // Max distance between characters when combining a base character and
177 // combining character
178 #define combMaxMidDelta 0.3
179 #define combMaxBaseDelta 0.4
180 
181 // Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
182 // (Or 1/tan(angle) for 90/270 degrees.)
183 #define diagonalThreshold 0.1
184 
185 // How opaque a selection on a glyphless font should be. Since the font is
186 // glyphless and overlaid over text in image form, this must enable users
187 // to read the underlying image. Issue #157
188 #define glyphlessSelectionOpacity 0.4
189 
190 namespace {
191 
192 inline bool isAscii7(Unicode uchar)
193 {
194  return uchar < 128;
195 }
196 
197 }
198 
199 static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u)
200 {
201  char lre[8], rle[8], popdf[8], buf[8];
202  int lreLen = 0, rleLen = 0, popdfLen = 0, n;
203  int nCols, i, j, k;
204 
205  nCols = 0;
206 
207  if (s) {
208  lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
209  rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
210  popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
211  }
212 
213  if (primaryLR) {
214  i = 0;
215  while (i < len) {
216  // output a left-to-right section
217  for (j = i; j < len && !unicodeTypeR(text[j]); ++j)
218  ;
219  for (k = i; k < j; ++k) {
220  if (s) {
221  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
222  s->append(buf, n);
223  }
224  if (u)
225  u[nCols] = text[k];
226  ++nCols;
227  }
228  i = j;
229  // output a right-to-left section
230  for (j = i; j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); ++j)
231  ;
232  if (j > i) {
233  if (s)
234  s->append(rle, rleLen);
235  for (k = j - 1; k >= i; --k) {
236  if (s) {
237  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
238  s->append(buf, n);
239  }
240  if (u)
241  u[nCols] = text[k];
242  ++nCols;
243  }
244  if (s)
245  s->append(popdf, popdfLen);
246  i = j;
247  }
248  }
249  } else {
250  // Note: This code treats numeric characters (European and
251  // Arabic/Indic) as left-to-right, which isn't strictly correct
252  // (incurs extra LRE/POPDF pairs), but does produce correct
253  // visual formatting.
254  if (s)
255  s->append(rle, rleLen);
256  i = len - 1;
257  while (i >= 0) {
258  // output a right-to-left section
259  for (j = i; j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); --j)
260  ;
261  for (k = i; k > j; --k) {
262  if (s) {
263  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
264  s->append(buf, n);
265  }
266  if (u)
267  u[nCols] = text[k];
268  ++nCols;
269  }
270  i = j;
271  // output a left-to-right section
272  for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j)
273  ;
274  if (j < i) {
275  if (s)
276  s->append(lre, lreLen);
277  for (k = j + 1; k <= i; ++k) {
278  if (s) {
279  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
280  s->append(buf, n);
281  }
282  if (u)
283  u[nCols] = text[k];
284  ++nCols;
285  }
286  if (s)
287  s->append(popdf, popdfLen);
288  i = j;
289  }
290  }
291  if (s)
292  s->append(popdf, popdfLen);
293  }
294 
295  return nCols;
296 }
297 
298 //------------------------------------------------------------------------
299 // TextUnderline
300 //------------------------------------------------------------------------
301 
302 class TextUnderline
303 {
304 public:
305  TextUnderline(double x0A, double y0A, double x1A, double y1A)
306  {
307  x0 = x0A;
308  y0 = y0A;
309  x1 = x1A;
310  y1 = y1A;
311  horiz = y0 == y1;
312  }
314 
315  double x0, y0, x1, y1;
316  bool horiz;
317 };
318 
319 //------------------------------------------------------------------------
320 // TextLink
321 //------------------------------------------------------------------------
322 
323 class TextLink
324 {
325 public:
326  TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
327  {
328  xMin = xMinA;
329  yMin = yMinA;
330  xMax = xMaxA;
331  yMax = yMaxA;
332  link = linkA;
333  }
334  ~TextLink() { }
335 
336  int xMin, yMin, xMax, yMax;
338 };
339 
340 //------------------------------------------------------------------------
341 // TextFontInfo
342 //------------------------------------------------------------------------
343 
345 {
346  gfxFont = state->getFont();
347  if (gfxFont)
348  gfxFont->incRefCnt();
349 #ifdef TEXTOUT_WORD_LIST
350  fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy() : nullptr;
351  flags = gfxFont ? gfxFont->getFlags() : 0;
352 #endif
353 }
354 
356 {
357  if (gfxFont)
358  gfxFont->decRefCnt();
359 #ifdef TEXTOUT_WORD_LIST
360  if (fontName) {
361  delete fontName;
362  }
363 #endif
364 }
365 
367 {
368  return state->getFont() == gfxFont;
369 }
370 
371 bool TextFontInfo::matches(const TextFontInfo *fontInfo) const
372 {
373  return gfxFont == fontInfo->gfxFont;
374 }
375 
376 bool TextFontInfo::matches(const Ref *ref) const
377 {
378  return (*(gfxFont->getID()) == *ref);
379 }
380 
382 {
383  return gfxFont ? gfxFont->getAscent() : 0.95;
384 }
385 
387 {
388  return gfxFont ? gfxFont->getDescent() : -0.35;
389 }
390 
392 {
393  return gfxFont ? gfxFont->getWMode() : 0;
394 }
395 
396 //------------------------------------------------------------------------
397 // TextWord
398 //------------------------------------------------------------------------
399 
400 TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
401 {
402  rot = rotA;
403  fontSize = fontSizeA;
404  text = nullptr;
405  charcode = nullptr;
406  edge = nullptr;
407  charPos = nullptr;
408  font = nullptr;
409  textMat = nullptr;
410  len = size = 0;
411  spaceAfter = false;
412  next = nullptr;
413  invisible = state->getRender() == 3;
414 
415 #ifdef TEXTOUT_WORD_LIST
416  GfxRGB rgb;
417 
418  if ((state->getRender() & 3) == 1) {
419  state->getStrokeRGB(&rgb);
420  } else {
421  state->getFillRGB(&rgb);
422  }
423  colorR = colToDbl(rgb.r);
424  colorG = colToDbl(rgb.g);
425  colorB = colToDbl(rgb.b);
426 #endif
427 
428  underlined = false;
429  link = nullptr;
430 }
431 
433 {
434  gfree(text);
435  gfree(charcode);
436  gfree(edge);
437  gfree(charPos);
438  gfree(font);
439  gfree(textMat);
440 }
441 
442 void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
443 {
444  ensureCapacity(len + 1);
445  text[len] = u;
446  charcode[len] = c;
447  charPos[len] = charPosA;
448  charPos[len + 1] = charPosA + charLen;
449  font[len] = fontA;
450  textMat[len] = textMatA;
451 
452  if (len == 0)
453  setInitialBounds(fontA, x, y);
454 
455  if (wMode) { // vertical writing mode
456  // NB: the rotation value has been incremented by 1 (in
457  // TextPage::beginWord()) for vertical writing mode
458  switch (rot) {
459  case 0:
460  edge[len] = x - fontSize;
461  xMax = edge[len + 1] = x;
462  break;
463  case 1:
464  edge[len] = y - fontSize;
465  yMax = edge[len + 1] = y;
466  break;
467  case 2:
468  edge[len] = x + fontSize;
469  xMin = edge[len + 1] = x;
470  break;
471  case 3:
472  edge[len] = y + fontSize;
473  yMin = edge[len + 1] = y;
474  break;
475  }
476  } else { // horizontal writing mode
477  switch (rot) {
478  case 0:
479  edge[len] = x;
480  xMax = edge[len + 1] = x + dx;
481  break;
482  case 1:
483  edge[len] = y;
484  yMax = edge[len + 1] = y + dy;
485  break;
486  case 2:
487  edge[len] = x;
488  xMin = edge[len + 1] = x + dx;
489  break;
490  case 3:
491  edge[len] = y;
492  yMin = edge[len + 1] = y + dy;
493  break;
494  }
495  }
496  ++len;
497 }
498 
499 void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
500 {
501  double ascent = fontA->getAscent() * fontSize;
502  double descent = fontA->getDescent() * fontSize;
503  wMode = fontA->getWMode();
504 
505  if (wMode) { // vertical writing mode
506  // NB: the rotation value has been incremented by 1 (in
507  // TextPage::beginWord()) for vertical writing mode
508  switch (rot) {
509  case 0:
510  xMin = x - fontSize;
511  yMin = y - fontSize;
512  yMax = y;
513  base = y;
514  break;
515  case 1:
516  xMin = x;
517  yMin = y - fontSize;
518  xMax = x + fontSize;
519  base = x;
520  break;
521  case 2:
522  yMin = y;
523  xMax = x + fontSize;
524  yMax = y + fontSize;
525  base = y;
526  break;
527  case 3:
528  xMin = x - fontSize;
529  xMax = x;
530  yMax = y + fontSize;
531  base = x;
532  break;
533  }
534  } else { // horizontal writing mode
535  switch (rot) {
536  case 0:
537  xMin = x;
538  yMin = y - ascent;
539  yMax = y - descent;
540  if (yMin == yMax) {
541  // this is a sanity check for a case that shouldn't happen -- but
542  // if it does happen, we want to avoid dividing by zero later
543  yMin = y;
544  yMax = y + 1;
545  }
546  base = y;
547  break;
548  case 1:
549  xMin = x + descent;
550  yMin = y;
551  xMax = x + ascent;
552  if (xMin == xMax) {
553  // this is a sanity check for a case that shouldn't happen -- but
554  // if it does happen, we want to avoid dividing by zero later
555  xMin = x;
556  xMax = x + 1;
557  }
558  base = x;
559  break;
560  case 2:
561  yMin = y + descent;
562  xMax = x;
563  yMax = y + ascent;
564  if (yMin == yMax) {
565  // this is a sanity check for a case that shouldn't happen -- but
566  // if it does happen, we want to avoid dividing by zero later
567  yMin = y;
568  yMax = y + 1;
569  }
570  base = y;
571  break;
572  case 3:
573  xMin = x - ascent;
574  xMax = x - descent;
575  yMax = y;
576  if (xMin == xMax) {
577  // this is a sanity check for a case that shouldn't happen -- but
578  // if it does happen, we want to avoid dividing by zero later
579  xMin = x;
580  xMax = x + 1;
581  }
582  base = x;
583  break;
584  }
585  }
586 }
587 
589 {
590  if (capacity > size) {
591  size = std::max(size + 16, capacity);
592  text = (Unicode *)greallocn(text, size, sizeof(Unicode));
593  charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
594  edge = (double *)greallocn(edge, (size + 1), sizeof(double));
595  charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
596  font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
597  textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
598  }
599 }
600 
602 {
605 };
606 
607 static const struct CombiningTable combiningTable[] = {
608  { 0x0060, 0x0300 }, // grave
609  { 0x00a8, 0x0308 }, // dieresis
610  { 0x00af, 0x0304 }, // macron
611  { 0x00b4, 0x0301 }, // acute
612  { 0x00b8, 0x0327 }, // cedilla
613  { 0x02c6, 0x0302 }, // circumflex
614  { 0x02c7, 0x030c }, // caron
615  { 0x02d8, 0x0306 }, // breve
616  { 0x02d9, 0x0307 }, // dotaccent
617  { 0x02da, 0x030a }, // ring
618  { 0x02dc, 0x0303 }, // tilde
619  { 0x02dd, 0x030b } // hungarumlaut (double acute accent)
620 };
621 
622 // returning combining versions of characters
624 {
625  for (const CombiningTable &combining : combiningTable) {
626  if (u == combining.base)
627  return combining.comb;
628  }
629  return 0;
630 }
631 
632 bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
633 {
634  if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
635  return false;
636 
637  Unicode cCurrent = getCombiningChar(u);
638  Unicode cPrev = getCombiningChar(text[len - 1]);
639  double edgeMid = (edge[len - 1] + edge[len]) / 2;
640  double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
641 
642  if (cCurrent != 0 && unicodeTypeAlphaNum(text[len - 1])) {
643  // Current is a combining character, previous is base character
644  maxScaledMidDelta = fabs(edge[len] - edge[len - 1]) * combMaxMidDelta;
645  charMid = charBase = maxScaledBaseDelta = 0;
646 
647  // Test if characters overlap
648  if (rot == 0 || rot == 2) {
649  charMid = x + (dx / 2);
650  charBase = y;
651  maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
652  } else {
653  charMid = y + (dy / 2);
654  charBase = x;
655  maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
656  }
657 
658  if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
659  return false;
660 
661  // Add character, but don't adjust edge / bounding box because
662  // combining character's positioning could be odd.
663  ensureCapacity(len + 1);
664  text[len] = cCurrent;
665  charcode[len] = c;
666  charPos[len] = charPosA;
667  charPos[len + 1] = charPosA + charLen;
668  font[len] = fontA;
669  textMat[len] = textMatA;
670  edge[len + 1] = edge[len];
671  edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
672  ++len;
673  return true;
674  }
675 
676  if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
677  // Previous is a combining character, current is base character
678  maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
679  charMid = charBase = maxScaledMidDelta = 0;
680 
681  // Test if characters overlap
682  if (rot == 0 || rot == 2) {
683  charMid = x + (dx / 2);
684  charBase = y;
685  maxScaledMidDelta = fabs(dx * combMaxMidDelta);
686  } else {
687  charMid = y + (dy / 2);
688  charBase = x;
689  maxScaledMidDelta = fabs(dy * combMaxMidDelta);
690  }
691 
692  if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
693  return false;
694 
695  // move combining character to after base character
696  ensureCapacity(len + 1);
697  fontSize = fontSizeA;
698  text[len] = cPrev;
699  charcode[len] = charcode[len - 1];
700  charPos[len] = charPosA;
701  charPos[len + 1] = charPosA + charLen;
702  font[len] = font[len - 1];
703  textMat[len] = textMat[len - 1];
704 
705  text[len - 1] = u;
706  charcode[len - 1] = c;
707  font[len - 1] = fontA;
708  textMat[len - 1] = textMatA;
709 
710  if (len == 1)
711  setInitialBounds(fontA, x, y);
712 
713  // Updated edges / bounding box because we changed the base
714  // character.
715  if (wMode) {
716  switch (rot) {
717  case 0:
718  edge[len - 1] = x - fontSize;
719  xMax = edge[len + 1] = x;
720  break;
721  case 1:
722  edge[len - 1] = y - fontSize;
723  yMax = edge[len + 1] = y;
724  break;
725  case 2:
726  edge[len - 1] = x + fontSize;
727  xMin = edge[len + 1] = x;
728  break;
729  case 3:
730  edge[len - 1] = y + fontSize;
731  yMin = edge[len + 1] = y;
732  break;
733  }
734  } else {
735  switch (rot) {
736  case 0:
737  edge[len - 1] = x;
738  xMax = edge[len + 1] = x + dx;
739  break;
740  case 1:
741  edge[len - 1] = y;
742  yMax = edge[len + 1] = y + dy;
743  break;
744  case 2:
745  edge[len - 1] = x;
746  xMin = edge[len + 1] = x + dx;
747  break;
748  case 3:
749  edge[len - 1] = y;
750  yMin = edge[len + 1] = y + dy;
751  break;
752  }
753  }
754 
755  edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
756  ++len;
757  return true;
758  }
759  return false;
760 }
761 
763 {
764  int i;
765 
766  if (word->xMin < xMin) {
767  xMin = word->xMin;
768  }
769  if (word->yMin < yMin) {
770  yMin = word->yMin;
771  }
772  if (word->xMax > xMax) {
773  xMax = word->xMax;
774  }
775  if (word->yMax > yMax) {
776  yMax = word->yMax;
777  }
778  ensureCapacity(len + word->len);
779  for (i = 0; i < word->len; ++i) {
780  text[len + i] = word->text[i];
781  charcode[len + i] = word->charcode[i];
782  edge[len + i] = word->edge[i];
783  charPos[len + i] = word->charPos[i];
784  font[len + i] = word->font[i];
785  textMat[len + i] = word->textMat[i];
786  }
787  edge[len + word->len] = word->edge[word->len];
788  charPos[len + word->len] = word->charPos[word->len];
789  len += word->len;
790 }
791 
792 inline int TextWord::primaryCmp(const TextWord *word) const
793 {
794  double cmp;
795 
796  cmp = 0; // make gcc happy
797  switch (rot) {
798  case 0:
799  cmp = xMin - word->xMin;
800  break;
801  case 1:
802  cmp = yMin - word->yMin;
803  break;
804  case 2:
805  cmp = word->xMax - xMax;
806  break;
807  case 3:
808  cmp = word->yMax - yMax;
809  break;
810  }
811  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
812 }
813 
815 {
816  double delta;
817 
818  delta = 0; // make gcc happy
819  switch (rot) {
820  case 0:
821  delta = word->xMin - xMax;
822  break;
823  case 1:
824  delta = word->yMin - yMax;
825  break;
826  case 2:
827  delta = xMin - word->xMax;
828  break;
829  case 3:
830  delta = yMin - word->yMax;
831  break;
832  }
833  return delta;
834 }
835 
836 int TextWord::cmpYX(const void *p1, const void *p2)
837 {
838  TextWord *word1 = *(TextWord **)p1;
839  TextWord *word2 = *(TextWord **)p2;
840  double cmp;
841 
842  cmp = word1->yMin - word2->yMin;
843  if (cmp == 0) {
844  cmp = word1->xMin - word2->xMin;
845  }
846  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
847 }
848 
849 #ifdef TEXTOUT_WORD_LIST
850 
852 {
853  GooString *s;
854  const UnicodeMap *uMap;
855  char buf[8];
856  int n, i;
857 
858  s = new GooString();
859  if (!(uMap = globalParams->getTextEncoding())) {
860  return s;
861  }
862  for (i = 0; i < len; ++i) {
863  n = uMap->mapUnicode(text[i], buf, sizeof(buf));
864  s->append(buf, n);
865  }
866  return s;
867 }
868 
869 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
870 {
871  if (charIdx < 0 || charIdx >= len) {
872  return;
873  }
874  switch (rot) {
875  case 0:
876  *xMinA = edge[charIdx];
877  *xMaxA = edge[charIdx + 1];
878  *yMinA = yMin;
879  *yMaxA = yMax;
880  break;
881  case 1:
882  *xMinA = xMin;
883  *xMaxA = xMax;
884  *yMinA = edge[charIdx];
885  *yMaxA = edge[charIdx + 1];
886  break;
887  case 2:
888  *xMinA = edge[charIdx + 1];
889  *xMaxA = edge[charIdx];
890  *yMinA = yMin;
891  *yMaxA = yMax;
892  break;
893  case 3:
894  *xMinA = xMin;
895  *xMaxA = xMax;
896  *yMinA = edge[charIdx + 1];
897  *yMaxA = edge[charIdx];
898  break;
899  }
900 }
901 
902 #endif // TEXTOUT_WORD_LIST
903 
904 //------------------------------------------------------------------------
905 // TextPool
906 //------------------------------------------------------------------------
907 
909 {
910  minBaseIdx = 0;
911  maxBaseIdx = -1;
912  pool = nullptr;
913  cursor = nullptr;
914  cursorBaseIdx = -1;
915 }
916 
918 {
919  int baseIdx;
920  TextWord *word, *word2;
921 
922  for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
923  for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
924  word2 = word->next;
925  delete word;
926  }
927  }
928  gfree(pool);
929 }
930 
931 int TextPool::getBaseIdx(double base) const
932 {
933  const double baseIdxDouble = base / textPoolStep;
934  if (baseIdxDouble < minBaseIdx) {
935  return minBaseIdx;
936  }
937  if (baseIdxDouble > maxBaseIdx) {
938  return maxBaseIdx;
939  }
940  return (int)baseIdxDouble;
941 }
942 
944 {
945  int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
946  TextWord *w0, *w1;
947 
948  // expand the array if needed
949  wordBaseIdx = (int)(word->base / textPoolStep);
950  if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
951  error(errSyntaxWarning, -1, "wordBaseIdx out of range");
952  delete word;
953  return;
954  }
955  if (minBaseIdx > maxBaseIdx) {
956  minBaseIdx = wordBaseIdx - 128;
957  maxBaseIdx = wordBaseIdx + 128;
958  pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1, sizeof(TextWord *));
959  for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
960  pool[baseIdx - minBaseIdx] = nullptr;
961  }
962  } else if (wordBaseIdx < minBaseIdx) {
963  newMinBaseIdx = wordBaseIdx - 128;
964  TextWord **newPool = (TextWord **)gmallocn_checkoverflow(maxBaseIdx - newMinBaseIdx + 1, sizeof(TextWord *));
965  if (unlikely(!newPool)) {
966  error(errSyntaxWarning, -1, "newPool would overflow");
967  delete word;
968  return;
969  }
970  for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
971  newPool[baseIdx - newMinBaseIdx] = nullptr;
972  }
973  memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
974  gfree(pool);
975  pool = newPool;
976  minBaseIdx = newMinBaseIdx;
977  } else if (wordBaseIdx > maxBaseIdx) {
978  newMaxBaseIdx = wordBaseIdx + 128;
979  TextWord **reallocatedPool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1, sizeof(TextWord *), true /*checkoverflow*/, false /*free_pool*/);
980  if (!reallocatedPool) {
981  error(errSyntaxWarning, -1, "new pool size would overflow");
982  delete word;
983  return;
984  }
985  pool = reallocatedPool;
986  for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
987  pool[baseIdx - minBaseIdx] = nullptr;
988  }
989  maxBaseIdx = newMaxBaseIdx;
990  }
991 
992  // insert the new word
993  if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(cursor) >= 0) {
994  w0 = cursor;
995  w1 = cursor->next;
996  } else {
997  w0 = nullptr;
998  w1 = pool[wordBaseIdx - minBaseIdx];
999  }
1000  for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next)
1001  ;
1002  word->next = w1;
1003  if (w0) {
1004  w0->next = word;
1005  } else {
1006  pool[wordBaseIdx - minBaseIdx] = word;
1007  }
1008  cursor = word;
1009  cursorBaseIdx = wordBaseIdx;
1010 }
1011 
1012 //------------------------------------------------------------------------
1013 // TextLine
1014 //------------------------------------------------------------------------
1015 
1016 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA)
1017 {
1018  blk = blkA;
1019  rot = rotA;
1020  base = baseA;
1021  words = lastWord = nullptr;
1022  text = nullptr;
1023  edge = nullptr;
1024  col = nullptr;
1025  len = 0;
1026  convertedLen = 0;
1027  hyphenated = false;
1028  next = nullptr;
1029  xMin = yMin = 0;
1030  xMax = yMax = -1;
1031  normalized = nullptr;
1032  normalized_len = 0;
1033  normalized_idx = nullptr;
1034  ascii_translation = nullptr;
1035  ascii_len = 0;
1036  ascii_idx = nullptr;
1037 }
1038 
1040 {
1041  TextWord *word;
1042 
1043  while (words) {
1044  word = words;
1045  words = words->next;
1046  delete word;
1047  }
1048  gfree(text);
1049  gfree(edge);
1050  gfree(col);
1051  if (normalized) {
1052  gfree(normalized);
1054  }
1055  if (ascii_translation) {
1057  gfree(ascii_idx);
1058  }
1059 }
1060 
1062 {
1063  if (lastWord) {
1064  lastWord->next = word;
1065  } else {
1066  words = word;
1067  }
1068  lastWord = word;
1069 
1070  if (xMin > xMax) {
1071  xMin = word->xMin;
1072  xMax = word->xMax;
1073  yMin = word->yMin;
1074  yMax = word->yMax;
1075  } else {
1076  if (word->xMin < xMin) {
1077  xMin = word->xMin;
1078  }
1079  if (word->xMax > xMax) {
1080  xMax = word->xMax;
1081  }
1082  if (word->yMin < yMin) {
1083  yMin = word->yMin;
1084  }
1085  if (word->yMax > yMax) {
1086  yMax = word->yMax;
1087  }
1088  }
1089 }
1090 
1092 {
1093  double delta;
1094 
1095  delta = 0; // make gcc happy
1096  switch (rot) {
1097  case 0:
1098  delta = line->xMin - xMax;
1099  break;
1100  case 1:
1101  delta = line->yMin - yMax;
1102  break;
1103  case 2:
1104  delta = xMin - line->xMax;
1105  break;
1106  case 3:
1107  delta = yMin - line->yMax;
1108  break;
1109  }
1110  return delta;
1111 }
1112 
1114 {
1115  double cmp;
1116 
1117  cmp = 0; // make gcc happy
1118  switch (rot) {
1119  case 0:
1120  cmp = xMin - line->xMin;
1121  break;
1122  case 1:
1123  cmp = yMin - line->yMin;
1124  break;
1125  case 2:
1126  cmp = line->xMax - xMax;
1127  break;
1128  case 3:
1129  cmp = line->yMax - yMax;
1130  break;
1131  }
1132  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1133 }
1134 
1136 {
1137  double cmp;
1138 
1139  cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1140  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1141 }
1142 
1143 int TextLine::cmpYX(const TextLine *line) const
1144 {
1145  int cmp;
1146 
1147  if ((cmp = secondaryCmp(line))) {
1148  return cmp;
1149  }
1150  return primaryCmp(line);
1151 }
1152 
1153 int TextLine::cmpXY(const void *p1, const void *p2)
1154 {
1155  TextLine *line1 = *(TextLine **)p1;
1156  TextLine *line2 = *(TextLine **)p2;
1157  int cmp;
1158 
1159  if ((cmp = line1->primaryCmp(line2))) {
1160  return cmp;
1161  }
1162  return line1->secondaryCmp(line2);
1163 }
1164 
1166 {
1167  TextWord *word0, *word1;
1168  double space, delta, minSpace;
1169  bool isUnicode;
1170  char buf[8];
1171  int i, j;
1172 
1173  if (words->next) {
1174 
1175  // compute the inter-word space threshold
1176  if (words->len > 1 || words->next->len > 1) {
1177  minSpace = 0;
1178  } else {
1179  minSpace = words->primaryDelta(words->next);
1180  for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
1181  if (word1->len > 1) {
1182  minSpace = 0;
1183  }
1184  delta = word0->primaryDelta(word1);
1185  if (delta < minSpace) {
1186  minSpace = delta;
1187  }
1188  }
1189  }
1190  if (minSpace <= 0) {
1191  space = maxCharSpacing * words->fontSize;
1192  } else {
1193  space = maxWideCharSpacingMul * minSpace;
1194  if (space > maxWideCharSpacing * words->fontSize) {
1195  space = maxWideCharSpacing * words->fontSize;
1196  }
1197  }
1198 
1199  // merge words
1200  word0 = words;
1201  word1 = words->next;
1202  while (word1) {
1203  if (word0->primaryDelta(word1) >= space) {
1204  word0->spaceAfter = true;
1205  word0 = word1;
1206  word1 = word1->next;
1207  } else if (word0->font[word0->len - 1] == word1->font[0] && word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize
1208  && word1->charPos[0] == word0->charPos[word0->len]) {
1209  word0->merge(word1);
1210  word0->next = word1->next;
1211  delete word1;
1212  word1 = word0->next;
1213  } else {
1214  word0 = word1;
1215  word1 = word1->next;
1216  }
1217  }
1218  }
1219 
1220  // build the line text
1221  isUnicode = uMap ? uMap->isUnicode() : false;
1222  len = 0;
1223  for (word1 = words; word1; word1 = word1->next) {
1224  len += word1->len;
1225  if (word1->spaceAfter) {
1226  ++len;
1227  }
1228  }
1229  text = (Unicode *)gmallocn(len, sizeof(Unicode));
1230  edge = (double *)gmallocn(len + 1, sizeof(double));
1231  i = 0;
1232  for (word1 = words; word1; word1 = word1->next) {
1233  for (j = 0; j < word1->len; ++j) {
1234  text[i] = word1->text[j];
1235  edge[i] = word1->edge[j];
1236  ++i;
1237  }
1238  edge[i] = word1->edge[word1->len];
1239  if (word1->spaceAfter) {
1240  text[i] = (Unicode)0x0020;
1241  ++i;
1242  }
1243  }
1244 
1245  // compute convertedLen and set up the col array
1246  col = (int *)gmallocn(len + 1, sizeof(int));
1247  convertedLen = 0;
1248  for (i = 0; i < len; ++i) {
1249  col[i] = convertedLen;
1250  if (isUnicode) {
1251  ++convertedLen;
1252  } else if (uMap) {
1253  convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
1254  }
1255  }
1256  col[len] = convertedLen;
1257 
1258  // check for hyphen at end of line
1259  //~ need to check for other chars used as hyphens
1260  hyphenated = text[len - 1] == (Unicode)'-';
1261 }
1262 
1263 //------------------------------------------------------------------------
1264 // TextLineFrag
1265 //------------------------------------------------------------------------
1266 
1268 {
1269 public:
1270  TextLine *line; // the line object
1271  int start, len; // offset and length of this fragment
1272  // (in Unicode chars)
1273  double xMin, xMax; // bounding box coordinates
1274  double yMin, yMax;
1275  double base; // baseline virtual coordinate
1276  int col; // first column
1277 
1278  void init(TextLine *lineA, int startA, int lenA);
1279  void computeCoords(bool oneRot);
1280 
1281  static int cmpYXPrimaryRot(const void *p1, const void *p2);
1282  static int cmpYXLineRot(const void *p1, const void *p2);
1283  static int cmpXYLineRot(const void *p1, const void *p2);
1284  static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1285  static int cmpXYColumnLineRot(const void *p1, const void *p2);
1286 };
1287 
1288 void TextLineFrag::init(TextLine *lineA, int startA, int lenA)
1289 {
1290  line = lineA;
1291  start = startA;
1292  len = lenA;
1293  col = line->col[start];
1294 }
1295 
1297 {
1298  TextBlock *blk;
1299  double d0, d1, d2, d3, d4;
1300 
1301  if (oneRot) {
1302 
1303  switch (line->rot) {
1304  case 0:
1305  xMin = line->edge[start];
1306  xMax = line->edge[start + len];
1307  yMin = line->yMin;
1308  yMax = line->yMax;
1309  break;
1310  case 1:
1311  xMin = line->xMin;
1312  xMax = line->xMax;
1313  yMin = line->edge[start];
1314  yMax = line->edge[start + len];
1315  break;
1316  case 2:
1317  xMin = line->edge[start + len];
1318  xMax = line->edge[start];
1319  yMin = line->yMin;
1320  yMax = line->yMax;
1321  break;
1322  case 3:
1323  xMin = line->xMin;
1324  xMax = line->xMax;
1325  yMin = line->edge[start + len];
1326  yMax = line->edge[start];
1327  break;
1328  }
1329  base = line->base;
1330 
1331  } else {
1332 
1333  if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1334 
1335  xMin = line->edge[start];
1336  xMax = line->edge[start + len];
1337  yMin = line->yMin;
1338  yMax = line->yMax;
1339  base = line->base;
1340 
1341  } else {
1342 
1343  blk = line->blk;
1344  d0 = line->edge[start];
1345  d1 = line->edge[start + len];
1346  d2 = d3 = d4 = 0; // make gcc happy
1347 
1348  switch (line->rot) {
1349  case 0:
1350  d2 = line->yMin;
1351  d3 = line->yMax;
1352  d4 = line->base;
1353  d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1354  d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1355  d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1356  d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1357  d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1358  break;
1359  case 1:
1360  d2 = line->xMax;
1361  d3 = line->xMin;
1362  d4 = line->base;
1363  d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1364  d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1365  d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1366  d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1367  d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1368  break;
1369  case 2:
1370  d2 = line->yMax;
1371  d3 = line->yMin;
1372  d4 = line->base;
1373  d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1374  d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1375  d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1376  d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1377  d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1378  break;
1379  case 3:
1380  d2 = line->xMin;
1381  d3 = line->xMax;
1382  d4 = line->base;
1383  d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1384  d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1385  d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1386  d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1387  d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1388  break;
1389  }
1390 
1391  switch (line->blk->page->primaryRot) {
1392  case 0:
1393  xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1394  xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1395  yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1396  yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1397  base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1398  break;
1399  case 1:
1400  xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1401  xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1402  yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1403  yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1404  base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1405  break;
1406  case 2:
1407  xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1408  xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1409  yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1410  yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1411  base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1412  break;
1413  case 3:
1414  xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1415  xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1416  yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1417  yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1418  base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1419  break;
1420  }
1421  }
1422  }
1423 }
1424 
1425 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2)
1426 {
1427  TextLineFrag *frag1 = (TextLineFrag *)p1;
1428  TextLineFrag *frag2 = (TextLineFrag *)p2;
1429  double cmp;
1430 
1431  cmp = 0; // make gcc happy
1432  switch (frag1->line->blk->page->primaryRot) {
1433  case 0:
1434  if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1435  cmp = frag1->xMin - frag2->xMin;
1436  }
1437  break;
1438  case 1:
1439  if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1440  cmp = frag1->yMin - frag2->yMin;
1441  }
1442  break;
1443  case 2:
1444  if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1445  cmp = frag2->xMax - frag1->xMax;
1446  }
1447  break;
1448  case 3:
1449  if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1450  cmp = frag2->yMax - frag1->yMax;
1451  }
1452  break;
1453  }
1454  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1455 }
1456 
1457 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2)
1458 {
1459  TextLineFrag *frag1 = (TextLineFrag *)p1;
1460  TextLineFrag *frag2 = (TextLineFrag *)p2;
1461  double cmp;
1462 
1463  cmp = 0; // make gcc happy
1464  switch (frag1->line->rot) {
1465  case 0:
1466  if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1467  cmp = frag1->xMin - frag2->xMin;
1468  }
1469  break;
1470  case 1:
1471  if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1472  cmp = frag1->yMin - frag2->yMin;
1473  }
1474  break;
1475  case 2:
1476  if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1477  cmp = frag2->xMax - frag1->xMax;
1478  }
1479  break;
1480  case 3:
1481  if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1482  cmp = frag2->yMax - frag1->yMax;
1483  }
1484  break;
1485  }
1486  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1487 }
1488 
1489 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2)
1490 {
1491  TextLineFrag *frag1 = (TextLineFrag *)p1;
1492  TextLineFrag *frag2 = (TextLineFrag *)p2;
1493  double cmp;
1494 
1495  cmp = 0; // make gcc happy
1496  switch (frag1->line->rot) {
1497  case 0:
1498  if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1499  cmp = frag1->yMin - frag2->yMin;
1500  }
1501  break;
1502  case 1:
1503  if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1504  cmp = frag2->xMax - frag1->xMax;
1505  }
1506  break;
1507  case 2:
1508  if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1509  cmp = frag2->yMin - frag1->yMin;
1510  }
1511  break;
1512  case 3:
1513  if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1514  cmp = frag1->xMax - frag2->xMax;
1515  }
1516  break;
1517  }
1518  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1519 }
1520 
1521 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2)
1522 {
1523  TextLineFrag *frag1 = (TextLineFrag *)p1;
1524  TextLineFrag *frag2 = (TextLineFrag *)p2;
1525  double cmp;
1526 
1527  // if columns overlap, compare y values
1528  if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1529  cmp = 0; // make gcc happy
1530  switch (frag1->line->blk->page->primaryRot) {
1531  case 0:
1532  cmp = frag1->yMin - frag2->yMin;
1533  break;
1534  case 1:
1535  cmp = frag2->xMax - frag1->xMax;
1536  break;
1537  case 2:
1538  cmp = frag2->yMin - frag1->yMin;
1539  break;
1540  case 3:
1541  cmp = frag1->xMax - frag2->xMax;
1542  break;
1543  }
1544  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1545  }
1546 
1547  // otherwise, compare starting column
1548  return frag1->col - frag2->col;
1549 }
1550 
1551 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2)
1552 {
1553  TextLineFrag *frag1 = (TextLineFrag *)p1;
1554  TextLineFrag *frag2 = (TextLineFrag *)p2;
1555  double cmp;
1556 
1557  // if columns overlap, compare y values
1558  if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1559  cmp = 0; // make gcc happy
1560  switch (frag1->line->rot) {
1561  case 0:
1562  cmp = frag1->yMin - frag2->yMin;
1563  break;
1564  case 1:
1565  cmp = frag2->xMax - frag1->xMax;
1566  break;
1567  case 2:
1568  cmp = frag2->yMin - frag1->yMin;
1569  break;
1570  case 3:
1571  cmp = frag1->xMax - frag2->xMax;
1572  break;
1573  }
1574  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1575  }
1576 
1577  // otherwise, compare starting column
1578  return frag1->col - frag2->col;
1579 }
1580 
1581 //------------------------------------------------------------------------
1582 // TextBlock
1583 //------------------------------------------------------------------------
1584 
1586 {
1587  page = pageA;
1588  rot = rotA;
1589  xMin = yMin = 0;
1590  xMax = yMax = -1;
1591  priMin = 0;
1592  priMax = page->pageWidth;
1593  pool = new TextPool();
1594  lines = nullptr;
1595  curLine = nullptr;
1596  next = nullptr;
1597  stackNext = nullptr;
1598  tableId = -1;
1599  tableEnd = false;
1600 }
1601 
1603 {
1604  TextLine *line;
1605 
1606  delete pool;
1607  while (lines) {
1608  line = lines;
1609  lines = lines->next;
1610  delete line;
1611  }
1612 }
1613 
1615 {
1616  pool->addWord(word);
1617  if (xMin > xMax) {
1618  xMin = word->xMin;
1619  xMax = word->xMax;
1620  yMin = word->yMin;
1621  yMax = word->yMax;
1622  } else {
1623  if (word->xMin < xMin) {
1624  xMin = word->xMin;
1625  }
1626  if (word->xMax > xMax) {
1627  xMax = word->xMax;
1628  }
1629  if (word->yMin < yMin) {
1630  yMin = word->yMin;
1631  }
1632  if (word->yMax > yMax) {
1633  yMax = word->yMax;
1634  }
1635  }
1636 }
1637 
1638 void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
1639 {
1640  TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1641  TextLine *line, *line0, *line1;
1642  int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1643  int baseIdx, bestWordBaseIdx, idx0, idx1;
1644  double minBase, maxBase;
1645  double fontSize, wordSpacing, delta, priDelta, secDelta;
1646  TextLine **lineArray;
1647  bool found, overlap;
1648  int col1, col2;
1649  int i, j, k;
1650 
1651  // discard duplicated text (fake boldface, drop shadows)
1652  for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1653  word0 = pool->getPool(idx0);
1654  while (word0) {
1655  priDelta = dupMaxPriDelta * word0->fontSize;
1656  secDelta = dupMaxSecDelta * word0->fontSize;
1657  maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1658  found = false;
1659  word1 = word2 = nullptr; // make gcc happy
1660  for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1661  if (idx1 == idx0) {
1662  word1 = word0;
1663  word2 = word0->next;
1664  } else {
1665  word1 = nullptr;
1666  word2 = pool->getPool(idx1);
1667  }
1668  for (; word2; word1 = word2, word2 = word2->next) {
1669  if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode))) {
1670  switch (rot) {
1671  case 0:
1672  case 2:
1673  found = fabs(word0->xMin - word2->xMin) < priDelta && fabs(word0->xMax - word2->xMax) < priDelta && fabs(word0->yMin - word2->yMin) < secDelta && fabs(word0->yMax - word2->yMax) < secDelta;
1674  break;
1675  case 1:
1676  case 3:
1677  found = fabs(word0->xMin - word2->xMin) < secDelta && fabs(word0->xMax - word2->xMax) < secDelta && fabs(word0->yMin - word2->yMin) < priDelta && fabs(word0->yMax - word2->yMax) < priDelta;
1678  break;
1679  }
1680  }
1681  if (found) {
1682  break;
1683  }
1684  }
1685  if (found) {
1686  break;
1687  }
1688  }
1689  if (found) {
1690  if (word1) {
1691  word1->next = word2->next;
1692  } else {
1693  pool->setPool(idx1, word2->next);
1694  }
1695  delete word2;
1696  } else {
1697  word0 = word0->next;
1698  }
1699  }
1700  }
1701 
1702  // build the lines
1703  curLine = nullptr;
1704  poolMinBaseIdx = pool->minBaseIdx;
1705  charCount = 0;
1706  nLines = 0;
1707  while (true) {
1708 
1709  // find the first non-empty line in the pool
1710  for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
1711  ;
1712  if (poolMinBaseIdx > pool->maxBaseIdx) {
1713  break;
1714  }
1715 
1716  // look for the left-most word in the first four lines of the
1717  // pool -- this avoids starting with a superscript word
1718  startBaseIdx = poolMinBaseIdx;
1719  for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1720  if (!pool->getPool(baseIdx)) {
1721  continue;
1722  }
1723  if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
1724  startBaseIdx = baseIdx;
1725  }
1726  }
1727 
1728  // create a new line
1729  word0 = pool->getPool(startBaseIdx);
1730  pool->setPool(startBaseIdx, word0->next);
1731  word0->next = nullptr;
1732  line = new TextLine(this, word0->rot, word0->base);
1733  line->addWord(word0);
1734  lastWord = word0;
1735 
1736  // compute the search range
1737  fontSize = word0->fontSize;
1738  minBase = word0->base - maxIntraLineDelta * fontSize;
1739  maxBase = word0->base + maxIntraLineDelta * fontSize;
1740  minBaseIdx = pool->getBaseIdx(minBase);
1741  maxBaseIdx = pool->getBaseIdx(maxBase);
1743 
1744  // find the rest of the words in this line
1745  while (true) {
1746 
1747  // find the left-most word whose baseline is in the range for
1748  // this line
1749  bestWordBaseIdx = 0;
1750  bestWord0 = bestWord1 = nullptr;
1751  overlap = false;
1752  for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1753  for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1754  if (word1->base >= minBase && word1->base <= maxBase) {
1755  delta = lastWord->primaryDelta(word1);
1756  if (delta < minCharSpacing * fontSize) {
1757  overlap = true;
1758  break;
1759  } else {
1760  if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1761  bestWordBaseIdx = baseIdx;
1762  bestWord0 = word0;
1763  bestWord1 = word1;
1764  }
1765  break;
1766  }
1767  }
1768  }
1769  }
1770  if (overlap || !bestWord1) {
1771  break;
1772  }
1773 
1774  // remove it from the pool, and add it to the line
1775  if (bestWord0) {
1776  bestWord0->next = bestWord1->next;
1777  } else {
1778  pool->setPool(bestWordBaseIdx, bestWord1->next);
1779  }
1780  bestWord1->next = nullptr;
1781  line->addWord(bestWord1);
1782  lastWord = bestWord1;
1783  }
1784 
1785  // add the line
1786  if (curLine && line->cmpYX(curLine) > 0) {
1787  line0 = curLine;
1788  line1 = curLine->next;
1789  } else {
1790  line0 = nullptr;
1791  line1 = lines;
1792  }
1793  for (; line1 && line->cmpYX(line1) > 0; line0 = line1, line1 = line1->next)
1794  ;
1795  if (line0) {
1796  line0->next = line;
1797  } else {
1798  lines = line;
1799  }
1800  line->next = line1;
1801  curLine = line;
1802  line->coalesce(uMap);
1803  charCount += line->len;
1804  ++nLines;
1805  }
1806 
1807  // sort lines into xy order for column assignment
1808  lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1809  for (line = lines, i = 0; line; line = line->next, ++i) {
1810  lineArray[i] = line;
1811  }
1812  qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1813 
1814  // column assignment
1815  nColumns = 0;
1816  if (fixedPitch) {
1817  for (i = 0; i < nLines; ++i) {
1818  line0 = lineArray[i];
1819  col1 = 0; // make gcc happy
1820  switch (rot) {
1821  case 0:
1822  col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1823  break;
1824  case 1:
1825  col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1826  break;
1827  case 2:
1828  col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1829  break;
1830  case 3:
1831  col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1832  break;
1833  }
1834  for (k = 0; k <= line0->len; ++k) {
1835  line0->col[k] += col1;
1836  }
1837  if (line0->col[line0->len] > nColumns) {
1838  nColumns = line0->col[line0->len];
1839  }
1840  }
1841  } else {
1842  for (i = 0; i < nLines; ++i) {
1843  line0 = lineArray[i];
1844  col1 = 0;
1845  for (j = 0; j < i; ++j) {
1846  line1 = lineArray[j];
1847  if (line1->primaryDelta(line0) >= 0) {
1848  col2 = line1->col[line1->len] + 1;
1849  } else {
1850  k = 0; // make gcc happy
1851  switch (rot) {
1852  case 0:
1853  for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1854  ;
1855  break;
1856  case 1:
1857  for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1858  ;
1859  break;
1860  case 2:
1861  for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1862  ;
1863  break;
1864  case 3:
1865  for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1866  ;
1867  break;
1868  }
1869  col2 = line1->col[k];
1870  }
1871  if (col2 > col1) {
1872  col1 = col2;
1873  }
1874  }
1875  for (k = 0; k <= line0->len; ++k) {
1876  line0->col[k] += col1;
1877  }
1878  if (line0->col[line0->len] > nColumns) {
1879  nColumns = line0->col[line0->len];
1880  }
1881  }
1882  }
1883  gfree(lineArray);
1884 }
1885 
1887 {
1888  double newPriMin, newPriMax;
1889  bool gotPriMin, gotPriMax;
1890 
1891  gotPriMin = gotPriMax = false;
1892  newPriMin = newPriMax = 0; // make gcc happy
1893  switch (page->primaryRot) {
1894  case 0:
1895  case 2:
1896  if (blk->yMin < yMax && blk->yMax > yMin) {
1897  if (blk->xMin < xMin) {
1898  newPriMin = blk->xMax;
1899  gotPriMin = true;
1900  }
1901  if (blk->xMax > xMax) {
1902  newPriMax = blk->xMin;
1903  gotPriMax = true;
1904  }
1905  }
1906  break;
1907  case 1:
1908  case 3:
1909  if (blk->xMin < xMax && blk->xMax > xMin) {
1910  if (blk->yMin < yMin) {
1911  newPriMin = blk->yMax;
1912  gotPriMin = true;
1913  }
1914  if (blk->yMax > yMax) {
1915  newPriMax = blk->yMin;
1916  gotPriMax = true;
1917  }
1918  }
1919  break;
1920  }
1921  if (gotPriMin) {
1922  if (newPriMin > xMin) {
1923  newPriMin = xMin;
1924  }
1925  if (newPriMin > priMin) {
1926  priMin = newPriMin;
1927  }
1928  }
1929  if (gotPriMax) {
1930  if (newPriMax < xMax) {
1931  newPriMax = xMax;
1932  }
1933  if (newPriMax < priMax) {
1934  priMax = newPriMax;
1935  }
1936  }
1937 }
1938 
1939 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2)
1940 {
1941  TextBlock *blk1 = *(TextBlock **)p1;
1942  TextBlock *blk2 = *(TextBlock **)p2;
1943  double cmp;
1944 
1945  cmp = 0; // make gcc happy
1946  switch (blk1->page->primaryRot) {
1947  case 0:
1948  if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1949  cmp = blk1->yMin - blk2->yMin;
1950  }
1951  break;
1952  case 1:
1953  if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1954  cmp = blk2->xMax - blk1->xMax;
1955  }
1956  break;
1957  case 2:
1958  if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1959  cmp = blk2->yMin - blk1->yMin;
1960  }
1961  break;
1962  case 3:
1963  if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1964  cmp = blk1->xMax - blk2->xMax;
1965  }
1966  break;
1967  }
1968  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1969 }
1970 
1971 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2)
1972 {
1973  TextBlock *blk1 = *(TextBlock **)p1;
1974  TextBlock *blk2 = *(TextBlock **)p2;
1975  double cmp;
1976 
1977  cmp = 0; // make gcc happy
1978  switch (blk1->page->primaryRot) {
1979  case 0:
1980  if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1981  cmp = blk1->xMin - blk2->xMin;
1982  }
1983  break;
1984  case 1:
1985  if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1986  cmp = blk1->yMin - blk2->yMin;
1987  }
1988  break;
1989  case 2:
1990  if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1991  cmp = blk2->xMax - blk1->xMax;
1992  }
1993  break;
1994  case 3:
1995  if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1996  cmp = blk2->yMax - blk1->yMax;
1997  }
1998  break;
1999  }
2000  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2001 }
2002 
2003 int TextBlock::primaryCmp(const TextBlock *blk) const
2004 {
2005  double cmp;
2006 
2007  cmp = 0; // make gcc happy
2008  switch (rot) {
2009  case 0:
2010  cmp = xMin - blk->xMin;
2011  break;
2012  case 1:
2013  cmp = yMin - blk->yMin;
2014  break;
2015  case 2:
2016  cmp = blk->xMax - xMax;
2017  break;
2018  case 3:
2019  cmp = blk->yMax - yMax;
2020  break;
2021  }
2022  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2023 }
2024 
2025 double TextBlock::secondaryDelta(const TextBlock *blk) const
2026 {
2027  double delta;
2028 
2029  delta = 0; // make gcc happy
2030  switch (rot) {
2031  case 0:
2032  delta = blk->yMin - yMax;
2033  break;
2034  case 1:
2035  delta = xMin - blk->xMax;
2036  break;
2037  case 2:
2038  delta = yMin - blk->yMax;
2039  break;
2040  case 3:
2041  delta = blk->xMin - xMax;
2042  break;
2043  }
2044  return delta;
2045 }
2046 
2047 bool TextBlock::isBelow(const TextBlock *blk) const
2048 {
2049  bool below;
2050 
2051  below = false; // make gcc happy
2052  switch (page->primaryRot) {
2053  case 0:
2054  below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2055  break;
2056  case 1:
2057  below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2058  break;
2059  case 2:
2060  below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2061  break;
2062  case 3:
2063  below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2064  break;
2065  }
2066 
2067  return below;
2068 }
2069 
2071 {
2072  bool before = false;
2073  bool overlap = false;
2074 
2075  switch (this->page->primaryRot) {
2076  case 0:
2077  case 2:
2078  overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2079  break;
2080  case 1:
2081  case 3:
2082  overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2083  break;
2084  }
2085  switch (this->page->primaryRot) {
2086  case 0:
2087  before = overlap && this->EyMin < blk1->EyMin;
2088  break;
2089  case 1:
2090  before = overlap && this->ExMax > blk1->ExMax;
2091  break;
2092  case 2:
2093  before = overlap && this->EyMax > blk1->EyMax;
2094  break;
2095  case 3:
2096  before = overlap && this->ExMin < blk1->ExMin;
2097  break;
2098  }
2099  return before;
2100 }
2101 
2103 {
2104  double cmp = 0;
2105  int rotLR = rot;
2106 
2107  if (!page->primaryLR) {
2108  rotLR = (rotLR + 2) % 4;
2109  }
2110 
2111  switch (rotLR) {
2112  case 0:
2113  cmp = ExMax - blk1->ExMin;
2114  break;
2115  case 1:
2116  cmp = EyMin - blk1->EyMax;
2117  break;
2118  case 2:
2119  cmp = blk1->ExMax - ExMin;
2120  break;
2121  case 3:
2122  cmp = blk1->EyMin - EyMax;
2123  break;
2124  }
2125  return cmp <= 0;
2126 }
2127 
2128 // Sort into reading order by performing a topological sort using the rules
2129 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2130 // See http://pubs.iupr.org/#2003-breuel-sdiut
2131 // Topological sort is done by depth first search, see
2132 // http://en.wikipedia.org/wiki/Topological_sorting
2133 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize)
2134 {
2135  int pos2;
2136  TextBlock *blk1, *blk2, *blk3;
2137  bool before;
2138 
2139  if (visited[pos1]) {
2140  return sortPos;
2141  }
2142 
2143  blk1 = this;
2144 
2145 #if 0 // for debugging
2146  printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2147  sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2148 #endif
2149  visited[pos1] = true;
2150  pos2 = -1;
2151  for (blk2 = blkList; blk2; blk2 = blk2->next) {
2152  pos2++;
2153  if (visited[pos2]) {
2154  // skip visited nodes
2155  continue;
2156  }
2157  before = false;
2158 
2159  // is blk2 before blk1? (for table entries)
2160  if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2161  if (page->primaryLR) {
2162  if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2163  before = true;
2164  } else {
2165  if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2166  before = true;
2167  }
2168 
2169  if (blk2->yMax <= blk1->yMin)
2170  before = true;
2171  } else {
2172  if (blk2->isBeforeByRule1(blk1)) {
2173  // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2174  before = true;
2175 #if 0 // for debugging
2176  printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2177  blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2178  blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2179 #endif
2180  } else if (blk2->isBeforeByRule2(blk1)) {
2181  // Rule (2) blk2 left of blk1, and no intervening blk3
2182  // such that blk1 is before blk3 by rule 1,
2183  // and blk3 is before blk2 by rule 1.
2184  before = true;
2185  for (int i = 0; i < cacheSize && cache[i]; ++i) {
2186  if (blk1->isBeforeByRule1(cache[i]) && cache[i]->isBeforeByRule1(blk2)) {
2187  before = false;
2188  std::rotate(cache, cache + i, cache + i + 1);
2189  break;
2190  }
2191  }
2192 
2193  if (before) {
2194  for (blk3 = blkList; blk3; blk3 = blk3->next) {
2195  if (blk3 == blk2 || blk3 == blk1) {
2196  continue;
2197  }
2198  if (blk1->isBeforeByRule1(blk3) && blk3->isBeforeByRule1(blk2)) {
2199  before = false;
2200  std::copy_backward(cache, cache + cacheSize - 1, cache + cacheSize);
2201  cache[0] = blk3;
2202  break;
2203  }
2204  }
2205  }
2206 #if 0 // for debugging
2207  if (before) {
2208  printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2209  blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2210  blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2211  }
2212 #endif
2213  }
2214  }
2215  if (before) {
2216  // blk2 is before blk1, so it needs to be visited
2217  // before we can add blk1 to the sorted list.
2218  sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited, cache, cacheSize);
2219  }
2220  }
2221 #if 0 // for debugging
2222  printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2223  sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2224 #endif
2225  sorted[sortPos++] = blk1;
2226  return sortPos;
2227 }
2228 
2229 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited)
2230 {
2231  const int blockCacheSize = 4;
2232  TextBlock *blockCache[blockCacheSize];
2233  std::fill(blockCache, blockCache + blockCacheSize, nullptr);
2234  return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, blockCache, blockCacheSize);
2235 }
2236 
2237 //------------------------------------------------------------------------
2238 // TextFlow
2239 //------------------------------------------------------------------------
2240 
2242 {
2243  page = pageA;
2244  xMin = blk->xMin;
2245  xMax = blk->xMax;
2246  yMin = blk->yMin;
2247  yMax = blk->yMax;
2248  priMin = blk->priMin;
2249  priMax = blk->priMax;
2250  blocks = lastBlk = blk;
2251  next = nullptr;
2252 }
2253 
2255 {
2256  TextBlock *blk;
2257 
2258  while (blocks) {
2259  blk = blocks;
2260  blocks = blocks->next;
2261  delete blk;
2262  }
2263 }
2264 
2266 {
2267  if (lastBlk) {
2268  lastBlk->next = blk;
2269  } else {
2270  blocks = blk;
2271  }
2272  lastBlk = blk;
2273  if (blk->xMin < xMin) {
2274  xMin = blk->xMin;
2275  }
2276  if (blk->xMax > xMax) {
2277  xMax = blk->xMax;
2278  }
2279  if (blk->yMin < yMin) {
2280  yMin = blk->yMin;
2281  }
2282  if (blk->yMax > yMax) {
2283  yMax = blk->yMax;
2284  }
2285 }
2286 
2287 bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const
2288 {
2289  bool fits;
2290 
2291  // lower blocks must use smaller fonts
2292  if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2293  return false;
2294  }
2295 
2296  fits = false; // make gcc happy
2297  switch (page->primaryRot) {
2298  case 0:
2299  fits = blk->xMin >= priMin && blk->xMax <= priMax;
2300  break;
2301  case 1:
2302  fits = blk->yMin >= priMin && blk->yMax <= priMax;
2303  break;
2304  case 2:
2305  fits = blk->xMin >= priMin && blk->xMax <= priMax;
2306  break;
2307  case 3:
2308  fits = blk->yMin >= priMin && blk->yMax <= priMax;
2309  break;
2310  }
2311  return fits;
2312 }
2313 
2314 #ifdef TEXTOUT_WORD_LIST
2315 
2316 //------------------------------------------------------------------------
2317 // TextWordList
2318 //------------------------------------------------------------------------
2319 
2321 {
2322  TextFlow *flow;
2323  TextBlock *blk;
2324  TextLine *line;
2325  TextWord *word;
2326  TextWord **wordArray;
2327  int nWords, i;
2328 
2329  words = new std::vector<TextWord *>();
2330 
2331  if (text->rawOrder) {
2332  for (word = text->rawWords; word; word = word->next) {
2333  words->push_back(word);
2334  }
2335 
2336  } else if (physLayout) {
2337  // this is inefficient, but it's also the least useful of these
2338  // three cases
2339  nWords = 0;
2340  for (flow = text->flows; flow; flow = flow->next) {
2341  for (blk = flow->blocks; blk; blk = blk->next) {
2342  for (line = blk->lines; line; line = line->next) {
2343  for (word = line->words; word; word = word->next) {
2344  ++nWords;
2345  }
2346  }
2347  }
2348  }
2349  wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
2350  i = 0;
2351  for (flow = text->flows; flow; flow = flow->next) {
2352  for (blk = flow->blocks; blk; blk = blk->next) {
2353  for (line = blk->lines; line; line = line->next) {
2354  for (word = line->words; word; word = word->next) {
2355  wordArray[i++] = word;
2356  }
2357  }
2358  }
2359  }
2360  qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
2361  for (i = 0; i < nWords; ++i) {
2362  words->push_back(wordArray[i]);
2363  }
2364  gfree(wordArray);
2365 
2366  } else {
2367  for (flow = text->flows; flow; flow = flow->next) {
2368  for (blk = flow->blocks; blk; blk = blk->next) {
2369  for (line = blk->lines; line; line = line->next) {
2370  for (word = line->words; word; word = word->next) {
2371  words->push_back(word);
2372  }
2373  }
2374  }
2375  }
2376  }
2377 }
2378 
2380 {
2381  delete words;
2382 }
2383 
2385 {
2386  return words->size();
2387 }
2388 
2390 {
2391  if (idx < 0 || idx >= (int)words->size()) {
2392  return nullptr;
2393  }
2394  return (*words)[idx];
2395 }
2396 
2397 #endif // TEXTOUT_WORD_LIST
2398 
2399 //------------------------------------------------------------------------
2400 // TextPage
2401 //------------------------------------------------------------------------
2402 
2403 TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2404 {
2405  int rot;
2406 
2407  refCnt = 1;
2408  rawOrder = rawOrderA;
2409  discardDiag = discardDiagA;
2410  curWord = nullptr;
2411  charPos = 0;
2412  curFont = nullptr;
2413  curFontSize = 0;
2414  nest = 0;
2415  nTinyChars = 0;
2416  lastCharOverlap = false;
2417  if (!rawOrder) {
2418  for (rot = 0; rot < 4; ++rot) {
2419  pools[rot] = new TextPool();
2420  }
2421  }
2422  flows = nullptr;
2423  blocks = nullptr;
2424  rawWords = nullptr;
2425  rawLastWord = nullptr;
2426  fonts = new std::vector<TextFontInfo *>();
2427  lastFindXMin = lastFindYMin = 0;
2428  haveLastFind = false;
2429  underlines = new std::vector<TextUnderline *>();
2430  links = new std::vector<TextLink *>();
2431  mergeCombining = true;
2432  diagonal = false;
2433 }
2434 
2436 {
2437  int rot;
2438 
2439  clear();
2440  if (!rawOrder) {
2441  for (rot = 0; rot < 4; ++rot) {
2442  delete pools[rot];
2443  }
2444  }
2445  delete fonts;
2446  for (auto entry : *underlines) {
2447  delete entry;
2448  }
2449  delete underlines;
2450  for (auto entry : *links) {
2451  delete entry;
2452  }
2453  delete links;
2454 }
2455 
2457 {
2458  refCnt++;
2459 }
2460 
2462 {
2463  if (--refCnt == 0)
2464  delete this;
2465 }
2466 
2468 {
2469  clear();
2470  if (state) {
2471  pageWidth = state->getPageWidth();
2472  pageHeight = state->getPageHeight();
2473  } else {
2474  pageWidth = pageHeight = 0;
2475  }
2476 }
2477 
2479 {
2480  if (curWord) {
2481  endWord();
2482  }
2483 }
2484 
2485 void TextPage::clear()
2486 {
2487  int rot;
2488  TextFlow *flow;
2489  TextWord *word;
2490 
2491  if (curWord) {
2492  delete curWord;
2493  curWord = nullptr;
2494  }
2495  if (rawOrder) {
2496  while (rawWords) {
2497  word = rawWords;
2498  rawWords = rawWords->next;
2499  delete word;
2500  }
2501  } else {
2502  for (rot = 0; rot < 4; ++rot) {
2503  delete pools[rot];
2504  }
2505  while (flows) {
2506  flow = flows;
2507  flows = flows->next;
2508  delete flow;
2509  }
2510  gfree(blocks);
2511  }
2512  for (auto entry : *fonts) {
2513  delete entry;
2514  }
2515  delete fonts;
2516  for (auto entry : *underlines) {
2517  delete entry;
2518  }
2519  delete underlines;
2520  for (auto entry : *links) {
2521  delete entry;
2522  }
2523  delete links;
2524 
2525  diagonal = false;
2526  curWord = nullptr;
2527  charPos = 0;
2528  curFont = nullptr;
2529  curFontSize = 0;
2530  nest = 0;
2531  nTinyChars = 0;
2532  if (!rawOrder) {
2533  for (rot = 0; rot < 4; ++rot) {
2534  pools[rot] = new TextPool();
2535  }
2536  }
2537  flows = nullptr;
2538  blocks = nullptr;
2539  rawWords = nullptr;
2540  rawLastWord = nullptr;
2541  fonts = new std::vector<TextFontInfo *>();
2542  underlines = new std::vector<TextUnderline *>();
2543  links = new std::vector<TextLink *>();
2544 }
2545 
2547 {
2548  GfxFont *gfxFont;
2549  const double *fm;
2550  const char *name;
2551  int code, mCode, letterCode, anyCode;
2552  double w;
2553 
2554  // get the font info object
2555  curFont = nullptr;
2556  for (TextFontInfo *f : *fonts) {
2557  curFont = f;
2558  if (curFont->matches(state)) {
2559  break;
2560  }
2561  curFont = nullptr;
2562  }
2563  if (!curFont) {
2564  curFont = new TextFontInfo(state);
2565  fonts->push_back(curFont);
2566  }
2567 
2568  // adjust the font size
2569  gfxFont = state->getFont();
2570  curFontSize = state->getTransformedFontSize();
2571  if (gfxFont && gfxFont->getType() == fontType3) {
2572  // This is a hack which makes it possible to deal with some Type 3
2573  // fonts. The problem is that it's impossible to know what the
2574  // base coordinate system used in the font is without actually
2575  // rendering the font. This code tries to guess by looking at the
2576  // width of the character 'm' (which breaks if the font is a
2577  // subset that doesn't contain 'm').
2578  mCode = letterCode = anyCode = -1;
2579  for (code = 0; code < 256; ++code) {
2580  name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2581  int nameLen = name ? strlen(name) : 0;
2582  bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2583  if (nameOneChar && name[0] == 'm') {
2584  mCode = code;
2585  }
2586  if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) {
2587  letterCode = code;
2588  }
2589  if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2590  anyCode = code;
2591  }
2592  }
2593  if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2594  // 0.6 is a generic average 'm' width -- yes, this is a hack
2595  curFontSize *= w / 0.6;
2596  } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2597  // even more of a hack: 0.5 is a generic letter width
2598  curFontSize *= w / 0.5;
2599  } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2600  // better than nothing: 0.5 is a generic character width
2601  curFontSize *= w / 0.5;
2602  }
2603  fm = gfxFont->getFontMatrix();
2604  if (fm[0] != 0) {
2605  curFontSize *= fabs(fm[3] / fm[0]);
2606  }
2607  }
2608 }
2609 
2611 {
2612  GfxFont *gfxFont;
2613  const double *fontm;
2614  double m[4], m2[4];
2615  int rot;
2616 
2617  // This check is needed because Type 3 characters can contain
2618  // text-drawing operations (when TextPage is being used via
2619  // {X,Win}SplashOutputDev rather than TextOutputDev).
2620  if (curWord) {
2621  ++nest;
2622  return;
2623  }
2624 
2625  // compute the rotation
2626  state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2627  gfxFont = state->getFont();
2628  if (gfxFont && gfxFont->getType() == fontType3) {
2629  fontm = state->getFont()->getFontMatrix();
2630  m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2631  m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2632  m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2633  m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2634  m[0] = m2[0];
2635  m[1] = m2[1];
2636  m[2] = m2[2];
2637  m[3] = m2[3];
2638  }
2639  if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2640  rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2641  } else {
2642  rot = (m[2] > 0) ? 1 : 3;
2643  }
2644  if (fabs(m[0]) >= fabs(m[1])) {
2645  diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
2646  } else {
2647  diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
2648  }
2649 
2650  // for vertical writing mode, the lines are effectively rotated 90
2651  // degrees
2652  if (gfxFont && gfxFont->getWMode()) {
2653  rot = (rot + 1) & 3;
2654  }
2655 
2656  curWord = new TextWord(state, rot, curFontSize);
2657 }
2658 
2659 void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
2660 {
2661  double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2662  bool overlap;
2663  int i;
2664  int wMode;
2665  Matrix mat;
2666 
2667  // subtract char and word spacing from the dx,dy values
2668  sp = state->getCharSpace();
2669  if (c == (CharCode)0x20) {
2670  sp += state->getWordSpace();
2671  }
2672  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2673  dx -= dx2;
2674  dy -= dy2;
2675  state->transformDelta(dx, dy, &w1, &h1);
2676 
2677  // throw away chars that aren't inside the page bounds
2678  // (and also do a sanity check on the character size)
2679  state->transform(x, y, &x1, &y1);
2680  if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || x1 != x1 || y1 != y1 || // IEEE way of checking for isnan
2681  w1 != w1 || h1 != h1) {
2682  charPos += nBytes;
2683  return;
2684  }
2685 
2686  // check the tiny chars limit
2687  if (fabs(w1) < 3 && fabs(h1) < 3) {
2688  if (++nTinyChars > 50000) {
2689  charPos += nBytes;
2690  return;
2691  }
2692  }
2693 
2694  // break words at space character
2695  if (uLen == 1 && UnicodeIsWhitespace(u[0])) {
2696  charPos += nBytes;
2697  endWord();
2698  return;
2699  } else if (uLen == 1 && u[0] == (Unicode)0x0) {
2700  // ignore null characters
2701  charPos += nBytes;
2702  return;
2703  }
2704 
2705  state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
2706  mat.m[0] *= state->getHorizScaling();
2707  mat.m[1] *= state->getHorizScaling();
2708  mat.m[4] = x1;
2709  mat.m[5] = y1;
2710 
2711  if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c, u[0], mat)) {
2712  charPos += nBytes;
2713  return;
2714  }
2715 
2716  // start a new word if:
2717  // (1) this character doesn't fall in the right place relative to
2718  // the end of the previous word (this places upper and lower
2719  // constraints on the position deltas along both the primary
2720  // and secondary axes), or
2721  // (2) this character overlaps the previous one (duplicated text), or
2722  // (3) the previous character was an overlap (we want each duplicated
2723  // character to be in a word by itself at this stage),
2724  // (4) the font size has changed
2725  // (5) the WMode changed
2726  if (curWord && curWord->len > 0) {
2727  base = sp = delta = 0; // make gcc happy
2728  switch (curWord->rot) {
2729  case 0:
2730  base = y1;
2731  sp = x1 - curWord->xMax;
2732  delta = x1 - curWord->edge[curWord->len - 1];
2733  break;
2734  case 1:
2735  base = x1;
2736  sp = y1 - curWord->yMax;
2737  delta = y1 - curWord->edge[curWord->len - 1];
2738  break;
2739  case 2:
2740  base = y1;
2741  sp = curWord->xMin - x1;
2742  delta = curWord->edge[curWord->len - 1] - x1;
2743  break;
2744  case 3:
2745  base = x1;
2746  sp = curWord->yMin - y1;
2747  delta = curWord->edge[curWord->len - 1] - y1;
2748  break;
2749  }
2751  wMode = curFont->getWMode();
2752  if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) {
2753  endWord();
2754  }
2755  lastCharOverlap = overlap;
2756  } else {
2757  lastCharOverlap = false;
2758  }
2759 
2760  if (uLen != 0) {
2761  // start a new word if needed
2762  if (!curWord) {
2763  beginWord(state);
2764  }
2765 
2766  // throw away diagonal chars
2767  if (discardDiag && diagonal) {
2768  charPos += nBytes;
2769  return;
2770  }
2771 
2772  // page rotation and/or transform matrices can cause text to be
2773  // drawn in reverse order -- in this case, swap the begin/end
2774  // coordinates and break text into individual chars
2775  if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) {
2776  endWord();
2777  beginWord(state);
2778 
2779  // throw away diagonal chars
2780  if (discardDiag && diagonal) {
2781  charPos += nBytes;
2782  return;
2783  }
2784 
2785  x1 += w1;
2786  y1 += h1;
2787  w1 = -w1;
2788  h1 = -h1;
2789  }
2790 
2791  // add the characters to the current word
2792  w1 /= uLen;
2793  h1 /= uLen;
2794  for (i = 0; i < uLen; ++i) {
2795  curWord->addChar(state, curFont, x1 + i * w1, y1 + i * h1, w1, h1, charPos, nBytes, c, u[i], mat);
2796  }
2797  }
2798  charPos += nBytes;
2799 }
2800 
2801 void TextPage::incCharCount(int nChars)
2802 {
2803  charPos += nChars;
2804 }
2805 
2807 {
2808  // This check is needed because Type 3 characters can contain
2809  // text-drawing operations (when TextPage is being used via
2810  // {X,Win}SplashOutputDev rather than TextOutputDev).
2811  if (nest > 0) {
2812  --nest;
2813  return;
2814  }
2815 
2816  if (curWord) {
2817  addWord(curWord);
2818  curWord = nullptr;
2819  }
2820 }
2821 
2823 {
2824  // throw away zero-length words -- they don't have valid xMin/xMax
2825  // values, and they're useless anyway
2826  if (word->len == 0) {
2827  delete word;
2828  return;
2829  }
2830 
2831  if (rawOrder) {
2832  if (rawLastWord) {
2833  rawLastWord->next = word;
2834  } else {
2835  rawWords = word;
2836  }
2837  rawLastWord = word;
2838  } else {
2839  pools[word->rot]->addWord(word);
2840  }
2841 }
2842 
2843 void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2844 {
2845  underlines->push_back(new TextUnderline(x0, y0, x1, y1));
2846 }
2847 
2848 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2849 {
2850  links->push_back(new TextLink(xMin, yMin, xMax, yMax, link));
2851 }
2852 
2853 void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2854 {
2855  TextPool *pool;
2856  TextWord *word0, *word1, *word2;
2857  TextLine *line;
2858  TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2859  TextFlow *flow, *lastFlow;
2860  int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2861  double minBase, maxBase, newMinBase, newMaxBase;
2862  double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2863  bool found;
2864  int count[4];
2865  int lrCount;
2866  int col1, col2;
2867  int j, n;
2868 
2869  if (rawOrder) {
2870  primaryRot = 0;
2871  primaryLR = true;
2872  return;
2873  }
2874 
2875  const UnicodeMap *uMap = globalParams->getTextEncoding();
2876  blkList = nullptr;
2877  lastBlk = nullptr;
2878  nBlocks = 0;
2879  primaryRot = 0;
2880 
2881 #if 0 // for debugging
2882  printf("*** initial words ***\n");
2883  for (rot = 0; rot < 4; ++rot) {
2884  pool = pools[rot];
2885  for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2886  for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2887  printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2888  word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2889  word0->base, word0->fontSize, rot*90, word0->link);
2890  for (i = 0; i < word0->len; ++i) {
2891  fputc(word0->text[i] & 0xff, stdout);
2892  }
2893  printf("'\n");
2894  }
2895  }
2896  }
2897  printf("\n");
2898 #endif
2899 
2900 #if 0 //~ for debugging
2901  for (i = 0; i < underlines->getLength(); ++i) {
2902  underline = (TextUnderline *)underlines->get(i);
2903  printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2904  underline->x0, underline->x1, underline->y0, underline->y1,
2905  underline->horiz);
2906  }
2907 #endif
2908 
2909  if (doHTML) {
2910 
2911  //----- handle underlining
2912  for (const TextUnderline *underline : *underlines) {
2913  if (underline->horiz) {
2914  // rot = 0
2915  if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2916  startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2917  endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2918  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2919  for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2920  //~ need to check the y value against the word baseline
2921  if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2922  word0->underlined = true;
2923  }
2924  }
2925  }
2926  }
2927 
2928  // rot = 2
2929  if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2930  startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2931  endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2932  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2933  for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2934  if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2935  word0->underlined = true;
2936  }
2937  }
2938  }
2939  }
2940  } else {
2941  // rot = 1
2942  if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2943  startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2944  endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2945  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2946  for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2947  if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2948  word0->underlined = true;
2949  }
2950  }
2951  }
2952  }
2953 
2954  // rot = 3
2955  if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2956  startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2957  endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2958  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2959  for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2960  if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2961  word0->underlined = true;
2962  }
2963  }
2964  }
2965  }
2966  }
2967  }
2968 
2969  //----- handle links
2970  for (const TextLink *link : *links) {
2971  // rot = 0
2972  if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2973  startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2974  endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2975  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2976  for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2977  if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2978  word0->link = link->link;
2979  }
2980  }
2981  }
2982  }
2983 
2984  // rot = 2
2985  if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2986  startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2987  endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2988  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2989  for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2990  if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2991  word0->link = link->link;
2992  }
2993  }
2994  }
2995  }
2996 
2997  // rot = 1
2998  if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2999  startBaseIdx = pools[1]->getBaseIdx(link->xMin);
3000  endBaseIdx = pools[1]->getBaseIdx(link->xMax);
3001  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
3002  for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
3003  if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
3004  word0->link = link->link;
3005  }
3006  }
3007  }
3008  }
3009 
3010  // rot = 3
3011  if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
3012  startBaseIdx = pools[3]->getBaseIdx(link->xMin);
3013  endBaseIdx = pools[3]->getBaseIdx(link->xMax);
3014  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
3015  for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
3016  if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
3017  word0->link = link->link;
3018  }
3019  }
3020  }
3021  }
3022  }
3023  }
3024 
3025  //----- assemble the blocks
3026 
3027  //~ add an outer loop for writing mode (vertical text)
3028 
3029  // build blocks for each rotation value
3030  for (rot = 0; rot < 4; ++rot) {
3031  pool = pools[rot];
3032  poolMinBaseIdx = pool->minBaseIdx;
3033  count[rot] = 0;
3034 
3035  // add blocks until no more words are left
3036  while (true) {
3037 
3038  // find the first non-empty line in the pool
3039  for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
3040  ;
3041  if (poolMinBaseIdx > pool->maxBaseIdx) {
3042  break;
3043  }
3044 
3045  // look for the left-most word in the first four lines of the
3046  // pool -- this avoids starting with a superscript word
3047  startBaseIdx = poolMinBaseIdx;
3048  for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
3049  if (!pool->getPool(baseIdx)) {
3050  continue;
3051  }
3052  if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
3053  startBaseIdx = baseIdx;
3054  }
3055  }
3056 
3057  // create a new block
3058  word0 = pool->getPool(startBaseIdx);
3059  pool->setPool(startBaseIdx, word0->next);
3060  word0->next = nullptr;
3061  blk = new TextBlock(this, rot);
3062  blk->addWord(word0);
3063 
3064  fontSize = word0->fontSize;
3065  minBase = maxBase = word0->base;
3066  colSpace1 = minColSpacing1 * fontSize;
3067  colSpace2 = minColSpacing2 * fontSize;
3068  lineSpace = maxLineSpacingDelta * fontSize;
3069  intraLineSpace = maxIntraLineDelta * fontSize;
3070 
3071  // add words to the block
3072  do {
3073  found = false;
3074 
3075  // look for words on the line above the current top edge of
3076  // the block
3077  newMinBase = minBase;
3078  for (baseIdx = pool->getBaseIdx(minBase); baseIdx >= pool->getBaseIdx(minBase - lineSpace); --baseIdx) {
3079  word0 = nullptr;
3080  word1 = pool->getPool(baseIdx);
3081  while (word1) {
3082  if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3084  word2 = word1;
3085  if (word0) {
3086  word0->next = word1->next;
3087  } else {
3088  pool->setPool(baseIdx, word1->next);
3089  }
3090  word1 = word1->next;
3091  word2->next = nullptr;
3092  blk->addWord(word2);
3093  found = true;
3094  newMinBase = word2->base;
3095  } else {
3096  word0 = word1;
3097  word1 = word1->next;
3098  }
3099  }
3100  }
3101  minBase = newMinBase;
3102 
3103  // look for words on the line below the current bottom edge of
3104  // the block
3105  newMaxBase = maxBase;
3106  for (baseIdx = pool->getBaseIdx(maxBase); baseIdx <= pool->getBaseIdx(maxBase + lineSpace); ++baseIdx) {
3107  word0 = nullptr;
3108  word1 = pool->getPool(baseIdx);
3109  while (word1) {
3110  if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3112  word2 = word1;
3113  if (word0) {
3114  word0->next = word1->next;
3115  } else {
3116  pool->setPool(baseIdx, word1->next);
3117  }
3118  word1 = word1->next;
3119  word2->next = nullptr;
3120  blk->addWord(word2);
3121  found = true;
3122  newMaxBase = word2->base;
3123  } else {
3124  word0 = word1;
3125  word1 = word1->next;
3126  }
3127  }
3128  }
3129  maxBase = newMaxBase;
3130 
3131  // look for words that are on lines already in the block, and
3132  // that overlap the block horizontally
3133  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3134  word0 = nullptr;
3135  word1 = pool->getPool(baseIdx);
3136  while (word1) {
3137  if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3138  && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3140  word2 = word1;
3141  if (word0) {
3142  word0->next = word1->next;
3143  } else {
3144  pool->setPool(baseIdx, word1->next);
3145  }
3146  word1 = word1->next;
3147  word2->next = nullptr;
3148  blk->addWord(word2);
3149  found = true;
3150  } else {
3151  word0 = word1;
3152  word1 = word1->next;
3153  }
3154  }
3155  }
3156 
3157  // only check for outlying words (the next two chunks of code)
3158  // if we didn't find anything else
3159  if (found) {
3160  continue;
3161  }
3162 
3163  // scan down the left side of the block, looking for words
3164  // that are near (but not overlapping) the block; if there are
3165  // three or fewer, add them to the block
3166  n = 0;
3167  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3168  word1 = pool->getPool(baseIdx);
3169  while (word1) {
3170  if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3171  && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3173  ++n;
3174  break;
3175  }
3176  word1 = word1->next;
3177  }
3178  }
3179  if (n > 0 && n <= 3) {
3180  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3181  word0 = nullptr;
3182  word1 = pool->getPool(baseIdx);
3183  while (word1) {
3184  if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3185  && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3187  word2 = word1;
3188  if (word0) {
3189  word0->next = word1->next;
3190  } else {
3191  pool->setPool(baseIdx, word1->next);
3192  }
3193  word1 = word1->next;
3194  word2->next = nullptr;
3195  blk->addWord(word2);
3196  if (word2->base < minBase) {
3197  minBase = word2->base;
3198  } else if (word2->base > maxBase) {
3199  maxBase = word2->base;
3200  }
3201  found = true;
3202  break;
3203  } else {
3204  word0 = word1;
3205  word1 = word1->next;
3206  }
3207  }
3208  }
3209  }
3210 
3211  // scan down the right side of the block, looking for words
3212  // that are near (but not overlapping) the block; if there are
3213  // three or fewer, add them to the block
3214  n = 0;
3215  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3216  word1 = pool->getPool(baseIdx);
3217  while (word1) {
3218  if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3219  && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3221  ++n;
3222  break;
3223  }
3224  word1 = word1->next;
3225  }
3226  }
3227  if (n > 0 && n <= 3) {
3228  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3229  word0 = nullptr;
3230  word1 = pool->getPool(baseIdx);
3231  while (word1) {
3232  if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3233  && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3235  word2 = word1;
3236  if (word0) {
3237  word0->next = word1->next;
3238  } else {
3239  pool->setPool(baseIdx, word1->next);
3240  }
3241  word1 = word1->next;
3242  word2->next = nullptr;
3243  blk->addWord(word2);
3244  if (word2->base < minBase) {
3245  minBase = word2->base;
3246  } else if (word2->base > maxBase) {
3247  maxBase = word2->base;
3248  }
3249  found = true;
3250  break;
3251  } else {
3252  word0 = word1;
3253  word1 = word1->next;
3254  }
3255  }
3256  }
3257  }
3258 
3259  } while (found);
3260 
3261  //~ need to compute the primary writing mode (horiz/vert) in
3262  //~ addition to primary rotation
3263 
3264  // coalesce the block, and add it to the list
3265  blk->coalesce(uMap, fixedPitch);
3266  if (lastBlk) {
3267  lastBlk->next = blk;
3268  } else {
3269  blkList = blk;
3270  }
3271  lastBlk = blk;
3272  count[rot] += blk->charCount;
3273  ++nBlocks;
3274  }
3275 
3276  if (count[rot] > count[primaryRot]) {
3277  primaryRot = rot;
3278  }
3279  }
3280 
3281 #if 0 // for debugging
3282  printf("*** rotation ***\n");
3283  for (rot = 0; rot < 4; ++rot) {
3284  printf(" %d: %6d\n", rot, count[rot]);
3285  }
3286  printf(" primary rot = %d\n", primaryRot);
3287  printf("\n");
3288 #endif
3289 
3290 #if 0 // for debugging
3291  printf("*** blocks ***\n");
3292  for (blk = blkList; blk; blk = blk->next) {
3293  printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3294  blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3295  for (line = blk->lines; line; line = line->next) {
3296  printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3297  line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3298  for (word0 = line->words; word0; word0 = word0->next) {
3299  printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3300  word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3301  word0->base, word0->fontSize, word0->spaceAfter);
3302  for (i = 0; i < word0->len; ++i) {
3303  fputc(word0->text[i] & 0xff, stdout);
3304  }
3305  printf("'\n");
3306  }
3307  }
3308  }
3309  printf("\n");
3310 #endif
3311 
3312  // determine the primary direction
3313  lrCount = 0;
3314  for (blk = blkList; blk; blk = blk->next) {
3315  for (line = blk->lines; line; line = line->next) {
3316  for (word0 = line->words; word0; word0 = word0->next) {
3317  for (int i = 0; i < word0->len; ++i) {
3318  if (unicodeTypeL(word0->text[i])) {
3319  ++lrCount;
3320  } else if (unicodeTypeR(word0->text[i])) {
3321  --lrCount;
3322  }
3323  }
3324  }
3325  }
3326  }
3327  primaryLR = lrCount >= 0;
3328 
3329 #if 0 // for debugging
3330  printf("*** direction ***\n");
3331  printf("lrCount = %d\n", lrCount);
3332  printf("primaryLR = %d\n", primaryLR);
3333 #endif
3334 
3335  //----- column assignment
3336 
3337  // sort blocks into xy order for column assignment
3338  if (blocks)
3339  gfree(blocks);
3340  if (physLayout && fixedPitch) {
3341 
3342  blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3343  int i;
3344  for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3345  blocks[i] = blk;
3346  col1 = 0; // make gcc happy
3347  switch (primaryRot) {
3348  case 0:
3349  col1 = (int)(blk->xMin / fixedPitch + 0.5);
3350  break;
3351  case 1:
3352  col1 = (int)(blk->yMin / fixedPitch + 0.5);
3353  break;
3354  case 2:
3355  col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3356  break;
3357  case 3:
3358  col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3359  break;
3360  }
3361  blk->col = col1;
3362  for (line = blk->lines; line; line = line->next) {
3363  for (j = 0; j <= line->len; ++j) {
3364  line->col[j] += col1;
3365  }
3366  }
3367  }
3368 
3369  } else {
3370 
3371  // sort blocks into xy order for column assignment
3372  blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3373  int i;
3374  for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3375  blocks[i] = blk;
3376  }
3377  if (blocks) {
3379  }
3380 
3381  // column assignment
3382  for (i = 0; i < nBlocks; ++i) {
3383  blk0 = blocks[i];
3384  col1 = 0;
3385  for (j = 0; j < i; ++j) {
3386  blk1 = blocks[j];
3387  col2 = 0; // make gcc happy
3388  switch (primaryRot) {
3389  case 0:
3390  if (blk0->xMin > blk1->xMax) {
3391  col2 = blk1->col + blk1->nColumns + 3;
3392  } else if (blk1->xMax == blk1->xMin) {
3393  col2 = blk1->col;
3394  } else {
3395  col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3396  }
3397  break;
3398  case 1:
3399  if (blk0->yMin > blk1->yMax) {
3400  col2 = blk1->col + blk1->nColumns + 3;
3401  } else if (blk1->yMax == blk1->yMin) {
3402  col2 = blk1->col;
3403  } else {
3404  col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3405  }
3406  break;
3407  case 2:
3408  if (blk0->xMax < blk1->xMin) {
3409  col2 = blk1->col + blk1->nColumns + 3;
3410  } else if (blk1->xMin == blk1->xMax) {
3411  col2 = blk1->col;
3412  } else {
3413  col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3414  }
3415  break;
3416  case 3:
3417  if (blk0->yMax < blk1->yMin) {
3418  col2 = blk1->col + blk1->nColumns + 3;
3419  } else if (blk1->yMin == blk1->yMax) {
3420  col2 = blk1->col;
3421  } else {
3422  col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3423  }
3424  break;
3425  }
3426  if (col2 > col1) {
3427  col1 = col2;
3428  }
3429  }
3430  blk0->col = col1;
3431  for (line = blk0->lines; line; line = line->next) {
3432  for (j = 0; j <= line->len; ++j) {
3433  line->col[j] += col1;
3434  }
3435  }
3436  }
3437  }
3438 
3439 #if 0 // for debugging
3440  printf("*** blocks, after column assignment ***\n");
3441  for (blk = blkList; blk; blk = blk->next) {
3442  printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3443  blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3444  blk->nColumns);
3445  for (line = blk->lines; line; line = line->next) {
3446  printf(" line: col[0]=%d\n", line->col[0]);
3447  for (word0 = line->words; word0; word0 = word0->next) {
3448  printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3449  word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3450  word0->base, word0->fontSize, word0->spaceAfter);
3451  for (i = 0; i < word0->len; ++i) {
3452  fputc(word0->text[i] & 0xff, stdout);
3453  }
3454  printf("'\n");
3455  }
3456  }
3457  }
3458  printf("\n");
3459 #endif
3460 
3461  //----- reading order sort
3462 
3463  // compute space on left and right sides of each block
3464  for (int i = 0; i < nBlocks; ++i) {
3465  blk0 = blocks[i];
3466  for (j = 0; j < nBlocks; ++j) {
3467  blk1 = blocks[j];
3468  if (blk1 != blk0) {
3469  blk0->updatePriMinMax(blk1);
3470  }
3471  }
3472  }
3473 
3474 #if 0 // for debugging
3475  printf("PAGE\n");
3476 #endif
3477 
3478  int sortPos = 0;
3479  bool *visited = (bool *)gmallocn(nBlocks, sizeof(bool));
3480  for (int i = 0; i < nBlocks; i++) {
3481  visited[i] = false;
3482  }
3483 
3484  double bxMin0, byMin0, bxMin1, byMin1;
3485  int numTables = 0;
3486  int tableId = -1;
3487  int correspondenceX, correspondenceY;
3488  double xCentre1, yCentre1, xCentre2, yCentre2;
3489  double xCentre3, yCentre3, xCentre4, yCentre4;
3490  double deltaX, deltaY;
3491  TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr;
3492 
3493  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3494  blk1->ExMin = blk1->xMin;
3495  blk1->ExMax = blk1->xMax;
3496  blk1->EyMin = blk1->yMin;
3497  blk1->EyMax = blk1->yMax;
3498 
3499  bxMin0 = DBL_MAX;
3500  byMin0 = DBL_MAX;
3501  bxMin1 = DBL_MAX;
3502  byMin1 = DBL_MAX;
3503 
3504  fblk2 = nullptr;
3505  fblk3 = nullptr;
3506  fblk4 = nullptr;
3507 
3508  /* find fblk2, fblk3 and fblk4 so that
3509  * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3510  * fblk3 is under blk1 and overlap with blk1 in x axis
3511  * fblk4 is under blk1 and on the right of blk1
3512  * and they are closest to blk1
3513  */
3514  for (blk2 = blkList; blk2; blk2 = blk2->next) {
3515  if (blk2 != blk1) {
3516  if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3517  bxMin0 = blk2->xMin;
3518  fblk2 = blk2;
3519  } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3520  byMin0 = blk2->yMin;
3521  fblk3 = blk2;
3522  } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3523  bxMin1 = blk2->xMin;
3524  byMin1 = blk2->yMin;
3525  fblk4 = blk2;
3526  }
3527  }
3528  }
3529 
3530  /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3531  * fblk2 can not overlap with fblk3 in x and y
3532  * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3533  */
3534  if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3535  if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3536  || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3537  || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3538  fblk2 = nullptr;
3539  fblk3 = nullptr;
3540  fblk4 = nullptr;
3541  }
3542  }
3543 
3544  // if we found any then look whether they form a table
3545  if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3546  tableId = -1;
3547  correspondenceX = 0;
3548  correspondenceY = 0;
3549  deltaX = 0.0;
3550  deltaY = 0.0;
3551 
3552  if (blk1->lines && blk1->lines->words)
3553  deltaX = blk1->lines->words->getFontSize();
3554  if (fblk2->lines && fblk2->lines->words)
3555  deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3556  if (fblk3->lines && fblk3->lines->words)
3557  deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3558  if (fblk4->lines && fblk4->lines->words)
3559  deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3560 
3561  deltaY = deltaX;
3562 
3563  deltaX *= minColSpacing1;
3564  deltaY *= maxIntraLineDelta;
3565 
3566  xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3567  yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3568  xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3569  yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3570  xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3571  yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3572  xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3573  yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3574 
3575  // are blocks centrally aligned in x ?
3576  if (fabs(xCentre1 - xCentre3) <= deltaX && fabs(xCentre2 - xCentre4) <= deltaX)
3577  correspondenceX++;
3578 
3579  // are blocks centrally aligned in y ?
3580  if (fabs(yCentre1 - yCentre2) <= deltaY && fabs(yCentre3 - yCentre4) <= deltaY)
3581  correspondenceY++;
3582 
3583  // are blocks aligned to the left ?
3584  if (fabs(blk1->xMin - fblk3->xMin) <= deltaX && fabs(fblk2->xMin - fblk4->xMin) <= deltaX)
3585  correspondenceX++;
3586 
3587  // are blocks aligned to the right ?
3588  if (fabs(blk1->xMax - fblk3->xMax) <= deltaX && fabs(fblk2->xMax - fblk4->xMax) <= deltaX)
3589  correspondenceX++;
3590 
3591  // are blocks aligned to the top ?
3592  if (fabs(blk1->yMin - fblk2->yMin) <= deltaY && fabs(fblk3->yMin - fblk4->yMin) <= deltaY)
3593  correspondenceY++;
3594 
3595  // are blocks aligned to the bottom ?
3596  if (fabs(blk1->yMax - fblk2->yMax) <= deltaY && fabs(fblk3->yMax - fblk4->yMax) <= deltaY)
3597  correspondenceY++;
3598 
3599  // are blocks aligned in x and y ?
3600  if (correspondenceX > 0 && correspondenceY > 0) {
3601 
3602  // find maximal tableId
3603  tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3604  tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3605  tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3606  tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3607 
3608  // if the tableId is -1, then we found new table
3609  if (tableId < 0) {
3610  tableId = numTables;
3611  numTables++;
3612  }
3613 
3614  blk1->tableId = tableId;
3615  fblk2->tableId = tableId;
3616  fblk3->tableId = tableId;
3617  fblk4->tableId = tableId;
3618  }
3619  }
3620  }
3621 
3622  /* set extended bounding boxes of all table entries
3623  * so that they contain whole table
3624  * (we need to process whole table size when comparing it
3625  * with regular text blocks)
3626  */
3627  PDFRectangle *envelopes = new PDFRectangle[numTables];
3628  TextBlock **ending_blocks = new TextBlock *[numTables];
3629 
3630  for (int i = 0; i < numTables; i++) {
3631  envelopes[i].x1 = DBL_MAX;
3632  envelopes[i].x2 = DBL_MIN;
3633  envelopes[i].y1 = DBL_MAX;
3634  envelopes[i].y2 = DBL_MIN;
3635  }
3636 
3637  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3638  if (blk1->tableId >= 0) {
3639  if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3640  envelopes[blk1->tableId].x1 = blk1->ExMin;
3641  if (!blk1->page->primaryLR)
3642  ending_blocks[blk1->tableId] = blk1;
3643  }
3644 
3645  if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3646  envelopes[blk1->tableId].x2 = blk1->ExMax;
3647  if (blk1->page->primaryLR)
3648  ending_blocks[blk1->tableId] = blk1;
3649  }
3650 
3651  envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3652  envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3653  }
3654  }
3655 
3656  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3657  if (blk1->tableId >= 0 && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3658  blk1->tableEnd = true;
3659  }
3660  }
3661 
3662  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3663  if (blk1->tableId >= 0) {
3664  blk1->ExMin = envelopes[blk1->tableId].x1;
3665  blk1->ExMax = envelopes[blk1->tableId].x2;
3666  blk1->EyMin = envelopes[blk1->tableId].y1;
3667  blk1->EyMax = envelopes[blk1->tableId].y2;
3668  }
3669  }
3670  delete[] envelopes;
3671  delete[] ending_blocks;
3672 
3673  /* set extended bounding boxes of all other blocks
3674  * so that they extend in x without hitting neighbours
3675  */
3676  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3677  if (!(blk1->tableId >= 0)) {
3678  double xMax = DBL_MAX;
3679  double xMin = DBL_MIN;
3680 
3681  for (blk2 = blkList; blk2; blk2 = blk2->next) {
3682  if (blk2 == blk1)
3683  continue;
3684 
3685  if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3686  if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3687  xMax = blk2->xMin;
3688 
3689  if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3690  xMin = blk2->xMax;
3691  }
3692  }
3693 
3694  for (blk2 = blkList; blk2; blk2 = blk2->next) {
3695  if (blk2 == blk1)
3696  continue;
3697 
3698  if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3699  blk1->ExMax = blk2->xMax;
3700  }
3701 
3702  if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax)
3703  blk1->ExMin = blk2->xMin;
3704  }
3705  }
3706  }
3707 
3708  int i = -1;
3709  for (blk1 = blkList; blk1; blk1 = blk1->next) {
3710  i++;
3711  sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3712  }
3713  if (visited) {
3714  gfree(visited);
3715  }
3716 
3717 #if 0 // for debugging
3718  printf("*** blocks, after ro sort ***\n");
3719  for (i = 0; i < nBlocks; ++i) {
3720  blk = blocks[i];
3721  printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3722  blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3723  blk->priMin, blk->priMax);
3724  for (line = blk->lines; line; line = line->next) {
3725  printf(" line:\n");
3726  for (word0 = line->words; word0; word0 = word0->next) {
3727  printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3728  word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3729  word0->base, word0->fontSize, word0->spaceAfter);
3730  for (j = 0; j < word0->len; ++j) {
3731  fputc(word0->text[j] & 0xff, stdout);
3732  }
3733  printf("'\n");
3734  }
3735  }
3736  }
3737  printf("\n");
3738  fflush(stdout);
3739 #endif
3740 
3741  // build the flows
3742  //~ this needs to be adjusted for writing mode (vertical text)
3743  //~ this also needs to account for right-to-left column ordering
3744  while (flows) {
3745  flow = flows;
3746  flows = flows->next;
3747  delete flow;
3748  }
3749  flow = nullptr;
3750  flows = lastFlow = nullptr;
3751  // assume blocks are already in reading order,
3752  // and construct flows accordingly.
3753  for (i = 0; i < nBlocks; i++) {
3754  blk = blocks[i];
3755  blk->next = nullptr;
3756  if (flow) {
3757  blk1 = blocks[i - 1];
3758  blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3759  if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk1) && flow->blockFits(blk, blk1)) {
3760  flow->addBlock(blk);
3761  continue;
3762  }
3763  }
3764  flow = new TextFlow(this, blk);
3765  if (lastFlow) {
3766  lastFlow->next = flow;
3767  } else {
3768  flows = flow;
3769  }
3770  lastFlow = flow;
3771  }
3772 
3773 #if 0 // for debugging
3774  printf("*** flows ***\n");
3775  for (flow = flows; flow; flow = flow->next) {
3776  printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3777  flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3778  flow->priMin, flow->priMax);
3779  for (blk = flow->blocks; blk; blk = blk->next) {
3780  printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3781  blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3782  blk->priMin, blk->priMax);
3783  for (line = blk->lines; line; line = line->next) {
3784  printf(" line:\n");
3785  for (word0 = line->words; word0; word0 = word0->next) {
3786  printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3787  word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3788  word0->base, word0->fontSize, word0->spaceAfter);
3789  for (i = 0; i < word0->len; ++i) {
3790  fputc(word0->text[i] & 0xff, stdout);
3791  }
3792  printf("'\n");
3793  }
3794  }
3795  }
3796  }
3797  printf("\n");
3798 #endif
3799 }
3800 
3801 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
3802 {
3803  return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, false, backward, wholeWord, xMin, yMin, xMax, yMax);
3804 }
3805 
3806 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
3807  double *yMax)
3808 {
3809  TextBlock *blk;
3810  TextLine *line;
3811  Unicode *s2, *txt, *reordered;
3812  Unicode *p;
3813  int txtSize, m, i, j, k;
3814  double xStart, yStart, xStop, yStop;
3815  double xMin0, yMin0, xMax0, yMax0;
3816  double xMin1, yMin1, xMax1, yMax1;
3817  bool found;
3818 
3819  if (len == 0) {
3820  return false;
3821  }
3822 
3823  if (rawOrder) {
3824  return false;
3825  }
3826 
3827  // handle right-to-left text
3828  reordered = (Unicode *)gmallocn(len, sizeof(Unicode));
3829  reorderText(s, len, nullptr, primaryLR, nullptr, reordered);
3830 
3831  // normalize the search string
3832  s2 = unicodeNormalizeNFKC(reordered, len, &len, nullptr);
3833 
3834  // if search string is not pure ascii then don't
3835  // use ignoreDiacritics (as they won't match)
3836  if (!caseSensitive) {
3837  // convert the search string to uppercase
3838  for (i = 0; i < len; ++i) {
3839  s2[i] = unicodeToUpper(s2[i]);
3840  if (ignoreDiacritics && !isAscii7(s2[i]))
3841  ignoreDiacritics = false;
3842  }
3843  } else if (ignoreDiacritics) {
3844  for (i = 0; i < len; ++i) {
3845  if (!isAscii7(s2[i])) {
3846  ignoreDiacritics = false;
3847  break;
3848  }
3849  }
3850  }
3851 
3852  txt = nullptr;
3853  txtSize = 0;
3854 
3855  xStart = yStart = xStop = yStop = 0;
3856  if (startAtLast && haveLastFind) {
3857  xStart = lastFindXMin;
3858  yStart = lastFindYMin;
3859  } else if (!startAtTop) {
3860  xStart = *xMin;
3861  yStart = *yMin;
3862  }
3863  if (stopAtLast && haveLastFind) {
3864  xStop = lastFindXMin;
3865  yStop = lastFindYMin;
3866  } else if (!stopAtBottom) {
3867  xStop = *xMax;
3868  yStop = *yMax;
3869  }
3870 
3871  found = false;
3872  xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3873  xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3874 
3875  for (i = backward ? nBlocks - 1 : 0; backward ? i >= 0 : i < nBlocks; i += backward ? -1 : 1) {
3876  blk = blocks[i];
3877 
3878  // check: is the block above the top limit?
3879  // (this only works if the page's primary rotation is zero --
3880  // otherwise the blocks won't be sorted in the useful order)
3881  if (!startAtTop && primaryRot == 0 && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3882  continue;
3883  }
3884 
3885  // check: is the block below the bottom limit?
3886  // (this only works if the page's primary rotation is zero --
3887  // otherwise the blocks won't be sorted in the useful order)
3888  if (!stopAtBottom && primaryRot == 0 && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3889  break;
3890  }
3891 
3892  for (line = blk->lines; line; line = line->next) {
3893 
3894  // check: is the line above the top limit?
3895  // (this only works if the page's primary rotation is zero --
3896  // otherwise the lines won't be sorted in the useful order)
3897  if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) {
3898  continue;
3899  }
3900 
3901  // check: is the line below the bottom limit?
3902  // (this only works if the page's primary rotation is zero --
3903  // otherwise the lines won't be sorted in the useful order)
3904  if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) {
3905  continue;
3906  }
3907 
3908  if (!line->normalized)
3909  line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
3910  // convert the line to uppercase
3911  m = line->normalized_len;
3912 
3913  if (ignoreDiacritics) {
3914  if (!line->ascii_translation)
3915  unicodeToAscii7(line->normalized, line->normalized_len, &line->ascii_translation, &line->ascii_len, line->normalized_idx, &line->ascii_idx);
3916  if (line->ascii_len)
3917  m = line->ascii_len;
3918  else
3919  ignoreDiacritics = false;
3920  }
3921  if (!caseSensitive) {
3922  if (m > txtSize) {
3923  txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3924  txtSize = m;
3925  }
3926  for (k = 0; k < m; ++k) {
3927  if (ignoreDiacritics)
3928  txt[k] = unicodeToUpper(line->ascii_translation[k]);
3929  else
3930  txt[k] = unicodeToUpper(line->normalized[k]);
3931  }
3932  } else {
3933  if (ignoreDiacritics)
3934  txt = line->ascii_translation;
3935  else
3936  txt = line->normalized;
3937  }
3938 
3939  // search each position in this line
3940  j = backward ? m - len : 0;
3941  p = txt + j;
3942  while (backward ? j >= 0 : j <= m - len) {
3943  if (!wholeWord || ((j == 0 || !unicodeTypeAlphaNum(txt[j - 1])) && (j + len == m || !unicodeTypeAlphaNum(txt[j + len])))) {
3944 
3945  // compare the strings
3946  for (k = 0; k < len; ++k) {
3947  if (p[k] != s2[k]) {
3948  break;
3949  }
3950  }
3951 
3952  // found it
3953  if (k == len) {
3954  // where s2 matches a subsequence of a compatibility equivalence
3955  // decomposition, highlight the entire glyph, since we don't know
3956  // the internal layout of subglyph components
3957  int normStart, normAfterEnd;
3958  if (ignoreDiacritics) {
3959  normStart = line->ascii_idx[j];
3960  normAfterEnd = line->ascii_idx[j + len - 1] + 1;
3961  } else {
3962  normStart = line->normalized_idx[j];
3963  normAfterEnd = line->normalized_idx[j + len - 1] + 1;
3964  }
3965  switch (line->rot) {
3966  case 0:
3967  xMin1 = line->edge[normStart];
3968  xMax1 = line->edge[normAfterEnd];
3969  yMin1 = line->yMin;
3970  yMax1 = line->yMax;
3971  break;
3972  case 1:
3973  xMin1 = line->xMin;
3974  xMax1 = line->xMax;
3975  yMin1 = line->edge[normStart];
3976  yMax1 = line->edge[normAfterEnd];
3977  break;
3978  case 2:
3979  xMin1 = line->edge[normAfterEnd];
3980  xMax1 = line->edge[normStart];
3981  yMin1 = line->yMin;
3982  yMax1 = line->yMax;
3983  break;
3984  case 3:
3985  xMin1 = line->xMin;
3986  xMax1 = line->xMax;
3987  yMin1 = line->edge[normAfterEnd];
3988  yMax1 = line->edge[normStart];
3989  break;
3990  }
3991  if (backward) {
3992  if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3993  if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3994  xMin0 = xMin1;
3995  xMax0 = xMax1;
3996  yMin0 = yMin1;
3997  yMax0 = yMax1;
3998  found = true;
3999  }
4000  }
4001  } else {
4002  if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
4003  if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
4004  xMin0 = xMin1;
4005  xMax0 = xMax1;
4006  yMin0 = yMin1;
4007  yMax0 = yMax1;
4008  found = true;
4009  }
4010  }
4011  }
4012  }
4013  }
4014  if (backward) {
4015  --j;
4016  --p;
4017  } else {
4018  ++j;
4019  ++p;
4020  }
4021  }
4022  }
4023  }
4024 
4025  gfree(s2);
4026  gfree(reordered);
4027  if (!caseSensitive) {
4028  gfree(txt);
4029  }
4030 
4031  if (found) {
4032  *xMin = xMin0;
4033  *xMax = xMax0;
4034  *yMin = yMin0;
4035  *yMax = yMax0;
4036  lastFindXMin = xMin0;
4037  lastFindYMin = yMin0;
4038  haveLastFind = true;
4039  return true;
4040  }
4041 
4042  return false;
4043 }
4044 
4045 GooString *TextPage::getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const
4046 {
4047  GooString *s;
4048  const UnicodeMap *uMap;
4049  TextBlock *blk;
4050  TextLine *line;
4051  TextLineFrag *frags;
4052  int nFrags, fragsSize;
4053  TextLineFrag *frag;
4054  char space[8], eol[16];
4055  int spaceLen, eolLen;
4056  int lastRot;
4057  double x, y, delta;
4058  int col, idx0, idx1, i, j;
4059  bool multiLine, oneRot;
4060 
4061  s = new GooString();
4062 
4063  // get the output encoding
4064  if (!(uMap = globalParams->getTextEncoding())) {
4065  return s;
4066  }
4067 
4068  if (rawOrder) {
4069  TextWord *word;
4070  char mbc[16];
4071  int mbc_len;
4072 
4073  for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4074  for (j = 0; j < word->getLength(); ++j) {
4075  double gXMin, gXMax, gYMin, gYMax;
4076  word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
4077  if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) {
4078  mbc_len = uMap->mapUnicode(*(word->getChar(j)), mbc, sizeof(mbc));
4079  s->append(mbc, mbc_len);
4080  }
4081  }
4082  }
4083  return s;
4084  }
4085 
4086  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4087  eolLen = 0; // make gcc happy
4088  switch (textEOL) {
4089  case eolUnix:
4090  eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4091  break;
4092  case eolDOS:
4093  eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4094  eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4095  break;
4096  case eolMac:
4097  eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4098  break;
4099  }
4100 
4101  //~ writing mode (horiz/vert)
4102 
4103  // collect the line fragments that are in the rectangle
4104  fragsSize = 256;
4105  frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4106  nFrags = 0;
4107  lastRot = -1;
4108  oneRot = true;
4109  for (i = 0; i < nBlocks; ++i) {
4110  blk = blocks[i];
4111  if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) {
4112  for (line = blk->lines; line; line = line->next) {
4113  if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) {
4114  idx0 = idx1 = -1;
4115  switch (line->rot) {
4116  case 0:
4117  y = 0.5 * (line->yMin + line->yMax);
4118  if (yMin < y && y < yMax) {
4119  j = 0;
4120  while (j < line->len) {
4121  if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4122  idx0 = j;
4123  break;
4124  }
4125  ++j;
4126  }
4127  j = line->len - 1;
4128  while (j >= 0) {
4129  if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4130  idx1 = j;
4131  break;
4132  }
4133  --j;
4134  }
4135  }
4136  break;
4137  case 1:
4138  x = 0.5 * (line->xMin + line->xMax);
4139  if (xMin < x && x < xMax) {
4140  j = 0;
4141  while (j < line->len) {
4142  if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4143  idx0 = j;
4144  break;
4145  }
4146  ++j;
4147  }
4148  j = line->len - 1;
4149  while (j >= 0) {
4150  if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4151  idx1 = j;
4152  break;
4153  }
4154  --j;
4155  }
4156  }
4157  break;
4158  case 2:
4159  y = 0.5 * (line->yMin + line->yMax);
4160  if (yMin < y && y < yMax) {
4161  j = 0;
4162  while (j < line->len) {
4163  if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4164  idx0 = j;
4165  break;
4166  }
4167  ++j;
4168  }
4169  j = line->len - 1;
4170  while (j >= 0) {
4171  if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4172  idx1 = j;
4173  break;
4174  }
4175  --j;
4176  }
4177  }
4178  break;
4179  case 3:
4180  x = 0.5 * (line->xMin + line->xMax);
4181  if (xMin < x && x < xMax) {
4182  j = 0;
4183  while (j < line->len) {
4184  if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4185  idx0 = j;
4186  break;
4187  }
4188  ++j;
4189  }
4190  j = line->len - 1;
4191  while (j >= 0) {
4192  if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4193  idx1 = j;
4194  break;
4195  }
4196  --j;
4197  }
4198  }
4199  break;
4200  }
4201  if (idx0 >= 0 && idx1 >= 0) {
4202  if (nFrags == fragsSize) {
4203  fragsSize *= 2;
4204  frags = (TextLineFrag *)greallocn(frags, fragsSize, sizeof(TextLineFrag));
4205  }
4206  frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
4207  ++nFrags;
4208  if (lastRot >= 0 && line->rot != lastRot) {
4209  oneRot = false;
4210  }
4211  lastRot = line->rot;
4212  }
4213  }
4214  }
4215  }
4216  }
4217 
4218  // sort the fragments and generate the string
4219  if (nFrags > 0) {
4220 
4221  for (i = 0; i < nFrags; ++i) {
4222  frags[i].computeCoords(oneRot);
4223  }
4224  assignColumns(frags, nFrags, oneRot);
4225 
4226  // if all lines in the region have the same rotation, use it;
4227  // otherwise, use the page's primary rotation
4228  if (oneRot) {
4229  qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXLineRot);
4230  } else {
4231  qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
4232  }
4233  i = 0;
4234  while (i < nFrags) {
4235  delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4236  for (j = i + 1; j < nFrags && fabs(frags[j].base - frags[i].base) < delta; ++j)
4237  ;
4239  i = j;
4240  }
4241 
4242  col = 0;
4243  multiLine = false;
4244  for (i = 0; i < nFrags; ++i) {
4245  frag = &frags[i];
4246 
4247  // insert a return
4248  if (frag->col < col || (i > 0 && fabs(frag->base - frags[i - 1].base) > maxIntraLineDelta * frags[i - 1].line->words->fontSize)) {
4249  s->append(eol, eolLen);
4250  col = 0;
4251  multiLine = true;
4252  }
4253 
4254  // column alignment
4255  for (; col < frag->col; ++col) {
4256  s->append(space, spaceLen);
4257  }
4258 
4259  // get the fragment text
4260  col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4261  }
4262 
4263  if (multiLine) {
4264  s->append(eol, eolLen);
4265  }
4266  }
4267 
4268  gfree(frags);
4269 
4270  return s;
4271 }
4272 
4274 {
4275 public:
4280  virtual void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) = 0;
4281  virtual void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) = 0;
4282  virtual void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) = 0;
4283 
4284 protected:
4286 };
4287 
4289 
4291 
4293 {
4294 public:
4296  ~TextSelectionDumper() override;
4297 
4298  void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4299  void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4300  void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4301  void endPage();
4302 
4303  GooString *getText();
4304  std::vector<TextWordSelection *> **takeWordList(int *nLines);
4305 
4306 private:
4307  void startLine();
4308  void finishLine();
4309 
4310  std::vector<TextWordSelection *> **lines;
4312  std::vector<TextWordSelection *> *words;
4313  int tableId;
4315 };
4316 
4318 {
4319  linesSize = 256;
4320  lines = (std::vector<TextWordSelection *> **)gmallocn(linesSize, sizeof(std::vector<TextWordSelection *> *));
4321  nLines = 0;
4322 
4323  tableId = -1;
4324  currentBlock = nullptr;
4325  words = nullptr;
4326 }
4327 
4329 {
4330  for (int i = 0; i < nLines; i++) {
4331  for (auto entry : *(lines[i])) {
4332  delete entry;
4333  }
4334  delete lines[i];
4335  }
4336  gfree(lines);
4337 }
4338 
4340 {
4341  finishLine();
4342  words = new std::vector<TextWordSelection *>();
4343 }
4344 
4346 {
4347  if (nLines == linesSize) {
4348  linesSize *= 2;
4349  lines = (std::vector<TextWordSelection *> **)grealloc(lines, linesSize * sizeof(std::vector<TextWordSelection *> *));
4350  }
4351 
4352  if (words && words->size() > 0)
4353  lines[nLines++] = words;
4354  else if (words)
4355  delete words;
4356  words = nullptr;
4357 }
4358 
4359 void TextSelectionDumper::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4360 {
4361  TextLineFrag frag;
4362 
4363  frag.init(line, edge_begin, edge_end - edge_begin);
4364 
4365  if (tableId >= 0 && frag.line->blk->tableId < 0) {
4366  finishLine();
4367 
4368  tableId = -1;
4369  currentBlock = nullptr;
4370  }
4371 
4372  if (frag.line->blk->tableId >= 0) { // a table
4373  if (tableId == -1) {
4374  tableId = frag.line->blk->tableId;
4375  currentBlock = frag.line->blk;
4376  }
4377 
4378  if (currentBlock == frag.line->blk) { // the same block
4379  startLine();
4380  } else { // another block
4381  if (currentBlock->tableEnd) { // previous block ended its row
4382  startLine();
4383  }
4384  currentBlock = frag.line->blk;
4385  }
4386  } else { // not a table
4387  startLine();
4388  }
4389 }
4390 
4392 {
4393  words->push_back(new TextWordSelection(word, begin, end));
4394 }
4395 
4397 {
4398  finishLine();
4399 }
4400 
4402 {
4403  GooString *text;
4404  int i;
4405  const UnicodeMap *uMap;
4406  char space[8], eol[16];
4407  int spaceLen, eolLen;
4408 
4409  text = new GooString();
4410 
4411  if (!(uMap = globalParams->getTextEncoding()))
4412  return text;
4413 
4414  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4415  eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4416 
4417  for (i = 0; i < nLines; i++) {
4418  std::vector<TextWordSelection *> *lineWords = lines[i];
4419  for (std::size_t j = 0; j < lineWords->size(); j++) {
4420  TextWordSelection *sel = (*lineWords)[j];
4421 
4422  page->dumpFragment(sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
4423  if (j < lineWords->size() - 1 && sel->word->spaceAfter)
4424  text->append(space, spaceLen);
4425  }
4426  if (i < nLines - 1)
4427  text->append(eol, eolLen);
4428  }
4429 
4430  return text;
4431 }
4432 
4433 std::vector<TextWordSelection *> **TextSelectionDumper::takeWordList(int *nLinesOut)
4434 {
4435  std::vector<TextWordSelection *> **returnValue = lines;
4436 
4437  *nLinesOut = nLines;
4438  if (nLines == 0)
4439  return nullptr;
4440 
4441  nLines = 0;
4442  lines = nullptr;
4443 
4444  return returnValue;
4445 }
4446 
4448 {
4449 public:
4451  ~TextSelectionSizer() override { delete list; }
4452 
4453  void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4454  void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4455  void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override {};
4456 
4457  std::vector<PDFRectangle *> *takeRegion()
4458  {
4459  auto aux = list;
4460  list = nullptr;
4461  return aux;
4462  }
4463 
4464 private:
4465  std::vector<PDFRectangle *> *list;
4466  double scale;
4467 };
4468 
4470 {
4471  list = new std::vector<PDFRectangle *>();
4472 }
4473 
4474 void TextSelectionSizer::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4475 {
4476  PDFRectangle *rect;
4477  double x1, y1, x2, y2, margin;
4478 
4479  margin = (line->yMax - line->yMin) / 8;
4480  x1 = line->edge[edge_begin];
4481  y1 = line->yMin - margin;
4482  x2 = line->edge[edge_end];
4483  y2 = line->yMax + margin;
4484 
4485  rect = new PDFRectangle(floor(x1 * scale), floor(y1 * scale), ceil(x2 * scale), ceil(y2 * scale));
4486  list->push_back(rect);
4487 }
4488 
4490 {
4491 public:
4492  TextSelectionPainter(TextPage *page, double scale, int rotation, OutputDev *out, const GfxColor *box_color, const GfxColor *glyph_color);
4493  ~TextSelectionPainter() override;
4494 
4495  void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4496  void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4497  void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4498  void endPage();
4499 
4500 private:
4504  std::vector<TextWordSelection *> *selectionList;
4506  bool hasGlyphLessFont();
4507 };
4508 
4509 TextSelectionPainter::TextSelectionPainter(TextPage *p, double scale, int rotation, OutputDev *outA, const GfxColor *box_color, const GfxColor *glyph_colorA) : TextSelectionVisitor(p), out(outA), glyph_color(glyph_colorA)
4510 {
4511  PDFRectangle box(0, 0, p->pageWidth, p->pageHeight);
4512 
4513  selectionList = new std::vector<TextWordSelection *>();
4514  state = new GfxState(72 * scale, 72 * scale, &box, rotation, false);
4515 
4516  state->getCTM(&ctm);
4517  ctm.invertTo(&ictm);
4518 
4519  out->startPage(0, state, nullptr);
4520  out->setDefaultCTM(state->getCTM());
4521 
4522  state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4523  state->setFillColor(box_color);
4525 }
4526 
4528 {
4529  for (auto entry : *selectionList) {
4530  delete entry;
4531  }
4532  delete selectionList;
4533  delete state;
4534 }
4535 
4536 void TextSelectionPainter::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4537 {
4538  double x1, y1, x2, y2, margin;
4539 
4540  margin = (line->yMax - line->yMin) / 8;
4541  x1 = floor(line->edge[edge_begin]);
4542  y1 = floor(line->yMin - margin);
4543  x2 = ceil(line->edge[edge_end]);
4544  y2 = ceil(line->yMax + margin);
4545 
4546  ctm.transform(line->edge[edge_begin], line->yMin - margin, &x1, &y1);
4547  ctm.transform(line->edge[edge_end], line->yMax + margin, &x2, &y2);
4548 
4549  x1 = floor(x1);
4550  y1 = floor(y1);
4551  x2 = ceil(x2);
4552  y2 = ceil(y2);
4553 
4554  ictm.transform(x1, y1, &x1, &y1);
4555  ictm.transform(x2, y2, &x2, &y2);
4556 
4557  state->moveTo(x1, y1);
4558  state->lineTo(x2, y1);
4559  state->lineTo(x2, y2);
4560  state->lineTo(x1, y2);
4561  state->closePath();
4562 }
4563 
4565 {
4566  selectionList->push_back(new TextWordSelection(word, begin, end));
4567 }
4568 
4570 {
4571  if (selectionList && selectionList->size()) {