w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

TextOutputDev.cc
Go to the documentation of this file.
1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2014 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 #include <aconf.h>
10 
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <stddef.h>
18 #include <math.h>
19 #include <limits.h>
20 #include <ctype.h>
21 #ifdef _WIN32
22 #include <fcntl.h> // for O_BINARY
23 #include <io.h> // for setmode
24 #endif
25 #include "gmem.h"
26 #include "gmempp.h"
27 #include "GString.h"
28 #include "GList.h"
29 #include "gfile.h"
30 #include "config.h"
31 #include "Error.h"
32 #include "GlobalParams.h"
33 #include "UnicodeMap.h"
34 #include "UnicodeRemapping.h"
35 #include "UnicodeTypeTable.h"
36 #include "GfxState.h"
37 #include "Link.h"
38 #include "TextOutputDev.h"
39 
40 //------------------------------------------------------------------------
41 // parameters
42 //------------------------------------------------------------------------
43 
44 // Size of bins used for horizontal and vertical profiles is
45 // splitPrecisionMul * minFontSize.
46 #define splitPrecisionMul 0.05
47 
48 // Minimum allowed split precision.
49 #define minSplitPrecision 0.01
50 
51 // yMin and yMax (or xMin and xMax for rot=1,3) are adjusted by this
52 // fraction of the text height, to allow for slightly overlapping
53 // lines (or large ascent/descent values).
54 #define ascentAdjustFactor 0
55 #define descentAdjustFactor 0.35
56 
57 // Gaps larger than max{gap} - splitGapSlack * avgFontSize are
58 // considered to be equivalent.
59 #define splitGapSlack 0.2
60 
61 // The vertical gap threshold (minimum gap required to split
62 // vertically) depends on the (approximate) number of lines in the
63 // block:
64 // threshold = (max + slope * nLines) * avgFontSize
65 // with a min value of vertGapThresholdMin * avgFontSize.
66 #define vertGapThresholdMin 0.8
67 #define vertGapThresholdMax 3
68 #define vertGapThresholdSlope -0.5
69 
70 // Vertical gap threshold for table mode.
71 #define vertGapThresholdTableMin 0.2
72 #define vertGapThresholdTableMax 0.5
73 #define vertGapThresholdTableSlope -0.02
74 
75 // A large character has a font size larger than
76 // largeCharThreshold * avgFontSize.
77 #define largeCharThreshold 1.5
78 
79 // A block will be split vertically only if the resulting chunk
80 // widths are greater than vertSplitChunkThreshold * avgFontSize.
81 #define vertSplitChunkThreshold 2
82 
83 // Max difference in primary,secondary coordinates (as a fraction of
84 // the font size) allowed for duplicated text (fake boldface, drop
85 // shadows) which is to be discarded.
86 #define dupMaxPriDelta 0.1
87 #define dupMaxSecDelta 0.2
88 
89 // Inter-character spacing that varies by less than this multiple of
90 // font size is assumed to be equivalent.
91 #define uniformSpacing 0.07
92 #define tableModeUniformSpacing 0.14
93 
94 // Typical word spacing, as a fraction of font size. This will be
95 // added to the minimum inter-character spacing, to account for wide
96 // character spacing.
97 #define wordSpacing 0.1
98 #define tableModeWordSpacing 0.2
99 
100 // Minimum paragraph indent from left margin, as a fraction of font
101 // size.
102 #define minParagraphIndent 0.5
103 
104 // If the space between two lines is greater than
105 // paragraphSpacingThreshold * avgLineSpacing, start a new paragraph.
106 #define paragraphSpacingThreshold 1.25
107 
108 // If font size changes by at least this much (measured in points)
109 // between lines, start a new paragraph.
110 #define paragraphFontSizeDelta 1
111 
112 // Spaces at the start of a line in physical layout mode are this wide
113 // (as a multiple of font size).
114 #define physLayoutSpaceWidth 0.33
115 
116 // In simple layout mode, lines are broken at gaps larger than this
117 // value multiplied by font size.
118 #define simpleLayoutGapThreshold 0.7
119 
120 // Minimum overlap in simple2 mode.
121 #define simple2MinOverlap 0.2
122 
123 // Table cells (TextColumns) are allowed to overlap by this much
124 // in table layout mode (as a fraction of cell width or height).
125 #define tableCellOverlapSlack 0.05
126 
127 // Primary axis delta which will cause a line break in raw mode
128 // (as a fraction of font size).
129 #define rawModeLineDelta 0.5
130 
131 // Secondary axis delta which will cause a word break in raw mode
132 // (as a fraction of font size).
133 #define rawModeWordSpacing 0.15
134 
135 // Secondary axis overlap which will cause a line break in raw mode
136 // (as a fraction of font size).
137 #define rawModeCharOverlap 0.2
138 
139 // Max spacing (as a multiple of font size) allowed between the end of
140 // a line and a clipped character to be included in that line.
141 #define clippedTextMaxWordSpace 0.5
142 
143 // Max width of underlines (in points).
144 #define maxUnderlineWidth 3
145 
146 // Max horizontal distance between edge of word and start of underline
147 // (as a fraction of font size).
148 #define underlineSlack 0.2
149 
150 // Max vertical distance between baseline of word and start of
151 // underline (as a fraction of font size).
152 #define underlineBaselineSlack 0.2
153 
154 // Max distance between edge of text and edge of link border (as a
155 // fraction of font size).
156 #define hyperlinkSlack 0.2
157 
158 // Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
159 // (Or 1/tan(angle) for 90/270 degrees.)
160 #define diagonalThreshold 0.1
161 
162 // This value is used as the ascent when computing selection
163 // rectangles, in order to work around flakey ascent values in fonts.
164 #define selectionAscent 0.8
165 
166 // Grid size used to bin sort characters for overlap detection.
167 #define overlapGridWidth 20
168 #define overlapGridHeight 20
169 
170 // Minimum character bbox overlap (horizontal and vertical) as a
171 // fraction of character bbox width/height for a character to be
172 // treated as overlapping.
173 #define minCharOverlap 0.3
174 
175 #define maxUnicodeLen 16
176 
177 //------------------------------------------------------------------------
178 
// Return the smaller of two doubles; when neither compares less than
// the other, the second argument is returned.
static inline double dmin(double x, double y) {
  if (x < y) {
    return x;
  }
  return y;
}
182 
// Return the larger of two doubles; when neither compares greater than
// the other, the second argument is returned.
static inline double dmax(double x, double y) {
  if (x > y) {
    return x;
  }
  return y;
}
186 
187 //------------------------------------------------------------------------
188 // TextChar
189 //------------------------------------------------------------------------
190 
191 class TextChar {
192 public:
193 
194  TextChar(Unicode cA, int charPosA, int charLenA,
195  double xMinA, double yMinA, double xMaxA, double yMaxA,
196  int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA,
197  TextFontInfo *fontA, double fontSizeA,
198  double colorRA, double colorGA, double colorBA);
199 
200  static int cmpX(const void *p1, const void *p2);
201  static int cmpY(const void *p1, const void *p2);
202  static int cmpCharPos(const void *p1, const void *p2);
203 
205  int charPos;
206  int charLen;
207  double xMin, yMin, xMax, yMax;
209  double fontSize;
210  double colorR,
213 
214  // group the byte-size fields to minimize object size
216  char rotated;
217  char clipped;
218  char invisible;
220  char overlap;
221 };
222 
223 TextChar::TextChar(Unicode cA, int charPosA, int charLenA,
224  double xMinA, double yMinA, double xMaxA, double yMaxA,
225  int rotA, GBool rotatedA, GBool clippedA, GBool invisibleA,
226  TextFontInfo *fontA, double fontSizeA,
227  double colorRA, double colorGA, double colorBA) {
228  double t;
229 
230  c = cA;
231  charPos = charPosA;
232  charLen = charLenA;
233  xMin = xMinA;
234  yMin = yMinA;
235  xMax = xMaxA;
236  yMax = yMaxA;
237  // this can happen with vertical writing mode, or with odd values
238  // for the char/word spacing parameters
239  if (xMin > xMax) {
240  t = xMin; xMin = xMax; xMax = t;
241  }
242  if (yMin > yMax) {
243  t = yMin; yMin = yMax; yMax = t;
244  }
245  // TextPage::findGaps uses integer coordinates, so clip the char
246  // bbox to fit in a 32-bit int (this is generally only a problem in
247  // damaged PDF files)
248  if (xMin < -1e8) {
249  xMin = -1e8;
250  }
251  if (xMax > 1e8) {
252  xMax = 1e8;
253  }
254  if (yMin < -1e8) {
255  yMin = -1e8;
256  }
257  if (yMax > 1e8) {
258  yMax = 1e8;
259  }
260  rot = (Guchar)rotA;
261  rotated = (char)rotatedA;
262  clipped = (char)clippedA;
263  invisible = (char)invisibleA;
264  spaceAfter = (char)gFalse;
265  font = fontA;
266  fontSize = fontSizeA;
267  colorR = colorRA;
268  colorG = colorGA;
269  colorB = colorBA;
270  overlap = gFalse;
271 }
272 
273 int TextChar::cmpX(const void *p1, const void *p2) {
274  const TextChar *ch1 = *(const TextChar **)p1;
275  const TextChar *ch2 = *(const TextChar **)p2;
276 
277  if (ch1->xMin < ch2->xMin) {
278  return -1;
279  } else if (ch1->xMin > ch2->xMin) {
280  return 1;
281  } else {
282  return ch1->charPos - ch2->charPos;
283  }
284 }
285 
286 int TextChar::cmpY(const void *p1, const void *p2) {
287  const TextChar *ch1 = *(const TextChar **)p1;
288  const TextChar *ch2 = *(const TextChar **)p2;
289 
290  if (ch1->yMin < ch2->yMin) {
291  return -1;
292  } else if (ch1->yMin > ch2->yMin) {
293  return 1;
294  } else {
295  return ch1->charPos - ch2->charPos;
296  }
297 }
298 
299 int TextChar::cmpCharPos(const void *p1, const void *p2) {
300  const TextChar *ch1 = *(const TextChar **)p1;
301  const TextChar *ch2 = *(const TextChar **)p2;
302  return ch1->charPos - ch2->charPos;
303 }
304 
305 //------------------------------------------------------------------------
306 // TextBlock
307 //------------------------------------------------------------------------
308 
312  blkLeaf
313 };
314 
319  blkTagLine
320 };
321 
322 class TextBlock {
323 public:
324 
325  TextBlock(TextBlockType typeA, int rotA);
326  ~TextBlock();
327  void addChild(TextBlock *child);
328  void addChild(TextChar *child, GBool updateBox);
329  void prependChild(TextChar *child);
330  void updateBounds(int childIdx);
331 
334  int rot;
335  double xMin, yMin, xMax, yMax;
336  GBool smallSplit; // true for blkVertSplit/blkHorizSplit
337  // where the gap size is small
338  GList *children; // for blkLeaf, children are TextWord;
339  // for others, children are TextBlock
340 };
341 
343  type = typeA;
345  rot = rotA;
346  xMin = yMin = xMax = yMax = 0;
347  smallSplit = gFalse;
348  children = new GList();
349 }
350 
352  if (type == blkLeaf) {
353  delete children;
354  } else {
356  }
357 }
358 
360  if (children->getLength() == 0) {
361  xMin = child->xMin;
362  yMin = child->yMin;
363  xMax = child->xMax;
364  yMax = child->yMax;
365  } else {
366  if (child->xMin < xMin) {
367  xMin = child->xMin;
368  }
369  if (child->yMin < yMin) {
370  yMin = child->yMin;
371  }
372  if (child->xMax > xMax) {
373  xMax = child->xMax;
374  }
375  if (child->yMax > yMax) {
376  yMax = child->yMax;
377  }
378  }
380 }
381 
383  if (updateBox) {
384  if (children->getLength() == 0) {
385  xMin = child->xMin;
386  yMin = child->yMin;
387  xMax = child->xMax;
388  yMax = child->yMax;
389  } else {
390  if (child->xMin < xMin) {
391  xMin = child->xMin;
392  }
393  if (child->yMin < yMin) {
394  yMin = child->yMin;
395  }
396  if (child->xMax > xMax) {
397  xMax = child->xMax;
398  }
399  if (child->yMax > yMax) {
400  yMax = child->yMax;
401  }
402  }
403  }
405 }
406 
408  if (children->getLength() == 0) {
409  xMin = child->xMin;
410  yMin = child->yMin;
411  xMax = child->xMax;
412  yMax = child->yMax;
413  } else {
414  if (child->xMin < xMin) {
415  xMin = child->xMin;
416  }
417  if (child->yMin < yMin) {
418  yMin = child->yMin;
419  }
420  if (child->xMax > xMax) {
421  xMax = child->xMax;
422  }
423  if (child->yMax > yMax) {
424  yMax = child->yMax;
425  }
426  }
427  children->insert(0, child);
428 }
429 
430 void TextBlock::updateBounds(int childIdx) {
431  TextBlock *child;
432 
433  child = (TextBlock *)children->get(childIdx);
434  if (child->xMin < xMin) {
435  xMin = child->xMin;
436  }
437  if (child->yMin < yMin) {
438  yMin = child->yMin;
439  }
440  if (child->xMax > xMax) {
441  xMax = child->xMax;
442  }
443  if (child->yMax > yMax) {
444  yMax = child->yMax;
445  }
446 }
447 
448 //------------------------------------------------------------------------
449 // TextCharLine
450 //------------------------------------------------------------------------
451 
453 public:
454 
455  TextCharLine(int rotA);
456  ~TextCharLine();
457  void add(TextChar *ch);
458 
460  double yMin, yMax;
461  int rot;
463 };
464 
466  chars = new GList();
467  yMin = yMax = 0;
468  rot = rotA;
469  next = prev = NULL;
470 }
471 
473  delete chars;
474 }
475 
477  chars->append(ch);
478  yMin = ch->yMin;
479  yMax = ch->yMax;
480 }
481 
482 //------------------------------------------------------------------------
483 // TextGaps
484 //------------------------------------------------------------------------
485 
// One gap: its center coordinate and its extent.
struct TextGap {
  // center of the gap -- an x coordinate for vertical gaps, a y
  // coordinate for horizontal gaps
  double x;
  // size of the gap (width for vertical gaps, height for horizontal
  // gaps)
  double w;
};
491 
492 class TextGaps {
493 public:
494 
495  TextGaps();
496  ~TextGaps();
497  void addGap(double x, double w);
498  int getLength() { return length; }
499  double getX(int idx) { return gaps[idx].x; }
500  double getW(int idx) { return gaps[idx].w; }
501 
502 private:
503 
504  int length;
505  int size;
507 };
508 
510  length = 0;
511  size = 16;
512  gaps = (TextGap *)gmallocn(size, sizeof(TextGap));
513 }
514 
516  gfree(gaps);
517 }
518 
519 void TextGaps::addGap(double x, double w) {
520  if (length == size) {
521  size *= 2;
522  gaps = (TextGap *)greallocn(gaps, size, sizeof(TextGap));
523  }
524  gaps[length].x = x;
525  gaps[length].w = w;
526  ++length;
527 }
528 
529 //------------------------------------------------------------------------
530 // TextSuperLine
531 //------------------------------------------------------------------------
532 
534 public:
535 
536  TextSuperLine(GList *linesA);
537  ~TextSuperLine();
538 
539  GList *lines; // [TextLine]
540  double yMin, yMax;
541  double fontSize;
542 };
543 
545  TextLine *line;
546  int i;
547 
548  lines = linesA;
549  yMin = yMax = 0;
550  fontSize = ((TextLine *)lines->get(0))->fontSize;
551  for (i = 0; i < lines->getLength(); ++i) {
552  line = (TextLine *)lines->get(i);
553  if (i == 0 || line->yMin < yMin) {
554  yMin = line->yMin;
555  }
556  if (i == 0 || line->yMax > yMax) {
557  yMax = line->yMax;
558  }
559  }
560 }
561 
564 }
565 
566 //------------------------------------------------------------------------
567 // TextUnderline
568 //------------------------------------------------------------------------
569 
571 public:
572 
573  TextUnderline(double x0A, double y0A, double x1A, double y1A)
574  { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
576 
577  double x0, y0, x1, y1;
579 };
580 
581 //------------------------------------------------------------------------
582 // TextLink
583 //------------------------------------------------------------------------
584 
585 class TextLink {
586 public:
587 
588  TextLink(double xMinA, double yMinA, double xMaxA, double yMaxA,
589  GString *uriA)
590  { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; uri = uriA; }
591  ~TextLink();
592 
593  double xMin, yMin, xMax, yMax;
595 };
596 
598  if (uri) {
599  delete uri;
600  }
601 }
602 
603 //------------------------------------------------------------------------
604 // TextOutputControl
605 //------------------------------------------------------------------------
606 
609  fixedPitch = 0;
610  fixedLineSpacing = 0;
611  html = gFalse;
612  clipText = gFalse;
620  insertBOM = gFalse;
621  marginLeft = 0;
622  marginRight = 0;
623  marginTop = 0;
624  marginBottom = 0;
625 }
626 
627 
628 //------------------------------------------------------------------------
629 // TextFontInfo
630 //------------------------------------------------------------------------
631 
633  GfxFont *gfxFont;
634 
635  gfxFont = state->getFont();
636  if (gfxFont) {
637  fontID = *gfxFont->getID();
638  ascent = gfxFont->getAscent();
640  // "odd" ascent/descent values cause trouble more often than not
641  // (in theory these could be legitimate values for oddly designed
642  // fonts -- but they are more often due to buggy PDF generators)
643  // (values that are too small are a different issue -- those seem
644  // to be more commonly legitimate)
645  if (ascent > 1) {
646  ascent = 0.75;
647  }
648  if (descent < -0.5) {
649  descent = -0.25;
650  }
651  } else {
652  fontID.num = -1;
653  fontID.gen = -1;
654  ascent = 0.75;
655  descent = -0.25;
656  }
658  : (GString *)NULL;
659  flags = gfxFont ? gfxFont->getFlags() : 0;
660  mWidth = 0;
661  if (gfxFont && !gfxFont->isCIDFont()) {
662  char *name;
663  int code;
664  for (code = 0; code < 256; ++code) {
665  if ((name = ((Gfx8BitFont *)gfxFont)->getCharName(code)) &&
666  name[0] == 'm' && name[1] == '\0') {
667  mWidth = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)code);
668  break;
669  }
670  }
671  }
672 }
673 
675  if (fontName) {
676  delete fontName;
677  }
678 }
679 
681  Ref id;
682 
683  if (state->getFont()) {
684  id = *state->getFont()->getID();
685  } else {
686  id.num = -1;
687  id.gen = -1;
688  }
689  return id.num == fontID.num && id.gen == fontID.gen;
690 }
691 
692 //------------------------------------------------------------------------
693 // TextWord
694 //------------------------------------------------------------------------
695 
696 // Build a TextWord object, using chars[start .. start+len-1].
697 // (If rot >= 2, the chars list is in reverse order.)
699  int rotA, GBool rotatedA, int dirA, GBool spaceAfterA) {
700  TextChar *ch;
701  int i;
702 
703  rot = (char)rotA;
704  rotated = (char)rotatedA;
705  len = lenA;
706  text = (Unicode *)gmallocn(len, sizeof(Unicode));
707  edge = (double *)gmallocn(len + 1, sizeof(double));
708  charPos = (int *)gmallocn(len + 1, sizeof(int));
709  if (rot & 1) {
710  ch = (TextChar *)chars->get(start);
711  xMin = ch->xMin;
712  xMax = ch->xMax;
713  yMin = ch->yMin;
714  ch = (TextChar *)chars->get(start + len - 1);
715  yMax = ch->yMax;
716  } else {
717  ch = (TextChar *)chars->get(start);
718  xMin = ch->xMin;
719  yMin = ch->yMin;
720  yMax = ch->yMax;
721  ch = (TextChar *)chars->get(start + len - 1);
722  xMax = ch->xMax;
723  }
724  for (i = 0; i < len; ++i) {
725  ch = (TextChar *)chars->get(rot >= 2 ? start + len - 1 - i : start + i);
726  text[i] = ch->c;
727  charPos[i] = ch->charPos;
728  if (i == len - 1) {
729  charPos[len] = ch->charPos + ch->charLen;
730  }
731  switch (rot) {
732  case 0:
733  default:
734  edge[i] = ch->xMin;
735  if (i == len - 1) {
736  edge[len] = ch->xMax;
737  }
738  break;
739  case 1:
740  edge[i] = ch->yMin;
741  if (i == len - 1) {
742  edge[len] = ch->yMax;
743  }
744  break;
745  case 2:
746  edge[i] = ch->xMax;
747  if (i == len - 1) {
748  edge[len] = ch->xMin;
749  }
750  break;
751  case 3:
752  edge[i] = ch->yMax;
753  if (i == len - 1) {
754  edge[len] = ch->yMin;
755  }
756  break;
757  }
758  }
759  ch = (TextChar *)chars->get(start);
760  font = ch->font;
761  fontSize = ch->fontSize;
762  dir = (char)dirA;
763  spaceAfter = (char)spaceAfterA;
764  underlined = gFalse;
765  link = NULL;
766  colorR = ch->colorR;
767  colorG = ch->colorG;
768  colorB = ch->colorB;
769  invisible = ch->invisible;
770 }
771 
773  *this = *word;
774  text = (Unicode *)gmallocn(len, sizeof(Unicode));
775  memcpy(text, word->text, len * sizeof(Unicode));
776  edge = (double *)gmallocn(len + 1, sizeof(double));
777  memcpy(edge, word->edge, (len + 1) * sizeof(double));
778  charPos = (int *)gmallocn(len + 1, sizeof(int));
779  memcpy(charPos, word->charPos, (len + 1) * sizeof(int));
780 }
781 
783  gfree(text);
784  gfree(edge);
785  gfree(charPos);
786 }
787 
788 int TextWord::cmpYX(const void *p1, const void *p2) {
789  const TextWord *word1 = *(const TextWord **)p1;
790  const TextWord *word2 = *(const TextWord **)p2;
791  double cmp;
792 
793  if ((cmp = word1->yMin - word2->yMin) == 0) {
794  cmp = word1->xMin - word2->xMin;
795  }
796  return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
797 }
798 
799 int TextWord::cmpCharPos(const void *p1, const void *p2) {
800  const TextWord *word1 = *(const TextWord **)p1;
801  const TextWord *word2 = *(const TextWord **)p2;
802 
803  return word1->charPos[0] - word2->charPos[0];
804 }
805 
807  GString *s;
808  UnicodeMap *uMap;
809  char buf[8];
810  int n, i;
811 
812  s = new GString();
813  if (!(uMap = globalParams->getTextEncoding())) {
814  return s;
815  }
816  for (i = 0; i < len; ++i) {
817  n = uMap->mapUnicode(text[i], buf, sizeof(buf));
818  s->append(buf, n);
819  }
820  uMap->decRefCnt();
821  return s;
822 }
823 
824 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
825  double *xMaxA, double *yMaxA) {
826  if (charIdx < 0 || charIdx >= len) {
827  return;
828  }
829  switch (rot) {
830  case 0:
831  *xMinA = edge[charIdx];
832  *xMaxA = edge[charIdx + 1];
833  *yMinA = yMin;
834  *yMaxA = yMax;
835  break;
836  case 1:
837  *xMinA = xMin;
838  *xMaxA = xMax;
839  *yMinA = edge[charIdx];
840  *yMaxA = edge[charIdx + 1];
841  break;
842  case 2:
843  *xMinA = edge[charIdx + 1];
844  *xMaxA = edge[charIdx];
845  *yMinA = yMin;
846  *yMaxA = yMax;
847  break;
848  case 3:
849  *xMinA = xMin;
850  *xMaxA = xMax;
851  *yMinA = edge[charIdx + 1];
852  *yMaxA = edge[charIdx];
853  break;
854  }
855 }
856 
858  switch (rot) {
859  case 0:
860  default:
861  return yMax + fontSize * font->descent;
862  case 1:
863  return xMin - fontSize * font->descent;
864  case 2:
865  return yMin - fontSize * font->descent;
866  case 3:
867  return xMax + fontSize * font->descent;
868  }
869 }
870 
872  return link ? link->uri : (GString *)NULL;
873 }
874 
875 //------------------------------------------------------------------------
876 // TextLine
877 //------------------------------------------------------------------------
878 
879 TextLine::TextLine(GList *wordsA, double xMinA, double yMinA,
880  double xMaxA, double yMaxA, double fontSizeA) {
881  TextWord *word;
882  int i, j, k;
883 
884  words = wordsA;
885  rot = 0;
886  xMin = xMinA;
887  yMin = yMinA;
888  xMax = xMaxA;
889  yMax = yMaxA;
890  fontSize = fontSizeA;
891  px = 0;
892  pw = 0;
893 
894  // build the text
895  len = 0;
896  for (i = 0; i < words->getLength(); ++i) {
897  word = (TextWord *)words->get(i);
898  len += word->len;
899  if (word->spaceAfter) {
900  ++len;
901  }
902  }
903  text = (Unicode *)gmallocn(len, sizeof(Unicode));
904  edge = (double *)gmallocn(len + 1, sizeof(double));
905  j = 0;
906  for (i = 0; i < words->getLength(); ++i) {
907  word = (TextWord *)words->get(i);
908  if (i == 0) {
909  rot = word->rot;
910  }
911  for (k = 0; k < word->len; ++k) {
912  text[j] = word->text[k];
913  edge[j] = word->edge[k];
914  ++j;
915  }
916  edge[j] = word->edge[word->len];
917  if (word->spaceAfter) {
918  text[j] = (Unicode)0x0020;
919  ++j;
920  edge[j] = edge[j - 1];
921  }
922  }
923  //~ need to check for other Unicode chars used as hyphens
924  hyphenated = text[len - 1] == (Unicode)'-';
925 }
926 
929  gfree(text);
930  gfree(edge);
931 }
932 
934  TextWord *word0;
935 
936  word0 = (TextWord *)words->get(0);
937  switch (rot) {
938  case 0:
939  default:
940  return yMax + fontSize * word0->font->descent;
941  case 1:
942  return xMin - fontSize * word0->font->descent;
943  case 2:
944  return yMin - fontSize * word0->font->descent;
945  case 3:
946  return xMax + fontSize * word0->font->descent;
947  }
948 }
949 
950 int TextLine::cmpX(const void *p1, const void *p2) {
951  const TextLine *line1 = *(const TextLine **)p1;
952  const TextLine *line2 = *(const TextLine **)p2;
953 
954  if (line1->xMin < line2->xMin) {
955  return -1;
956  } else if (line1->xMin > line2->xMin) {
957  return 1;
958  } else {
959  return 0;
960  }
961 }
962 
963 //------------------------------------------------------------------------
964 // TextParagraph
965 //------------------------------------------------------------------------
966 
968  TextLine *line;
969  int i;
970 
971  lines = linesA;
972  dropCap = dropCapA;
973  xMin = yMin = xMax = yMax = 0;
974  for (i = 0; i < lines->getLength(); ++i) {
975  line = (TextLine *)lines->get(i);
976  if (i == 0 || line->xMin < xMin) {
977  xMin = line->xMin;
978  }
979  if (i == 0 || line->yMin < yMin) {
980  yMin = line->yMin;
981  }
982  if (i == 0 || line->xMax > xMax) {
983  xMax = line->xMax;
984  }
985  if (i == 0 || line->yMax > yMax) {
986  yMax = line->yMax;
987  }
988  }
989 }
990 
993 }
994 
995 //------------------------------------------------------------------------
996 // TextColumn
997 //------------------------------------------------------------------------
998 
999 TextColumn::TextColumn(GList *paragraphsA, double xMinA, double yMinA,
1000  double xMaxA, double yMaxA) {
1001  paragraphs = paragraphsA;
1002  xMin = xMinA;
1003  yMin = yMinA;
1004  xMax = xMaxA;
1005  yMax = yMaxA;
1006  px = py = 0;
1007  pw = ph = 0;
1008 }
1009 
1012 }
1013 
1015  TextParagraph *par;
1016  TextLine *line;
1017 
1018  par = (TextParagraph *)paragraphs->get(0);
1019  line = (TextLine *)par->getLines()->get(0);
1020  return line->getRotation();
1021 }
1022 
1023 int TextColumn::cmpX(const void *p1, const void *p2) {
1024  const TextColumn *col1 = *(const TextColumn **)p1;
1025  const TextColumn *col2 = *(const TextColumn **)p2;
1026 
1027  if (col1->xMin < col2->xMin) {
1028  return -1;
1029  } else if (col1->xMin > col2->xMin) {
1030  return 1;
1031  } else {
1032  return 0;
1033  }
1034 }
1035 
1036 int TextColumn::cmpY(const void *p1, const void *p2) {
1037  const TextColumn *col1 = *(const TextColumn **)p1;
1038  const TextColumn *col2 = *(const TextColumn **)p2;
1039 
1040  if (col1->yMin < col2->yMin) {
1041  return -1;
1042  } else if (col1->yMin > col2->yMin) {
1043  return 1;
1044  } else {
1045  return 0;
1046  }
1047 }
1048 
1049 int TextColumn::cmpPX(const void *p1, const void *p2) {
1050  const TextColumn *col1 = *(const TextColumn **)p1;
1051  const TextColumn *col2 = *(const TextColumn **)p2;
1052 
1053  if (col1->px < col2->px) {
1054  return -1;
1055  } else if (col1->px > col2->px) {
1056  return 1;
1057  } else {
1058  return 0;
1059  }
1060 }
1061 
1062 //------------------------------------------------------------------------
1063 // TextWordList
1064 //------------------------------------------------------------------------
1065 
1067  words = wordsA;
1068  primaryLR = primaryLRA;
1069 }
1070 
1073 }
1074 
1076  return words->getLength();
1077 }
1078 
1080  if (idx < 0 || idx >= words->getLength()) {
1081  return NULL;
1082  }
1083  return (TextWord *)words->get(idx);
1084 }
1085 
1086 //------------------------------------------------------------------------
1087 // TextPosition
1088 //------------------------------------------------------------------------
1089 
1091  return colIdx == pos.colIdx &&
1092  parIdx == pos.parIdx &&
1093  lineIdx == pos.lineIdx &&
1094  charIdx == pos.charIdx;
1095 }
1096 
1098  return colIdx != pos.colIdx ||
1099  parIdx != pos.parIdx ||
1100  lineIdx != pos.lineIdx ||
1101  charIdx != pos.charIdx;
1102 }
1103 
1105  return colIdx < pos.colIdx ||
1106  (colIdx == pos.colIdx &&
1107  (parIdx < pos.parIdx ||
1108  (parIdx == pos.parIdx &&
1109  (lineIdx < pos.lineIdx ||
1110  (lineIdx == pos.lineIdx &&
1111  charIdx < pos.charIdx)))));
1112 }
1113 
1115  return colIdx > pos.colIdx ||
1116  (colIdx == pos.colIdx &&
1117  (parIdx > pos.parIdx ||
1118  (parIdx == pos.parIdx &&
1119  (lineIdx > pos.lineIdx ||
1120  (lineIdx == pos.lineIdx &&
1121  charIdx > pos.charIdx)))));
1122 }
1123 
1124 //------------------------------------------------------------------------
1125 // TextPage
1126 //------------------------------------------------------------------------
1127 
1129  control = *controlA;
1131  uBufSize = 16;
1132  uBuf = (Unicode *)gmallocn(uBufSize, sizeof(Unicode));
1133  pageWidth = pageHeight = 0;
1134  charPos = 0;
1135  curFont = NULL;
1136  curFontSize = 0;
1137  curRot = 0;
1138  diagonal = gFalse;
1139  rotated = gFalse;
1140  nTinyChars = 0;
1141  actualText = NULL;
1142  actualTextLen = 0;
1143  actualTextX0 = 0;
1144  actualTextY0 = 0;
1145  actualTextX1 = 0;
1146  actualTextY1 = 0;
1147  actualTextNBytes = 0;
1148 
1149  chars = new GList();
1150  fonts = new GList();
1151 
1152  underlines = new GList();
1153  links = new GList();
1154 
1155  findCols = NULL;
1156  lastFindXMin = lastFindYMin = 0;
1157  haveLastFind = gFalse;
1158 
1159  problematic = gFalse;
1160 }
1161 
1163  clear();
1168  if (findCols) {
1170  }
1171  gfree(uBuf);
1172 }
1173 
1175  clear();
1176  if (state) {
1177  pageWidth = state->getPageWidth();
1178  pageHeight = state->getPageHeight();
1179  } else {
1180  pageWidth = pageHeight = 0;
1181  }
1182 }
1183 
1185  pageWidth = pageHeight = 0;
1186  charPos = 0;
1187  curFont = NULL;
1188  curFontSize = 0;
1189  curRot = 0;
1190  diagonal = gFalse;
1191  rotated = gFalse;
1192  nTinyChars = 0;
1193  gfree(actualText);
1194  actualText = NULL;
1195  actualTextLen = 0;
1196  actualTextNBytes = 0;
1198  chars = new GList();
1200  fonts = new GList();
1202  underlines = new GList();
1204  links = new GList();
1205 
1206  if (findCols) {
1208  findCols = NULL;
1209  }
1210  lastFindXMin = lastFindYMin = 0;
1211  haveLastFind = gFalse;
1212 
1213  problematic = gFalse;
1214 }
1215 
1217  GfxFont *gfxFont;
1218  double *fm;
1219  char *name;
1220  int code, mCode, letterCode, anyCode;
1221  double w;
1222  double m[4], m2[4];
1223  int i;
1224 
1225  // get the font info object
1226  curFont = NULL;
1227  for (i = 0; i < fonts->getLength(); ++i) {
1228  curFont = (TextFontInfo *)fonts->get(i);
1229  if (curFont->matches(state)) {
1230  break;
1231  }
1232  curFont = NULL;
1233  }
1234  if (!curFont) {
1235  curFont = new TextFontInfo(state);
1236  fonts->append(curFont);
1237  if (state->getFont() && state->getFont()->problematicForUnicode()) {
1238  problematic = gTrue;
1239  }
1240  }
1241 
1242  // adjust the font size
1243  gfxFont = state->getFont();
1244  curFontSize = state->getTransformedFontSize();
1245  if (gfxFont && gfxFont->getType() == fontType3) {
1246  // This is a hack which makes it possible to deal with some Type 3
1247  // fonts. The problem is that it's impossible to know what the
1248  // base coordinate system used in the font is without actually
1249  // rendering the font. This code tries to guess by looking at the
1250  // width of the character 'm' (which breaks if the font is a
1251  // subset that doesn't contain 'm').
1252  mCode = letterCode = anyCode = -1;
1253  for (code = 0; code < 256; ++code) {
1254  name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1255  if (name && name[0] == 'm' && name[1] == '\0') {
1256  mCode = code;
1257  }
1258  if (letterCode < 0 &&
1259  name &&
1260  ((name[0] >= 'A' && name[0] <= 'Z') ||
1261  (name[0] >= 'a' && name[0] <= 'z')) &&
1262  name[1] == '\0') {
1263  letterCode = code;
1264  }
1265  if (anyCode < 0 && name &&
1266  ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)code) > 0) {
1267  anyCode = code;
1268  }
1269  }
1270  if (mCode >= 0 &&
1271  (w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)mCode)) > 0) {
1272  // 0.6 is a generic average 'm' width -- yes, this is a hack
1273  curFontSize *= w / 0.6;
1274  } else if (letterCode >= 0 &&
1275  (w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)letterCode))
1276  > 0) {
1277  // even more of a hack: 0.5 is a generic letter width
1278  curFontSize *= w / 0.5;
1279  } else if (anyCode >= 0 &&
1280  (w = ((Gfx8BitFont *)gfxFont)->getWidth((Guchar)anyCode)) > 0) {
1281  // better than nothing: 0.5 is a generic character width
1282  curFontSize *= w / 0.5;
1283  }
1284  fm = gfxFont->getFontMatrix();
1285  if (fm[0] != 0) {
1286  curFontSize *= fabs(fm[3] / fm[0]);
1287  }
1288  }
1289 
1290  // compute the rotation
1291  state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1292  if (gfxFont && gfxFont->getType() == fontType3) {
1293  fm = gfxFont->getFontMatrix();
1294  m2[0] = fm[0] * m[0] + fm[1] * m[2];
1295  m2[1] = fm[0] * m[1] + fm[1] * m[3];
1296  m2[2] = fm[2] * m[0] + fm[3] * m[2];
1297  m2[3] = fm[2] * m[1] + fm[3] * m[3];
1298  m[0] = m2[0];
1299  m[1] = m2[1];
1300  m[2] = m2[2];
1301  m[3] = m2[3];
1302  }
1303  if (curFontSize == 0) {
1304  // special case - if the font size is zero, just assume plain
1305  // horizontal text
1306  curRot = 0;
1307  diagonal = gFalse;
1308  } else if (fabs(m[0]) >= fabs(m[1])) {
1309  if (m[0] > 0) {
1310  curRot = 0;
1311  } else {
1312  curRot = 2;
1313  }
1314  diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
1315  } else {
1316  if (m[1] > 0) {
1317  curRot = 1;
1318  } else {
1319  curRot = 3;
1320  }
1321  diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
1322  }
1323  // this matches the 'horiz' test in SplashOutputDev::drawChar()
1324  rotated = !(m[0] > 0 && fabs(m[1]) < 0.001 &&
1325  fabs(m[2]) < 0.001 && m[3] < 0);
1326 }
1327 
// Add the character with code c, positioned at (x,y) with
// displacement (dx,dy), to this page.  u[0..uLen-1] is its Unicode
// mapping; after remapping, one TextChar per resulting Unicode value
// is appended to 'chars'.  nBytes is added to charPos on every return
// path except the ActualText one, where it is accumulated in
// actualTextNBytes instead.
// NOTE: this listing skips several source lines (numbering gaps at
// 1353-1354, 1373, 1375, 1383, 1475 and 1501), so some conditions and
// arguments below are not visible here.
1328 void TextPage::addChar(GfxState *state, double x, double y,
1329  double dx, double dy,
1330  CharCode c, int nBytes, Unicode *u, int uLen) {
1331  double x1, y1, x2, y2, w1, h1, dx2, dy2, ascent, descent, sp;
1332  double xMin, yMin, xMax, yMax, xMid, yMid;
1333  double clipXMin, clipYMin, clipXMax, clipYMax;
1334  GfxRGB rgb;
1335  double alpha;
1336  GBool clipped, rtl;
1337  int uBufLen, i, j;
1338 
1339  // if we're in an ActualText span, save the position info (the
1340  // ActualText chars will be added by TextPage::endActualText()).
1341  if (actualText) {
// the start point is recorded only for the first char of the span
// (byte count still zero); the end point is overwritten every time
1342  if (!actualTextNBytes) {
1343  actualTextX0 = x;
1344  actualTextY0 = y;
1345  }
1346  actualTextX1 = x + dx;
1347  actualTextY1 = y + dy;
1348  actualTextNBytes += nBytes;
1349  return;
1350  }
1351 
1352  // throw away diagonal/rotated chars
// (the 'if' condition on listing lines 1353-1354 is not shown here)
1355  charPos += nBytes;
1356  return;
1357  }
1358 
1359  // subtract char and word spacing from the dx,dy values
1360  sp = state->getCharSpace();
// word spacing is added only for character code 0x20
1361  if (c == (CharCode)0x20) {
1362  sp += state->getWordSpace();
1363  }
1364  state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1365  dx -= dx2;
1366  dy -= dy2;
// (w1,h1) is the spacing-free displacement after state->transformDelta()
1367  state->transformDelta(dx, dy, &w1, &h1);
1368 
1369  // throw away chars that aren't inside the page bounds
1370  // (and also do a sanity check on the character size)
1371  state->transform(x, y, &x1, &y1);
// (two terms of this condition, listing lines 1373 and 1375, are not
// shown here)
1372  if (x1 + w1 < control.marginLeft ||
1374  y1 + h1 < control.marginTop ||
1376  w1 > pageWidth ||
1377  h1 > pageHeight) {
1378  charPos += nBytes;
1379  return;
1380  }
1381 
1382  // check the tiny chars limit
// (the first line of this 'if', listing line 1383, is not shown here)
1384  fabs(w1) < 3 && fabs(h1) < 3) {
// once more than 50000 such chars have been counted, further ones
// are dropped
1385  if (++nTinyChars > 50000) {
1386  charPos += nBytes;
1387  return;
1388  }
1389  }
1390 
1391  // skip space, tab, and non-breaking space characters
1392  // (ActualText spans can result in multiple space chars)
1393  for (i = 0; i < uLen; ++i) {
1394  if (u[i] != (Unicode)0x20 &&
1395  u[i] != (Unicode)0x09 &&
1396  u[i] != (Unicode)0xa0) {
1397  break;
1398  }
1399  }
// i == uLen means every Unicode value was one of the three above; no
// TextChar is created, but the previously added char (if any) is
// flagged as being followed by a space
1400  if (i == uLen && uLen >= 1) {
1401  charPos += nBytes;
1402  if (chars->getLength() > 0) {
1403  ((TextChar *)chars->get(chars->getLength() - 1))->spaceAfter =
1404  (char)gTrue;
1405  }
1406  return;
1407  }
1408 
1409  // remap Unicode
1410  uBufLen = 0;
1411  for (i = 0; i < uLen; ++i) {
// double the buffer whenever fewer than 8 slots remain, but stop
// growing once uBufSize has reached 20000
1412  if (uBufSize - uBufLen < 8 && uBufSize < 20000) {
1413  uBufSize *= 2;
1414  uBuf = (Unicode *)greallocn(uBuf, uBufSize, sizeof(Unicode));
1415  }
// map() is given the remaining capacity and its return value is added
// to the output count
1416  uBufLen += remapping->map(u[i], uBuf + uBufLen, uBufSize - uBufLen);
1417  }
1418 
1419  // add the characters
1420  if (uBufLen > 0) {
1421 
1422  // handle right-to-left ligatures: if there are multiple Unicode
1423  // characters, and they're all right-to-left, insert them in
1424  // right-to-left order
1425  if (uBufLen > 1) {
1426  rtl = gTrue;
1427  for (i = 0; i < uBufLen; ++i) {
1428  if (!unicodeTypeR(uBuf[i])) {
1429  rtl = gFalse;
1430  break;
1431  }
1432  }
1433  } else {
1434  rtl = gFalse;
1435  }
1436 
1437  // compute the bounding box
// the displacement is split evenly between the uBufLen output chars
1438  w1 /= uBufLen;
1439  h1 /= uBufLen;
1440  ascent = curFont->ascent * curFontSize;
1441  descent = curFont->descent * curFontSize;
1442  for (i = 0; i < uBufLen; ++i) {
// origin of the i-th slice of the glyph
1443  x2 = x1 + i * w1;
1444  y2 = y1 + i * h1;
// the box is built from the slice origin, the per-slice displacement
// and the scaled ascent/descent; which axis carries which depends on
// curRot
1445  switch (curRot) {
1446  case 0:
1447  default:
1448  xMin = x2;
1449  xMax = x2 + w1;
1450  yMin = y2 - ascent;
1451  yMax = y2 - descent;
1452  break;
1453  case 1:
1454  xMin = x2 + descent;
1455  xMax = x2 + ascent;
1456  yMin = y2;
1457  yMax = y2 + h1;
1458  break;
1459  case 2:
1460  xMin = x2 + w1;
1461  xMax = x2;
1462  yMin = y2 + descent;
1463  yMax = y2 + ascent;
1464  break;
1465  case 3:
1466  xMin = x2 - ascent;
1467  xMax = x2 - descent;
1468  yMin = y2 + h1;
1469  yMax = y2;
1470  break;
1471  }
1472 
1473  // check for clipping
1474  clipped = gFalse;
// (the 'if' that opens this block, listing line 1475, is not shown
// here)  A char counts as clipped when the centre of its box lies
// outside the clip bounding box.
1476  state->getClipBBox(&clipXMin, &clipYMin, &clipXMax, &clipYMax);
1477  xMid = 0.5 * (xMin + xMax);
1478  yMid = 0.5 * (yMin + yMax);
1479  if (xMid < clipXMin || xMid > clipXMax ||
1480  yMid < clipYMin || yMid > clipYMax) {
1481  clipped = gTrue;
1482  }
1483  }
1484 
// stroke colour/opacity are used when the low two bits of the render
// mode equal 1; otherwise the fill colour/opacity are used
1485  if ((state->getRender() & 3) == 1) {
1486  state->getStrokeRGB(&rgb);
1487  alpha = state->getStrokeOpacity();
1488  } else {
1489  state->getFillRGB(&rgb);
1490  alpha = state->getFillOpacity();
1491  }
// for an all-right-to-left group the Unicode values are taken from
// uBuf in reverse order
1492  if (rtl) {
1493  j = uBufLen - 1 - i;
1494  } else {
1495  j = i;
1496  }
// every TextChar of the group shares the same charPos and nBytes
// (one argument line, listing line 1501, is not shown here)
1497  chars->append(new TextChar(uBuf[j], charPos, nBytes,
1498  xMin, yMin, xMax, yMax,
1499  curRot, rotated, clipped,
1500  state->getRender() == 3 || alpha < 0.001,
1502  colToDbl(rgb.r), colToDbl(rgb.g),
1503  colToDbl(rgb.b)));
1504  }
1505  }
1506 
1507  charPos += nBytes;
1508 }
1509 
1510 void TextPage::incCharCount(int nChars) {
1511  charPos += nChars;
1512 }
1513 
// NOTE: the function header (listing line 1514) is missing from this
// listing; presumably this is the body of TextPage::beginActualText,
// taking the parameters 'u' and 'uLen' used below -- confirm against
// the original source.
// Starts a new ActualText span: any buffer left over from a previous
// span is released, uLen Unicode values are copied from u into a
// freshly allocated buffer, and the span's byte counter is reset.
1515  if (actualText) {
1516  gfree(actualText);
1517  }
1518  actualText = (Unicode *)gmallocn(uLen, sizeof(Unicode));
1519  memcpy(actualText, u, uLen * sizeof(Unicode));
1520  actualTextLen = uLen;
// zero bytes seen so far; addChar() uses this to detect the first
// char of the span
1521  actualTextNBytes = 0;
1522 }
1523 
// NOTE: the function header (listing line 1524) is missing from this
// listing; the comment in addChar() refers to this routine as
// TextPage::endActualText().  Listing lines 1533-1535 and 1540 are
// also missing, so the call that re-submits the text is not visible.
// Ends the current ActualText span and releases its buffer.
1525  Unicode *u;
1526 
// keep the buffer in a local so it can still be used and freed after
// the member has been cleared
1527  u = actualText;
1528  actualText = NULL; // so we can call TextPage::addChar()
1529  if (actualTextNBytes) {
1530  // now that we have the position info for all of the text inside
1531  // the marked content span, we feed the "ActualText" back through
1532  // addChar()
1536  }
1537  gfree(u);
1538  actualText = NULL;
1539  actualTextLen = 0;
1541 }
1542 
1543 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
1544  underlines->append(new TextUnderline(x0, y0, x1, y1));
1545 }
1546 
1547 void TextPage::addLink(double xMin, double yMin, double xMax, double yMax,
1548  Link *link) {
1549  GString *uri;
1550 
1551  if (link && link->getAction() && link->getAction()->getKind() == actionURI) {
1552  uri = ((LinkURI *)link->getAction())->getURI()->copy();
1553  links->append(new TextLink(xMin, yMin, xMax, yMax, uri));
1554  }
1555 }
1556 
1557 //------------------------------------------------------------------------
1558 // TextPage: output
1559 //------------------------------------------------------------------------
1560 
1561 void TextPage::write(void *outputStream, TextOutputFunc outputFunc) {
1562  UnicodeMap *uMap;
1563  char space[8], eol[16], eop[8];
1564  int spaceLen, eolLen, eopLen;
1565  GBool pageBreaks;
1566 
1567  // get the output encoding
1568  if (!(uMap = globalParams->getTextEncoding())) {
1569  return;
1570  }
1571  spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1572  eolLen = 0; // make gcc happy
1573  switch (globalParams->getTextEOL()) {
1574  case eolUnix:
1575  eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
1576  break;
1577  case eolDOS:
1578  eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1579  eolLen += uMap->mapUnicode(0x0a, eol + eolLen, (int)sizeof(eol) - eolLen);
1580  break;
1581  case eolMac:
1582  eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1583  break;
1584  }
1585  eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
1586  pageBreaks = globalParams->getTextPageBreaks();
1587 
1588  switch (control.mode) {
1589  case textOutReadingOrder:
1590  writeReadingOrder(outputStream, outputFunc, uMap, space, spaceLen,
1591  eol, eolLen);
1592  break;
1593  case textOutPhysLayout:
1594  case textOutTableLayout:
1595  writePhysLayout(outputStream, outputFunc, uMap, space, spaceLen,
1596  eol, eolLen);
1597  break;
1598  case textOutSimpleLayout:
1599  writeSimpleLayout(outputStream, outputFunc, uMap, space, spaceLen,
1600  eol, eolLen);
1601  break;
1602  case textOutSimple2Layout:
1603  writeSimple2Layout(outputStream, outputFunc, uMap, space, spaceLen,
1604  eol, eolLen);
1605  break;
1606  case textOutLinePrinter:
1607  writeLinePrinter(outputStream, outputFunc, uMap, space, spaceLen,
1608  eol, eolLen);
1609  break;
1610  case textOutRawOrder:
1611  writeRaw(outputStream, outputFunc, uMap, space, spaceLen,
1612  eol, eolLen);
1613  break;
1614  }
1615 
1616  // end of page
1617  if (pageBreaks) {
1618  (*outputFunc)(outputStream, eop, eopLen);
1619  }
1620 
1621  uMap->decRefCnt();
1622 }
1623 
// Write the page text in reading order.  The chars are rotated to the
// primary rotation, split into a block tree, and turned into columns;
// each paragraph is then written as a single output line followed by
// eol, and each column is followed by one more eol.
// NOTE: this listing skips source lines 1642, 1648, 1662-1663 and
// 1698, so the condition guarding separateOverlappingText(), the
// assignment of primaryLR, the body of the control.html branch and
// the final statement are not visible here.
1624 void TextPage::writeReadingOrder(void *outputStream,
1625  TextOutputFunc outputFunc,
1626  UnicodeMap *uMap,
1627  char *space, int spaceLen,
1628  char *eol, int eolLen) {
1629  TextBlock *tree;
1630  TextColumn *col;
1631  TextParagraph *par;
1632  TextLine *line;
1633  GList *overlappingChars;
1634  GList *columns;
1635  GBool primaryLR;
1636  GString *s;
1637  int colIdx, parIdx, lineIdx, rot, n;
1638 
1639 #if 0 //~debug
1640  dumpChars(chars);
1641 #endif
// (the 'if' that opens this branch, listing line 1642, is not shown)
1643  overlappingChars = separateOverlappingText(chars);
1644  } else {
1645  overlappingChars = NULL;
1646  }
1647  rot = rotateChars(chars);
1649  tree = splitChars(chars);
1650 #if 0 //~debug
1651  dumpTree(tree);
1652 #endif
1653  if (!tree) {
1654  // no text
// NOTE(review): overlappingChars is not released on this path --
// confirm whether it can be non-NULL when splitChars() returns NULL
1655  unrotateChars(chars, rot);
1656  return;
1657  }
1658  columns = buildColumns(tree, primaryLR);
1659  delete tree;
1660  unrotateChars(chars, rot);
1661  if (control.html) {
1664  }
// overlapping text, if any, becomes one extra column at the end
1665  if (overlappingChars) {
1666  if (overlappingChars->getLength() > 0) {
1667  columns->append(buildOverlappingTextColumn(overlappingChars));
1668  }
1669  deleteGList(overlappingChars, TextChar);
1670  }
1671 #if 0 //~debug
1672  dumpColumns(columns);
1673 #endif
1674 
1675  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1676  col = (TextColumn *)columns->get(colIdx);
1677  for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
1678  par = (TextParagraph *)col->paragraphs->get(parIdx);
1679  for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
1680  line = (TextLine *)par->lines->get(lineIdx);
1681  n = line->len;
// drop the last character of a hyphenated line when another line
// follows in the same paragraph
1682  if (line->hyphenated && lineIdx + 1 < par->lines->getLength()) {
1683  --n;
1684  }
1685  s = new GString();
1686  encodeFragment(line->text, n, uMap, primaryLR, s);
// lines of a paragraph are joined with a space, except after a
// hyphenated line
1687  if (lineIdx + 1 < par->lines->getLength() && !line->hyphenated) {
1688  s->append(space, spaceLen);
1689  }
1690  (*outputFunc)(outputStream, s->getCString(), s->getLength());
1691  delete s;
1692  }
// end of paragraph
1693  (*outputFunc)(outputStream, eol, eolLen);
1694  }
// extra eol after each column
1695  (*outputFunc)(outputStream, eol, eolLen);
1696  }
1697 
// (listing line 1698 is not shown here)
1699 }
1700 
// NOTE: the function header (listing line 1701) and listing lines
// 1708-1713, 1715, 1721 and 1732 are missing from this listing;
// presumably this is TextPage::makeColumns -- confirm against the
// original source.  The visible part builds a GList of columns from
// 'chars' (an empty list when splitChars() finds no text) and returns
// it.
1702  TextBlock *tree;
1703  GList *overlappingChars;
1704  GList *columns;
1705  GBool primaryLR;
1706  int rot;
1707 
// (the branch that precedes this 'else' is not shown here)
1714  } else {
1716  overlappingChars = separateOverlappingText(chars);
1717  } else {
1718  overlappingChars = NULL;
1719  }
1720  rot = rotateChars(chars);
1722  if ((tree = splitChars(chars))) {
1723  columns = buildColumns(tree, primaryLR);
1724  delete tree;
1725  } else {
1726  // no text
1727  columns = new GList();
1728  }
// undo the rotation on both the chars and the newly built columns
1729  unrotateChars(chars, rot);
1730  unrotateColumns(columns, rot);
1731  if (control.html) {
1733  }
// overlapping text, if any, becomes one extra column at the end
1734  if (overlappingChars) {
1735  if (overlappingChars->getLength() > 0) {
1736  columns->append(buildOverlappingTextColumn(overlappingChars));
1737  }
1738  deleteGList(overlappingChars, TextChar);
1739  }
1740  }
1741  return columns;
1742 }
1743 
1744 // This handles both physical layout and table layout modes.
// The columns are rendered into an array of 'ph' output rows: each
// text line is placed in a row, padded with spaces up to its
// horizontal position, and the rows are then written one per output
// line.  Overlapping text, if any, is written afterwards.
// NOTE: this listing skips source lines 1767, 1773, 1788-1789, 1791
// and 1843, so the condition guarding separateOverlappingText(), the
// assignments of primaryLR and ph, the body of the control.html
// branch and one statement after the gfree() calls are not visible
// here.
1745 void TextPage::writePhysLayout(void *outputStream,
1746  TextOutputFunc outputFunc,
1747  UnicodeMap *uMap,
1748  char *space, int spaceLen,
1749  char *eol, int eolLen) {
1750  TextBlock *tree;
1751  GString **out;
1752  int *outLen;
1753  TextColumn *col;
1754  TextParagraph *par;
1755  TextLine *line;
1756  GList *overlappingChars;
1757  GList *columns;
1758  GBool primaryLR;
1759  int ph, colIdx, parIdx, lineIdx, rot, y, i;
1760 
1761 #if 0 //~debug
1762  dumpChars(chars);
1763 #endif
1764 #if 0 //~debug
1765  dumpUnderlines();
1766 #endif
// (the 'if' that opens this branch, listing line 1767, is not shown)
1768  overlappingChars = separateOverlappingText(chars);
1769  } else {
1770  overlappingChars = NULL;
1771  }
1772  rot = rotateChars(chars);
1774  tree = splitChars(chars);
1775 #if 0 //~debug
1776  dumpTree(tree);
1777 #endif
1778  if (!tree) {
1779  // no text
// NOTE(review): overlappingChars is not released on this path --
// confirm whether it can be non-NULL when splitChars() returns NULL
1780  unrotateChars(chars, rot);
1781  return;
1782  }
1783  //~ this doesn't correctly handle the right-to-left case
1784  columns = buildColumns(tree, gTrue);
1785  delete tree;
1786  unrotateChars(chars, rot);
1787  if (control.html) {
1790  }
1792 #if 0 //~debug
1793  dumpColumns(columns);
1794 #endif
1795 
// one (lazily created) string and one width counter per output row;
// the assignment of ph is on a listing line that is not shown here
1796  out = (GString **)gmallocn(ph, sizeof(GString *));
1797  outLen = (int *)gmallocn(ph, sizeof(int));
1798  for (i = 0; i < ph; ++i) {
1799  out[i] = NULL;
1800  outLen[i] = 0;
1801  }
1802 
1803  columns->sort(&TextColumn::cmpPX);
1804  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1805  col = (TextColumn *)columns->get(colIdx);
// y is the current output row, starting at the column's py; rows at
// or beyond ph are never written
1806  y = col->py;
1807  for (parIdx = 0;
1808  parIdx < col->paragraphs->getLength() && y < ph;
1809  ++parIdx) {
1810  par = (TextParagraph *)col->paragraphs->get(parIdx);
1811  for (lineIdx = 0;
1812  lineIdx < par->lines->getLength() && y < ph;
1813  ++lineIdx) {
1814  line = (TextLine *)par->lines->get(lineIdx);
1815  if (!out[y]) {
1816  out[y] = new GString();
1817  }
// pad the row with spaces until its width reaches col->px + line->px
1818  while (outLen[y] < col->px + line->px) {
1819  out[y]->append(space, spaceLen);
1820  ++outLen[y];
1821  }
1822  encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
// the row width advances by line->pw, not by the encoded byte count
1823  outLen[y] += line->pw;
1824  ++y;
1825  }
// leave one blank row between paragraphs of the same column
1826  if (parIdx + 1 < col->paragraphs->getLength()) {
1827  ++y;
1828  }
1829  }
1830  }
1831 
// emit every row (empty rows produce just an eol)
1832  for (i = 0; i < ph; ++i) {
1833  if (out[i]) {
1834  (*outputFunc)(outputStream, out[i]->getCString(), out[i]->getLength());
1835  delete out[i];
1836  }
1837  (*outputFunc)(outputStream, eol, eolLen);
1838  }
1839 
1840  gfree(out);
1841  gfree(outLen);
1842 
1844 
// overlapping text is written after the laid-out rows: an eol, then
// one output line per text line, with an extra eol between paragraphs
1845  if (overlappingChars) {
1846  if (overlappingChars->getLength() > 0) {
// note: this declaration shadows the outer 'col'
1847  TextColumn *col = buildOverlappingTextColumn(overlappingChars);
1848  (*outputFunc)(outputStream, eol, eolLen);
1849  for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
1850  par = (TextParagraph *)col->paragraphs->get(parIdx);
1851  for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
1852  line = (TextLine *)par->lines->get(lineIdx);
1853  GString *s = new GString();
1854  encodeFragment(line->text, line->len, uMap, primaryLR, s);
1855  s->append(eol, eolLen);
1856  (*outputFunc)(outputStream, s->getCString(), s->getLength());
1857  delete s;
1858  }
1859  if (parIdx < col->paragraphs->getLength() - 1) {
1860  (*outputFunc)(outputStream, eol, eolLen);
1861  }
1862  }
1863  delete col;
1864  }
1865  deleteGList(overlappingChars, TextChar);
1866  }
1867 }
1868 
// Write the page text in simple layout mode.  The chars are split
// into a block tree, "super lines" are built from it, and each super
// line is written as one output row in which every text line is
// padded with spaces up to its px position.
// NOTE: this listing skips source line 1886, so the assignment of
// primaryLR is not visible here.
1869 void TextPage::writeSimpleLayout(void *outputStream,
1870  TextOutputFunc outputFunc,
1871  UnicodeMap *uMap,
1872  char *space, int spaceLen,
1873  char *eol, int eolLen) {
1874  TextBlock *tree;
1875  TextSuperLine *superLine0, *superLine1;
1876  TextLine *line;
1877  GList *superLines;
1878  GString *out;
1879  GBool primaryLR;
1880  int rot, x, i, j;
1881 
1882 #if 0 //~debug
1883  dumpChars(chars);
1884 #endif
1885  rot = rotateChars(chars);
1887  tree = splitChars(chars);
1888 #if 0 //~debug
1889  dumpTree(tree);
1890 #endif
1891  if (!tree) {
1892  // no text
1893  unrotateChars(chars, rot);
1894  return;
1895  }
1896  superLines = new GList();
1897  buildSuperLines(tree, superLines);
1898  delete tree;
1899  unrotateChars(chars, rot);
1900  assignSimpleLayoutPositions(superLines, uMap);
1901 
1902  for (i = 0; i < superLines->getLength(); ++i) {
1903  superLine0 = (TextSuperLine *)superLines->get(i);
1904  out = new GString();
// x is the current width of the output row
1905  x = 0;
1906  for (j = 0; j < superLine0->lines->getLength(); ++j) {
1907  line = (TextLine *)superLine0->lines->get(j);
// pad with spaces up to the line's px position
1908  while (x < line->px) {
1909  out->append(space, spaceLen);
1910  ++x;
1911  }
1912  encodeFragment(line->text, line->len, uMap, primaryLR, out);
// the row width advances by line->pw, not by the encoded byte count
1913  x += line->pw;
1914  }
1915  (*outputFunc)(outputStream, out->getCString(), out->getLength());
1916  delete out;
1917  (*outputFunc)(outputStream, eol, eolLen);
// insert a blank line when the vertical gap to the next super line
// is larger than this super line's font size
1918  if (i + 1 < superLines->getLength()) {
1919  superLine1 = (TextSuperLine *)superLines->get(i + 1);
1920  if (superLine1->yMin - superLine0->yMax > 1.0 * superLine0->fontSize) {
1921  (*outputFunc)(outputStream, eol, eolLen);
1922  }
1923  }
1924  }
1925 
1926  deleteGList(superLines, TextSuperLine);
1927 }
1928 
// Write the page text in "simple2" layout mode: every text line of
// every paragraph of every column is written as its own output line,
// each followed by eol.
// NOTE: this listing skips source lines 1942-1943, 1947-1949 and
// 1966, so the statements that set up 'columns' and 'primaryLR' and
// the final statement are not visible here.
1929 void TextPage::writeSimple2Layout(void *outputStream,
1930  TextOutputFunc outputFunc,
1931  UnicodeMap *uMap,
1932  char *space, int spaceLen,
1933  char *eol, int eolLen) {
1934  GList *columns;
1935  TextColumn *col;
1936  TextParagraph *par;
1937  TextLine *line;
1938  GString *out;
1939  GBool primaryLR;
1940  int colIdx, parIdx, lineIdx;
1941 
1944 #if 0 //~debug
1945  dumpChars(chars);
1946 #endif
1950 
1951  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1952  col = (TextColumn *)columns->get(colIdx);
1953  for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
1954  par = (TextParagraph *)col->paragraphs->get(parIdx);
1955  for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
1956  line = (TextLine *)par->lines->get(lineIdx);
// encode the line into a temporary string, write it, then write eol
1957  out = new GString();
1958  encodeFragment(line->text, line->len, uMap, primaryLR, out);
1959  (*outputFunc)(outputStream, out->getCString(), out->getLength());
1960  delete out;
1961  (*outputFunc)(outputStream, eol, eolLen);
1962  }
1963  }
1964  }
1965 
// (listing line 1966 is not shown here)
1967 }
1968 
// Write the page text in line printer mode: the chars are placed on a
// fixed grid of character cells ('pitch' wide, 'lineSpacing' high).
// Both grid dimensions come from 'control' when set there (> 0), and
// are otherwise estimated from the chars themselves.
// NOTE: this listing skips source lines 1984 and 1987, so the
// statements that run between rotateChars() and the pitch computation
// are not visible here.  The line-spacing scan and the per-line
// collection loop below walk 'chars' in index order, so they
// presumably rely on the list having been sorted by vertical
// position -- confirm against the original source.
1969 void TextPage::writeLinePrinter(void *outputStream,
1970  TextOutputFunc outputFunc,
1971  UnicodeMap *uMap,
1972  char *space, int spaceLen,
1973  char *eol, int eolLen) {
1974  TextChar *ch, *ch2;
1975  GList *line;
1976  GString *s;
1977  char buf[8];
1978  double pitch, lineSpacing, delta;
1979  double yMin0, yShift, xMin0, xShift;
1980  double y, x;
1981  int rot, n, i, j, k;
1982 
1983  rot = rotateChars(chars);
1985  // don't call removeDuplicates here, because it expects to be
1986  // working on a secondary list that doesn't own the TextChar objects
1988 
1989  // get character pitch
1990  if (control.fixedPitch > 0) {
1991  pitch = control.fixedPitch;
1992  } else {
1993  // compute (approximate) character pitch
// start from the page width and shrink to the smallest xMin
// difference (above 0.01) found between any two chars whose adjusted
// vertical extents overlap; every pair of chars is examined
1994  pitch = pageWidth;
1995  for (i = 0; i < chars->getLength(); ++i) {
1996  ch = (TextChar *)chars->get(i);
1997  for (j = i + 1; j < chars->getLength(); ++j) {
1998  ch2 = (TextChar *)chars->get(j);
1999  if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) <
2000  ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin) &&
2001  ch->yMin + ascentAdjustFactor * (ch->yMax - ch->yMin) <
2002  ch2->yMax - descentAdjustFactor * (ch2->yMax - ch2->yMin)) {
2003  delta = fabs(ch2->xMin - ch->xMin);
2004  if (delta > 0.01 && delta < pitch) {
2005  pitch = delta;
2006  }
2007  }
2008  }
2009  }
2010  }
2011 
2012  // get line spacing
2013  if (control.fixedLineSpacing > 0) {
2014  lineSpacing = control.fixedLineSpacing;
2015  } else {
2016  // compute (approximate) line spacing
// start from the page height and shrink to the smallest yMin
// difference found between a char and the next char that starts
// below its adjusted bottom
2017  lineSpacing = pageHeight;
2018  i = 0;
2019  while (i < chars->getLength()) {
2020  ch = (TextChar *)chars->get(i);
2021  // look for the first char that does not (substantially)
2022  // vertically overlap this one
2023  delta = 0;
// note: this inner loop also advances the outer index i
2024  for (++i; delta == 0 && i < chars->getLength(); ++i) {
2025  ch2 = (TextChar *)chars->get(i);
2026  if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) >
2027  ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin)) {
2028  delta = ch2->yMin - ch->yMin;
2029  }
2030  }
2031  if (delta > 0 && delta < lineSpacing) {
2032  lineSpacing = delta;
2033  }
2034  }
2035  }
2036 
2037  // shift the grid to avoid problems with floating point accuracy --
2038  // for fixed line spacing, this avoids problems with
2039  // dropping/inserting blank lines
// the shift puts the first char's yMin half a line spacing away from
// a grid boundary
2040  if (chars->getLength()) {
2041  yMin0 = ((TextChar *)chars->get(0))->yMin;
2042  yShift = yMin0 - (int)(yMin0 / lineSpacing + 0.5) * lineSpacing
2043  - 0.5 * lineSpacing;
2044  } else {
2045  yShift = 0;
2046  }
2047 
2048  // for each line...
2049  i = 0;
// note: j is assigned here but not read again in this function
2050  j = chars->getLength() - 1;
2051  for (y = yShift; y < pageHeight; y += lineSpacing) {
2052 
2053  // get the characters in this line
// 'line' is a secondary list: it borrows the TextChar pointers from
// 'chars', and only the list itself is deleted below
2054  line = new GList;
2055  while (i < chars->getLength() &&
2056  ((TextChar *)chars->get(i))->yMin < y + lineSpacing) {
2057  line->append(chars->get(i++));
2058  }
2059  line->sort(&TextChar::cmpX);
2060 
2061  // shift the grid to avoid problems with floating point accuracy
2062  // -- for fixed char spacing, this avoids problems with
2063  // dropping/inserting spaces
2064  if (line->getLength()) {
2065  xMin0 = ((TextChar *)line->get(0))->xMin;
2066  xShift = xMin0 - (int)(xMin0 / pitch + 0.5) * pitch - 0.5 * pitch;
2067  } else {
2068  xShift = 0;
2069  }
2070 
2071  // write the line
2072  s = new GString();
2073  x = xShift;
2074  k = 0;
// walk the grid cells left to right: emit the next char if it starts
// before the end of the current cell, otherwise emit a space
2075  while (k < line->getLength()) {
2076  ch = (TextChar *)line->get(k);
2077  if (ch->xMin < x + pitch) {
2078  n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
2079  s->append(buf, n);
2080  ++k;
2081  } else {
2082  s->append(space, spaceLen);
2083  n = spaceLen;
2084  }
// a Unicode encoding advances one cell per char; any other encoding
// advances one cell per output byte
// NOTE(review): with a non-Unicode map and spaceLen == 0, the 'else'
// branch above would advance neither x nor k -- confirm that spaceLen
// is always positive here
2085  x += (uMap->isUnicode() ? 1 : n) * pitch;
2086  }
2087  s->append(eol, eolLen);
2088  (*outputFunc)(outputStream, s->getCString(), s->getLength());
2089  delete s;
2090  delete line;
2091  }
2092 
2093  unrotateChars(chars, rot);
2094 }
2095 
2096 void TextPage::writeRaw(void *outputStream,
2097  TextOutputFunc outputFunc,
2098  UnicodeMap *uMap,
2099  char *space, int spaceLen,
2100  char *eol, int eolLen) {
2101  TextChar *ch, *ch2;
2102  GString *s;
2103  char buf[8];
2104  int n, i;
2105 
2106  s = new GString();
2107 
2108  for (i = 0; i < chars->getLength(); ++i) {
2109 
2110  // process one char
2111  ch = (TextChar *)chars->get(i);
2112  n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
2113  s->append(buf, n);
2114 
2115  // check for space or eol
2116  if (i+1 < chars->getLength()) {
2117  ch2 = (TextChar *)chars->get(i+1);
2118  if (ch2->rot != ch->rot) {
2119  s->append(eol, eolLen);
2120  } else {
2121  switch (ch->rot) {
2122  case 0:
2123  default:
2124  if (fabs(ch2->yMin - ch->yMin) > rawModeLineDelta * ch->fontSize ||
2125  ch2->xMin - ch->xMax < -rawModeCharOverlap * ch->fontSize) {
2126  s->append(eol, eolLen);
2127  } else if (ch->spaceAfter ||
2128  ch2->xMin - ch->xMax >
2129  rawModeWordSpacing * ch->fontSize) {
2130  s->append(space, spaceLen);
2131  }
2132  break;
2133  case 1:
2134  if (fabs(ch->xMax - ch2->xMax) > rawModeLineDelta * ch->fontSize ||
2135  ch2->yMin - ch->yMax < -rawModeCharOverlap * ch->fontSize) {
2136  s->append(eol, eolLen);
2137  } else if (ch->spaceAfter ||
2138  ch2->yMin - ch->yMax >
2139  rawModeWordSpacing * ch->fontSize) {
2140  s->append(space, spaceLen);
2141  }
2142  break;
2143  case 2:
2144  if (fabs(ch->yMax - ch2->yMax) > rawModeLineDelta * ch->fontSize ||
2145  ch->xMin - ch2->xMax < -rawModeCharOverlap * ch->fontSize) {
2146  s->append(eol, eolLen);
2147  } else if (ch->spaceAfter ||
2148  ch->xMin - ch2->xMax >
2149  rawModeWordSpacing * ch->fontSize) {
2150  s->append(space, spaceLen);
2151  }
2152  break;
2153  case 3:
2154  if (fabs(ch2->xMin - ch->xMin) > rawModeLineDelta * ch->fontSize ||
2155  ch->yMin - ch2->yMax < -rawModeCharOverlap * ch->fontSize) {
2156  s->append(eol, eolLen);
2157  } else if (ch->spaceAfter ||
2158  ch->yMin - ch2->yMax >
2159  rawModeWordSpacing * ch->fontSize) {
2160  s->append(space, spaceLen);
2161  }
2162  break;
2163  }
2164  }
2165  } else {
2166  s->append(eol, eolLen);
2167  }
2168 
2169  if (s->getLength() > 1000) {
2170  (*outputFunc)(outputStream, s->getCString(), s->getLength());
2171  s->clear();
2172  }
2173  }
2174 
2175  if (s->getLength() > 0) {
2176  (*outputFunc)(outputStream, s->getCString(), s->getLength());
2177  }
2178  delete s;
2179 }
2180 
// NOTE: the first line of the function header (listing line 2181) is
// missing from this listing; the callers above invoke this routine as
// encodeFragment(text, len, uMap, primaryLR, s).
// Encodes text[0..len-1] with uMap and appends the result to s.  For
// a Unicode output encoding, runs of right-to-left and left-to-right
// characters are reordered and bracketed with the directional
// formatting characters U+202A (lre), U+202B (rle) and U+202C
// (popdf); for any other encoding the characters are mapped one by
// one in their stored order.
2182  GBool primaryLR, GString *s) {
2183  char lre[8], rle[8], popdf[8], buf[8];
2184  int lreLen, rleLen, popdfLen, n;
2185  int i, j, k;
2186 
2187  if (uMap->isUnicode()) {
2188 
2189  lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
2190  rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
2191  popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
2192 
2193  if (primaryLR) {
2194 
// primary direction left-to-right: scan forward, alternating between
// left-to-right and right-to-left sections
2195  i = 0;
2196  while (i < len) {
2197  // output a left-to-right section
// the section runs up to the first char for which unicodeTypeR() is
// true, and is written in stored order
2198  for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
2199  for (k = i; k < j; ++k) {
2200  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
2201  s->append(buf, n);
2202  }
2203  i = j;
2204  // output a right-to-left section
// the section runs up to the first char for which unicodeTypeL() or
// unicodeTypeNum() is true, and is written in reverse order between
// rle and popdf
2205  for (j = i;
2206  j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
2207  ++j) ;
2208  if (j > i) {
2209  s->append(rle, rleLen);
2210  for (k = j - 1; k >= i; --k) {
2211  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
2212  s->append(buf, n);
2213  }
2214  s->append(popdf, popdfLen);
2215  i = j;
2216  }
2217  }
2218 
2219  } else {
2220 
2221  // Note: This code treats numeric characters (European and
2222  // Arabic/Indic) as left-to-right, which isn't strictly correct
2223  // (incurs extra LRE/POPDF pairs), but does produce correct
2224  // visual formatting.
// primary direction right-to-left: the whole fragment is wrapped in
// rle ... popdf and scanned from the last char towards the first
2225  s->append(rle, rleLen);
2226  i = len - 1;
2227  while (i >= 0) {
2228  // output a right-to-left section
2229  for (j = i;
2230  j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
2231  --j) ;
2232  for (k = i; k > j; --k) {
2233  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
2234  s->append(buf, n);
2235  }
2236  i = j;
2237  // output a left-to-right section
// written in stored (forward) order between lre and popdf
2238  for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
2239  if (j < i) {
2240  s->append(lre, lreLen);
2241  for (k = j + 1; k <= i; ++k) {
2242  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
2243  s->append(buf, n);
2244  }
2245  s->append(popdf, popdfLen);
2246  i = j;
2247  }
2248  }
2249  s->append(popdf, popdfLen);
2250  }
2251 
2252  } else {
// non-Unicode encoding: no reordering and no formatting characters
2253  for (i = 0; i < len; ++i) {
2254  n = uMap->mapUnicode(text[i], buf, sizeof(buf));
2255  s->append(buf, n);
2256  }
2257  }
2258 }
2259 
2260 //------------------------------------------------------------------------
2261 // TextPage: layout analysis
2262 //------------------------------------------------------------------------
2263 
2264 // Determine primary (most common) rotation value. Rotate all chars
2265 // to that primary rotation.
// Returns the primary rotation (0..3).  For rotations 1 and 3 the
// page dimensions are also exchanged.
// NOTE: the function header (listing line 2266) and listing lines
// 2304 and 2335 (the middle statement of each pageWidth/pageHeight
// swap) are missing from this listing.
2267  TextChar *ch;
2268  int nChars[4];
2269  double xMin, yMin, xMax, yMax, t;
2270  int rot, i;
2271 
2272  // determine primary rotation
// count the chars per rotation value; on a tie the lowest rotation
// value wins because only a strictly larger count replaces 'rot'
2273  nChars[0] = nChars[1] = nChars[2] = nChars[3] = 0;
2274  for (i = 0; i < charsA->getLength(); ++i) {
2275  ch = (TextChar *)charsA->get(i);
2276  ++nChars[ch->rot];
2277  }
2278  rot = 0;
2279  for (i = 1; i < 4; ++i) {
2280  if (nChars[i] > nChars[rot]) {
2281  rot = i;
2282  }
2283  }
2284 
2285  // rotate
// each case rewrites every char's bounding box and subtracts the
// primary rotation from its 'rot' field (modulo 4), so chars in the
// primary rotation end up with rot == 0
2286  switch (rot) {
2287  case 0:
2288  default:
2289  break;
2290  case 1:
2291  for (i = 0; i < charsA->getLength(); ++i) {
2292  ch = (TextChar *)charsA->get(i);
2293  xMin = ch->yMin;
2294  xMax = ch->yMax;
2295  yMin = pageWidth - ch->xMax;
2296  yMax = pageWidth - ch->xMin;
2297  ch->xMin = xMin;
2298  ch->xMax = xMax;
2299  ch->yMin = yMin;
2300  ch->yMax = yMax;
2301  ch->rot = (ch->rot + 3) & 3;
2302  }
// exchange pageWidth and pageHeight (middle statement not shown)
2303  t = pageWidth;
2305  pageHeight = t;
2306  break;
2307  case 2:
2308  for (i = 0; i < charsA->getLength(); ++i) {
2309  ch = (TextChar *)charsA->get(i);
2310  xMin = pageWidth - ch->xMax;
2311  xMax = pageWidth - ch->xMin;
2312  yMin = pageHeight - ch->yMax;
2313  yMax = pageHeight - ch->yMin;
2314  ch->xMin = xMin;
2315  ch->xMax = xMax;
2316  ch->yMin = yMin;
2317  ch->yMax = yMax;
2318  ch->rot = (ch->rot + 2) & 3;
2319  }
2320  break;
2321  case 3:
2322  for (i = 0; i < charsA->getLength(); ++i) {
2323  ch = (TextChar *)charsA->get(i);
2324  xMin = pageHeight - ch->yMax;
2325  xMax = pageHeight - ch->yMin;
2326  yMin = ch->xMin;
2327  yMax = ch->xMax;
2328  ch->xMin = xMin;
2329  ch->xMax = xMax;
2330  ch->yMin = yMin;
2331  ch->yMax = yMax;
2332  ch->rot = (ch->rot + 1) & 3;
2333  }
// exchange pageWidth and pageHeight (middle statement not shown)
2334  t = pageWidth;
2336  pageHeight = t;
2337  break;
2338  }
2339 
2340  return rot;
2341 }
2342 
2343 // Rotate all chars to zero rotation. This leaves the TextChar.rot
2344 // fields unchanged.
// Unlike rotateChars(), the transform is chosen per char from its own
// 'rot' field, and pageWidth/pageHeight are not modified.
// NOTE: the function header (listing line 2345) is missing from this
// listing; presumably it is a TextPage method taking the GList
// 'charsA' used below -- confirm against the original source.
2346  TextChar *ch;
2347  double xMin, yMin, xMax, yMax;
2348  int i;
2349 
2350  for (i = 0; i < charsA->getLength(); ++i) {
2351  ch = (TextChar *)charsA->get(i);
2352  switch (ch->rot) {
2353  case 0:
2354  default:
// already at zero rotation: bounding box left as is
2355  break;
2356  case 1:
2357  xMin = ch->yMin;
2358  xMax = ch->yMax;
2359  yMin = pageWidth - ch->xMax;
2360  yMax = pageWidth - ch->xMin;
2361  ch->xMin = xMin;
2362  ch->xMax = xMax;
2363  ch->yMin = yMin;
2364  ch->yMax = yMax;
2365  break;
2366  case 2:
2367  xMin = pageWidth - ch->xMax;
2368  xMax = pageWidth - ch->xMin;
2369  yMin = pageHeight - ch->yMax;
2370  yMax = pageHeight - ch->yMin;
2371  ch->xMin = xMin;
2372  ch->xMax = xMax;
2373  ch->yMin = yMin;
2374  ch->yMax = yMax;
2375  break;
2376  case 3:
2377  xMin = pageHeight - ch->yMax;
2378  xMax = pageHeight - ch->yMin;
2379  yMin = ch->xMin;
2380  yMax = ch->xMax;
2381  ch->xMin = xMin;
2382  ch->xMax = xMax;
2383  ch->yMin = yMin;
2384  ch->yMax = yMax;
2385  break;
2386  }
2387  }
2388 }
2389 
2390 // Rotate the TextUnderlines and TextLinks to match the transform
2391 // performed by rotateChars().
// 'rot' selects the transform (0 = none).  Every entry of
// 'underlines' and 'links' has its coordinates rewritten in place;
// for rotations 1 and 3 the 'horiz' flag of each underline is
// inverted as well.  pageWidth/pageHeight are read but not modified
// here.
// NOTE: the function header (listing line 2392) is missing from this
// listing; presumably it is a TextPage method taking the int 'rot'
// used below -- confirm against the original source.
2393  TextUnderline *underline;
2394  TextLink *link;
2395  double xMin, yMin, xMax, yMax;
2396  int i;
2397 
2398  switch (rot) {
2399  case 0:
2400  default:
2401  break;
2402  case 1:
2403  for (i = 0; i < underlines->getLength(); ++i) {
2404  underline = (TextUnderline *)underlines->get(i);
2405  xMin = underline->y0;
2406  xMax = underline->y1;
2407  yMin = pageWidth - underline->x1;
2408  yMax = pageWidth - underline->x0;
2409  underline->x0 = xMin;
2410  underline->x1 = xMax;
2411  underline->y0 = yMin;
2412  underline->y1 = yMax;
// x and y are exchanged, so the orientation flag flips
2413  underline->horiz = !underline->horiz;
2414  }
2415  for (i = 0; i < links->getLength(); ++i) {
2416  link = (TextLink *)links->get(i);
2417  xMin = link->yMin;
2418  xMax = link->yMax;
2419  yMin = pageWidth - link->xMax;
2420  yMax = pageWidth - link->xMin;
2421  link->xMin = xMin;
2422  link->xMax = xMax;
2423  link->yMin = yMin;
2424  link->yMax = yMax;
2425  }
2426  break;
2427  case 2:
// both axes are mirrored; 'horiz' is left untouched
2428  for (i = 0; i < underlines->getLength(); ++i) {
2429  underline = (TextUnderline *)underlines->get(i);
2430  xMin = pageWidth - underline->x1;
2431  xMax = pageWidth - underline->x0;
2432  yMin = pageHeight - underline->y1;
2433  yMax = pageHeight - underline->y0;
2434  underline->x0 = xMin;
2435  underline->x1 = xMax;
2436  underline->y0 = yMin;
2437  underline->y1 = yMax;
2438  }
2439  for (i = 0; i < links->getLength(); ++i) {
2440  link = (TextLink *)links->get(i);
2441  xMin = pageWidth - link->xMax;
2442  xMax = pageWidth - link->xMin;
2443  yMin = pageHeight - link->yMax;
2444  yMax = pageHeight - link->yMin;
2445  link->xMin = xMin;
2446  link->xMax = xMax;
2447  link->yMin = yMin;
2448  link->yMax = yMax;
2449  }
2450  break;
2451  case 3:
2452  for (i = 0; i < underlines->getLength(); ++i) {
2453  underline = (TextUnderline *)underlines->get(i);
2454  xMin = pageHeight - underline->y1;
2455  xMax = pageHeight - underline->y0;
2456  yMin = underline->x0;
2457  yMax = underline->x1;
2458  underline->x0 = xMin;
2459  underline->x1 = xMax;
2460  underline->y0 = yMin;
2461  underline->y1 = yMax;
// x and y are exchanged, so the orientation flag flips
2462  underline->horiz = !underline->horiz;
2463  }
2464  for (i = 0; i < links->getLength(); ++i) {
2465  link = (TextLink *)links->get(i);
2466  xMin = pageHeight - link->yMax;
2467  xMax = pageHeight - link->yMin;
2468  yMin = link->xMin;
2469  yMax = link->xMax;
2470  link->xMin = xMin;
2471  link->xMax = xMax;
2472  link->yMin = yMin;
2473  link->yMax = yMax;
2474  }
2475  break;
2476  }
2477 }
2478 
2479 // Undo the coordinate transform performed by rotateChars().
2480 void TextPage::unrotateChars(GList *charsA, int rot) {
2481  TextChar *ch;
2482  double xMin, yMin, xMax, yMax, t;
2483  int i;
2484 
2485  switch (rot) {
2486  case 0:
2487  default:
2488  // no transform
2489  break;
2490  case 1:
2491  t = pageWidth;
2493  pageHeight = t;
2494  for (i = 0; i < charsA->getLength(); ++i) {
2495  ch = (TextChar *)charsA->get(i);
2496  xMin = pageWidth - ch->yMax;
2497  xMax = pageWidth - ch->yMin;
2498  yMin = ch->xMin;
2499  yMax = ch->xMax;
2500  ch->xMin = xMin;
2501  ch->xMax = xMax;
2502  ch->yMin = yMin;
2503  ch->yMax = yMax;
2504  ch->rot = (ch->rot + 1) & 3;
2505  }
2506  break;
2507  case 2:
2508  for (i = 0; i < charsA->getLength(); ++i) {
2509  ch = (TextChar *)charsA->get(i);
2510  xMin = pageWidth - ch->xMax;
2511  xMax = pageWidth - ch->xMin;
2512  yMin = pageHeight - ch->yMax;
2513  yMax = pageHeight - ch->yMin;
2514  ch->xMin = xMin;
2515  ch->xMax = xMax;
2516  ch->yMin = yMin;
2517  ch->yMax = yMax;
2518  ch->rot = (ch->rot + 2) & 3;
2519  }
2520  break;
2521  case 3:
2522  t = pageWidth;
2524  pageHeight = t;
2525  for (i = 0; i < charsA->getLength(); ++i) {
2526  ch = (TextChar *)charsA->get(i);
2527  xMin = ch->yMin;
2528  xMax = ch->yMax;
2529  yMin = pageHeight - ch->xMax;
2530  yMax = pageHeight - ch->xMin;
2531  ch->xMin = xMin;
2532  ch->xMax = xMax;
2533  ch->yMin = yMin;
2534  ch->yMax = yMax;
2535  ch->rot = (ch->rot + 3) & 3;
2536  }
2537  break;
2538  }
2539 }
2540 
2541 // Undo the coordinate transform performed by rotateCharsToZero().
2543  TextChar *ch;
2544  double xMin, yMin, xMax, yMax;
2545  int i;
2546 
2547  for (i = 0; i < charsA->getLength(); ++i) {
2548  ch = (TextChar *)charsA->get(i);
2549  switch (ch->rot) {
2550  case 0:
2551  default:
2552  break;
2553  case 1:
2554  xMin = pageWidth - ch->yMax;
2555  xMax = pageWidth - ch->yMin;
2556  yMin = ch->xMin;
2557  yMax = ch->xMax;
2558  ch->xMin = xMin;
2559  ch->xMax = xMax;
2560  ch->yMin = yMin;
2561  ch->yMax = yMax;
2562  break;
2563  case 2:
2564  xMin = pageWidth - ch->xMax;
2565  xMax = pageWidth - ch->xMin;
2566  yMin = pageHeight - ch->yMax;
2567  yMax = pageHeight - ch->yMin;
2568  ch->xMin = xMin;
2569  ch->xMax = xMax;
2570  ch->yMin = yMin;
2571  ch->yMax = yMax;
2572  break;
2573  case 3:
2574  xMin = ch->yMin;
2575  xMax = ch->yMax;
2576  yMin = pageHeight - ch->xMax;
2577  yMax = pageHeight - ch->xMin;
2578  ch->xMin = xMin;
2579  ch->xMax = xMax;
2580  ch->yMin = yMin;
2581  ch->yMax = yMax;
2582  break;
2583  }
2584  }
2585 }
2586 
2587 // Undo the coordinate transform performed by rotateCharsToZero().
2589  TextColumn *col;
2590  TextParagraph *par;
2591  TextLine *line;
2592  TextWord *word;
2593  double xMin, yMin, xMax, yMax;
2594  int colIdx, parIdx, lineIdx, wordIdx, i;
2595 
2596  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2597  col = (TextColumn *)columns->get(colIdx);
2598  switch (col->getRotation()) {
2599  case 0:
2600  default:
2601  break;
2602  case 1:
2603  xMin = pageWidth - col->yMax;
2604  xMax = pageWidth - col->yMin;
2605  yMin = col->xMin;
2606  yMax = col->xMax;
2607  col->xMin = xMin;
2608  col->xMax = xMax;
2609  col->yMin = yMin;
2610  col->yMax = yMax;
2611  for (parIdx = 0;
2612  parIdx < col->paragraphs->getLength();
2613  ++parIdx) {
2614  par = (TextParagraph *)col->paragraphs->get(parIdx);
2615  xMin = pageWidth - par->yMax;
2616  xMax = pageWidth - par->yMin;
2617  yMin = par->xMin;
2618  yMax = par->xMax;
2619  par->xMin = xMin;
2620  par->xMax = xMax;
2621  par->yMin = yMin;
2622  par->yMax = yMax;
2623  for (lineIdx = 0;
2624  lineIdx < par->lines->getLength();
2625  ++lineIdx) {
2626  line = (TextLine *)par->lines->get(lineIdx);
2627  xMin = pageWidth - line->yMax;
2628  xMax = pageWidth - line->yMin;
2629  yMin = line->xMin;
2630  yMax = line->xMax;
2631  line->xMin = xMin;
2632  line->xMax = xMax;
2633  line->yMin = yMin;
2634  line->yMax = yMax;
2635  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2636  word = (TextWord *)line->words->get(wordIdx);
2637  xMin = pageWidth - word->yMax;
2638  xMax = pageWidth - word->yMin;
2639  yMin = word->xMin;
2640  yMax = word->xMax;
2641  word->xMin = xMin;
2642  word->xMax = xMax;
2643  word->yMin = yMin;
2644  word->yMax = yMax;
2645  }
2646  }
2647  }
2648  break;
2649  case 2:
2650  xMin = pageWidth - col->xMax;
2651  xMax = pageWidth - col->xMin;
2652  yMin = pageHeight - col->yMax;
2653  yMax = pageHeight - col->yMin;
2654  col->xMin = xMin;
2655  col->xMax = xMax;
2656  col->yMin = yMin;
2657  col->yMax = yMax;
2658  for (parIdx = 0;
2659  parIdx < col->paragraphs->getLength();
2660  ++parIdx) {
2661  par = (TextParagraph *)col->paragraphs->get(parIdx);
2662  xMin = pageWidth - par->xMax;
2663  xMax = pageWidth - par->xMin;
2664  yMin = pageHeight - par->yMax;
2665  yMax = pageHeight - par->yMin;
2666  par->xMin = xMin;
2667  par->xMax = xMax;
2668  par->yMin = yMin;
2669  par->yMax = yMax;
2670  for (lineIdx = 0;
2671  lineIdx < par->lines->getLength();
2672  ++lineIdx) {
2673  line = (TextLine *)par->lines->get(lineIdx);
2674  xMin = pageWidth - line->xMax;
2675  xMax = pageWidth - line->xMin;
2676  yMin = pageHeight - line->yMax;
2677  yMax = pageHeight - line->yMin;
2678  line->xMin = xMin;
2679  line->xMax = xMax;
2680  line->yMin = yMin;
2681  line->yMax = yMax;
2682  for (i = 0; i <= line->len; ++i) {
2683  line->edge[i] = pageWidth - line->edge[i];
2684  }
2685  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2686  word = (TextWord *)line->words->get(wordIdx);
2687  xMin = pageWidth - word->xMax;
2688  xMax = pageWidth - word->xMin;
2689  yMin = pageHeight - word->yMax;
2690  yMax = pageHeight - word->yMin;
2691  word->xMin = xMin;
2692  word->xMax = xMax;
2693  word->yMin = yMin;
2694  word->yMax = yMax;
2695  for (i = 0; i <= word->len; ++i) {
2696  word->edge[i] = pageWidth - word->edge[i];
2697  }
2698  }
2699  }
2700  }
2701  break;
2702  case 3:
2703  xMin = col->yMin;
2704  xMax = col->yMax;
2705  yMin = pageHeight - col->xMax;
2706  yMax = pageHeight - col->xMin;
2707  col->xMin = xMin;
2708  col->xMax = xMax;
2709  col->yMin = yMin;
2710  col->yMax = yMax;
2711  for (parIdx = 0;
2712  parIdx < col->paragraphs->getLength();
2713  ++parIdx) {
2714  par = (TextParagraph *)col->paragraphs->get(parIdx);
2715  xMin = par->yMin;
2716  xMax = par->yMax;
2717  yMin = pageHeight - par->xMax;
2718  yMax = pageHeight - par->xMin;
2719  par->xMin = xMin;
2720  par->xMax = xMax;
2721  par->yMin = yMin;
2722  par->yMax = yMax;
2723  for (lineIdx = 0;
2724  lineIdx < par->lines->getLength();
2725  ++lineIdx) {
2726  line = (TextLine *)par->lines->get(lineIdx);
2727  xMin = line->yMin;
2728  xMax = line->yMax;
2729  yMin = pageHeight - line->xMax;
2730  yMax = pageHeight - line->xMin;
2731  line->xMin = xMin;
2732  line->xMax = xMax;
2733  line->yMin = yMin;
2734  line->yMax = yMax;
2735  for (i = 0; i <= line->len; ++i) {
2736  line->edge[i] = pageHeight - line->edge[i];
2737  }
2738  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2739  word = (TextWord *)line->words->get(wordIdx);
2740  xMin = word->yMin;
2741  xMax = word->yMax;
2742  yMin = pageHeight - word->xMax;
2743  yMax = pageHeight - word->xMin;
2744  word->xMin = xMin;
2745  word->xMax = xMax;
2746  word->yMin = yMin;
2747  word->yMax = yMax;
2748  for (i = 0; i <= word->len; ++i) {
2749  word->edge[i] = pageHeight - word->edge[i];
2750  }
2751  }
2752  }
2753  }
2754  break;
2755  }
2756  }
2757 }
2758 
2759 // Undo the coordinate transform performed by rotateChars().
2761  TextColumn *col;
2762  TextParagraph *par;
2763  TextLine *line;
2764  TextWord *word;
2765  double xMin, yMin, xMax, yMax;
2766  int colIdx, parIdx, lineIdx, wordIdx, i;
2767 
2768  switch (rot) {
2769  case 0:
2770  default:
2771  // no transform
2772  break;
2773  case 1:
2774  // NB: this is called after unrotateChars(), which will have
2775  // swapped pageWidth and pageHeight already.
2776  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2777  col = (TextColumn *)columns->get(colIdx);
2778  xMin = pageWidth - col->yMax;
2779  xMax = pageWidth - col->yMin;
2780  yMin = col->xMin;
2781  yMax = col->xMax;
2782  col->xMin = xMin;
2783  col->xMax = xMax;
2784  col->yMin = yMin;
2785  col->yMax = yMax;
2786  for (parIdx = 0;
2787  parIdx < col->paragraphs->getLength();
2788  ++parIdx) {
2789  par = (TextParagraph *)col->paragraphs->get(parIdx);
2790  xMin = pageWidth - par->yMax;
2791  xMax = pageWidth - par->yMin;
2792  yMin = par->xMin;
2793  yMax = par->xMax;
2794  par->xMin = xMin;
2795  par->xMax = xMax;
2796  par->yMin = yMin;
2797  par->yMax = yMax;
2798  for (lineIdx = 0;
2799  lineIdx < par->lines->getLength();
2800  ++lineIdx) {
2801  line = (TextLine *)par->lines->get(lineIdx);
2802  xMin = pageWidth - line->yMax;
2803  xMax = pageWidth - line->yMin;
2804  yMin = line->xMin;
2805  yMax = line->xMax;
2806  line->xMin = xMin;
2807  line->xMax = xMax;
2808  line->yMin = yMin;
2809  line->yMax = yMax;
2810  line->rot = (line->rot + 1) & 3;
2811  if (!(line->rot & 1)) {
2812  for (i = 0; i <= line->len; ++i) {
2813  line->edge[i] = pageWidth - line->edge[i];
2814  }
2815  }
2816  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2817  word = (TextWord *)line->words->get(wordIdx);
2818  xMin = pageWidth - word->yMax;
2819  xMax = pageWidth - word->yMin;
2820  yMin = word->xMin;
2821  yMax = word->xMax;
2822  word->xMin = xMin;
2823  word->xMax = xMax;
2824  word->yMin = yMin;
2825  word->yMax = yMax;
2826  word->rot = (word->rot + 1) & 3;
2827  if (!(word->rot & 1)) {
2828  for (i = 0; i <= word->len; ++i) {
2829  word->edge[i] = pageWidth - word->edge[i];
2830  }
2831  }
2832  }
2833  }
2834  }
2835  }
2836  break;
2837  case 2:
2838  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2839  col = (TextColumn *)columns->get(colIdx);
2840  xMin = pageWidth - col->xMax;
2841  xMax = pageWidth - col->xMin;
2842  yMin = pageHeight - col->yMax;
2843  yMax = pageHeight - col->yMin;
2844  col->xMin = xMin;
2845  col->xMax = xMax;
2846  col->yMin = yMin;
2847  col->yMax = yMax;
2848  for (parIdx = 0;
2849  parIdx < col->paragraphs->getLength();
2850  ++parIdx) {
2851  par = (TextParagraph *)col->paragraphs->get(parIdx);
2852  xMin = pageWidth - par->xMax;
2853  xMax = pageWidth - par->xMin;
2854  yMin = pageHeight - par->yMax;
2855  yMax = pageHeight - par->yMin;
2856  par->xMin = xMin;
2857  par->xMax = xMax;
2858  par->yMin = yMin;
2859  par->yMax = yMax;
2860  for (lineIdx = 0;
2861  lineIdx < par->lines->getLength();
2862  ++lineIdx) {
2863  line = (TextLine *)par->lines->get(lineIdx);
2864  xMin = pageWidth - line->xMax;
2865  xMax = pageWidth - line->xMin;
2866  yMin = pageHeight - line->yMax;
2867  yMax = pageHeight - line->yMin;
2868  line->xMin = xMin;
2869  line->xMax = xMax;
2870  line->yMin = yMin;
2871  line->yMax = yMax;
2872  line->rot = (line->rot + 2) & 3;
2873  if (line->rot & 1) {
2874  for (i = 0; i <= line->len; ++i) {
2875  line->edge[i] = pageHeight - line->edge[i];
2876  }
2877  } else {
2878  for (i = 0; i <= line->len; ++i) {
2879  line->edge[i] = pageWidth - line->edge[i];
2880  }
2881  }
2882  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2883  word = (TextWord *)line->words->get(wordIdx);
2884  xMin = pageWidth - word->xMax;
2885  xMax = pageWidth - word->xMin;
2886  yMin = pageHeight - word->yMax;
2887  yMax = pageHeight - word->yMin;
2888  word->xMin = xMin;
2889  word->xMax = xMax;
2890  word->yMin = yMin;
2891  word->yMax = yMax;
2892  word->rot = (word->rot + 2) & 3;
2893  if (word->rot & 1) {
2894  for (i = 0; i <= word->len; ++i) {
2895  word->edge[i] = pageHeight - word->edge[i];
2896  }
2897  } else {
2898  for (i = 0; i <= word->len; ++i) {
2899  word->edge[i] = pageWidth - word->edge[i];
2900  }
2901  }
2902  }
2903  }
2904  }
2905  }
2906  break;
2907  case 3:
2908  // NB: this is called after unrotateChars(), which will have
2909  // swapped pageWidth and pageHeight already.
2910  for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2911  col = (TextColumn *)columns->get(colIdx);
2912  xMin = col->yMin;
2913  xMax = col->yMax;
2914  yMin = pageHeight - col->xMax;
2915  yMax = pageHeight - col->xMin;
2916  col->xMin = xMin;
2917  col->xMax = xMax;
2918  col->yMin = yMin;
2919  col->yMax = yMax;
2920  for (parIdx = 0;
2921  parIdx < col->paragraphs->getLength();
2922  ++parIdx) {
2923  par = (TextParagraph *)col->paragraphs->get(parIdx);
2924  xMin = par->yMin;
2925  xMax = par->yMax;
2926  yMin = pageHeight - par->xMax;
2927  yMax = pageHeight - par->xMin;
2928  par->xMin = xMin;
2929  par->xMax = xMax;
2930  par->yMin = yMin;
2931  par->yMax = yMax;
2932  for (lineIdx = 0;
2933  lineIdx < par->lines->getLength();
2934  ++lineIdx) {
2935  line = (TextLine *)par->lines->get(lineIdx);
2936  xMin = line->yMin;
2937  xMax = line->yMax;
2938  yMin = pageHeight - line->xMax;
2939  yMax = pageHeight - line->xMin;
2940  line->xMin = xMin;
2941  line->xMax = xMax;
2942  line->yMin = yMin;
2943  line->yMax = yMax;
2944  line->rot = (line->rot + 3) & 3;
2945  if (line->rot & 1) {
2946  for (i = 0; i <= line->len; ++i) {
2947  line->edge[i] = pageHeight - line->edge[i];
2948  }
2949  }
2950  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2951  word = (TextWord *)line->words->get(wordIdx);
2952  xMin = word->yMin;
2953  xMax = word->yMax;
2954  yMin = pageHeight - word->xMax;
2955  yMax = pageHeight - word->xMin;
2956  word->xMin = xMin;
2957  word->xMax = xMax;
2958  word->yMin = yMin;
2959  word->yMax = yMax;
2960  word->rot = (word->rot + 3) & 3;
2961  if (word->rot & 1) {
2962  for (i = 0; i <= word->len; ++i) {
2963  word->edge[i] = pageHeight - word->edge[i];
2964  }
2965  }
2966  }
2967  }
2968  }
2969  }
2970  break;
2971  }
2972 }
2973 
2975  TextWord *word;
2976  double xMin, yMin, xMax, yMax;
2977  int i, j;
2978 
2979  switch (rot) {
2980  case 0:
2981  default:
2982  // no transform
2983  break;
2984  case 1:
2985  for (i = 0; i < words->getLength(); ++i) {
2986  word = (TextWord *)words->get(i);
2987  xMin = pageWidth - word->yMax;
2988  xMax = pageWidth - word->yMin;
2989  yMin = word->xMin;
2990  yMax = word->xMax;
2991  word->xMin = xMin;
2992  word->xMax = xMax;
2993  word->yMin = yMin;
2994  word->yMax = yMax;
2995  word->rot = (word->rot + 1) & 3;
2996  if (!(word->rot & 1)) {
2997  for (j = 0; j <= word->len; ++j) {
2998  word->edge[j] = pageWidth - word->edge[j];
2999  }
3000  }
3001  }
3002  break;
3003  case 2:
3004  for (i = 0; i < words->getLength(); ++i) {
3005  word = (TextWord *)words->get(i);
3006  xMin = pageWidth - word->xMax;
3007  xMax = pageWidth - word->xMin;
3008  yMin = pageHeight - word->yMax;
3009  yMax = pageHeight - word->yMin;
3010  word->xMin = xMin;
3011  word->xMax = xMax;
3012  word->yMin = yMin;
3013  word->yMax = yMax;
3014  word->rot = (word->rot + 2) & 3;
3015  if (word->rot & 1) {
3016  for (j = 0; j <= word->len; ++j) {
3017  word->edge[j] = pageHeight - word->edge[j];
3018  }
3019  } else {
3020  for (j = 0; j <= word->len; ++j) {
3021  word->edge[j] = pageWidth - word->edge[j];
3022  }
3023  }
3024  }
3025  break;
3026  case 3:
3027  for (i = 0; i < words->getLength(); ++i) {
3028  word = (TextWord *)words->get(i);
3029  xMin = word->yMin;
3030  xMax = word->yMax;
3031  yMin = pageHeight - word->xMax;
3032  yMax = pageHeight - word->xMin;
3033  word->xMin = xMin;
3034  word->xMax = xMax;
3035  word->yMin = yMin;
3036  word->yMax = yMax;
3037  word->rot = (word->rot + 3) & 3;
3038  if (word->rot & 1) {
3039  for (j = 0; j <= word->len; ++j) {
3040  word->edge[j] = pageHeight - word->edge[j];
3041  }
3042  }
3043  }
3044  break;
3045  }
3046 }
3047 
3048 // Determine the primary text direction (LR vs RL). Returns true for
3049 // LR, false for RL.
3051  TextChar *ch;
3052  int i, lrCount;
3053 
3054  lrCount = 0;
3055  for (i = 0; i < charsA->getLength(); ++i) {
3056  ch = (TextChar *)charsA->get(i);
3057  if (unicodeTypeL(ch->c)) {
3058  ++lrCount;
3059  } else if (unicodeTypeR(ch->c)) {
3060  --lrCount;
3061  }
3062  }
3063  return lrCount >= 0;
3064 }
3065 
3066 // Remove duplicate characters. The list of chars has been sorted --
3067 // by x for rot=0,2; by y for rot=1,3.
3068 void TextPage::removeDuplicates(GList *charsA, int rot) {
3069  TextChar *ch, *ch2;
3070  double xDelta, yDelta;
3071  int i, j;
3072 
3073  if (rot & 1) {
3074  i = 0;
3075  while (i < charsA->getLength()) {
3076  ch = (TextChar *)charsA->get(i);
3077  xDelta = dupMaxSecDelta * ch->fontSize;
3078  yDelta = dupMaxPriDelta * ch->fontSize;
3079  j = i + 1;
3080  while (j < charsA->getLength()) {
3081  ch2 = (TextChar *)charsA->get(j);
3082  if (ch2->yMin - ch->yMin >= yDelta) {
3083  break;
3084  }
3085  if (ch2->c == ch->c &&
3086  fabs(ch2->xMin - ch->xMin) < xDelta &&
3087  fabs(ch2->xMax - ch->xMax) < xDelta &&
3088  fabs(ch2->yMax - ch->yMax) < yDelta) {
3089  if (ch->invisible && !ch2->invisible) {
3090  charsA->del(i);
3091  --i;
3092  break;
3093  }
3094  if (ch2->spaceAfter) {
3095  ch->spaceAfter = (char)gTrue;
3096  }
3097  charsA->del(j);
3098  } else {
3099  ++j;
3100  }
3101  }
3102  ++i;
3103  }
3104  } else {
3105  i = 0;
3106  while (i < charsA->getLength()) {
3107  ch = (TextChar *)charsA->get(i);
3108  xDelta = dupMaxPriDelta * ch->fontSize;
3109  yDelta = dupMaxSecDelta * ch->fontSize;
3110  j = i + 1;
3111  while (j < charsA->getLength()) {
3112  ch2 = (TextChar *)charsA->get(j);
3113  if (ch2->xMin - ch->xMin >= xDelta) {
3114  break;
3115  }
3116  if (ch2->c == ch->c &&
3117  fabs(ch2->xMax - ch->xMax) < xDelta &&
3118  fabs(ch2->yMin - ch->yMin) < yDelta &&
3119  fabs(ch2->yMax - ch->yMax) < yDelta) {
3120  if (ch->invisible && !ch2->invisible) {
3121  charsA->del(i);
3122  --i;
3123  break;
3124  }
3125  if (ch2->spaceAfter) {
3126  ch->spaceAfter = (char)gTrue;
3127  }
3128  charsA->del(j);
3129  } else {
3130  ++j;
3131  }
3132  }
3133  ++i;
3134  }
3135  }
3136 }
3137 
3139  TextCharNode(TextChar *chA, TextCharNode *nextA): ch(chA), next(nextA) {}
3142 };
3143 
3144 // Separate out any overlapping text. If handling is
3145 // textOutAppendOverlaps, return a list of the overlapping chars; else
3146 // delete them and return NULL.
3148  // bin-sort the TextChars
3150  for (int y = 0; y < overlapGridHeight; ++y) {
3151  for (int x = 0; x < overlapGridWidth; ++x) {
3152  grid[y][x] = NULL;
3153  }
3154  }
3155  for (int i = 0; i < charsA->getLength(); ++i) {
3156  TextChar *ch = (TextChar *)charsA->get(i);
3157  int y0 = (int)floor(overlapGridHeight * ch->yMin / pageHeight);
3158  int y1 = (int)ceil(overlapGridHeight * ch->yMax / pageHeight);
3159  int x0 = (int)floor(overlapGridWidth * ch->xMin / pageWidth);
3160  int x1 = (int)ceil(overlapGridWidth * ch->yMin / pageWidth);
3161  if (y0 < 0) {
3162  y0 = 0;
3163  }
3164  if (y1 >= overlapGridHeight) {
3165  y1 = overlapGridHeight - 1;
3166  }
3167  if (x0 < 0) {
3168  x0 = 0;
3169  }
3170  if (x1 >= overlapGridWidth) {
3171  x1 = overlapGridWidth - 1;
3172  }
3173  for (int y = y0; y <= y1; ++y) {
3174  for (int x = x0; x <= x1; ++x) {
3175  grid[y][x] = new TextCharNode(ch, grid[y][x]);
3176  }
3177  }
3178  }
3179 
3180  // look for overlaps in each cell
3181  GBool foundOverlaps = gFalse;
3182  for (int y = 0; y < overlapGridHeight; ++y) {
3183  for (int x = 0; x < overlapGridWidth; ++x) {
3184  for (TextCharNode *p0 = grid[y][x]; p0; p0 = p0->next) {
3185  for (TextCharNode *p1 = p0->next; p1; p1 = p1->next) {
3186  if (p0->ch->colorR != p1->ch->colorR ||
3187  p0->ch->colorG != p1->ch->colorG ||
3188  p0->ch->colorB != p1->ch->colorB) {
3189  double ovx = (dmin(p0->ch->xMax, p1->ch->xMax)
3190  - dmax(p0->ch->xMin, p1->ch->xMin))
3191  / dmin(p0->ch->xMax - p0->ch->xMin,
3192  p1->ch->xMax - p1->ch->xMin);
3193  double ovy = (dmin(p0->ch->yMax, p1->ch->yMax)
3194  - dmax(p0->ch->yMin, p1->ch->yMin))
3195  / dmin(p0->ch->yMax - p0->ch->yMin,
3196  p1->ch->yMax - p1->ch->yMin);
3197  if (ovx > minCharOverlap && ovy > minCharOverlap) {
3198  // assume the lighter colored text is extraneous
3199  if (p0->ch->colorR + p0->ch->colorG + p0->ch->colorB
3200  < p1->ch->colorR + p1->ch->colorG + p1->ch->colorB) {
3201  p1->ch->overlap = gTrue;
3202  } else {
3203  p0->ch->overlap = gTrue;
3204  }
3205  foundOverlaps = gTrue;
3206  }
3207  }
3208  }
3209  }
3210  }
3211  }
3212 
3213  // find overlapped strings
3214  GList *overlapChars = NULL;
3216  overlapChars = new GList();
3217  }
3218  if (foundOverlaps) {
3219  charsA->sort(&TextChar::cmpCharPos);
3220  int i = 0;
3221  while (i < charsA->getLength()) {
3222  TextChar *ch0 = (TextChar *)charsA->get(i);
3223  if (ch0->overlap) {
3224  int j0, j1;
3225  for (j0 = i - 1; j0 >= 0; --j0) {
3226  TextChar *ch1 = (TextChar *)charsA->get(j0);
3227  if (ch1->colorR != ch0->colorR ||
3228  ch1->colorG != ch0->colorG ||
3229  ch1->colorB != ch0->colorB ||
3230  ch1->rot != ch0->rot) {
3231  break;
3232  }
3233  }
3234  ++j0;
3235  for (j1 = i + 1; j1 < charsA->getLength(); ++j1) {
3236  TextChar *ch1 = (TextChar *)charsA->get(j1);
3237  if (ch1->colorR != ch0->colorR ||
3238  ch1->colorG != ch0->colorG ||
3239  ch1->colorB != ch0->colorB ||
3240  ch1->rot != ch0->rot) {
3241  break;
3242  }
3243  }
3244  --j1;
3245  for (int j = j0; j <= j1; ++j) {
3246  if (overlapChars) {
3247  overlapChars->append(charsA->get(j0));
3248  } else {
3249  delete (TextChar *)charsA->get(j0);
3250  }
3251  charsA->del(j0);
3252  }
3253  i = j0;
3254  } else {
3255  ++i;
3256  }
3257  }
3258  }
3259 
3260  // free memory
3261  for (int y = 0; y < overlapGridHeight; ++y) {
3262  for (int x = 0; x < overlapGridWidth; ++x) {
3263  TextCharNode *p0 = grid[y][x];
3264  while (p0) {
3265  TextCharNode *p1 = p0->next;
3266  delete p0;
3267  p0 = p1;
3268  }
3269  }
3270  }
3271 
3272  return overlapChars;
3273 }
3274 
3275 // Construct a TextColumn from the list of separated overlapping
3276 // chars.
3278  GList *pars = new GList();
3279  GList *lines = new GList();
3280  GList *words = new GList();
3281  int wordStart = 0;
3282  double lineXMin = 0, lineYMin = 0, lineXMax = 0, lineYMax = 0;
3283  double colXMin = 0, colYMin = 0, colXMax = 0, colYMax = 0;
3284  for (int i = 0; i < overlappingChars->getLength(); ++i) {
3285  TextChar *ch = (TextChar *)overlappingChars->get(i);
3286  TextChar *chNext = NULL;
3287  if (i + 1 < overlappingChars->getLength()) {
3288  chNext = (TextChar *)overlappingChars->get(i + 1);
3289  }
3290  double sp = 0;
3291  double dy = 0;
3292  if (chNext) {
3293  switch (ch->rot) {
3294  case 0:
3295  default:
3296  sp = chNext->xMin - ch->xMax;
3297  dy = chNext->yMin - ch->yMin;
3298  break;
3299  case 1:
3300  sp = chNext->yMin - ch->yMax;
3301  dy = chNext->xMax - ch->xMax;
3302  break;
3303  case 2:
3304  sp = ch->xMin - chNext->xMax;
3305  dy = ch->yMax - chNext->yMax;
3306  break;
3307  case 3:
3308  sp = ch->yMin - chNext->yMax;
3309  dy = ch->xMin - chNext->xMin;
3310  break;
3311  }
3312  }
3313  // the +1 here allows for a space character after ch
3314  GBool parBreak = !chNext ||
3315  chNext->rot != ch->rot ||
3316  chNext->charPos > ch->charPos + ch->charLen + 1;
3317  GBool lineBreak = parBreak ||
3318  sp < -rawModeCharOverlap * ch->fontSize ||
3319  fabs(dy) > rawModeLineDelta * ch->fontSize;
3320  GBool wordBreak = lineBreak ||
3321  ch->spaceAfter ||
3322  sp > rawModeWordSpacing * ch->fontSize;
3323  if (!wordBreak) {
3324  continue;
3325  }
3326  TextWord *word = new TextWord(overlappingChars, wordStart,
3327  i - wordStart + 1, ch->rot, ch->rotated,
3328  getCharDirection(ch), !lineBreak);
3329  words->append(word);
3330  if (words->getLength() == 0) {
3331  lineXMin = word->xMin;
3332  lineYMin = word->yMin;
3333  lineXMax = word->xMax;
3334  lineYMax = word->yMax;
3335  } else {
3336  lineXMin = dmin(lineXMin, word->xMin);
3337  lineYMin = dmin(lineYMin, word->yMin);
3338  lineXMax = dmax(lineXMax, word->xMax);
3339  lineYMax = dmax(lineYMax, word->yMax);
3340  }
3341  wordStart = i + 1;
3342  if (!lineBreak) {
3343  continue;
3344  }
3345  lines->append(new TextLine(words, lineXMin, lineYMin, lineXMax, lineYMax,
3346  ((TextWord *)words->get(0))->fontSize));
3347  words = new GList();
3348  if (!parBreak) {
3349  continue;
3350  }
3351  TextParagraph *par = new TextParagraph(lines, gFalse);
3352  pars->append(par);
3353  if (pars->getLength() == 0) {
3354  colXMin = par->xMin;
3355  colYMin = par->yMin;
3356  colXMax = par->xMax;
3357  colYMax = par->yMax;
3358  } else {
3359  colXMin = dmin(colXMin, par->xMin);
3360  colYMin = dmin(colYMin, par->yMin);
3361  colXMax = dmax(colXMax, par->xMax);
3362  colYMax = dmax(colYMax, par->yMax);
3363  }
3364  lines = new GList();
3365  }
3366  delete words;
3367  delete lines;
3368  return new TextColumn(pars, colXMin, colYMin, colXMax, colYMax);
3369 }
3370 
3371 // Split the characters into trees of TextBlocks, one tree for each
3372 // rotation. Merge into a single tree (with the primary rotation).
3374  TextBlock *tree[4];
3375  TextBlock *blk;
3376  GList *chars2, *clippedChars;
3377  TextChar *ch;
3378  int rot, i;
3379 
3380  // split: build a tree of TextBlocks for each rotation
3381  clippedChars = new GList();
3382  for (rot = 0; rot < 4; ++rot) {
3383  chars2 = new GList();
3384  for (i = 0; i < charsA->getLength(); ++i) {
3385  ch = (TextChar *)charsA->get(i);
3386  if (ch->rot == rot &&
3387  !(control.discardInvisibleText && ch->invisible) &&
3388  !(control.discardClippedText && ch->clipped)) {
3389  chars2->append(ch);
3390  }
3391  }
3392  tree[rot] = NULL;
3393  if (chars2->getLength() > 0) {
3394  chars2->sort((rot & 1) ? &TextChar::cmpY : &TextChar::cmpX);
3395  removeDuplicates(chars2, rot);
3396  if (control.clipText) {
3397  i = 0;
3398  while (i < chars2->getLength()) {
3399  ch = (TextChar *)chars2->get(i);
3400  if (ch->clipped) {
3401  ch = (TextChar *)chars2->del(i);
3402  clippedChars->append(ch);
3403  } else {
3404  ++i;
3405  }
3406  }
3407  }
3408  if (chars2->getLength() > 0) {
3409  tree[rot] = split(chars2, rot);
3410  }
3411  }
3412  delete chars2;
3413  }
3414 
3415  // if the page contains no (unclipped) text, just leave an empty
3416  // column list
3417  if (!tree[0]) {
3418  // normally tree[0] is empty only if there is no text at all, but
3419  // if the caller didn't do rotation, the rotated trees may be
3420  // non-empty, so we need to free them
3421  for (rot = 1; rot < 4; ++rot) {
3422  if (tree[rot]) {
3423  delete tree[rot];
3424  }
3425  }
3426  delete clippedChars;
3427  return NULL;
3428  }
3429 
3430  // if the main tree is not a multicolumn node, insert one so that
3431  // rotated text has somewhere to go
3432  if (tree[0]->tag != blkTagMulticolumn) {
3433  blk = new TextBlock(blkHorizSplit, 0);
3434  blk->addChild(tree[0]);
3435  blk->tag = blkTagMulticolumn;
3436  tree[0] = blk;
3437  }
3438 
3439  // merge non-primary-rotation text into the primary-rotation tree
3440  for (rot = 1; rot < 4; ++rot) {
3441  if (tree[rot]) {
3442  insertIntoTree(tree[rot], tree[0]);
3443  tree[rot] = NULL;
3444  }
3445  }
3446 
3447  if (clippedChars->getLength()) {
3448  insertClippedChars(clippedChars, tree[0]);
3449  }
3450  delete clippedChars;
3451 
3452 #if 0 //~debug
3453  dumpTree(tree[0]);
3454 #endif
3455 
3456  return tree[0];
3457 }
3458 
3459 // Generate a tree of TextBlocks, marked as columns, lines, and words.
3460 TextBlock *TextPage::split(GList *charsA, int rot) {
3461  TextBlock *blk;
3462  GList *chars2, *chars3;
3463  TextGaps *horizGaps, *vertGaps;
3464  TextChar *ch;
3465  double xMin, yMin, xMax, yMax, avgFontSize;
3466  double horizGapSize, vertGapSize, minHorizChunkWidth, minVertChunkWidth;
3467  double gap, nLines, vertGapThreshold, minChunk;
3468  double largeCharSize;
3469  double x0, x1, y0, y1;
3470  int nHorizGaps, nVertGaps, nLargeChars;
3471  int i;
3472  GBool doHorizSplit, doVertSplit, smallSplit;
3473 
3474  //----- find all horizontal and vertical gaps
3475 
3476  horizGaps = new TextGaps();
3477  vertGaps = new TextGaps();
3478  findGaps(charsA, rot, &xMin, &yMin, &xMax, &yMax, &avgFontSize,
3479  horizGaps, vertGaps);
3480 
3481  //----- find the largest horizontal and vertical gaps
3482 
3483  horizGapSize = 0;
3484  for (i = 0; i < horizGaps->getLength(); ++i) {
3485  gap = horizGaps->getW(i);
3486  if (gap > horizGapSize) {
3487  horizGapSize = gap;
3488  }
3489  }
3490  vertGapSize = 0;
3491  for (i = 0; i < vertGaps->getLength(); ++i) {
3492  gap = vertGaps->getW(i);
3493  if (gap > vertGapSize) {
3494  vertGapSize = gap;
3495  }
3496  }
3497 
3498  //----- count horiz/vert gaps equivalent to largest gaps
3499 
3500  minHorizChunkWidth = yMax - yMin;
3501  nHorizGaps = 0;
3502  if (horizGaps->getLength() > 0) {
3503  y0 = yMin;
3504  for (i = 0; i < horizGaps->getLength(); ++i) {
3505  gap = horizGaps->getW(i);
3506  if (gap > horizGapSize - splitGapSlack * avgFontSize) {
3507  ++nHorizGaps;
3508  y1 = horizGaps->getX(i) - 0.5 * gap;
3509  if (y1 - y0 < minHorizChunkWidth) {
3510  minHorizChunkWidth = y1 - y0;
3511  }
3512  y0 = y1 + gap;
3513  }
3514  }
3515  y1 = yMax;
3516  if (y1 - y0 < minHorizChunkWidth) {
3517  minHorizChunkWidth = y1 - y0;
3518  }
3519  }
3520  minVertChunkWidth = xMax - xMin;
3521  nVertGaps = 0;
3522  if (vertGaps->getLength() > 0) {
3523  x0 = xMin;
3524  for (i = 0; i < vertGaps->getLength(); ++i) {
3525  gap = vertGaps->getW(i);
3526  if (gap > vertGapSize - splitGapSlack * avgFontSize) {
3527  ++nVertGaps;
3528  x1 = vertGaps->getX(i) - 0.5 * gap;
3529  if (x1 - x0 < minVertChunkWidth) {
3530  minVertChunkWidth = x1 - x0;
3531  }
3532  x0 = x1 + gap;
3533  }
3534  }
3535  x1 = xMax;
3536  if (x1 - x0 < minVertChunkWidth) {
3537  minVertChunkWidth = x1 - x0;
3538  }
3539  }
3540 
3541  //----- compute splitting parameters
3542 
3543  // approximation of number of lines in block
3544  if (fabs(avgFontSize) < 0.001) {
3545  nLines = 1;
3546  } else if (rot & 1) {
3547  nLines = (xMax - xMin) / avgFontSize;
3548  } else {
3549  nLines = (yMax - yMin) / avgFontSize;
3550  }
3551 
3552  // compute the minimum allowed vertical gap size
3553  // (this is a horizontal gap threshold for rot=1,3)
3554  if (control.mode == textOutTableLayout) {
3555  vertGapThreshold = vertGapThresholdTableMax
3556  + vertGapThresholdTableSlope * nLines;
3557  if (vertGapThreshold < vertGapThresholdTableMin) {
3558  vertGapThreshold = vertGapThresholdTableMin;
3559  }
3560  } else if (control.mode == textOutSimpleLayout) {
3561  vertGapThreshold = simpleLayoutGapThreshold;
3562  } else {
3563  vertGapThreshold = vertGapThresholdMax + vertGapThresholdSlope * nLines;
3564  if (vertGapThreshold < vertGapThresholdMin) {
3565  vertGapThreshold = vertGapThresholdMin;
3566  }
3567  }
3568  vertGapThreshold = vertGapThreshold * avgFontSize;
3569 
3570  // compute the minimum allowed chunk width
3571  if (control.mode == textOutTableLayout) {
3572  minChunk = 0;
3573  } else {
3574  minChunk = vertSplitChunkThreshold * avgFontSize;
3575  }
3576 
3577  // look for large chars
3578  // -- this kludge (multiply by 256, convert to int, divide by 256.0)
3579  // prevents floating point stability issues on x86 with gcc, where
3580  // largeCharSize could otherwise have slightly different values
3581  // here and where it's used below to do the large char partition
3582  // (because it gets truncated from 80 to 64 bits when spilled)
3583  nLargeChars = 0;
3584  largeCharSize = 0;
3586  largeCharSize = (int)(largeCharThreshold * avgFontSize * 256) / 256.0;
3587  for (i = 0; i < charsA->getLength(); ++i) {
3588  ch = (TextChar *)charsA->get(i);
3589  if (ch->fontSize > largeCharSize) {
3590  ++nLargeChars;
3591  }
3592  }
3593  }
3594 
3595  // figure out which type of split to do
3596  doHorizSplit = doVertSplit = gFalse;
3597  smallSplit = gFalse;
3598  if (rot & 1) {
3599  if (control.mode == textOutSimpleLayout) {
3600  if (nVertGaps > 0) {
3601  doVertSplit = gTrue;
3602  } else if (nHorizGaps > 0) {
3603  doHorizSplit = gTrue;
3604  smallSplit = horizGapSize <= vertGapThreshold;
3605  }
3606  } else if (nHorizGaps > 0 &&
3607  (horizGapSize > vertGapSize ||
3609  horizGapSize > vertGapThreshold &&
3610  (minHorizChunkWidth > minChunk ||
3611  nVertGaps == 0)) {
3612  doHorizSplit = gTrue;
3613  } else if (nVertGaps > 0) {
3614  doVertSplit = gTrue;
3615  } else if (nLargeChars == 0 && nHorizGaps > 0) {
3616  doHorizSplit = gTrue;
3617  smallSplit = gTrue;
3618  }
3619  } else {
3620  if (control.mode == textOutSimpleLayout) {
3621  if (nHorizGaps > 0) {
3622  doHorizSplit = gTrue;
3623  } else if (nVertGaps > 0) {
3624  doVertSplit = gTrue;
3625  smallSplit = vertGapSize <= vertGapThreshold;
3626  }
3627  } else if (nVertGaps > 0 &&
3628  (vertGapSize > horizGapSize ||
3630  vertGapSize > vertGapThreshold &&
3631  (minVertChunkWidth > minChunk ||
3632  nHorizGaps == 0)) {
3633  doVertSplit = gTrue;
3634  } else if (nHorizGaps > 0) {
3635  doHorizSplit = gTrue;
3636  } else if (nLargeChars == 0 && nVertGaps > 0) {
3637  doVertSplit = gTrue;
3638  smallSplit = gTrue;
3639  }
3640  }
3641 
3642  //----- split the block
3643 
3644  //~ this could use "other content" (vector graphics, rotated text) --
3645  //~ presence of other content in a gap means we should definitely split
3646 
3647  // split vertically
3648  if (doVertSplit) {
3649 #if 0 //~debug
3650  printf("vert split xMin=%g yMin=%g xMax=%g yMax=%g small=%d\n",
3651  xMin, pageHeight - yMax, xMax, pageHeight - yMin, smallSplit);
3652  for (i = 0; i < vertGaps->getLength(); ++i) {
3653  if (vertGaps->getW(i) > vertGapSize - splitGapSlack * avgFontSize) {
3654  printf(" x=%g\n", vertGaps->getX(i));
3655  }
3656  }
3657 #endif
3658  blk = new TextBlock(blkVertSplit, rot);
3659  blk->smallSplit = smallSplit;
3660  x0 = xMin - 1;
3661  for (i = 0; i < vertGaps->getLength(); ++i) {
3662  if (vertGaps->getW(i) > vertGapSize - splitGapSlack * avgFontSize) {
3663  x1 = vertGaps->getX(i);
3664  chars2 = getChars(charsA, x0, yMin - 1, x1, yMax + 1);
3665  blk->addChild(split(chars2, rot));
3666  delete chars2;
3667  x0 = x1;
3668  }
3669  }
3670  chars2 = getChars(charsA, x0, yMin - 1, xMax + 1, yMax + 1);
3671  blk->addChild(split(chars2, rot));
3672  delete chars2;
3673 
3674  // split horizontally
3675  } else if (doHorizSplit) {
3676 #if 0 //~debug
3677  printf("horiz split xMin=%g yMin=%g xMax=%g yMax=%g small=%d\n",
3678  xMin, pageHeight - yMax, xMax, pageHeight - yMin, smallSplit);
3679  for (i = 0; i < horizGaps->getLength(); ++i) {
3680  if (horizGaps->getW(i) > horizGapSize - splitGapSlack * avgFontSize) {
3681  printf(" y=%g\n", pageHeight - horizGaps->getX(i));
3682  }
3683  }
3684 #endif
3685  blk = new TextBlock(blkHorizSplit, rot);
3686  blk->smallSplit = smallSplit;
3687  y0 = yMin - 1;
3688  for (i = 0; i < horizGaps->getLength(); ++i) {
3689  if (horizGaps->getW(i) > horizGapSize - splitGapSlack * avgFontSize) {
3690  y1 = horizGaps->getX(i);
3691  chars2 = getChars(charsA, xMin - 1, y0, xMax + 1, y1);
3692  blk->addChild(split(chars2, rot));
3693  delete chars2;
3694  y0 = y1;
3695  }
3696  }
3697  chars2 = getChars(charsA, xMin - 1, y0, xMax + 1, yMax + 1);
3698  blk->addChild(split(chars2, rot));
3699  delete chars2;
3700 
3701  // split into larger and smaller chars
3702  } else if (nLargeChars > 0) {
3703 #if 0 //~debug
3704  printf("large char split xMin=%g yMin=%g xMax=%g yMax=%g\n",
3705  xMin, pageHeight - yMax, xMax, pageHeight - yMin);
3706 #endif
3707  chars2 = new GList();
3708  chars3 = new GList();
3709  for (i = 0; i < charsA->getLength(); ++i) {
3710  ch = (TextChar *)charsA->get(i);
3711  if (ch->fontSize > largeCharSize) {
3712  chars2->append(ch);
3713  } else {
3714  chars3->append(ch);
3715  }
3716  }
3717  blk = split(chars3, rot);
3718  insertLargeChars(chars2, blk);
3719  delete chars2;
3720  delete chars3;
3721 
3722  // create a leaf node
3723  } else {
3724 #if 0 //~debug
3725  printf("leaf xMin=%g yMin=%g xMax=%g yMax=%g\n",
3726  xMin, pageHeight - yMax, xMax, pageHeight - yMin);
3727 #endif
3728  blk = new TextBlock(blkLeaf, rot);
3729  for (i = 0; i < charsA->getLength(); ++i) {
3730  blk->addChild((TextChar *)charsA->get(i), gTrue);
3731  }
3732  }
3733 
3734  delete horizGaps;
3735  delete vertGaps;
3736 
3737  tagBlock(blk);
3738 
3739  return blk;
3740 }
3741 
3742 // Return the subset of chars inside a rectangle.
3743 GList *TextPage::getChars(GList *charsA, double xMin, double yMin,
3744  double xMax, double yMax) {
3745  GList *ret;
3746  TextChar *ch;
3747  double x, y;
3748  int i;
3749 
3750  ret = new GList();
3751  for (i = 0; i < charsA->getLength(); ++i) {
3752  ch = (TextChar *)charsA->get(i);
3753  // because of {ascent,descent}AdjustFactor, the y coords (or x
3754  // coords for rot 1,3) for the gaps will be a little bit tight --
3755  // so we use the center of the character here
3756  x = 0.5 * (ch->xMin + ch->xMax);
3757  y = 0.5 * (ch->yMin + ch->yMax);
3758  if (x > xMin && x < xMax && y > yMin && y < yMax) {
3759  ret->append(ch);
3760  }
3761  }
3762  return ret;
3763 }
3764 
3765 void TextPage::findGaps(GList *charsA, int rot,
3766  double *xMinOut, double *yMinOut,
3767  double *xMaxOut, double *yMaxOut,
3768  double *avgFontSizeOut,
3769  TextGaps *horizGaps, TextGaps *vertGaps) {
3770  TextChar *ch;
3771  char *horizProfile, *vertProfile;
3772  double xMin, yMin, xMax, yMax, w;
3773  double minFontSize, avgFontSize, splitPrecision, invSplitPrecision;
3774  double ascentAdjust, descentAdjust;
3775  int xMinI, yMinI, xMaxI, yMaxI, xMinI2, yMinI2, xMaxI2, yMaxI2;
3776  int start, x, y, i;
3777 
3778  //----- compute bbox, min font size, average font size, and split precision
3779 
3780  xMin = yMin = xMax = yMax = 0; // make gcc happy
3781  minFontSize = avgFontSize = 0;
3782  for (i = 0; i < charsA->getLength(); ++i) {
3783  ch = (TextChar *)charsA->get(i);
3784  if (i == 0 || ch->xMin < xMin) {
3785  xMin = ch->xMin;
3786  }
3787  if (i == 0 || ch->yMin < yMin) {
3788  yMin = ch->yMin;
3789  }
3790  if (i == 0 || ch->xMax > xMax) {
3791  xMax = ch->xMax;
3792  }
3793  if (i == 0 || ch->yMax > yMax) {
3794  yMax = ch->yMax;
3795  }
3796  avgFontSize += ch->fontSize;
3797  if (i == 0 || ch->fontSize < minFontSize) {
3798  minFontSize = ch->fontSize;
3799  }
3800  }
3801  avgFontSize /= charsA->getLength();
3802  splitPrecision = splitPrecisionMul * minFontSize;
3803  if (splitPrecision < minSplitPrecision) {
3804  splitPrecision = minSplitPrecision;
3805  }
3806  invSplitPrecision = 1 / splitPrecision;
3807  *xMinOut = xMin;
3808  *yMinOut = yMin;
3809  *xMaxOut = xMax;
3810  *yMaxOut = yMax;
3811  *avgFontSizeOut = avgFontSize;
3812 
3813  //----- compute the horizontal and vertical profiles
3814 
3815  if (xMin * invSplitPrecision < 0.5 * INT_MIN ||
3816  xMax * invSplitPrecision > 0.5 * INT_MAX ||
3817  yMin * invSplitPrecision < 0.5 * INT_MIN ||
3818  yMax * invSplitPrecision > 0.5 * INT_MAX) {
3819  return;
3820  }
3821  // add some slack to the array bounds to avoid floating point
3822  // precision problems
3823  xMinI = (int)floor(xMin * invSplitPrecision) - 1;
3824  yMinI = (int)floor(yMin * invSplitPrecision) - 1;
3825  xMaxI = (int)floor(xMax * invSplitPrecision) + 1;
3826  yMaxI = (int)floor(yMax * invSplitPrecision) + 1;
3827  horizProfile = (char *)gmalloc(yMaxI - yMinI + 1);
3828  vertProfile = (char *)gmalloc(xMaxI - xMinI + 1);
3829  memset(horizProfile, 0, yMaxI - yMinI + 1);
3830  memset(vertProfile, 0, xMaxI - xMinI + 1);
3831  for (i = 0; i < charsA->getLength(); ++i) {
3832  ch = (TextChar *)charsA->get(i);
3833  // yMinI2 and yMaxI2 are adjusted to allow for slightly overlapping lines
3834  switch (rot) {
3835  case 0:
3836  default:
3837  xMinI2 = (int)floor(ch->xMin * invSplitPrecision);
3838  xMaxI2 = (int)floor(ch->xMax * invSplitPrecision);
3839  ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
3840  yMinI2 = (int)floor((ch->yMin + ascentAdjust) * invSplitPrecision);
3841  descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
3842  yMaxI2 = (int)floor((ch->yMax - descentAdjust) * invSplitPrecision);
3843  break;
3844  case 1:
3845  descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
3846  xMinI2 = (int)floor((ch->xMin + descentAdjust) * invSplitPrecision);
3847  ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
3848  xMaxI2 = (int)floor((ch->xMax - ascentAdjust) * invSplitPrecision);
3849  yMinI2 = (int)floor(ch->yMin * invSplitPrecision);
3850  yMaxI2 = (int)floor(ch->yMax * invSplitPrecision);
3851  break;
3852  case 2:
3853  xMinI2 = (int)floor(ch->xMin * invSplitPrecision);
3854  xMaxI2 = (int)floor(ch->xMax * invSplitPrecision);
3855  descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
3856  yMinI2 = (int)floor((ch->yMin + descentAdjust) * invSplitPrecision);
3857  ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
3858  yMaxI2 = (int)floor((ch->yMax - ascentAdjust) * invSplitPrecision);
3859  break;
3860  case 3:
3861  ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
3862  xMinI2 = (int)floor((ch->xMin + ascentAdjust) * invSplitPrecision);
3863  descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
3864  xMaxI2 = (int)floor((ch->xMax - descentAdjust) * invSplitPrecision);
3865  yMinI2 = (int)floor(ch->yMin * invSplitPrecision);
3866  yMaxI2 = (int)floor(ch->yMax * invSplitPrecision);
3867  break;
3868  }
3869  for (y = yMinI2; y <= yMaxI2; ++y) {
3870  horizProfile[y - yMinI] = 1;
3871  }
3872  for (x = xMinI2; x <= xMaxI2; ++x) {
3873  vertProfile[x - xMinI] = 1;
3874  }
3875  }
3876 
3877  //----- build the list of horizontal gaps
3878 
3879  for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
3880  for (y = start; y < yMaxI; ++y) {
3881  if (horizProfile[y - yMinI]) {
3882  if (!horizProfile[y + 1 - yMinI]) {
3883  start = y;
3884  }
3885  } else {
3886  if (horizProfile[y + 1 - yMinI]) {
3887  w = (y - start) * splitPrecision;
3888  horizGaps->addGap((start + 1) * splitPrecision + 0.5 * w, w);
3889  }
3890  }
3891  }
3892 
3893  //----- build the list of vertical gaps
3894 
3895  for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
3896  for (x = start; x < xMaxI; ++x) {
3897  if (vertProfile[x - xMinI]) {
3898  if (!vertProfile[x + 1 - xMinI]) {
3899  start = x;
3900  }
3901  } else {
3902  if (vertProfile[x + 1 - xMinI]) {
3903  w = (x - start) * splitPrecision;
3904  vertGaps->addGap((start + 1) * splitPrecision + 0.5 * w, w);
3905  }
3906  }
3907  }
3908 
3909  gfree(horizProfile);
3910  gfree(vertProfile);
3911 }
3912 
3913 // Decide whether this block is a line, column, or multiple columns:
3914 // - all leaf nodes are lines
3915 // - horiz split nodes whose children are lines or columns are columns
3916 // - other horiz split nodes are multiple columns
3917 // - vert split nodes, with small gaps, whose children are lines are lines
3918 // - other vert split nodes are multiple columns
3919 // (for rot=1,3: the horiz and vert splits are swapped)
3920 // In table layout mode:
3921 // - all leaf nodes are lines
3922 // - vert split nodes, with small gaps, whose children are lines are lines
3923 // - everything else is multiple columns
3924 // In simple layout mode:
3925 // - all leaf nodes are lines
3926 // - vert split nodes with small gaps are lines
3927 // - vert split nodes with large gaps are super-lines
3928 // - horiz split nodes are columns
3930  TextBlock *child;
3931  int i;
3932 
3933  if (control.mode == textOutTableLayout) {
3934  if (blk->type == blkLeaf) {
3935  blk->tag = blkTagLine;
3936  } else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit) &&
3937  blk->smallSplit) {
3938  blk->tag = blkTagLine;
3939  for (i = 0; i < blk->children->getLength(); ++i) {
3940  child = (TextBlock *)blk->children->get(i);
3941  if (child->tag != blkTagLine) {
3942  blk->tag = blkTagMulticolumn;
3943  break;
3944  }
3945  }
3946  } else {
3947  blk->tag = blkTagMulticolumn;
3948  }
3949  return;
3950  }
3951 
3952  if (control.mode == textOutSimpleLayout) {
3953  if (blk->type == blkLeaf) {
3954  blk->tag = blkTagLine;
3955  } else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit)) {
3956  blk->tag = blk->smallSplit ? blkTagLine : blkTagSuperLine;
3957  } else {
3958  blk->tag = blkTagColumn;
3959  }
3960  return;
3961  }
3962 
3963  if (blk->type == blkLeaf) {
3964  blk->tag = blkTagLine;
3965 
3966  } else {
3967  if (blk->type == ((blk->rot & 1) ? blkVertSplit : blkHorizSplit)) {
3968  blk->tag = blkTagColumn;
3969  for (i = 0; i < blk->children->getLength(); ++i) {
3970  child = (TextBlock *)blk->children->get(i);
3971  if (child->tag != blkTagColumn && child->tag != blkTagLine) {
3972  blk->tag = blkTagMulticolumn;
3973  break;
3974  }
3975  }
3976  } else {
3977  if (blk->smallSplit) {
3978  blk->tag = blkTagLine;
3979  for (i = 0; i < blk->children->getLength(); ++i) {
3980  child = (TextBlock *)blk->children->get(i);
3981  if (child->tag != blkTagLine) {
3982  blk->tag = blkTagMulticolumn;
3983  break;
3984  }
3985  }
3986  } else {
3987  blk->tag = blkTagMulticolumn;
3988  }
3989  }
3990  }
3991 }
3992 
3993 // Insert a list of large characters into a tree.
3994 void TextPage::insertLargeChars(GList *largeChars, TextBlock *blk) {
3995  TextChar *ch, *ch2;
3996  GBool singleLine;
3997  double minOverlap;
3998  int i;
3999 
4000  //~ this currently works only for characters in the primary rotation
4001 
4002  // check to see if the large chars are a single line
4003  singleLine = gTrue;
4004  for (i = 1; i < largeChars->getLength(); ++i) {
4005  ch = (TextChar *)largeChars->get(i-1);
4006  ch2 = (TextChar *)largeChars->get(i);
4007  minOverlap = 0.5 * (ch->fontSize < ch2->fontSize ? ch->fontSize
4008  : ch2->fontSize);
4009  if (ch->yMax - ch2->yMin < minOverlap ||
4010  ch2->yMax - ch->yMin < minOverlap) {
4011  singleLine = gFalse;
4012  break;
4013  }
4014  }
4015 
4016  if (singleLine) {
4017  // if the large chars are a single line, prepend them to the first
4018  // leaf node in blk
4019  insertLargeCharsInFirstLeaf(largeChars, blk);
4020  } else {
4021  // if the large chars are not a single line, prepend each one to
4022  // the appropriate leaf node -- this handles cases like bullets
4023  // drawn in a large font, on the left edge of a column
4024  for (i = largeChars->getLength() - 1; i >= 0; --i) {
4025  ch = (TextChar *)largeChars->get(i);
4026  insertLargeCharInLeaf(ch, blk);
4027  }
4028  }
4029 }
4030 
4031 // Find the first leaf (in depth-first order) in blk, and prepend a
4032 // list of large chars.
4034  TextChar *ch;
4035  int i;
4036 
4037  if (blk->type == blkLeaf) {
4038  for (i = largeChars->getLength() - 1; i >= 0; --i) {
4039  ch = (TextChar *)largeChars->get(i);
4040  blk->prependChild(ch);
4041  }
4042  } else {
4043  insertLargeCharsInFirstLeaf(largeChars, (TextBlock *)blk->children->get(0));
4044  blk->updateBounds(0);
4045  }
4046 }
4047 
4048 // Find the leaf in <blk> where large char <ch> belongs, and prepend
4049 // it.
4051  TextBlock *child;
4052  double y;
4053  int i;
4054 
4055  //~ this currently works only for characters in the primary rotation
4056 
4057  //~ this currently just looks down the left edge of blk
4058  //~ -- it could be extended to do more
4059 
4060  // estimate the baseline of ch
4061  y = ch->yMin + 0.75 * (ch->yMax - ch->yMin);
4062 
4063  if (blk->type == blkLeaf) {
4064  blk->prependChild(ch);
4065  } else if (blk->type == blkHorizSplit) {
4066  for (i = 0; i < blk->children->getLength(); ++i) {
4067  child = (TextBlock *)blk->children->get(i);
4068  if (y < child->yMax || i == blk->children->getLength() - 1) {
4070  blk->updateBounds(i);
4071  break;
4072  }
4073  }
4074  } else {
4076  blk->updateBounds(0);
4077  }
4078 }
4079 
4080 // Merge blk (rot != 0) into primaryTree (rot == 0).
4082  TextBlock *child;
4083 
4084  // we insert a whole column at a time - so call insertIntoTree
4085  // recursively until we get to a column (or line)
4086 
4087  if (blk->tag == blkTagMulticolumn) {
4088  while (blk->children->getLength()) {
4089  child = (TextBlock *)blk->children->del(0);
4090  insertIntoTree(child, primaryTree);
4091  }
4092  delete blk;
4093  } else {
4094  insertColumnIntoTree(blk, primaryTree);
4095  }
4096 }
4097 
4098 // Insert a column (as an atomic subtree) into tree.
4099 // Requirement: tree is not a leaf node.
4101  TextBlock *child;
4102  int i;
4103 
4104  for (i = 0; i < tree->children->getLength(); ++i) {
4105  child = (TextBlock *)tree->children->get(i);
4106  if (child->tag == blkTagMulticolumn &&
4107  column->xMin >= child->xMin &&
4108  column->yMin >= child->yMin &&
4109  column->xMax <= child->xMax &&
4110  column->yMax <= child->yMax) {
4112  tree->tag = blkTagMulticolumn;
4113  return;
4114  }
4115  }
4116 
4117  if (tree->type == blkVertSplit) {
4118  if (tree->rot == 1 || tree->rot == 2) {
4119  for (i = 0; i < tree->children->getLength(); ++i) {
4120  child = (TextBlock *)tree->children->get(i);
4121  if (column->xMax > 0.5 * (child->xMin + child->xMax)) {
4122  break;
4123  }
4124  }
4125  } else {
4126  for (i = 0; i < tree->children->getLength(); ++i) {
4127  child = (TextBlock *)tree->children->get(i);
4128  if (column->xMin < 0.5 * (child->xMin + child->xMax)) {
4129  break;
4130  }
4131  }
4132  }
4133  } else if (tree->type == blkHorizSplit) {
4134  if (tree->rot >= 2) {
4135  for (i = 0; i < tree->children->getLength(); ++i) {
4136  child = (TextBlock *)tree->children->get(i);
4137  if (column->yMax > 0.5 * (child->yMin + child->yMax)) {
4138  break;
4139  }
4140  }
4141  } else {
4142  for (i = 0; i < tree->children->getLength(); ++i) {
4143  child = (TextBlock *)tree->children->get(i);
4144  if (column->yMin < 0.5 * (child->yMin + child->yMax)) {
4145  break;
4146  }
4147  }
4148  }
4149  } else {
4150  // this should never happen
4151  return;
4152  }
4153  tree->children->insert(i, column);
4154  tree->tag = blkTagMulticolumn;
4155 }
4156 
4157 // Insert clipped characters back into the TextBlock tree.
4158 void TextPage::insertClippedChars(GList *clippedChars, TextBlock *tree) {
4159  TextChar *ch, *ch2;
4160  TextBlock *leaf;
4161  double y;
4162  int i;
4163 
4164  //~ this currently works only for characters in the primary rotation
4165 
4166  clippedChars->sort(TextChar::cmpX);
4167  while (clippedChars->getLength()) {
4168  ch = (TextChar *)clippedChars->del(0);
4169  if (ch->rot != 0) {
4170  continue;
4171  }
4172  if (!(leaf = findClippedCharLeaf(ch, tree))) {
4173  continue;
4174  }
4175  leaf->addChild(ch, gFalse);
4176  i = 0;
4177  while (i < clippedChars->getLength()) {
4178  ch2 = (TextChar *)clippedChars->get(i);
4179  if (ch2->xMin > ch->xMax + clippedTextMaxWordSpace * ch->fontSize) {
4180  break;
4181  }
4182  y = 0.5 * (ch2->yMin + ch2->yMax);
4183  if (y > leaf->yMin && y < leaf->yMax) {
4184  ch2 = (TextChar *)clippedChars->del(i);
4185  leaf->addChild(ch2, gFalse);
4186  ch = ch2;
4187  } else {
4188  ++i;
4189  }
4190  }
4191  }
4192 }
4193 
4194 // Find the leaf in <tree> to which clipped char <ch> can be appended.
4195 // Returns NULL if there is no appropriate append point.
4197  TextBlock *ret, *child;
4198  double y;
4199  int i;
4200 
4201  //~ this currently works only for characters in the primary rotation
4202 
4203  y = 0.5 * (ch->yMin + ch->yMax);
4204  if (tree->type == blkLeaf) {
4205  if (tree->rot == 0) {
4206  if (y > tree->yMin && y < tree->yMax &&
4207  ch->xMin <= tree->xMax + clippedTextMaxWordSpace * ch->fontSize) {
4208  return tree;
4209  }
4210  }
4211  } else {
4212  for (i = 0; i < tree->children->getLength(); ++i) {
4213  child = (TextBlock *)tree->children->get(i);
4214  if ((ret = findClippedCharLeaf(ch, child))) {
4215  return ret;
4216  }
4217  }
4218  }
4219  return NULL;
4220 }
4221 
4222 // Convert the tree of TextBlocks into a list of TextColumns.
4224  GList *columns;
4225 
4226  columns = new GList();
4228  return columns;
4229 }
4230 
4232  TextColumn *col;
4233  int i;
4234 
4235  switch (blk->tag) {
4236  case blkTagSuperLine: // should never happen
4237  case blkTagLine:
4238  case blkTagColumn:
4239  col = buildColumn(blk);
4240  columns->append(col);
4241  break;
4242  case blkTagMulticolumn:
4243 #if 0 //~tmp
4244  if (!primaryLR && blk->type == blkVertSplit) {
4245  for (i = blk->children->getLength() - 1; i >= 0; --i) {
4247  }
4248  } else {
4249 #endif
4250  for (i = 0; i < blk->children->getLength(); ++i) {
4252  }
4253 #if 0 //~tmp
4254  }
4255 #endif
4256  break;
4257  }
4258 }
4259 
4261  GList *lines, *parLines;
4262  GList *paragraphs;
4263  TextLine *line0, *line1;
4264  GBool dropCap;
4265  double spaceThresh, indent0, indent1, fontSize0, fontSize1;
4266  int i;
4267 
4268  lines = new GList();
4269  buildLines(blk, lines, gFalse);
4270 
4272 
4273  //~ could look for bulleted lists here: look for the case where
4274  //~ all out-dented lines start with the same char
4275 
4276  //~ this doesn't handle right-to-left scripts (need to look for indents
4277  //~ on the right instead of left, etc.)
4278 
4279  // build the paragraphs
4280  paragraphs = new GList();
4281  i = 0;
4282  while (i < lines->getLength()) {
4283 
4284  // get the first line of the paragraph
4285  parLines = new GList();
4286  dropCap = gFalse;
4287  line0 = (TextLine *)lines->get(i);
4288  parLines->append(line0);
4289  ++i;
4290 
4291  if (i < lines->getLength()) {
4292  line1 = (TextLine *)lines->get(i);
4293  indent0 = getLineIndent(line0, blk);
4294  indent1 = getLineIndent(line1, blk);
4295  fontSize0 = line0->fontSize;
4296  fontSize1 = line1->fontSize;
4297 
4298  // inverted indent
4299  if (indent1 - indent0 > minParagraphIndent * fontSize0 &&
4300  fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
4301  getLineSpacing(line0, line1) <= spaceThresh) {
4302  parLines->append(line1);
4303  indent0 = indent1;
4304  for (++i; i < lines->getLength(); ++i) {
4305  line1 = (TextLine *)lines->get(i);
4306  indent1 = getLineIndent(line1, blk);
4307  fontSize1 = line1->fontSize;
4308  if (indent0 - indent1 > minParagraphIndent * fontSize0) {
4309  break;
4310  }
4311  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
4312  break;
4313  }
4314  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
4315  > spaceThresh) {
4316  break;
4317  }
4318  parLines->append(line1);
4319  }
4320 
4321  // drop cap
4322  } else if (fontSize0 > largeCharThreshold * fontSize1 &&
4323  indent1 - indent0 > minParagraphIndent * fontSize1 &&
4324  getLineSpacing(line0, line1) < 0) {
4325  dropCap = gTrue;
4326  parLines->append(line1);
4327  fontSize0 = fontSize1;
4328  for (++i; i < lines->getLength(); ++i) {
4329  line1 = (TextLine *)lines->get(i);
4330  indent1 = getLineIndent(line1, blk);
4331  if (indent1 - indent0 <= minParagraphIndent * fontSize0) {
4332  break;
4333  }
4334  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
4335  > spaceThresh) {
4336  break;
4337  }
4338  parLines->append(line1);
4339  }
4340  for (; i < lines->getLength(); ++i) {
4341  line1 = (TextLine *)lines->get(i);
4342  indent1 = getLineIndent(line1, blk);
4343  fontSize1 = line1->fontSize;
4344  if (indent1 - indent0 > minParagraphIndent * fontSize0) {
4345  break;
4346  }
4347  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
4348  break;
4349  }
4350  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
4351  > spaceThresh) {
4352  break;
4353  }
4354  parLines->append(line1);
4355  }
4356 
4357  // regular indent or no indent
4358  } else if (fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
4359  getLineSpacing(line0, line1) <= spaceThresh) {
4360  parLines->append(line1);
4361  indent0 = indent1;
4362  for (++i; i < lines->getLength(); ++i) {
4363  line1 = (TextLine *)lines->get(i);
4364  indent1 = getLineIndent(line1, blk);
4365  fontSize1 = line1->fontSize;
4366  if (indent1 - indent0 > minParagraphIndent * fontSize0) {
4367  break;
4368  }
4369  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
4370  break;
4371  }
4372  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
4373  > spaceThresh) {
4374  break;
4375  }
4376  parLines->append(line1);
4377  }
4378  }
4379  }
4380 
4381  paragraphs->append(new TextParagraph(parLines, dropCap));
4382  }
4383 
4384  delete lines;
4385 
4386  return new TextColumn(paragraphs, blk->xMin, blk->yMin,
4387  blk->xMax, blk->yMax);
4388 }
4389 
4391  double indent;
4392 
4393  switch (line->rot) {
4394  case 0:
4395  default: indent = line->xMin - blk->xMin; break;
4396  case 1: indent = line->yMin - blk->yMin; break;
4397  case 2: indent = blk->xMax - line->xMax; break;
4398  case 3: indent = blk->yMax - line->yMax; break;
4399  }
4400  return indent;
4401 }
4402 
4403 // Compute average line spacing in column.
4405  double avg, sp;
4406  int n, i;
4407 
4408  avg = 0;
4409  n = 0;
4410  for (i = 1; i < lines->getLength(); ++i) {
4411  sp = getLineSpacing((TextLine *)lines->get(i - 1),
4412  (TextLine *)lines->get(i));
4413  if (sp > 0) {
4414  avg += sp;
4415  ++n;
4416  }
4417  }
4418  if (n > 0) {
4419  avg /= n;
4420  }
4421  return avg;
4422 }
4423 
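4424 // Compute the space between two lines; line0->rot selects which
// coordinates are compared.
// NOTE(review): the rot=2 case subtracts line1->yMin from line0->yMin
// (min minus min), and the rot=3 case reads only line1
// (line1->xMin - line1->xMax), unlike the rot=0 and rot=1 cases, which
// take one line's min minus the other line's max -- confirm whether
// this is intended.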
4426  double sp;
4427 
4428  switch (line0->rot) {
4429  case 0:
4430  default: sp = line1->yMin - line0->yMax; break;
4431  case 1: sp = line0->xMin - line1->xMax; break;
4432  case 2: sp = line0->yMin - line1->yMin; break;
4433  case 3: sp = line1->xMin - line1->xMax; break;
4434  }
4435  return sp;
4436 }
4437 
4439  GBool splitSuperLines) {
4440  TextLine *line;
4441  int i;
4442 
4443  if (blk->tag == blkTagLine ||
4444  (blk->tag == blkTagSuperLine && !splitSuperLines)) {
4445  line = buildLine(blk);
4446  if (blk->rot == 1 || blk->rot == 2) {
4447  lines->insert(0, line);
4448  } else {
4449  lines->append(line);
4450  }
4451  } else {
4452  for (i = 0; i < blk->children->getLength(); ++i) {
4453  buildLines((TextBlock *)blk->children->get(i), lines, splitSuperLines);
4454  }
4455  }
4456 }
4457 
4459  GList *columns, *paragraphs, *lines;
4461  int rot;
4462 
4463  charsA->sort(&TextChar::cmpX);
4464  columns = new GList();
4465  for (rot = 0; rot < 4; ++rot) {
4466  lines = buildSimple2Lines(charsA, rot);
4467  if (lines->getLength() == 0) {
4468  delete lines;
4469  continue;
4470  }
4472  paragraphs = new GList();
4473  paragraphs->append(paragraph);
4474  columns->append(new TextColumn(paragraphs,
4475  paragraph->xMin, paragraph->yMin,
4476  paragraph->xMax, paragraph->yMax));
4477  }
4478  return columns;
4479 }
4480 
4482  GList *openCharLines, *lines;
4483  TextCharLine *firstCharLine, *lastCharLine, *charLine, *p;
4484  TextChar *ch;
4485  TextLine *line;
4486  double bestOverlap, overlap, xMin, yMin, xMax, yMax;
4487  int bestLine, i, j, k, m;
4488 
4489  firstCharLine = lastCharLine = NULL;
4490  openCharLines = new GList();
4491  for (i = 0; i < charsA->getLength(); ++i) {
4492  ch = (TextChar *)charsA->get(i);
4493  if (ch->rot != rot) {
4494  continue;
4495  }
4496 
4497  // find the first open line with line.yMax > ch.yMin
4498  j = -1;
4499  k = openCharLines->getLength();
4500  while (j < k - 1) {
4501  // invariants: openLines[j].yMax <= ch.yMin (or j = -1)
4502  // openLines[k].yMax > ch.yMin (or k = nOpenLines)
4503  // j < k - 1
4504  m = j + (k - j) / 2;
4505  charLine = (TextCharLine *)openCharLines->get(m);
4506  if (charLine->yMax <= ch->yMin) {
4507  j = m;
4508  } else {
4509  k = m;
4510  }
4511  }
4512 
4513  // check overlap for all overlapping lines
4514  // i.e., all lines with line.yMin < ch.yMax and line.yMax > ch.yMin
4515  bestLine = -1;
4516  bestOverlap = 0;
4517  for (; k < openCharLines->getLength(); ++k) {
4518  charLine = (TextCharLine *)openCharLines->get(k);
4519  if (charLine->yMin >= ch->yMax) {
4520  break;
4521  }
4522  overlap = ((ch->yMax < charLine->yMax ? ch->yMax : charLine->yMax)
4523  - (ch->yMin > charLine->yMin ? ch->yMin : charLine->yMin))
4524  / (ch->yMax - ch->yMin);
4525  if (overlap > bestOverlap) {
4526  bestLine = k;
4527  bestOverlap = overlap;
4528  }
4529  }
4530 
4531  // found an overlapping line
4532  if (bestLine >= 0 && bestOverlap > simple2MinOverlap) {
4533  k = bestLine;
4534  charLine = (TextCharLine *)openCharLines->get(k);
4535 
4536  // else insert a new line immediately before line k
4537  } else {
4538  charLine = new TextCharLine(ch->rot);
4539  if (k < openCharLines->getLength()) {
4540  p = (TextCharLine *)openCharLines->get(k);
4541  if (p->prev) {
4542  p->prev->next = charLine;
4543  charLine->prev = p->prev;
4544  } else {
4545  firstCharLine = charLine;
4546  }
4547  p->prev = charLine;
4548  charLine->next = p;
4549  } else {
4550  if (lastCharLine) {
4551  lastCharLine->next = charLine;
4552  charLine->prev = lastCharLine;
4553  } else {
4554  firstCharLine = charLine;
4555  }
4556  lastCharLine = charLine;
4557  }
4558  openCharLines->insert(k, charLine);
4559  }
4560 
4561  // add the char to the line
4562  charLine->add(ch);
4563  charLine->yMin = ch->yMin;
4564  charLine->yMax = ch->yMax;
4565 
4566  // update open lines before k
4567  j = k - 1;
4568  while (j >= 0) {
4569  charLine = (TextCharLine *)openCharLines->get(j);
4570  if (charLine->yMax <= ch->yMin) {
4571  break;
4572  }
4573  charLine->yMax = ch->yMin;
4574  if (charLine->yMin < charLine->yMax) {
4575  break;
4576  }
4577  openCharLines->del(j);
4578  --j;
4579  }
4580 
4581  // update open lines after k
4582  j = k + 1;
4583  while (j < openCharLines->getLength()) {
4584  charLine = (TextCharLine *)openCharLines->get(j);
4585  if (charLine->yMin >= ch->yMax) {
4586  break;
4587  }
4