"Fossies" - the Fresh Open Source Software Archive 
Member "xpdf-4.04/xpdf/HTMLGen.cc" (18 Apr 2022, 34442 Bytes) of package /linux/misc/xpdf-4.04.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
For more information about "HTMLGen.cc" see the
Fossies "Dox" file reference documentation and the latest
Fossies "Diffs" side-by-side code changes report:
4.03_vs_4.04.
1 //========================================================================
2 //
3 // HTMLGen.cc
4 //
5 // Copyright 2010-2021 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 //~ to do:
10 //~ - fonts
11 //~ - underlined? (underlines are present in the background image)
12 //~ - include the original font name in the CSS entry (before the
13 //~ generic serif/sans-serif/monospace name)
14 //~ - check that htmlDir exists and is a directory
15 //~ - links:
16 //~ - internal links (to pages, to named destinations)
17 //~ - links from non-text content
18 //~ - rotated text should go in the background image
19 //~ - metadata
20 //~ - PDF outline
21
22 #include <aconf.h>
23
24 #ifdef USE_GCC_PRAGMAS
25 #pragma implementation
26 #endif
27
28 #include <stdlib.h>
29 #include <png.h>
30 #include "gmem.h"
31 #include "gmempp.h"
32 #include "GString.h"
33 #include "GList.h"
34 #include "SplashBitmap.h"
35 #include "PDFDoc.h"
36 #include "GfxFont.h"
37 #include "AcroForm.h"
38 #include "TextOutputDev.h"
39 #include "SplashOutputDev.h"
40 #include "ErrorCodes.h"
41 #include "WebFont.h"
42 #include "HTMLGen.h"
43
44 #ifdef _WIN32
45 # define strcasecmp stricmp
46 # define strncasecmp strnicmp
47 #endif
48
49 //------------------------------------------------------------------------
50
51 struct FontStyleTagInfo {
52 const char *tag;
53 int tagLen;
54 GBool bold;
55 GBool italic;
56 };
57
58 // NB: these are compared, in order, against the tail of the font
59 // name, so "BoldItalic" must come before "Italic", etc.
60 static FontStyleTagInfo fontStyleTags[] = {
61 {"Roman", 5, gFalse, gFalse},
62 {"Regular", 7, gFalse, gFalse},
63 {"Condensed", 9, gFalse, gFalse},
64 {"CondensedBold", 13, gTrue, gFalse},
65 {"CondensedLight", 14, gFalse, gFalse},
66 {"SemiBold", 8, gTrue, gFalse},
67 {"BoldItalicMT", 12, gTrue, gTrue},
68 {"BoldItalic", 10, gTrue, gTrue},
69 {"Bold_Italic", 11, gTrue, gTrue},
70 {"BoldOblique", 11, gTrue, gTrue},
71 {"Bold_Oblique", 12, gTrue, gTrue},
72 {"BoldMT", 6, gTrue, gFalse},
73 {"Bold", 4, gTrue, gFalse},
74 {"ItalicMT", 8, gFalse, gTrue},
75 {"Italic", 6, gFalse, gTrue},
76 {"Oblique", 7, gFalse, gTrue},
77 {"Light", 5, gFalse, gFalse},
78 {NULL, 0, gFalse, gFalse}
79 };
80
81 struct StandardFontInfo {
82 const char *name;
83 GBool fixedWidth;
84 GBool serif;
85 };
86
87 static StandardFontInfo standardFonts[] = {
88 {"Arial", gFalse, gFalse},
89 {"Courier", gTrue, gFalse},
90 {"Futura", gFalse, gFalse},
91 {"Helvetica", gFalse, gFalse},
92 {"Minion", gFalse, gTrue},
93 {"NewCenturySchlbk", gFalse, gTrue},
94 {"Times", gFalse, gTrue},
95 {"TimesNew", gFalse, gTrue},
96 {"Times_New", gFalse, gTrue},
97 {"Verdana", gFalse, gFalse},
98 {"LucidaSans", gFalse, gFalse},
99 {NULL, gFalse, gFalse}
100 };
101
// Width of the letter 'm' in a substitute (browser) font; compared
// against the PDF font's 'm' width to derive a scale factor.
struct SubstFontInfo {
  double mWidth;
};

// index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
// (only indexes 0..11 are reachable; the remaining slots are
// zero-initialized)
static SubstFontInfo substFonts[16] = {
  {0.833},      //  0: sans-serif
  {0.833},      //  1: sans-serif, italic
  {0.889},      //  2: sans-serif, bold
  {0.889},      //  3: sans-serif, bold + italic
  {0.788},      //  4: serif
  {0.722},      //  5: serif, italic
  {0.833},      //  6: serif, bold
  {0.778},      //  7: serif, bold + italic
  {0.600},      //  8: fixed-width
  {0.600},      //  9: fixed-width, italic
  {0.600},      // 10: fixed-width, bold
  {0.600}       // 11: fixed-width, bold + italic
};
121
// Map Unicode indexes from the private use area, following the Adobe
// Glyph list.  Entry [u - privateUnicodeMapStart] is the replacement
// code point for <u>, or 0 if <u> has no mapping.
#define privateUnicodeMapStart 0xf6f9
#define privateUnicodeMapEnd   0xf7ff
static int
privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = {
  0x0141, 0x0152, 0,      0,      0x0160, 0,      0x017d,         // f6f9
  0,      0,      0,      0,      0,      0,      0,      0,      // f700
  0,      0,      0,      0,      0,      0,      0,      0,      // f708
  0,      0,      0,      0,      0,      0,      0,      0,      // f710
  0,      0,      0,      0,      0,      0,      0,      0,      // f718
  0,      0x0021, 0,      0,      0x0024, 0,      0x0026, 0,      // f720
  0,      0,      0,      0,      0,      0,      0,      0,      // f728
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // f730
  0x0038, 0x0039, 0,      0,      0,      0,      0,      0x003f, // f738
  0,      0,      0,      0,      0,      0,      0,      0,      // f740
  0,      0,      0,      0,      0,      0,      0,      0,      // f748
  0,      0,      0,      0,      0,      0,      0,      0,      // f750
  0,      0,      0,      0,      0,      0,      0,      0,      // f758
  0,      0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // f760
  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // f768
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // f770
  0x0058, 0x0059, 0x005a, 0,      0,      0,      0,      0,      // f778
  0,      0,      0,      0,      0,      0,      0,      0,      // f780
  0,      0,      0,      0,      0,      0,      0,      0,      // f788
  0,      0,      0,      0,      0,      0,      0,      0,      // f790
  0,      0,      0,      0,      0,      0,      0,      0,      // f798
  0,      0x00a1, 0x00a2, 0,      0,      0,      0,      0,      // f7a0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7a8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7b0
  0,      0,      0,      0,      0,      0,      0,      0x00bf, // f7b8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7c0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7c8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7d0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7d8
  0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // f7e0
  0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // f7e8
  0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,      // f7f0
  0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178  // f7f8
};
162
// Vertical placement of a span relative to the line's baseline.  The
// values double as indexes into vertAlignNames, so the two must be
// kept in the same order.
enum VerticalAlignment {
  vertAlignBaseline = 0,
  vertAlignSub      = 1,
  vertAlignSuper    = 2,
  vertAlignTop      = 3
};

// CSS "vertical-align" keyword for each VerticalAlignment value.
static const char *vertAlignNames[] = {
  "baseline",   // vertAlignBaseline
  "sub",        // vertAlignSub
  "super",      // vertAlignSuper
  "top"         // vertAlignTop
};
176
177 //------------------------------------------------------------------------
178
179 class HTMLGenFontDefn {
180 public:
181
182 HTMLGenFontDefn(Ref fontIDA, GString *fontFaceA, GString *fontSpecA,
183 double scaleA)
184 : fontID(fontIDA), fontFace(fontFaceA), fontSpec(fontSpecA)
185 , scale(scaleA), used(gFalse) {}
186 ~HTMLGenFontDefn() { delete fontFace; delete fontSpec; }
187 GBool match(Ref fontIDA)
188 { return fontIDA.num == fontID.num && fontIDA.gen == fontID.gen; }
189
190 Ref fontID;
191 GString *fontFace; // NULL for substituted fonts
192 GString *fontSpec;
193 double scale;
194 GBool used; // set when used (per page)
195 };
196
197 //------------------------------------------------------------------------
198
199 class HTMLGenFormFieldInfo {
200 public:
201
202 HTMLGenFormFieldInfo(AcroFormField *acroFormFieldA)
203 : acroFormField(acroFormFieldA) {}
204
205 AcroFormField *acroFormField;
206 };
207
208 //------------------------------------------------------------------------
209
// Streaming base64 encoder.  Input bytes are passed to encode() in
// arbitrarily sized pieces; each complete group of three bytes is
// written as four characters through the write callback.  flush()
// writes the final, padded group (if any).  No line breaks are
// inserted, and the callback's return value is not checked.
class Base64Encoder {
public:

  // <writeFuncA> is called as writeFuncA(streamA, data, size) for
  // every chunk of encoded output.
  Base64Encoder(int (*writeFuncA)(void *stream, const char *data, int size),
                void *streamA);

  // Encode <size> bytes starting at <data>.  Up to two trailing bytes
  // are held back until the next encode() or flush() call.
  void encode(const unsigned char *data, size_t size);

  // Write any held-back bytes as a padded group.  The encoder is empty
  // afterwards, so calling flush() again does nothing, and a later
  // encode() starts a fresh group.
  void flush();

private:

  int (*writeFunc)(void *stream, const char *data, int size);
  void *stream;
  unsigned char buf[3];         // bytes waiting to form a full group
  int bufLen;                   // number of valid bytes in buf (0..2
                                //   between calls)
};

static const char base64Chars[65] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

Base64Encoder::Base64Encoder(int (*writeFuncA)(void *stream, const char *data,
                                               int size),
                             void *streamA) {
  writeFunc = writeFuncA;
  stream = streamA;
  bufLen = 0;
}

void Base64Encoder::encode(const unsigned char *data, size_t size) {
  size_t i = 0;
  while (1) {
    // top up the 3-byte group; stop (keeping the partial group) when
    // the input runs out
    while (bufLen < 3) {
      if (i >= size) {
        return;
      }
      buf[bufLen++] = data[i++];
    }
    char out[4];
    out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
    out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
    out[2] = base64Chars[((buf[1] << 2) | (buf[2] >> 6)) & 0x3f];
    out[3] = base64Chars[buf[2] & 0x3f];
    writeFunc(stream, out, 4);
    bufLen = 0;
  }
}

void Base64Encoder::flush() {
  // if bufLen == 0, this does nothing
  // bufLen should never be 3 here
  char out[4];
  if (bufLen == 1) {
    out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
    out[1] = base64Chars[(buf[0] << 4) & 0x3f];
    out[2] = '=';
    out[3] = '=';
    writeFunc(stream, out, 4);
  } else if (bufLen == 2) {
    out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
    out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
    out[2] = base64Chars[(buf[1] << 2) & 0x3f];
    out[3] = '=';
    writeFunc(stream, out, 4);
  }
  // the held-back bytes have been written: forget them, so that they
  // are not emitted a second time by a later flush() or encode()
  bufLen = 0;
}
274
275 static int writeToString(void *stream, const char *data, int size) {
276 ((GString *)stream)->append(data, size);
277 return size;
278 }
279
280 //------------------------------------------------------------------------
281
282
283 //------------------------------------------------------------------------
284
285 HTMLGen::HTMLGen(double backgroundResolutionA, GBool tableMode) {
286 TextOutputControl textOutControl;
287 SplashColor paperColor;
288
289 ok = gTrue;
290
291 backgroundResolution = backgroundResolutionA;
292 zoom = 1.0;
293 vStretch = 1.0;
294 drawInvisibleText = gTrue;
295 allTextInvisible = gFalse;
296 extractFontFiles = gFalse;
297 convertFormFields = gFalse;
298 embedBackgroundImage = gFalse;
299 embedFonts = gFalse;
300
301 // set up the TextOutputDev
302 textOutControl.mode = tableMode ? textOutTableLayout : textOutReadingOrder;
303 textOutControl.html = gTrue;
304 textOutControl.splitRotatedWords = gTrue;
305 textOut = new TextOutputDev(NULL, &textOutControl, gFalse);
306 if (!textOut->isOk()) {
307 ok = gFalse;
308 }
309
310 // set up the SplashOutputDev
311 paperColor[0] = paperColor[1] = paperColor[2] = 0xff;
312 splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor);
313
314 fontDefns = NULL;
315 }
316
317 HTMLGen::~HTMLGen() {
318 delete textOut;
319 delete splashOut;
320 if (fontDefns) {
321 deleteGList(fontDefns, HTMLGenFontDefn);
322 }
323 }
324
325 void HTMLGen::startDoc(PDFDoc *docA) {
326 doc = docA;
327 splashOut->startDoc(doc->getXRef());
328
329 if (fontDefns) {
330 deleteGList(fontDefns, HTMLGenFontDefn);
331 }
332 fontDefns = new GList();
333 nextFontFaceIdx = 0;
334 }
335
// Write the NUL-terminated string <data> through <writeFunc>, and
// return whatever <writeFunc> returns.
static inline int pr(int (*writeFunc)(void *stream, const char *data, int size),
                     void *stream, const char *data) {
  const int len = (int)strlen(data);
  return writeFunc(stream, data, len);
}
340
341 static int pf(int (*writeFunc)(void *stream, const char *data, int size),
342 void *stream, const char *fmt, ...) {
343 va_list args;
344 GString *s;
345 int ret;
346
347 va_start(args, fmt);
348 s = GString::formatv(fmt, args);
349 va_end(args);
350 ret = writeFunc(stream, s->getCString(), s->getLength());
351 delete s;
352 return ret;
353 }
354
355 struct PNGWriteInfo {
356 Base64Encoder *base64;
357 int (*writePNG)(void *stream, const char *data, int size);
358 void *pngStream;
359 };
360
361 static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) {
362 PNGWriteInfo *info = (PNGWriteInfo *)png_get_progressive_ptr(png);
363 if (info->base64) {
364 info->base64->encode(data, size);
365 } else {
366 info->writePNG(info->pngStream, (char *)data, (int)size);
367 }
368 }
369
370 int HTMLGen::convertPage(
371 int pg, const char *pngURL, const char *htmlDir,
372 int (*writeHTML)(void *stream, const char *data, int size),
373 void *htmlStream,
374 int (*writePNG)(void *stream, const char *data, int size),
375 void *pngStream) {
376 png_structp png;
377 png_infop pngInfo;
378 PNGWriteInfo writeInfo;
379 SplashBitmap *bitmap;
380 Guchar *p;
381 double pageW, pageH;
382 TextPage *text;
383 GList *cols, *pars, *lines, *words;
384 TextFontInfo *font;
385 TextColumn *col;
386 TextParagraph *par;
387 TextLine *line;
388 HTMLGenFontDefn *fontDefn;
389 GString *s;
390 double base;
391 int primaryDir, spanDir;
392 int colIdx, parIdx, lineIdx, firstWordIdx, lastWordIdx;
393 int y, i;
394
395 // generate the background bitmap
396 splashOut->setSkipText(!allTextInvisible, gFalse);
397 doc->displayPage(splashOut, pg,
398 backgroundResolution, backgroundResolution * vStretch,
399 0, gFalse, gTrue, gFalse);
400 bitmap = splashOut->getBitmap();
401
402 // page size
403 if (doc->getPageRotate(pg) == 90 || doc->getPageRotate(pg) == 270) {
404 pageW = doc->getPageCropHeight(pg);
405 pageH = doc->getPageCropWidth(pg);
406 } else {
407 pageW = doc->getPageCropWidth(pg);
408 pageH = doc->getPageCropHeight(pg);
409 }
410
411 // get the PDF text
412 doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse);
413 doc->processLinks(textOut, pg);
414 text = textOut->takeText();
415 primaryDir = text->primaryDirectionIsLR() ? 1 : -1;
416
417 // insert a special character for each form field;
418 // remove existing characters inside field bboxes;
419 // erase background content inside field bboxes
420 formFieldFont = NULL;
421 formFieldInfo = NULL;
422 if (convertFormFields) {
423 AcroForm *form = doc->getCatalog()->getForm();
424 if (form) {
425 formFieldInfo = new GList();
426 formFieldFont = new TextFontInfo();
427 double yTop = doc->getCatalog()->getPage(pg)->getMediaBox()->y2;
428 for (i = 0; i < form->getNumFields(); ++i) {
429 AcroFormField *field = form->getField(i);
430 AcroFormFieldType fieldType = field->getAcroFormFieldType();
431 if (field->getPageNum() == pg &&
432 (fieldType == acroFormFieldText ||
433 fieldType == acroFormFieldCheckbox)) {
434 double llx, lly, urx, ury;
435 field->getBBox(&llx, &lly, &urx, &ury);
436 lly = yTop - lly;
437 ury = yTop - ury;
438
439 // add the field info
440 int fieldIdx = formFieldInfo->getLength();
441 formFieldInfo->append(new HTMLGenFormFieldInfo(field));
442
443 // remove exsting chars
444 text->removeChars(llx, ury, urx, lly, 0.75, 0.5);
445
446 // erase background content
447 int llxI = (int)(llx * backgroundResolution / 72 + 0.5);
448 int llyI = (int)(lly * backgroundResolution * vStretch / 72 + 0.5);
449 int urxI = (int)(urx * backgroundResolution / 72 + 0.5);
450 int uryI = (int)(ury * backgroundResolution * vStretch / 72 + 0.5);
451 llyI += (int)(backgroundResolution * vStretch / 20);
452 if (llxI < 0) {
453 llxI = 0;
454 }
455 if (urxI >= bitmap->getWidth()) {
456 urxI = bitmap->getWidth() - 1;
457 }
458 if (uryI < 0) {
459 uryI = 0;
460 }
461 if (llyI > bitmap->getHeight()) {
462 llyI = bitmap->getHeight() - 1;
463 }
464 if (uryI <= llyI && llxI <= urxI) {
465 SplashColorPtr p = bitmap->getDataPtr()
466 + uryI * bitmap->getRowSize() + llxI * 3;
467 for (int y = uryI; y <= llyI; ++y) {
468 memset(p, 0xff, (urxI - llxI + 1) * 3);
469 p += bitmap->getRowSize();
470 }
471 }
472
473 // add a special char
474 // (the font size is unused -- 10 is an arbitrary value)
475 text->addSpecialChar(llx, ury, urx, lly,
476 0, formFieldFont, 10, 0x80000000 + fieldIdx);
477 }
478 }
479 }
480 }
481
482 // HTML header
483 pr(writeHTML, htmlStream, "<html>\n");
484 pr(writeHTML, htmlStream, "<head>\n");
485 pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
486 pr(writeHTML, htmlStream, "<style type=\"text/css\">\n");
487 pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n");
488 if (convertFormFields) {
489 pr(writeHTML, htmlStream, ".textfield {\n");
490 pr(writeHTML, htmlStream, " border: 0;\n");
491 pr(writeHTML, htmlStream, " padding: 0;\n");
492 pr(writeHTML, htmlStream, " background: #ccccff;\n");
493 pr(writeHTML, htmlStream, "}\n");
494 pr(writeHTML, htmlStream, ".checkbox {\n");
495 pr(writeHTML, htmlStream, "}\n");
496 }
497 fonts = text->getFonts();
498 fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double));
499 for (i = 0; i < fontDefns->getLength(); ++i) {
500 fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
501 fontDefn->used = gFalse;
502 }
503 for (i = 0; i < fonts->getLength(); ++i) {
504 font = (TextFontInfo *)fonts->get(i);
505 fontDefn = getFontDefn(font, htmlDir);
506 if (!fontDefn->used && fontDefn->fontFace) {
507 pr(writeHTML, htmlStream, fontDefn->fontFace->getCString());
508 }
509 pf(writeHTML, htmlStream, ".f{0:d} {{ {1:t} }}\n", i, fontDefn->fontSpec);
510 fontScales[i] = fontDefn->scale;
511 fontDefn->used = gTrue;
512 }
513 pr(writeHTML, htmlStream, "</style>\n");
514 pr(writeHTML, htmlStream, "</head>\n");
515 if (primaryDir >= 0) {
516 pr(writeHTML, htmlStream, "<body>\n");
517 } else {
518 pr(writeHTML, htmlStream, "<body dir=\"rtl\">\n");
519 }
520
521 // background image element (part 1)
522 if (primaryDir >= 0) {
523 pf(writeHTML, htmlStream, "<img style=\"position:absolute; left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
524 (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
525 } else {
526 pf(writeHTML, htmlStream, "<img style=\"position:absolute; right:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
527 (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
528 }
529 if (embedBackgroundImage) {
530 pr(writeHTML, htmlStream, "src=\"data:image/png;base64,\n");
531 writeInfo.base64 = new Base64Encoder(writeHTML, htmlStream);
532 writeInfo.writePNG = NULL;
533 writeInfo.pngStream = NULL;
534 } else {
535 pf(writeHTML, htmlStream, "src=\"{0:s}\"", pngURL);
536 writeInfo.base64 = NULL;
537 writeInfo.writePNG = writePNG;
538 writeInfo.pngStream = pngStream;
539 }
540
541 // background image data - writing to a separate file, or embedding
542 // with base64 encoding
543 if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
544 NULL, NULL, NULL)) ||
545 !(pngInfo = png_create_info_struct(png))) {
546 return errFileIO;
547 }
548 if (setjmp(png_jmpbuf(png))) {
549 return errFileIO;
550 }
551 png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL);
552 png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(),
553 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
554 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
555 png_write_info(png, pngInfo);
556 p = bitmap->getDataPtr();
557 for (y = 0; y < bitmap->getHeight(); ++y) {
558 png_write_row(png, (png_bytep)p);
559 p += bitmap->getRowSize();
560 }
561 png_write_end(png, pngInfo);
562 png_destroy_write_struct(&png, &pngInfo);
563 if (embedBackgroundImage) {
564 writeInfo.base64->flush();
565 delete writeInfo.base64;
566 }
567
568 // background image element (part 2)
569 pr(writeHTML, htmlStream, "\">\n");
570
571 // generate the HTML text
572 nextFieldID = 0;
573 cols = text->makeColumns();
574 for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) {
575 col = (TextColumn *)cols->get(colIdx);
576 pars = col->getParagraphs();
577 for (parIdx = 0; parIdx < pars->getLength(); ++parIdx) {
578 par = (TextParagraph *)pars->get(parIdx);
579 lines = par->getLines();
580 for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) {
581 line = (TextLine *)lines->get(lineIdx);
582 if (line->getRotation() != 0) {
583 continue;
584 }
585 words = line->getWords();
586 if (lineIdx == 0 && par->hasDropCap() && words->getLength() >= 2) {
587 base = ((TextWord *)words->get(1))->getBaseline();
588 } else {
589 base = line->getBaseline();
590 }
591 s = new GString();
592 for (firstWordIdx = (primaryDir >= 0) ? 0 : words->getLength() - 1;
593 (primaryDir >= 0) ? firstWordIdx < words->getLength()
594 : firstWordIdx >= 0;
595 firstWordIdx = lastWordIdx + primaryDir) {
596 lastWordIdx = findDirSpan(words, firstWordIdx,
597 primaryDir, &spanDir);
598 appendSpans(words, firstWordIdx, lastWordIdx,
599 primaryDir, spanDir,
600 base, lineIdx == 0 && par->hasDropCap(),
601 s);
602 }
603 if (primaryDir >= 0) {
604 pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
605 (int)(line->getXMin() * zoom),
606 (int)(line->getYMin() * zoom * vStretch), s);
607 } else {
608 pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; right:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
609 (int)((pageW - line->getXMax()) * zoom),
610 (int)(line->getYMin() * zoom * vStretch), s);
611 }
612 delete s;
613 }
614 }
615 }
616 gfree(fontScales);
617 delete text;
618 deleteGList(cols, TextColumn);
619 if (formFieldFont) {
620 delete formFieldFont;
621 formFieldFont = NULL;
622 }
623 if (formFieldInfo) {
624 deleteGList(formFieldInfo, HTMLGenFormFieldInfo);
625 formFieldInfo = NULL;
626 }
627
628 // HTML trailer
629 pr(writeHTML, htmlStream, "</body>\n");
630 pr(writeHTML, htmlStream, "</html>\n");
631
632 return errNone;
633 }
634
635 // Find a sequence of words, starting at <firstWordIdx>, that have the
636 // same writing direction. Returns the index of the last word, and
637 // sets *<spanDir> to the span direction.
638 int HTMLGen::findDirSpan(GList *words, int firstWordIdx, int primaryDir,
639 int *spanDir) {
640 int dir0, dir1, nextWordIdx;
641
642 dir0 = ((TextWord *)words->get(firstWordIdx))->getDirection();
643 for (nextWordIdx = firstWordIdx + primaryDir;
644 (primaryDir >= 0) ? nextWordIdx < words->getLength()
645 : nextWordIdx >= 0;
646 nextWordIdx += primaryDir) {
647 dir1 = ((TextWord *)words->get(nextWordIdx))->getDirection();
648 if (dir0 == 0) {
649 dir0 = dir1;
650 } else if (dir1 != 0 && dir1 != dir0) {
651 break;
652 }
653 }
654
655 if (dir0 == 0) {
656 *spanDir = primaryDir;
657 } else {
658 *spanDir = dir0;
659 }
660
661 return nextWordIdx - primaryDir;
662 }
663
664 // Create HTML spans for words <firstWordIdx> .. <lastWordIdx>, and
665 // append them to <s>.
666 void HTMLGen::appendSpans(GList *words, int firstWordIdx, int lastWordIdx,
667 int primaryDir, int spanDir,
668 double base, GBool dropCapLine, GString *s) {
669 if (allTextInvisible && !drawInvisibleText) {
670 return;
671 }
672
673 if (spanDir != primaryDir) {
674 int t = firstWordIdx;
675 firstWordIdx = lastWordIdx;
676 lastWordIdx = t;
677 }
678
679 int wordIdx = firstWordIdx;
680 while ((spanDir >= 0) ? wordIdx <= lastWordIdx
681 : wordIdx >= lastWordIdx) {
682 TextWord *word0 = (TextWord *)words->get(wordIdx);
683
684 // form field(s): generate <input> element(s)
685 if (convertFormFields && word0->getFontInfo() == formFieldFont) {
686 for (int i = (spanDir >= 0) ? 0 : word0->getLength() - 1;
687 (spanDir >= 0) ? i < word0->getLength() : i >= 0;
688 i += spanDir) {
689 int fieldIdx = word0->getChar(0) - 0x80000000;
690 if (fieldIdx >= 0 && fieldIdx < formFieldInfo->getLength()) {
691 HTMLGenFormFieldInfo *ffi =
692 (HTMLGenFormFieldInfo *)formFieldInfo->get(fieldIdx);
693 AcroFormField *field = ffi->acroFormField;
694 AcroFormFieldType fieldType = field->getAcroFormFieldType();
695 double llx, lly, urx, ury;
696 field->getBBox(&llx, &lly, &urx, &ury);
697 int width = (int)(urx - llx);
698 Ref fontID;
699 double fontSize;
700 field->getFont(&fontID, &fontSize);
701 if (fontSize == 0) {
702 fontSize = 12;
703 }
704 if (fieldType == acroFormFieldText) {
705 s->appendf("<input type=\"text\" class=\"textfield\" id=\"textfield{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
706 ++nextFieldID;
707 } else if (fieldType == acroFormFieldCheckbox) {
708 s->appendf("<input type=\"checkbox\" class=\"checkbox\" id=\"checkbox{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
709 ++nextFieldID;
710 }
711 }
712 }
713
714 if (word0->getSpaceAfter()) {
715 s->append(' ');
716 }
717
718 wordIdx += spanDir;
719
720 // skip invisible words
721 } else if (!drawInvisibleText &&
722 (word0->isInvisible() || word0->isRotated())) {
723 wordIdx += spanDir;
724
725 // generate a <span> containing one or more words
726 } else {
727
728 double r0 = 0, g0 = 0, b0 = 0; // make gcc happy
729 VerticalAlignment vertAlign0 = vertAlignBaseline; // make gcc happy
730 GString *linkURI0 = NULL;
731
732 GBool invisible = word0->isInvisible() || word0->isRotated();
733
734 do {
735 TextWord *word1 = (TextWord *)words->get(wordIdx);
736
737 // get word parameters
738 double r1, g1, b1;
739 word0->getColor(&r1, &g1, &b1);
740 double base1 = word1->getBaseline();
741 VerticalAlignment vertAlign1;
742 if (dropCapLine) {
743 //~ this will fail if there are subscripts or superscripts in
744 //~ the first line of a paragraph with a drop cap
745 vertAlign1 = vertAlignTop;
746 } else if (base1 - base < -1) {
747 vertAlign1 = vertAlignSuper;
748 } else if (base1 - base > 1) {
749 vertAlign1 = vertAlignSub;
750 } else {
751 vertAlign1 = vertAlignBaseline;
752 }
753 GString *linkURI1 = word1->getLinkURI();
754
755 // start of span
756 if (word1 == word0) {
757 r0 = r1;
758 g0 = g1;
759 b0 = b1;
760 vertAlign0 = vertAlign1;
761 linkURI0 = linkURI1;
762
763 int i;
764 for (i = 0; i < fonts->getLength(); ++i) {
765 if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) {
766 break;
767 }
768 }
769 if (linkURI1) {
770 s->appendf("<a href=\"{0:t}\">", linkURI0);
771 }
772 // we force spans to be LTR or RTL; this is a kludge, but it's
773 // far easier than implementing the full Unicode bidi algorithm
774 const char *dirTag;
775 if (spanDir == primaryDir) {
776 dirTag = "";
777 } else if (spanDir < 0) {
778 dirTag = " dir=\"rtl\"";
779 } else {
780 dirTag = " dir=\"ltr\"";
781 }
782 s->appendf("<span class=\"f{0:d}\"{1:s} style=\"font-size:{2:d}px;vertical-align:{3:s};{4:s}color:rgba({5:d},{6:d},{7:d},{8:d});\">",
783 i,
784 dirTag,
785 (int)(fontScales[i] * word1->getFontSize() * zoom),
786 vertAlignNames[vertAlign1],
787 (dropCapLine && wordIdx == 0) ? "line-height:75%;" : "",
788 (int)(r0 * 255), (int)(g0 * 255), (int)(b0 * 255),
789 invisible ? 0 : 1);
790
791 // end of span
792 } else if (word1->getFontInfo() != word0->getFontInfo() ||
793 word1->getFontSize() != word0->getFontSize() ||
794 word1->isInvisible() != word0->isInvisible() ||
795 word1->isRotated() != word0->isRotated() ||
796 vertAlign1 != vertAlign0 ||
797 r1 != r0 || g1 != g0 || b1 != b0 ||
798 linkURI1 != linkURI0) {
799 break;
800 }
801
802 // add a space before the word, if needed
803 // -- this only happens with the first word in a reverse section
804 if (spanDir != primaryDir && wordIdx == firstWordIdx) {
805 GBool sp;
806 if (spanDir >= 0) {
807 if (wordIdx > 0) {
808 sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
809 } else {
810 sp = gFalse;
811 }
812 } else {
813 sp = word1->getSpaceAfter();
814 }
815 if (sp) {
816 s->append(' ');
817 }
818 }
819
820 // generate the word text
821 for (int i = (spanDir >= 0) ? 0 : word1->getLength() - 1;
822 (spanDir >= 0) ? i < word1->getLength() : i >= 0;
823 i += spanDir) {
824 Unicode u = word1->getChar(i);
825 if (u >= privateUnicodeMapStart &&
826 u <= privateUnicodeMapEnd &&
827 privateUnicodeMap[u - privateUnicodeMapStart]) {
828 u = privateUnicodeMap[u - privateUnicodeMapStart];
829 }
830 appendUTF8(u, s);
831 }
832
833 // add a space after the word, if needed
834 // -- there is never a space after the last word in a reverse
835 // section (this will be handled as a space after the last
836 // word in the previous primary-direction section)
837 GBool sp;
838 if (spanDir != primaryDir && wordIdx == lastWordIdx) {
839 sp = gFalse;
840 } else if (spanDir >= 0) {
841 sp = word1->getSpaceAfter();
842 } else {
843 if (wordIdx > 0) {
844 sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
845 } else {
846 sp = gFalse;
847 }
848 }
849 if (sp) {
850 s->append(' ');
851 }
852
853 wordIdx += spanDir;
854 } while ((spanDir >= 0) ? wordIdx <= lastWordIdx
855 : wordIdx >= lastWordIdx);
856
857 s->append("</span>");
858 if (linkURI0) {
859 s->append("</a>");
860 }
861 }
862 }
863 }
864
865 void HTMLGen::appendUTF8(Unicode u, GString *s) {
866 if (u <= 0x7f) {
867 if (u == '&') {
868 s->append("&");
869 } else if (u == '<') {
870 s->append("<");
871 } else if (u == '>') {
872 s->append(">");
873 } else {
874 s->append((char)u);
875 }
876 } else if (u <= 0x7ff) {
877 s->append((char)(0xc0 + (u >> 6)));
878 s->append((char)(0x80 + (u & 0x3f)));
879 } else if (u <= 0xffff) {
880 s->append((char)(0xe0 + (u >> 12)));
881 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
882 s->append((char)(0x80 + (u & 0x3f)));
883 } else if (u <= 0x1fffff) {
884 s->append((char)(0xf0 + (u >> 18)));
885 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
886 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
887 s->append((char)(0x80 + (u & 0x3f)));
888 } else if (u <= 0x3ffffff) {
889 s->append((char)(0xf8 + (u >> 24)));
890 s->append((char)(0x80 + ((u >> 18) & 0x3f)));
891 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
892 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
893 s->append((char)(0x80 + (u & 0x3f)));
894 } else if (u <= 0x7fffffff) {
895 s->append((char)(0xfc + (u >> 30)));
896 s->append((char)(0x80 + ((u >> 24) & 0x3f)));
897 s->append((char)(0x80 + ((u >> 18) & 0x3f)));
898 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
899 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
900 s->append((char)(0x80 + (u & 0x3f)));
901 }
902 }
903
904 HTMLGenFontDefn *HTMLGen::getFontDefn(TextFontInfo *font,
905 const char *htmlDir) {
906 Ref id;
907 HTMLGenFontDefn *fontDefn;
908 int i;
909
910 // check the existing font defns
911 id = font->getFontID();
912 if (id.num >= 0) {
913 for (i = 0; i < fontDefns->getLength(); ++i) {
914 fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
915 if (fontDefn->match(id)) {
916 return fontDefn;
917 }
918 }
919 }
920
921 // try to extract a font file
922 if (!extractFontFiles ||
923 !(fontDefn = getFontFile(font, htmlDir))) {
924
925 // get a substitute font
926 fontDefn = getSubstituteFont(font);
927 }
928
929 fontDefns->append(fontDefn);
930 return fontDefn;
931 }
932
933 HTMLGenFontDefn *HTMLGen::getFontFile(TextFontInfo *font,
934 const char *htmlDir) {
935 Ref id;
936 HTMLGenFontDefn *fontDefn;
937 Object fontObj;
938 GfxFont *gfxFont;
939 WebFont *webFont;
940 GString *fontFile, *fontPath, *fontFace, *fontSpec;
941 const char *family, *weight, *style;
942 double scale;
943
944 id = font->getFontID();
945 if (id.num < 0) {
946 return NULL;
947 }
948
949 doc->getXRef()->fetch(id.num, id.gen, &fontObj);
950 if (!fontObj.isDict()) {
951 fontObj.free();
952 return NULL;
953 }
954
955 gfxFont = GfxFont::makeFont(doc->getXRef(), "F", id, fontObj.getDict());
956 webFont = new WebFont(gfxFont, doc->getXRef());
957 fontDefn = NULL;
958 fontFace = NULL;
959
960 if (webFont->canWriteTTF()) {
961 if (embedFonts) {
962 GString *ttfData = webFont->getTTFData();
963 if (ttfData) {
964 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/ttf;base64,",
965 nextFontFaceIdx);
966 Base64Encoder enc(writeToString, fontFace);
967 enc.encode((unsigned char *)ttfData->getCString(),
968 (size_t)ttfData->getLength());
969 enc.flush();
970 fontFace->append("\"); }\n");
971 delete ttfData;
972 }
973 } else {
974 fontFile = GString::format("{0:d}.ttf", nextFontFaceIdx);
975 fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
976 if (webFont->writeTTF(fontPath->getCString())) {
977 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
978 nextFontFaceIdx, fontFile);
979 }
980 delete fontPath;
981 delete fontFile;
982 }
983 if (fontFace) {
984 getFontDetails(font, &family, &weight, &style, &scale);
985 fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
986 nextFontFaceIdx, family, weight, style);
987 ++nextFontFaceIdx;
988 fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
989 }
990
991 } else if (webFont->canWriteOTF()) {
992 if (embedFonts) {
993 GString *otfData = webFont->getOTFData();
994 if (otfData) {
995 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/otf;base64,",
996 nextFontFaceIdx);
997 Base64Encoder enc(writeToString, fontFace);
998 enc.encode((unsigned char *)otfData->getCString(),
999 (size_t)otfData->getLength());
1000 enc.flush();
1001 fontFace->append("\"); }\n");
1002 delete otfData;
1003 }
1004 } else {
1005 fontFile = GString::format("{0:d}.otf", nextFontFaceIdx);
1006 fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
1007 if (webFont->writeOTF(fontPath->getCString())) {
1008 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
1009 nextFontFaceIdx, fontFile);
1010 }
1011 delete fontPath;
1012 delete fontFile;
1013 }
1014 if (fontFace) {
1015 getFontDetails(font, &family, &weight, &style, &scale);
1016 fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
1017 nextFontFaceIdx, family, weight, style);
1018 ++nextFontFaceIdx;
1019 fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
1020 }
1021 }
1022
1023 delete webFont;
1024 delete gfxFont;
1025 fontObj.free();
1026
1027 return fontDefn;
1028 }
1029
1030 HTMLGenFontDefn *HTMLGen::getSubstituteFont(TextFontInfo *font) {
1031 const char *family, *weight, *style;
1032 double scale;
1033 GString *fontSpec;
1034
1035 getFontDetails(font, &family, &weight, &style, &scale);
1036 fontSpec = GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};",
1037 family, weight, style);
1038 return new HTMLGenFontDefn(font->getFontID(), NULL, fontSpec, scale);
1039 }
1040
1041 void HTMLGen::getFontDetails(TextFontInfo *font, const char **family,
1042 const char **weight, const char **style,
1043 double *scale) {
1044 GString *fontName;
1045 char *fontName2;
1046 FontStyleTagInfo *fst;
1047 StandardFontInfo *sf;
1048 GBool fixedWidth, serif, bold, italic;
1049 double s;
1050 int n, i;
1051
1052 // get the font name, remove any subset tag
1053 fontName = font->getFontName();
1054 if (fontName) {
1055 fontName2 = fontName->getCString();
1056 n = fontName->getLength();
1057 for (i = 0; i < n && i < 7; ++i) {
1058 if (fontName2[i] < 'A' || fontName2[i] > 'Z') {
1059 break;
1060 }
1061 }
1062 if (i == 6 && n > 7 && fontName2[6] == '+') {
1063 fontName2 += 7;
1064 n -= 7;
1065 }
1066 } else {
1067 fontName2 = NULL;
1068 n = 0;
1069 }
1070
1071 // get the style info from the font descriptor flags
1072 fixedWidth = font->isFixedWidth();
1073 serif = font->isSerif();
1074 bold = font->isBold();
1075 italic = font->isItalic();
1076
1077 if (fontName2) {
1078
1079 // look for a style tag at the end of the font name -- this
1080 // overrides the font descriptor bold/italic flags
1081 for (fst = fontStyleTags; fst->tag; ++fst) {
1082 if (n > fst->tagLen &&
1083 !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) {
1084 bold = fst->bold;
1085 italic = fst->italic;
1086 n -= fst->tagLen;
1087 if (n > 1 && (fontName2[n-1] == '-' ||
1088 fontName2[n-1] == ',' ||
1089 fontName2[n-1] == '.' ||
1090 fontName2[n-1] == '_')) {
1091 --n;
1092 }
1093 break;
1094 }
1095 }
1096
1097 // look for a known font name -- this overrides the font descriptor
1098 // fixedWidth/serif flags
1099 for (sf = standardFonts; sf->name; ++sf) {
1100 if (!strncasecmp(fontName2, sf->name, n)) {
1101 fixedWidth = sf->fixedWidth;
1102 serif = sf->serif;
1103 break;
1104 }
1105 }
1106 }
1107
1108 // compute the scaling factor
1109 *scale = 1;
1110 if ((s = font->getMWidth())) {
1111 i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0);
1112 if (s < substFonts[i].mWidth) {
1113 *scale = s / substFonts[i].mWidth;
1114 }
1115 }
1116
1117 *family = fixedWidth ? "monospace" : serif ? "serif" : "sans-serif";
1118 *weight = bold ? "bold" : "normal";
1119 *style = italic ? "italic" : "normal";
1120 }