"Fossies" - the Fresh Open Source Software Archive 
Member "xpdf-4.04/xpdf/HTMLGen.cc" (18 Apr 2022, 34442 Bytes) of package /linux/misc/xpdf-4.04.tar.gz:
As a special service "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style:
standard) with prefixed line numbers and
code folding option.
Alternatively, you can view or download the uninterpreted source code file here.
1 //========================================================================
2 //
3 // HTMLGen.cc
4 //
5 // Copyright 2010-2021 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 //~ to do:
10 //~ - fonts
11 //~ - underlined? (underlines are present in the background image)
12 //~ - include the original font name in the CSS entry (before the
13 //~ generic serif/sans-serif/monospace name)
14 //~ - check that htmlDir exists and is a directory
15 //~ - links:
16 //~ - internal links (to pages, to named destinations)
17 //~ - links from non-text content
18 //~ - rotated text should go in the background image
19 //~ - metadata
20 //~ - PDF outline
21
22 #include <aconf.h>
23
24 #ifdef USE_GCC_PRAGMAS
25 #pragma implementation
26 #endif
27
28 #include <stdlib.h>
29 #include <png.h>
30 #include "gmem.h"
31 #include "gmempp.h"
32 #include "GString.h"
33 #include "GList.h"
34 #include "SplashBitmap.h"
35 #include "PDFDoc.h"
36 #include "GfxFont.h"
37 #include "AcroForm.h"
38 #include "TextOutputDev.h"
39 #include "SplashOutputDev.h"
40 #include "ErrorCodes.h"
41 #include "WebFont.h"
42 #include "HTMLGen.h"
43
44 #ifdef _WIN32
45 # define strcasecmp stricmp
46 # define strncasecmp strnicmp
47 #endif
48
49 //------------------------------------------------------------------------
50
51 struct FontStyleTagInfo {
52 const char *tag;
53 int tagLen;
54 GBool bold;
55 GBool italic;
56 };
57
58 // NB: these are compared, in order, against the tail of the font
59 // name, so "BoldItalic" must come before "Italic", etc.
60 static FontStyleTagInfo fontStyleTags[] = {
61 {"Roman", 5, gFalse, gFalse},
62 {"Regular", 7, gFalse, gFalse},
63 {"Condensed", 9, gFalse, gFalse},
64 {"CondensedBold", 13, gTrue, gFalse},
65 {"CondensedLight", 14, gFalse, gFalse},
66 {"SemiBold", 8, gTrue, gFalse},
67 {"BoldItalicMT", 12, gTrue, gTrue},
68 {"BoldItalic", 10, gTrue, gTrue},
69 {"Bold_Italic", 11, gTrue, gTrue},
70 {"BoldOblique", 11, gTrue, gTrue},
71 {"Bold_Oblique", 12, gTrue, gTrue},
72 {"BoldMT", 6, gTrue, gFalse},
73 {"Bold", 4, gTrue, gFalse},
74 {"ItalicMT", 8, gFalse, gTrue},
75 {"Italic", 6, gFalse, gTrue},
76 {"Oblique", 7, gFalse, gTrue},
77 {"Light", 5, gFalse, gFalse},
78 {NULL, 0, gFalse, gFalse}
79 };
80
81 struct StandardFontInfo {
82 const char *name;
83 GBool fixedWidth;
84 GBool serif;
85 };
86
87 static StandardFontInfo standardFonts[] = {
88 {"Arial", gFalse, gFalse},
89 {"Courier", gTrue, gFalse},
90 {"Futura", gFalse, gFalse},
91 {"Helvetica", gFalse, gFalse},
92 {"Minion", gFalse, gTrue},
93 {"NewCenturySchlbk", gFalse, gTrue},
94 {"Times", gFalse, gTrue},
95 {"TimesNew", gFalse, gTrue},
96 {"Times_New", gFalse, gTrue},
97 {"Verdana", gFalse, gFalse},
98 {"LucidaSans", gFalse, gFalse},
99 {NULL, gFalse, gFalse}
100 };
101
// Per-substitute-font reference value that getFontDetails() compares
// against TextFontInfo::getMWidth() when computing the font scale.
struct SubstFontInfo {
  double mWidth;
};

// index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
// Only indexes 0..11 can be produced by that formula; the remaining
// four slots have no initializer and are therefore zero.
static SubstFontInfo substFonts[16] = {
  { 0.833 },    //  0: sans-serif
  { 0.833 },    //  1: sans-serif, italic
  { 0.889 },    //  2: sans-serif, bold
  { 0.889 },    //  3: sans-serif, bold + italic
  { 0.788 },    //  4: serif
  { 0.722 },    //  5: serif, italic
  { 0.833 },    //  6: serif, bold
  { 0.778 },    //  7: serif, bold + italic
  { 0.600 },    //  8: fixed width
  { 0.600 },    //  9: fixed width, italic
  { 0.600 },    // 10: fixed width, bold
  { 0.600 }     // 11: fixed width, bold + italic
};
121
// Map Unicode indexes from the private use area, following the Adobe
// Glyph list.
//
// The table is indexed by (u - privateUnicodeMapStart) for code points
// in [privateUnicodeMapStart, privateUnicodeMapEnd].  A zero entry means
// "no mapping"; callers leave such characters unchanged.  The comment at
// the end of each row is the private-use code point of its first entry.
#define privateUnicodeMapStart 0xf6f9
#define privateUnicodeMapEnd   0xf7ff
static int
privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = {
  0x0141, 0x0152, 0, 0, 0x0160, 0, 0x017d,                        // f6f9
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f700
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f708
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f710
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f718
  0, 0x0021, 0, 0, 0x0024, 0, 0x0026, 0,                          // f720
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f728
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // f730
  0x0038, 0x0039, 0, 0, 0, 0, 0, 0x003f,                          // f738
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f740
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f748
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f750
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f758
  0, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,      // f760
  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // f768
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // f770
  0x0058, 0x0059, 0x005a, 0, 0, 0, 0, 0,                          // f778
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f780
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f788
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f790
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f798
  0, 0x00a1, 0x00a2, 0, 0, 0, 0, 0,                               // f7a0
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7a8
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7b0
  0, 0, 0, 0, 0, 0, 0, 0x00bf,                                    // f7b8
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7c0
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7c8
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7d0
  0, 0, 0, 0, 0, 0, 0, 0,                                         // f7d8
  0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // f7e0
  0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // f7e8
  0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,      // f7f0
  0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178  // f7f8
};
162
// Vertical placement of a word relative to its line.  The numeric
// values double as indexes into vertAlignNames[], so the two lists must
// be kept in the same order.
enum VerticalAlignment {
  vertAlignBaseline = 0,
  vertAlignSub      = 1,
  vertAlignSuper    = 2,
  vertAlignTop      = 3
};

// Strings written as the CSS "vertical-align" value, indexed by
// VerticalAlignment.
static const char *vertAlignNames[] = {
  "baseline",   // vertAlignBaseline
  "sub",        // vertAlignSub
  "super",      // vertAlignSuper
  "top"         // vertAlignTop
};
176
177 //------------------------------------------------------------------------
178
179 class HTMLGenFontDefn {
180 public:
181
182 HTMLGenFontDefn(Ref fontIDA, GString *fontFaceA, GString *fontSpecA,
183 double scaleA)
184 : fontID(fontIDA), fontFace(fontFaceA), fontSpec(fontSpecA)
185 , scale(scaleA), used(gFalse) {}
186 ~HTMLGenFontDefn() { delete fontFace; delete fontSpec; }
187 GBool match(Ref fontIDA)
188 { return fontIDA.num == fontID.num && fontIDA.gen == fontID.gen; }
189
190 Ref fontID;
191 GString *fontFace; // NULL for substituted fonts
192 GString *fontSpec;
193 double scale;
194 GBool used; // set when used (per page)
195 };
196
197 //------------------------------------------------------------------------
198
199 class HTMLGenFormFieldInfo {
200 public:
201
202 HTMLGenFormFieldInfo(AcroFormField *acroFormFieldA)
203 : acroFormField(acroFormFieldA) {}
204
205 AcroFormField *acroFormField;
206 };
207
208 //------------------------------------------------------------------------
209
// Streaming base64 encoder.  Input may be supplied in arbitrary pieces
// via encode(); every complete group of three input bytes is written
// immediately as four output characters through the write callback.
// flush() writes the final, '='-padded group if one or two bytes are
// still pending.
class Base64Encoder {
public:

  Base64Encoder(int (*writeFuncA)(void *stream, const char *data, int size),
                void *streamA);
  void encode(const unsigned char *data, size_t size);
  void flush();

private:

  // Writes one four-character group built from the first <nBytes>
  // (1..3) bytes of buf, padding with '=' as required.
  void writeGroup(int nBytes);

  int (*writeFunc)(void *stream, const char *data, int size);
  void *stream;
  unsigned char buf[3];         // bytes waiting to complete a group
  int bufLen;                   // number of valid bytes in buf
};

static const char base64Chars[65] =
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

Base64Encoder::Base64Encoder(int (*writeFuncA)(void *stream, const char *data,
                                               int size),
                             void *streamA) {
  bufLen = 0;
  stream = streamA;
  writeFunc = writeFuncA;
}

void Base64Encoder::writeGroup(int nBytes) {
  // missing trailing bytes contribute zero bits
  unsigned int b0 = buf[0];
  unsigned int b1 = (nBytes > 1) ? buf[1] : 0;
  unsigned int b2 = (nBytes > 2) ? buf[2] : 0;

  char quad[4];
  quad[0] = base64Chars[(b0 >> 2) & 0x3f];
  quad[1] = base64Chars[((b0 << 4) | (b1 >> 4)) & 0x3f];
  quad[2] = (nBytes > 1) ? base64Chars[((b1 << 2) | (b2 >> 6)) & 0x3f] : '=';
  quad[3] = (nBytes > 2) ? base64Chars[b2 & 0x3f] : '=';
  writeFunc(stream, quad, 4);
}

// Consume <size> bytes from <data>.  Bytes that do not complete a group
// of three stay buffered until the next encode() or flush() call.
void Base64Encoder::encode(const unsigned char *data, size_t size) {
  size_t pos = 0;
  for (;;) {
    // top up the pending group from the input
    for (; bufLen < 3; ++bufLen) {
      if (pos >= size) {
        return;
      }
      buf[bufLen] = data[pos++];
    }
    writeGroup(3);
    bufLen = 0;
  }
}

// Write the final partial group, if any.  With no pending bytes this
// does nothing; a full group of three is never pending here because
// encode() emits it right away.  The pending count is left untouched.
void Base64Encoder::flush() {
  if (bufLen == 1 || bufLen == 2) {
    writeGroup(bufLen);
  }
}
274
275 static int writeToString(void *stream, const char *data, int size) {
276 ((GString *)stream)->append(data, size);
277 return size;
278 }
279
280 //------------------------------------------------------------------------
281
282
283 //------------------------------------------------------------------------
284
285 HTMLGen::HTMLGen(double backgroundResolutionA, GBool tableMode) {
286 TextOutputControl textOutControl;
287 SplashColor paperColor;
288
289 ok = gTrue;
290
291 backgroundResolution = backgroundResolutionA;
292 zoom = 1.0;
293 vStretch = 1.0;
294 drawInvisibleText = gTrue;
295 allTextInvisible = gFalse;
296 extractFontFiles = gFalse;
297 convertFormFields = gFalse;
298 embedBackgroundImage = gFalse;
299 embedFonts = gFalse;
300
301 // set up the TextOutputDev
302 textOutControl.mode = tableMode ? textOutTableLayout : textOutReadingOrder;
303 textOutControl.html = gTrue;
304 textOutControl.splitRotatedWords = gTrue;
305 textOut = new TextOutputDev(NULL, &textOutControl, gFalse);
306 if (!textOut->isOk()) {
307 ok = gFalse;
308 }
309
310 // set up the SplashOutputDev
311 paperColor[0] = paperColor[1] = paperColor[2] = 0xff;
312 splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor);
313
314 fontDefns = NULL;
315 }
316
317 HTMLGen::~HTMLGen() {
318 delete textOut;
319 delete splashOut;
320 if (fontDefns) {
321 deleteGList(fontDefns, HTMLGenFontDefn);
322 }
323 }
324
325 void HTMLGen::startDoc(PDFDoc *docA) {
326 doc = docA;
327 splashOut->startDoc(doc->getXRef());
328
329 if (fontDefns) {
330 deleteGList(fontDefns, HTMLGenFontDefn);
331 }
332 fontDefns = new GList();
333 nextFontFaceIdx = 0;
334 }
335
// Write the NUL-terminated string <data> through <writeFunc> and return
// whatever the callback returns.
static inline int pr(int (*writeFunc)(void *stream, const char *data, int size),
                     void *stream, const char *data) {
  int len = (int)strlen(data);
  return writeFunc(stream, data, len);
}
340
341 static int pf(int (*writeFunc)(void *stream, const char *data, int size),
342 void *stream, const char *fmt, ...) {
343 va_list args;
344 GString *s;
345 int ret;
346
347 va_start(args, fmt);
348 s = GString::formatv(fmt, args);
349 va_end(args);
350 ret = writeFunc(stream, s->getCString(), s->getLength());
351 delete s;
352 return ret;
353 }
354
355 struct PNGWriteInfo {
356 Base64Encoder *base64;
357 int (*writePNG)(void *stream, const char *data, int size);
358 void *pngStream;
359 };
360
361 static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) {
362 PNGWriteInfo *info = (PNGWriteInfo *)png_get_progressive_ptr(png);
363 if (info->base64) {
364 info->base64->encode(data, size);
365 } else {
366 info->writePNG(info->pngStream, (char *)data, (int)size);
367 }
368 }
369
370 int HTMLGen::convertPage(
371 int pg, const char *pngURL, const char *htmlDir,
372 int (*writeHTML)(void *stream, const char *data, int size),
373 void *htmlStream,
374 int (*writePNG)(void *stream, const char *data, int size),
375 void *pngStream) {
376 png_structp png;
377 png_infop pngInfo;
378 PNGWriteInfo writeInfo;
379 SplashBitmap *bitmap;
380 Guchar *p;
381 double pageW, pageH;
382 TextPage *text;
383 GList *cols, *pars, *lines, *words;
384 TextFontInfo *font;
385 TextColumn *col;
386 TextParagraph *par;
387 TextLine *line;
388 HTMLGenFontDefn *fontDefn;
389 GString *s;
390 double base;
391 int primaryDir, spanDir;
392 int colIdx, parIdx, lineIdx, firstWordIdx, lastWordIdx;
393 int y, i;
394
395 // generate the background bitmap
396 splashOut->setSkipText(!allTextInvisible, gFalse);
397 doc->displayPage(splashOut, pg,
398 backgroundResolution, backgroundResolution * vStretch,
399 0, gFalse, gTrue, gFalse);
400 bitmap = splashOut->getBitmap();
401
402 // page size
403 if (doc->getPageRotate(pg) == 90 || doc->getPageRotate(pg) == 270) {
404 pageW = doc->getPageCropHeight(pg);
405 pageH = doc->getPageCropWidth(pg);
406 } else {
407 pageW = doc->getPageCropWidth(pg);
408 pageH = doc->getPageCropHeight(pg);
409 }
410
411 // get the PDF text
412 doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse);
413 doc->processLinks(textOut, pg);
414 text = textOut->takeText();
415 primaryDir = text->primaryDirectionIsLR() ? 1 : -1;
416
417 // insert a special character for each form field;
418 // remove existing characters inside field bboxes;
419 // erase background content inside field bboxes
420 formFieldFont = NULL;
421 formFieldInfo = NULL;
422 if (convertFormFields) {
423 AcroForm *form = doc->getCatalog()->getForm();
424 if (form) {
425 formFieldInfo = new GList();
426 formFieldFont = new TextFontInfo();
427 double yTop = doc->getCatalog()->getPage(pg)->getMediaBox()->y2;
428 for (i = 0; i < form->getNumFields(); ++i) {
429 AcroFormField *field = form->getField(i);
430 AcroFormFieldType fieldType = field->getAcroFormFieldType();
431 if (field->getPageNum() == pg &&
432 (fieldType == acroFormFieldText ||
433 fieldType == acroFormFieldCheckbox)) {
434 double llx, lly, urx, ury;
435 field->getBBox(&llx, &lly, &urx, &ury);
436 lly = yTop - lly;
437 ury = yTop - ury;
438
439 // add the field info
440 int fieldIdx = formFieldInfo->getLength();
441 formFieldInfo->append(new HTMLGenFormFieldInfo(field));
442
443 // remove exsting chars
444 text->removeChars(llx, ury, urx, lly, 0.75, 0.5);
445
446 // erase background content
447 int llxI = (int)(llx * backgroundResolution / 72 + 0.5);
448 int llyI = (int)(lly * backgroundResolution * vStretch / 72 + 0.5);
449 int urxI = (int)(urx * backgroundResolution / 72 + 0.5);
450 int uryI = (int)(ury * backgroundResolution * vStretch / 72 + 0.5);
451 llyI += (int)(backgroundResolution * vStretch / 20);
452 if (llxI < 0) {
453 llxI = 0;
454 }
455 if (urxI >= bitmap->getWidth()) {
456 urxI = bitmap->getWidth() - 1;
457 }
458 if (uryI < 0) {
459 uryI = 0;
460 }
461 if (llyI > bitmap->getHeight()) {
462 llyI = bitmap->getHeight() - 1;
463 }
464 if (uryI <= llyI && llxI <= urxI) {
465 SplashColorPtr p = bitmap->getDataPtr()
466 + uryI * bitmap->getRowSize() + llxI * 3;
467 for (int y = uryI; y <= llyI; ++y) {
468 memset(p, 0xff, (urxI - llxI + 1) * 3);
469 p += bitmap->getRowSize();
470 }
471 }
472
473 // add a special char
474 // (the font size is unused -- 10 is an arbitrary value)
475 text->addSpecialChar(llx, ury, urx, lly,
476 0, formFieldFont, 10, 0x80000000 + fieldIdx);
477 }
478 }
479 }
480 }
481
482 // HTML header
483 pr(writeHTML, htmlStream, "<html>\n");
484 pr(writeHTML, htmlStream, "<head>\n");
485 pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
486 pr(writeHTML, htmlStream, "<style type=\"text/css\">\n");
487 pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n");
488 if (convertFormFields) {
489 pr(writeHTML, htmlStream, ".textfield {\n");
490 pr(writeHTML, htmlStream, " border: 0;\n");
491 pr(writeHTML, htmlStream, " padding: 0;\n");
492 pr(writeHTML, htmlStream, " background: #ccccff;\n");
493 pr(writeHTML, htmlStream, "}\n");
494 pr(writeHTML, htmlStream, ".checkbox {\n");
495 pr(writeHTML, htmlStream, "}\n");
496 }
497 fonts = text->getFonts();
498 fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double));
499 for (i = 0; i < fontDefns->getLength(); ++i) {
500 fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
501 fontDefn->used = gFalse;
502 }
503 for (i = 0; i < fonts->getLength(); ++i) {
504 font = (TextFontInfo *)fonts->get(i);
505 fontDefn = getFontDefn(font, htmlDir);
506 if (!fontDefn->used && fontDefn->fontFace) {
507 pr(writeHTML, htmlStream, fontDefn->fontFace->getCString());
508 }
509 pf(writeHTML, htmlStream, ".f{0:d} {{ {1:t} }}\n", i, fontDefn->fontSpec);
510 fontScales[i] = fontDefn->scale;
511 fontDefn->used = gTrue;
512 }
513 pr(writeHTML, htmlStream, "</style>\n");
514 pr(writeHTML, htmlStream, "</head>\n");
515 if (primaryDir >= 0) {
516 pr(writeHTML, htmlStream, "<body>\n");
517 } else {
518 pr(writeHTML, htmlStream, "<body dir=\"rtl\">\n");
519 }
520
521 // background image element (part 1)
522 if (primaryDir >= 0) {
523 pf(writeHTML, htmlStream, "<img style=\"position:absolute; left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
524 (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
525 } else {
526 pf(writeHTML, htmlStream, "<img style=\"position:absolute; right:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
527 (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
528 }
529 if (embedBackgroundImage) {
530 pr(writeHTML, htmlStream, "src=\"data:image/png;base64,\n");
531 writeInfo.base64 = new Base64Encoder(writeHTML, htmlStream);
532 writeInfo.writePNG = NULL;
533 writeInfo.pngStream = NULL;
534 } else {
535 pf(writeHTML, htmlStream, "src=\"{0:s}\"", pngURL);
536 writeInfo.base64 = NULL;
537 writeInfo.writePNG = writePNG;
538 writeInfo.pngStream = pngStream;
539 }
540
541 // background image data - writing to a separate file, or embedding
542 // with base64 encoding
543 if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
544 NULL, NULL, NULL)) ||
545 !(pngInfo = png_create_info_struct(png))) {
546 return errFileIO;
547 }
548 if (setjmp(png_jmpbuf(png))) {
549 return errFileIO;
550 }
551 png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL);
552 png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(),
553 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
554 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
555 png_write_info(png, pngInfo);
556 p = bitmap->getDataPtr();
557 for (y = 0; y < bitmap->getHeight(); ++y) {
558 png_write_row(png, (png_bytep)p);
559 p += bitmap->getRowSize();
560 }
561 png_write_end(png, pngInfo);
562 png_destroy_write_struct(&png, &pngInfo);
563 if (embedBackgroundImage) {
564 writeInfo.base64->flush();
565 delete writeInfo.base64;
566 }
567
568 // background image element (part 2)
569 pr(writeHTML, htmlStream, "\">\n");
570
571 // generate the HTML text
572 nextFieldID = 0;
573 cols = text->makeColumns();
574 for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) {
575 col = (TextColumn *)cols->get(colIdx);
576 pars = col->getParagraphs();
577 for (parIdx = 0; parIdx < pars->getLength(); ++parIdx) {
578 par = (TextParagraph *)pars->get(parIdx);
579 lines = par->getLines();
580 for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) {
581 line = (TextLine *)lines->get(lineIdx);
582 if (line->getRotation() != 0) {
583 continue;
584 }
585 words = line->getWords();
586 if (lineIdx == 0 && par->hasDropCap() && words->getLength() >= 2) {
587 base = ((TextWord *)words->get(1))->getBaseline();
588 } else {
589 base = line->getBaseline();
590 }
591 s = new GString();
592 for (firstWordIdx = (primaryDir >= 0) ? 0 : words->getLength() - 1;
593 (primaryDir >= 0) ? firstWordIdx < words->getLength()
594 : firstWordIdx >= 0;
595 firstWordIdx = lastWordIdx + primaryDir) {
596 lastWordIdx = findDirSpan(words, firstWordIdx,
597 primaryDir, &spanDir);
598 appendSpans(words, firstWordIdx, lastWordIdx,
599 primaryDir, spanDir,
600 base, lineIdx == 0 && par->hasDropCap(),
601 s);
602 }
603 if (primaryDir >= 0) {
604 pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
605 (int)(line->getXMin() * zoom),
606 (int)(line->getYMin() * zoom * vStretch), s);
607 } else {
608 pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; right:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
609 (int)((pageW - line->getXMax()) * zoom),
610 (int)(line->getYMin() * zoom * vStretch), s);
611 }
612 delete s;
613 }
614 }
615 }
616 gfree(fontScales);
617 delete text;
618 deleteGList(cols, TextColumn);
619 if (formFieldFont) {
620 delete formFieldFont;
621 formFieldFont = NULL;
622 }
623 if (formFieldInfo) {
624 deleteGList(formFieldInfo, HTMLGenFormFieldInfo);
625 formFieldInfo = NULL;
626 }
627
628 // HTML trailer
629 pr(writeHTML, htmlStream, "</body>\n");
630 pr(writeHTML, htmlStream, "</html>\n");
631
632 return errNone;
633 }
634
635 // Find a sequence of words, starting at <firstWordIdx>, that have the
636 // same writing direction. Returns the index of the last word, and
637 // sets *<spanDir> to the span direction.
638 int HTMLGen::findDirSpan(GList *words, int firstWordIdx, int primaryDir,
639 int *spanDir) {
640 int dir0, dir1, nextWordIdx;
641
642 dir0 = ((TextWord *)words->get(firstWordIdx))->getDirection();
643 for (nextWordIdx = firstWordIdx + primaryDir;
644 (primaryDir >= 0) ? nextWordIdx < words->getLength()
645 : nextWordIdx >= 0;
646 nextWordIdx += primaryDir) {
647 dir1 = ((TextWord *)words->get(nextWordIdx))->getDirection();
648 if (dir0 == 0) {
649 dir0 = dir1;
650 } else if (dir1 != 0 && dir1 != dir0) {
651 break;
652 }
653 }
654
655 if (dir0 == 0) {
656 *spanDir = primaryDir;
657 } else {
658 *spanDir = dir0;
659 }
660
661 return nextWordIdx - primaryDir;
662 }
663
664 // Create HTML spans for words <firstWordIdx> .. <lastWordIdx>, and
665 // append them to <s>.
666 void HTMLGen::appendSpans(GList *words, int firstWordIdx, int lastWordIdx,
667 int primaryDir, int spanDir,
668 double base, GBool dropCapLine, GString *s) {
669 if (allTextInvisible && !drawInvisibleText) {
670 return;
671 }
672
673 if (spanDir != primaryDir) {
674 int t = firstWordIdx;
675 firstWordIdx = lastWordIdx;
676 lastWordIdx = t;
677 }
678
679 int wordIdx = firstWordIdx;
680 while ((spanDir >= 0) ? wordIdx <= lastWordIdx
681 : wordIdx >= lastWordIdx) {
682 TextWord *word0 = (TextWord *)words->get(wordIdx);
683
684 // form field(s): generate <input> element(s)
685 if (convertFormFields && word0->getFontInfo() == formFieldFont) {
686 for (int i = (spanDir >= 0) ? 0 : word0->getLength() - 1;
687 (spanDir >= 0) ? i < word0->getLength() : i >= 0;
688 i += spanDir) {
689 int fieldIdx = word0->getChar(0) - 0x80000000;
690 if (fieldIdx >= 0 && fieldIdx < formFieldInfo->getLength()) {
691 HTMLGenFormFieldInfo *ffi =
692 (HTMLGenFormFieldInfo *)formFieldInfo->get(fieldIdx);
693 AcroFormField *field = ffi->acroFormField;
694 AcroFormFieldType fieldType = field->getAcroFormFieldType();
695 double llx, lly, urx, ury;
696 field->getBBox(&llx, &lly, &urx, &ury);
697 int width = (int)(urx - llx);
698 Ref fontID;
699 double fontSize;
700 field->getFont(&fontID, &fontSize);
701 if (fontSize == 0) {
702 fontSize = 12;
703 }
704 if (fieldType == acroFormFieldText) {
705 s->appendf("<input type=\"text\" class=\"textfield\" id=\"textfield{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
706 ++nextFieldID;
707 } else if (fieldType == acroFormFieldCheckbox) {
708 s->appendf("<input type=\"checkbox\" class=\"checkbox\" id=\"checkbox{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
709 ++nextFieldID;
710 }
711 }
712 }
713
714 if (word0->getSpaceAfter()) {
715 s->append(' ');
716 }
717
718 wordIdx += spanDir;
719
720 // skip invisible words
721 } else if (!drawInvisibleText &&
722 (word0->isInvisible() || word0->isRotated())) {
723 wordIdx += spanDir;
724
725 // generate a <span> containing one or more words
726 } else {
727
728 double r0 = 0, g0 = 0, b0 = 0; // make gcc happy
729 VerticalAlignment vertAlign0 = vertAlignBaseline; // make gcc happy
730 GString *linkURI0 = NULL;
731
732 GBool invisible = word0->isInvisible() || word0->isRotated();
733
734 do {
735 TextWord *word1 = (TextWord *)words->get(wordIdx);
736
737 // get word parameters
738 double r1, g1, b1;
739 word0->getColor(&r1, &g1, &b1);
740 double base1 = word1->getBaseline();
741 VerticalAlignment vertAlign1;
742 if (dropCapLine) {
743 //~ this will fail if there are subscripts or superscripts in
744 //~ the first line of a paragraph with a drop cap
745 vertAlign1 = vertAlignTop;
746 } else if (base1 - base < -1) {
747 vertAlign1 = vertAlignSuper;
748 } else if (base1 - base > 1) {
749 vertAlign1 = vertAlignSub;
750 } else {
751 vertAlign1 = vertAlignBaseline;
752 }
753 GString *linkURI1 = word1->getLinkURI();
754
755 // start of span
756 if (word1 == word0) {
757 r0 = r1;
758 g0 = g1;
759 b0 = b1;
760 vertAlign0 = vertAlign1;
761 linkURI0 = linkURI1;
762
763 int i;
764 for (i = 0; i < fonts->getLength(); ++i) {
765 if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) {
766 break;
767 }
768 }
769 if (linkURI1) {
770 s->appendf("<a href=\"{0:t}\">", linkURI0);
771 }
772 // we force spans to be LTR or RTL; this is a kludge, but it's
773 // far easier than implementing the full Unicode bidi algorithm
774 const char *dirTag;
775 if (spanDir == primaryDir) {
776 dirTag = "";
777 } else if (spanDir < 0) {
778 dirTag = " dir=\"rtl\"";
779 } else {
780 dirTag = " dir=\"ltr\"";
781 }
782 s->appendf("<span class=\"f{0:d}\"{1:s} style=\"font-size:{2:d}px;vertical-align:{3:s};{4:s}color:rgba({5:d},{6:d},{7:d},{8:d});\">",
783 i,
784 dirTag,
785 (int)(fontScales[i] * word1->getFontSize() * zoom),
786 vertAlignNames[vertAlign1],
787 (dropCapLine && wordIdx == 0) ? "line-height:75%;" : "",
788 (int)(r0 * 255), (int)(g0 * 255), (int)(b0 * 255),
789 invisible ? 0 : 1);
790
791 // end of span
792 } else if (word1->getFontInfo() != word0->getFontInfo() ||
793 word1->getFontSize() != word0->getFontSize() ||
794 word1->isInvisible() != word0->isInvisible() ||
795 word1->isRotated() != word0->isRotated() ||
796 vertAlign1 != vertAlign0 ||
797 r1 != r0 || g1 != g0 || b1 != b0 ||
798 linkURI1 != linkURI0) {
799 break;
800 }
801
802 // add a space before the word, if needed
803 // -- this only happens with the first word in a reverse section
804 if (spanDir != primaryDir && wordIdx == firstWordIdx) {
805 GBool sp;
806 if (spanDir >= 0) {
807 if (wordIdx > 0) {
808 sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
809 } else {
810 sp = gFalse;
811 }
812 } else {
813 sp = word1->getSpaceAfter();
814 }
815 if (sp) {
816 s->append(' ');
817 }
818 }
819
820 // generate the word text
821 for (int i = (spanDir >= 0) ? 0 : word1->getLength() - 1;
822 (spanDir >= 0) ? i < word1->getLength() : i >= 0;
823 i += spanDir) {
824 Unicode u = word1->getChar(i);
825 if (u >= privateUnicodeMapStart &&
826 u <= privateUnicodeMapEnd &&
827 privateUnicodeMap[u - privateUnicodeMapStart]) {
828 u = privateUnicodeMap[u - privateUnicodeMapStart];
829 }
830 appendUTF8(u, s);
831 }
832
833 // add a space after the word, if needed
834 // -- there is never a space after the last word in a reverse
835 // section (this will be handled as a space after the last
836 // word in the previous primary-direction section)
837 GBool sp;
838 if (spanDir != primaryDir && wordIdx == lastWordIdx) {
839 sp = gFalse;
840 } else if (spanDir >= 0) {
841 sp = word1->getSpaceAfter();
842 } else {
843 if (wordIdx > 0) {
844 sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
845 } else {
846 sp = gFalse;
847 }
848 }
849 if (sp) {
850 s->append(' ');
851 }
852
853 wordIdx += spanDir;
854 } while ((spanDir >= 0) ? wordIdx <= lastWordIdx
855 : wordIdx >= lastWordIdx);
856
857 s->append("</span>");
858 if (linkURI0) {
859 s->append("</a>");
860 }
861 }
862 }
863 }
864
865 void HTMLGen::appendUTF8(Unicode u, GString *s) {
866 if (u <= 0x7f) {
867 if (u == '&') {
868 s->append("&");
869 } else if (u == '<') {
870 s->append("<");
871 } else if (u == '>') {
872 s->append(">");
873 } else {
874 s->append((char)u);
875 }
876 } else if (u <= 0x7ff) {
877 s->append((char)(0xc0 + (u >> 6)));
878 s->append((char)(0x80 + (u & 0x3f)));
879 } else if (u <= 0xffff) {
880 s->append((char)(0xe0 + (u >> 12)));
881 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
882 s->append((char)(0x80 + (u & 0x3f)));
883 } else if (u <= 0x1fffff) {
884 s->append((char)(0xf0 + (u >> 18)));
885 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
886 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
887 s->append((char)(0x80 + (u & 0x3f)));
888 } else if (u <= 0x3ffffff) {
889 s->append((char)(0xf8 + (u >> 24)));
890 s->append((char)(0x80 + ((u >> 18) & 0x3f)));
891 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
892 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
893 s->append((char)(0x80 + (u & 0x3f)));
894 } else if (u <= 0x7fffffff) {
895 s->append((char)(0xfc + (u >> 30)));
896 s->append((char)(0x80 + ((u >> 24) & 0x3f)));
897 s->append((char)(0x80 + ((u >> 18) & 0x3f)));
898 s->append((char)(0x80 + ((u >> 12) & 0x3f)));
899 s->append((char)(0x80 + ((u >> 6) & 0x3f)));
900 s->append((char)(0x80 + (u & 0x3f)));
901 }
902 }
903
904 HTMLGenFontDefn *HTMLGen::getFontDefn(TextFontInfo *font,
905 const char *htmlDir) {
906 Ref id;
907 HTMLGenFontDefn *fontDefn;
908 int i;
909
910 // check the existing font defns
911 id = font->getFontID();
912 if (id.num >= 0) {
913 for (i = 0; i < fontDefns->getLength(); ++i) {
914 fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
915 if (fontDefn->match(id)) {
916 return fontDefn;
917 }
918 }
919 }
920
921 // try to extract a font file
922 if (!extractFontFiles ||
923 !(fontDefn = getFontFile(font, htmlDir))) {
924
925 // get a substitute font
926 fontDefn = getSubstituteFont(font);
927 }
928
929 fontDefns->append(fontDefn);
930 return fontDefn;
931 }
932
933 HTMLGenFontDefn *HTMLGen::getFontFile(TextFontInfo *font,
934 const char *htmlDir) {
935 Ref id;
936 HTMLGenFontDefn *fontDefn;
937 Object fontObj;
938 GfxFont *gfxFont;
939 WebFont *webFont;
940 GString *fontFile, *fontPath, *fontFace, *fontSpec;
941 const char *family, *weight, *style;
942 double scale;
943
944 id = font->getFontID();
945 if (id.num < 0) {
946 return NULL;
947 }
948
949 doc->getXRef()->fetch(id.num, id.gen, &fontObj);
950 if (!fontObj.isDict()) {
951 fontObj.free();
952 return NULL;
953 }
954
955 gfxFont = GfxFont::makeFont(doc->getXRef(), "F", id, fontObj.getDict());
956 webFont = new WebFont(gfxFont, doc->getXRef());
957 fontDefn = NULL;
958 fontFace = NULL;
959
960 if (webFont->canWriteTTF()) {
961 if (embedFonts) {
962 GString *ttfData = webFont->getTTFData();
963 if (ttfData) {
964 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/ttf;base64,",
965 nextFontFaceIdx);
966 Base64Encoder enc(writeToString, fontFace);
967 enc.encode((unsigned char *)ttfData->getCString(),
968 (size_t)ttfData->getLength());
969 enc.flush();
970 fontFace->append("\"); }\n");
971 delete ttfData;
972 }
973 } else {
974 fontFile = GString::format("{0:d}.ttf", nextFontFaceIdx);
975 fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
976 if (webFont->writeTTF(fontPath->getCString())) {
977 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
978 nextFontFaceIdx, fontFile);
979 }
980 delete fontPath;
981 delete fontFile;
982 }
983 if (fontFace) {
984 getFontDetails(font, &family, &weight, &style, &scale);
985 fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
986 nextFontFaceIdx, family, weight, style);
987 ++nextFontFaceIdx;
988 fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
989 }
990
991 } else if (webFont->canWriteOTF()) {
992 if (embedFonts) {
993 GString *otfData = webFont->getOTFData();
994 if (otfData) {
995 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/otf;base64,",
996 nextFontFaceIdx);
997 Base64Encoder enc(writeToString, fontFace);
998 enc.encode((unsigned char *)otfData->getCString(),
999 (size_t)otfData->getLength());
1000 enc.flush();
1001 fontFace->append("\"); }\n");
1002 delete otfData;
1003 }
1004 } else {
1005 fontFile = GString::format("{0:d}.otf", nextFontFaceIdx);
1006 fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
1007 if (webFont->writeOTF(fontPath->getCString())) {
1008 fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
1009 nextFontFaceIdx, fontFile);
1010 }
1011 delete fontPath;
1012 delete fontFile;
1013 }
1014 if (fontFace) {
1015 getFontDetails(font, &family, &weight, &style, &scale);
1016 fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
1017 nextFontFaceIdx, family, weight, style);
1018 ++nextFontFaceIdx;
1019 fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
1020 }
1021 }
1022
1023 delete webFont;
1024 delete gfxFont;
1025 fontObj.free();
1026
1027 return fontDefn;
1028 }
1029
1030 HTMLGenFontDefn *HTMLGen::getSubstituteFont(TextFontInfo *font) {
1031 const char *family, *weight, *style;
1032 double scale;
1033 GString *fontSpec;
1034
1035 getFontDetails(font, &family, &weight, &style, &scale);
1036 fontSpec = GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};",
1037 family, weight, style);
1038 return new HTMLGenFontDefn(font->getFontID(), NULL, fontSpec, scale);
1039 }
1040
1041 void HTMLGen::getFontDetails(TextFontInfo *font, const char **family,
1042 const char **weight, const char **style,
1043 double *scale) {
1044 GString *fontName;
1045 char *fontName2;
1046 FontStyleTagInfo *fst;
1047 StandardFontInfo *sf;
1048 GBool fixedWidth, serif, bold, italic;
1049 double s;
1050 int n, i;
1051
1052 // get the font name, remove any subset tag
1053 fontName = font->getFontName();
1054 if (fontName) {
1055 fontName2 = fontName->getCString();
1056 n = fontName->getLength();
1057 for (i = 0; i < n && i < 7; ++i) {
1058 if (fontName2[i] < 'A' || fontName2[i] > 'Z') {
1059 break;
1060 }
1061 }
1062 if (i == 6 && n > 7 && fontName2[6] == '+') {
1063 fontName2 += 7;
1064 n -= 7;
1065 }
1066 } else {
1067 fontName2 = NULL;
1068 n = 0;
1069 }
1070
1071 // get the style info from the font descriptor flags
1072 fixedWidth = font->isFixedWidth();
1073 serif = font->isSerif();
1074 bold = font->isBold();
1075 italic = font->isItalic();
1076
1077 if (fontName2) {
1078
1079 // look for a style tag at the end of the font name -- this
1080 // overrides the font descriptor bold/italic flags
1081 for (fst = fontStyleTags; fst->tag; ++fst) {
1082 if (n > fst->tagLen &&
1083 !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) {
1084 bold = fst->bold;
1085 italic = fst->italic;
1086 n -= fst->tagLen;
1087 if (n > 1 && (fontName2[n-1] == '-' ||
1088 fontName2[n-1] == ',' ||
1089 fontName2[n-1] == '.' ||
1090 fontName2[n-1] == '_')) {
1091 --n;
1092 }
1093 break;
1094 }
1095 }
1096
1097 // look for a known font name -- this overrides the font descriptor
1098 // fixedWidth/serif flags
1099 for (sf = standardFonts; sf->name; ++sf) {
1100 if (!strncasecmp(fontName2, sf->name, n)) {
1101 fixedWidth = sf->fixedWidth;
1102 serif = sf->serif;
1103 break;
1104 }
1105 }
1106 }
1107
1108 // compute the scaling factor
1109 *scale = 1;
1110 if ((s = font->getMWidth())) {
1111 i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0);
1112 if (s < substFonts[i].mWidth) {
1113 *scale = s / substFonts[i].mWidth;
1114 }
1115 }
1116
1117 *family = fixedWidth ? "monospace" : serif ? "serif" : "sans-serif";
1118 *weight = bold ? "bold" : "normal";
1119 *style = italic ? "italic" : "normal";
1120 }