"Fossies" - the Fresh Open Source Software Archive

Member "xpdf-4.04/xpdf/HTMLGen.cc" (18 Apr 2022, 34442 Bytes) of package /linux/misc/xpdf-4.04.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML format using (guessed) C and C++ source code syntax highlighting (style: standard) with prefixed line numbers and a code-folding option. Alternatively, you can view or download the uninterpreted source code file here.

    1 //========================================================================
    2 //
    3 // HTMLGen.cc
    4 //
    5 // Copyright 2010-2021 Glyph & Cog, LLC
    6 //
    7 //========================================================================
    8 
    9 //~ to do:
   10 //~ - fonts
   11 //~   - underlined? (underlines are present in the background image)
   12 //~   - include the original font name in the CSS entry (before the
   13 //~     generic serif/sans-serif/monospace name)
   14 //~ - check that htmlDir exists and is a directory
   15 //~ - links:
   16 //~   - internal links (to pages, to named destinations)
   17 //~   - links from non-text content
   18 //~ - rotated text should go in the background image
   19 //~ - metadata
   20 //~ - PDF outline
   21 
   22 #include <aconf.h>
   23 
   24 #ifdef USE_GCC_PRAGMAS
   25 #pragma implementation
   26 #endif
   27 
   28 #include <stdlib.h>
   29 #include <png.h>
   30 #include "gmem.h"
   31 #include "gmempp.h"
   32 #include "GString.h"
   33 #include "GList.h"
   34 #include "SplashBitmap.h"
   35 #include "PDFDoc.h"
   36 #include "GfxFont.h"
   37 #include "AcroForm.h"
   38 #include "TextOutputDev.h"
   39 #include "SplashOutputDev.h"
   40 #include "ErrorCodes.h"
   41 #include "WebFont.h"
   42 #include "HTMLGen.h"
   43 
   44 #ifdef _WIN32
   45 #  define strcasecmp stricmp
   46 #  define strncasecmp strnicmp
   47 #endif
   48 
   49 //------------------------------------------------------------------------
   50 
   51 struct FontStyleTagInfo {
   52   const char *tag;
   53   int tagLen;
   54   GBool bold;
   55   GBool italic;
   56 };
   57 
   58 // NB: these are compared, in order, against the tail of the font
   59 // name, so "BoldItalic" must come before "Italic", etc.
   60 static FontStyleTagInfo fontStyleTags[] = {
   61   {"Roman",                    5, gFalse, gFalse},
   62   {"Regular",                  7, gFalse, gFalse},
   63   {"Condensed",                9, gFalse, gFalse},
   64   {"CondensedBold",           13, gTrue,  gFalse},
   65   {"CondensedLight",          14, gFalse, gFalse},
   66   {"SemiBold",                 8, gTrue,  gFalse},
   67   {"BoldItalicMT",            12, gTrue,  gTrue},
   68   {"BoldItalic",              10, gTrue,  gTrue},
   69   {"Bold_Italic",             11, gTrue,  gTrue},
   70   {"BoldOblique",             11, gTrue,  gTrue},
   71   {"Bold_Oblique",            12, gTrue,  gTrue},
   72   {"BoldMT",                   6, gTrue,  gFalse},
   73   {"Bold",                     4, gTrue,  gFalse},
   74   {"ItalicMT",                 8, gFalse, gTrue},
   75   {"Italic",                   6, gFalse, gTrue},
   76   {"Oblique",                  7, gFalse, gTrue},
   77   {"Light",                    5, gFalse, gFalse},
   78   {NULL,                       0, gFalse, gFalse}
   79 };
   80 
   81 struct StandardFontInfo {
   82   const char *name;
   83   GBool fixedWidth;
   84   GBool serif;
   85 };
   86 
   87 static StandardFontInfo standardFonts[] = {
   88   {"Arial",                    gFalse, gFalse},
   89   {"Courier",                  gTrue,  gFalse},
   90   {"Futura",                   gFalse, gFalse},
   91   {"Helvetica",                gFalse, gFalse},
   92   {"Minion",                   gFalse, gTrue},
   93   {"NewCenturySchlbk",         gFalse, gTrue},
   94   {"Times",                    gFalse, gTrue},
   95   {"TimesNew",                 gFalse, gTrue},
   96   {"Times_New",                gFalse, gTrue},
   97   {"Verdana",                  gFalse, gFalse},
   98   {"LucidaSans",               gFalse, gFalse},
   99   {NULL,                       gFalse, gFalse}
  100 };
  101 
  // Metrics recorded for one substitute font.
  struct SubstFontInfo {
    double mWidth;        // presumably the width of 'm' relative to the
                          //   font size -- TODO confirm against the user
  };

  // index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
  //
  // Only the first 12 of the 16 slots are listed; slots 12..15 are
  // zero-initialized.
  static SubstFontInfo substFonts[16] = {
    // sans-serif
    { 0.833 },            //  0: regular
    { 0.833 },            //  1: italic
    { 0.889 },            //  2: bold
    { 0.889 },            //  3: bold + italic
    // serif
    { 0.788 },            //  4: regular
    { 0.722 },            //  5: italic
    { 0.833 },            //  6: bold
    { 0.778 },            //  7: bold + italic
    // fixed-width
    { 0.600 },            //  8: regular
    { 0.600 },            //  9: italic
    { 0.600 },            // 10: bold
    { 0.600 }             // 11: bold + italic
  };
  121 
  // Map Unicode indexes from the private use area, following the Adobe
  // Glyph list.
  //
  // The table is indexed by (code point - privateUnicodeMapStart).  A zero
  // entry means that no replacement is defined for that code point.  Each
  // row below is labelled with the code point of its first entry; the
  // first row is short (f6f9..f6ff) so that the remaining rows start on
  // multiples of 8.
  #define privateUnicodeMapStart 0xf6f9
  #define privateUnicodeMapEnd   0xf7ff
  static int
  privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = {
    /* f6f9 */ 0x0141, 0x0152, 0,      0,      0x0160, 0,      0x017d,
    /* f700 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f708 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f710 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f718 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f720 */ 0,      0x0021, 0,      0,      0x0024, 0,      0x0026, 0,
    /* f728 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f730 */ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
    /* f738 */ 0x0038, 0x0039, 0,      0,      0,      0,      0,      0x003f,
    /* f740 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f748 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f750 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f758 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f760 */ 0,      0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
    /* f768 */ 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
    /* f770 */ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
    /* f778 */ 0x0058, 0x0059, 0x005a, 0,      0,      0,      0,      0,
    /* f780 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f788 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f790 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f798 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7a0 */ 0,      0x00a1, 0x00a2, 0,      0,      0,      0,      0,
    /* f7a8 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7b0 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7b8 */ 0,      0,      0,      0,      0,      0,      0,      0x00bf,
    /* f7c0 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7c8 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7d0 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7d8 */ 0,      0,      0,      0,      0,      0,      0,      0,
    /* f7e0 */ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
    /* f7e8 */ 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
    /* f7f0 */ 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,
    /* f7f8 */ 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178
  };
  162 
  // Vertical placement of a span relative to its line.  The numeric
  // values double as indexes into vertAlignNames[], so the two lists
  // must stay in the same order.
  enum VerticalAlignment {
    vertAlignBaseline = 0,
    vertAlignSub      = 1,
    vertAlignSuper    = 2,
    vertAlignTop      = 3
  };

  // CSS "vertical-align" keyword for each VerticalAlignment value.
  static const char *vertAlignNames[] = {
    "baseline",           // vertAlignBaseline
    "sub",                // vertAlignSub
    "super",              // vertAlignSuper
    "top"                 // vertAlignTop
  };
  176 
  177 //------------------------------------------------------------------------
  178 
  179 class HTMLGenFontDefn {
  180 public:
  181 
  182   HTMLGenFontDefn(Ref fontIDA, GString *fontFaceA, GString *fontSpecA,
  183           double scaleA)
  184     : fontID(fontIDA), fontFace(fontFaceA), fontSpec(fontSpecA)
  185     , scale(scaleA), used(gFalse) {}
  186   ~HTMLGenFontDefn() { delete fontFace; delete fontSpec; }
  187   GBool match(Ref fontIDA)
  188     { return fontIDA.num == fontID.num && fontIDA.gen == fontID.gen; }
  189 
  190   Ref fontID;
  191   GString *fontFace;        // NULL for substituted fonts
  192   GString *fontSpec;
  193   double scale;
  194   GBool used;           // set when used (per page)
  195 };
  196 
  197 //------------------------------------------------------------------------
  198 
  199 class HTMLGenFormFieldInfo {
  200 public:
  201 
  202   HTMLGenFormFieldInfo(AcroFormField *acroFormFieldA)
  203     : acroFormField(acroFormFieldA) {}
  204 
  205   AcroFormField *acroFormField;
  206 };
  207 
  208 //------------------------------------------------------------------------
  209 
  // Streaming Base64 encoder.  Input may be supplied in arbitrary-sized
  // pieces via encode(); every complete 3-byte group is immediately
  // written, as 4 characters, through the output callback.  flush() emits
  // the final partial group (with '=' padding) and returns the encoder to
  // its initial state.
  class Base64Encoder {
  public:

    // <writeFuncA> is called as writeFuncA(streamA, data, size) for each
    // piece of encoded output.  Its return value is ignored.
    Base64Encoder(int (*writeFuncA)(void *stream, const char *data, int size),
                  void *streamA);

    // Encode <size> bytes starting at <data>.  Up to two trailing bytes
    // are held back until more input arrives or flush() is called.
    void encode(const unsigned char *data, size_t size);

    // Write out any held-back bytes as a padded 4-character group.  Does
    // nothing if no bytes are pending.  Safe to call more than once.
    void flush();

  private:

    int (*writeFunc)(void *stream, const char *data, int size);
    void *stream;
    unsigned char buf[3];       // pending input bytes
    int bufLen;                 // number of valid bytes in buf (0..2
                                //   between calls)
  };

  // The standard Base64 alphabet (64 characters plus the terminating NUL).
  static const char base64Chars[65] =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  Base64Encoder::Base64Encoder(int (*writeFuncA)(void *stream, const char *data,
                                                 int size),
                               void *streamA) {
    writeFunc = writeFuncA;
    stream = streamA;
    buf[0] = buf[1] = buf[2] = 0;
    bufLen = 0;
  }

  void Base64Encoder::encode(const unsigned char *data, size_t size) {
    size_t i = 0;
    while (1) {
      // top up the 3-byte group; stop (keeping the partial group for the
      // next call) when the input runs out
      while (bufLen < 3) {
        if (i >= size) {
          return;
        }
        buf[bufLen++] = data[i++];
      }
      char out[4];
      out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
      out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
      out[2] = base64Chars[((buf[1] << 2) | (buf[2] >> 6)) & 0x3f];
      out[3] = base64Chars[buf[2] & 0x3f];
      writeFunc(stream, out, 4);
      bufLen = 0;
    }
  }

  void Base64Encoder::flush() {
    // if bufLen == 0, this does nothing
    // bufLen should never be 3 here
    char out[4];
    if (bufLen == 1) {
      out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
      out[1] = base64Chars[(buf[0] << 4) & 0x3f];
      out[2] = '=';
      out[3] = '=';
      writeFunc(stream, out, 4);
    } else if (bufLen == 2) {
      out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
      out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
      out[2] = base64Chars[(buf[1] << 2) & 0x3f];
      out[3] = '=';
      writeFunc(stream, out, 4);
    }
    // the pending bytes have been consumed: without this reset a second
    // flush() would emit the tail again, and a later encode() would mix
    // the stale bytes into new output
    bufLen = 0;
  }
  274 
  275 static int writeToString(void *stream, const char *data, int size) {
  276   ((GString *)stream)->append(data, size);
  277   return size;
  278 }
  279 
  280 //------------------------------------------------------------------------
  281 
  282 
  283 //------------------------------------------------------------------------
  284 
  285 HTMLGen::HTMLGen(double backgroundResolutionA, GBool tableMode) {
  286   TextOutputControl textOutControl;
  287   SplashColor paperColor;
  288 
  289   ok = gTrue;
  290 
  291   backgroundResolution = backgroundResolutionA;
  292   zoom = 1.0;
  293   vStretch = 1.0;
  294   drawInvisibleText = gTrue;
  295   allTextInvisible = gFalse;
  296   extractFontFiles = gFalse;
  297   convertFormFields = gFalse;
  298   embedBackgroundImage = gFalse;
  299   embedFonts = gFalse;
  300 
  301   // set up the TextOutputDev
  302   textOutControl.mode = tableMode ? textOutTableLayout : textOutReadingOrder;
  303   textOutControl.html = gTrue;
  304   textOutControl.splitRotatedWords = gTrue;
  305   textOut = new TextOutputDev(NULL, &textOutControl, gFalse);
  306   if (!textOut->isOk()) {
  307     ok = gFalse;
  308   }
  309 
  310   // set up the SplashOutputDev
  311   paperColor[0] = paperColor[1] = paperColor[2] = 0xff;
  312   splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor);
  313 
  314   fontDefns = NULL;
  315 }
  316 
  317 HTMLGen::~HTMLGen() {
  318   delete textOut;
  319   delete splashOut;
  320   if (fontDefns) {
  321     deleteGList(fontDefns, HTMLGenFontDefn);
  322   }
  323 }
  324 
  325 void HTMLGen::startDoc(PDFDoc *docA) {
  326   doc = docA;
  327   splashOut->startDoc(doc->getXRef());
  328 
  329   if (fontDefns) {
  330     deleteGList(fontDefns, HTMLGenFontDefn);
  331   }
  332   fontDefns = new GList();
  333   nextFontFaceIdx = 0;
  334 }
  335 
  // Write the NUL-terminated string <data> through <writeFunc>.  Returns
  // whatever <writeFunc> returns.
  static inline int pr(int (*writeFunc)(void *stream, const char *data, int size),
                       void *stream, const char *data) {
    int nBytes = (int)strlen(data);

    return writeFunc(stream, data, nBytes);
  }
  340 
  341 static int pf(int (*writeFunc)(void *stream, const char *data, int size),
  342           void *stream, const char *fmt, ...) {
  343   va_list args;
  344   GString *s;
  345   int ret;
  346 
  347   va_start(args, fmt);
  348   s = GString::formatv(fmt, args);
  349   va_end(args);
  350   ret = writeFunc(stream, s->getCString(), s->getLength());
  351   delete s;
  352   return ret;
  353 }
  354 
  355 struct PNGWriteInfo {
  356   Base64Encoder *base64;
  357   int (*writePNG)(void *stream, const char *data, int size);
  358   void *pngStream;
  359 };
  360 
  361 static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) {
  362   PNGWriteInfo *info = (PNGWriteInfo *)png_get_progressive_ptr(png);
  363   if (info->base64) {
  364     info->base64->encode(data, size);
  365   } else {
  366     info->writePNG(info->pngStream, (char *)data, (int)size);
  367   }
  368 }
  369 
  370 int HTMLGen::convertPage(
  371          int pg, const char *pngURL, const char *htmlDir,
  372          int (*writeHTML)(void *stream, const char *data, int size),
  373          void *htmlStream,
  374          int (*writePNG)(void *stream, const char *data, int size),
  375          void *pngStream) {
  376   png_structp png;
  377   png_infop pngInfo;
  378   PNGWriteInfo writeInfo;
  379   SplashBitmap *bitmap;
  380   Guchar *p;
  381   double pageW, pageH;
  382   TextPage *text;
  383   GList *cols, *pars, *lines, *words;
  384   TextFontInfo *font;
  385   TextColumn *col;
  386   TextParagraph *par;
  387   TextLine *line;
  388   HTMLGenFontDefn *fontDefn;
  389   GString *s;
  390   double base;
  391   int primaryDir, spanDir;
  392   int colIdx, parIdx, lineIdx, firstWordIdx, lastWordIdx;
  393   int y, i;
  394 
  395   // generate the background bitmap
  396   splashOut->setSkipText(!allTextInvisible, gFalse);
  397   doc->displayPage(splashOut, pg,
  398            backgroundResolution, backgroundResolution * vStretch,
  399            0, gFalse, gTrue, gFalse);
  400   bitmap = splashOut->getBitmap();
  401 
  402   // page size
  403   if (doc->getPageRotate(pg) == 90 || doc->getPageRotate(pg) == 270) {
  404     pageW = doc->getPageCropHeight(pg);
  405     pageH = doc->getPageCropWidth(pg);
  406   } else {
  407     pageW = doc->getPageCropWidth(pg);
  408     pageH = doc->getPageCropHeight(pg);
  409   }
  410 
  411   // get the PDF text
  412   doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse);
  413   doc->processLinks(textOut, pg);
  414   text = textOut->takeText();
  415   primaryDir = text->primaryDirectionIsLR() ? 1 : -1;
  416 
  417   // insert a special character for each form field;
  418   // remove existing characters inside field bboxes;
  419   // erase background content inside field bboxes
  420   formFieldFont = NULL;
  421   formFieldInfo = NULL;
  422   if (convertFormFields) {
  423     AcroForm *form = doc->getCatalog()->getForm();
  424     if (form) {
  425       formFieldInfo = new GList();
  426       formFieldFont = new TextFontInfo();
  427       double yTop = doc->getCatalog()->getPage(pg)->getMediaBox()->y2;
  428       for (i = 0; i < form->getNumFields(); ++i) {
  429     AcroFormField *field = form->getField(i);
  430     AcroFormFieldType fieldType = field->getAcroFormFieldType();
  431     if (field->getPageNum() == pg &&
  432         (fieldType == acroFormFieldText ||
  433          fieldType == acroFormFieldCheckbox)) {
  434       double llx, lly, urx, ury;
  435       field->getBBox(&llx, &lly, &urx, &ury);
  436       lly = yTop - lly;
  437       ury = yTop - ury;
  438 
  439       // add the field info
  440       int fieldIdx = formFieldInfo->getLength();
  441       formFieldInfo->append(new HTMLGenFormFieldInfo(field));
  442 
  443       // remove exsting chars
  444       text->removeChars(llx, ury, urx, lly, 0.75, 0.5);
  445     
  446       // erase background content
  447       int llxI = (int)(llx * backgroundResolution / 72 + 0.5);
  448       int llyI = (int)(lly * backgroundResolution * vStretch / 72 + 0.5);
  449       int urxI = (int)(urx * backgroundResolution / 72 + 0.5);
  450       int uryI = (int)(ury * backgroundResolution * vStretch / 72 + 0.5);
  451       llyI += (int)(backgroundResolution * vStretch / 20);
  452       if (llxI < 0) {
  453         llxI = 0;
  454       }
  455       if (urxI >= bitmap->getWidth()) {
  456         urxI = bitmap->getWidth() - 1;
  457       }
  458       if (uryI < 0) {
  459         uryI = 0;
  460       }
  461       if (llyI > bitmap->getHeight()) {
  462         llyI = bitmap->getHeight() - 1;
  463       }
  464       if (uryI <= llyI && llxI <= urxI) {
  465         SplashColorPtr p = bitmap->getDataPtr()
  466                              + uryI * bitmap->getRowSize() + llxI * 3;
  467         for (int y = uryI; y <= llyI; ++y) {
  468           memset(p, 0xff, (urxI - llxI + 1) * 3);
  469           p += bitmap->getRowSize();
  470         }
  471       }
  472 
  473       // add a special char
  474       // (the font size is unused -- 10 is an arbitrary value)
  475       text->addSpecialChar(llx, ury, urx, lly,
  476                    0, formFieldFont, 10, 0x80000000 + fieldIdx);
  477     }
  478       }
  479     }
  480   }
  481 
  482   // HTML header
  483   pr(writeHTML, htmlStream, "<html>\n");
  484   pr(writeHTML, htmlStream, "<head>\n");
  485   pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
  486   pr(writeHTML, htmlStream, "<style type=\"text/css\">\n");
  487   pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n");
  488   if (convertFormFields) {
  489     pr(writeHTML, htmlStream, ".textfield {\n");
  490     pr(writeHTML, htmlStream, "  border: 0;\n");
  491     pr(writeHTML, htmlStream, "  padding: 0;\n");
  492     pr(writeHTML, htmlStream, "  background: #ccccff;\n");
  493     pr(writeHTML, htmlStream, "}\n");
  494     pr(writeHTML, htmlStream, ".checkbox {\n");
  495     pr(writeHTML, htmlStream, "}\n");
  496   }
  497   fonts = text->getFonts();
  498   fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double));
  499   for (i = 0; i < fontDefns->getLength(); ++i) {
  500     fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
  501     fontDefn->used = gFalse;
  502   }
  503   for (i = 0; i < fonts->getLength(); ++i) {
  504     font = (TextFontInfo *)fonts->get(i);
  505     fontDefn = getFontDefn(font, htmlDir);
  506     if (!fontDefn->used && fontDefn->fontFace) {
  507       pr(writeHTML, htmlStream, fontDefn->fontFace->getCString());
  508     }
  509     pf(writeHTML, htmlStream, ".f{0:d} {{ {1:t} }}\n", i, fontDefn->fontSpec);
  510     fontScales[i] = fontDefn->scale;
  511     fontDefn->used = gTrue;
  512   }
  513   pr(writeHTML, htmlStream, "</style>\n");
  514   pr(writeHTML, htmlStream, "</head>\n");
  515   if (primaryDir >= 0) {
  516     pr(writeHTML, htmlStream, "<body>\n");
  517   } else {
  518     pr(writeHTML, htmlStream, "<body dir=\"rtl\">\n");
  519   }
  520 
  521   // background image element (part 1)
  522   if (primaryDir >= 0) {
  523     pf(writeHTML, htmlStream, "<img style=\"position:absolute; left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
  524        (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
  525   } else {
  526     pf(writeHTML, htmlStream, "<img style=\"position:absolute; right:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
  527        (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
  528   }
  529   if (embedBackgroundImage) {
  530     pr(writeHTML, htmlStream, "src=\"data:image/png;base64,\n");
  531     writeInfo.base64 = new Base64Encoder(writeHTML, htmlStream); 
  532     writeInfo.writePNG = NULL;
  533     writeInfo.pngStream = NULL;
  534   } else {
  535     pf(writeHTML, htmlStream, "src=\"{0:s}\"", pngURL);
  536     writeInfo.base64 = NULL;
  537     writeInfo.writePNG = writePNG;
  538     writeInfo.pngStream = pngStream;
  539   }
  540 
  541   // background image data - writing to a separate file, or embedding
  542   // with base64 encoding
  543   if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
  544                       NULL, NULL, NULL)) ||
  545       !(pngInfo = png_create_info_struct(png))) {
  546     return errFileIO;
  547   }
  548   if (setjmp(png_jmpbuf(png))) {
  549     return errFileIO;
  550   }
  551   png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL);
  552   png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(),
  553            8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
  554            PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
  555   png_write_info(png, pngInfo);
  556   p = bitmap->getDataPtr();
  557   for (y = 0; y < bitmap->getHeight(); ++y) {
  558     png_write_row(png, (png_bytep)p);
  559     p += bitmap->getRowSize();
  560   }
  561   png_write_end(png, pngInfo);
  562   png_destroy_write_struct(&png, &pngInfo);
  563   if (embedBackgroundImage) {
  564     writeInfo.base64->flush();
  565     delete writeInfo.base64;
  566   }
  567 
  568   // background image element (part 2)
  569   pr(writeHTML, htmlStream, "\">\n");
  570 
  571   // generate the HTML text
  572   nextFieldID = 0;
  573   cols = text->makeColumns();
  574   for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) {
  575     col = (TextColumn *)cols->get(colIdx);
  576     pars = col->getParagraphs();
  577     for (parIdx = 0; parIdx < pars->getLength(); ++parIdx) {
  578       par = (TextParagraph *)pars->get(parIdx);
  579       lines = par->getLines();
  580       for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) {
  581     line = (TextLine *)lines->get(lineIdx);
  582     if (line->getRotation() != 0) {
  583       continue;
  584     }
  585     words = line->getWords();
  586     if (lineIdx == 0 && par->hasDropCap() && words->getLength() >= 2) {
  587       base = ((TextWord *)words->get(1))->getBaseline();
  588     } else {
  589       base = line->getBaseline();
  590     }
  591     s = new GString();
  592     for (firstWordIdx = (primaryDir >= 0) ? 0 : words->getLength() - 1;
  593          (primaryDir >= 0) ? firstWordIdx < words->getLength()
  594                            : firstWordIdx >= 0;
  595          firstWordIdx = lastWordIdx + primaryDir) {
  596       lastWordIdx = findDirSpan(words, firstWordIdx,
  597                     primaryDir, &spanDir);
  598       appendSpans(words, firstWordIdx, lastWordIdx,
  599               primaryDir, spanDir,
  600               base, lineIdx == 0 && par->hasDropCap(),
  601               s);
  602     }
  603     if (primaryDir >= 0) {
  604       pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
  605          (int)(line->getXMin() * zoom),
  606          (int)(line->getYMin() * zoom * vStretch), s);
  607     } else {
  608       pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; right:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
  609          (int)((pageW - line->getXMax()) * zoom),
  610          (int)(line->getYMin() * zoom * vStretch), s);
  611     }
  612     delete s;
  613       }
  614     }
  615   }
  616   gfree(fontScales);
  617   delete text;
  618   deleteGList(cols, TextColumn);
  619   if (formFieldFont) {
  620     delete formFieldFont;
  621     formFieldFont = NULL;
  622   }
  623   if (formFieldInfo) {
  624     deleteGList(formFieldInfo, HTMLGenFormFieldInfo);
  625     formFieldInfo = NULL;
  626   }
  627 
  628   // HTML trailer
  629   pr(writeHTML, htmlStream, "</body>\n");
  630   pr(writeHTML, htmlStream, "</html>\n");
  631 
  632   return errNone;
  633 }
  634 
  635 // Find a sequence of words, starting at <firstWordIdx>, that have the
  636 // same writing direction.  Returns the index of the last word, and
  637 // sets *<spanDir> to the span direction.
  638 int HTMLGen::findDirSpan(GList *words, int firstWordIdx, int primaryDir,
  639              int *spanDir) {
  640   int dir0, dir1, nextWordIdx;
  641 
  642   dir0 = ((TextWord *)words->get(firstWordIdx))->getDirection();
  643   for (nextWordIdx = firstWordIdx + primaryDir;
  644        (primaryDir >= 0) ? nextWordIdx < words->getLength()
  645                      : nextWordIdx >= 0;
  646        nextWordIdx += primaryDir) {
  647     dir1 = ((TextWord *)words->get(nextWordIdx))->getDirection();
  648     if (dir0 == 0) {
  649       dir0 = dir1;
  650     } else if (dir1 != 0 && dir1 != dir0) {
  651       break;
  652     }
  653   }
  654 
  655   if (dir0 == 0) {
  656     *spanDir = primaryDir;
  657   } else {
  658     *spanDir = dir0;
  659   }
  660 
  661   return nextWordIdx - primaryDir;
  662 }
  663 
  664 // Create HTML spans for words <firstWordIdx> .. <lastWordIdx>, and
  665 // append them to <s>.
  666 void HTMLGen::appendSpans(GList *words, int firstWordIdx, int lastWordIdx,
  667               int primaryDir, int spanDir,
  668               double base, GBool dropCapLine, GString *s) {
  669   if (allTextInvisible && !drawInvisibleText) {
  670     return;
  671   }
  672 
  673   if (spanDir != primaryDir) {
  674     int t = firstWordIdx;
  675     firstWordIdx = lastWordIdx;
  676     lastWordIdx = t;
  677   }
  678 
  679   int wordIdx = firstWordIdx;
  680   while ((spanDir >= 0) ? wordIdx <= lastWordIdx
  681                     : wordIdx >= lastWordIdx) {
  682     TextWord *word0 = (TextWord *)words->get(wordIdx);
  683 
  684     // form field(s): generate <input> element(s)
  685     if (convertFormFields && word0->getFontInfo() == formFieldFont) {
  686       for (int i = (spanDir >= 0) ? 0 : word0->getLength() - 1;
  687        (spanDir >= 0) ? i < word0->getLength() : i >= 0;
  688        i += spanDir) {
  689     int fieldIdx = word0->getChar(0) - 0x80000000;
  690     if (fieldIdx >= 0 && fieldIdx < formFieldInfo->getLength()) {
  691       HTMLGenFormFieldInfo *ffi =
  692           (HTMLGenFormFieldInfo *)formFieldInfo->get(fieldIdx);
  693       AcroFormField *field = ffi->acroFormField;
  694       AcroFormFieldType fieldType = field->getAcroFormFieldType();
  695       double llx, lly, urx, ury;
  696       field->getBBox(&llx, &lly, &urx, &ury);
  697       int width = (int)(urx - llx);
  698       Ref fontID;
  699       double fontSize;
  700       field->getFont(&fontID, &fontSize);
  701       if (fontSize == 0) {
  702         fontSize = 12;
  703       }
  704       if (fieldType == acroFormFieldText) {
  705         s->appendf("<input type=\"text\" class=\"textfield\" id=\"textfield{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
  706         ++nextFieldID;
  707       } else if (fieldType == acroFormFieldCheckbox) {
  708         s->appendf("<input type=\"checkbox\" class=\"checkbox\" id=\"checkbox{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
  709         ++nextFieldID;
  710       }
  711     }
  712       }
  713 
  714       if (word0->getSpaceAfter()) {
  715     s->append(' ');
  716       }
  717 
  718       wordIdx += spanDir;
  719 
  720     // skip invisible words
  721     } else if (!drawInvisibleText &&
  722            (word0->isInvisible() || word0->isRotated())) {
  723       wordIdx += spanDir;
  724 
  725     // generate a <span> containing one or more words
  726     } else {
  727 
  728       double r0 = 0, g0 = 0, b0 = 0; // make gcc happy
  729       VerticalAlignment vertAlign0 = vertAlignBaseline; // make gcc happy
  730       GString *linkURI0 = NULL;
  731 
  732       GBool invisible = word0->isInvisible() || word0->isRotated();
  733 
  734       do {
  735     TextWord *word1 = (TextWord *)words->get(wordIdx);
  736 
  737     // get word parameters
  738     double r1, g1, b1;
  739     word0->getColor(&r1, &g1, &b1);
  740     double base1 = word1->getBaseline();
  741     VerticalAlignment vertAlign1;
  742     if (dropCapLine) {
  743       //~ this will fail if there are subscripts or superscripts in
  744       //~   the first line of a paragraph with a drop cap
  745       vertAlign1 = vertAlignTop;
  746     } else if (base1 - base < -1) {
  747       vertAlign1 = vertAlignSuper;
  748     } else if (base1 - base > 1) {
  749       vertAlign1 = vertAlignSub;
  750     } else {
  751       vertAlign1 = vertAlignBaseline;
  752     }
  753     GString *linkURI1 = word1->getLinkURI();
  754 
  755     // start of span
  756     if (word1 == word0) {
  757       r0 = r1;
  758       g0 = g1;
  759       b0 = b1;
  760       vertAlign0 = vertAlign1;
  761       linkURI0 = linkURI1;
  762 
  763       int i;
  764       for (i = 0; i < fonts->getLength(); ++i) {
  765         if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) {
  766           break;
  767         }
  768       }
  769       if (linkURI1) {
  770         s->appendf("<a href=\"{0:t}\">", linkURI0);
  771       }
  772       // we force spans to be LTR or RTL; this is a kludge, but it's
  773       // far easier than implementing the full Unicode bidi algorithm
  774       const char *dirTag;
  775       if (spanDir == primaryDir) {
  776         dirTag = "";
  777       } else if (spanDir < 0) {
  778         dirTag = " dir=\"rtl\"";
  779       } else {
  780         dirTag = " dir=\"ltr\"";
  781       }
  782       s->appendf("<span class=\"f{0:d}\"{1:s} style=\"font-size:{2:d}px;vertical-align:{3:s};{4:s}color:rgba({5:d},{6:d},{7:d},{8:d});\">",
  783              i,
  784              dirTag,
  785              (int)(fontScales[i] * word1->getFontSize() * zoom),
  786              vertAlignNames[vertAlign1],
  787              (dropCapLine && wordIdx == 0) ? "line-height:75%;" : "",
  788              (int)(r0 * 255), (int)(g0 * 255), (int)(b0 * 255),
  789              invisible ? 0 : 1);
  790 
  791     // end of span
  792     } else if (word1->getFontInfo() != word0->getFontInfo() ||
  793            word1->getFontSize() != word0->getFontSize() ||
  794            word1->isInvisible() != word0->isInvisible() ||
  795            word1->isRotated() != word0->isRotated() ||
  796            vertAlign1 != vertAlign0 ||
  797            r1 != r0 || g1 != g0 || b1 != b0 ||
  798            linkURI1 != linkURI0) {
  799       break;
  800     }
  801 
  802     // add a space before the word, if needed
  803     // -- this only happens with the first word in a reverse section
  804     if (spanDir != primaryDir && wordIdx == firstWordIdx) {
  805       GBool sp;
  806       if (spanDir >= 0) {
  807         if (wordIdx > 0) {
  808           sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
  809         } else {
  810           sp = gFalse;
  811         }
  812       } else {
  813         sp = word1->getSpaceAfter();
  814       }
  815       if (sp) {
  816         s->append(' ');
  817       }
  818     }
  819 
  820     // generate the word text
  821     for (int i = (spanDir >= 0) ? 0 : word1->getLength() - 1;
  822          (spanDir >= 0) ? i < word1->getLength() : i >= 0;
  823          i += spanDir) {
  824       Unicode u = word1->getChar(i);
  825       if (u >= privateUnicodeMapStart &&
  826           u <= privateUnicodeMapEnd &&
  827           privateUnicodeMap[u - privateUnicodeMapStart]) {
  828         u = privateUnicodeMap[u - privateUnicodeMapStart];
  829       }
  830       appendUTF8(u, s);
  831     }
  832 
  833     // add a space after the word, if needed
  834     // -- there is never a space after the last word in a reverse
  835     //    section (this will be handled as a space after the last
  836     //    word in the previous primary-direction section)
  837     GBool sp;
  838     if (spanDir != primaryDir && wordIdx == lastWordIdx) {
  839       sp = gFalse;
  840     } else if (spanDir >= 0) {
  841       sp = word1->getSpaceAfter();
  842     } else {
  843       if (wordIdx > 0) {
  844         sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
  845       } else {
  846         sp = gFalse;
  847       }
  848     }
  849     if (sp) {
  850       s->append(' ');
  851     }
  852 
  853     wordIdx += spanDir;
  854       } while ((spanDir >= 0) ? wordIdx <= lastWordIdx
  855                           : wordIdx >= lastWordIdx);
  856 
  857       s->append("</span>");
  858       if (linkURI0) {
  859     s->append("</a>");
  860       }
  861     }
  862   }
  863 }
  864 
  865 void HTMLGen::appendUTF8(Unicode u, GString *s) {
  866   if (u <= 0x7f) {
  867     if (u == '&') {
  868       s->append("&amp;");
  869     } else if (u == '<') {
  870       s->append("&lt;");
  871     } else if (u == '>') {
  872       s->append("&gt;");
  873     } else {
  874       s->append((char)u);
  875     }
  876   } else if (u <= 0x7ff) {
  877     s->append((char)(0xc0 + (u >> 6)));
  878     s->append((char)(0x80 + (u & 0x3f)));
  879   } else if (u <= 0xffff) {
  880     s->append((char)(0xe0 + (u >> 12)));
  881     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  882     s->append((char)(0x80 + (u & 0x3f)));
  883   } else if (u <= 0x1fffff) {
  884     s->append((char)(0xf0 + (u >> 18)));
  885     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  886     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  887     s->append((char)(0x80 + (u & 0x3f)));
  888   } else if (u <= 0x3ffffff) {
  889     s->append((char)(0xf8 + (u >> 24)));
  890     s->append((char)(0x80 + ((u >> 18) & 0x3f)));
  891     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  892     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  893     s->append((char)(0x80 + (u & 0x3f)));
  894   } else if (u <= 0x7fffffff) {
  895     s->append((char)(0xfc + (u >> 30)));
  896     s->append((char)(0x80 + ((u >> 24) & 0x3f)));
  897     s->append((char)(0x80 + ((u >> 18) & 0x3f)));
  898     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  899     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  900     s->append((char)(0x80 + (u & 0x3f)));
  901   }
  902 }
  903 
  904 HTMLGenFontDefn *HTMLGen::getFontDefn(TextFontInfo *font,
  905                       const char *htmlDir) {
  906   Ref id;
  907   HTMLGenFontDefn *fontDefn;
  908   int i;
  909 
  910   // check the existing font defns
  911   id = font->getFontID();
  912   if (id.num >= 0) {
  913     for (i = 0; i < fontDefns->getLength(); ++i) {
  914       fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
  915       if (fontDefn->match(id)) {
  916     return fontDefn;
  917       }
  918     }
  919   }
  920 
  921   // try to extract a font file
  922   if (!extractFontFiles ||
  923       !(fontDefn = getFontFile(font, htmlDir))) {
  924 
  925     // get a substitute font
  926     fontDefn = getSubstituteFont(font);
  927   }
  928 
  929   fontDefns->append(fontDefn);
  930   return fontDefn;
  931 }
  932 
  933 HTMLGenFontDefn *HTMLGen::getFontFile(TextFontInfo *font,
  934                       const char *htmlDir) {
  935   Ref id;
  936   HTMLGenFontDefn *fontDefn;
  937   Object fontObj;
  938   GfxFont *gfxFont;
  939   WebFont *webFont;
  940   GString *fontFile, *fontPath, *fontFace, *fontSpec;
  941   const char *family, *weight, *style;
  942   double scale;
  943 
  944   id = font->getFontID();
  945   if (id.num < 0) {
  946     return NULL;
  947   }
  948 
  949   doc->getXRef()->fetch(id.num, id.gen, &fontObj);
  950   if (!fontObj.isDict()) {
  951     fontObj.free();
  952     return NULL;
  953   }
  954 
  955   gfxFont = GfxFont::makeFont(doc->getXRef(), "F", id, fontObj.getDict());
  956   webFont = new WebFont(gfxFont, doc->getXRef());
  957   fontDefn = NULL;
  958   fontFace = NULL;
  959 
  960   if (webFont->canWriteTTF()) {
  961     if (embedFonts) {
  962       GString *ttfData = webFont->getTTFData();
  963       if (ttfData) {
  964     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/ttf;base64,",
  965                    nextFontFaceIdx);
  966     Base64Encoder enc(writeToString, fontFace);
  967     enc.encode((unsigned char *)ttfData->getCString(),
  968            (size_t)ttfData->getLength());
  969     enc.flush();
  970     fontFace->append("\"); }\n");
  971     delete ttfData;
  972       }
  973     } else {
  974       fontFile = GString::format("{0:d}.ttf", nextFontFaceIdx);
  975       fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
  976       if (webFont->writeTTF(fontPath->getCString())) {
  977     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
  978                    nextFontFaceIdx, fontFile);
  979       }
  980       delete fontPath;
  981       delete fontFile;
  982     }
  983     if (fontFace) {
  984       getFontDetails(font, &family, &weight, &style, &scale);
  985       fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
  986                  nextFontFaceIdx, family, weight, style);
  987       ++nextFontFaceIdx;
  988       fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
  989     }
  990 
  991   } else if (webFont->canWriteOTF()) {
  992     if (embedFonts) {
  993       GString *otfData = webFont->getOTFData();
  994       if (otfData) {
  995     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/otf;base64,",
  996                    nextFontFaceIdx);
  997     Base64Encoder enc(writeToString, fontFace);
  998     enc.encode((unsigned char *)otfData->getCString(),
  999            (size_t)otfData->getLength());
 1000     enc.flush();
 1001     fontFace->append("\"); }\n");
 1002     delete otfData;
 1003       }
 1004     } else {
 1005       fontFile = GString::format("{0:d}.otf", nextFontFaceIdx);
 1006       fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
 1007       if (webFont->writeOTF(fontPath->getCString())) {
 1008     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
 1009                    nextFontFaceIdx, fontFile);
 1010       }
 1011       delete fontPath;
 1012       delete fontFile;
 1013     }
 1014     if (fontFace) {
 1015       getFontDetails(font, &family, &weight, &style, &scale);
 1016       fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
 1017                  nextFontFaceIdx, family, weight, style);
 1018       ++nextFontFaceIdx;
 1019       fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
 1020     }
 1021   }
 1022 
 1023   delete webFont;
 1024   delete gfxFont;
 1025   fontObj.free();
 1026 
 1027   return fontDefn;
 1028 }
 1029 
 1030 HTMLGenFontDefn *HTMLGen::getSubstituteFont(TextFontInfo *font) {
 1031   const char *family, *weight, *style;
 1032   double scale;
 1033   GString *fontSpec;
 1034 
 1035   getFontDetails(font, &family, &weight, &style, &scale);
 1036   fontSpec = GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};",
 1037                  family, weight, style);
 1038   return new HTMLGenFontDefn(font->getFontID(), NULL, fontSpec, scale);
 1039 }
 1040 
 1041 void HTMLGen::getFontDetails(TextFontInfo *font, const char **family,
 1042                  const char **weight, const char **style,
 1043                  double *scale) {
 1044   GString *fontName;
 1045   char *fontName2;
 1046   FontStyleTagInfo *fst;
 1047   StandardFontInfo *sf;
 1048   GBool fixedWidth, serif, bold, italic;
 1049   double s;
 1050   int n, i;
 1051 
 1052   // get the font name, remove any subset tag
 1053   fontName = font->getFontName();
 1054   if (fontName) {
 1055     fontName2 = fontName->getCString();
 1056     n = fontName->getLength();
 1057     for (i = 0; i < n && i < 7; ++i) {
 1058       if (fontName2[i] < 'A' || fontName2[i] > 'Z') {
 1059     break;
 1060       }
 1061     }
 1062     if (i == 6 && n > 7 && fontName2[6] == '+') {
 1063       fontName2 += 7;
 1064       n -= 7;
 1065     }
 1066   } else {
 1067     fontName2 = NULL;
 1068     n = 0;
 1069   }
 1070 
 1071   // get the style info from the font descriptor flags
 1072   fixedWidth = font->isFixedWidth();
 1073   serif = font->isSerif();
 1074   bold = font->isBold();
 1075   italic = font->isItalic();
 1076 
 1077   if (fontName2) {
 1078 
 1079     // look for a style tag at the end of the font name -- this
 1080     // overrides the font descriptor bold/italic flags
 1081     for (fst = fontStyleTags; fst->tag; ++fst) {
 1082       if (n > fst->tagLen &&
 1083       !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) {
 1084     bold = fst->bold;
 1085     italic = fst->italic;
 1086     n -= fst->tagLen;
 1087     if (n > 1 && (fontName2[n-1] == '-' ||
 1088               fontName2[n-1] == ',' ||
 1089               fontName2[n-1] == '.' ||
 1090               fontName2[n-1] == '_')) {
 1091       --n;
 1092     }
 1093     break;
 1094       }
 1095     }
 1096 
 1097     // look for a known font name -- this overrides the font descriptor
 1098     // fixedWidth/serif flags
 1099     for (sf = standardFonts; sf->name; ++sf) {
 1100       if (!strncasecmp(fontName2, sf->name, n)) {
 1101     fixedWidth = sf->fixedWidth;
 1102     serif = sf->serif;
 1103     break;
 1104       }
 1105     }
 1106   }
 1107 
 1108   // compute the scaling factor
 1109   *scale = 1;
 1110   if ((s = font->getMWidth())) {
 1111     i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0);
 1112     if (s < substFonts[i].mWidth) {
 1113       *scale = s / substFonts[i].mWidth;
 1114     }
 1115   }
 1116 
 1117   *family = fixedWidth ? "monospace" : serif ? "serif" : "sans-serif";
 1118   *weight = bold ? "bold" : "normal";
 1119   *style = italic ? "italic" : "normal";
 1120 }