"Fossies" - the Fresh Open Source Software Archive

Member "xpdf-4.04/xpdf/HTMLGen.cc" (18 Apr 2022, 34442 Bytes) of package /linux/misc/xpdf-4.04.tar.gz:


As a special service, "Fossies" has tried to format the requested source page into HTML using (guessed) C and C++ source code syntax highlighting (style: standard), with prefixed line numbers and a code folding option. Alternatively, you can view or download the uninterpreted source code file here. For more information about "HTMLGen.cc", see the Fossies "Dox" file reference documentation and the latest Fossies "Diffs" side-by-side code changes report: 4.03_vs_4.04.

    1 //========================================================================
    2 //
    3 // HTMLGen.cc
    4 //
    5 // Copyright 2010-2021 Glyph & Cog, LLC
    6 //
    7 //========================================================================
    8 
    9 //~ to do:
   10 //~ - fonts
   11 //~   - underlined? (underlines are present in the background image)
   12 //~   - include the original font name in the CSS entry (before the
   13 //~     generic serif/sans-serif/monospace name)
   14 //~ - check that htmlDir exists and is a directory
   15 //~ - links:
   16 //~   - internal links (to pages, to named destinations)
   17 //~   - links from non-text content
   18 //~ - rotated text should go in the background image
   19 //~ - metadata
   20 //~ - PDF outline
   21 
   22 #include <aconf.h>
   23 
   24 #ifdef USE_GCC_PRAGMAS
   25 #pragma implementation
   26 #endif
   27 
   28 #include <stdlib.h>
   29 #include <png.h>
   30 #include "gmem.h"
   31 #include "gmempp.h"
   32 #include "GString.h"
   33 #include "GList.h"
   34 #include "SplashBitmap.h"
   35 #include "PDFDoc.h"
   36 #include "GfxFont.h"
   37 #include "AcroForm.h"
   38 #include "TextOutputDev.h"
   39 #include "SplashOutputDev.h"
   40 #include "ErrorCodes.h"
   41 #include "WebFont.h"
   42 #include "HTMLGen.h"
   43 
   44 #ifdef _WIN32
   45 #  define strcasecmp stricmp
   46 #  define strncasecmp strnicmp
   47 #endif
   48 
   49 //------------------------------------------------------------------------
   50 
   51 struct FontStyleTagInfo {
   52   const char *tag;
   53   int tagLen;
   54   GBool bold;
   55   GBool italic;
   56 };
   57 
   58 // NB: these are compared, in order, against the tail of the font
   59 // name, so "BoldItalic" must come before "Italic", etc.
   60 static FontStyleTagInfo fontStyleTags[] = {
   61   {"Roman",                    5, gFalse, gFalse},
   62   {"Regular",                  7, gFalse, gFalse},
   63   {"Condensed",                9, gFalse, gFalse},
   64   {"CondensedBold",           13, gTrue,  gFalse},
   65   {"CondensedLight",          14, gFalse, gFalse},
   66   {"SemiBold",                 8, gTrue,  gFalse},
   67   {"BoldItalicMT",            12, gTrue,  gTrue},
   68   {"BoldItalic",              10, gTrue,  gTrue},
   69   {"Bold_Italic",             11, gTrue,  gTrue},
   70   {"BoldOblique",             11, gTrue,  gTrue},
   71   {"Bold_Oblique",            12, gTrue,  gTrue},
   72   {"BoldMT",                   6, gTrue,  gFalse},
   73   {"Bold",                     4, gTrue,  gFalse},
   74   {"ItalicMT",                 8, gFalse, gTrue},
   75   {"Italic",                   6, gFalse, gTrue},
   76   {"Oblique",                  7, gFalse, gTrue},
   77   {"Light",                    5, gFalse, gFalse},
   78   {NULL,                       0, gFalse, gFalse}
   79 };
   80 
   81 struct StandardFontInfo {
   82   const char *name;
   83   GBool fixedWidth;
   84   GBool serif;
   85 };
   86 
   87 static StandardFontInfo standardFonts[] = {
   88   {"Arial",                    gFalse, gFalse},
   89   {"Courier",                  gTrue,  gFalse},
   90   {"Futura",                   gFalse, gFalse},
   91   {"Helvetica",                gFalse, gFalse},
   92   {"Minion",                   gFalse, gTrue},
   93   {"NewCenturySchlbk",         gFalse, gTrue},
   94   {"Times",                    gFalse, gTrue},
   95   {"TimesNew",                 gFalse, gTrue},
   96   {"Times_New",                gFalse, gTrue},
   97   {"Verdana",                  gFalse, gFalse},
   98   {"LucidaSans",               gFalse, gFalse},
   99   {NULL,                       gFalse, gFalse}
  100 };
  101 
  102 struct SubstFontInfo {
  103   double mWidth;
  104 };
  105 
  106 // index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
  107 static SubstFontInfo substFonts[16] = {
  108   {0.833},
  109   {0.833},
  110   {0.889},
  111   {0.889},
  112   {0.788},
  113   {0.722},
  114   {0.833},
  115   {0.778},
  116   {0.600},
  117   {0.600},
  118   {0.600},
  119   {0.600}
  120 };
  121 
  122 // Map Unicode indexes from the private use area, following the Adobe
  123 // Glyph list.
  124 #define privateUnicodeMapStart 0xf6f9
  125 #define privateUnicodeMapEnd   0xf7ff
  126 static int
  127 privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = {
  128   0x0141, 0x0152, 0,      0,      0x0160, 0,      0x017d,         // f6f9
  129   0,      0,      0,      0,      0,      0,      0,      0,      // f700
  130   0,      0,      0,      0,      0,      0,      0,      0,
  131   0,      0,      0,      0,      0,      0,      0,      0,      // f710
  132   0,      0,      0,      0,      0,      0,      0,      0,
  133   0,      0x0021, 0,      0,      0x0024, 0,      0x0026, 0,      // f720
  134   0,      0,      0,      0,      0,      0,      0,      0,
  135   0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // f730
  136   0x0038, 0x0039, 0,      0,      0,      0,      0,      0x003f,
  137   0,      0,      0,      0,      0,      0,      0,      0,      // f740
  138   0,      0,      0,      0,      0,      0,      0,      0,
  139   0,      0,      0,      0,      0,      0,      0,      0,      // f750
  140   0,      0,      0,      0,      0,      0,      0,      0,
  141   0,      0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // f760
  142   0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
  143   0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // f770
  144   0x0058, 0x0059, 0x005a, 0,      0,      0,      0,      0,
  145   0,      0,      0,      0,      0,      0,      0,      0,      // f780
  146   0,      0,      0,      0,      0,      0,      0,      0,
  147   0,      0,      0,      0,      0,      0,      0,      0,      // f790
  148   0,      0,      0,      0,      0,      0,      0,      0,
  149   0,      0x00a1, 0x00a2, 0,      0,      0,      0,      0,      // f7a0
  150   0,      0,      0,      0,      0,      0,      0,      0,
  151   0,      0,      0,      0,      0,      0,      0,      0,      // f7b0
  152   0,      0,      0,      0,      0,      0,      0,      0x00bf,
  153   0,      0,      0,      0,      0,      0,      0,      0,      // f7c0
  154   0,      0,      0,      0,      0,      0,      0,      0,
  155   0,      0,      0,      0,      0,      0,      0,      0,      // f7d0
  156   0,      0,      0,      0,      0,      0,      0,      0,
  157   0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // f7e0
  158   0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
  159   0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,      // f7f0
  160   0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178
  161 };
  162 
  163 enum VerticalAlignment {
  164   vertAlignBaseline,
  165   vertAlignSub,
  166   vertAlignSuper,
  167   vertAlignTop
  168 };
  169 
  170 static const char *vertAlignNames[] = {
  171   "baseline",
  172   "sub",
  173   "super",
  174   "top"
  175 };
  176 
  177 //------------------------------------------------------------------------
  178 
  179 class HTMLGenFontDefn {
  180 public:
  181 
  182   HTMLGenFontDefn(Ref fontIDA, GString *fontFaceA, GString *fontSpecA,
  183           double scaleA)
  184     : fontID(fontIDA), fontFace(fontFaceA), fontSpec(fontSpecA)
  185     , scale(scaleA), used(gFalse) {}
  186   ~HTMLGenFontDefn() { delete fontFace; delete fontSpec; }
  187   GBool match(Ref fontIDA)
  188     { return fontIDA.num == fontID.num && fontIDA.gen == fontID.gen; }
  189 
  190   Ref fontID;
  191   GString *fontFace;        // NULL for substituted fonts
  192   GString *fontSpec;
  193   double scale;
  194   GBool used;           // set when used (per page)
  195 };
  196 
  197 //------------------------------------------------------------------------
  198 
  199 class HTMLGenFormFieldInfo {
  200 public:
  201 
  202   HTMLGenFormFieldInfo(AcroFormField *acroFormFieldA)
  203     : acroFormField(acroFormFieldA) {}
  204 
  205   AcroFormField *acroFormField;
  206 };
  207 
  208 //------------------------------------------------------------------------
  209 
  210 class Base64Encoder {
  211 public:
  212 
  213   Base64Encoder(int (*writeFuncA)(void *stream, const char *data, int size),
  214         void *streamA);
  215   void encode(const unsigned char *data, size_t size);
  216   void flush();
  217 
  218 private:
  219 
  220   int (*writeFunc)(void *stream, const char *data, int size);
  221   void *stream;
  222   unsigned char buf[3];
  223   int bufLen;
  224 };
  225 
  226 static char base64Chars[65] =
  227     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  228 
  229 Base64Encoder::Base64Encoder(int (*writeFuncA)(void *stream, const char *data,
  230                            int size),
  231                  void *streamA) {
  232   writeFunc = writeFuncA;
  233   stream = streamA;
  234   bufLen = 0;
  235 }
  236 
  237 void Base64Encoder::encode(const unsigned char *data, size_t size) {
  238   size_t i = 0;
  239   while (1) {
  240     while (bufLen < 3) {
  241       if (i >= size) {
  242     return;
  243       }
  244       buf[bufLen++] = data[i++];
  245     }
  246     char out[4];
  247     out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
  248     out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
  249     out[2] = base64Chars[((buf[1] << 2) | (buf[2] >> 6)) & 0x3f];
  250     out[3] = base64Chars[buf[2] & 0x3f];
  251     writeFunc(stream, out, 4);
  252     bufLen = 0;
  253   }
  254 }
  255 
  256 void Base64Encoder::flush() {
  257   // if bufLen == 0, this does nothing
  258   // bufLen should never be 3 here
  259   char out[4];
  260   if (bufLen == 1) {
  261     out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
  262     out[1] = base64Chars[(buf[0] << 4) & 0x3f];
  263     out[2] = '=';
  264     out[3] = '=';
  265     writeFunc(stream, out, 4);
  266   } else if (bufLen == 2) {
  267     out[0] = base64Chars[(buf[0] >> 2) & 0x3f];
  268     out[1] = base64Chars[((buf[0] << 4) | (buf[1] >> 4)) & 0x3f];
  269     out[2] = base64Chars[(buf[1] << 2) & 0x3f];
  270     out[3] = '=';
  271     writeFunc(stream, out, 4);
  272   }
  273 }
  274 
  275 static int writeToString(void *stream, const char *data, int size) {
  276   ((GString *)stream)->append(data, size);
  277   return size;
  278 }
  279 
  280 //------------------------------------------------------------------------
  281 
  282 
  283 //------------------------------------------------------------------------
  284 
  285 HTMLGen::HTMLGen(double backgroundResolutionA, GBool tableMode) {
  286   TextOutputControl textOutControl;
  287   SplashColor paperColor;
  288 
  289   ok = gTrue;
  290 
  291   backgroundResolution = backgroundResolutionA;
  292   zoom = 1.0;
  293   vStretch = 1.0;
  294   drawInvisibleText = gTrue;
  295   allTextInvisible = gFalse;
  296   extractFontFiles = gFalse;
  297   convertFormFields = gFalse;
  298   embedBackgroundImage = gFalse;
  299   embedFonts = gFalse;
  300 
  301   // set up the TextOutputDev
  302   textOutControl.mode = tableMode ? textOutTableLayout : textOutReadingOrder;
  303   textOutControl.html = gTrue;
  304   textOutControl.splitRotatedWords = gTrue;
  305   textOut = new TextOutputDev(NULL, &textOutControl, gFalse);
  306   if (!textOut->isOk()) {
  307     ok = gFalse;
  308   }
  309 
  310   // set up the SplashOutputDev
  311   paperColor[0] = paperColor[1] = paperColor[2] = 0xff;
  312   splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor);
  313 
  314   fontDefns = NULL;
  315 }
  316 
  317 HTMLGen::~HTMLGen() {
  318   delete textOut;
  319   delete splashOut;
  320   if (fontDefns) {
  321     deleteGList(fontDefns, HTMLGenFontDefn);
  322   }
  323 }
  324 
  325 void HTMLGen::startDoc(PDFDoc *docA) {
  326   doc = docA;
  327   splashOut->startDoc(doc->getXRef());
  328 
  329   if (fontDefns) {
  330     deleteGList(fontDefns, HTMLGenFontDefn);
  331   }
  332   fontDefns = new GList();
  333   nextFontFaceIdx = 0;
  334 }
  335 
  336 static inline int pr(int (*writeFunc)(void *stream, const char *data, int size),
  337              void *stream, const char *data) {
  338   return writeFunc(stream, data, (int)strlen(data));
  339 }
  340 
  341 static int pf(int (*writeFunc)(void *stream, const char *data, int size),
  342           void *stream, const char *fmt, ...) {
  343   va_list args;
  344   GString *s;
  345   int ret;
  346 
  347   va_start(args, fmt);
  348   s = GString::formatv(fmt, args);
  349   va_end(args);
  350   ret = writeFunc(stream, s->getCString(), s->getLength());
  351   delete s;
  352   return ret;
  353 }
  354 
  355 struct PNGWriteInfo {
  356   Base64Encoder *base64;
  357   int (*writePNG)(void *stream, const char *data, int size);
  358   void *pngStream;
  359 };
  360 
  361 static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) {
  362   PNGWriteInfo *info = (PNGWriteInfo *)png_get_progressive_ptr(png);
  363   if (info->base64) {
  364     info->base64->encode(data, size);
  365   } else {
  366     info->writePNG(info->pngStream, (char *)data, (int)size);
  367   }
  368 }
  369 
  370 int HTMLGen::convertPage(
  371          int pg, const char *pngURL, const char *htmlDir,
  372          int (*writeHTML)(void *stream, const char *data, int size),
  373          void *htmlStream,
  374          int (*writePNG)(void *stream, const char *data, int size),
  375          void *pngStream) {
  376   png_structp png;
  377   png_infop pngInfo;
  378   PNGWriteInfo writeInfo;
  379   SplashBitmap *bitmap;
  380   Guchar *p;
  381   double pageW, pageH;
  382   TextPage *text;
  383   GList *cols, *pars, *lines, *words;
  384   TextFontInfo *font;
  385   TextColumn *col;
  386   TextParagraph *par;
  387   TextLine *line;
  388   HTMLGenFontDefn *fontDefn;
  389   GString *s;
  390   double base;
  391   int primaryDir, spanDir;
  392   int colIdx, parIdx, lineIdx, firstWordIdx, lastWordIdx;
  393   int y, i;
  394 
  395   // generate the background bitmap
  396   splashOut->setSkipText(!allTextInvisible, gFalse);
  397   doc->displayPage(splashOut, pg,
  398            backgroundResolution, backgroundResolution * vStretch,
  399            0, gFalse, gTrue, gFalse);
  400   bitmap = splashOut->getBitmap();
  401 
  402   // page size
  403   if (doc->getPageRotate(pg) == 90 || doc->getPageRotate(pg) == 270) {
  404     pageW = doc->getPageCropHeight(pg);
  405     pageH = doc->getPageCropWidth(pg);
  406   } else {
  407     pageW = doc->getPageCropWidth(pg);
  408     pageH = doc->getPageCropHeight(pg);
  409   }
  410 
  411   // get the PDF text
  412   doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse);
  413   doc->processLinks(textOut, pg);
  414   text = textOut->takeText();
  415   primaryDir = text->primaryDirectionIsLR() ? 1 : -1;
  416 
  417   // insert a special character for each form field;
  418   // remove existing characters inside field bboxes;
  419   // erase background content inside field bboxes
  420   formFieldFont = NULL;
  421   formFieldInfo = NULL;
  422   if (convertFormFields) {
  423     AcroForm *form = doc->getCatalog()->getForm();
  424     if (form) {
  425       formFieldInfo = new GList();
  426       formFieldFont = new TextFontInfo();
  427       double yTop = doc->getCatalog()->getPage(pg)->getMediaBox()->y2;
  428       for (i = 0; i < form->getNumFields(); ++i) {
  429     AcroFormField *field = form->getField(i);
  430     AcroFormFieldType fieldType = field->getAcroFormFieldType();
  431     if (field->getPageNum() == pg &&
  432         (fieldType == acroFormFieldText ||
  433          fieldType == acroFormFieldCheckbox)) {
  434       double llx, lly, urx, ury;
  435       field->getBBox(&llx, &lly, &urx, &ury);
  436       lly = yTop - lly;
  437       ury = yTop - ury;
  438 
  439       // add the field info
  440       int fieldIdx = formFieldInfo->getLength();
  441       formFieldInfo->append(new HTMLGenFormFieldInfo(field));
  442 
  443       // remove exsting chars
  444       text->removeChars(llx, ury, urx, lly, 0.75, 0.5);
  445     
  446       // erase background content
  447       int llxI = (int)(llx * backgroundResolution / 72 + 0.5);
  448       int llyI = (int)(lly * backgroundResolution * vStretch / 72 + 0.5);
  449       int urxI = (int)(urx * backgroundResolution / 72 + 0.5);
  450       int uryI = (int)(ury * backgroundResolution * vStretch / 72 + 0.5);
  451       llyI += (int)(backgroundResolution * vStretch / 20);
  452       if (llxI < 0) {
  453         llxI = 0;
  454       }
  455       if (urxI >= bitmap->getWidth()) {
  456         urxI = bitmap->getWidth() - 1;
  457       }
  458       if (uryI < 0) {
  459         uryI = 0;
  460       }
  461       if (llyI > bitmap->getHeight()) {
  462         llyI = bitmap->getHeight() - 1;
  463       }
  464       if (uryI <= llyI && llxI <= urxI) {
  465         SplashColorPtr p = bitmap->getDataPtr()
  466                              + uryI * bitmap->getRowSize() + llxI * 3;
  467         for (int y = uryI; y <= llyI; ++y) {
  468           memset(p, 0xff, (urxI - llxI + 1) * 3);
  469           p += bitmap->getRowSize();
  470         }
  471       }
  472 
  473       // add a special char
  474       // (the font size is unused -- 10 is an arbitrary value)
  475       text->addSpecialChar(llx, ury, urx, lly,
  476                    0, formFieldFont, 10, 0x80000000 + fieldIdx);
  477     }
  478       }
  479     }
  480   }
  481 
  482   // HTML header
  483   pr(writeHTML, htmlStream, "<html>\n");
  484   pr(writeHTML, htmlStream, "<head>\n");
  485   pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
  486   pr(writeHTML, htmlStream, "<style type=\"text/css\">\n");
  487   pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n");
  488   if (convertFormFields) {
  489     pr(writeHTML, htmlStream, ".textfield {\n");
  490     pr(writeHTML, htmlStream, "  border: 0;\n");
  491     pr(writeHTML, htmlStream, "  padding: 0;\n");
  492     pr(writeHTML, htmlStream, "  background: #ccccff;\n");
  493     pr(writeHTML, htmlStream, "}\n");
  494     pr(writeHTML, htmlStream, ".checkbox {\n");
  495     pr(writeHTML, htmlStream, "}\n");
  496   }
  497   fonts = text->getFonts();
  498   fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double));
  499   for (i = 0; i < fontDefns->getLength(); ++i) {
  500     fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
  501     fontDefn->used = gFalse;
  502   }
  503   for (i = 0; i < fonts->getLength(); ++i) {
  504     font = (TextFontInfo *)fonts->get(i);
  505     fontDefn = getFontDefn(font, htmlDir);
  506     if (!fontDefn->used && fontDefn->fontFace) {
  507       pr(writeHTML, htmlStream, fontDefn->fontFace->getCString());
  508     }
  509     pf(writeHTML, htmlStream, ".f{0:d} {{ {1:t} }}\n", i, fontDefn->fontSpec);
  510     fontScales[i] = fontDefn->scale;
  511     fontDefn->used = gTrue;
  512   }
  513   pr(writeHTML, htmlStream, "</style>\n");
  514   pr(writeHTML, htmlStream, "</head>\n");
  515   if (primaryDir >= 0) {
  516     pr(writeHTML, htmlStream, "<body>\n");
  517   } else {
  518     pr(writeHTML, htmlStream, "<body dir=\"rtl\">\n");
  519   }
  520 
  521   // background image element (part 1)
  522   if (primaryDir >= 0) {
  523     pf(writeHTML, htmlStream, "<img style=\"position:absolute; left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
  524        (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
  525   } else {
  526     pf(writeHTML, htmlStream, "<img style=\"position:absolute; right:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" ",
  527        (int)(pageW * zoom), (int)(pageH * zoom * vStretch));
  528   }
  529   if (embedBackgroundImage) {
  530     pr(writeHTML, htmlStream, "src=\"data:image/png;base64,\n");
  531     writeInfo.base64 = new Base64Encoder(writeHTML, htmlStream); 
  532     writeInfo.writePNG = NULL;
  533     writeInfo.pngStream = NULL;
  534   } else {
  535     pf(writeHTML, htmlStream, "src=\"{0:s}\"", pngURL);
  536     writeInfo.base64 = NULL;
  537     writeInfo.writePNG = writePNG;
  538     writeInfo.pngStream = pngStream;
  539   }
  540 
  541   // background image data - writing to a separate file, or embedding
  542   // with base64 encoding
  543   if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
  544                       NULL, NULL, NULL)) ||
  545       !(pngInfo = png_create_info_struct(png))) {
  546     return errFileIO;
  547   }
  548   if (setjmp(png_jmpbuf(png))) {
  549     return errFileIO;
  550   }
  551   png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL);
  552   png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(),
  553            8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
  554            PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
  555   png_write_info(png, pngInfo);
  556   p = bitmap->getDataPtr();
  557   for (y = 0; y < bitmap->getHeight(); ++y) {
  558     png_write_row(png, (png_bytep)p);
  559     p += bitmap->getRowSize();
  560   }
  561   png_write_end(png, pngInfo);
  562   png_destroy_write_struct(&png, &pngInfo);
  563   if (embedBackgroundImage) {
  564     writeInfo.base64->flush();
  565     delete writeInfo.base64;
  566   }
  567 
  568   // background image element (part 2)
  569   pr(writeHTML, htmlStream, "\">\n");
  570 
  571   // generate the HTML text
  572   nextFieldID = 0;
  573   cols = text->makeColumns();
  574   for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) {
  575     col = (TextColumn *)cols->get(colIdx);
  576     pars = col->getParagraphs();
  577     for (parIdx = 0; parIdx < pars->getLength(); ++parIdx) {
  578       par = (TextParagraph *)pars->get(parIdx);
  579       lines = par->getLines();
  580       for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) {
  581     line = (TextLine *)lines->get(lineIdx);
  582     if (line->getRotation() != 0) {
  583       continue;
  584     }
  585     words = line->getWords();
  586     if (lineIdx == 0 && par->hasDropCap() && words->getLength() >= 2) {
  587       base = ((TextWord *)words->get(1))->getBaseline();
  588     } else {
  589       base = line->getBaseline();
  590     }
  591     s = new GString();
  592     for (firstWordIdx = (primaryDir >= 0) ? 0 : words->getLength() - 1;
  593          (primaryDir >= 0) ? firstWordIdx < words->getLength()
  594                            : firstWordIdx >= 0;
  595          firstWordIdx = lastWordIdx + primaryDir) {
  596       lastWordIdx = findDirSpan(words, firstWordIdx,
  597                     primaryDir, &spanDir);
  598       appendSpans(words, firstWordIdx, lastWordIdx,
  599               primaryDir, spanDir,
  600               base, lineIdx == 0 && par->hasDropCap(),
  601               s);
  602     }
  603     if (primaryDir >= 0) {
  604       pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
  605          (int)(line->getXMin() * zoom),
  606          (int)(line->getYMin() * zoom * vStretch), s);
  607     } else {
  608       pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; right:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
  609          (int)((pageW - line->getXMax()) * zoom),
  610          (int)(line->getYMin() * zoom * vStretch), s);
  611     }
  612     delete s;
  613       }
  614     }
  615   }
  616   gfree(fontScales);
  617   delete text;
  618   deleteGList(cols, TextColumn);
  619   if (formFieldFont) {
  620     delete formFieldFont;
  621     formFieldFont = NULL;
  622   }
  623   if (formFieldInfo) {
  624     deleteGList(formFieldInfo, HTMLGenFormFieldInfo);
  625     formFieldInfo = NULL;
  626   }
  627 
  628   // HTML trailer
  629   pr(writeHTML, htmlStream, "</body>\n");
  630   pr(writeHTML, htmlStream, "</html>\n");
  631 
  632   return errNone;
  633 }
  634 
  635 // Find a sequence of words, starting at <firstWordIdx>, that have the
  636 // same writing direction.  Returns the index of the last word, and
  637 // sets *<spanDir> to the span direction.
  638 int HTMLGen::findDirSpan(GList *words, int firstWordIdx, int primaryDir,
  639              int *spanDir) {
  640   int dir0, dir1, nextWordIdx;
  641 
  642   dir0 = ((TextWord *)words->get(firstWordIdx))->getDirection();
  643   for (nextWordIdx = firstWordIdx + primaryDir;
  644        (primaryDir >= 0) ? nextWordIdx < words->getLength()
  645                      : nextWordIdx >= 0;
  646        nextWordIdx += primaryDir) {
  647     dir1 = ((TextWord *)words->get(nextWordIdx))->getDirection();
  648     if (dir0 == 0) {
  649       dir0 = dir1;
  650     } else if (dir1 != 0 && dir1 != dir0) {
  651       break;
  652     }
  653   }
  654 
  655   if (dir0 == 0) {
  656     *spanDir = primaryDir;
  657   } else {
  658     *spanDir = dir0;
  659   }
  660 
  661   return nextWordIdx - primaryDir;
  662 }
  663 
  664 // Create HTML spans for words <firstWordIdx> .. <lastWordIdx>, and
  665 // append them to <s>.
  666 void HTMLGen::appendSpans(GList *words, int firstWordIdx, int lastWordIdx,
  667               int primaryDir, int spanDir,
  668               double base, GBool dropCapLine, GString *s) {
  669   if (allTextInvisible && !drawInvisibleText) {
  670     return;
  671   }
  672 
  673   if (spanDir != primaryDir) {
  674     int t = firstWordIdx;
  675     firstWordIdx = lastWordIdx;
  676     lastWordIdx = t;
  677   }
  678 
  679   int wordIdx = firstWordIdx;
  680   while ((spanDir >= 0) ? wordIdx <= lastWordIdx
  681                     : wordIdx >= lastWordIdx) {
  682     TextWord *word0 = (TextWord *)words->get(wordIdx);
  683 
  684     // form field(s): generate <input> element(s)
  685     if (convertFormFields && word0->getFontInfo() == formFieldFont) {
  686       for (int i = (spanDir >= 0) ? 0 : word0->getLength() - 1;
  687        (spanDir >= 0) ? i < word0->getLength() : i >= 0;
  688        i += spanDir) {
  689     int fieldIdx = word0->getChar(0) - 0x80000000;
  690     if (fieldIdx >= 0 && fieldIdx < formFieldInfo->getLength()) {
  691       HTMLGenFormFieldInfo *ffi =
  692           (HTMLGenFormFieldInfo *)formFieldInfo->get(fieldIdx);
  693       AcroFormField *field = ffi->acroFormField;
  694       AcroFormFieldType fieldType = field->getAcroFormFieldType();
  695       double llx, lly, urx, ury;
  696       field->getBBox(&llx, &lly, &urx, &ury);
  697       int width = (int)(urx - llx);
  698       Ref fontID;
  699       double fontSize;
  700       field->getFont(&fontID, &fontSize);
  701       if (fontSize == 0) {
  702         fontSize = 12;
  703       }
  704       if (fieldType == acroFormFieldText) {
  705         s->appendf("<input type=\"text\" class=\"textfield\" id=\"textfield{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
  706         ++nextFieldID;
  707       } else if (fieldType == acroFormFieldCheckbox) {
  708         s->appendf("<input type=\"checkbox\" class=\"checkbox\" id=\"checkbox{0:d}\" style=\"width:{1:d}px; font-size:{2:d}px;\">", nextFieldID, width, (int)(fontSize + 0.5));
  709         ++nextFieldID;
  710       }
  711     }
  712       }
  713 
  714       if (word0->getSpaceAfter()) {
  715     s->append(' ');
  716       }
  717 
  718       wordIdx += spanDir;
  719 
  720     // skip invisible words
  721     } else if (!drawInvisibleText &&
  722            (word0->isInvisible() || word0->isRotated())) {
  723       wordIdx += spanDir;
  724 
  725     // generate a <span> containing one or more words
  726     } else {
  727 
  728       double r0 = 0, g0 = 0, b0 = 0; // make gcc happy
  729       VerticalAlignment vertAlign0 = vertAlignBaseline; // make gcc happy
  730       GString *linkURI0 = NULL;
  731 
  732       GBool invisible = word0->isInvisible() || word0->isRotated();
  733 
  734       do {
  735     TextWord *word1 = (TextWord *)words->get(wordIdx);
  736 
  737     // get word parameters
  738     double r1, g1, b1;
  739     word0->getColor(&r1, &g1, &b1);
  740     double base1 = word1->getBaseline();
  741     VerticalAlignment vertAlign1;
  742     if (dropCapLine) {
  743       //~ this will fail if there are subscripts or superscripts in
  744       //~   the first line of a paragraph with a drop cap
  745       vertAlign1 = vertAlignTop;
  746     } else if (base1 - base < -1) {
  747       vertAlign1 = vertAlignSuper;
  748     } else if (base1 - base > 1) {
  749       vertAlign1 = vertAlignSub;
  750     } else {
  751       vertAlign1 = vertAlignBaseline;
  752     }
  753     GString *linkURI1 = word1->getLinkURI();
  754 
  755     // start of span
  756     if (word1 == word0) {
  757       r0 = r1;
  758       g0 = g1;
  759       b0 = b1;
  760       vertAlign0 = vertAlign1;
  761       linkURI0 = linkURI1;
  762 
  763       int i;
  764       for (i = 0; i < fonts->getLength(); ++i) {
  765         if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) {
  766           break;
  767         }
  768       }
  769       if (linkURI1) {
  770         s->appendf("<a href=\"{0:t}\">", linkURI0);
  771       }
  772       // we force spans to be LTR or RTL; this is a kludge, but it's
  773       // far easier than implementing the full Unicode bidi algorithm
  774       const char *dirTag;
  775       if (spanDir == primaryDir) {
  776         dirTag = "";
  777       } else if (spanDir < 0) {
  778         dirTag = " dir=\"rtl\"";
  779       } else {
  780         dirTag = " dir=\"ltr\"";
  781       }
  782       s->appendf("<span class=\"f{0:d}\"{1:s} style=\"font-size:{2:d}px;vertical-align:{3:s};{4:s}color:rgba({5:d},{6:d},{7:d},{8:d});\">",
  783              i,
  784              dirTag,
  785              (int)(fontScales[i] * word1->getFontSize() * zoom),
  786              vertAlignNames[vertAlign1],
  787              (dropCapLine && wordIdx == 0) ? "line-height:75%;" : "",
  788              (int)(r0 * 255), (int)(g0 * 255), (int)(b0 * 255),
  789              invisible ? 0 : 1);
  790 
  791     // end of span
  792     } else if (word1->getFontInfo() != word0->getFontInfo() ||
  793            word1->getFontSize() != word0->getFontSize() ||
  794            word1->isInvisible() != word0->isInvisible() ||
  795            word1->isRotated() != word0->isRotated() ||
  796            vertAlign1 != vertAlign0 ||
  797            r1 != r0 || g1 != g0 || b1 != b0 ||
  798            linkURI1 != linkURI0) {
  799       break;
  800     }
  801 
  802     // add a space before the word, if needed
  803     // -- this only happens with the first word in a reverse section
  804     if (spanDir != primaryDir && wordIdx == firstWordIdx) {
  805       GBool sp;
  806       if (spanDir >= 0) {
  807         if (wordIdx > 0) {
  808           sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
  809         } else {
  810           sp = gFalse;
  811         }
  812       } else {
  813         sp = word1->getSpaceAfter();
  814       }
  815       if (sp) {
  816         s->append(' ');
  817       }
  818     }
  819 
  820     // generate the word text
  821     for (int i = (spanDir >= 0) ? 0 : word1->getLength() - 1;
  822          (spanDir >= 0) ? i < word1->getLength() : i >= 0;
  823          i += spanDir) {
  824       Unicode u = word1->getChar(i);
  825       if (u >= privateUnicodeMapStart &&
  826           u <= privateUnicodeMapEnd &&
  827           privateUnicodeMap[u - privateUnicodeMapStart]) {
  828         u = privateUnicodeMap[u - privateUnicodeMapStart];
  829       }
  830       appendUTF8(u, s);
  831     }
  832 
  833     // add a space after the word, if needed
  834     // -- there is never a space after the last word in a reverse
  835     //    section (this will be handled as a space after the last
  836     //    word in the previous primary-direction section)
  837     GBool sp;
  838     if (spanDir != primaryDir && wordIdx == lastWordIdx) {
  839       sp = gFalse;
  840     } else if (spanDir >= 0) {
  841       sp = word1->getSpaceAfter();
  842     } else {
  843       if (wordIdx > 0) {
  844         sp = ((TextWord *)words->get(wordIdx - 1))->getSpaceAfter();
  845       } else {
  846         sp = gFalse;
  847       }
  848     }
  849     if (sp) {
  850       s->append(' ');
  851     }
  852 
  853     wordIdx += spanDir;
  854       } while ((spanDir >= 0) ? wordIdx <= lastWordIdx
  855                           : wordIdx >= lastWordIdx);
  856 
  857       s->append("</span>");
  858       if (linkURI0) {
  859     s->append("</a>");
  860       }
  861     }
  862   }
  863 }
  864 
  865 void HTMLGen::appendUTF8(Unicode u, GString *s) {
  866   if (u <= 0x7f) {
  867     if (u == '&') {
  868       s->append("&amp;");
  869     } else if (u == '<') {
  870       s->append("&lt;");
  871     } else if (u == '>') {
  872       s->append("&gt;");
  873     } else {
  874       s->append((char)u);
  875     }
  876   } else if (u <= 0x7ff) {
  877     s->append((char)(0xc0 + (u >> 6)));
  878     s->append((char)(0x80 + (u & 0x3f)));
  879   } else if (u <= 0xffff) {
  880     s->append((char)(0xe0 + (u >> 12)));
  881     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  882     s->append((char)(0x80 + (u & 0x3f)));
  883   } else if (u <= 0x1fffff) {
  884     s->append((char)(0xf0 + (u >> 18)));
  885     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  886     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  887     s->append((char)(0x80 + (u & 0x3f)));
  888   } else if (u <= 0x3ffffff) {
  889     s->append((char)(0xf8 + (u >> 24)));
  890     s->append((char)(0x80 + ((u >> 18) & 0x3f)));
  891     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  892     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  893     s->append((char)(0x80 + (u & 0x3f)));
  894   } else if (u <= 0x7fffffff) {
  895     s->append((char)(0xfc + (u >> 30)));
  896     s->append((char)(0x80 + ((u >> 24) & 0x3f)));
  897     s->append((char)(0x80 + ((u >> 18) & 0x3f)));
  898     s->append((char)(0x80 + ((u >> 12) & 0x3f)));
  899     s->append((char)(0x80 + ((u >> 6) & 0x3f)));
  900     s->append((char)(0x80 + (u & 0x3f)));
  901   }
  902 }
  903 
  904 HTMLGenFontDefn *HTMLGen::getFontDefn(TextFontInfo *font,
  905                       const char *htmlDir) {
  906   Ref id;
  907   HTMLGenFontDefn *fontDefn;
  908   int i;
  909 
  910   // check the existing font defns
  911   id = font->getFontID();
  912   if (id.num >= 0) {
  913     for (i = 0; i < fontDefns->getLength(); ++i) {
  914       fontDefn = (HTMLGenFontDefn *)fontDefns->get(i);
  915       if (fontDefn->match(id)) {
  916     return fontDefn;
  917       }
  918     }
  919   }
  920 
  921   // try to extract a font file
  922   if (!extractFontFiles ||
  923       !(fontDefn = getFontFile(font, htmlDir))) {
  924 
  925     // get a substitute font
  926     fontDefn = getSubstituteFont(font);
  927   }
  928 
  929   fontDefns->append(fontDefn);
  930   return fontDefn;
  931 }
  932 
  933 HTMLGenFontDefn *HTMLGen::getFontFile(TextFontInfo *font,
  934                       const char *htmlDir) {
  935   Ref id;
  936   HTMLGenFontDefn *fontDefn;
  937   Object fontObj;
  938   GfxFont *gfxFont;
  939   WebFont *webFont;
  940   GString *fontFile, *fontPath, *fontFace, *fontSpec;
  941   const char *family, *weight, *style;
  942   double scale;
  943 
  944   id = font->getFontID();
  945   if (id.num < 0) {
  946     return NULL;
  947   }
  948 
  949   doc->getXRef()->fetch(id.num, id.gen, &fontObj);
  950   if (!fontObj.isDict()) {
  951     fontObj.free();
  952     return NULL;
  953   }
  954 
  955   gfxFont = GfxFont::makeFont(doc->getXRef(), "F", id, fontObj.getDict());
  956   webFont = new WebFont(gfxFont, doc->getXRef());
  957   fontDefn = NULL;
  958   fontFace = NULL;
  959 
  960   if (webFont->canWriteTTF()) {
  961     if (embedFonts) {
  962       GString *ttfData = webFont->getTTFData();
  963       if (ttfData) {
  964     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/ttf;base64,",
  965                    nextFontFaceIdx);
  966     Base64Encoder enc(writeToString, fontFace);
  967     enc.encode((unsigned char *)ttfData->getCString(),
  968            (size_t)ttfData->getLength());
  969     enc.flush();
  970     fontFace->append("\"); }\n");
  971     delete ttfData;
  972       }
  973     } else {
  974       fontFile = GString::format("{0:d}.ttf", nextFontFaceIdx);
  975       fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
  976       if (webFont->writeTTF(fontPath->getCString())) {
  977     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
  978                    nextFontFaceIdx, fontFile);
  979       }
  980       delete fontPath;
  981       delete fontFile;
  982     }
  983     if (fontFace) {
  984       getFontDetails(font, &family, &weight, &style, &scale);
  985       fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
  986                  nextFontFaceIdx, family, weight, style);
  987       ++nextFontFaceIdx;
  988       fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
  989     }
  990 
  991   } else if (webFont->canWriteOTF()) {
  992     if (embedFonts) {
  993       GString *otfData = webFont->getOTFData();
  994       if (otfData) {
  995     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"data:font/otf;base64,",
  996                    nextFontFaceIdx);
  997     Base64Encoder enc(writeToString, fontFace);
  998     enc.encode((unsigned char *)otfData->getCString(),
  999            (size_t)otfData->getLength());
 1000     enc.flush();
 1001     fontFace->append("\"); }\n");
 1002     delete otfData;
 1003       }
 1004     } else {
 1005       fontFile = GString::format("{0:d}.otf", nextFontFaceIdx);
 1006       fontPath = GString::format("{0:s}/{1:t}", htmlDir, fontFile);
 1007       if (webFont->writeOTF(fontPath->getCString())) {
 1008     fontFace = GString::format("@font-face {{ font-family: ff{0:d}; src: url(\"{1:t}\"); }}\n",
 1009                    nextFontFaceIdx, fontFile);
 1010       }
 1011       delete fontPath;
 1012       delete fontFile;
 1013     }
 1014     if (fontFace) {
 1015       getFontDetails(font, &family, &weight, &style, &scale);
 1016       fontSpec = GString::format("font-family:ff{0:d},{1:s}; font-weight:{2:s}; font-style:{3:s};",
 1017                  nextFontFaceIdx, family, weight, style);
 1018       ++nextFontFaceIdx;
 1019       fontDefn = new HTMLGenFontDefn(id, fontFace, fontSpec, 1.0);
 1020     }
 1021   }
 1022 
 1023   delete webFont;
 1024   delete gfxFont;
 1025   fontObj.free();
 1026 
 1027   return fontDefn;
 1028 }
 1029 
 1030 HTMLGenFontDefn *HTMLGen::getSubstituteFont(TextFontInfo *font) {
 1031   const char *family, *weight, *style;
 1032   double scale;
 1033   GString *fontSpec;
 1034 
 1035   getFontDetails(font, &family, &weight, &style, &scale);
 1036   fontSpec = GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};",
 1037                  family, weight, style);
 1038   return new HTMLGenFontDefn(font->getFontID(), NULL, fontSpec, scale);
 1039 }
 1040 
 1041 void HTMLGen::getFontDetails(TextFontInfo *font, const char **family,
 1042                  const char **weight, const char **style,
 1043                  double *scale) {
 1044   GString *fontName;
 1045   char *fontName2;
 1046   FontStyleTagInfo *fst;
 1047   StandardFontInfo *sf;
 1048   GBool fixedWidth, serif, bold, italic;
 1049   double s;
 1050   int n, i;
 1051 
 1052   // get the font name, remove any subset tag
 1053   fontName = font->getFontName();
 1054   if (fontName) {
 1055     fontName2 = fontName->getCString();
 1056     n = fontName->getLength();
 1057     for (i = 0; i < n && i < 7; ++i) {
 1058       if (fontName2[i] < 'A' || fontName2[i] > 'Z') {
 1059     break;
 1060       }
 1061     }
 1062     if (i == 6 && n > 7 && fontName2[6] == '+') {
 1063       fontName2 += 7;
 1064       n -= 7;
 1065     }
 1066   } else {
 1067     fontName2 = NULL;
 1068     n = 0;
 1069   }
 1070 
 1071   // get the style info from the font descriptor flags
 1072   fixedWidth = font->isFixedWidth();
 1073   serif = font->isSerif();
 1074   bold = font->isBold();
 1075   italic = font->isItalic();
 1076 
 1077   if (fontName2) {
 1078 
 1079     // look for a style tag at the end of the font name -- this
 1080     // overrides the font descriptor bold/italic flags
 1081     for (fst = fontStyleTags; fst->tag; ++fst) {
 1082       if (n > fst->tagLen &&
 1083       !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) {
 1084     bold = fst->bold;
 1085     italic = fst->italic;
 1086     n -= fst->tagLen;
 1087     if (n > 1 && (fontName2[n-1] == '-' ||
 1088               fontName2[n-1] == ',' ||
 1089               fontName2[n-1] == '.' ||
 1090               fontName2[n-1] == '_')) {
 1091       --n;
 1092     }
 1093     break;
 1094       }
 1095     }
 1096 
 1097     // look for a known font name -- this overrides the font descriptor
 1098     // fixedWidth/serif flags
 1099     for (sf = standardFonts; sf->name; ++sf) {
 1100       if (!strncasecmp(fontName2, sf->name, n)) {
 1101     fixedWidth = sf->fixedWidth;
 1102     serif = sf->serif;
 1103     break;
 1104       }
 1105     }
 1106   }
 1107 
 1108   // compute the scaling factor
 1109   *scale = 1;
 1110   if ((s = font->getMWidth())) {
 1111     i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0);
 1112     if (s < substFonts[i].mWidth) {
 1113       *scale = s / substFonts[i].mWidth;
 1114     }
 1115   }
 1116 
 1117   *family = fixedWidth ? "monospace" : serif ? "serif" : "sans-serif";
 1118   *weight = bold ? "bold" : "normal";
 1119   *style = italic ? "italic" : "normal";
 1120 }