w32tex
About: TeX Live provides a comprehensive TeX system including all the major TeX-related programs, macro packages, and fonts that are free software. Windows sources.
  Fossies Dox: w32tex-src.tar.xz  ("unofficial" and yet experimental doxygen-generated source code documentation)  

pdftotext.cc File Reference
#include "config.h"
#include <poppler-config.h>
#include <cstdio>
#include <cstdlib>
#include <cstddef>
#include <cstring>
#include "parseargs.h"
#include "printencodings.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "PDFDocFactory.h"
#include "TextOutputDev.h"
#include "CharTypes.h"
#include "UnicodeMap.h"
#include "PDFDocEncoding.h"
#include "Error.h"
#include <string>
#include <sstream>
#include <iomanip>
#include "Win32Console.h"
Include dependency graph for pdftotext.cc:

Go to the source code of this file.

Functions

static void printInfoString (FILE *f, Dict *infoDict, const char *key, const char *text1, const char *text2, const UnicodeMap *uMap)
 
static void printInfoDate (FILE *f, Dict *infoDict, const char *key, const char *fmt)
 
void printDocBBox (FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last)
 
void printWordBBox (FILE *f, PDFDoc *doc, TextOutputDev *textOut, int first, int last)
 
static std::string myStringReplace (const std::string &inString, const std::string &oldToken, const std::string &newToken)
 
static std::string myXmlTokenReplace (const char *inString)
 
int main (int argc, char *argv[])
 
static void printLine (FILE *f, const TextLine *line)
 

Variables

static int firstPage = 1
 
static int lastPage = 0
 
static double resolution = 72.0
 
static int x = 0
 
static int y = 0
 
static int w = 0
 
static int h = 0
 
static bool bbox = false
 
static bool bboxLayout = false
 
static bool physLayout = false
 
static bool useCropBox = false
 
static double fixedPitch = 0
 
static bool rawOrder = false
 
static bool discardDiag = false
 
static bool htmlMeta = false
 
static char textEncName [128] = ""
 
static char textEOLStr [16] = ""
 
static bool noPageBreaks = false
 
static char ownerPassword [33] = "\001"
 
static char userPassword [33] = "\001"
 
static bool quiet = false
 
static bool printVersion = false
 
static bool printHelp = false
 
static bool printEnc = false
 
static const ArgDesc argDesc []
 

Function Documentation

◆ main()

◆ myStringReplace()

static std::string myStringReplace ( const std::string &  inString,
const std::string &  oldToken,
const std::string &  newToken 
)
static

Definition at line 130 of file pdftotext.cc.

References advance, check-static-inits::result, and string.

Referenced by myXmlTokenReplace().

◆ myXmlTokenReplace()

static std::string myXmlTokenReplace ( const char *  inString)
static

Definition at line 145 of file pdftotext.cc.

References myStringReplace(), and string.

Referenced by printInfoString(), printLine(), and printWordBBox().

◆ printDocBBox()

◆ printInfoDate()

static void printInfoDate ( FILE *  f,
Dict *  infoDict,
const char *  key,
const char *  fmt 
)
static

Definition at line 450 of file pdftotext.cc.

References f, fmt, fprintf, Object::getString(), Object::isString(), key, Dict::lookup(), and s.

Referenced by main().

◆ printInfoString()

static void printInfoString ( FILE *  f,
Dict *  infoDict,
const char *  key,
const char *  text1,
const char *  text2,
const UnicodeMap *  uMap 
)
static

◆ printLine()

static void printLine ( FILE *  f,
const TextLine *  line 
)
static

Definition at line 462 of file pdftotext.cc.

References f, fprintf, fputs, myXmlTokenReplace(), and string.

Referenced by error(), printDocBBox(), and warning().

◆ printWordBBox()

Variable Documentation

◆ argDesc

const ArgDesc argDesc[]
static
Initial value:
= { { "-f", argInt, &firstPage, 0, "first page to convert" },
{ "-l", argInt, &lastPage, 0, "last page to convert" },
{ "-r", argFP, &resolution, 0, "resolution, in DPI (default is 72)" },
{ "-x", argInt, &x, 0, "x-coordinate of the crop area top left corner" },
{ "-y", argInt, &y, 0, "y-coordinate of the crop area top left corner" },
{ "-W", argInt, &w, 0, "width of crop area in pixels (default is 0)" },
{ "-H", argInt, &h, 0, "height of crop area in pixels (default is 0)" },
{ "-layout", argFlag, &physLayout, 0, "maintain original physical layout" },
{ "-fixed", argFP, &fixedPitch, 0, "assume fixed-pitch (or tabular) text" },
{ "-raw", argFlag, &rawOrder, 0, "keep strings in content stream order" },
{ "-nodiag", argFlag, &discardDiag, 0, "discard diagonal text" },
{ "-htmlmeta", argFlag, &htmlMeta, 0, "generate a simple HTML file, including the meta information" },
{ "-enc", argString, textEncName, sizeof(textEncName), "output text encoding name" },
{ "-listenc", argFlag, &printEnc, 0, "list available encodings" },
{ "-eol", argString, textEOLStr, sizeof(textEOLStr), "output end-of-line convention (unix, dos, or mac)" },
{ "-nopgbrk", argFlag, &noPageBreaks, 0, "don't insert page breaks between pages" },
{ "-bbox", argFlag, &bbox, 0, "output bounding box for each word and page size to html. Sets -htmlmeta" },
{ "-bbox-layout", argFlag, &bboxLayout, 0, "like -bbox but with extra layout bounding box data. Sets -htmlmeta" },
{ "-cropbox", argFlag, &useCropBox, 0, "use the crop box rather than media box" },
{ "-opw", argString, ownerPassword, sizeof(ownerPassword), "owner password (for encrypted files)" },
{ "-upw", argString, userPassword, sizeof(userPassword), "user password (for encrypted files)" },
{ "-q", argFlag, &quiet, 0, "don't print any messages or errors" },
{ "-v", argFlag, &printVersion, 0, "print copyright and version info" },
{ "-h", argFlag, &printHelp, 0, "print usage information" },
{ "-help", argFlag, &printHelp, 0, "print usage information" },
{ "--help", argFlag, &printHelp, 0, "print usage information" },
{ "-?", argFlag, &printHelp, 0, "print usage information" },
{} }
@ argFlag
Definition: parseargs.h:22
@ argInt
Definition: parseargs.h:24
@ argString
Definition: parseargs.h:28
@ argFP
Definition: parseargs.h:26
static int y
Definition: pdftotext.cc:80
static double resolution
Definition: pdftotext.cc:78
static bool physLayout
Definition: pdftotext.cc:85
static int h
Definition: pdftotext.cc:82
static bool noPageBreaks
Definition: pdftotext.cc:93
static char textEncName[128]
Definition: pdftotext.cc:91
static bool discardDiag
Definition: pdftotext.cc:89
static int x
Definition: pdftotext.cc:79
static bool bbox
Definition: pdftotext.cc:83
static bool printVersion
Definition: pdftotext.cc:97
static bool bboxLayout
Definition: pdftotext.cc:84
static int lastPage
Definition: pdftotext.cc:77
static bool printEnc
Definition: pdftotext.cc:99
static bool useCropBox
Definition: pdftotext.cc:86
static bool printHelp
Definition: pdftotext.cc:98
static int w
Definition: pdftotext.cc:81
static char ownerPassword[33]
Definition: pdftotext.cc:94
static double fixedPitch
Definition: pdftotext.cc:87
static int firstPage
Definition: pdftotext.cc:76
static bool quiet
Definition: pdftotext.cc:96
static char textEOLStr[16]
Definition: pdftotext.cc:92
static char userPassword[33]
Definition: pdftotext.cc:95
static bool htmlMeta
Definition: pdftotext.cc:90
static bool rawOrder
Definition: pdftotext.cc:88

Definition at line 101 of file pdftotext.cc.

Referenced by main().

◆ bbox

bool bbox = false
static

Definition at line 83 of file pdftotext.cc.

◆ bboxLayout

bool bboxLayout = false
static

Definition at line 84 of file pdftotext.cc.

Referenced by main().

◆ discardDiag

bool discardDiag = false
static

Definition at line 89 of file pdftotext.cc.

Referenced by main().

◆ firstPage

int firstPage = 1
static

Definition at line 76 of file pdftotext.cc.

Referenced by main().

◆ fixedPitch

double fixedPitch = 0
static

Definition at line 87 of file pdftotext.cc.

Referenced by main().

◆ h

int h = 0
static

Definition at line 82 of file pdftotext.cc.

Referenced by main().

◆ htmlMeta

bool htmlMeta = false
static

Definition at line 90 of file pdftotext.cc.

Referenced by main().

◆ lastPage

int lastPage = 0
static

Definition at line 77 of file pdftotext.cc.

Referenced by main().

◆ noPageBreaks

bool noPageBreaks = false
static

Definition at line 93 of file pdftotext.cc.

Referenced by main().

◆ ownerPassword

char ownerPassword[33] = "\001"
static

Definition at line 94 of file pdftotext.cc.

Referenced by main().

◆ physLayout

bool physLayout = false
static

Definition at line 85 of file pdftotext.cc.

Referenced by main().

◆ printEnc

bool printEnc = false
static

Definition at line 99 of file pdftotext.cc.

Referenced by main().

◆ printHelp

bool printHelp = false
static

Definition at line 98 of file pdftotext.cc.

Referenced by main().

◆ printVersion

bool printVersion = false
static

Definition at line 97 of file pdftotext.cc.

Referenced by main().

◆ quiet

bool quiet = false
static

Definition at line 96 of file pdftotext.cc.

Referenced by main().

◆ rawOrder

bool rawOrder = false
static

Definition at line 88 of file pdftotext.cc.

Referenced by main().

◆ resolution

double resolution = 72.0
static

Definition at line 78 of file pdftotext.cc.

Referenced by main(), printDocBBox(), and printWordBBox().

◆ textEncName

char textEncName[128] = ""
static

Definition at line 91 of file pdftotext.cc.

Referenced by main().

◆ textEOLStr

char textEOLStr[16] = ""
static

Definition at line 92 of file pdftotext.cc.

Referenced by main().

◆ useCropBox

bool useCropBox = false
static

Definition at line 86 of file pdftotext.cc.

Referenced by printDocBBox(), and printWordBBox().

◆ userPassword

char userPassword[33] = "\001"
static

Definition at line 95 of file pdftotext.cc.

Referenced by main().

◆ w

int w = 0
static

Definition at line 81 of file pdftotext.cc.

Referenced by main().

◆ x

int x = 0
static

Definition at line 79 of file pdftotext.cc.

Referenced by main().

◆ y

int y = 0
static

Definition at line 80 of file pdftotext.cc.

Referenced by main().