"Fossies" - the Fresh Open Source Software Archive

Member "tesseract-ocr/doc/html/mftraining_8cpp.html" (26 Oct 2012, 33788 Bytes) of package /linux/misc/old/tesseract-ocr-3.02.02-doc-html.tar.gz:

Caution: In this restricted "Fossies" environment the current HTML page may not be presented correctly and may have some non-functional links. Alternatively, you can try to browse the pure source code or just view or download the uninterpreted raw source code. If the rendering is insufficient, you may try to find and view the page on the tesseract-ocr-3.02.02-doc-html.tar.gz project site itself.

Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mftraining.cpp File Reference
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "danerror.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "genericvector.h"
#include "indexmapbidi.h"
#include "intproto.h"
#include "mastertrainer.h"
#include "mergenf.h"
#include "mf.h"
#include "ndminx.h"
#include "ocrfeatures.h"
#include "oldlist.h"
#include "protos.h"
#include "shapetable.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"

Go to the source code of this file.




int main (int argc, char **argv)


const int kMaxShapeLabelLength = 10

Macro Definition Documentation


Include Files and Type Defines —————————————————————————-

Definition at line 31 of file mftraining.cpp.


Definition at line 76 of file mftraining.cpp.

Function Documentation

int main ( int  argc,
char **  argv )

Public Function Prototypes —————————————————————————-

Definition at line 50 of file tesseractmain.cpp.

setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
if ((argc == 2 && strcmp(argv[1], "-v") == 0) ||
(argc == 2 && strcmp(argv[1], "--version") == 0)) {
char *versionStrP;
fprintf(stderr, "tesseract %s\n", tesseract::TessBaseAPI::Version());
versionStrP = getLeptonicaVersion();
fprintf(stderr, " %s\n", versionStrP);
versionStrP = getImagelibVersions();
fprintf(stderr, " %s\n", versionStrP);
STRING tessdata_dir;
truncate_path(argv[0], &tessdata_dir);
int rc = api.Init(tessdata_dir.string(), NULL);
if (rc) {
fprintf(stderr, _("Could not initialize tesseract.\n"));
if (argc == 2 && strcmp(argv[1], "--list-langs") == 0) {
fprintf(stderr, _("List of available languages (%d):\n"), languages.size());
for (int index = 0; index < languages.size(); ++index) {
STRING& string = languages[index];
fprintf(stderr, "%s\n", string.string());
// Make the order of args a bit more forgiving than it used to be.
const char* lang = "eng";
const char* image = NULL;
const char* output = NULL;
int arg = 1;
while (arg < argc && (output == NULL || argv[arg][0] == '-')) {
if (strcmp(argv[arg], "-l") == 0 && arg + 1 < argc) {
lang = argv[arg + 1];
} else if (strcmp(argv[arg], "-psm") == 0 && arg + 1 < argc) {
pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[arg + 1]));
} else if (image == NULL) {
image = argv[arg];
} else if (output == NULL) {
output = argv[arg];
if (output == NULL) {
fprintf(stderr, _("Usage:%s imagename outputbase [-l lang] "
"[-psm pagesegmode] [configfile...]\n\n"), argv[0]);
_("pagesegmode values are:\n"
"0 = Orientation and script detection (OSD) only.\n"
"1 = Automatic page segmentation with OSD.\n"
"2 = Automatic page segmentation, but no OSD, or OCR\n"
"3 = Fully automatic page segmentation, but no OSD. (Default)\n"
"4 = Assume a single column of text of variable sizes.\n"
"5 = Assume a single uniform block of vertically aligned text.\n"
"6 = Assume a single uniform block of text.\n"
"7 = Treat the image as a single text line.\n"
"8 = Treat the image as a single word.\n"
"9 = Treat the image as a single word in a circle.\n"
"10 = Treat the image as a single character.\n"));
fprintf(stderr, _("-l lang and/or -psm pagesegmode must occur before any"
fprintf(stderr, _("Single options:\n"));
fprintf(stderr, _(" -v --version: version info\n"));
fprintf(stderr, _(" --list-langs: list available languages for tesseract "
rc = api.Init(tessdata_dir.string(), lang, tesseract::OEM_DEFAULT,
&(argv[arg]), argc - arg, NULL, NULL, false);
if (rc) {
fprintf(stderr, _("Could not initialize tesseract.\n"));
// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatibility reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default
// can take priority over the tesseract default, so we use the
// value from the command line only if the retrieved mode
// is still tesseract::PSM_SINGLE_BLOCK, indicating no change
// in any config file. Therefore the only way to force
// tesseract::PSM_SINGLE_BLOCK is from the command line.
// It would be simpler if we could set the value before Init,
// but that doesn't work.
tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n",
FILE* fin = fopen(image, "rb");
if (fin == NULL) {
fprintf(stderr, _("Cannot open input file: %s\n"), image);
PIX *pixs;
if ((pixs = pixRead(image)) == NULL) {
fprintf(stderr, _("Unsupported image type.\n"));
if (!api.ProcessPages(image, NULL, 0, &text_out)) {
fprintf(stderr, _("Error during processing.\n"));
bool output_hocr = false;
api.GetBoolVariable("tessedit_create_hocr", &output_hocr);
bool output_box = false;
api.GetBoolVariable("tessedit_create_boxfile", &output_box);
STRING outfile = output;
outfile += output_hocr ? ".html" : output_box ? ".box" : ".txt";
FILE* fout = fopen(outfile.string(), "wb");
if (fout == NULL) {
fprintf(stderr, _("Cannot create output file %s\n"), outfile.string());
fwrite(text_out.string(), 1, text_out.length(), fout);
return 0; // Normal exit

Variable Documentation

const int kMaxShapeLabelLength = 10

Definition at line 79 of file mftraining.cpp.