"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "unittest/resultiterator_test.cc" between
tesseract-5.0.0-rc2.tar.gz and tesseract-5.0.0-rc3.tar.gz

About: Tesseract is an Optical Character Recognition (OCR) engine. Release candidate.

resultiterator_test.cc  (tesseract-5.0.0-rc2):resultiterator_test.cc  (tesseract-5.0.0-rc3)
skipping to change at line 133 skipping to change at line 133
if (it->IsAtFinalElement(tesseract::RIL_PARA, level) && if (it->IsAtFinalElement(tesseract::RIL_PARA, level) &&
!(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) { !(it->IsAtFinalElement(tesseract::RIL_BLOCK, level))) {
result += '\n'; result += '\n';
} }
} }
} while (it->Next(level)); } while (it->Next(level));
EXPECT_STREQ(truth.c_str(), result.c_str()) << "Rebuild failed at Text Level " << level; EXPECT_STREQ(truth.c_str(), result.c_str()) << "Rebuild failed at Text Level " << level;
} }
void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit, void VerifyRebuilds(int block_limit, int para_limit, int line_limit, int word_limit,
int symbol_limit, PageIterator *it) { int symbol_limit, PageIterator *it, PageIteratorLevel maxlevel=tesseract::RIL_SYMBOL) {
VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it); VerifyRebuild(block_limit, tesseract::RIL_BLOCK, it);
VerifyRebuild(para_limit, tesseract::RIL_PARA, it); VerifyRebuild(para_limit, tesseract::RIL_PARA, it);
VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it); VerifyRebuild(line_limit, tesseract::RIL_TEXTLINE, it);
VerifyRebuild(word_limit, tesseract::RIL_WORD, it); VerifyRebuild(word_limit, tesseract::RIL_WORD, it);
VerifyRebuild(symbol_limit, tesseract::RIL_SYMBOL, it); if (maxlevel == tesseract::RIL_SYMBOL) {
VerifyRebuild(symbol_limit, maxlevel, it);
}
} }
void VerifyAllText(const std::string &truth, ResultIterator *it) { void VerifyAllText(const std::string &truth, ResultIterator *it) {
VerifyIteratorText(truth, tesseract::RIL_BLOCK, it); VerifyIteratorText(truth, tesseract::RIL_BLOCK, it);
VerifyIteratorText(truth, tesseract::RIL_PARA, it); VerifyIteratorText(truth, tesseract::RIL_PARA, it);
VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it); VerifyIteratorText(truth, tesseract::RIL_TEXTLINE, it);
VerifyIteratorText(truth, tesseract::RIL_WORD, it); VerifyIteratorText(truth, tesseract::RIL_WORD, it);
VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it); VerifyIteratorText(truth, tesseract::RIL_SYMBOL, it);
} }
skipping to change at line 280 skipping to change at line 282
VerifyRebuilds(10, 10, 0, 0, 0, p_it); VerifyRebuilds(10, 10, 0, 0, 0, p_it);
delete p_it; delete p_it;
char *result = api_.GetUTF8Text(); char *result = api_.GetUTF8Text();
ocr_text_ = result; ocr_text_ = result;
delete[] result; delete[] result;
ResultIterator *r_it = api_.GetIterator(); ResultIterator *r_it = api_.GetIterator();
// The images should rebuild almost perfectly. // The images should rebuild almost perfectly.
LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)" LOG(INFO) << "Verifying image rebuilds 2a (resultiterator)"
<< "\n"; << "\n";
VerifyRebuilds(8, 8, 0, 0, 40, r_it); VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);
// Test the text. // Test the text.
LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)" LOG(INFO) << "Verifying text rebuilds 1 (resultiterator)"
<< "\n"; << "\n";
VerifyAllText(ocr_text_, r_it); VerifyAllText(ocr_text_, r_it);
// The images should rebuild almost perfectly. // The images should rebuild almost perfectly.
LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)" LOG(INFO) << "Verifying image rebuilds 2b (resultiterator)"
<< "\n"; << "\n";
VerifyRebuilds(8, 8, 0, 0, 40, r_it); VerifyRebuilds(8, 8, 0, 0, 40, r_it, tesseract::RIL_WORD);
r_it->Begin(); r_it->Begin();
// Test baseline of the first line. // Test baseline of the first line.
int x1, y1, x2, y2; int x1, y1, x2, y2;
r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2); r_it->Baseline(tesseract::RIL_TEXTLINE, &x1, &y1, &x2, &y2);
LOG(INFO) << "Baseline (" LOG(INFO) << "Baseline ("
<< x1 << ',' << y1 << ")->(" << x2 << ',' << y2 << ")\n"; << x1 << ',' << y1 << ")->(" << x2 << ',' << y2 << ")\n";
// Make sure we have a decent vector. // Make sure we have a decent vector.
EXPECT_GE(x2, x1 + 400); EXPECT_GE(x2, x1 + 400);
// The point 200,116 should be very close to the baseline. // The point 200,116 should be very close to the baseline.
skipping to change at line 311 skipping to change at line 313
int x3 = 200 - x1; int x3 = 200 - x1;
int y3 = 116 - y1; int y3 = 116 - y1;
x2 -= x1; x2 -= x1;
y2 -= y1; y2 -= y1;
// The cross product (x2,y1)x(x3,y3) should be small. // The cross product (x2,y1)x(x3,y3) should be small.
int product = x2 * y3 - x3 * y2; int product = x2 * y3 - x3 * y2;
EXPECT_LE(abs(product), x2); EXPECT_LE(abs(product), x2);
// Test font attributes for each word. // Test font attributes for each word.
do { do {
bool bold, italic, underlined, monospace, serif, smallcaps; float confidence = r_it->Confidence(tesseract::RIL_WORD);
#ifndef DISABLED_LEGACY_ENGINE
int pointsize, font_id; int pointsize, font_id;
bool bold, italic, underlined, monospace, serif, smallcaps;
const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, const char *font = r_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif,
&smallcaps, &pointsize, &font_id); &smallcaps, &pointsize, &font_id);
float confidence = r_it->Confidence(tesseract::RIL_WORD);
EXPECT_GE(confidence, 80.0f); EXPECT_GE(confidence, 80.0f);
#endif
char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD); char *word_str = r_it->GetUTF8Text(tesseract::RIL_WORD);
#ifdef DISABLED_LEGACY_ENGINE
LOG(INFO) << "Word " << word_str << ", conf " << confidence << "\n";
#else
LOG(INFO) << "Word " << word_str << " in font " << font LOG(INFO) << "Word " << word_str << " in font " << font
<< ", id " << font_id << ", size " << pointsize << ", id " << font_id << ", size " << pointsize
<< ", conf " << confidence << "\n"; << ", conf " << confidence << "\n";
#endif // def DISABLED_LEGACY_ENGINE
delete[] word_str; delete[] word_str;
#ifndef DISABLED_LEGACY_ENGINE
EXPECT_FALSE(bold); EXPECT_FALSE(bold);
EXPECT_FALSE(italic); EXPECT_FALSE(italic);
EXPECT_FALSE(underlined); EXPECT_FALSE(underlined);
EXPECT_FALSE(monospace); EXPECT_FALSE(monospace);
EXPECT_FALSE(serif); EXPECT_FALSE(serif);
// The text is about 31 pixels high. Above we say the source is 200 ppi, // The text is about 31 pixels high. Above we say the source is 200 ppi,
// which translates to: // which translates to:
// 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts // 31 pixels / textline * (72 pts / inch) / (200 pixels / inch) = 11.16 pts
EXPECT_GE(pointsize, 11.16 - 1.50); EXPECT_GE(pointsize, 11.16 - 1.50);
EXPECT_LE(pointsize, 11.16 + 1.50); EXPECT_LE(pointsize, 11.16 + 1.50);
#endif // def DISABLED_LEGACY_ENGINE
} while (r_it->Next(tesseract::RIL_WORD)); } while (r_it->Next(tesseract::RIL_WORD));
delete r_it; delete r_it;
} }
// Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik) // Tests image rebuild on the UNLV page numbered 8087_054.3B.tif. (Dubrovnik)
TEST_F(ResultIteratorTest, ComplexTest) { TEST_F(ResultIteratorTest, ComplexTest) {
SetImage("8087_054.3B.tif"); SetImage("8087_054.3B.tif");
// Just run layout analysis. // Just run layout analysis.
PageIterator *it = api_.AnalyseLayout(); PageIterator *it = api_.AnalyseLayout();
EXPECT_FALSE(it == nullptr); EXPECT_FALSE(it == nullptr);
skipping to change at line 360 skipping to change at line 371
// Just run layout analysis. // Just run layout analysis.
PageIterator *it = api_.AnalyseLayout(); PageIterator *it = api_.AnalyseLayout();
EXPECT_FALSE(it == nullptr); EXPECT_FALSE(it == nullptr);
// The images should rebuild almost perfectly. // The images should rebuild almost perfectly.
VerifyRebuilds(600, 600, 600, 600, 600, it); VerifyRebuilds(600, 600, 600, 600, 600, it);
delete it; delete it;
} }
// Tests that Tesseract gets smallcaps and dropcaps. // Tests that Tesseract gets smallcaps and dropcaps.
TEST_F(ResultIteratorTest, SmallCapDropCapTest) { TEST_F(ResultIteratorTest, SmallCapDropCapTest) {
#ifdef DISABLED_LEGACY_ENGINE
// Skip test as LSTM mode does not recognize smallcaps & dropcaps attributes.
GTEST_SKIP();
#else
SetImage("8071_093.3B.tif"); SetImage("8071_093.3B.tif");
char *result = api_.GetUTF8Text(); char *result = api_.GetUTF8Text();
delete[] result; delete[] result;
ResultIterator *r_it = api_.GetIterator(); ResultIterator *r_it = api_.GetIterator();
// Iterate over the words. // Iterate over the words.
int found_dropcaps = 0; int found_dropcaps = 0;
int found_smallcaps = 0; int found_smallcaps = 0;
int false_positives = 0; int false_positives = 0;
do { do {
bool bold, italic, underlined, monospace, serif, smallcaps; bool bold, italic, underlined, monospace, serif, smallcaps;
skipping to change at line 407 skipping to change at line 422
} }
EXPECT_FALSE(s_it.SymbolIsDropcap()); EXPECT_FALSE(s_it.SymbolIsDropcap());
} }
delete[] word_str; delete[] word_str;
} }
} while (r_it->Next(tesseract::RIL_WORD)); } while (r_it->Next(tesseract::RIL_WORD));
delete r_it; delete r_it;
EXPECT_EQ(1, found_dropcaps); EXPECT_EQ(1, found_dropcaps);
EXPECT_GE(4, found_smallcaps); EXPECT_GE(4, found_smallcaps);
EXPECT_LE(false_positives, 3); EXPECT_LE(false_positives, 3);
#endif // DISABLED_LEGACY_ENGINE
} }
#if 0 #if 0
// TODO(rays) uncomment on the next change to layout analysis. // TODO(rays) uncomment on the next change to layout analysis.
// CL 22736106 breaks it, but it is fixed in the change when // CL 22736106 breaks it, but it is fixed in the change when
// the textline finders start to collapse. // the textline finders start to collapse.
// Tests that Tesseract gets subscript and superscript. // Tests that Tesseract gets subscript and superscript.
// TODO(rays) This test is a bit feeble, due to bad textline finding on this // TODO(rays) This test is a bit feeble, due to bad textline finding on this
// image, so beef up the test a bit when we get less false positive subs. // image, so beef up the test a bit when we get less false positive subs.
 End of changes. 14 change blocks. 
6 lines changed or deleted 22 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)