PDFMarkedContentExtractor.java (pdfbox-2.0.23-src) | : | PDFMarkedContentExtractor.java (pdfbox-2.0.24-src) | ||
---|---|---|---|---|
skipping to change at line 43 | skipping to change at line 43 | |||
import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject; | import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject; | |||
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; | import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence; | |||
/** | /** | |||
* This is an stream engine to extract the marked content of a pdf. | * This is an stream engine to extract the marked content of a pdf. | |||
* | * | |||
* @author Johannes Koch | * @author Johannes Koch | |||
*/ | */ | |||
public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine | public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine | |||
{ | { | |||
private final boolean suppressDuplicateOverlappingText = true; | private boolean suppressDuplicateOverlappingText = true; | |||
private final List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>(); | private final List<PDMarkedContent> markedContents = new ArrayList<PDMarkedContent>(); | |||
private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<PDMarkedContent>(); | private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<PDMarkedContent>(); | |||
private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>(); | private final Map<String, List<TextPosition>> characterListMapping = new HashMap<String, List<TextPosition>>(); | |||
/** | /** | |||
* Instantiate a new PDFTextStripper object. | * Instantiate a new PDFTextStripper object. | |||
*/ | */ | |||
public PDFMarkedContentExtractor() throws IOException | public PDFMarkedContentExtractor() throws IOException | |||
{ | { | |||
this(null); | this(null); | |||
skipping to change at line 72 | skipping to change at line 72 | |||
{ | { | |||
addOperator(new BeginMarkedContentSequenceWithProperties()); | addOperator(new BeginMarkedContentSequenceWithProperties()); | |||
addOperator(new BeginMarkedContentSequence()); | addOperator(new BeginMarkedContentSequence()); | |||
addOperator(new EndMarkedContentSequence()); | addOperator(new EndMarkedContentSequence()); | |||
addOperator(new DrawObject()); | addOperator(new DrawObject()); | |||
// todo: DP - Marked Content Point | // todo: DP - Marked Content Point | |||
// todo: MP - Marked Content Point with Properties | // todo: MP - Marked Content Point with Properties | |||
} | } | |||
/** | /** | |||
* @return the suppressDuplicateOverlappingText setting. | ||||
*/ | ||||
public boolean isSuppressDuplicateOverlappingText() | ||||
{ | ||||
return suppressDuplicateOverlappingText; | ||||
} | ||||
/** | ||||
* By default the class will attempt to remove text that overlaps each other. Word paints the | ||||
* same character several times in order to make it look bold. By setting this to false all text | ||||
* will be extracted, which means that certain sections will be duplicated, but better | ||||
* performance will be noticed. | ||||
* | ||||
* @param suppressDuplicateOverlappingText The suppressDuplicateOverlappingText setting to set. | ||||
*/ | ||||
public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText) | ||||
{ | ||||
this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; | ||||
} | ||||
/** | ||||
* This will determine of two floating point numbers are within a specified variance. | * This will determine of two floating point numbers are within a specified variance. | |||
* | * | |||
* @param first The first number to compare to. | * @param first The first number to compare to. | |||
* @param second The second number to compare to. | * @param second The second number to compare to. | |||
* @param variance The allowed variance. | * @param variance The allowed variance. | |||
*/ | */ | |||
private boolean within( float first, float second, float variance ) | private boolean within( float first, float second, float variance ) | |||
{ | { | |||
return second > first - variance && second < first + variance; | return second > first - variance && second < first + variance; | |||
} | } | |||
skipping to change at line 159 | skipping to change at line 180 | |||
// of padding are applied, then backed off (not sure why this is done, but there | // of padding are applied, then backed off (not sure why this is done, but there | |||
// are cases where the padding is on the order of 10x the character width, and | // are cases where the padding is on the order of 10x the character width, and | |||
// the TJ just backs up to compensate after each character). Also, we subtract | // the TJ just backs up to compensate after each character). Also, we subtract | |||
// an amount to allow for kerning (a percentage of the width of the last | // an amount to allow for kerning (a percentage of the width of the last | |||
// character). | // character). | |||
// | // | |||
boolean suppressCharacter = false; | boolean suppressCharacter = false; | |||
float tolerance = (text.getWidth()/textCharacter.length())/3.0f; | float tolerance = (text.getWidth()/textCharacter.length())/3.0f; | |||
for (TextPosition sameTextCharacter : sameTextCharacters) | for (TextPosition sameTextCharacter : sameTextCharacters) | |||
{ | { | |||
TextPosition character = sameTextCharacter; | String charCharacter = sameTextCharacter.getUnicode(); | |||
String charCharacter = character.getUnicode(); | float charX = sameTextCharacter.getX(); | |||
float charX = character.getX(); | float charY = sameTextCharacter.getY(); | |||
float charY = character.getY(); | ||||
//only want to suppress | //only want to suppress | |||
if( charCharacter != null && | if( charCharacter != null && | |||
//charCharacter.equals( textCharacter ) && | //charCharacter.equals( textCharacter ) && | |||
within( charX, textX, tolerance ) && | within( charX, textX, tolerance ) && | |||
within( charY, | within( charY, | |||
textY, | textY, | |||
tolerance ) ) | tolerance ) ) | |||
{ | { | |||
suppressCharacter = true; | suppressCharacter = true; | |||
break; | break; | |||
End of changes. 3 change blocks. | ||||
5 lines changed or deleted | 31 lines changed or added |