"Fossies" - the Fresh Open Source Software Archive  

Source code changes of the file "src/main/java/com/openkm/extractor/RegisteredExtractors.java" between
OpenKM-document-management-system-6.3.10.tar.gz and OpenKM-document-management-system-6.3.11.tar.gz

About: OpenKM (Knowledge Management) is a document management system that makes it easy to manage documents, users, and roles, and to find your enterprise documents and records. Community version (source code).

RegisteredExtractors.java  (OpenKM-document-management-system-6.3.10):RegisteredExtractors.java  (OpenKM-document-management-system-6.3.11)
skipping to change at line 30 skipping to change at line 30
*/ */
package com.openkm.extractor; package com.openkm.extractor;
import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch; import com.ibm.icu.text.CharsetMatch;
import com.openkm.core.Config; import com.openkm.core.Config;
import com.openkm.core.DatabaseException; import com.openkm.core.DatabaseException;
import com.openkm.core.PathNotFoundException; import com.openkm.core.PathNotFoundException;
import com.openkm.dao.NodeBaseDAO; import com.openkm.dao.NodeBaseDAO;
import com.openkm.dao.PluginDAO;
import com.openkm.module.db.stuff.PersistentFile; import com.openkm.module.db.stuff.PersistentFile;
import com.openkm.util.PluginUtils;
import com.openkm.util.SystemProfiling; import com.openkm.util.SystemProfiling;
import com.openkm.util.UserActivity; import com.openkm.util.UserActivity;
import net.xeoh.plugins.base.Plugin;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap; import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Map;
/** /**
* @author pavila * @author pavila
*/ */
public class RegisteredExtractors { public class RegisteredExtractors {
private static Logger log = LoggerFactory.getLogger(RegisteredExtractors. private static final Logger log = LoggerFactory.getLogger(RegisteredExtra
class); ctors.class);
private static Map<String, TextExtractor> engine = new HashMap<String, Te public static final String PLUGIN_URI = "classpath://com.openkm.extractor
xtExtractor>(); .**";
private static List<TextExtractor> extractorList = null;
private static final int MIN_EXTRACTION = 16; private static final int MIN_EXTRACTION = 16;
/** /**
* Initialize text extractors from REGISTERED_TEXT_EXTRACTORS * Enable default extractors
*/ */
public static synchronized void init() { public static void enableDefaultExtractors() throws URISyntaxException, D
log.info("Initializing text extractors"); atabaseException {
URI uri = new URI(PLUGIN_URI);
for (String clazz : Config.REGISTERED_TEXT_EXTRACTORS) { for (TextExtractor te : PluginUtils.getAllPlugins(uri, TextExtrac
try { tor.class)) {
Object obj = Class.forName(clazz).newInstance(); com.openkm.dao.bean.Plugin plugin = new com.openkm.dao.be
an.Plugin();
if (obj instanceof TextExtractor) { plugin.setClassName(te.getClass().getCanonicalName());
TextExtractor te = (TextExtractor) obj; plugin.setActive(false);
for (String contType : te.getContentTypes for (String className : Config.REGISTERED_TEXT_EXTRACTORS
()) { ) {
log.info("Registering {} for '{}' if (te.getClass().getSimpleName().equals(classNam
", te.getClass().getCanonicalName(), contType); e)) {
engine.put(contType, te); plugin.setActive(true);
} break;
} else {
log.warn("Unknown text extractor class: {
}", clazz);
} }
} catch (ClassNotFoundException e) {
log.warn("Extractor class not found: {}", clazz,
e);
} catch (LinkageError e) {
log.warn("Extractor dependency not found: {}", cl
azz, e);
} catch (IllegalAccessException e) {
log.warn("Extractor constructor not accessible: {
}", clazz, e);
} catch (InstantiationException e) {
log.warn("Extractor instantiation failed: {}", cl
azz, e);
} }
}
}
/** PluginDAO.getInstance().create(plugin);
* Return registered content types }
*/
public static String[] getContentTypes() {
return engine.keySet().toArray(new String[engine.keySet().size()]
);
} }
/** /**
* Return guessed text extractor * Return guessed text extractor
*/ */
public static TextExtractor getTextExtractor(String mimeType) { public static TextExtractor getTextExtractor(String mimeType) throws URIS
return engine.get(mimeType); yntaxException {
for (TextExtractor te : findExtractors(false)) {
for (String teMime : te.getContentTypes()) {
if (teMime.equals(mimeType)) {
log.debug("Text extractor for '{}' found:
{}", mimeType, te.getClass());
return te;
}
}
}
return null;
} }
/** /**
* Check for registered text extractor * Check for registered text extractor
*/ */
public static boolean isRegistered(String className) { public static boolean isRegistered(String className) throws URISyntaxExce
List<String> classes = Config.REGISTERED_TEXT_EXTRACTORS; ption {
for (TextExtractor te : findExtractors(false)) {
for (String name : classes) { if (te.getClass().getCanonicalName().equals(className)) {
if (name.equals(className)) {
return true; return true;
} }
} }
return false; return false;
} }
/** /**
* Extract text to be indexed * Extract text to be indexed
*/ */
public static String getText(String docPath, String mimeType, String enco ding, InputStream isContent) throws IOException { public static String getText(String docPath, String mimeType, String enco ding, InputStream isContent) throws IOException {
log.debug("getText({}, {}, {}, {})", new Object[]{docPath, mimeTy pe, encoding, isContent}); log.debug("getText({}, {}, {}, {})", docPath, mimeType, encoding, isContent);
long begin = System.currentTimeMillis(); long begin = System.currentTimeMillis();
String failureMessage = "Unknown error"; String failureMessage = "Unknown error";
boolean failure = false; boolean failure = false;
String text = null; String text = null;
try { try {
text = getText(mimeType, encoding, isContent); text = getText(mimeType, encoding, isContent);
// Check for minimum text extraction size // Check for minimum text extraction size
if (text.length() < MIN_EXTRACTION) { if (text.length() < MIN_EXTRACTION) {
skipping to change at line 153 skipping to change at line 148
log.debug("getText: {}", text); log.debug("getText: {}", text);
return text; return text;
} }
/** /**
* Extract text to be indexed * Extract text to be indexed
*/ */
public static String getText(String mimeType, String encoding, InputStrea m isContent) throws IOException { public static String getText(String mimeType, String encoding, InputStrea m isContent) throws IOException {
BufferedInputStream bis = new BufferedInputStream(isContent); BufferedInputStream bis = new BufferedInputStream(isContent);
TextExtractor te = engine.get(mimeType);
String text = null; String text = null;
if (te != null) { try {
if (mimeType.startsWith("text/") && encoding == null) { TextExtractor te = getTextExtractor(mimeType);
CharsetDetector detector = new CharsetDetector();
detector.setText(bis); if (te != null) {
CharsetMatch cm = detector.detect(); if (mimeType.startsWith("text/") && encoding == n
encoding = cm.getName(); ull) {
} CharsetDetector detector = new CharsetDet
ector();
detector.setText(bis);
CharsetMatch cm = detector.detect();
encoding = cm.getName();
}
text = te.extractText(bis, mimeType, encoding); text = te.extractText(bis, mimeType, encoding);
} else { } else {
throw new IOException("Full text indexing of '" + mimeTyp throw new IOException("Full text indexing of '" +
e + "' is not supported"); mimeType + "' is not supported");
}
} catch (URISyntaxException e) {
throw new IOException(e.getMessage(), e);
} }
IOUtils.closeQuietly(bis); IOUtils.closeQuietly(bis);
return text; return text;
} }
// //
// DB Methods // DB Methods
// //
/** /**
* Extract text to be indexed * Extract text to be indexed
*/ */
@SuppressWarnings("unused") @SuppressWarnings("unused")
private static String getDbText(String docUuid, String mimeType, String e private static String getDbText(String docUuid, String mimeType, String e
ncoding, InputStream isContent) ncoding, InputStream isContent) throws
throws IOException, PathNotFoundException, DatabaseExcept PathNotFoundException, DatabaseException {
ion { log.debug("getDbText({}, {}, {}, {})", docUuid, mimeType, encodin
log.debug("getDbText({}, {}, {}, {})", new Object[]{docUuid, mime g, isContent);
Type, encoding, isContent});
String text = null; String text = null;
try { try {
text = getText(null, mimeType, encoding, isContent); text = getText(null, mimeType, encoding, isContent);
} catch (IOException e) { } catch (IOException e) {
if (docUuid != null) { if (docUuid != null) {
String nodePath = NodeBaseDAO.getInstance().getPa thFromUuid(docUuid); String nodePath = NodeBaseDAO.getInstance().getPa thFromUuid(docUuid);
log.warn("There was a problem extracting text fro m '{}'", nodePath); log.warn("There was a problem extracting text fro m '{}'", nodePath);
UserActivity.log(Config.SYSTEM_USER, "MISC_TEXT_E XTRACTION_FAILURE", docUuid, nodePath, e.getMessage()); UserActivity.log(Config.SYSTEM_USER, "MISC_TEXT_E XTRACTION_FAILURE", docUuid, nodePath, e.getMessage());
} }
skipping to change at line 217 skipping to change at line 217
String text = null; String text = null;
try { try {
isContent = persistentFile.getInputStream(); isContent = persistentFile.getInputStream();
text = getText("text/plain", "UTF-8", isContent); text = getText("text/plain", "UTF-8", isContent);
return text; return text;
} finally { } finally {
IOUtils.closeQuietly(isContent); IOUtils.closeQuietly(isContent);
} }
} }
//
// DB Methods
//
/**
* Get all converters
*/
public static synchronized List<TextExtractor> findExtractors(boolean rel
oad) throws URISyntaxException {
log.debug("findExtractors({})", reload);
if (extractorList == null || reload) {
extractorList = new ArrayList<>();
URI uri = new URI(PLUGIN_URI);
for (Plugin plg : PluginUtils.getPlugins(uri, TextExtract
or.class)) {
extractorList.add((TextExtractor) plg);
}
}
return extractorList;
}
} }
 End of changes. 20 change blocks. 
71 lines changed or deleted 96 lines changed or added

Home  |  About  |  Features  |  All  |  Newest  |  Dox  |  Diffs  |  RSS Feeds  |  Screenshots  |  Comments  |  Imprint  |  Privacy  |  HTTP(S)