本文整理汇总了Java中org.apache.tika.exception.TikaException类的典型用法代码示例。如果您正苦于以下问题:Java TikaException类的具体用法?Java TikaException怎么用?Java TikaException使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
TikaException类属于org.apache.tika.exception包,在下文中一共展示了TikaException类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: convertWordDocumentIntoHtml
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Converts an uploaded .docx document into HTML markup. This code
 * is based on <a href="http://stackoverflow.com/a/9053258/313554">this StackOverflow</a> answer.
 *
 * @param wordDocument The uploaded .docx document to convert.
 * @return A DTO built from the upload's original filename and the generated HTML markup.
 * @throws DocumentConversionException if reading, parsing, or transforming the document
 *         fails; the underlying exception is kept as the cause.
 */
public ConvertedDocumentDTO convertWordDocumentIntoHtml(MultipartFile wordDocument) {
    LOGGER.info("Converting word document: {} into HTML", wordDocument.getOriginalFilename());
    // try-with-resources guarantees the upload stream is closed, even when parsing fails.
    try (InputStream input = wordDocument.getInputStream()) {
        Parser parser = new OOXMLParser();
        StringWriter sw = new StringWriter();

        // Serialize the SAX events emitted by the parser as indented UTF-8 HTML into sw.
        SAXTransformerFactory factory = (SAXTransformerFactory)
                SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "utf-8");
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.setResult(new StreamResult(sw));

        Metadata metadata = new Metadata();
        metadata.add(Metadata.CONTENT_TYPE, "text/html;charset=utf-8");
        parser.parse(input, handler, metadata, new ParseContext());

        return new ConvertedDocumentDTO(wordDocument.getOriginalFilename(), sw.toString());
    }
    catch (IOException | SAXException | TransformerException | TikaException ex) {
        LOGGER.error("Conversion failed because an exception was thrown", ex);
        throw new DocumentConversionException(ex.getMessage(), ex);
    }
}
示例2: getFullText
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Parses the file at the given path with an auto-detecting Tika parser and returns
 * whatever text the body content handler collected.
 *
 * @param filepath path of the file to parse
 * @return the text written by the body content handler
 * @throws IOException if the file cannot be read or the stream cannot be closed
 * @throws SAXException if the content handler fails while processing parser events
 * @throws TikaException if Tika fails to parse the document
 */
private static String getFullText(final String filepath) throws IOException, SAXException, TikaException {
    final StringWriter extractedText = new StringWriter();
    final TikaInputStream source = TikaInputStream.get(new File(filepath));
    try {
        final Parser autoParser = new AutoDetectParser(new DefaultDetector());
        final ParseContext context = new ParseContext();
        // Register the parser in the context as well
        // (presumably so nested documents are parsed with it too; TODO confirm).
        context.set(Parser.class, autoParser);
        autoParser.parse(source, new BodyContentHandler(extractedText), new Metadata(), context);
    } finally {
        source.close();
    }
    return extractedText.toString();
}
示例3: parse
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Runs the image and Jempbox metadata extractors over the given stream, filling
 * {@code metadata}, and then emits an XHTML document with no body content.
 *
 * @param stream the input to extract metadata from
 * @param handler receiver of the (empty) XHTML document events
 * @param metadata metadata object handed to both extractors and the XHTML handler
 * @param context parse context; not used by this method
 * @throws IOException if reading the stream fails
 * @throws SAXException if the content handler rejects an event
 * @throws TikaException if metadata extraction fails
 */
public void parse(
        InputStream stream, ContentHandler handler,
        Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    final TemporaryResources temporaryResources = new TemporaryResources();
    try {
        final TikaInputStream tikaStream = TikaInputStream.get(stream, temporaryResources);

        // The JPEG extractor works on a file, so hand it the stream's backing file.
        final ImageMetadataExtractor imageExtractor = new ImageMetadataExtractor(metadata);
        imageExtractor.parseJpeg(tikaStream.getFile());

        final JempboxExtractor jempboxExtractor = new JempboxExtractor(metadata);
        jempboxExtractor.parse(tikaStream);
    } finally {
        // Always release whatever temporary resources were registered above.
        temporaryResources.dispose();
    }

    final XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();
    document.endDocument();
}
示例4: parse
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Runs Tika's {@code parseToString} on the given bytes inside a restricted privileged
 * context, rethrowing any exception raised while parsing the document.
 *
 * @param content raw bytes of the document
 * @param metadata metadata object passed through to Tika
 * @param limit limit value passed through to {@code parseToString}
 * @return the string produced by Tika
 * @throws TikaException if Tika fails to parse the content
 * @throws IOException if reading the content fails
 */
// only package private for testing!
static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException {
    // refuse to run on behalf of unprivileged code such as a script
    SpecialPermission.check();

    final PrivilegedExceptionAction<String> parseAction =
            () -> TIKA_INSTANCE.parseToString(new ByteArrayInputStream(content), metadata, limit);

    try {
        return AccessController.doPrivileged(parseAction, RESTRICTED_CONTEXT);
    } catch (PrivilegedActionException wrapped) {
        // doPrivileged wraps checked exceptions: unwrap and rethrow the original one
        final Throwable original = wrapped.getCause();
        if (original instanceof TikaException) {
            throw (TikaException) original;
        }
        if (original instanceof IOException) {
            throw (IOException) original;
        }
        // the action only declares the two checked types above, so anything else is a bug
        throw new AssertionError(original);
    }
}
示例5: setUp
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Builds the fixture: a {@code FileRefStore} wired to the entity manager from
 * {@code getEm()}, and a {@code FileRepository} rooted at {@code "fileTestFiles"}
 * that uses a Tika-backed transformer.
 */
@Before
public void setUp() throws IOException, TikaException, SAXException {
    MockitoAnnotations.initMocks(this);

    final String basePath = "fileTestFiles";

    store = new FileRefStore();
    store.setEntityManager(getEm());
    store.setQueryFactory(new JPAQueryFactory(getEm()));
    store.setTemplate(template);

    // The repository's base directory must exist before the tests run.
    createDirectories(Paths.get(basePath));

    final TikaProvider provider = new TikaProvider();
    final TikaTransformer tikaTransformer = new TikaTransformer();
    tikaTransformer.setTika(provider.tika());

    repository = new FileRepository();
    repository.setStore(store);
    repository.setBasePath(basePath);
    repository.setTransformer(tikaTransformer);
}
示例6: setUp
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Builds the fixture: an {@code HtmlExporter} whose templater uses a classpath-loading
 * Velocity engine, and a {@code TikaTransformer} backed by the provider's Tika instance.
 */
@Before
public void setUp() throws TikaException, IOException, SAXException {
    // Velocity engine that resolves templates from the classpath.
    final VelocityEngine velocity = new VelocityEngine();
    velocity.setProperty(RuntimeConstants.RESOURCE_LOADER, "classpath");
    velocity.setProperty("classpath.resource.loader.class", ClasspathResourceLoader.class.getName());
    velocity.init();

    final Templater velocityTemplater = new Templater();
    velocityTemplater.setEngine(velocity);

    exporter = new HtmlExporter();
    exporter.setTemplater(velocityTemplater);

    final Tika providedTika = new TikaProvider().tika();
    transformer = new TikaTransformer();
    transformer.setTika(providedTika);
}
示例7: setUp
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Builds the fixture: a {@code PdfExporter} whose templater uses a classpath-loading
 * Velocity engine, and a {@code TikaTransformer} backed by the provider's Tika instance.
 */
@Before
public void setUp() throws TikaException, IOException, SAXException {
    // Velocity engine that resolves templates from the classpath.
    final VelocityEngine velocity = new VelocityEngine();
    velocity.setProperty(RuntimeConstants.RESOURCE_LOADER, "classpath");
    velocity.setProperty("classpath.resource.loader.class", ClasspathResourceLoader.class.getName());
    velocity.init();

    final Templater velocityTemplater = new Templater();
    velocityTemplater.setEngine(velocity);

    exporter = new PdfExporter();
    exporter.setTemplater(velocityTemplater);

    final Tika providedTika = new TikaProvider().tika();
    transformer = new TikaTransformer();
    transformer.setTika(providedTika);
}
示例8: process
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Parses every input stream of the source with Tika, creating one document per
 * stream in the corpus and attaching tag annotations to it.
 *
 * @param ctx processing context; not used directly by this method
 * @param corpus corpus that receives the created documents
 * @throws ModuleException declared by the module contract
 */
@Override
public void process(ProcessingContext<Corpus> ctx, Corpus corpus) throws ModuleException {
    // Turn off PDFBox logging before any parsing happens.
    Logger.getLogger("org.apache.pdfbox").setLevel(Level.OFF);

    final AutoDetectParser tikaParser = new AutoDetectParser();
    final ParseContext tikaContext = new ParseContext();
    try {
        for (final InputStream input : Iterators.loop(source.getInputStreams())) {
            final TikaReaderHandler parsed = parse(tikaParser, tikaContext, input);
            createTagAnnotations(createDocument(corpus, parsed), parsed);
        }
    } catch (IOException | SAXException | TikaException failure) {
        // Delegate conversion of parsing failures to the shared helper.
        rethrow(failure);
    }
}
示例9: testParseDamagedPdfFile
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Parsing the damaged PDF resource must yield exactly one unsuccessful result
 * for that file whose error is a {@code TikaException}.
 */
@Test
public void testParseDamagedPdfFile() throws IOException {
    final String path = getClass().getResource("/damaged.pdf").getPath();

    // Check applied to the single ParseResult produced for the damaged file.
    final SerializableFunction<ParseResult, Void> expectTikaFailure =
            new SerializableFunction<ParseResult, Void>() {
                @Override
                public Void apply(ParseResult result) {
                    assertEquals(path, result.getFileLocation());
                    assertFalse(result.isSuccess());
                    assertTrue(result.getError() instanceof TikaException);
                    return null;
                }
            };

    final PCollection<ParseResult> results =
            p.apply("ParseInvalidPdfFile", TikaIO.parse().filepattern(path));
    PAssert.thatSingleton(results).satisfies(expectTikaFailure);
    p.run();
}
示例10: getHtmlFromFile
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Returns an HTML view of the processed file. Plain text extraction effectively
 * loses the formatting contained in the Word or PDF document; the HTML version
 * is still processable but keeps the format.
 *
 * @param file file to parse
 * @return formatted HTML view of the document content (stripped of images), or a
 *         string starting with {@code "!ERROR: "} followed by the exception message
 *         when parsing or transforming fails
 */
public String getHtmlFromFile(File file) {
    try {
        ByteArrayOutputStream out = new ByteArrayOutputStream();

        // Serialize the parser's SAX events as indented UTF-8 HTML into the buffer.
        SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
        TransformerHandler handler = factory.newTransformerHandler();
        handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
        handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
        handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, "UTF-8");
        handler.setResult(new StreamResult(out));
        ExpandedTitleContentHandler handler1 = new ExpandedTitleContentHandler(handler);

        // try-with-resources closes the file stream even when parsing throws.
        try (InputStream is = file.toURI().toURL().openStream()) {
            adp.parse(is, handler1, new Metadata());
        }

        // Drop image elements, both the paired and the self-closing form.
        return new String(out.toByteArray(), "UTF-8").replaceAll("<img .*?</img>", "").replaceAll("<img .*?/>", "");
    } catch (TransformerConfigurationException | IllegalArgumentException | IOException | SAXException | TikaException ex) {
        return "!ERROR: " + ex.getLocalizedMessage();
    }
}
示例11: main
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Application entry point: fixes the data directory to an absolute path, creates it,
 * starts the Spring application and Caddy, and starts the virus scanner when one is
 * installed.
 *
 * @param args command-line arguments forwarded to Spring
 */
public static void main(String[] args) throws IOException, TikaException {
    //Tomcat changes the working dir, so we make this absolute
    final File absoluteDataDir = new File(dataDir).getAbsoluteFile();
    ResourceManager.setDataDir(absoluteDataDir);
    //noinspection ResultOfMethodCallIgnored
    ResourceManager.getDataDir().mkdirs();

    SpringApplication.run(SpringController.class, args);
    new Caddy().start();

    if (!VirusScanner.isAvInstalled()) {
        log.warn("Virus scanning unavailable. Skipping...");
        return;
    }
    log.info("Virus scanning available");
    virusScanner = new VirusScanner();
    virusScanner.start();
}
示例12: detectCharset
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Determines the character set of the file at {@code path}.
 *
 * <p>The content-encoding recorded in {@code metadata} is used when it names a
 * supported charset; otherwise the charset is detected from the file content.
 *
 * @param path file whose content is inspected when the metadata is not conclusive
 * @param metadata metadata consulted for a declared content-encoding and passed to the detector
 * @return the declared charset if supported, otherwise the detected one
 * @throws IOException if the file cannot be read or detection fails
 */
private static Charset detectCharset(final Path path, final Metadata metadata) throws IOException {
    // Try to parse the character set from the content-encoding.
    final String orig = metadata.get(Metadata.CONTENT_ENCODING);
    if (null != orig) {
        try {
            if (Charset.isSupported(orig)) {
                return Charset.forName(orig);
            }
        } catch (IllegalArgumentException ignored) {
            // The declared value is not a legal charset name (e.g. it contains
            // spaces); fall back to detection instead of failing on bad metadata.
        }
    }

    // Try to detect the character set.
    try (
        final InputStream input = new BufferedInputStream(Files.newInputStream(path));
        final AutoDetectReader detector = new AutoDetectReader(input, metadata)
    ) {
        return detector.getCharset();
    } catch (TikaException e) {
        throw new IOException("Unable to detect charset.", e);
    }
}
示例13: testWrite
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Writing a UTF-8 encoded "$" through the spewer must produce "$\n\n" in UTF-8,
 * with the first two output bytes being 0x24 0x0A.
 */
@Test
public void testWrite() throws IOException, TikaException {
    final String fileName = "imaginary-file.txt";
    final String payload = "$";

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final Spewer spewer = new PrintStreamSpewer(new PrintStream(sink), new FieldNames());

    final InputStream source = new ByteArrayInputStream(payload.getBytes(StandardCharsets.UTF_8));
    final ParsingReader parsingReader = new ParsingReader(source, fileName);

    spewer.outputMetadata(false);
    spewer.write(factory.create(fileName), parsingReader);

    Assert.assertEquals("$\n\n", sink.toString(StandardCharsets.UTF_8.name()));

    final byte[] leadingBytes = Arrays.copyOfRange(sink.toByteArray(), 0, 2);
    Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, leadingBytes);
}
示例14: testWriteFromUTF16LE
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Writing the bytes FF FE 24 00 (a "$" preceded by the FF FE byte-order mark)
 * through the spewer must produce "$\n\n" in UTF-8, with the first two output
 * bytes being 0x24 0x0A.
 */
@Test
public void testWriteFromUTF16LE() throws IOException, TikaException {
    final String fileName = "imaginary-file.txt";
    final byte[] payload = new byte[] {(byte) 0xFF, (byte) 0xFE, 0x24, 0x00};

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final Spewer spewer = new PrintStreamSpewer(new PrintStream(sink), new FieldNames());

    final InputStream source = new ByteArrayInputStream(payload);
    final ParsingReader parsingReader = new ParsingReader(source, fileName);

    spewer.outputMetadata(false);
    spewer.write(factory.create(fileName), parsingReader);

    Assert.assertEquals("$\n\n", sink.toString(StandardCharsets.UTF_8.name()));

    final byte[] leadingBytes = Arrays.copyOfRange(sink.toByteArray(), 0, 2);
    Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, leadingBytes);
}
示例15: testWriteFromUTF16BE
import org.apache.tika.exception.TikaException; //导入依赖的package包/类
/**
 * Writing the bytes FE FF 00 24 (a "$" preceded by the FE FF byte-order mark)
 * through the spewer must produce "$\n\n" in UTF-8, with the first two output
 * bytes being 0x24 0x0A.
 */
@Test
public void testWriteFromUTF16BE() throws IOException, TikaException {
    final String fileName = "imaginary-file.txt";
    final byte[] payload = new byte[] {(byte) 0xFE, (byte) 0xFF, 0x00, 0x24};

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final Spewer spewer = new PrintStreamSpewer(new PrintStream(sink), new FieldNames());

    final InputStream source = new ByteArrayInputStream(payload);
    final ParsingReader parsingReader = new ParsingReader(source, fileName);

    spewer.outputMetadata(false);
    spewer.write(factory.create(fileName), parsingReader);

    Assert.assertEquals("$\n\n", sink.toString(StandardCharsets.UTF_8.name()));

    final byte[] leadingBytes = Arrays.copyOfRange(sink.toByteArray(), 0, 2);
    Assert.assertArrayEquals(new byte[] {0x24, 0x0A}, leadingBytes);
}