public class ExternalParser extends AbstractParser
Modifier and Type | Field and Description |
---|---|
private java.lang.String[] |
command
The external command to invoke.
|
static java.lang.String |
INPUT_FILE_TOKEN
The token, which if present in the Command string, will
be replaced with the input filename.
|
private java.util.Map<java.util.regex.Pattern,java.lang.String> |
metadataPatterns
Regular Expressions to run over STDOUT to
extract Metadata.
|
static java.lang.String |
OUTPUT_FILE_TOKEN
The token, which if present in the Command string, will
be replaced with the output filename.
|
private static long |
serialVersionUID |
private java.util.Set<MediaType> |
supportedTypes
Media types supported by the external program.
|
Constructor and Description |
---|
ExternalParser() |
Modifier and Type | Method and Description |
---|---|
static boolean |
check(java.lang.String[] checkCmd,
int... errorValue) |
static boolean |
check(java.lang.String checkCmd,
int... errorValue)
Checks to see if the command can be run.
|
private void |
extractMetadata(java.io.InputStream stream,
Metadata metadata) |
private void |
extractOutput(java.io.InputStream stream,
XHTMLContentHandler xhtml)
Starts a thread that extracts the contents of the standard output
stream of the given process to the given XHTML content handler.
|
java.lang.String[] |
getCommand() |
java.util.Map<java.util.regex.Pattern,java.lang.String> |
getMetadataExtractionPatterns() |
java.util.Set<MediaType> |
getSupportedTypes() |
java.util.Set<MediaType> |
getSupportedTypes(ParseContext context)
Returns the set of media types supported by this parser when used
with the given parse context.
|
private void |
ignoreStream(java.io.InputStream stream)
Starts a thread that reads and discards the contents of the
standard stream of the given process.
|
void |
parse(java.io.InputStream stream,
org.xml.sax.ContentHandler handler,
Metadata metadata,
ParseContext context)
Executes the configured external command and passes the given document
stream as a simple XHTML document to the given SAX content handler.
|
private void |
parse(TikaInputStream stream,
XHTMLContentHandler xhtml,
Metadata metadata,
TemporaryResources tmp) |
private void |
sendInput(java.lang.Process process,
java.io.InputStream stream)
Starts a thread that sends the contents of the given input stream
to the standard input stream of the given process.
|
void |
setCommand(java.lang.String... command)
Sets the command to be run.
|
void |
setMetadataExtractionPatterns(java.util.Map<java.util.regex.Pattern,java.lang.String> patterns)
Sets the map of regular expression patterns and Metadata
keys.
|
void |
setSupportedTypes(java.util.Set<MediaType> supportedTypes) |
parse
private static final long serialVersionUID
public static final java.lang.String INPUT_FILE_TOKEN
public static final java.lang.String OUTPUT_FILE_TOKEN
private java.util.Set<MediaType> supportedTypes
private java.util.Map<java.util.regex.Pattern,java.lang.String> metadataPatterns
private java.lang.String[] command
See also: Runtime.exec(String[])
public java.util.Set<MediaType> getSupportedTypes(ParseContext context)
Parser
context - parse context
public java.util.Set<MediaType> getSupportedTypes()
public void setSupportedTypes(java.util.Set<MediaType> supportedTypes)
public java.lang.String[] getCommand()
public void setCommand(java.lang.String... command)
Sets the command to be run. This can include either of INPUT_FILE_TOKEN or OUTPUT_FILE_TOKEN if the command needs filenames.
See also: Runtime.exec(String[])
public java.util.Map<java.util.regex.Pattern,java.lang.String> getMetadataExtractionPatterns()
public void setMetadataExtractionPatterns(java.util.Map<java.util.regex.Pattern,java.lang.String> patterns)
public void parse(java.io.InputStream stream, org.xml.sax.ContentHandler handler, Metadata metadata, ParseContext context) throws java.io.IOException, org.xml.sax.SAXException, TikaException
Metadata is only extracted if setMetadataExtractionPatterns(Map) has been called to set patterns.
stream - the document stream (input)
handler - handler for the XHTML SAX events (output)
metadata - document metadata (input and output)
context - parse context
java.io.IOException - if the document stream could not be read
org.xml.sax.SAXException - if the SAX events could not be processed
TikaException - if the document could not be parsed
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp) throws java.io.IOException, org.xml.sax.SAXException, TikaException
java.io.IOException
org.xml.sax.SAXException
TikaException
private void extractOutput(java.io.InputStream stream, XHTMLContentHandler xhtml) throws org.xml.sax.SAXException, java.io.IOException
process - process
xhtml - XHTML content handler
org.xml.sax.SAXException - if the XHTML SAX events could not be handled
java.io.IOException - if an input error occurred
private void sendInput(java.lang.Process process, java.io.InputStream stream)
process - process
stream - input stream
private void ignoreStream(java.io.InputStream stream)
process - process
private void extractMetadata(java.io.InputStream stream, Metadata metadata)
public static boolean check(java.lang.String checkCmd, int... errorValue)
checkCmd - The check command to run
errorValue - What is considered an error value?
public static boolean check(java.lang.String[] checkCmd, int... errorValue)