HTMLStripCharFilter
@Deprecated
public class LegacyHTMLStripCharFilter
extends org.apache.lucene.analysis.BaseCharFilter
This class is NOT recommended for new users and should be considered UNSUPPORTED.
In Solr version 3.5 and earlier, HTMLStripCharFilter(Factory) had known bugs in the offsets it provided, which could trigger, for example, exceptions in highlighting.
This class is provided as a possible alternative for people who depend on the "broken" behavior of HTMLStripCharFilter in Solr version 3.5 and earlier, and/or who don't like the changes introduced by the Solr 3.6+ version of HTMLStripCharFilterFactory. (See the 3.6.0 release section of solr/CHANGES.txt for a list of differences in behavior.)
Modifier and Type | Field and Description |
---|---|
static int |
DEFAULT_READ_AHEAD
Deprecated.
|
private static java.util.HashMap<java.lang.String,java.lang.Character> |
entityTable
Deprecated.
|
private static int |
EOF
Deprecated.
|
private java.util.Set<java.lang.String> |
escapedTags
Deprecated.
|
private int |
lastMark
Deprecated.
|
private static int |
MATCH
Deprecated.
|
private static int |
MISMATCH
Deprecated.
|
private int |
numEaten
Deprecated.
|
private int |
numRead
Deprecated.
|
private int |
numReturned
Deprecated.
|
private int |
numWhitespace
Deprecated.
|
private java.lang.StringBuilder |
pushed
Deprecated.
|
private int |
readAheadLimit
Deprecated.
|
private int |
safeReadAheadLimit
Deprecated.
|
private java.lang.StringBuilder |
sb
Deprecated.
|
Constructor and Description |
---|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source)
Deprecated.
|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags)
Deprecated.
|
LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source,
java.util.Set<java.lang.String> escapedTags,
int readAheadLimit)
Deprecated.
|
Modifier and Type | Method and Description | |
---|---|---|
void |
close()
Deprecated.
|
|
private int |
eatSSI()
Deprecated.
|
|
(package private) int |
findEndTag()
Deprecated.
|
|
int |
getReadAheadLimit()
Deprecated.
|
|
private boolean |
isAlpha(int ch)
Deprecated.
|
|
private boolean |
isDigit(int ch)
Deprecated.
|
|
private boolean |
isFirstIdChar(int ch)
Deprecated.
|
|
private boolean |
isHex(int ch)
Deprecated.
|
|
private boolean |
isIdChar(int ch)
Deprecated.
From HTML 4.0
[4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
[5] Name ::= (Letter | '_' | ':') (NameChar)*
[6] Names ::= Name (#x20 Name)*
[7] Nmtoken ::= (NameChar)+
[8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
|
|
private boolean |
isSpace(int ch)
Deprecated.
|
|
static void |
main(java.lang.String[] args)
Deprecated.
|
|
private int |
next()
Deprecated.
|
|
private int |
nextSkipWS()
Deprecated.
|
|
private int |
peek()
Deprecated.
|
|
private void |
push(int ch)
Deprecated.
|
|
int |
read()
Deprecated.
|
|
int |
read(char[] cbuf,
int off,
int len)
Deprecated.
|
|
private int |
readAttr2()
Deprecated.
[10] AttValue ::= '"' ([^<&"] | Reference)* '"'
| "'" ([^<&'] | Reference)* "'"
The parser also needs to handle unquoted attributes and attributes without values.
|
|
private int |
readBang(boolean inScript)
Deprecated.
Valid comments according to the HTML specs, e.g.:
<!-- Hello -->
Comments can also appear inside of an entity declaration.
It turns out that IE and Mozilla don't parse comments correctly.
|
|
private int |
readComment(boolean inScript)
Deprecated.
|
|
private int |
readEntity()
Deprecated.
|
|
private int |
readName(boolean checkEscaped)
Deprecated.
|
|
private int |
readNumericEntity()
Deprecated.
|
|
private int |
readProcessingInstruction()
Deprecated.
|
|
private int |
readScriptString()
Deprecated.
|
|
private int |
readTag()
Deprecated.
|
|
private void |
restoreState()
Deprecated.
|
|
private void |
saveState()
Deprecated.
|
Methods inherited from class org.apache.lucene.analysis.BaseCharFilter: addOffCorrectMap, correct, getLastCumulativeDiff
private int readAheadLimit
private int safeReadAheadLimit
private int numWhitespace
private int numRead
private int numEaten
private int numReturned
private int lastMark
private java.util.Set<java.lang.String> escapedTags
private final java.lang.StringBuilder pushed
private static final int EOF
private static final int MISMATCH
private static final int MATCH
private final java.lang.StringBuilder sb
public static final int DEFAULT_READ_AHEAD
private static final java.util.HashMap<java.lang.String,java.lang.Character> entityTable
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source)
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source, java.util.Set<java.lang.String> escapedTags)
public LegacyHTMLStripCharFilter(org.apache.lucene.analysis.CharStream source, java.util.Set<java.lang.String> escapedTags, int readAheadLimit)
public static void main(java.lang.String[] args) throws java.io.IOException
java.io.IOException
public int getReadAheadLimit()
private int next() throws java.io.IOException
java.io.IOException
private int nextSkipWS() throws java.io.IOException
java.io.IOException
private int peek() throws java.io.IOException
java.io.IOException
private void push(int ch)
private boolean isSpace(int ch)
private boolean isHex(int ch)
private boolean isAlpha(int ch)
private boolean isDigit(int ch)
private boolean isIdChar(int ch)
private boolean isFirstIdChar(int ch)
private void saveState() throws java.io.IOException
java.io.IOException
private void restoreState() throws java.io.IOException
java.io.IOException
private int readNumericEntity() throws java.io.IOException
java.io.IOException
private int readEntity() throws java.io.IOException
java.io.IOException
private int readBang(boolean inScript) throws java.io.IOException
java.io.IOException
private int readComment(boolean inScript) throws java.io.IOException
java.io.IOException
private int readTag() throws java.io.IOException
java.io.IOException
int findEndTag() throws java.io.IOException
java.io.IOException
private int readScriptString() throws java.io.IOException
java.io.IOException
private int readName(boolean checkEscaped) throws java.io.IOException
java.io.IOException
private int readAttr2() throws java.io.IOException
java.io.IOException
private int eatSSI() throws java.io.IOException
java.io.IOException
private int readProcessingInstruction() throws java.io.IOException
java.io.IOException
public int read() throws java.io.IOException
read
in class java.io.Reader
java.io.IOException
public int read(char[] cbuf, int off, int len) throws java.io.IOException
read
in class org.apache.lucene.analysis.CharFilter
java.io.IOException
public void close() throws java.io.IOException
close
in interface java.io.Closeable
close
in interface java.lang.AutoCloseable
close
in class org.apache.lucene.analysis.CharFilter
java.io.IOException