public class URLClassifyProcessor extends UpdateRequestProcessor
This processor is intended to be used in connection with processing web resources, and helps to produce values which may be used for boosting or filtering later.
Modifier and Type | Field and Description |
---|---|
private java.lang.String |
canonicalUrlFieldname |
private static java.lang.String |
DEFAULT_LANDINGPAGE_FIELDNAME |
private static java.lang.String |
DEFAULT_LENGTH_FIELDNAME |
private static java.lang.String |
DEFAULT_LEVELS_FIELDNAME |
private static java.lang.String |
DEFAULT_TOPLEVEL_FIELDNAME |
private static java.lang.String |
DEFAULT_URL_FIELDNAME |
private java.lang.String |
domainFieldname |
private boolean |
enabled |
private static java.lang.String |
INPUT_FIELD_PARAM |
private java.lang.String |
landingpageFieldname |
private static java.lang.String[] |
landingPageSuffixes |
private java.lang.String |
lengthFieldname |
private java.lang.String |
levelsFieldname |
private static org.slf4j.Logger |
log |
private static java.lang.String |
OUTPUT_CANONICALURL_FIELD_PARAM |
private static java.lang.String |
OUTPUT_DOMAIN_FIELD_PARAM |
private static java.lang.String |
OUTPUT_LANDINGPAGE_FIELD_PARAM |
private static java.lang.String |
OUTPUT_LENGTH_FIELD_PARAM |
private static java.lang.String |
OUTPUT_LEVELS_FIELD_PARAM |
private static java.lang.String |
OUTPUT_TOPLEVEL_FIELD_PARAM |
private java.lang.String |
toplevelpageFieldname |
private java.lang.String |
urlFieldname |
next
Constructor and Description |
---|
URLClassifyProcessor(SolrParams parameters,
SolrQueryRequest request,
SolrQueryResponse response,
UpdateRequestProcessor nextProcessor) |
Modifier and Type | Method and Description |
---|---|
java.net.URL |
getCanonicalUrl(java.net.URL url)
Gets a canonical form of the URL for use as the main URL
|
java.net.URL |
getNormalizedURL(java.lang.String url) |
private java.lang.String |
getPathWithoutSuffix(java.net.URL url) |
private void |
initParameters(SolrParams parameters) |
boolean |
isEnabled() |
boolean |
isLandingPage(java.net.URL url)
Calculates whether the URL is a landing page or not
|
boolean |
isTopLevelPage(java.net.URL url)
Calculates whether a URL is a top level page
|
private java.lang.String |
landingPageSuffix(java.net.URL url) |
int |
length(java.net.URL url)
Calculates the length of the URL in characters
|
int |
levels(java.net.URL url)
Calculates the number of path levels in the given URL
|
void |
processAdd(AddUpdateCommand command) |
void |
setEnabled(boolean enabled) |
finish, processCommit, processDelete, processMergeIndexes, processRollback
private static final java.lang.String INPUT_FIELD_PARAM
private static final java.lang.String OUTPUT_LENGTH_FIELD_PARAM
private static final java.lang.String OUTPUT_LEVELS_FIELD_PARAM
private static final java.lang.String OUTPUT_TOPLEVEL_FIELD_PARAM
private static final java.lang.String OUTPUT_LANDINGPAGE_FIELD_PARAM
private static final java.lang.String OUTPUT_DOMAIN_FIELD_PARAM
private static final java.lang.String OUTPUT_CANONICALURL_FIELD_PARAM
private static final java.lang.String DEFAULT_URL_FIELDNAME
private static final java.lang.String DEFAULT_LENGTH_FIELDNAME
private static final java.lang.String DEFAULT_LEVELS_FIELDNAME
private static final java.lang.String DEFAULT_TOPLEVEL_FIELDNAME
private static final java.lang.String DEFAULT_LANDINGPAGE_FIELDNAME
private static final org.slf4j.Logger log
private boolean enabled
private java.lang.String urlFieldname
private java.lang.String lengthFieldname
private java.lang.String levelsFieldname
private java.lang.String toplevelpageFieldname
private java.lang.String landingpageFieldname
private java.lang.String domainFieldname
private java.lang.String canonicalUrlFieldname
private static final java.lang.String[] landingPageSuffixes
public URLClassifyProcessor(SolrParams parameters, SolrQueryRequest request, SolrQueryResponse response, UpdateRequestProcessor nextProcessor)
private void initParameters(SolrParams parameters)
public void processAdd(AddUpdateCommand command) throws java.io.IOException
Overrides:
processAdd in class UpdateRequestProcessor
Throws:
java.io.IOException
public java.net.URL getCanonicalUrl(java.net.URL url)
url
- The input URL
public int length(java.net.URL url)
url
- The input URL
public int levels(java.net.URL url)
url
- The input URL
public boolean isTopLevelPage(java.net.URL url)
url
- The input URL
public boolean isLandingPage(java.net.URL url)
url
- The input URL
public java.net.URL getNormalizedURL(java.lang.String url) throws java.net.MalformedURLException, java.net.URISyntaxException
java.net.MalformedURLException
java.net.URISyntaxException
public boolean isEnabled()
public void setEnabled(boolean enabled)
private java.lang.String landingPageSuffix(java.net.URL url)
private java.lang.String getPathWithoutSuffix(java.net.URL url)