本文整理匯總了Java中com.digitalpebble.stormcrawler.parse.ParseData類的典型用法代碼示例。如果您正苦於以下問題:Java ParseData類的具體用法?Java ParseData怎麼用?Java ParseData使用的例子?那麼, 這裏精選的類代碼示例或許可以為您提供幫助。
ParseData類屬於com.digitalpebble.stormcrawler.parse包,在下文中一共展示了ParseData類的6個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: filter
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Computes an MD5 hex digest for the document identified by {@code URL} and
 * stores it in the document's metadata under {@code key_name}.
 *
 * <p>When {@code copyKeyName} is set and the metadata already holds a value
 * for {@code key_name}, that value is first copied to {@code copyKeyName}.
 * The digest input is the extracted text (if {@code useText} is set and the
 * text is not blank) or the raw {@code content}; when neither yields any
 * bytes, the UTF-8 bytes of the URL are hashed instead.
 *
 * @param URL     key of the document within {@code parse}
 * @param content raw bytes of the document, may be {@code null}
 * @param doc     not used by this filter
 * @param parse   parse result holding the document's text and metadata
 */
@Override
public void filter(String URL, byte[] content, DocumentFragment doc,
        ParseResult parse) {
    ParseData parseData = parse.get(URL);
    Metadata md = parseData.getMetadata();

    // keep whatever was stored under key_name before it gets overwritten
    if (copyKeyName != null) {
        String previousSignature = md.getFirstValue(key_name);
        if (previousSignature != null) {
            md.setValue(copyKeyName, previousSignature);
        }
    }

    byte[] digestInput;
    if (!useText) {
        digestInput = content;
    } else {
        String text = parseData.getText();
        digestInput = StringUtils.isNotBlank(text)
                ? text.getBytes(StandardCharsets.UTF_8)
                : null;
    }

    // nothing usable to hash: fall back to the URL itself
    if (digestInput == null) {
        digestInput = URL.getBytes(StandardCharsets.UTF_8);
    }

    md.setValue(key_name, DigestUtils.md5Hex(digestInput));
}
示例2: filter
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Evaluates every configured JSON pointer against the JSON obtained from
 * {@code doc} via {@code filterJson} and adds each match, as text, to the
 * metadata of the document identified by {@code URL}.
 *
 * <p>Does nothing when {@code doc} is {@code null} or when
 * {@code filterJson} returns {@code null}. Any exception raised while
 * extracting is logged and not propagated.
 *
 * @param URL     key of the document within {@code parse}
 * @param content not used by this filter
 * @param doc     DOM fragment handed to {@code filterJson}
 * @param parse   parse result whose metadata receives the extracted values
 */
@Override
public void filter(String URL, byte[] content, DocumentFragment doc,
        ParseResult parse) {
    if (doc == null) {
        return;
    }
    try {
        JsonNode root = filterJson(doc);
        if (root == null) {
            return;
        }
        Metadata md = parse.get(URL).getMetadata();
        // one metadata value per pointer that resolves to an existing node
        for (LabelledJsonPointer labelled : expressions) {
            JsonNode found = root.at(labelled.pointer);
            if (!found.isMissingNode()) {
                md.addValue(labelled.label, found.asText());
            }
        }
    } catch (Exception e) {
        LOG.error("Exception caught when extracting json", e);
    }
}
示例3: filter
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Parses {@code content} as XML and, for every {@code url} element found,
 * obtains a sub-document from {@code parse} keyed by the text of the
 * element's {@code loc} child. Every child node of the {@code url} element
 * is then stored on that sub-document as a node-name / text-content pair.
 *
 * <p>{@code url} elements without a {@code loc} child, or with a blank one,
 * are skipped so that a single malformed entry does not abort the
 * remaining ones. Nothing is done when {@code content} is {@code null}.
 * Exceptions raised while parsing or evaluating are logged and not
 * propagated.
 *
 * @param URL     URL of the document being filtered, used for logging
 * @param content raw bytes expected to hold the XML, may be {@code null}
 * @param doc     not used by this filter
 * @param parse   parse result receiving one sub-document per entry
 */
@Override
public void filter(String URL, byte[] content, DocumentFragment doc,
        ParseResult parse) {
    if (content == null) {
        // new ByteArrayInputStream(null) would throw outside of the try
        return;
    }
    InputStream stream = new ByteArrayInputStream(content);
    try {
        DocumentBuilderFactory factory = DocumentBuilderFactory
                .newInstance();
        // the bytes come from the crawled site and must not be trusted
        hardenAgainstXxe(factory);
        Document document = factory.newDocumentBuilder().parse(stream);
        Element root = document.getDocumentElement();

        XPath xPath = XPathFactory.newInstance().newXPath();
        XPathExpression urlExpression = xPath.compile("//url");
        // compiled once instead of once per url element
        XPathExpression locExpression = xPath.compile("loc");

        NodeList nodes = (NodeList) urlExpression.evaluate(root,
                XPathConstants.NODESET);
        for (int i = 0; i < nodes.getLength(); i++) {
            Node node = nodes.item(i);
            Node loc = (Node) locExpression.evaluate(node,
                    XPathConstants.NODE);
            if (loc == null) {
                // entry without a location: nothing to key the sub-document on
                continue;
            }
            String location = loc.getTextContent();
            if (location == null || location.trim().isEmpty()) {
                continue;
            }
            // create a subdocument for each url found in the sitemap
            ParseData parseData = parse.get(location);
            NodeList childs = node.getChildNodes();
            for (int j = 0; j < childs.getLength(); j++) {
                Node n = childs.item(j);
                parseData.put(n.getNodeName(), n.getTextContent());
            }
        }
    } catch (Exception e) {
        LOG.error("Error processing sitemap from {}: {}", URL, e);
    }
}

/**
 * Enables secure processing and disables resolution of external entities
 * and external DTDs on the given factory. A feature the underlying parser
 * does not support is logged and skipped rather than treated as fatal.
 */
private void hardenAgainstXxe(DocumentBuilderFactory factory) {
    try {
        factory.setFeature(
                javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
    } catch (javax.xml.parsers.ParserConfigurationException e) {
        LOG.warn("XML parser does not support secure processing", e);
    }
    String[] externalAccessFeatures = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd" };
    for (String feature : externalAccessFeatures) {
        try {
            factory.setFeature(feature, false);
        } catch (javax.xml.parsers.ParserConfigurationException e) {
            LOG.warn("XML parser does not support feature {}", feature, e);
        }
    }
}
示例4: execute
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Decides whether the incoming tuple holds a feed and routes it
 * accordingly.
 *
 * <p>A tuple counts as a feed when its metadata already carries
 * {@code isFeedKey} set to true, when its content type contains one of the
 * {@code mimeTypeClues}, or when {@code contentDetector} matches its
 * content; in the last two cases the flag is written to the metadata.
 * Feeds are not parsed: the parse filters are run on them and a
 * {@code FETCHED} status is emitted on the status stream. Any other tuple
 * is re-emitted unchanged. The tuple is acked in both cases.
 *
 * @param tuple input with the fields {@code url}, {@code content} and
 *              {@code metadata}
 */
@Override
public void execute(Tuple tuple) {
    Metadata md = (Metadata) tuple.getValueByField("metadata");
    byte[] bytes = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");

    boolean feed = Boolean.parseBoolean(md.getFirstValue(isFeedKey));

    // first clue: the content type returned by the server
    if (!feed) {
        String contentType = md.getFirstValue(HttpHeaders.CONTENT_TYPE);
        if (contentType != null) {
            for (String clue : mimeTypeClues) {
                if (!contentType.contains(clue)) {
                    continue;
                }
                feed = true;
                md.setValue(isFeedKey, "true");
                LOG.info("Feed detected from content type <{}> for {}",
                        contentType, url);
                break;
            }
        }
    }

    // second clue: the content itself
    if (!feed && contentDetector.matches(bytes)) {
        feed = true;
        md.setValue(isFeedKey, "true");
        LOG.info("Feed detected from content: {}", url);
    }

    if (!feed) {
        // pass on
        collector.emit(tuple, tuple.getValues());
    } else {
        // do not parse but run parse filters
        ParseResult parse = new ParseResult();
        parse.get(url).setMetadata(md);
        parseFilters.filter(url, bytes, null, parse);
        // emit status
        collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, md, Status.FETCHED));
    }
    collector.ack(tuple);
}
示例5: execute
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Decides whether the incoming tuple holds a sitemap (possibly a news
 * sitemap) and routes it accordingly.
 *
 * <p>Unless the metadata already flags the document as both a sitemap and
 * a news sitemap, {@code contentDetector} is asked for the first match in
 * the content. A non-negative result marks the document as a sitemap; a
 * result no greater than
 * {@code NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo}
 * additionally marks it as a news sitemap. Sitemaps are not parsed: the
 * parse filters are run on them and a {@code FETCHED} status is emitted on
 * the status stream. Any other tuple is re-emitted unchanged. The tuple is
 * acked in both cases.
 *
 * @param tuple input with the fields {@code url}, {@code content} and
 *              {@code metadata}
 */
@Override
public void execute(Tuple tuple) {
    Metadata md = (Metadata) tuple.getValueByField("metadata");
    byte[] bytes = tuple.getBinaryByField("content");
    String url = tuple.getStringByField("url");

    boolean sitemap = Boolean.parseBoolean(
            md.getFirstValue(SiteMapParserBolt.isSitemapKey));
    boolean newsSitemap = Boolean.parseBoolean(
            md.getFirstValue(NewsSiteMapParserBolt.isSitemapNewsKey));

    if (!(newsSitemap && sitemap)) {
        int firstMatch = contentDetector.getFirstMatch(bytes);
        if (firstMatch >= 0) {
            // a sitemap, not necessarily a news sitemap
            sitemap = true;
            md.setValue(SiteMapParserBolt.isSitemapKey, "true");
            boolean newsClue = firstMatch <= NewsSiteMapParserBolt.contentCluesSitemapNewsMatchUpTo;
            if (newsClue) {
                newsSitemap = true;
                LOG.info("{} detected as news sitemap based on content",
                        url);
                md.setValue(NewsSiteMapParserBolt.isSitemapNewsKey,
                        "true");
            }
        }
    }

    if (!sitemap) {
        // pass on
        collector.emit(tuple, tuple.getValues());
    } else {
        // do not parse but run parse filters
        ParseResult parse = new ParseResult();
        parse.get(url).setMetadata(md);
        parseFilters.filter(url, bytes, null, parse);
        // emit status
        collector.emit(Constants.StatusStreamName, tuple,
                new Values(url, md, Status.FETCHED));
    }
    collector.ack(tuple);
}
示例6: filter
import com.digitalpebble.stormcrawler.parse.ParseData; //導入依賴的package包/類
/**
 * Restricts the text of the document identified by {@code URL} to the
 * text content of the nodes selected by the first configured expression
 * that yields a non-blank result.
 *
 * <p>Expressions are tried in order. One that selects no node, or only
 * blank text, is skipped. On the first usable match the document text is
 * replaced by the text content of the selected nodes (one per line), the
 * label of the expression is stored in the metadata under
 * {@code MATCH_KEY}, and the method returns. An
 * {@code XPathExpressionException} is logged and the next expression is
 * tried.
 *
 * @param URL     key of the document within {@code parse}
 * @param content not used by this filter
 * @param doc     DOM fragment the expressions are evaluated against
 * @param parse   parse result holding the document's text and metadata
 */
@Override
public void filter(String URL, byte[] content, DocumentFragment doc,
        ParseResult parse) {
    ParseData pd = parse.get(URL);
    // TODO determine how to restrict the expressions e.g. regexp on URL
    // or value in metadata
    // iterates on the expressions - stops at the first that matches
    for (LabelledExpression expression : expressions) {
        try {
            NodeList evalResults = (NodeList) expression.evaluate(doc,
                    XPathConstants.NODESET);
            if (evalResults.getLength() == 0) {
                continue;
            }
            StringBuilder newText = new StringBuilder();
            for (int i = 0; i < evalResults.getLength(); i++) {
                Node node = evalResults.item(i);
                newText.append(node.getTextContent()).append("\n");
            }
            String restricted = newText.toString();
            // ignore if no text captured
            if (StringUtils.isBlank(restricted)) {
                LOG.debug(
                        "Found match for doc {} but empty text extracted - skipping",
                        URL);
                continue;
            }
            // The document may not have any text yet; the log arguments are
            // evaluated even when debug is off, so a null text must not
            // be dereferenced here.
            String previousText = pd.getText();
            int previousLength = previousText == null ? 0
                    : previousText.length();
            // give the doc its new text value
            LOG.debug(
                    "Restricted text for doc {}. Text size was {} and is now {}",
                    URL, previousLength, newText.length());
            pd.setText(restricted);
            pd.getMetadata().setValue(MATCH_KEY, expression.getLabel());
            return;
        } catch (XPathExpressionException e) {
            LOG.error("Caught XPath expression", e);
        }
    }
}