本文整理汇总了Java中org.htmlparser.filters.OrFilter类的典型用法代码示例。如果您正苦于以下问题:Java OrFilter类的具体用法?Java OrFilter怎么用?Java OrFilter使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
OrFilter类属于org.htmlparser.filters包,在下文中一共展示了OrFilter类的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: readTextAndLinkAndTitle
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Prints the text nodes, link targets and title found in an HTML document.
 *
 * @param result the content of the web page
 * @throws Exception propagated from the parsing calls
 */
public static void readTextAndLinkAndTitle(String result) throws Exception {
    Parser htmlParser = Parser.createParser(result, "utf8");

    // Accept a node when it is a text node, a link tag or a title tag.
    OrFilter anyOfFilter = new OrFilter();
    anyOfFilter.setPredicates(new NodeFilter[] {
        new NodeClassFilter(TextNode.class),
        new NodeClassFilter(LinkTag.class),
        new NodeClassFilter(TitleTag.class)
    });

    NodeList matches = htmlParser.parse(anyOfFilter);

    // Declared outside the loop: a node matching none of the branches below
    // leaves the value from the previous iteration in place.
    String output = "";
    for (Node current : matches.toNodeArray()) {
        if (current instanceof TextNode) {
            output = ((TextNode) current).getText();
        } else if (current instanceof LinkTag) {
            output = ((LinkTag) current).getLink();
        } else if (current instanceof TitleTag) {
            output = ((TitleTag) current).getTitle();
        }
        // Only print values that isTrimEmpty does not report as empty.
        if (!isTrimEmpty(output)) {
            System.out.println(output);
        }
    }
}
示例2: extracLinks
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Collects the link targets found on a web page.
 *
 * <p>Both {@code <a>} tags and tags whose text starts with {@code frame src=}
 * are considered.
 *
 * @param url the resource handed to the parser
 * @return the extracted links; parser errors are logged and whatever was
 *         collected up to that point is returned
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that passed the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                links.add(((LinkTag) tag).getLink());
            } else {
                // <frame> tag, e.g. <frame src="test.html"/>. The filter guarantees
                // the text starts with "frame src=", so "src=" is always present.
                String frame = tag.getText();
                String value = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (value.startsWith("\"") || value.startsWith("'")) {
                    // Quoted value: runs up to the matching closing quote. The previous
                    // fixed-offset substring threw when nothing followed the value.
                    int close = value.indexOf(value.charAt(0), 1);
                    frameUrl = (close == -1) ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space, at '>' or at the end of the text.
                    int end = value.indexOf(' ');
                    if (end == -1) {
                        end = value.indexOf('>');
                    }
                    frameUrl = (end == -1) ? value : value.substring(0, end);
                }
                if (!frameUrl.isEmpty()) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
示例3: parserNode
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Analyses a node: text is segmented into words, and link tags contribute
 * their target to {@code urlInfo}.
 *
 * <p>Text that sits directly inside a {@code <script>} or {@code <style>} tag
 * is not segmented.
 *
 * @param node the node to analyse
 */
private void parserNode(Node node) {
    depth++;
    try {
        // Matches strings made only of whitespace/backspace characters (including the empty string).
        String regex = "[ \b\t\n\f\r]*";
        if (node instanceof TextNode) { // text node: run word segmentation
            if (depth == 1) {
                System.out.println("TextNode!");
                Lexer lexer = new Lexer(node.getPage());
                Parser parser = new Parser(lexer, Parser.STDOUT);
                // Reject <script> and <style> tags. The earlier form OR-ed two
                // NotFilters (true for every node) and used tag names with a
                // trailing space, which TagNameFilter never matches.
                NotFilter it = new NotFilter(
                        new OrFilter(new TagNameFilter("script"), new TagNameFilter("style")));
                try {
                    NodeList nl = parser.extractAllNodesThatMatch(it);
                    NodeIterator nit = nl.elements();
                    while (nit.hasMoreNodes()) {
                        Node n = nit.nextNode();
                        if (!(n instanceof TextNode)) {
                            continue;
                        }
                        // The tag filter alone does not drop the text children of
                        // script/style tags, so check the parent explicitly.
                        Node parent = n.getParent();
                        if (parent instanceof TagNode) {
                            String parentName = ((TagNode) parent).getTagName();
                            if ("script".equalsIgnoreCase(parentName)
                                    || "style".equalsIgnoreCase(parentName)) {
                                continue;
                            }
                        }
                        // Only segment text that is not blank according to the regex.
                        if (!(n.getText().matches(regex))) {
                            segment(n.getText()); // segment the page text
                        }
                    }
                } catch (ParserException exc) {
                    System.out.println("ParserException");
                }
            }
        } else if (node instanceof TagNode) {
            // Link tags additionally extend the set of outgoing URLs.
            if (node instanceof LinkTag) {
                LinkTag tag = (LinkTag) node;
                if (!(tag.getLink().matches(regex))) {
                    urlInfo.addExtendedURL(tag.getLink()); // record the outgoing link in urlInfo
                }
            }
            dealTag(node);
        }
    } finally {
        // Restore the depth counter even when segmentation or tag handling throws.
        depth--;
    }
}
示例4: extracLinks
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Collects the link targets found on a web page that the given filter accepts,
 * and prints the resulting set to standard output.
 *
 * <p>Both {@code <a>} tags and tags whose text starts with {@code frame src=}
 * are considered.
 *
 * @param url    the resource handed to the parser
 * @param filter decides which extracted URLs are kept
 * @return the accepted links; on a parser error the stack trace is printed and
 *         whatever was collected up to that point is returned
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // Filter for <frame> tags.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a>
                String linkUrl = ((LinkTag) tag).getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
            } else {
                // <frame>: the filter guarantees the text starts with "frame src=",
                // so "src=" is always present.
                String frame = tag.getText();
                String value = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (value.startsWith("\"") || value.startsWith("'")) {
                    // Quoted value: runs up to the matching closing quote. The previous
                    // fixed-offset substring threw when nothing followed the value.
                    int close = value.indexOf(value.charAt(0), 1);
                    frameUrl = (close == -1) ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space, at '>' or at the end of the text.
                    int end = value.indexOf(' ');
                    if (end == -1) {
                        end = value.indexOf('>');
                    }
                    frameUrl = (end == -1) ? value : value.substring(0, end);
                }
                if (!frameUrl.isEmpty() && filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
        System.out.println(links);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
示例5: extracLinks
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Prints the links, images and frame sources found on a web page.
 *
 * <p>For each {@code <a>} tag the URL and link text are printed, for each
 * {@code <img>} tag the image URL and tag text, and for each tag whose text
 * starts with {@code frame src=} the value of its {@code src} attribute.
 *
 * @param url the resource handed to the parser
 */
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8"); // gb2312
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Nested OrFilters: a node passes when it is an <a>, an <img> or a <frame> tag.
        OrFilter orFilter = new OrFilter(
                new NodeClassFilter(LinkTag.class), new NodeClassFilter(ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        // All nodes that passed the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink(); // url
                String text = link.getLinkText(); // link text
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) { // <img> tag
                ImageTag image = (ImageTag) tag;
                System.out.print(image.getImageURL() + "********"); // image address
                System.out.println(image.getText()); // image tag text
            } else { // <frame> tag
                // e.g. <frame src="test.html"/>. The filter guarantees the text starts
                // with "frame src=", so "src=" is always present.
                String frame = tag.getText();
                String value = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (value.startsWith("\"") || value.startsWith("'")) {
                    // Quoted value: runs up to the matching closing quote. The previous
                    // fixed-offset substring threw when nothing followed the value.
                    int close = value.indexOf(value.charAt(0), 1);
                    frameUrl = (close == -1) ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space, at '>' or at the end of the text.
                    int end = value.indexOf(' ');
                    if (end == -1) {
                        end = value.indexOf('>');
                    }
                    frameUrl = (end == -1) ? value : value.substring(0, end);
                }
                if (!frameUrl.isEmpty()) {
                    System.out.println(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
示例6: filter
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Builds an attribute-to-method translator from the table rows of a document.
 *
 * <p>Rows are {@code <tr>} tags whose {@code class} attribute is exactly
 * {@code "alt-color api apilevel-"} or {@code " api apilevel-"}. Each row must
 * have seven child nodes; the plain text of child 1 is the attribute name and
 * the plain text of child 3 is the method.
 *
 * @param content the HTML content to parse
 * @return the translator filled with one entry per matching row
 * @throws AndroidDocException with {@code ATM_FORMAT_ERROR} when a row does not
 *         have the expected seven children, or with {@code AXML_FORMAT_ERROR}
 *         when the parser fails
 */
private AX2JClassTranslator filter(String content) {
    try {
        Parser parser = Parser.createParser(content, Config.ENCODE);
        AndFilter andFilter1 =
                new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", "alt-color api apilevel-"));
        // The leading space in " api apilevel-" is intentional: the attribute value
        // in the document starts with a space.
        AndFilter andFilter2 =
                new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class", " api apilevel-"));
        OrFilter orFilter = new OrFilter(andFilter1, andFilter2);
        NodeList tableNodeList = parser.parse(orFilter);
        NodeIterator tableIt = tableNodeList.elements();
        AX2JClassTranslator map = new AX2JClassTranslator(type);
        while (tableIt.hasMoreNodes()) {
            Node trNode = tableIt.nextNode();
            NodeList trNodeList = trNode.getChildren();
            /*
             * ***** trNodeList example *****
             * Txt (268[6,37],269[7,0]): \nTag (269[7,0],292[7,23]): td class="jd-linkcol"
             * Tag (292[7,23],381[7,112]): a href="../../../reference/android/view/View.html...
             * Txt (381[7,112],412[7,143]): android:accessibilityLiveRegion
             * End (412[7,143],416[7,147]): /a
             * End (416[7,147],421[7,152]): /td
             * Txt (421[7,152],422[8,0]): \nTag (422[8,0],445[8,23]): td class="jd-linkcol"
             * Txt (445[8,23],446[9,0]): \n
             * Tag (446[9,0],530[9,84]): a href="../../../reference/android/view/View.html#s...
             * Txt (530[9,84],561[9,115]): setAccessibilityLiveRegion(int)
             * End (561[9,115],565[9,119]): /a
             * Txt (565[9,119],566[10,0]): \n
             * End (566[10,0],571[10,5]): /td
             * Txt (571[10,5],572[11,0]): \nTag (572[11,0],609[11,37]): td class="jd-descrcol" width="100%"
             * Txt (609[11,37],712[14,0]): \nIndicates to accessibility services whether the...
             * End (712[14,0],717[14,5]): /td
             * Txt (717[14,5],718[15,0]): \n
             * ***** trNodeList example *****
             */
            // getChildren() yields null for a row without children; report that as
            // the same format error instead of failing with a NullPointerException.
            if (trNodeList == null || trNodeList.size() != 7) {
                throw new AndroidDocException(AndroidDocException.ATM_FORMAT_ERROR);
            }
            String attr = trNodeList.elementAt(1).toPlainTextString();
            attr = attr.replace("\n", "");
            String method = trNodeList.elementAt(3).toPlainTextString();
            map.add(attr, method);
        }
        return map;
    } catch (ParserException e) {
        // NOTE(review): the ParserException is dropped here; attach it as the cause
        // if AndroidDocException offers a constructor for that.
        throw new AndroidDocException(AndroidDocException.AXML_FORMAT_ERROR);
    }
}
示例7: extracLinks
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Extracts the links of a web page and adds them to the queue of unvisited URLs.
 *
 * <p>{@code <a>} targets starting with {@code http} are used as they are, and
 * targets starting with {@code /} are prefixed with {@code Main.baseUrl}; both
 * must be accepted by the filter together with {@code Main.keyWord}. The
 * {@code src} of tags whose text starts with {@code frame src=} is checked with
 * the single-argument {@code accept}.
 *
 * @param content the page resource handed to the parser
 * @param filter  used to decide which links are kept
 * @return the set of accepted links; on a parser error the stack trace is
 *         printed and whatever was collected up to that point is used
 * @author cxn 2015-11-05
 */
public static Set<String> extracLinks(String content, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(content);
        // parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that passed the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                String linkUrl = ((LinkTag) tag).getLink();
                if (linkUrl.startsWith("http") && filter.accept(linkUrl, Main.keyWord)) {
                    links.add(linkUrl);
                } else if (linkUrl.startsWith("/") && filter.accept(Main.baseUrl + linkUrl, Main.keyWord)) {
                    links.add(Main.baseUrl + linkUrl);
                }
            } else {
                // <frame> tag, e.g. <frame src="test.html"/>. The filter guarantees
                // the text starts with "frame src=", so "src=" is always present.
                String frame = tag.getText();
                String value = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (value.startsWith("\"") || value.startsWith("'")) {
                    // Quoted value: runs up to the matching closing quote. The previous
                    // fixed-offset substring threw when nothing followed the value.
                    int close = value.indexOf(value.charAt(0), 1);
                    frameUrl = (close == -1) ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space, at '>' or at the end of the text.
                    int end = value.indexOf(' ');
                    if (end == -1) {
                        end = value.indexOf('>');
                    }
                    frameUrl = (end == -1) ? value : value.substring(0, end);
                }
                if (!frameUrl.isEmpty() && filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    LinkQueue.addUnvisitedUrl(links);
    return links;
}
示例8: content
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Action that loads one event into {@code en}, using the "News" cache.
 *
 * <p>On a cache miss the event page is fetched, the plain text of its
 * {@code <h1>} becomes the title and the HTML of its {@code <table>} becomes
 * the content. If the response contains an {@code imageurl} entry followed by a
 * single-quoted value, the image URL is recorded in the "CrawlerImages"
 * collection under the MD5 digest of the URL. The result is cached only when
 * it has content.
 *
 * @return {@code SUCCESS}
 */
@Action(value = "eventcontent", results = { @Result(type = "json", params = {
        "root", "en" }) })
public String content() {
    Cache c = CacheManager.getInstance().getCache("News");
    String ckey = "eventcontent" + newsid;
    Element ele = c.get(ckey);
    if (!CommonUtil.isEmpty(ele)) {
        en = (News) ele.getObjectValue();
    } else {
        // The current time is appended to the query string on every request.
        StringBuffer retstr = fetch(RD + "/calendar/?a=one&evid="
                + newsid + "&_=" + System.currentTimeMillis());
        String raw = retstr.toString();
        Parser p = Parser.createParser(raw, "utf-8");
        try {
            NodeList nl = p.extractAllNodesThatMatch(new OrFilter(
                    new TagNameFilter("h1"), new TagNameFilter("table")));
            SimpleNodeIterator i = nl.elements();
            en = new News();
            en.setId(newsid);
            while (i.hasMoreNodes()) {
                Node n = i.nextNode();
                if (n instanceof TagNode) {
                    TagNode tn = (TagNode) n;
                    if (tn.getTagName().equalsIgnoreCase("h1")) {
                        en.setTitle(tn.toPlainTextString());
                    }
                    if (tn.getTagName().equalsIgnoreCase("table")) {
                        en.setContent(tn.toHtml());
                    }
                }
            }
            // Locate the single-quoted value after "imageurl". All indices are taken
            // from, and applied to, the same string; previously they were computed on
            // the untrimmed buffer but applied to a trimmed copy, which shifted the
            // result whenever the response started with whitespace.
            int keyPos = raw.indexOf("imageurl");
            if (keyPos >= 0) {
                int open = raw.indexOf("'", keyPos);
                int close = (open == -1) ? -1 : raw.indexOf("'", open + 1);
                // Skip the image when the quotes are missing instead of failing in substring.
                if (close != -1) {
                    String imgurl = RD + raw.substring(open + 1, close);
                    String imgid = EncodeHelper.digest(imgurl, "MD5");
                    BasicDBObject obj = new BasicDBObject("id", imgid);
                    DBCollection col = MongoUtil.getInstance().getDB()
                            .getCollection("CrawlerImages");
                    DBObject dbo = col.findOne(obj);
                    if (dbo == null) {
                        col.save(obj.append("url", imgurl));
                    }
                    // NOTE(review): the image id is stored in the pubdate field — confirm this is intended.
                    en.setPubdate(imgid);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        if (!CommonUtil.isEmpty(en) && !CommonUtil.isEmpty(en.getContent())) {
            c.put(new Element(ckey, en));
        }
    }
    return SUCCESS;
}
示例9: extracLinks
import org.htmlparser.filters.OrFilter; //导入依赖的package包/类
/**
 * Collects the links of a fetched page that the given filter accepts.
 *
 * <p>Both {@code <a>} tags and tags whose text starts with {@code frame src=}
 * are considered.
 *
 * @param pageResult supplies the page content and the character set for the parser
 * @param filter     decides which extracted URLs are kept
 * @return the accepted links; on a parser error the stack trace is printed and
 *         whatever was collected up to that point is returned
 */
public Set<String> extracLinks(PageResult pageResult, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(pageResult.getContent());
        parser.setEncoding(pageResult.getCharSet());
        // Filter for <frame> tags, used to pick up the link held in their src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting either <a> tags or <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that passed the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                String linkUrl = ((LinkTag) tag).getLink(); // url
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
            } else { // <frame> tag
                // e.g. <frame src="test.html"/>. The filter guarantees the text starts
                // with "frame src=", so "src=" is always present.
                String frame = tag.getText();
                String value = frame.substring(frame.indexOf("src=") + 4);
                String frameUrl;
                if (value.startsWith("\"") || value.startsWith("'")) {
                    // Quoted value: runs up to the matching closing quote. The previous
                    // fixed-offset substring threw when nothing followed the value.
                    int close = value.indexOf(value.charAt(0), 1);
                    frameUrl = (close == -1) ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first space, at '>' or at the end of the text.
                    int end = value.indexOf(' ');
                    if (end == -1) {
                        end = value.indexOf('>');
                    }
                    frameUrl = (end == -1) ? value : value.substring(0, end);
                }
                if (!frameUrl.isEmpty() && filter.accept(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}