本文整理汇总了Java中org.htmlparser.util.NodeList.size方法的典型用法代码示例。如果您正苦于以下问题:Java NodeList.size方法的具体用法?Java NodeList.size怎么用?Java NodeList.size使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlparser.util.NodeList的用法示例。
在下文中一共展示了NodeList.size方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: parseDetailInfo
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Builds an id-to-text map from the link tags directly under the first node of {@code nodeList}.
 *
 * <p>Only children that are {@link LinkTag}s are considered. Links whose {@code id} attribute is
 * blank, or contains {@code "all"}, are skipped. The map key is the part of the id after its
 * first underscore (the whole id when it has no underscore); the value is the link's plain text.
 *
 * @param nodeList nodes to inspect; only the element at index 0 is read
 * @return the collected entries; empty when the list is empty or its first node has no children
 */
private Map<String, String> parseDetailInfo(NodeList nodeList) {
    Map<String, String> infoMap = Maps.newHashMap();
    if (nodeList.size() == 0) {
        return infoMap;
    }
    // getChildren() returns null for a node without children; treat that as "nothing to parse"
    // instead of failing with a NullPointerException outside the per-node try block.
    NodeList children = nodeList.elementAt(0).getChildren();
    if (children == null) {
        return infoMap;
    }
    for (Node pageNode : children.toNodeArray()) {
        try {
            if (!(pageNode instanceof LinkTag)) {
                continue;
            }
            String rawId = ((LinkTag) pageNode).getAttribute("id");
            if (StringUtils.isBlank(rawId) || rawId.contains("all")) {
                continue;
            }
            // indexOf returns -1 when there is no underscore, so the whole id is kept in that case.
            String id = rawId.substring(rawId.indexOf("_") + 1);
            infoMap.put(id, pageNode.toPlainTextString());
        } catch (Exception e) {
            // Best effort: a single malformed link must not abort the remaining ones.
            log.error("parse parseDetailInfo catch Exception:", e);
        }
    }
    return infoMap;
}
示例2: run
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Parses the HTML held in {@code content} and passes every child node that itself has children
 * to {@code toObject}. The store is always closed when parsing ends, whether or not it failed.
 */
@Override
public void run() {
    try {
        parser = new Parser(content);
        logger.info(currentThread().getName() + "开始解析Post请求响应的HTML!,并存储到HBASE中!");
        NodeIterator rootList = parser.elements();
        // The first top-level node is discarded; the children of the second one are processed.
        rootList.nextNode();
        NodeList nodeList = rootList.nextNode().getChildren();
        /*
         * Per the original author's note: this check decides whether the HTML response has any
         * concrete content. It fires on errors or once all data has been read; when it fires,
         * endFlag is raised so that no new threads are started and the current task ends.
         */
        if (nodeList.size() <= 4) {
            program.endFlag = true;
        }
        /*
         * Locate the tag records and parse them: the first two children are dropped and the
         * remaining ones are visited in order.
         */
        nodeList.remove(0);
        nodeList.remove(0);
        SimpleNodeIterator childList = nodeList.elements();
        while (childList.hasMoreNodes()) {
            Node node = childList.nextNode();
            if (node.getChildren() != null) {
                toObject(node);
            }
        }
    } catch (Exception e) {
        // Pass the exception itself so the stack trace is kept; getMessage() alone is "null"
        // for e.g. a NullPointerException raised by a missing node or missing children.
        logger.error(currentThread().getName() + "解析HTML文件出现异常!\n"+e.getMessage()+"\n", e);
    } finally {
        logger.info(currentThread().getName() + "HTML文件解析结束!");
        store.close();
    }
}
示例3: parsePageInfo
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Crawls the number of pages of the current building.
 *
 * <p>The page is fetched from {@code url}; the count is read from the first {@code Span} child
 * of the first node whose {@code class} attribute is {@code "spagenext"}. Within that span's
 * plain text, the number is taken from between the first {@code "/"} and the character just
 * before {@code "总数"}. Spans whose text cannot be parsed are skipped.
 *
 * @param url address of the page to fetch
 * @param departmentInfo only used for the log line
 * @return the parsed page count, or 0 when no matching node, child or parsable span exists
 * @throws ParserException as declared by the HTML parser calls
 * @throws IOException presumably raised while opening the connection (confirm against
 *     {@code CommonHttpURLConnection.getURLConnection})
 */
public int parsePageInfo(String url, DepartmentInfo departmentInfo) throws ParserException, IOException {
    Parser parser = new Parser(CommonHttpURLConnection.getURLConnection(url));
    int page = 0;
    // Parse the page count.
    NodeFilter nodeFilter = new HasAttributeFilter("class", "spagenext");
    NodeList nodeList = parser.extractAllNodesThatMatch(nodeFilter);
    if (nodeList.size() == 0) {
        return page;
    }
    // getChildren() returns null for a node without children; there is nothing to read then.
    NodeList children = nodeList.elementAt(0).getChildren();
    if (children != null) {
        for (Node pageNode : children.toNodeArray()) {
            if (!(pageNode instanceof Span)) {
                continue;
            }
            String tmp = null;
            try {
                tmp = pageNode.toPlainTextString();
                page = Integer.parseInt(tmp.substring(tmp.indexOf("/") + 1, tmp.indexOf("总数") - 1).trim());
                break;
            } catch (RuntimeException e) {
                // Best effort: keep looking at later spans, but leave a trace of the failure
                // instead of swallowing it silently.
                log.warn("unable to parse page count from span text [{}]", tmp, e);
            }
        }
    }
    // The object is passed as-is so that a null departmentInfo is logged instead of throwing.
    log.info("get total page [{}] for department:[{}]", page, departmentInfo);
    return page;
}
示例4: parseFlashEmbedTag
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Copies the attributes of every usable EMBED tag in the given list into the flash object.
 *
 * <p>A node is used when it is a {@code Tag} that is either not an end tag or is an empty XML
 * tag (something like {@code <TAG />}). Every attribute of such a tag is offered to
 * {@code flashObjToFill.setNameValue}; rejected attributes are logged as warnings. Nodes that
 * are skipped are logged as warnings too.
 *
 * @param embeds the candidate EMBED nodes, may be {@code null}
 * @param flashObjToFill the flash object to fill in with data
 * @return the same {@code flashObjToFill} instance, updated in place
 */
@SuppressWarnings("unchecked")
private FlashEmbeddedObject parseFlashEmbedTag( NodeList embeds, final FlashEmbeddedObject flashObjToFill ) {
    if( embeds == null ) {
        logger.debug( "The list of embed-tag nodes is null" );
        return flashObjToFill;
    }

    logger.debug( "The number of embed-tag nodes is " + embeds.size() );
    for( int index = 0; index < embeds.size(); index++ ) {
        final Node candidate = embeds.elementAt( index );
        if( !( candidate instanceof Tag ) ) {
            logger.warn( "Encountered a EMBED node: " + candidate + " that is not an EMBED tag!" );
            continue;
        }

        final Tag tag = (Tag) candidate;
        //A pure end tag carries no attributes worth reading
        if( tag.isEndTag() && !tag.isEmptyXmlTag() ) {
            logger.warn( "Encountered an EMBED node: " + tag + " that is an end tag!" );
            continue;
        }

        logger.debug("Processing embed node's '" + tag + "' attributes");
        final Vector<Attribute> attributes = (Vector<Attribute>) tag.getAttributesEx();
        if( attributes == null ) {
            continue;
        }
        for( Attribute attribute : attributes ) {
            final String attrName = attribute.getName();
            final String attrValue = attribute.getValue();
            if( flashObjToFill.setNameValue( attrName, attrValue ) ) {
                logger.debug("Set the EMBED attribute, name='" + attrName + "' value='" + attrValue + "'");
            } else {
                logger.warn("An unknown EMBED attribute, name='" + attrName + "' value='" + attrValue + "'" );
            }
        }
    }
    return flashObjToFill;
}
示例5: extractTextByTextNode
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the plain text of every text node in {@code content} that
 * {@code isInformativeStricter} accepts.
 *
 * <p>Failures while parsing are logged and whatever was collected up to that point is returned.
 *
 * @param content HTML to parse, may be {@code null}
 * @return the accepted texts in document order, one element per paragraph; empty when
 *     {@code content} is {@code null}
 */
public static List<String> extractTextByTextNode(String content){
    List<String> doc = new ArrayList<String>(); // each element is one paragraph
    if (content == null) {
        return doc;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeFilter textFilter = new NodeClassFilter(TextNode.class);
        NodeList nodelist = parser.extractAllNodesThatMatch(textFilter);
        HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
        for (int i = 0; i < nodelist.size(); i++) {
            Node textnode = nodelist.elementAt(i);
            String text = textnode.toPlainTextString();
            // Trace every non-blank text node, accepted or not.
            if (text.trim().length() > 0) {
                log.debug(i+": "+" content: "+text);
            }
            if (isInformativeStricter(textnode, parentWeight)) {
                log.debug(i+": "+" content: "+text);
                doc.add(text);
            }
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace ends up in the log rather than on
        // stderr via printStackTrace().
        log.error("Text extractor has encountered a problem!! "+e.getMessage(), e);
    }
    return doc;
}
示例6: extractTextByTagP
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the plain text of every {@code <p>} element in {@code content} that
 * {@code isInformative} accepts.
 *
 * <p>Failures while parsing are logged and whatever was collected up to that point is returned.
 *
 * @param content HTML to parse, may be {@code null}
 * @return the accepted texts in document order, one element per paragraph; empty when
 *     {@code content} is {@code null}
 */
public static List<String> extractTextByTagP(String content){
    List<String> doc = new ArrayList<String>(); // each element is one paragraph
    if (content == null) {
        return doc;
    }
    try {
        Parser parser = Parser.createParser(content, "utf8");
        TagNameFilter paraFilter = new TagNameFilter("p"); // get content between <p> </p>
        // NOTE(review): the original author flagged the next call with "reports an error!!";
        // the cause is not visible here.
        NodeList nodelist = parser.extractAllNodesThatMatch(paraFilter);
        HashMap<String,Integer> parentWeight = new HashMap<String,Integer>();
        for (int i = 0; i < nodelist.size(); i++) {
            Node textnode = nodelist.elementAt(i);
            String text = textnode.toPlainTextString();
            log.debug(i+": "+" content: "+text);
            if (isInformative(textnode, parentWeight)) {
                log.debug(i+": "+" content: "+text);
                doc.add(text);
            }
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace ends up in the log rather than on
        // stderr via printStackTrace().
        log.error("Text extractor has encountered a problem!! "+e.getMessage(), e);
    }
    return doc;
}
示例7: getLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Returns the link of every {@code LinkTag} found in the resource at {@code url}.
 *
 * @param url resource handed to the HTML parser
 * @return the links in document order
 * @throws ParserException if the parser cannot be created or the extraction fails
 */
public static List<String> getLinks(String url) throws ParserException {
    final NodeList anchors =
            new Parser(url).extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    final List<String> links = new LinkedList<String>();
    int index = 0;
    while (index < anchors.size()) {
        // The class filter only lets LinkTag instances through, so the cast is safe.
        final LinkTag anchor = (LinkTag) anchors.elementAt(index);
        links.add(anchor.getLink());
        index++;
    }
    return links;
}
示例8: processResponse
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Extracts the feed entries from an HTTP response and appends one element per entry to
 * {@code root}.
 *
 * @param resp the HTTP response to process
 * @param doc document used to create the new elements
 * @param root element that receives the extracted entries
 * @return {@code true} only when the status is OK, the page is at least 500 characters long,
 *     every entry was extracted and exactly 15 entries were found (presumably a full page,
 *     meaning more pages may follow — confirm with the caller); {@code false} otherwise
 */
private boolean processResponse(HttpResponse resp, Document doc, Element root) {
    if (resp.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
        return false;
    }
    System.out.println("[INFO] HTTP Status OK.");
    System.out.println("[INFO] Extracting html page...");

    final String page = extractHtml(resp);
    if (page == null) {
        return false;
    }
    System.out.println("[INFO] " + page.length() + "B html page extracted.");
    if (page.length() < 500) {
        System.out.println("[INFO] EOF reached, task completed.");
        return false;
    }

    System.out.println("[INFO] Parsing html page...");
    try {
        final NodeList entries = new Parser(page).extractAllNodesThatMatch(
                new HasAttributeFilter("action-type", "feed_list_item"));
        System.out.println("[INFO] " + entries.size() + " entries detected.");
        for (SimpleNodeIterator cursor = entries.elements(); cursor.hasMoreNodes(); ) {
            System.out.println("[INFO] processing entry #" + (++total) + "...");
            final Element entry = extractContent(cursor.nextNode(), doc);
            if (entry == null) {
                System.out.println("[ERROR] Data extraction failed.");
                return false;
            }
            root.appendChild(entry);
        }
        return entries.size() == 15;
    } catch (ParserException e) {
        System.out.println("[ERROR] Parser failed.");
        e.printStackTrace();
        return false;
    }
}
示例9: extractContent
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Converts one feed entry node into a DOM element.
 *
 * <p>The element is named {@code MID_<mid>}, carries a {@code time} attribute and a
 * {@code content} child, plus a {@code retweet} child (with a {@code from} attribute) when the
 * entry holds a second {@code WB_text} node. Afterwards {@code fetchComment} is invoked for the
 * entry.
 *
 * @param n the entry node; it is cast to {@code TagNode}
 * @param doc document used to create elements and attributes
 * @return the populated element; a {@code MID_NOT_FOUND} element when the node has no
 *     {@code mid} attribute, or an {@code UNRECOGNIZED_<mid>} element when the text or time
 *     node is missing
 */
private Element extractContent(Node n, Document doc) {
    final String mid = ((TagNode) n).getAttribute("mid");
    if (mid == null) {
        System.out.println("[ERROR] MID tag not found.");
        return doc.createElement("MID_NOT_FOUND");
    }

    final NodeList descendants = n.getChildren();
    final NodeList textNodes = descendants.extractAllNodesThatMatch(
            new HasAttributeFilter("class", "WB_text"), true);
    final NodeList timeNodes = descendants.extractAllNodesThatMatch(
            new HasAttributeFilter("class", "S_link2 WB_time"), true);
    if (textNodes.size() == 0 || timeNodes.size() == 0) {
        System.out.println("[ERROR] No identifiers found for Weibo No." + mid + ".");
        return doc.createElement("UNRECOGNIZED_" + mid);
    }

    final Element entry = doc.createElement("MID_" + mid);
    final Attr timeAttr = doc.createAttribute("time");
    timeAttr.setNodeValue(timeNodes.elementAt(0).getChildren().asString());
    entry.setAttributeNode(timeAttr);

    final Element body = doc.createElement("content");
    body.setTextContent(textNodes.elementAt(0).getChildren().asString());
    entry.appendChild(body);

    // A second WB_text node is handled as the retweeted post.
    if (textNodes.size() > 1) {
        final Node retweetText = textNodes.elementAt(1);
        final Element retweet = doc.createElement("retweet");
        final Attr origin = doc.createAttribute("from");
        // The origin is read from the second child of the second child of the retweet's parent.
        final Node originHolder = retweetText.getParent().getFirstChild().getNextSibling();
        final Node originNode = originHolder.getFirstChild().getNextSibling();
        origin.setNodeValue(originNode.toPlainTextString());
        retweet.setAttributeNode(origin);
        retweet.setTextContent(retweetText.getChildren().asString());
        entry.appendChild(retweet);
    }

    fetchComment(mid, doc, entry);
    System.out.println("[INFO] Weibo No." + mid + " processed.");
    return entry;
}
示例10: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the links found on a web page: the target of every {@code <a>} tag plus the
 * {@code src} of every node whose text starts with {@code frame src=}.
 *
 * <p>Parser failures are logged and whatever was collected up to that point is returned.
 *
 * @param url address of the page to parse
 * @return the collected links, possibly empty
 */
public static Set<String> extracLinks(String url) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");
        // Filter for <frame> tags, used to pick up the link held in the frame's src attribute.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter that accepts both <a> tags and <frame> tags.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                links.add(((LinkTag) tag).getLink());
                continue;
            }
            // Frame node: pull the src value out of text such as: frame src="test.html"
            String frame = tag.getText();
            int start = frame.indexOf("src=");
            if (start < 0) {
                continue;
            }
            String value = frame.substring(start + "src=".length());
            String frameUrl;
            if (!value.isEmpty() && (value.charAt(0) == '"' || value.charAt(0) == '\'')) {
                // Quoted value: ends at the matching quote. The tag text carries no closing
                // '>', so a src that is the last attribute must not be located via " " or ">".
                int close = value.indexOf(value.charAt(0), 1);
                frameUrl = close == -1 ? value.substring(1) : value.substring(1, close);
            } else {
                // Unquoted value: ends at the first whitespace character or the end of the text.
                int end = 0;
                while (end < value.length() && !Character.isWhitespace(value.charAt(end))) {
                    end++;
                }
                frameUrl = value.substring(0, end);
            }
            links.add(frameUrl);
        }
    } catch (ParserException e) {
        logger.error("", e);
    }
    return links;
}
示例11: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Collects the links on a page that {@code filter} accepts: the target of every {@code <a>} tag
 * plus the {@code src} of every node whose text starts with {@code frame src=}.
 *
 * <p>The resulting set is also printed to standard output. Parser failures are printed and
 * whatever was collected up to that point is returned.
 *
 * @param url address of the page to parse
 * @param filter decides which links are kept
 * @return the accepted links, possibly empty
 */
public static Set<String> extracLinks(String url, LinkFilter filter) {
    Set<String> links = new HashSet<String>();
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("gb2312");
        // Filter for <frame> nodes.
        @SuppressWarnings("serial")
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // Accept <a> tags as well as <frame> nodes.
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(
                LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a>
                String linkUrl = ((LinkTag) tag).getLink();
                if (filter.accept(linkUrl)) {
                    links.add(linkUrl);
                }
                continue;
            }
            // Frame node: pull the src value out of text such as: frame src="test.html"
            String frame = tag.getText();
            int start = frame.indexOf("src=");
            if (start < 0) {
                continue;
            }
            String value = frame.substring(start + "src=".length());
            String frameUrl;
            if (!value.isEmpty() && (value.charAt(0) == '"' || value.charAt(0) == '\'')) {
                // Quoted value: ends at the matching quote. The tag text carries no closing
                // '>', so a src that is the last attribute must not be located via " " or ">".
                int close = value.indexOf(value.charAt(0), 1);
                frameUrl = close == -1 ? value.substring(1) : value.substring(1, close);
            } else {
                // Unquoted value: ends at the first whitespace character or the end of the text.
                int end = 0;
                while (end < value.length() && !Character.isWhitespace(value.charAt(end))) {
                    end++;
                }
                frameUrl = value.substring(0, end);
            }
            if (filter.accept(frameUrl)) {
                links.add(frameUrl);
            }
        }
        System.out.println(links);
    } catch (ParserException e) {
        e.printStackTrace();
    }
    return links;
}
示例12: parseTheEmbeddedObject
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Parses the embedded object, builds the Flash embedded object out of it and returns that
 * object serialized as a string.
 *
 * <p>When one or two OBJECT tags are present they are parsed; otherwise the EMBED tags are
 * parsed, provided there are at most two of them (one opening and one closing tag).
 *
 * <p>NOTE(review): the earlier documentation stated both a "sameDomain" and a "never" security
 * level for the created Flash object; which one {@code FlashEmbeddedObject} applies cannot be
 * told from this method — confirm.
 *
 * @param textToParse the text to parse
 * @return the filtered, verified and completed Flash embedding code
 * @throws MessageException if the text cannot be parsed, has an improper number of tags, or
 *     does not describe a valid Flash animation
 */
private String parseTheEmbeddedObject( final String textToParse ) throws MessageException {
    try {
        logger.debug("Trying to parse the found message-embedded object: " + textToParse );
        final NodeList nodes = new Parser( new Lexer( textToParse ) ).parse( null );
        final NodeList objects = nodes.extractAllNodesThatMatch( new TagNameFilter( FlashEmbeddedObject.OBJECT_TAG_NAME ) );

        /* Per the original author: the Flash object is restricted to prevent Flash injection,
           so that a user may point to XCure itself but not to an external flash whose getURL
           executes malicious JavaScript that, e.g., reads the user's session cookies */
        final FlashEmbeddedObject flashObject = new FlashEmbeddedObject( xcureDomainPattern );

        final int objectCount = objects.size();
        if( objectCount >= 1 && objectCount <= 2 ) {
            //OBJECT tags are present, parse them
            parseFlashObjectTag( objects, flashObject );
        } else {
            //Otherwise fall back to the EMBED tags
            final NodeList embeds = nodes.extractAllNodesThatMatch( new TagNameFilter( FlashEmbeddedObject.EMBED_TAG_NAME ) );
            if( embeds.size() > 2 ) {
                logger.error("An improper number of the object (" + objects.size() +
                        ") and embed (" + embeds.size() + ") tags in the string: " + textToParse);
                throw new MessageException( MessageException.IMPROPER_EMBEDDED_OBJECT );
            }
            //At most two EMBED tags: one is the open and another is the close tag
            parseFlashEmbedTag( embeds, flashObject );
        }

        //Reject anything that did not turn out to be a valid flash object
        if( !flashObject.isValidEmbedFlash() ) {
            logger.error( "The parsed embedded object '" + textToParse +
                    "' was not recognized as a valid flash animation, we got:" + flashObject.toString() );
            throw new MessageException( MessageException.IMPROPER_EMBEDDED_OBJECT );
        }

        //Complete the flash object and serialize it
        flashObject.completeEmbedFlash();
        return flashObject.toString();
    } catch( Exception e ) {
        //Exceptions thrown above land here as well and are reported once more
        logger.error("Unable to parse the embedded object from the user's message: " + textToParse, e);
        throw new MessageException( MessageException.IMPROPER_EMBEDDED_OBJECT );
    }
}
示例13: extracLinks
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Prints the links, images and frame sources found on a web page to standard output.
 *
 * <p>For {@code <a>} tags the link and its text are printed, for {@code <img>} tags the image
 * URL and the tag text, and for nodes whose text starts with {@code frame src=} the value of
 * {@code src}. Parser failures are printed and end the processing.
 *
 * @param url address of the page to parse
 */
public static void extracLinks(String url) {
    try {
        Parser parser = new Parser(url);
        parser.setEncoding("utf-8");// gb2312
        // Filter for <frame> tags, used to pick up the link held in the frame's src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        // OrFilter accepting <a> tags, <img> tags and <frame> tags; the three are OR-ed.
        OrFilter orFilter = new OrFilter(
                new NodeClassFilter(LinkTag.class), new NodeClassFilter(
                        ImageTag.class));
        OrFilter linkFilter = new OrFilter(orFilter, frameFilter);
        // All nodes that pass the filter.
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) { // <a> tag
                LinkTag link = (LinkTag) tag;
                String linkUrl = link.getLink();// url
                String text = link.getLinkText();// link text
                System.out.println(linkUrl + "**********" + text);
            } else if (tag instanceof ImageTag) { // <img> tag
                ImageTag image = (ImageTag) tag;
                System.out.print(image.getImageURL() + "********");// image address
                System.out.println(image.getText());// image text
            } else { // <frame> tag
                // Pull the src value out of text such as: frame src="test.html"
                String frame = tag.getText();
                int start = frame.indexOf("src=");
                if (start < 0) {
                    continue;
                }
                String value = frame.substring(start + "src=".length());
                if (!value.isEmpty() && (value.charAt(0) == '"' || value.charAt(0) == '\'')) {
                    // Quoted value: ends at the matching quote. The tag text carries no closing
                    // '>', so a src that is the last attribute must not be located via " " or ">".
                    int close = value.indexOf(value.charAt(0), 1);
                    frame = close == -1 ? value.substring(1) : value.substring(1, close);
                } else {
                    // Unquoted value: ends at the first whitespace character or the end of the text.
                    int end = 0;
                    while (end < value.length() && !Character.isWhitespace(value.charAt(end))) {
                        end++;
                    }
                    frame = value.substring(0, end);
                }
                System.out.println(frame);
            }
        }
    } catch (ParserException e) {
        e.printStackTrace();
    }
}
示例14: extractLink
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Queues the links of {@code content} whose link text matches {@code keyword}.
 *
 * <p>A link counts as valid when {@code containKeyword} accepts its text. Per the original
 * author's note, {@code keyword} may be a group of words or a phrase, and retrieved content may
 * match only some of those words. Matching links that lie more than {@code disThre} positions
 * after the previously accepted one are treated as noise, and links not starting with
 * {@code "http"} are skipped. Accepted links are passed to {@code LinkDb.addUnvisitedUrl}.
 *
 * <p>Failures are logged; links queued before the failure stay queued.
 *
 * @param content HTML to parse
 * @param keyword keyword(s) that a link text has to match
 */
public static void extractLink(String content, String keyword) {
    try {
        Parser parser = Parser.createParser(content, "utf8");
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        NodeList nodelist = parser.extractAllNodesThatMatch(linkFilter);
        // Index of the last node accepted as a valid link.
        // NOTE(review): 0 doubles as "none accepted yet", so a first valid link at index 0 never
        // activates the distance check below — confirm whether that is intended.
        int lastNodeID = 0;
        // Valid links of a search result usually have consecutive ids, which helps to tell them
        // apart from advertisements.
        int disThre = 8;
        for (int i = 0; i < nodelist.size(); i++) {
            // The class filter only lets LinkTag instances through, so the cast is safe.
            LinkTag link = (LinkTag) nodelist.elementAt(i);
            String linkUrl = link.getLink();// url
            String text = link.getLinkText();// link text
            if (!containKeyword(text, keyword)) {
                continue;
            }
            if (lastNodeID > 0 && i - lastNodeID > disThre) {
                log.debug("Noisy link!!!");
                continue;
            }
            if (!linkUrl.startsWith("http")) {
                continue;
            }
            log.debug(i+":"+linkUrl+", "+text);
            lastNodeID = i;
            LinkDb.addUnvisitedUrl(linkUrl);
        }
    } catch (Exception e) {
        // Hand the exception to the logger so the stack trace ends up in the log rather than on
        // stderr via printStackTrace().
        log.error("Link extractor has encountered a problem!! "+e.getMessage(), e);
    }
}
示例15: filter
import org.htmlparser.util.NodeList; //导入方法依赖的package包/类
/**
 * Builds a translator from the attribute table rows of an Android reference page.
 *
 * <p>Rows are the {@code <tr>} elements whose {@code class} attribute is either
 * {@code "alt-color api apilevel-"} or {@code " api apilevel-"}. Each row must have exactly
 * seven child nodes; the plain text of child 1 (with line breaks removed) is the attribute and
 * the plain text of child 3 is the method.
 *
 * @param content HTML of the reference page
 * @return a translator holding one attribute/method pair per row
 * @throws AndroidDocException with {@code ATM_FORMAT_ERROR} when a row has no children or not
 *     exactly seven of them, or with {@code AXML_FORMAT_ERROR} when the parser fails
 */
private AX2JClassTranslator filter(String content) {
    try {
        Parser parser = Parser.createParser(content, Config.ENCODE);
        AndFilter andFilter1 =
                new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class","alt-color api apilevel-"));
        AndFilter andFilter2 =
                // Careful: the value " api apilevel-" really starts with a space.
                new AndFilter(new TagNameFilter("tr"), new HasAttributeFilter("class"," api apilevel-"));
        OrFilter orFilter = new OrFilter(andFilter1, andFilter2);
        NodeList tableNodeList = parser.parse(orFilter);
        NodeIterator tableIt = tableNodeList.elements();
        AX2JClassTranslator map = new AX2JClassTranslator(type);
        while (tableIt.hasMoreNodes()) {
            Node trNode = tableIt.nextNode();
            NodeList trNodeList = trNode.getChildren();
            /*
             * ***** trNodeList example *****
             * Txt (268[6,37],269[7,0]): \nTag (269[7,0],292[7,23]): td class="jd-linkcol"
             *   Tag (292[7,23],381[7,112]): a href="../../../reference/android/view/View.html...
             *     Txt (381[7,112],412[7,143]): android:accessibilityLiveRegion
             *     End (412[7,143],416[7,147]): /a
             *   End (416[7,147],421[7,152]): /td
             * Txt (421[7,152],422[8,0]): \nTag (422[8,0],445[8,23]): td class="jd-linkcol"
             *   Txt (445[8,23],446[9,0]): \n
             *   Tag (446[9,0],530[9,84]): a href="../../../reference/android/view/View.html#s...
             *     Txt (530[9,84],561[9,115]): setAccessibilityLiveRegion(int)
             *     End (561[9,115],565[9,119]): /a
             *   Txt (565[9,119],566[10,0]): \n
             *   End (566[10,0],571[10,5]): /td
             * Txt (571[10,5],572[11,0]): \nTag (572[11,0],609[11,37]): td class="jd-descrcol" width="100%"
             *   Txt (609[11,37],712[14,0]): \nIndicates to accessibility services whether the...
             *   End (712[14,0],717[14,5]): /td
             * Txt (717[14,5],718[15,0]): \n
             * ***** trNodeList example *****
             */
            // getChildren() returns null for an empty row; report it as a format error like any
            // other malformed row instead of failing with a NullPointerException.
            if (trNodeList == null || trNodeList.size() != 7) {
                throw new AndroidDocException(AndroidDocException.ATM_FORMAT_ERROR);
            }
            String attr = trNodeList.elementAt(1).toPlainTextString();
            attr = attr.replace("\n", "");
            String method = trNodeList.elementAt(3).toPlainTextString();
            map.add(attr, method);
        }
        return map;
    } catch (ParserException e) {
        throw new AndroidDocException(AndroidDocException.AXML_FORMAT_ERROR);
    }
}