当前位置: 首页>>代码示例>>Java>>正文


Java TagNode.evaluateXPath方法代码示例

本文整理汇总了Java中org.htmlcleaner.TagNode.evaluateXPath方法的典型用法代码示例。如果您正苦于以下问题：Java TagNode.evaluateXPath方法的具体用法？Java TagNode.evaluateXPath怎么用？Java TagNode.evaluateXPath使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.htmlcleaner.TagNode的用法示例。


在下文中一共展示了TagNode.evaluateXPath方法的9个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: processFollow

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Parses a "follow" page (following and followers) and passes the content of every
 * result page to {@code extractUserUrl}.
 *
 * <p>The first page is fetched from {@code followUrl}. The total number of pages is read
 * from the second-to-last pagination button inside the {@code Profile-following} element
 * (presumably the last button is a "next" control - confirm against the page markup).
 * Pages 2..N are then fetched from {@code followUrl + "?page=" + i}.
 *
 * <p>XPath failures and a non-numeric page count are logged and end the processing; they
 * are not propagated to the caller.
 *
 * @param followUrl URL of the first follow page
 */
public static void processFollow(String followUrl) {
	String content = PageUtil.getContent(followUrl);
	HtmlCleaner htmlCleaner = new HtmlCleaner();
	TagNode tNode = htmlCleaner.clean(content);
	extractUserUrl(content);
	try {
		Object[] pageNumObj = tNode
				.evaluateXPath("//*[@id=\"Profile-following\"]//div[@class=\"Pagination\"]/button");
		// At least two buttons are required: with a single match, length - 2 would be -1.
		if (pageNumObj != null && pageNumObj.length >= 2) {
			TagNode node = (TagNode) pageNumObj[pageNumObj.length - 2];
			// Trim first: surrounding whitespace in the button text would make parseInt fail.
			int pagenum = Integer.parseInt(node.getText().toString().trim());
			for (int i = 2; i <= pagenum; i++) {
				String url = followUrl + "?page=" + i;
				content = PageUtil.getContent(url);
				extractUserUrl(content);
			}
		}
	} catch (XPatherException e) {
		logger.error(e.getMessage());
	} catch (NumberFormatException e) {
		// The button did not contain a plain number; only the first page was processed.
		logger.error("Unable to parse page count for " + followUrl + ": " + e.getMessage());
	}
}
 
开发者ID:monsonlee,项目名称:BigData,代码行数:27,代码来源:UserUtil.java

示例2: getTextFromHtmlString

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Extracts the text of the {@code td} cells of the last {@code table} element found in
 * an HTML string.
 *
 * <p>The HTML is cleaned with HtmlCleaner, all {@code table} elements are located via
 * XPath, and the text of every {@code td} descendant of the last table is concatenated
 * in the order returned. A cell whose trimmed text is exactly {@code &nbsp;} contributes
 * a single space instead.
 *
 * @param htmlString {@link String} the HTML to extract text from
 * @return {@link String} the concatenated cell text; empty when no table is found or the
 *         XPath evaluation fails (the failure is logged)
 */
public static String getTextFromHtmlString(String htmlString) {
	CleanerProperties cleanerProps = new CleanerProperties();
	// set some properties to non-default values
	cleanerProps.setTransResCharsToNCR(true);
	cleanerProps.setTranslateSpecialEntities(true);
	cleanerProps.setOmitComments(true);
	cleanerProps.setOmitDoctypeDeclaration(true);
	cleanerProps.setOmitXmlDeclaration(true);
	cleanerProps.setUseEmptyElementTags(true);

	HtmlCleaner cleaner = new HtmlCleaner(cleanerProps);
	TagNode tagNode = cleaner.clean(htmlString);

	// One builder for the whole method, so the accumulated text is not re-copied per cell.
	StringBuilder errorText = new StringBuilder();
	try {
		Object[] rootNode = tagNode.evaluateXPath("//table");
		if (null != rootNode && rootNode.length > 0) {
			TagNode lastTable = (TagNode) rootNode[rootNode.length - 1];
			TagNode[] textNode = lastTable.getElementsByName("td", true);
			for (TagNode tag : textNode) {
				if (tag == null) {
					continue;
				}
				// Read the cell text once instead of rebuilding it for every check.
				CharSequence cellText = tag.getText();
				if (cellText == null) {
					continue;
				}
				if (cellText.toString().trim().equals("&nbsp;")) {
					errorText.append(" ");
				} else {
					errorText.append(cellText);
				}
			}
		}
	} catch (XPatherException e) {
		LOGGER.error("Error extracting table node from html." + e.getMessage());
	}
	return errorText.toString();
}
 
开发者ID:kuzavas,项目名称:ephesoft,代码行数:43,代码来源:AbstractUploadFile.java

示例3: getUrlListByPath

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Evaluates an XPath expression against a node and collects the {@code href} attribute
 * of every match.
 *
 * <p>Every match is cast to {@link TagNode}. The attribute value is added exactly as
 * {@code getAttributeByName("href")} returns it, without any filtering.
 *
 * @param node  node the XPath expression is evaluated against
 * @param xpath XPath expression selecting the elements of interest
 * @return the {@code href} values of the matches, in match order
 * @throws IOException      declared by the signature; not raised by the visible code
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public  ArrayList<String> getUrlListByPath(TagNode node, String xpath) throws IOException, XPatherException
{
	Object[] matches = node.evaluateXPath(xpath);
	ArrayList<String> hrefs = new ArrayList<String>();

	for (int i = 0; i < matches.length; i++)
	{
		TagNode link = (TagNode) matches[i];
		String href = link.getAttributeByName("href");
		hrefs.add(href);
	}
	return hrefs;
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:13,代码来源:ExtractInfoWithHtmlCleaner.java

示例4: parsePageInfoByPath

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Evaluates an XPath expression against a node and concatenates the text of every match.
 *
 * <p>Every match is cast to {@link TagNode}; the texts are joined in match order with no
 * separator.
 *
 * @param node  node the XPath expression is evaluated against
 * @param xpath XPath expression selecting the elements of interest
 * @return the concatenated text of all matches; empty when nothing matches
 * @throws IOException      declared by the signature; not raised by the visible code
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public  String parsePageInfoByPath(TagNode node, String xpath) throws IOException, XPatherException
{
	Object[] ns = node.evaluateXPath(xpath);
	// A builder avoids re-copying the accumulated text for every matched node.
	StringBuilder result = new StringBuilder();
	for (Object object : ns)
	{
		TagNode dd = (TagNode) object;
		result.append(dd.getText());
	}
	return result.toString();
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:12,代码来源:ExtractInfoWithHtmlCleaner.java

示例5: parsePageInfoByPathandName

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Evaluates an XPath expression against a node and concatenates the value of one
 * attribute across all matches.
 *
 * <p>Every match is cast to {@link TagNode}; the attribute values are joined in match
 * order with no separator. Matches for which {@code getAttributeByName} returns
 * {@code null} are skipped, so the literal text "null" never ends up in the result.
 *
 * @param node  node the XPath expression is evaluated against
 * @param xpath XPath expression selecting the elements of interest
 * @param name  name of the attribute to read from every match
 * @return the concatenated attribute values; empty when nothing matches
 * @throws IOException      declared by the signature; not raised by the visible code
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public  String parsePageInfoByPathandName(TagNode node, String xpath,String name) throws IOException, XPatherException
{
	Object[] ns = node.evaluateXPath(xpath);
	// A builder avoids re-copying the accumulated text for every matched node.
	StringBuilder result = new StringBuilder();
	for (Object object : ns)
	{
		TagNode dd = (TagNode) object;
		String value = dd.getAttributeByName(name);
		if (value != null)
		{
			result.append(value);
		}
	}
	return result.toString();
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:12,代码来源:ExtractInfoWithHtmlCleaner.java

示例6: parsePageInfoByPathandIndex

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Evaluates an XPath expression against a node and returns the text of the match at a
 * given position.
 *
 * @param node  node the XPath expression is evaluated against
 * @param xpath XPath expression selecting the elements of interest
 * @param index zero-based position of the wanted match
 * @return the text of the match at {@code index}; empty when {@code index} is outside
 *         the range of matches (including when nothing matches)
 * @throws IOException      declared by the signature; not raised by the visible code
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public  String parsePageInfoByPathandIndex(TagNode node, String xpath,int index) throws IOException, XPatherException
{
	Object[] ns = node.evaluateXPath(xpath);
	String result = "";
	// Checking only "length > 0" let an out-of-range index through; validate the index itself.
	if (index >= 0 && index < ns.length)
	{
		TagNode dd = (TagNode) ns[index];
		result = result + dd.getText();
	}
	return result;
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:12,代码来源:ExtractInfoWithHtmlCleaner.java

示例7: parsePageInfoByPathandNameAndindex

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Evaluates an XPath expression against a node and returns one attribute of the match at
 * a given position.
 *
 * @param node  node the XPath expression is evaluated against
 * @param xpath XPath expression selecting the elements of interest
 * @param name  name of the attribute to read
 * @param index zero-based position of the wanted match
 * @return the attribute value of the match at {@code index}; empty when {@code index} is
 *         outside the range of matches or {@code getAttributeByName} returns {@code null}
 * @throws IOException      declared by the signature; not raised by the visible code
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public  String parsePageInfoByPathandNameAndindex(TagNode node, String xpath,String name,int index) throws IOException, XPatherException
{
	Object[] ns = node.evaluateXPath(xpath);
	String result = "";
	// Checking only "length > 0" let an out-of-range index through; validate the index itself.
	if (index >= 0 && index < ns.length)
	{
		TagNode dd = (TagNode) ns[index];
		String value = dd.getAttributeByName(name);
		// Concatenating a null value would produce the literal text "null".
		if (value != null)
		{
			result = value;
		}
	}
	return result;
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:12,代码来源:ExtractInfoWithHtmlCleaner.java

示例8: main

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Manual smoke run: downloads one fixed page, selects a paragraph by XPath, strips
 * entity remnants from its text and prints what {@code CommonUtil.getDateString}
 * extracts from it.
 *
 * <p>Uses the {@code cleaner} instance declared outside this method and configures its
 * properties before parsing.
 *
 * @param args command-line arguments; not used
 * @throws IOException      if the page cannot be downloaded or read
 * @throws XPatherException if the XPath expression cannot be evaluated
 */
public static void main(String[] args) throws IOException, XPatherException {
	CleanerProperties props = cleaner.getProperties();
	props.setUseCdataForScriptAndStyle(true);
	props.setRecognizeUnicodeChars(true);
	props.setUseEmptyElementTags(true);
	props.setAdvancedXmlEscape(true);
	props.setTranslateSpecialEntities(true);
	props.setBooleanAttributeValues("empty");

	URL url = new URL("http://www.haodf.com/wenda/anzhentaohong_g_638200415.htm");
	// The page is decoded as gb2312.
	TagNode node = cleaner.clean(url, "gb2312");
	Object[] ns = node.evaluateXPath("//*[@class=\"bb_d3 bl_d3 pb20\"]/div[3]/div[2]/p[2]");

	// One line of text per matched node.
	StringBuilder text = new StringBuilder();
	for (Object object : ns) {
		TagNode dd = (TagNode) object;
		text.append(dd.getText()).append("\n");
	}

	// Drop "&nbsp" remnants, carriage returns and semicolons before extracting the date.
	String result = text.toString().replace("&nbsp", "").replace("\r", "").replace(";", "");
	result = CommonUtil.getDateString(result, ".*?([0-9]+.[0-9]+.[0-9]+).*");

	System.out.print(result);
}
 
开发者ID:anphoenix,项目名称:data_crawler_generic,代码行数:40,代码来源:ExtractInfoWithHtmlCleaner.java

示例9: parserPerformanceTest

import org.htmlcleaner.TagNode; //导入方法依赖的package包/类
/**
 * Rough timing comparison of HtmlCleaner, Jsoup and Xsoup on the {@code html} fixture.
 *
 * <p>Prints the fixture length, then the elapsed milliseconds of 2000 iterations of each
 * step: HtmlCleaner parse and XPath {@code //a}, Jsoup parse and CSS {@code a} select,
 * HtmlCleaner parse and XPath again, and finally a precompiled Xsoup {@code //a}
 * evaluation. Groups are separated by a line of equals signs. Nothing is asserted.
 *
 * @throws XPatherException if HtmlCleaner cannot evaluate the XPath expression
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    System.out.println(html.length());

    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode tagNode = htmlCleaner.clean(html);
    Document document = Jsoup.parse(html);

    // HtmlCleaner, first pass: parse
    long startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        htmlCleaner.clean(html);
    }
    long elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // HtmlCleaner, first pass: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        tagNode.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Jsoup: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        Jsoup.parse(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // Jsoup: CSS select
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        document.select("a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // HtmlCleaner, second pass: parse
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        htmlCleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    // HtmlCleaner, second pass: XPath
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        tagNode.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    System.out.println("=============");

    // Xsoup: precompiled XPath evaluated on the Jsoup document
    XPathEvaluator compile = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < 2000; round++) {
        compile.evaluate(document);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

}
 
开发者ID:code4craft,项目名称:webmagic,代码行数:60,代码来源:XpathSelectorTest.java


注:本文中的org.htmlcleaner.TagNode.evaluateXPath方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。