当前位置: 首页>>代码示例>>Java>>正文


Java DOMParser.getDocument方法代码示例

本文整理汇总了Java中org.cyberneko.html.parsers.DOMParser.getDocument方法的典型用法代码示例。如果您正苦于以下问题:Java DOMParser.getDocument方法的具体用法?Java DOMParser.getDocument怎么用?Java DOMParser.getDocument使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.cyberneko.html.parsers.DOMParser的用法示例。


在下文中一共展示了DOMParser.getDocument方法的11个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: readHtmlDocument

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Reads and parses an HTML document from a location string.
 *
 * <p>The location is resolved with {@code FlexibleLocation.resolveLocation} and the
 * resulting URL is parsed by a NekoHTML {@code DOMParser} with SAX namespace
 * processing switched off.
 *
 * @param str the location of the HTML document
 * @return the parsed document, or {@code null} when the location cannot be resolved
 *         or any exception is raised while resolving/parsing (both cases are logged
 *         through {@code Debug.logError})
 */
public static Document readHtmlDocument(String str) {
    try {
        URL location = FlexibleLocation.resolveLocation(str);
        if (location == null) {
            Debug.logError("Unable to locate HTML document " + str, module);
            return null;
        }
        DOMParser htmlParser = new DOMParser();
        htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
        htmlParser.parse(location.toExternalForm());
        return htmlParser.getDocument();
    } catch (Exception e) {
        // Best-effort API: failures are reported in the log and surface as a null result.
        Debug.logError(e, "Error while reading HTML document " + str, module);
        return null;
    }
}
 
开发者ID:ilscipio,项目名称:scipio-erp,代码行数:18,代码来源:UelFunctions.java

示例2: main

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Manual demo: parses a sample HTML file from the test resources, wraps the resulting
 * DOM document in a {@link TreeNode}, calls {@code postOrderIndex} with a counter
 * starting at 1, and prints the output of {@code prettyPrint}, {@code postOrderTraverse}
 * and {@code getKeyRoots}.
 *
 * @param args ignored
 * @throws Exception if the sample file cannot be opened or parsed
 */
public static void main(String[] args) throws Exception {
    DOMParser parser = new DOMParser();

    String pathname = "src/test/resources/html/simple/3.html";
    // Own the reader explicitly so the file handle is released even if parsing fails,
    // rather than depending on the parser implementation to close it.
    try (FileReader reader = new FileReader(pathname)) {
        parser.parse(new InputSource(reader));
    }
    Document document = parser.getDocument();

    TreeNode node = new TreeNode(document, null);
    node.postOrderIndex(new AtomicInteger(1));
    node.prettyPrint();

    List<TreeNode> nodes = node.postOrderTraverse();
    System.out.println(nodes);

    System.out.println(node.getKeyRoots());
}
 
开发者ID:thammegowda,项目名称:autoextractor,代码行数:17,代码来源:TreeNode.java

示例3: getSource

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Builds the transformation {@link Source} for one input stream.
 *
 * <p>When the {@code html} flag is set, the stream is parsed with NekoHTML (namespaces
 * off, CDATA-section scanning on, noscript content parsing off, default encoding taken
 * from {@code sourcePath.getCharset()}, element-name mode {@code "match"} or
 * {@code "lower"} depending on {@code rawTagNames}) and returned as a {@link DOMSource}.
 * Otherwise a {@link StreamSource} over the raw stream is returned.
 *
 * @param ctx  processing context (unused)
 * @param file the stream to read
 * @return a DOM source for HTML input, a stream source otherwise
 * @throws SAXException                 if the HTML parse or the SAX set-up fails
 * @throws IOException                  if reading the stream fails
 * @throws ParserConfigurationException if the SAX parser cannot be created
 */
@TimeThis(task="read-file", category=TimerCategory.LOAD_RESOURCE)
protected Source getSource(@SuppressWarnings("unused") ProcessingContext<Corpus> ctx, InputStream file) throws SAXException, IOException, ParserConfigurationException {
	if (html) {
		String elementNameMode = rawTagNames ? "match" : "lower";
		DOMParser htmlParser = new DOMParser();
		htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
		htmlParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
		htmlParser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
		htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", sourcePath.getCharset());
		htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", elementNameMode);
		htmlParser.parse(new InputSource(file));
		return new DOMSource(htmlParser.getDocument());
	}

	final String loadExternalDtd = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
	SAXParserFactory factory = SAXParserFactory.newInstance();
	factory.setFeature(loadExternalDtd, false);
	org.xml.sax.XMLReader reader = factory.newSAXParser().getXMLReader();
	reader.setFeature(loadExternalDtd, false);
	// Resolve every external entity to an empty stream.
	reader.setEntityResolver(new EntityResolver() {
		@Override
		public InputSource resolveEntity(String publicId, String systemId) throws SAXException {
			return new InputSource(new ByteArrayInputStream(new byte[] {}));
		}
	});
	// NOTE(review): this SAXSource is built and immediately discarded, so the reader
	// configured above is not part of the returned Source. It looks like the SAXSource
	// may have been meant to be returned -- confirm with callers before changing.
	new SAXSource(reader, new InputSource(file));
	return new StreamSource(file);
}
 
开发者ID:Bibliome,项目名称:alvisnlp,代码行数:32,代码来源:XMLReader.java

示例4: getDoc

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses HTML from a byte stream into a DOM document with NekoHTML.
 *
 * <p>The stream is decoded with {@code encoding}, which is also registered as the
 * parser's default encoding; SAX namespace processing is disabled. The stream is not
 * closed by this method itself.
 *
 * @param in       the HTML bytes
 * @param encoding charset name used to decode {@code in}
 * @return the parsed document
 * @throws Exception if the encoding is unsupported or parsing fails
 */
public static Document getDoc(InputStream in, String encoding)
		throws Exception {
	DOMParser htmlParser = new DOMParser();
	htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding);
	htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);

	InputSource input = new InputSource(
			new BufferedReader(new InputStreamReader(in, encoding)));
	htmlParser.parse(input);

	return htmlParser.getDocument();
}
 
开发者ID:bdceo,项目名称:bd-codes,代码行数:14,代码来源:XpathTest.java

示例5: computeDistance

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Computes edit distance between two html files.
 *
 * <p>Both files are parsed with the same NekoHTML parser instance (reset between the
 * two parses); the resulting documents are wrapped in {@link TreeNode}s and handed to
 * a fresh {@code ZSTEDComputer}.
 *
 * @param file1 first html file
 * @param file2 second html file
 * @return edit distance measure
 * @throws IOException when an error occurs while opening, reading or closing a file
 * @throws SAXException when parser fails
 */
public static double computeDistance(File file1, File file2)
        throws IOException, SAXException {
    DOMParser domParser = new DOMParser();

    // Each reader is scoped to its own parse so the file handle is released
    // deterministically, even when parsing throws.
    Document doc1;
    try (FileReader reader1 = new FileReader(file1)) {
        domParser.parse(new InputSource(reader1));
        doc1 = domParser.getDocument();
    }

    domParser.reset();

    Document doc2;
    try (FileReader reader2 = new FileReader(file2)) {
        domParser.parse(new InputSource(reader2));
        doc2 = domParser.getDocument();
    }

    ZSTEDComputer computer = new ZSTEDComputer();
    return computer.computeDistance(new TreeNode(doc1, null), new TreeNode(doc2, null));
}
 
开发者ID:thammegowda,项目名称:autoextractor,代码行数:21,代码来源:ZSTEDComputer.java

示例6: VSMVector

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Builds a vector either from an HTML form or from a plain page file.
 *
 * <p>Form mode ({@code isForm == true}): {@code file} is treated as inline HTML when it
 * contains {@code "<form "} (case-insensitively), otherwise as a location passed
 * straight to the parser. The first {@code form} element of the parsed document is fed
 * to {@code parse(...)}, the collected text is lower-cased, wrapped in a
 * {@code PaginaURL} with the placeholder URL {@code http://www}, and stemmed.
 *
 * <p>Page mode: {@code file} is a file path; its content is read line by line (each
 * line terminated with {@code "\n"}), wrapped in a {@code PaginaURL}, then passed to
 * {@code addTitle} and {@code stemPage}.
 *
 * @param file     inline form HTML, a parser location, or a file path (see above)
 * @param isForm   selects form mode or page mode
 * @param stoplist stop list stored on the instance and forwarded to the helpers
 * @throws MalformedURLException if the placeholder URL cannot be created
 * @throws IOException           if reading the input fails
 * @throws SAXException          if the HTML parser fails
 */
public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException {
    this.stoplist = stoplist;
    this.elems = new HashMap<>();
    if (isForm) {
        DOMParser parser = new DOMParser();
        // Decide whether the string is the form markup itself or the name of a file.
        if (file.toLowerCase().contains("<form ")) {
            parser.parse(new InputSource(new BufferedReader(new StringReader(file))));
        } else {
            parser.parse(file);
        }
        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("form");
        StringBuffer source = new StringBuffer();
        // NOTE(review): list.item(0) is null when the document has no form element;
        // whether parse(...) tolerates that is not visible here -- confirm.
        parse(list.item(0), source, new StringBuffer(), "html", stoplist);
        String srcForm = source.toString().toLowerCase();
        PaginaURL formPage = new PaginaURL(new URL("http://www"), srcForm, stoplist);

        stemPage(formPage, true);
    } else {
        StringBuilder content = new StringBuilder();
        // try-with-resources: the reader is closed even when readLine() throws.
        try (BufferedReader input = new BufferedReader(new FileReader(new File(file)))) {
            for (String line = input.readLine(); line != null; line = input.readLine()) {
                content.append(line);
                content.append("\n");
            }
        }
        String src = content.toString();
        PaginaURL page = new PaginaURL(new URL("http://www"), src, stoplist);
        addTitle(page, stoplist);
        stemPage(page, false);
    }
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:38,代码来源:VSMVector.java

示例7: asDocument

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Transforms a string into a Document object. TODO This needs more optimizations. As it
 * seems the getDocument is called way too many times, causing a lot of parsing which is
 * slow and not necessary.
 *
 * <p>The parser is configured with element-name mode {@code "match"} and with SAX
 * namespace processing disabled.
 *
 * @param html
 *            the HTML string.
 * @return The DOM Document version of the HTML string.
 * @throws IOException
 *             if parsing fails for any reason; SAX and other exceptions are wrapped,
 *             with the original exception attached as the cause.
 */
public static Document asDocument(String html) throws IOException {
	DOMParser domParser = new DOMParser();
	try {
		domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
		domParser.setFeature("http://xml.org/sax/features/namespaces", false);
		domParser.parse(new InputSource(new StringReader(html)));
	} catch (SAXException e) {
		throw new IOException("Error while reading HTML: " + html, e);
	} catch (Exception unknown) {
		// Attach the cause instead of printing the stack trace and discarding it, so
		// callers and loggers can see what actually went wrong.
		throw new IOException("Error while reading HTML: " + html, unknown);
	}
	return domParser.getDocument();
}
 
开发者ID:aminmf,项目名称:crawljax,代码行数:28,代码来源:DomUtils.java

示例8: parse

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses the stream supplied by {@code getDocumentSource()} with NekoHTML.
 *
 * <p>Element-name mode is set to {@code "lower"}; when {@code charset} is non-null it
 * is installed as the parser's default encoding.
 *
 * @return the parsed DOM document
 * @throws SAXException if the parser configuration or the parse fails
 * @throws IOException  if reading the input stream fails
 */
@Override
public Document parse() throws SAXException, IOException
{
    final DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    if (charset != null)
    {
        htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", charset);
    }

    final org.xml.sax.InputSource input =
            new org.xml.sax.InputSource(getDocumentSource().getInputStream());
    htmlParser.parse(input);
    return htmlParser.getDocument();
}
 
开发者ID:chrimm,项目名称:cordovastudio,代码行数:11,代码来源:DefaultDOMSource.java

示例9: main

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Manual test driver: downloads one hard-coded article URL, parses it with NekoHTML,
 * runs {@link Readability} over the document and prints the article HTML, the article
 * text mapping and the article images to standard output.
 *
 * @param argv
 *            ignored
 * @throws Exception
 *             if the page cannot be fetched or parsed
 */
public static void main(String[] argv) throws Exception {
	// Alternative inputs kept for manual experiments:
	// URL input = new
	// URL("file:///home/dd/Programming/Readability4J/t.html");
	// URL input = new
	// URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
	final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");
	// URL input = new URL("http://euobserver.com/9/30465");
	// URL input = new URL("http://euobserver.com/?aid=23383");
	// URL input = new
	// URL("http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html");
	// URL input = new URL("file:///Users/jsh2/Desktop/test.html");
	// URL input = new
	// URL("http://mobile.engadget.com/2010/06/17/htc-aria-review/");
	// URL input = new URL("http://thedailywtf.com/Articles/Benched.aspx");
	// URL input = new
	// URL("http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html");
	// URL input = new
	// URL("http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx");
	// URL input = new
	// URL("http://www.bbc.co.uk/news/world-middle-east-11415719");
	// URL input = new URL("http://www.thebigproject.co.uk/news/");
	// URL input = new
	// URL("http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958");
	// URL input = new
	// URL("http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2");

	// URL input = new
	// URL("http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html");
	// URL input = new
	// URL("http://uk.mac.ign.com/articles/573/573319p1.html");
	final DOMParser parser = new DOMParser();
	// Close the network stream deterministically once parsing is done (or has failed),
	// instead of leaving that to the parser implementation.
	try (java.io.InputStream pageStream = input.openStream()) {
		parser.parse(new InputSource(pageStream));
	}

	final Readability r = new Readability(parser.getDocument(), true, true);

	// System.out.println(r.getArticleTitle());
	System.out.println(r.getArticleHTML());
	// System.out.println(r.getAllLinks());
	// System.out.println(r.getArticleText());

	System.out.println();
	System.out.println("***");
	System.out.println();

	for (final MappingNode s : r.getArticleTextMapping()) {
		System.out.println(s);
	}

	// PrintStream out = new PrintStream("news-sites");
	// for (Anchor anchor : r.getAllLinks()) {
	// out.println(anchor.getHref() + "\t" + anchor.getText());
	// }
	// out.close();

	System.out.println(r.getArticleImages());
	// System.out.println(r.getArticleSubheadings());
	// System.out.println(r.getArticleHTML());
	// System.out.println(r.getArticleHTML_DOM());

	// System.out.println(r.getArticleDateString());
	// System.out.println(r.getArticleDate());
}
 
开发者ID:openimaj,项目名称:openimaj,代码行数:68,代码来源:Readability.java

示例10: getReadability

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Convenience factory that parses an html string with NekoHTML and wraps the resulting
 * document in a {@link Readability} instance (constructed with {@code false} as its
 * second argument).
 *
 * @param html
 *            The html string
 * @param addTitle
 *            Should the title be added to the generated article?
 * @return new {@link Readability} instance.
 * @throws SAXException
 *             if the html cannot be parsed
 * @throws IOException
 *             if reading the html string fails
 */
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
	final InputSource htmlSource = new InputSource(new StringReader(html));

	final DOMParser htmlParser = new DOMParser();
	htmlParser.parse(htmlSource);

	final Document parsed = htmlParser.getDocument();
	return new Readability(parsed, false, addTitle);
}
 
开发者ID:openimaj,项目名称:openimaj,代码行数:19,代码来源:Readability.java

示例11: getDocumentNoBalance

import org.cyberneko.html.parsers.DOMParser; //导入方法依赖的package包/类
/**
 * Parses an HTML string into a DOM document with NekoHTML's tag balancing feature
 * switched off and the element-name mode set to {@code "match"}.
 *
 * @param html
 *            the HTML string.
 * @return a Document object made from the HTML string.
 * @throws SAXException
 *             if an exception occurs while parsing the HTML string.
 * @throws IOException
 *             if an IO failure occurs.
 */
public static Document getDocumentNoBalance(String html) throws SAXException, IOException {
	DOMParser unbalancedParser = new DOMParser();
	unbalancedParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
	unbalancedParser.setFeature("http://cyberneko.org/html/features/balance-tags", false);

	InputSource htmlSource = new InputSource(new StringReader(html));
	unbalancedParser.parse(htmlSource);

	return unbalancedParser.getDocument();
}
 
开发者ID:aminmf,项目名称:crawljax,代码行数:17,代码来源:DomUtils.java


注:本文中的org.cyberneko.html.parsers.DOMParser.getDocument方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。