当前位置: 首页>>代码示例>>Java>>正文


Java DOMParser类代码示例

本文整理汇总了Java中org.cyberneko.html.parsers.DOMParser的典型用法代码示例。如果您正苦于以下问题:Java DOMParser类的具体用法?Java DOMParser怎么用?Java DOMParser使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


DOMParser类属于org.cyberneko.html.parsers包,在下文中一共展示了DOMParser类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: readHtmlDocument

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Resolves a location string to a URL and parses the HTML found there into a DOM document.
 * Namespace processing is switched off on the parser.
 *
 * @param str location string handed to {@code FlexibleLocation.resolveLocation}
 * @return the parsed document, or {@code null} when the location cannot be resolved or
 *         any exception occurs while resolving or parsing (both cases are logged)
 */
public static Document readHtmlDocument(String str) {
    try {
        URL location = FlexibleLocation.resolveLocation(str);
        if (location == null) {
            Debug.logError("Unable to locate HTML document " + str, module);
            return null;
        }
        DOMParser htmlParser = new DOMParser();
        htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
        htmlParser.parse(location.toExternalForm());
        return htmlParser.getDocument();
    } catch (Exception e) {
        // Failures are reported through the log only; the caller just sees null.
        Debug.logError(e, "Error while reading HTML document " + str, module);
        return null;
    }
}
 
开发者ID:ilscipio,项目名称:scipio-erp,代码行数:18,代码来源:UelFunctions.java

示例2: testPost

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * With redirects disabled, the login handler must write an HTML page that contains exactly
 * one form named "loginform" whose action is the IdP's WS-Federation single sign-on
 * location, and it must never send a redirect.
 */
@Test
public void testPost() throws Exception {
	cfg.setProperty(WSFedConstants.PROP_USE_REDIRECT, false);

	// Capture everything the handler writes to the response.
	StringWriter responseBody = new StringWriter();
	PrintWriter responseWriter = new PrintWriter(responseBody);
	when(res.getWriter()).thenReturn(responseWriter);

	new LoginHandler().handleGet(rc);

	WebWindow win = mock(WebWindow.class);
	when(win.getScriptObject()).thenThrow(new RuntimeException("test"));
	when(win.getWebClient()).thenReturn(new WebClient(BrowserVersion.FIREFOX_2));

	// Parse the captured response back into a DOM tree.
	byte[] htmlBytes = responseBody.toString().getBytes();
	DOMParser htmlParser = new DOMParser();
	htmlParser.parse(new InputSource(new ByteArrayInputStream(htmlBytes)));
	HTMLElement root = (HTMLElement) htmlParser.getDocument().getDocumentElement();

	NodeList formNodes = root.getElementsByTagName("form");
	assertEquals(1, formNodes.getLength());

	Element loginForm = (Element) formNodes.item(0);
	assertEquals("loginform", loginForm.getAttribute("name"));

	String expectedAction = rc.getIdpMetadata().getFirstMetadata().getSingleSignonServiceLocation(WSFedConstants.WSFED_PROTOCOL);
	assertEquals(expectedAction, loginForm.getAttribute("action"));

	verify(res, never()).sendRedirect(anyString());
}
 
开发者ID:amagdenko,项目名称:oiosaml.java,代码行数:27,代码来源:LoginHandlerTest.java

示例3: main

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Manual demo: parses a sample HTML file, wraps the DOM in a {@code TreeNode}, assigns
 * post-order indices starting at 1, and prints the tree, its post-order traversal and its
 * key roots to standard output.
 *
 * @param args unused
 * @throws Exception if the sample file cannot be read or parsed
 */
public static void main(String[] args) throws Exception {
    String pathname = "src/test/resources/html/simple/3.html";

    DOMParser parser = new DOMParser();
    // Close the reader even when parsing fails; it was previously never closed.
    try (FileReader reader = new FileReader(pathname)) {
        parser.parse(new InputSource(reader));
    }
    Document document = parser.getDocument();

    TreeNode node = new TreeNode(document, null);
    node.postOrderIndex(new AtomicInteger(1));
    node.prettyPrint();

    List<TreeNode> nodes = node.postOrderTraverse();
    System.out.println(nodes);

    System.out.println(node.getKeyRoots());
}
 
开发者ID:thammegowda,项目名称:autoextractor,代码行数:17,代码来源:TreeNode.java

示例4: getSource

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Wraps an input stream in a {@link Source}.
 * <p>
 * When the {@code html} flag is set, the stream is parsed with NekoHTML (namespaces off,
 * CDATA sections scanned, noscript content not parsed, default encoding taken from
 * {@code sourcePath}) and returned as a {@link DOMSource}. Element names are kept as
 * written when {@code rawTagNames} is set and lower-cased otherwise.
 * <p>
 * Otherwise a {@link StreamSource} over the stream is returned.
 *
 * @param ctx  processing context (unused)
 * @param file stream to read
 * @return a DOM source for HTML input, a stream source otherwise
 * @throws SAXException                 if parsing or reader configuration fails
 * @throws IOException                  if the stream cannot be read
 * @throws ParserConfigurationException if the SAX parser cannot be created
 */
@TimeThis(task="read-file", category=TimerCategory.LOAD_RESOURCE)
protected Source getSource(@SuppressWarnings("unused") ProcessingContext<Corpus> ctx, InputStream file) throws SAXException, IOException, ParserConfigurationException {
	if (html) {
		DOMParser htmlParser = new DOMParser();
		htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);
		htmlParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
		htmlParser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
		htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", sourcePath.getCharset());
		String elemNamePolicy = rawTagNames ? "match" : "lower";
		htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", elemNamePolicy);
		htmlParser.parse(new InputSource(file));
		return new DOMSource(htmlParser.getDocument());
	}

	final String loadExternalDtd = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
	SAXParserFactory factory = SAXParserFactory.newInstance();
	factory.setFeature(loadExternalDtd, false);
	org.xml.sax.XMLReader reader = factory.newSAXParser().getXMLReader();
	reader.setFeature(loadExternalDtd, false);
	// Every external entity resolves to an empty stream.
	EntityResolver emptyResolver = new EntityResolver() {
		@Override
		public InputSource resolveEntity(String pid, String sid) throws SAXException {
			return new InputSource(new ByteArrayInputStream(new byte[] {}));
		}
	};
	reader.setEntityResolver(emptyResolver);
	// NOTE(review): this SAXSource is built and immediately discarded, so the reader
	// configured above never reaches the caller. Confirm whether returning it was intended.
	new SAXSource(reader, new InputSource(file));
	return new StreamSource(file);
}
 
开发者ID:Bibliome,项目名称:alvisnlp,代码行数:32,代码来源:XMLReader.java

示例5: getDoc

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses HTML from a byte stream into a DOM document, decoding it with the given encoding.
 * The encoding is also set as the parser's default encoding, and namespace processing is
 * switched off.
 *
 * @param in       stream containing the HTML
 * @param encoding charset name used to decode {@code in}
 * @return the parsed document
 * @throws Exception if the encoding is unsupported or reading/parsing fails
 */
public static Document getDoc(InputStream in, String encoding)
		throws Exception {
	DOMParser htmlParser = new DOMParser();
	htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", encoding);
	htmlParser.setFeature("http://xml.org/sax/features/namespaces", false);

	InputStreamReader decoded = new InputStreamReader(in, encoding);
	InputSource source = new InputSource(new BufferedReader(decoded));
	htmlParser.parse(source);

	return htmlParser.getDocument();
}
 
开发者ID:bdceo,项目名称:bd-codes,代码行数:14,代码来源:XpathTest.java

示例6: computeDistance

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Computes the edit distance between two html files.
 * <p>
 * Both files are parsed with the same parser instance, which is reset between the two
 * parses. Each file reader is closed as soon as its document has been built.
 *
 * @param file1 first html file
 * @param file2 second html file
 * @return edit distance measure
 * @throws IOException when a file cannot be opened or read
 * @throws SAXException when parser fails
 */
public static double computeDistance(File file1, File file2)
        throws IOException, SAXException {
    DOMParser domParser = new DOMParser();

    Document doc1;
    // try-with-resources: the readers were previously never closed.
    try (FileReader reader = new FileReader(file1)) {
        domParser.parse(new InputSource(reader));
        doc1 = domParser.getDocument();
    }

    domParser.reset();

    Document doc2;
    try (FileReader reader = new FileReader(file2)) {
        domParser.parse(new InputSource(reader));
        doc2 = domParser.getDocument();
    }

    ZSTEDComputer computer = new ZSTEDComputer();
    return computer.computeDistance(new TreeNode(doc1, null), new TreeNode(doc2, null));
}
 
开发者ID:thammegowda,项目名称:autoextractor,代码行数:21,代码来源:ZSTEDComputer.java

示例7: VSMVector

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Builds a vector either from an HTML form or from a page stored in a file.
 * <p>
 * In form mode, {@code file} is treated as inline markup when it contains
 * {@code "<form "} (case-insensitive) and is otherwise passed to the parser as a system
 * identifier. The first {@code form} element is flattened to lower-cased text and stemmed.
 * In page mode, {@code file} is a path whose whole content is read, titled and stemmed.
 *
 * @param file     form markup, or the name/URI of the document to load
 * @param isForm   {@code true} to process a form, {@code false} to process a page file
 * @param stoplist stop list used when tokenising the content
 * @throws MalformedURLException if the placeholder URL cannot be built
 * @throws IOException           if the input cannot be read
 * @throws SAXException          if the HTML parser fails
 */
public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException {
    this.stoplist = stoplist;
    this.elems = new HashMap<>();
    if (isForm) {
        DOMParser parser = new DOMParser();
        // Decide whether the string is the form's content or the name of a file.
        if (file.toLowerCase().contains("<form ")) {
            parser.parse(new InputSource(new BufferedReader(new StringReader(file))));
        } else {
            parser.parse(file);
        }
        Document doc = parser.getDocument();
        NodeList list = doc.getElementsByTagName("form");
        StringBuffer source = new StringBuffer();
        // NOTE(review): list.item(0) is null when the document has no form element;
        // whether parse(...) tolerates a null node is not visible from here.
        parse(list.item(0), source, new StringBuffer(), "html", stoplist);
        String srcForm = source.toString().toLowerCase();
        PaginaURL formPage = new PaginaURL(new URL("http://www"), srcForm, stoplist);

        stemPage(formPage, true);
    } else {
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader even if readLine() throws;
        // it was previously closed only on the success path.
        try (BufferedReader input = new BufferedReader(new FileReader(new File(file)))) {
            for (String line = input.readLine(); line != null; line = input.readLine()) {
                content.append(line);
                content.append("\n");
            }
        }
        String src = content.toString();
        PaginaURL page = new PaginaURL(new URL("http://www"), src, stoplist);
        addTitle(page, stoplist);
        stemPage(page, false);
    }
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:38,代码来源:VSMVector.java

示例8: asDocument

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Transforms a string into a Document object. Element names keep the case they have in
 * the input and namespace processing is switched off.
 * <p>
 * TODO This needs more optimizations. As it seems the getDocument is called way too many
 * times causing a lot of parsing which is slow and not necessary.
 *
 * @param html
 *            the HTML string.
 * @return The DOM Document version of the HTML string.
 * @throws IOException
 *             if configuring the parser or parsing the HTML string fails for any reason;
 *             SAX and runtime failures are wrapped, with the original exception attached
 *             as the cause.
 */
public static Document asDocument(String html) throws IOException {
	DOMParser domParser = new DOMParser();
	try {
		domParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
		domParser.setFeature("http://xml.org/sax/features/namespaces", false);
		domParser.parse(new InputSource(new StringReader(html)));
	} catch (SAXException e) {
		throw new IOException("Error while reading HTML: " + html, e);
	} catch (Exception unknown) {
		// Keep the cause attached instead of printing it to stderr and dropping it,
		// so callers can log or inspect the original failure.
		throw new IOException("Error while reading HTML: " + html, unknown);
	}
	return domParser.getDocument();
}
 
开发者ID:aminmf,项目名称:crawljax,代码行数:28,代码来源:DomUtils.java

示例9: parse

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses the document source's input stream with NekoHTML and returns the resulting DOM.
 * Element names are lower-cased; when a charset is configured it is used as the parser's
 * default encoding.
 *
 * @return the parsed document
 * @throws SAXException if the parser cannot be configured or parsing fails
 * @throws IOException  if the input stream cannot be obtained or read
 */
@Override
public Document parse() throws SAXException, IOException
{
    DOMParser htmlParser = new DOMParser(new HTMLConfiguration());
    htmlParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    if (charset != null)
    {
        htmlParser.setProperty("http://cyberneko.org/html/properties/default-encoding", charset);
    }

    org.xml.sax.InputSource input = new org.xml.sax.InputSource(getDocumentSource().getInputStream());
    htmlParser.parse(input);
    return htmlParser.getDocument();
}
 
开发者ID:chrimm,项目名称:cordovastudio,代码行数:11,代码来源:DefaultDOMSource.java

示例10: computeDistances

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Computes the edit distance between files in a directory and prints an index of the
 * files, the distance matrix and the similarity matrix to standard output.
 * <p>
 * Only regular files directly inside the directory are parsed; other entries are skipped.
 *
 * @param inputDir directory of html pages
 * @throws IOException if the directory cannot be listed or a file cannot be read
 * @throws SAXException if a file cannot be parsed
 * @throws RuntimeException if fewer than two regular files are found
 */
private static void computeDistances(File inputDir) throws IOException, SAXException {

    File[] files = inputDir.listFiles();
    if (files == null) {
        // listFiles() returns null, not an empty array, when the path is not a directory
        // or an I/O error occurs; fail with a clear message instead of an NPE below.
        throw new IOException("Unable to list files in directory: " + inputDir);
    }

    List<TreeNode> docs = new ArrayList<>();
    List<String> htmlPaths = new ArrayList<>();
    DOMParser parser = new DOMParser();
    for (File file : files) {
        if (!file.isFile()) {
            //skip
            continue;
        }
        try (FileReader reader = new FileReader(file)) {
            parser.parse(new InputSource(reader));
            htmlPaths.add(file.getAbsolutePath());
            docs.add(new TreeNode(parser.getDocument(), null));
            // The same parser instance is reused for the next file.
            parser.reset();
        }
    }
    int n = docs.size();
    if (n < 2) {
        throw new RuntimeException("At least 2 html/xml files should be present in the input directory");
    }

    ZSTEDComputer edComputer = new ZSTEDComputer();
    StructureSimComputer simComputer = new StructureSimComputer(edComputer);
    double[][] distMatrix = edComputer.computeDistanceMatrix(docs);
    int[] treeSizes = new int[n];
    for (int i = 0; i < n; i++) {
        treeSizes[i] = docs.get(i).getSize();
    }
    double[][] simMatrix = simComputer.compute(treeSizes, distMatrix);

    System.out.println("#Index\tFile Path");
    for (int i = 0; i < htmlPaths.size(); i++) {
        System.out.println(i + "\t" + htmlPaths.get(i));
    }
    System.out.println("\n#Distance Matrix");
    MatrixUtils.printMatrix(distMatrix);

    System.out.println("\n#Similarity Matrix");
    MatrixUtils.printMatrix(simMatrix);
}
 
开发者ID:thammegowda,项目名称:autoextractor,代码行数:49,代码来源:ZSTEDComputer.java

示例11: main

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Manual test: downloads a page, parses it, runs {@link Readability} on the resulting
 * document and prints the article HTML, the article text mapping and the article images
 * to standard output.
 *
 * @param argv
 *            unused
 * @throws Exception
 *             if the page cannot be downloaded or parsed
 */
public static void main(String[] argv) throws Exception {
	// Alternative inputs that were used while testing:
	// file:///home/dd/Programming/Readability4J/t.html
	// http://news.bbc.co.uk/1/hi/politics/10362367.stm
	// http://euobserver.com/9/30465
	// http://euobserver.com/?aid=23383
	// http://abandoninplace.squarespace.com/blog/2010/6/8/wwdc-monday.html
	// file:///Users/jsh2/Desktop/test.html
	// http://mobile.engadget.com/2010/06/17/htc-aria-review/
	// http://thedailywtf.com/Articles/Benched.aspx
	// http://www.dailymail.co.uk/news/article-1287625/Woman-sparked-150-000-manhunt-slashing-face-crying-rape-faces-jail.html
	// http://mrpaparazzi.com/post/11619/Lindsay-Lohan-Tests-Negative-For-Alcohol-Goes-Clubbing-To-Celebrate.aspx
	// http://www.bbc.co.uk/news/world-middle-east-11415719
	// http://www.thebigproject.co.uk/news/
	// http://blogs.euobserver.com/popescu/2009/12/15/on-euro-optimism-pessimism-and-failures/#more-958
	// http://www.cnn.com/2010/WORLD/meast/09/27/west.bank.settlement.construction/index.html?hpt=T2
	// http://www.huffingtonpost.com/steven-cohen/its-time-to-enact-congest_b_740315.html
	// http://uk.mac.ign.com/articles/573/573319p1.html
	final URL input = new URL("http://blog.confluent.io/2015/01/29/making-sense-of-stream-processing/");

	final DOMParser parser = new DOMParser();
	// Close the network stream once parsing is done (or has failed); it was previously
	// left open. The type is fully qualified so no extra import is required.
	try (java.io.InputStream stream = input.openStream()) {
		parser.parse(new InputSource(stream));
	}

	final Readability r = new Readability(parser.getDocument(), true, true);

	// Other accessors that were printed while debugging: getArticleTitle(), getAllLinks(),
	// getArticleText(), getArticleSubheadings(), getArticleHTML_DOM(),
	// getArticleDateString(), getArticleDate().
	System.out.println(r.getArticleHTML());

	System.out.println();
	System.out.println("***");
	System.out.println();

	for (final MappingNode s : r.getArticleTextMapping()) {
		System.out.println(s);
	}

	System.out.println(r.getArticleImages());
}
 
开发者ID:openimaj,项目名称:openimaj,代码行数:68,代码来源:Readability.java

示例12: evaluate

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Evaluates the context and location phrases, resolves the location against the context
 * URL, parses the HTML found there and returns the root element of the converted document.
 *
 * @param req the current task request
 * @param res the current task response
 * @return the root element of the document read from the resolved URL
 * @throws RuntimeException wrapping any {@code Throwable} raised while building the URL,
 *         parsing or converting; the message lists the context and location used
 */
public Object evaluate(TaskRequest req, TaskResponse res) {
	final String ctx_str = (String) context.evaluate(req, res);
	final String loc_str = (String) location.evaluate(req, res);

	try {
		// Resolve the (possibly relative) location against the context URL.
		URL resolved = new URL(new URL(ctx_str), loc_str);

		DOMParser htmlParser = new DOMParser();
		htmlParser.parse(resolved.toString());

		Document converted = new DOMReader().read(htmlParser.getDocument());
		return converted.getRootElement();
	} catch (Throwable t) {
		throw new RuntimeException(
				"Unable to read the specified document:"
						+ "\n\tCONTEXT=" + ctx_str
						+ "\n\tLOCATION=" + loc_str,
				t);
	}
}
 
开发者ID:drewwills,项目名称:cernunnos,代码行数:30,代码来源:NekoHtmlPhrase.java

示例13: getReadability

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Builds a {@link Readability} instance from an html string. The string is parsed into a
 * DOM document, which is passed to the constructor together with {@code false} as the
 * second argument and {@code addTitle} as the third.
 *
 * @param html
 *            The html string
 * @param addTitle
 *            Should the title be added to the generated article?
 * @return new {@link Readability} instance.
 * @throws SAXException
 *             if the html cannot be parsed
 * @throws IOException
 *             if reading the html string fails
 */
public static Readability getReadability(String html, boolean addTitle) throws SAXException, IOException {
	final InputSource source = new InputSource(new StringReader(html));

	final DOMParser htmlParser = new DOMParser();
	htmlParser.parse(source);

	return new Readability(htmlParser.getDocument(), false, addTitle);
}
 
开发者ID:openimaj,项目名称:openimaj,代码行数:19,代码来源:Readability.java

示例14: LSMEnglishHymnalHTMLParser

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Creates the parser together with an empty hymn list (initial capacity 2000) and the
 * hymnal that will receive the parsed hymns. The hymnal is ordered by an
 * {@code AlphanumComparator}, titled "Hymns", and its id is {@code PUB_PREFIX} followed
 * by "English".
 *
 * @throws Exception declared by the constructor; nothing in this body visibly throws a
 *         checked exception
 */
public LSMEnglishHymnalHTMLParser() throws Exception {
    // The parser keeps its default encoding; no default-encoding property is set.
    this.parser = new DOMParser();

    this.hymns = new ArrayList<Hymn>(2000);

    this.hymnal = new Hymnal(new AlphanumComparator());
    this.hymnal.id = PUB_PREFIX + "English";
    this.hymnal.title = "Hymns";
}
 
开发者ID:linfrank,项目名称:hymnal-tool,代码行数:13,代码来源:LSMEnglishHymnalHTMLParser.java

示例15: getDocumentNoBalance

import org.cyberneko.html.parsers.DOMParser; //导入依赖的package包/类
/**
 * Parses an HTML string into a Document with NekoHTML's tag balancing switched off.
 * Element names keep the case they have in the input.
 *
 * @param html
 *            the HTML string.
 * @return a Document object made from the HTML string.
 * @throws SAXException
 *             if an exception occurs while parsing the HTML string.
 * @throws IOException
 *             if an IO failure occurs.
 */
public static Document getDocumentNoBalance(String html) throws SAXException, IOException {
	DOMParser unbalancedParser = new DOMParser();
	unbalancedParser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
	unbalancedParser.setFeature("http://cyberneko.org/html/features/balance-tags", false);

	InputSource source = new InputSource(new StringReader(html));
	unbalancedParser.parse(source);

	return unbalancedParser.getDocument();
}
 
开发者ID:aminmf,项目名称:crawljax,代码行数:17,代码来源:DomUtils.java


注:本文中的org.cyberneko.html.parsers.DOMParser类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。