当前位置: 首页>>代码示例>>Java>>正文


Java OutputDocument.toString方法代码示例

本文整理汇总了Java中net.htmlparser.jericho.OutputDocument.toString方法的典型用法代码示例。如果您正苦于以下问题:Java OutputDocument.toString方法的具体用法?Java OutputDocument.toString怎么用?Java OutputDocument.toString使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在net.htmlparser.jericho.OutputDocument的用法示例。


在下文中一共展示了OutputDocument.toString方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: processPage

import net.htmlparser.jericho.OutputDocument; //导入方法依赖的package包/类
/**
 * Loads an HTML page, lets {@code modifyDocument} transform it, and returns
 * the resulting markup.
 *
 * <p>The input stream is always closed quietly, and the elapsed time is
 * logged at debug level whether or not processing succeeded.
 *
 * @param baseDir  origin whose reader is used to open {@code pagePath}
 * @param pagePath path of the page to process
 * @return the transformed document rendered as a string
 * @throws IOException propagated from opening or parsing the page
 */
protected String processPage( PathOrigin baseDir, String pagePath ) throws IOException {
    final long startedAt = System.currentTimeMillis();
    InputStream pageStream = null;
    try {
      pageStream = baseDir.getReader( getRepo() ).getFileInputStream( pagePath );
      final Source parsedPage = new Source( pageStream );
      final OutputDocument result = new OutputDocument( parsedPage );
      // Apply the page transformations onto the output document.
      modifyDocument( parsedPage, baseDir, result );
      return result.toString();
    } finally {
      IOUtils.closeQuietly( pageStream );
      if ( log.isDebugEnabled() ) {
        final long elapsed = System.currentTimeMillis() - startedAt;
        log.debug( String.format( "processPage for %s took %dms", pagePath, elapsed ) );
      }
    }
  }
 
开发者ID:webdetails,项目名称:cte,代码行数:20,代码来源:ProcessedHtmlPage.java

示例2: strip

import net.htmlparser.jericho.OutputDocument; //导入方法依赖的package包/类
/**
	 * Removes unwanted tags from an HTML string.
	 *
	 * <p>Every tag is passed to {@code processTag}; tags it accepts are
	 * flagged with {@code VALID_MARKER}, all others are removed from the
	 * output. Text between tags is left untouched in this variant (the
	 * text re-encoding step is disabled).
	 *
	 * @param html the markup to clean; may be {@code null}
	 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
	 */
	public String strip(String html) {
		if (html == null) {
			return "";
		}

		Source parsed = new Source(html);
		parsed.fullSequentialParse();
		OutputDocument cleaned = new OutputDocument(parsed);

		for (Tag current : parsed.getAllTags()) {
			boolean accepted = processTag(current, cleaned);
			if (accepted) {
				current.setUserData(VALID_MARKER);
				continue;
			}
			// Rejected tag: drop it from the output.
			cleaned.remove(current);
		}
		return cleaned.toString();
	}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:28,代码来源:HtmlStripperDiscussion.java

示例3: strip

import net.htmlparser.jericho.OutputDocument; //导入方法依赖的package包/类
/**
 * Removes unwanted tags from an HTML string and re-encodes the text
 * found between tags.
 *
 * <p>Every tag is passed to {@code processTag}; tags it accepts are flagged
 * with {@code VALID_MARKER}, all others are removed from the output. After
 * each tag, the text segment lying between the end of the previous tag and
 * the start of the current one is handed to {@code reencodeTextSegment};
 * the trailing text after the last tag is handled the same way.
 *
 * @param html the markup to clean; may be {@code null}
 * @return the cleaned markup, or an empty string when {@code html} is {@code null}
 */
public String strip(String html) {
	if (html == null) {
		return "";
	}

	Source parsed = new Source(html);
	parsed.fullSequentialParse();
	OutputDocument cleaned = new OutputDocument(parsed);

	// Start of the text segment that has not been re-encoded yet.
	int textStart = 0;
	for (Tag current : parsed.getAllTags()) {
		if (!processTag(current, cleaned)) {
			cleaned.remove(current);
		} else {
			current.setUserData(VALID_MARKER);
		}
		reencodeTextSegment(parsed, cleaned, textStart, current.getBegin());
		textStart = current.getEnd();
	}
	// Text following the final tag.
	reencodeTextSegment(parsed, cleaned, textStart, parsed.getEnd());
	return cleaned.toString();
}
 
开发者ID:camaradosdeputadosoficial,项目名称:edemocracia,代码行数:27,代码来源:HtmlStripper.java

示例4: sanitise

import net.htmlparser.jericho.OutputDocument; //导入方法依赖的package包/类
/**
 * Sanitises a pseudo-HTML string by removing tags that {@code processTag}
 * rejects.
 *
 * <p>Accepted tags are flagged with {@code VALID_MARKER}. Rejected tags are
 * left in place when {@code stripInvalidElements} is {@code false};
 * otherwise they are removed. A rejected tag named {@code style} is removed
 * together with the text that follows it, up to the next tag (or to the end
 * of the source when there is no next tag).
 *
 * @param pseudoHTML           the markup to sanitise
 * @param formatWhiteSpace     currently unused: the text re-encoding step
 *                             that consumed it is disabled
 * @param stripInvalidElements whether rejected tags are removed from the output
 * @return the sanitised markup
 */
private static String sanitise(String pseudoHTML, boolean formatWhiteSpace, boolean stripInvalidElements) {
	Source source=new Source(pseudoHTML);
	source.fullSequentialParse();
	OutputDocument outputDocument=new OutputDocument(source);
	for (Tag tag : source.getAllTags()) {
		if (processTag(tag,outputDocument)) {
			tag.setUserData(VALID_MARKER);
			continue;
		}
		// Rejected tag: keep it untouched unless stripping was requested.
		if (!stripInvalidElements) continue;
		if (tag.getName().equalsIgnoreCase("style")) {
			// Remove the style tag along with the text up to the next tag.
			Tag nextTag=tag.getNextTag();
			// NOTE(review): if remove(begin,end) treats end as exclusive (the usual
			// Jericho position convention - confirm), the "-1" leaves the last
			// character before the next tag in the output.
			int endPos=(nextTag!=null) ? nextTag.getBegin()-1 : source.getEnd();
			outputDocument.remove(tag.getBegin(),endPos);
		} else {
			outputDocument.remove(tag);
		}
	}
	return outputDocument.toString();
}
 
开发者ID:trackplus,项目名称:Genji,代码行数:32,代码来源:HTMLSanitiser.java

示例5: removeNotAllowedTags

import net.htmlparser.jericho.OutputDocument; //导入方法依赖的package包/类
/**
    * Cleans an HTML fragment: rewrites element attributes, resolves relative
    * {@code href} values against the document URI, removes the tags listed in
    * {@code NOT_ALLOWED_HTML_TAGS}, and strips newline and tab characters
    * from the result.
    *
    * <p>For disallowed {@code script}, {@code style} and {@code form}
    * elements the content is removed as well; for other disallowed elements
    * only the start and end tags are removed.
    *
    * @param htmlFragment the markup to clean
    * @param docUri       URI of the document, used to resolve relative links
    * @return the cleaned markup without newline or tab characters
    */
   private String removeNotAllowedTags(String htmlFragment, URI docUri) {
       Source source = new Source(htmlFragment);
       OutputDocument outputDocument = new OutputDocument(source);
       List<Element> elements = source.getAllElements();

       for (Element element : elements) {
           String name = element.getName();

           Attributes attrs = element.getAttributes();
           // Guard against elements that expose no attribute list; passing null
           // on to replace() would fail. (Presumably the case for non-normal
           // start tags such as comments - confirm against the Jericho docs.)
           if (attrs != null) {
               Map<String, String> attrsUpdate = outputDocument.replace(attrs, true);
               // NOTE(review): contains("a") matches every tag name containing the
               // letter 'a' (e.g. "span", "table"), not only anchors - confirm
               // whether equals("a") was intended. Behaviour kept as-is.
               if (!name.contains("a")) {
                   attrsUpdate.clear();
               } else {
                   String link = attrsUpdate.get("href");
                   if (link != null && !link.contains("http")) {
                       try {
                           // Turn the relative link into one resolved against the document.
                           URI result = docUri.resolve(new URI(link));
                           attrsUpdate.put("href", result.toString());
                       } catch (URISyntaxException e) {
                           // The link cannot be parsed: drop the whole element.
                           outputDocument.remove(element);
                       }
                   }
               }
           }

           if (NOT_ALLOWED_HTML_TAGS.contains(name)) {
               // Compare by value; reference comparison (==) on strings only works
               // when both sides happen to be the same instance.
               if ("script".equals(name) || "style".equals(name) || "form".equals(name)) {
                   outputDocument.remove(element.getContent());
               }
               outputDocument.remove(element.getStartTag());

               if (!element.getStartTag().isSyntacticalEmptyElementTag()) {
                   // Elements such as <img> or <br> have no end tag even though
                   // their start tag is not written as <.../>.
                   Segment endTag = element.getEndTag();
                   if (endTag != null) {
                       outputDocument.remove(endTag);
                   }
               }
           }
       }

       // Literal replacement is sufficient here; no regular expression needed.
       String out = outputDocument.toString();
       out = out.replace("\n", "");
       out = out.replace("\t", "");

       return out;
   }
 
开发者ID:BartoszJarocki,项目名称:android-boilerpipe,代码行数:58,代码来源:HtmlArticleExtractor.java


注:本文中的net.htmlparser.jericho.OutputDocument.toString方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。