当前位置: 首页>>代码示例>>Java>>正文


Java DocumentPreprocessor.iterator方法代码示例

本文整理汇总了Java中edu.stanford.nlp.process.DocumentPreprocessor.iterator方法的典型用法代码示例。如果您正苦于以下问题:Java DocumentPreprocessor.iterator方法的具体用法?Java DocumentPreprocessor.iterator怎么用?Java DocumentPreprocessor.iterator使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在edu.stanford.nlp.process.DocumentPreprocessor的用法示例。


在下文中一共展示了DocumentPreprocessor.iterator方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: splitSentencesINDocument

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Splits a document into sentences and returns each sentence as a single
 * string whose tokens are separated by one space.
 *
 * <p>The text is run through a {@code DocumentPreprocessor} built with its
 * default settings; every token is rendered with its {@code toString()}.
 *
 * @param sDoc the raw document text
 * @return one trimmed, space-joined string per detected sentence, in
 *         document order
 */
public List<String> splitSentencesINDocument(String sDoc)
{
    List<String> sentenceList = new ArrayList<String>();
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sDoc));

    for (List<HasWord> sentence : dp)
    {
        StringBuilder sentenceSb = new StringBuilder();
        for (HasWord token : sentence)
        {
            // Separate from the previous token. The check must be "> 0":
            // with "> 1" a one-character first token (e.g. "I") was glued
            // to the token that followed it.
            if (sentenceSb.length() > 0)
            {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
 
开发者ID:serendio-labs-stage,项目名称:diskoveror-ta,代码行数:24,代码来源:StanfordNLP.java

示例2: parse_text

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Sentence-splits and tokenizes raw text, parses every sentence, and returns
 * the rendered parse trees together with their scores.
 *
 * <p>{@code treePrinter}, {@code tlp} and {@code parser} are declared outside
 * this method (presumably fields of the enclosing class); {@code treePrinter}
 * is reassigned on every call from {@code outputFormat}.
 *
 * @param text         untokenized input text
 * @param outputFormat output options handed to {@code ParserUtil.setOptions}
 * @return one {@code ParseTree} per detected sentence, in document order
 * @throws TApplicationException with type {@code INTERNAL_ERROR} when anything
 *         fails; the original exception is attached as the cause
 */
public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException
{
    List<ParseTree> results = new ArrayList<ParseTree>();

    try
    {
        treePrinter = ParserUtil.setOptions(outputFormat, tlp);

        // Assume no tokenization was done; use Stanford's default tokenizer.
        DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : preprocess)
        {
            Tree parseTree = parser.apply(sentence);
            results.add(new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score()));
        }
    }
    catch (Exception e)
    {
        // Same type and message as before, but keep the original exception as
        // the cause so the stack trace of the real failure is not lost.
        TApplicationException wrapped =
                new TApplicationException(TApplicationException.INTERNAL_ERROR, e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }

    return results;
}
 
开发者ID:dmnapolitano,项目名称:stanford-thrift,代码行数:26,代码来源:StanfordParserThrift.java

示例3: tokenizeText

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Splits arbitrary text into sentences and tokens.
 *
 * @param arbitraryText raw, untokenized text
 * @return one list per detected sentence, each holding the {@code word()}
 *         value of every token in that sentence, in order
 */
public List<List<String>> tokenizeText(String arbitraryText)
{
    List<List<String>> result = new ArrayList<List<String>>();
    DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(arbitraryText));

    for (List<HasWord> sentence : splitter)
    {
        List<String> words = new ArrayList<String>();
        for (HasWord token : sentence)
        {
            words.add(token.word());
        }
        result.add(words);
    }

    return result;
}
 
开发者ID:dmnapolitano,项目名称:stanford-thrift,代码行数:20,代码来源:StanfordTokenizerThrift.java

示例4: getSentencesFromText

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Tokenizes and sentence-splits a string with a PTB tokenizer.
 *
 * @param str        the text to split
 * @param invertible when true, the factory comes from
 *                   {@code PTBTokenizer.factory(true, true)} and
 *                   {@code "invertible=true"} is prepended to the options
 * @param options    tokenizer options; may be null or empty. When
 *                   {@code invertible} is false they are passed to the
 *                   factory unchanged (including null).
 * @return the detected sentences, in document order
 */
public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
  DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(str));

  final TokenizerFactory factory;
  final String tokenizerOptions;
  if (!invertible) {
    factory = PTBTokenizer.factory();
    tokenizerOptions = options;
  } else {
    factory = PTBTokenizer.factory(true, true);
    boolean hasCallerOptions = options != null && options.length() > 0;
    tokenizerOptions = hasCallerOptions ? "invertible=true, " + options : "invertible=true";
  }
  factory.setOptions(tokenizerOptions);
  dp.setTokenizerFactory(factory);

  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  for (List<HasWord> sentence : dp) {
    sentences.add(sentence);
  }
  return sentences;
}
 
开发者ID:nchambers,项目名称:probschemas,代码行数:29,代码来源:Ling.java

示例5: tag_text

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Sentence-splits and tokenizes raw text with a default
 * {@code DocumentPreprocessor}, then tags each sentence via
 * {@code tagSingleSentence}.
 *
 * @param untokenizedText raw text on which no tokenization has been done
 * @return the result of {@code tagSingleSentence} for every detected
 *         sentence, in document order
 */
public List<List<TaggedToken>> tag_text(String untokenizedText)
{
    List<List<TaggedToken>> tagged = new ArrayList<List<TaggedToken>>();
    DocumentPreprocessor splitter = new DocumentPreprocessor(new StringReader(untokenizedText));

    for (List<HasWord> sentence : splitter)
    {
        tagged.add(tagSingleSentence(sentence));
    }

    return tagged;
}
 
开发者ID:dmnapolitano,项目名称:stanford-thrift,代码行数:15,代码来源:StanfordTaggerThrift.java

示例6: getSentences1_old

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Cleans a tweet-like text, splits it on URLs, and returns the sentences found
 * in the first ten resulting parts with normalized spacing and casing.
 *
 * <p>Steps: HTML-escape the text, strip a trailing truncated link, repeatedly
 * strip the patterns in {@code toMatch}, strip a trailing dash or
 * {@code &hellip;}, unescape, split on {@code Extractor.urlRegExp}, run each
 * part through {@code extractor.cleanText}, and sentence-split it with a PTB
 * word tokenizer.
 *
 * @param text     the raw text
 * @param entities tokens that must keep their all-upper-case spelling
 * @return the rebuilt sentences, each starting with an upper-case character
 */
public static List<String> getSentences1_old(String text, Set<String> entities){
    text=text.trim();
    text=StringEscapeUtils.escapeHtml(text);
    text=text.replaceAll("http:.*&hellip;\\z","");
    // NOTE(review): these literals look damaged by an e-mail obfuscation filter
    // (the loop comment below talks about cutting "RT @XXX"). They are kept
    // verbatim here; restore them from the upstream project before relying on them.
    String[] toMatch={"\\ART\\[email protected]\\S+", "\\AMT\\[email protected]\\S+"};
    for(String t:toMatch){
        Pattern pattern = Pattern.compile(t, Pattern.CASE_INSENSITIVE);
        String newTweet = text.trim();
        text="";
        // Each pass cuts off one leading match; repeat until nothing changes.
        while(!newTweet.equals(text)){
            text=newTweet;
            Matcher matcher = pattern.matcher(text);
            newTweet = matcher.replaceAll("");
            newTweet =newTweet.trim();
        }
    }
    text=text.replaceAll("-\\s*\\z","");
    text=text.replaceAll("&hellip;\\z","");
    text=StringEscapeUtils.unescapeHtml(text);
    text=text.trim();
    String[] parts=text.split(Extractor.urlRegExp);
    List<String> sentences=new ArrayList<String>();

    // Only the first ten URL-separated parts are processed.
    int limit=Math.min(10, parts.length);
    for(int i=0;i<limit;i++){
        String text_cleaned=extractor.cleanText(parts[i]);
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text_cleaned));
        dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("ptb3Escaping=false,untokenizable=noneDelete"));

        for (List<HasWord> sentence : dp) {
            StringBuilder sentenceSb = new StringBuilder();
            // True when the previous token was an opening bracket, so the
            // current token must attach to it without a space.
            boolean last_keep=false;
            for (HasWord token : sentence) {
                // No leading space for closing punctuation, for tokens that
                // contain an apostrophe, or right after an opening bracket.
                if((!token.word().matches("[,:!.;?)]"))&&(!token.word().contains("'"))&&!last_keep){
                    sentenceSb.append(" ");
                }
                last_keep=token.word().matches("[(\\[]");
                String next_word=token.toString();

                // Tokens equal to their own upper-case form are lower-cased
                // unless they are "I" or a known entity.
                if((next_word.toUpperCase().equals(next_word))&&(!next_word.equals("I"))&&(!entities.contains(next_word))){
                    next_word=next_word.toLowerCase();
                }
                if(next_word.equals("i")){
                    next_word="I";
                }
                sentenceSb.append(next_word);
            }
            String new_sentence=sentenceSb.toString().trim();
            if(new_sentence.isEmpty()){
                // Nothing to capitalize; charAt(0) would throw.
                continue;
            }
            new_sentence=String.valueOf(new_sentence.charAt(0)).toUpperCase()+new_sentence.substring(1);
            // A statement that used to sit here rewrote `text` when the sentence
            // ended with ":". `text` is never read after the split above, and the
            // statement threw StringIndexOutOfBoundsException whenever `text` was
            // shorter than three characters, so it has been removed.

            sentences.add(new_sentence);
        }
    }
    return sentences;
}
 
开发者ID:socialsensor,项目名称:trends-labeler,代码行数:68,代码来源:TrendsLabeler.java

示例7: getSentences1

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Cleans a text with {@code TrendsLabeler.getCleanedTitleMR}, splits it on
 * URLs, and returns the sentences found in the first ten resulting parts with
 * normalized spacing and casing.
 *
 * @param text     the raw text
 * @param entities tokens that must keep their all-upper-case spelling
 * @return the rebuilt sentences, each starting with an upper-case character
 */
public static List<String> getSentences1(String text, Set<String> entities) {
	text = TrendsLabeler.getCleanedTitleMR(text);

	String[] parts = text.split(Extractor.urlRegExp);
	List<String> sentences = new ArrayList<String>();

	// Only the first ten URL-separated parts are processed.
	int limit = Math.min(10, parts.length);
	for (int i = 0; i < limit; i++) {
		String text_cleaned = extr.cleanText(parts[i]);
		DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text_cleaned));
		dp.setTokenizerFactory(PTBTokenizerFactory
				.newWordTokenizerFactory("ptb3Escaping=false, untokenizable=noneDelete"));

		for (List<HasWord> sentence : dp) {
			StringBuilder sentenceSb = new StringBuilder();
			// True when the previous token was an opening bracket, so the
			// current token must attach to it without a space.
			boolean last_keep = false;
			for (HasWord token : sentence) {
				// No leading space for closing punctuation, for tokens that
				// contain an apostrophe, or right after an opening bracket.
				if ((!token.word().matches("[,:!.;?)]"))
						&& (!token.word().contains("'")) && !last_keep) {
					sentenceSb.append(" ");
				}
				last_keep = token.word().matches("[(\\[]");
				String next_word = token.toString();

				// Tokens equal to their own upper-case form are lower-cased
				// unless they are "I" or a known entity.
				if ((next_word.toUpperCase().equals(next_word))
						&& (!next_word.equals("I"))
						&& (!entities.contains(next_word))) {
					next_word = next_word.toLowerCase();
				}
				if (next_word.equals("i")) {
					next_word = "I";
				}
				sentenceSb.append(next_word);
			}
			String new_sentence = sentenceSb.toString().trim();
			if (new_sentence.isEmpty()) {
				// Nothing to capitalize; charAt(0) would throw.
				continue;
			}
			new_sentence = String.valueOf(new_sentence.charAt(0)).toUpperCase()
					+ new_sentence.substring(1);
			// A statement that used to sit here rewrote `text` when the sentence
			// ended with ":". `text` is never read after the split above, and the
			// statement threw StringIndexOutOfBoundsException whenever `text` was
			// shorter than three characters, so it has been removed.

			sentences.add(new_sentence);
		}
	}
	return sentences;
}
 
开发者ID:socialsensor,项目名称:trends-labeler,代码行数:57,代码来源:TrendsLabeler.java


注:本文中的edu.stanford.nlp.process.DocumentPreprocessor.iterator方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。