当前位置: 首页>>代码示例>>Java>>正文


Java DocumentPreprocessor.setTokenizerFactory方法代码示例

本文整理汇总了Java中edu.stanford.nlp.process.DocumentPreprocessor.setTokenizerFactory方法的典型用法代码示例。如果您正苦于以下问题:Java DocumentPreprocessor.setTokenizerFactory方法的具体用法?Java DocumentPreprocessor.setTokenizerFactory怎么用?Java DocumentPreprocessor.setTokenizerFactory使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在edu.stanford.nlp.process.DocumentPreprocessor的用法示例。


在下文中一共展示了DocumentPreprocessor.setTokenizerFactory方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: getIDFMapForDocument

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Builds a document-frequency counter over the nouns in the given document
 * string: each distinct noun token is counted once per occurrence after
 * sentence splitting and POS tagging.
 *
 * @param document raw document text (may contain Gigaword heading markup)
 * @return counter mapping each noun word form to its occurrence count
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // / don't help anything
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    // Overlong "sentences" are usually tables or markup noise, and tagging
    // them is expensive -- skip them entirely.
    if (sentence.size() > MAX_SENTENCE_LENGTH)
      continue;

    List<TaggedWord> tagged = tagger.tagSentence(sentence);

    for (TaggedWord w : tagged) {
      // Count nouns only. Penn Treebank noun tags are upper case ("NN",
      // "NNS", "NNP", "NNPS"), so the original check startsWith("n")
      // matched nothing with a standard English tagger model; accept
      // either case so both standard and lower-cased tagsets work.
      String tag = w.tag();
      if (tag.startsWith("N") || tag.startsWith("n"))
        idfMap.incrementCount(w.word());
    }
  }

  return idfMap;
}
 
开发者ID:asmehra95,项目名称:wiseowl,代码行数:30,代码来源:DocumentFrequencyCounter.java

示例2: applyPTBTokenizer

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Tokenizes the given document with a PTB tokenizer and returns one string
 * per detected sentence. Every token is preceded by a single space, so each
 * returned sentence keeps a leading space (preserved from the original
 * implementation; callers may rely on it).
 *
 * @param dp           preprocessor wrapping the input text
 * @param tokenizeNLs  whether newlines are returned as tokens
 * @param ptb3Escaping whether PTB3 token escaping (brackets, quotes) is applied
 * @return list of space-joined, compound-split sentences
 */
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
	PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(
			"tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
	dp.setTokenizerFactory(tf);
	List<String> sentences = new ArrayList<>();
	for (List<HasWord> wordList : dp) {
		// StringBuilder avoids the O(n^2) cost of repeated String "+=" per sentence.
		StringBuilder sentence = new StringBuilder();
		for (HasWord word : wordList) {
			sentence.append(' ').append(splitCompounds(word.word()));
		}
		sentences.add(sentence.toString());
	}
	return sentences;
}
 
开发者ID:infolis,项目名称:infoLink,代码行数:14,代码来源:TokenizerStanford.java

示例3: main

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Demo entry point: reads a UTF-8 text file, splits it into sentences,
 * POS-tags each sentence with a MaxentTagger and prints the tagged
 * sentences to stdout.
 *
 * <p>Usage: {@code java TaggerDemo modelFile fileToTag}
 *
 * @param args [0] = tagger model file, [1] = UTF-8 text file to tag
 * @throws Exception on I/O or tagger-loading failure
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  // "untokenizable=noneKeep": keep characters the tokenizer cannot handle
  // instead of deleting them or logging warnings.
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
						   "untokenizable=noneKeep");
  // try-with-resources: the original never closed the reader and leaked
  // both streams if tagging threw. Closing pw flushes stdout as before.
  try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
       PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }
  }
}
 
开发者ID:jaimeguzman,项目名称:data_mining,代码行数:19,代码来源:TaggerDemo2.java

示例4: process

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Tags the given text and aggregates per-text statistics: total word count,
 * a count per POS tag, and a "bag of words" (lemma -> count) for the noun
 * tags NN and NNS.
 *
 * @param etextNo identifier of the e-text being processed (used for logging
 *                and carried into the result)
 * @param text    reader over the text to tag
 * @return aggregated tag counts, word bags and word total for this text
 */
@Override
public TaggerResult process(Integer etextNo, Reader text) {
  final DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(text);
  documentPreprocessor.setTokenizerFactory(tokenizerFactory);

  int words = 0;
  // TreeMap keeps tag counts sorted by tag name, as before.
  final Map<String,Double> tagCounts = new TreeMap<String,Double>();
  final Map<String,Map<String,Integer>> wordBags = new HashMap<>();
  for (List<HasWord> sentence : documentPreprocessor) {
    for (TaggedWord word : tagger.tagSentence(sentence)) {
      // word count
      words++;

      // tag counts -- merge() replaces the getOrDefault/put dance
      final String tag = word.tag();
      tagCounts.merge(tag, 1.0, Double::sum);

      // noun/verb word bags
      if ("NN".equals(tag) || "NNS".equals(tag) /* || tag.startsWith("VB") */) {
        // get base form of word
        String lemma = morphology.stem(word).toString();
        if (lemma == null) {
          lemma = word.toString();
        }
        // one bag per POS tag, created lazily; increment the lemma's count
        wordBags.computeIfAbsent(tag, t -> new HashMap<>())
                .merge(lemma, 1, Integer::sum);
      }
    }
  }
  System.err.println("Processed: " + etextNo + " " + words + " words");
  return new TaggerResult(etextNo, tagCounts, wordBags, words);
}
 
开发者ID:tmmcguire,项目名称:ashurbanipal,代码行数:39,代码来源:EnglishTagger.java

示例5: getSentencesFromText

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Splits raw text into tokenized sentences.
 *
 * @param str        the text to split
 * @param invertible if true, use an invertible PTB tokenizer (tokens retain
 *                   their original text and offsets); "invertible=true" is
 *                   also prepended to the option string
 * @param options    additional PTBTokenizer options, may be null or empty
 * @return one token list per detected sentence
 */
public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
    List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
    StringReader reader = new StringReader(str);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    // Raw TokenizerFactory kept deliberately: the two factory() overloads
    // used below produce factories with different token type parameters.
    TokenizerFactory factory;

    if( invertible ) {
      factory = PTBTokenizer.factory(true, true);
      if( options != null && options.length() > 0 )
        options = "invertible=true, " + options;
      else
        options = "invertible=true";
    } else {
      factory = PTBTokenizer.factory();
    }

//    System.out.println("Setting splitter options=" + options);
    factory.setOptions(options);
    dp.setTokenizerFactory(factory);

    // Enhanced for loop replaces the manual Iterator hasNext()/next() dance.
    for (List<HasWord> sentence : dp) {
      sentences.add(sentence);
    }
    return sentences;
}
 
开发者ID:nchambers,项目名称:probschemas,代码行数:29,代码来源:Ling.java

示例6: main

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Demo entry point: POS-tags a UTF-8 file and prints the tagged sentences,
 * then tags one hard-coded sentence and prints only its adjectives (shows
 * how to inspect words and tags in a tagged sentence).
 *
 * <p>Usage: {@code java TaggerDemo2 modelFile fileToTag}
 *
 * @param args [0] = tagger model file, [1] = UTF-8 text file to tag
 * @throws Exception on I/O or tagger-loading failure
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  // "untokenizable=noneKeep": keep characters the tokenizer cannot handle
  // instead of dropping them.
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
						   "untokenizable=noneKeep");
  // try-with-resources: the original leaked the reader (never closed) and
  // both streams on an exception. Closing pw flushes stdout as before.
  try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
       PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
    List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      // JJ, JJR, JJS are the Penn Treebank adjective tags.
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }
  }
}
 
开发者ID:tudarmstadt-lt,项目名称:sentiment,代码行数:29,代码来源:TaggerDemo2.java

示例7: getSentences1_old

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Legacy tweet sentence splitter. Strips retweet markers, trailing
 * truncation artifacts ("&hellip;", trailing dash) and URLs from the raw
 * tweet text, then splits the remainder into sentences with Stanford's
 * DocumentPreprocessor and re-assembles each sentence with normalized
 * spacing and capitalization.
 *
 * @param text     raw tweet text
 * @param entities surface forms (e.g. named entities) that must keep their
 *                 original ALL-CAPS spelling instead of being lower-cased
 * @return cleaned, capitalized sentences; only the first 10 URL-separated
 *         segments of the tweet are processed
 */
public static List<String> getSentences1_old(String text, Set<String> entities){
        text=text.trim();
        // Escape first so the "&hellip;" entity below can be matched literally.
        text=StringEscapeUtils.escapeHtml(text);
        // Drop a trailing truncated URL ("http:...&hellip;" at the end).
        text=text.replaceAll("http:.*&hellip;\\z","");
        // NOTE(review): "[email protected]" looks like scraper-mangled source text;
        // the original patterns presumably anchored leading "RT @user" /
        // "MT @user" markers (\A + "@\S+") -- confirm against the upstream repo.
        String[] toMatch={"\\ART\\[email protected]\\S+", "\\AMT\\[email protected]\\S+"};
        for(String t:toMatch){
                Pattern pattern = Pattern.compile(t, Pattern.CASE_INSENSITIVE);
                String newTweet = text.trim();
                text="";
                while(!newTweet.equals(text)){         //each loop will cut off one "RT @XXX" or "#XXX"; may need a few calls to cut all hashtags etc.
                        text=newTweet;
                        Matcher matcher = pattern.matcher(text);
                        newTweet = matcher.replaceAll("");
                        newTweet =newTweet.trim();
                }
        }
        // Remove a trailing dash and any trailing ellipsis, then un-escape.
        text=text.replaceAll("-\\s*\\z","");
        text=text.replaceAll("&hellip;\\z","");
        text=StringEscapeUtils.unescapeHtml(text);
        text=text.trim();
        // Split on URLs; each URL-free segment is sentence-split independently.
        String[] parts=text.split(Extractor.urlRegExp);
        List<String> sentences=new ArrayList<String>();
        
//        for(int i=0;i<parts.length;i++){
        // Cap the work at the first 10 segments.
        int limit=10;
        if(limit>parts.length) 
			limit=parts.length;
        for(int i=0;i<limit;i++){
//            parts[i]=text.replace("http://*&hellip;","");
            String text_cleaned=extractor.cleanText(parts[i]);
//            List<String> sentences_tmp=new ArrayList<String>();
            Reader reader = new StringReader(text_cleaned);
            DocumentPreprocessor dp = new DocumentPreprocessor(reader);
            // ptb3Escaping=false keeps raw token text; untokenizable characters
            // are silently deleted.
            dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("ptb3Escaping=false,untokenizable=noneDelete"));
                    //prop.setProperty("tokenizerOptions", "untokenizable=noneDelete");

            Iterator<List<HasWord>> it = dp.iterator();
            while (it.hasNext()) {
                StringBuilder sentenceSb = new StringBuilder();
                List<HasWord> sentence = it.next();
                // last_keep suppresses the space after an opening bracket so
                // "( word" becomes "(word".
                boolean last_keep=false;
                for (HasWord token : sentence) {
                    // No space before closing punctuation, apostrophes, or
                    // right after an opening bracket.
                    if((!token.word().matches("[,:!.;?)]"))&&(!token.word().contains("'"))&&!last_keep){
                        sentenceSb.append(" ");
                    }
                    last_keep=false;
                    if(token.word().matches("[(\\[]"))
                            last_keep=true;
                    String next_word=token.toString();
                      
                    // Lower-case ALL-CAPS words unless they are "I" or a known entity.
                    if((next_word.toUpperCase().equals(next_word))&&(!next_word.equals("I"))&&(!entities.contains(next_word)))
                        next_word=next_word.toLowerCase();
                    if(next_word.equals("i")) next_word="I";
                    sentenceSb.append(next_word);
                }
                String new_sentence=sentenceSb.toString().trim();
                // NOTE(review): charAt(0) throws StringIndexOutOfBoundsException
                // if a sentence is empty after trimming -- confirm inputs can't
                // produce an empty sentence here.
                Character fc=new_sentence.charAt(0);
                // Capitalize the first character of the sentence.
                new_sentence=fc.toString().toUpperCase()+new_sentence.substring(1);
                if(new_sentence.endsWith(":"))
                    text=text.substring(0,text.length()-3)+".";

                sentences.add(new_sentence);
            }
  //          sentences.addAll(sentences_tmp);
        }
        return sentences;
    }
 
开发者ID:socialsensor,项目名称:trends-labeler,代码行数:68,代码来源:TrendsLabeler.java

示例8: getSentences1

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Tweet sentence splitter (current version; see getSentences1_old for the
 * legacy variant). Cleans the raw tweet via
 * {@code TrendsLabeler.getCleanedTitleMR}, splits it on URLs, then sentence-
 * splits each URL-free segment with Stanford's DocumentPreprocessor and
 * re-assembles each sentence with normalized spacing and capitalization.
 *
 * @param text     raw tweet text
 * @param entities surface forms (e.g. named entities) that must keep their
 *                 original ALL-CAPS spelling instead of being lower-cased
 * @return cleaned, capitalized sentences; only the first 10 URL-separated
 *         segments of the tweet are processed
 */
public static List<String> getSentences1(String text, Set<String> entities) {
//		System.out.println("   Text as it is    :   " + text);
		text = TrendsLabeler.getCleanedTitleMR(text);

		// Split on URLs; each URL-free segment is sentence-split independently.
		String[] parts = text.split(Extractor.urlRegExp);
		List<String> sentences = new ArrayList<String>();

		// for(int i=0;i<parts.length;i++){
		// Cap the work at the first 10 segments.
		int limit = 10;
		if (limit > parts.length)
			limit = parts.length;
		for (int i = 0; i < limit; i++) {
			String text_cleaned = extr.cleanText(parts[i]);
			// List<String> sentences_tmp=new ArrayList<String>();
			Reader reader = new StringReader(text_cleaned);
			DocumentPreprocessor dp = new DocumentPreprocessor(reader);
			// ptb3Escaping=false keeps raw token text; untokenizable
			// characters are silently deleted.
			dp.setTokenizerFactory(PTBTokenizerFactory
					.newWordTokenizerFactory("ptb3Escaping=false, untokenizable=noneDelete"));
			// dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("untokenizable=noneDelete"));

			Iterator<List<HasWord>> it = dp.iterator();
			while (it.hasNext()) {
				StringBuilder sentenceSb = new StringBuilder();
				List<HasWord> sentence = it.next();
				// last_keep suppresses the space after an opening bracket so
				// "( word" becomes "(word".
				boolean last_keep = false;
				for (HasWord token : sentence) {
					// No space before closing punctuation, apostrophes, or
					// right after an opening bracket.
					if ((!token.word().matches("[,:!.;?)]"))
							&& (!token.word().contains("'")) && !last_keep) {
						sentenceSb.append(" ");
					}
					last_keep = false;
					if (token.word().matches("[(\\[]"))
						last_keep = true;
					String next_word = token.toString();

					// Lower-case ALL-CAPS words unless they are "I" or a
					// known entity.
					if ((next_word.toUpperCase().equals(next_word))
							&& (!next_word.equals("I"))
							&& (!entities.contains(next_word)))
						next_word = next_word.toLowerCase();
					if (next_word.equals("i"))
						next_word = "I";
					sentenceSb.append(next_word);
				}
				String new_sentence = sentenceSb.toString().trim();
				// NOTE(review): charAt(0) throws StringIndexOutOfBoundsException
				// if a sentence is empty after trimming -- confirm inputs can't
				// produce an empty sentence here.
				Character fc = new_sentence.charAt(0);
				// Capitalize the first character of the sentence.
				new_sentence = fc.toString().toUpperCase()
						+ new_sentence.substring(1);
				if (new_sentence.endsWith(":"))
					text = text.substring(0, text.length() - 3) + "."; 

				sentences.add(new_sentence);
			}
			// sentences.addAll(sentences_tmp);
		}
		return sentences;
	}
 
开发者ID:socialsensor,项目名称:trends-labeler,代码行数:57,代码来源:TrendsLabeler.java

示例9: nearestDelimiter

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Finds the nearest sentence delimiter relative to index {@code start} in
 * {@code text}. If {@code seekDir} is SEEK_FORWARD, returns the position of
 * the delimiter ending the sentence containing {@code start}; if SEEK_BACK,
 * the position just before that sentence begins. Returns -1 if no interval
 * contains {@code start}.
 *
 * @param text    the text to sentence-split
 * @param start   character offset to search from
 * @param seekDir SEEK_FORWARD or SEEK_BACK
 * @return character offset of the nearest delimiter, or -1
 * @throws IllegalArgumentException if seekDir is not a known direction
 * @throws ClassCastException if the tokenizer does not produce offset-bearing
 *         tokens (HasOffset)
 */
private int nearestDelimiter(String text, int start, int seekDir) {
  if (seekDir != SEEK_BACK && seekDir != SEEK_FORWARD) {
    throw new IllegalArgumentException("Unknown seek direction " +
                                       seekDir);
  }
  StringReader reader = new StringReader(text);
  DocumentPreprocessor processor = new DocumentPreprocessor(reader);
  // Use the language pack's tokenizer so sentence boundaries match the parser's.
  TokenizerFactory tf = tlp.getTokenizerFactory();
  processor.setTokenizerFactory(tf);
  // boundaries[i] = start offset of sentence i; a final entry of text.length()
  // closes the last interval, so interval i is [boundaries[i], boundaries[i+1]).
  List<Integer> boundaries = new ArrayList<Integer>();
  for (List<HasWord> sentence : processor) {
    if (sentence.size() == 0)
      continue;
    // Offsets are only available if the tokenizer produced HasOffset tokens.
    if (!(sentence.get(0) instanceof HasOffset)) {
      throw new ClassCastException("Expected HasOffsets from the " +
                                   "DocumentPreprocessor");
    }
    // The first sentence always starts at 0 (its first token may be preceded
    // by whitespace); later sentences start at their first token's offset.
    if (boundaries.size() == 0) {
      boundaries.add(0);
    } else {
      HasOffset first = (HasOffset) sentence.get(0);
      boundaries.add(first.beginPosition());
    }
  }
  boundaries.add(text.length());
  // Find the interval containing start; the "- 1" converts a sentence start
  // offset into the position of the delimiter just before it.
  for (int i = 0; i < boundaries.size() - 1; ++i) {
    if (boundaries.get(i) <= start && start < boundaries.get(i + 1)) {
      if (seekDir == SEEK_BACK) {
        return boundaries.get(i) - 1;
      } else if (seekDir == SEEK_FORWARD) {
        return boundaries.get(i + 1) - 1;
      }
    }
  }
  // The cursor position at the end is actually one past the text length.
  // We might as well highlight the last interval in that case.
  if (boundaries.size() >= 2 && start >= text.length()) {
    if (seekDir == SEEK_BACK) {
      return boundaries.get(boundaries.size() - 2) - 1;
    } else if (seekDir == SEEK_FORWARD) {
      return boundaries.get(boundaries.size() - 1) - 1;
    }
  }
  return -1;
}
 
开发者ID:amark-india,项目名称:eventspotter,代码行数:51,代码来源:ParserPanel.java

示例10: saveOutput

import edu.stanford.nlp.process.DocumentPreprocessor; //导入方法依赖的package包/类
/**
 * Saves the results of applying the parser to the current text to
 * the specified filename. Sentence-splits the text pane's contents, then
 * hands the sentences to a background SaveOutputThread while showing a
 * modal progress dialog with a Cancel button.
 *
 * @param filename destination file; a null or empty name is a no-op
 */
public void saveOutput(String filename) {
  if (filename == null || filename.equals("")) {
    return;
  }

  String text = textPane.getText();
  StringReader reader = new StringReader(text);
  DocumentPreprocessor processor = new DocumentPreprocessor(reader);
  // Use the language pack's tokenizer so sentences match what the parser expects.
  TokenizerFactory tf = tlp.getTokenizerFactory();
  processor.setTokenizerFactory(tf);
  // Materialize all sentences up front so the progress bar has a total count.
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  for (List<HasWord> sentence : processor) {
    sentences.add(sentence);
  }

  JProgressBar progress = new JProgressBar(0, sentences.size());
  JButton cancel = new javax.swing.JButton();

  // Third argument 'true' makes the dialog modal, so setVisible(true) below
  // blocks this (event-dispatch) caller until the dialog is dismissed.
  JDialog dialog = new JDialog(new Frame(), "Parser Progress", true);

  dialog.setSize(300, 150);
  dialog.add(BorderLayout.NORTH, 
             new JLabel("Parsing " + sentences.size() + " sentences"));
  dialog.add(BorderLayout.CENTER, progress);
  dialog.add(BorderLayout.SOUTH, cancel);
  //dialog.add(progress);

  // The worker thread updates the progress bar and closes the dialog when done.
  final SaveOutputThread thread = 
    new SaveOutputThread(filename, progress, dialog, cancel, sentences);

  cancel.setText("Cancel");
  cancel.setToolTipText("Cancel");
  cancel.addActionListener(new java.awt.event.ActionListener() {
    public void actionPerformed(java.awt.event.ActionEvent evt) {
      // Cooperative cancellation: the worker polls this flag.
      thread.cancelled = true;
    }
  });

  thread.start();

  dialog.setVisible(true);    
}
 
开发者ID:amark-india,项目名称:eventspotter,代码行数:47,代码来源:ParserPanel.java


注:本文中的edu.stanford.nlp.process.DocumentPreprocessor.setTokenizerFactory方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。