当前位置: 首页>>代码示例>>Java>>正文


Java DocumentPreprocessor类代码示例

本文整理汇总了Java中edu.stanford.nlp.process.DocumentPreprocessor的典型用法代码示例。如果您正苦于以下问题：Java DocumentPreprocessor类的具体用法？Java DocumentPreprocessor怎么用？Java DocumentPreprocessor使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。


DocumentPreprocessor类属于edu.stanford.nlp.process包，在下文中一共展示了DocumentPreprocessor类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: demoDP

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Demonstrates turning a file into tokens and then into parse trees.
 * Each sentence read from the file is parsed, the tree is printed by calling
 * pennPrint on the Tree object, and the sentence's collapsed typed
 * dependencies are printed afterwards. It is also possible to pass a
 * PrintWriter to pennPrint if you want to capture the output.
 *
 * file => tokens => parse trees
 *
 * @param lp       parser applied to every sentence
 * @param filename value handed to the DocumentPreprocessor constructor
 */
public static void demoDP(LexicalizedParser lp, String filename) {
	// This option shows loading, sentence-segmenting and tokenizing
	// a file using DocumentPreprocessor.
	TreebankLanguagePack tlp = new PennTreebankLanguagePack();
	GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
	// You could also create a tokenizer here and pass it
	// to DocumentPreprocessor
	for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
		Tree parse = lp.apply(sentence);
		parse.pennPrint();
		System.out.println();

		GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
		// Wildcard instead of a raw Collection: the elements are only printed,
		// so no concrete element type is needed and no raw-type warning is raised.
		Collection<?> tdl = gs.typedDependenciesCCprocessed();
		System.out.println(tdl);
		System.out.println();
	}
}
 
开发者ID:opinion-extraction-propagation,项目名称:TASC-Tuples,代码行数:27,代码来源:ParserDemo.java

示例2: getIDFMapForDocument

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Counts, for one document, the words whose tag starts with {@code "n"}.
 *
 * Text matching {@code headingSeparator} is removed first, sentences longer
 * than {@code MAX_SENTENCE_LENGTH} tokens are skipped, and the remaining
 * sentences are tagged with the shared {@code tagger}.
 *
 * @param document raw document text
 * @return counter keyed by word, incremented once per tagged occurrence whose
 *         tag starts with {@code "n"}
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // and don't help anything
  final String cleaned = headingSeparator.matcher(document).replaceAll("");

  final DocumentPreprocessor sentenceSource = new DocumentPreprocessor(new StringReader(cleaned));
  sentenceSource.setTokenizerFactory(tokenizerFactory);

  final Counter<String> counts = new ClassicCounter<String>();
  for (List<HasWord> tokens : sentenceSource) {
    if (tokens.size() <= MAX_SENTENCE_LENGTH) {
      for (TaggedWord taggedWord : tagger.tagSentence(tokens)) {
        if (taggedWord.tag().startsWith("n")) {
          counts.incrementCount(taggedWord.word());
        }
      }
    }
  }

  return counts;
}
 
开发者ID:asmehra95,项目名称:wiseowl,代码行数:30,代码来源:DocumentFrequencyCounter.java

示例3: applyPTBTokenizer

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tokenizes the content of the given preprocessor with a PTB word tokenizer
 * and returns one string per sentence.
 *
 * Every token is passed through {@code splitCompounds} and prefixed with a
 * single space, so each non-empty sentence string starts with a space.
 *
 * @param dp           preprocessor to read sentences from; its tokenizer factory is replaced
 * @param tokenizeNLs  value for the tokenizer's {@code tokenizeNLs} option
 * @param ptb3Escaping value for the tokenizer's {@code ptb3Escaping} option
 * @return the sentences in document order
 */
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
	String options = "tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true";
	PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(options);
	dp.setTokenizerFactory(tf);
	List<String> sentences = new ArrayList<>();
	for (List<HasWord> wordList : dp) {
		// StringBuilder avoids re-copying the whole sentence for every token.
		StringBuilder sentence = new StringBuilder();
		for (HasWord word : wordList) {
			sentence.append(' ').append(splitCompounds(word.word()));
		}
		sentences.add(sentence.toString());
	}
	return sentences;
}
 
开发者ID:infolis,项目名称:infoLink,代码行数:14,代码来源:TokenizerStanford.java

示例4: splitSentencesINDocument

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits a document into sentences and returns each sentence as a string of
 * its tokens separated by single spaces.
 *
 * Tokens are appended through their string form (StringBuilder.append(Object)).
 *
 * @param sDoc document text
 * @return one trimmed string per sentence, in document order
 */
public List<String> splitSentencesINDocument(String sDoc)
{
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();

    for (List<HasWord> sentence : dp)
    {
        StringBuilder sentenceSb = new StringBuilder();
        for (HasWord token : sentence)
        {
            // Separate from the previous token. The check must be "> 0": with
            // "> 1" a one-character first token (e.g. "I") was glued to the
            // token that follows it.
            if (sentenceSb.length() > 0)
            {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
 
开发者ID:serendio-labs-stage,项目名称:diskoveror-ta,代码行数:24,代码来源:StanfordNLP.java

示例5: parse_text

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits untokenized text into sentences, parses each one and returns the
 * formatted trees together with their scores.
 *
 * @param text         raw text; assumed not to be tokenized yet
 * @param outputFormat output options handed to {@code ParserUtil.setOptions}
 * @return one ParseTree per sentence found, in document order
 * @throws TApplicationException with type INTERNAL_ERROR when anything fails;
 *         the original exception is attached as the cause
 */
public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException
{
    List<ParseTree> results = new ArrayList<ParseTree>();

    try
    {
        treePrinter = ParserUtil.setOptions(outputFormat, tlp);

        // assume no tokenization was done; use Stanford's default tokenizer
        DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(text));
        Iterator<List<HasWord>> foundSentences = preprocess.iterator();
        while (foundSentences.hasNext())
        {
            Tree parseTree = parser.apply(foundSentences.next());
            results.add(new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score()));
        }
    }
    catch (Exception e)
    {
        // Same type and message as before, but keep the original exception as
        // the cause so its stack trace is not lost on the server side.
        TApplicationException wrapped = new TApplicationException(TApplicationException.INTERNAL_ERROR, e.getMessage());
        wrapped.initCause(e);
        throw wrapped;
    }

    return results;
}
 
开发者ID:dmnapolitano,项目名称:stanford-thrift,代码行数:26,代码来源:StanfordParserThrift.java

示例6: tokenizeText

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits arbitrary text into sentences and returns the word of every token.
 *
 * @param arbitraryText raw text to tokenize
 * @return one list per sentence, each holding that sentence's token words in order
 */
public List<List<String>> tokenizeText(String arbitraryText)
{
	List<List<String>> sentences = new ArrayList<List<String>>();

	DocumentPreprocessor sentenceSource = new DocumentPreprocessor(new StringReader(arbitraryText));
	for (List<HasWord> tokens : sentenceSource)
	{
		List<String> words = new ArrayList<String>();
		for (HasWord token : tokens)
		{
			words.add(token.word());
		}
		sentences.add(words);
	}

	return sentences;
}
 
开发者ID:dmnapolitano,项目名称:stanford-thrift,代码行数:20,代码来源:StanfordTokenizerThrift.java

示例7: main

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags every sentence of a UTF-8 text file and prints the tagged sentences
 * to standard output.
 *
 * @param args {@code args[0]} is the tagger model file, {@code args[1]} the file to tag
 * @throws Exception if the model or the input file cannot be read
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
							   "untokenizable=noneKeep");
  // try-with-resources: the reader and the writer are closed (and the writer
  // flushed) even when tokenizing or tagging throws part-way through the file.
  try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
       PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }
  }
}
 
开发者ID:jaimeguzman,项目名称:data_mining,代码行数:19,代码来源:TaggerDemo2.java

示例8: demoDP

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Demonstrates loading, sentence-segmenting and tokenizing a file with
 * DocumentPreprocessor, then parsing each sentence. For every sentence the
 * parse tree is printed with pennPrint, followed by the result of
 * {@code typedDependenciesCCprocessed(true)}.
 *
 * @param lp       parser applied to every sentence
 * @param filename value handed to the DocumentPreprocessor constructor
 */
public static void demoDP(LexicalizedParser lp, String filename) {
  TreebankLanguagePack tlp = new PennTreebankLanguagePack();
  GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
  // You could also create a tokenizer here and pass it
  // to DocumentPreprocessor
  for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
    Tree parse = lp.apply(sentence);
    parse.pennPrint();
    System.out.println();

    GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
    // Wildcard instead of a raw Collection: the elements are only printed,
    // so no concrete element type is needed and no raw-type warning is raised.
    Collection<?> tdl = gs.typedDependenciesCCprocessed(true);
    System.out.println(tdl);
    System.out.println();
  }
}
 
开发者ID:amark-india,项目名称:eventspotter,代码行数:19,代码来源:ParserDemo.java

示例9: depParseSentence

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags and dependency-parses the given text and wraps the result in a DepTree.
 *
 * The input is expected to hold a single sentence. If the preprocessor yields
 * several, the tree of the last one is returned; if it yields none, the
 * result is {@code null}.
 *
 * @param sentence text of the sentence to parse
 * @return the dependency tree, or {@code null} when no sentence was found
 */
public static DepTree depParseSentence(String sentence){
	DocumentPreprocessor sentenceSource = new DocumentPreprocessor(new StringReader(sentence));
	DepTree result = null;

	// Only one sentence is expected, so this loop normally runs once.
	for (List<HasWord> tokens : sentenceSource) {
		List<TaggedWord> taggedTokens = tagger.tagSentence(tokens);
		GrammaticalStructure structure = parser.predict(taggedTokens);
		result = new DepTree(taggedTokens, structure);
	}

	return result;
}
 
开发者ID:cs-zyluo,项目名称:CausalNet,代码行数:16,代码来源:DepBranchUtil.java

示例10: segment

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the input into sentences and returns the text of each sentence as
 * produced by {@code SentenceUtils.listToOriginalTextString}.
 *
 * @param input text to segment
 * @return one string per sentence, in document order
 */
@Override
public List<String> segment(String input) {
    List<String> sentences = new ArrayList<>();
    for (List<HasWord> tokens : new DocumentPreprocessor(new StringReader(input))) {
        String sentenceText = SentenceUtils.listToOriginalTextString(tokens);
        sentences.add(sentenceText);
    }
    return sentences;
}
 
开发者ID:clearwsd,项目名称:clearwsd,代码行数:10,代码来源:StanfordTokenizer.java

示例11: sentenceSplitter

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the input into sentences and renders each one with
 * {@code Sentence.listToString}.
 *
 * @param input text to split
 * @return one string per sentence, in document order
 */
public List<String> sentenceSplitter(String input){
	DocumentPreprocessor sentenceSource = new DocumentPreprocessor(new StringReader(input));
	List<String> sentences = new ArrayList<String>();

	for (List<HasWord> tokens : sentenceSource) {
		sentences.add(Sentence.listToString(tokens));
	}
	return sentences;
}
 
开发者ID:kunal15595,项目名称:smart-question-answering-nlp,代码行数:13,代码来源:QuestionGenerator.java

示例12: ParseTree

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Construct a parse tree using the stanford NLP parser. Only the first
 * sentence of the text is used.
 * Here we are omitting the information of dependency labels (tags).
 *
 * @param text   input text; must contain at least one sentence
 * @param parser holder of the tagger and dependency parser to apply
 * @throws IllegalArgumentException if no sentence can be extracted from {@code text}
 */
public ParseTree(String text, NLParser parser) {
	// pre-processing the input text: keep the first sentence only
	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	List<HasWord> sentence = null;
	for (List<HasWord> sentenceHasWord : tokenizer) {
		sentence = sentenceHasWord;
		break;
	}
	// Fail with a clear message instead of an opaque NullPointerException
	// further down when the text holds no sentence at all.
	if (sentence == null) {
		throw new IllegalArgumentException("No sentence found in input text: \"" + text + "\"");
	}
	// part-of-speech tagging
	List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
	// dependency syntax parsing
	GrammaticalStructure gs = parser.parser.predict(tagged);

	// Reading the parsed sentence into ParseTree.
	// Slot 0 is the artificial ROOT node; word i of the sentence is node i+1.
	int nodeCount = sentence.size() + 1;
	Node[] nodes = new Node[nodeCount];
	root = new Node(0, "ROOT", "ROOT");
	nodes[0] = root;
	for (int i = 0; i < nodeCount - 1; i++) {
		nodes[i+1] = new Node(i+1,
				sentence.get(i).word(), tagged.get(i).tag());
	}
	// Link every dependent to its governor, using the dependency indices
	// directly as positions in the node array.
	for (TypedDependency typedDep : gs.allTypedDependencies()) {
		int from = typedDep.gov().index();
		int to   = typedDep.dep().index();
		// String label = typedDep.reln().getShortName(); // omitting the label
		nodes[to].parent = nodes[from];
		nodes[from].children.add(nodes[to]);
	}
}
 
开发者ID:DukeNLIDB,项目名称:NLIDB,代码行数:36,代码来源:ParseTree.java

示例13: main

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags and dependency-parses a fixed example sentence and logs the result.
 *
 * Recognized arguments, each followed by a value:
 * {@code -tagger <path>} and {@code -com.dukenlidb.nlidb.model <path>}.
 *
 * @param args command-line arguments
 * @throws RuntimeException on an unknown argument or a flag without a value
 */
public static void main(String[] args) {
	String modelPath = DependencyParser.DEFAULT_MODEL;
	String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

	for (int argIndex = 0; argIndex < args.length;) {
		switch (args[argIndex]) {
		case "-tagger":
			taggerPath = argumentValue(args, argIndex);
			argIndex += 2;
			break;
		case "-com.dukenlidb.nlidb.model":
			modelPath = argumentValue(args, argIndex);
			argIndex += 2;
			break;
		default:
			throw new RuntimeException("Unknown argument " + args[argIndex]);
		}
	}

	String text = "Return authors who have more papers than Bob in VLDB after 2000";

	MaxentTagger tagger = new MaxentTagger(taggerPath);
	DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

	DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
	for (List<HasWord> sentence : tokenizer) {
		List<TaggedWord> tagged = tagger.tagSentence(sentence);
		GrammaticalStructure gs = parser.predict(tagged);

		// Print typed dependencies
		log.info(gs);
	}

}

/**
 * Returns the value following the flag at {@code flagIndex}, failing with a
 * descriptive message (rather than ArrayIndexOutOfBoundsException) when the
 * flag is the last argument.
 */
private static String argumentValue(String[] args, int flagIndex) {
	if (flagIndex + 1 >= args.length) {
		throw new RuntimeException("Missing value for argument " + args[flagIndex]);
	}
	return args[flagIndex + 1];
}
 
开发者ID:DukeNLIDB,项目名称:NLIDB,代码行数:35,代码来源:ParserDemo.java

示例14: getSentences

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the turn text into sentences, rebuilding each sentence from the
 * original text of its tokens joined by single spaces.
 *
 * Sentences that end up empty or contain no ASCII letter are dropped.
 *
 * @param removeSignatures when true, each sentence is cut off at the first
 *        occurrence of any entry of {@code signaturepatternsPTB}
 * @return list of sentences representing the turn text
 */
public List<String> getSentences(boolean removeSignatures) {
	List<String> sentences = new ArrayList<>();
	DocumentPreprocessor prep = new DocumentPreprocessor(new StringReader(text));

	for (List<HasWord> sentence : prep) {
		StringBuilder sb = new StringBuilder();
		for (HasWord word : sentence) {
			CoreLabel cl = (CoreLabel) word;
			sb.append(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
			sb.append(' ');
		}
		String resSentence = sb.toString().trim();
		if (removeSignatures) {
			for (String pattern : signaturepatternsPTB) {
				// Cut at the literal occurrence. The previous split(pattern)[0]
				// interpreted the pattern as a regex (unlike the contains() check
				// guarding it) and threw ArrayIndexOutOfBoundsException when the
				// sentence consisted only of the pattern.
				int signatureStart = resSentence.indexOf(pattern);
				if (signatureStart >= 0) {
					resSentence = resSentence.substring(0, signatureStart);
				}
			}
		}
		if (!resSentence.trim().isEmpty() && resSentence.matches(".*[a-zA-Z]+.*")) {
			sentences.add(resSentence);
		}
	}

	return sentences;
}
 
开发者ID:DiscourseDB,项目名称:discoursedb-core,代码行数:32,代码来源:Turn.java

示例15: process

import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags the given text and collects per-document statistics: total tagged
 * word count, number of occurrences per tag, and for the tags {@code NN}
 * and {@code NNS} a bag of lemma counts.
 *
 * @param etextNo identifier of the text, passed through to the result
 * @param text    reader over the text to process
 * @return the collected statistics
 */
@Override
public TaggerResult process(Integer etextNo, Reader text) {
  final DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(text);
  documentPreprocessor.setTokenizerFactory(tokenizerFactory);

  int words = 0;
  final Map<String,Double> tagCounts = new TreeMap<String,Double>();
  final Map<String,Map<String,Integer>> wordBags = new HashMap<>();
  for (List<HasWord> sentence : documentPreprocessor) {
    for (TaggedWord word : tagger.tagSentence(sentence)) {
      // word count
      words++;

      // tag counts
      final String tag = word.tag();
      tagCounts.merge(tag, 1.0, Double::sum);

      // noun word bags
      if ("NN".equals(tag) || "NNS".equals(tag) /* || tag.startsWith("VB") */) {
        // Get base form of word. The stem result is checked for null BEFORE
        // calling toString(); previously a null stem raised a
        // NullPointerException and the fallback below was unreachable.
        final Object stemmed = morphology.stem(word);
        String lemma = (stemmed == null) ? null : stemmed.toString();
        if (lemma == null) {
          lemma = word.toString();
        }
        // get (or create) the bag for words of this POS and increment the count
        wordBags.computeIfAbsent(tag, k -> new HashMap<>()).merge(lemma, 1, Integer::sum);
      }
    }
  }
  System.err.println("Processed: " + etextNo + " " + words + " words");
  return new TaggerResult(etextNo, tagCounts, wordBags, words);
}
 
开发者ID:tmmcguire,项目名称:ashurbanipal,代码行数:39,代码来源:EnglishTagger.java


注:本文中的edu.stanford.nlp.process.DocumentPreprocessor类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。