本文整理汇总了Java中edu.stanford.nlp.process.DocumentPreprocessor类的典型用法代码示例。如果您正苦于以下问题:Java DocumentPreprocessor类的具体用法?Java DocumentPreprocessor怎么用?Java DocumentPreprocessor使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
DocumentPreprocessor类属于edu.stanford.nlp.process包,在下文中一共展示了DocumentPreprocessor类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: demoDP
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Demonstrates turning a file into tokens and then into parse trees.
 *
 * <p>Pipeline: file =&gt; tokens =&gt; parse trees. Each tree is printed by calling
 * {@code pennPrint()} on it, followed by the typed dependencies returned by
 * {@code typedDependenciesCCprocessed()}. It is also possible to pass a
 * {@code PrintWriter} to {@code pennPrint} to capture the output.
 *
 * @param lp parser applied to every sentence read from the file
 * @param filename file handed to {@code DocumentPreprocessor}, which loads,
 *     sentence-segments and tokenizes it
 */
public static void demoDP(LexicalizedParser lp, String filename) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    // A tokenizer could also be created here and passed to DocumentPreprocessor.
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
        Tree parse = lp.apply(sentence);
        parse.pennPrint();
        System.out.println();

        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        // Wildcard instead of a raw Collection: the elements are only printed.
        Collection<?> tdl = gs.typedDependenciesCCprocessed();
        System.out.println(tdl);
        System.out.println();
    }
}
示例2: getIDFMapForDocument
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Builds a word counter for the given document string.
 *
 * <p>Matches of {@code headingSeparator} are removed, the remaining text is split
 * into sentences and tagged, and every word whose tag starts with {@code "n"} is
 * counted. Sentences with more than {@code MAX_SENTENCE_LENGTH} tokens are skipped.
 *
 * @param document raw document text
 * @return counter keyed by word, incremented once per matching tagged token
 */
private static Counter<String> getIDFMapForDocument(String document) {
    // Clean up: some Gigaword patterns slow things down and do not help anything.
    String cleaned = headingSeparator.matcher(document).replaceAll("");

    Counter<String> counts = new ClassicCounter<String>();
    DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(cleaned));
    sentences.setTokenizerFactory(tokenizerFactory);

    for (List<HasWord> tokens : sentences) {
        if (tokens.size() <= MAX_SENTENCE_LENGTH) {
            for (TaggedWord taggedWord : tagger.tagSentence(tokens)) {
                if (taggedWord.tag().startsWith("n")) {
                    counts.incrementCount(taggedWord.word());
                }
            }
        }
    }
    return counts;
}
示例3: applyPTBTokenizer
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tokenizes the document with a PTB word tokenizer and renders each sentence as a string.
 *
 * <p>Every token is passed through {@code splitCompounds} and prefixed with a single
 * space, so each non-empty sentence string starts with a space.
 *
 * @param dp preprocessor to iterate; its tokenizer factory is replaced by this call
 * @param tokenizeNLs value written into the {@code tokenizeNLs} tokenizer option
 * @param ptb3Escaping value written into the {@code ptb3Escaping} tokenizer option
 * @return one string per sentence, in iteration order
 */
private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
    PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
    dp.setTokenizerFactory(tf);
    List<String> sentences = new ArrayList<>();
    for (List<HasWord> wordList : dp) {
        // StringBuilder avoids re-copying the whole sentence for every appended token.
        StringBuilder sentence = new StringBuilder();
        for (HasWord word : wordList) {
            sentence.append(' ').append(splitCompounds(word.word()));
        }
        sentences.add(sentence.toString());
    }
    return sentences;
}
示例4: splitSentencesINDocument
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits a document into sentences and joins each sentence's tokens with single spaces.
 *
 * @param sDoc document text to split
 * @return one trimmed, space-separated string per sentence, in iteration order
 */
public List<String> splitSentencesINDocument(String sDoc)
{
    Reader reader = new StringReader(sDoc);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    for (List<HasWord> sentence : dp)
    {
        StringBuilder sentenceSb = new StringBuilder();
        for (HasWord token : sentence)
        {
            // Separate from the previous token whenever anything has been written.
            // The former "> 1" check glued a one-character first token to the
            // second one (e.g. "I" + "am" -> "Iam").
            if (sentenceSb.length() > 0)
            {
                sentenceSb.append(" ");
            }
            sentenceSb.append(token);
        }
        sentenceList.add(sentenceSb.toString().trim());
    }
    return sentenceList;
}
示例5: parse_text
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits untokenized text into sentences and parses each one.
 *
 * <p>Side effect: reassigns {@code treePrinter} from the requested output format.
 *
 * @param text raw text; assumed not to be tokenized yet, so the default
 *     {@code DocumentPreprocessor} tokenization is used
 * @param outputFormat options forwarded to {@code ParserUtil.setOptions}
 * @return one {@code ParseTree} (rendered tree plus its score) per sentence found
 * @throws TApplicationException with type {@code INTERNAL_ERROR} if anything fails;
 *     the original exception is attached as the cause
 */
public List<ParseTree> parse_text(String text, List<String> outputFormat) throws TApplicationException
{
    List<ParseTree> results = new ArrayList<ParseTree>();
    try
    {
        treePrinter = ParserUtil.setOptions(outputFormat, tlp);
        DocumentPreprocessor preprocess = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : preprocess)
        {
            Tree parseTree = parser.apply(sentence);
            results.add(new ParseTree(ParserUtil.TreeObjectToString(parseTree, treePrinter), parseTree.score()));
        }
    }
    catch (Exception e)
    {
        // Some exceptions (e.g. NullPointerException) carry no message; fall back to
        // toString() so the reported error is never null.
        String detail = (e.getMessage() != null) ? e.getMessage() : e.toString();
        TApplicationException wrapped = new TApplicationException(TApplicationException.INTERNAL_ERROR, detail);
        // Keep the original stack trace available instead of dropping it.
        wrapped.initCause(e);
        throw wrapped;
    }
    return results;
}
示例6: tokenizeText
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits arbitrary text into sentences and returns the words of each sentence.
 *
 * @param arbitraryText text to tokenize
 * @return one list per sentence, holding the {@code word()} of every token in order
 */
public List<List<String>> tokenizeText(String arbitraryText)
{
    List<List<String>> result = new ArrayList<List<String>>();
    StringReader source = new StringReader(arbitraryText);
    for (List<HasWord> sentence : new DocumentPreprocessor(source))
    {
        List<String> words = new ArrayList<String>();
        for (HasWord token : sentence)
        {
            words.add(token.word());
        }
        result.add(words);
    }
    return result;
}
示例7: main
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags every sentence of a UTF-8 text file and prints the tagged sentences to
 * standard output, one per line.
 *
 * @param args exactly two entries: the tagger model file and the file to tag;
 *     otherwise a usage message is printed to standard error and nothing is done
 * @throws Exception if the input cannot be read or tagging fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("usage: java TaggerDemo modelFile fileToTag");
        return;
    }
    MaxentTagger tagger = new MaxentTagger(args[0]);
    TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
            "untokenizable=noneKeep");
    // try-with-resources closes the input file (previously never closed) and the
    // writer, even when tagging throws part-way through.
    try (BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
         PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
        DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
        documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
        for (List<HasWord> sentence : documentPreprocessor) {
            List<TaggedWord> tSentence = tagger.tagSentence(sentence);
            pw.println(Sentence.listToString(tSentence, false));
        }
    }
}
示例8: demoDP
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Parses every sentence of a file and prints each parse tree followed by the
 * typed dependencies returned by {@code typedDependenciesCCprocessed(true)}.
 *
 * @param lp parser applied to every sentence read from the file
 * @param filename file handed to {@code DocumentPreprocessor}, which loads,
 *     sentence-segments and tokenizes it
 */
public static void demoDP(LexicalizedParser lp, String filename) {
    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    // A tokenizer could also be created here and passed to DocumentPreprocessor.
    for (List<HasWord> sentence : new DocumentPreprocessor(filename)) {
        Tree parse = lp.apply(sentence);
        parse.pennPrint();
        System.out.println();

        GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
        // Wildcard instead of a raw Collection: the elements are only printed.
        Collection<?> tdl = gs.typedDependenciesCCprocessed(true);
        System.out.println(tdl);
        System.out.println();
    }
}
示例9: depParseSentence
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags and dependency-parses the given text.
 *
 * @param sentence text expected to contain a single sentence
 * @return tree built from the last sentence found, or {@code null} if none was found
 */
public static DepTree depParseSentence(String sentence){
    DocumentPreprocessor sentences = new DocumentPreprocessor(new StringReader(sentence));
    DepTree result = null;
    // There is only one sentence, so this loop runs just once.
    for (List<HasWord> tokens : sentences) {
        List<TaggedWord> taggedTokens = tagger.tagSentence(tokens);
        GrammaticalStructure structure = parser.predict(taggedTokens);
        result = new DepTree(taggedTokens, structure);
    }
    return result;
}
示例10: segment
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the input into sentences, rendering each one with
 * {@code SentenceUtils.listToOriginalTextString}.
 *
 * @param input text to segment
 * @return sentence strings in iteration order; empty if no sentence is found
 */
@Override
public List<String> segment(String input) {
    List<String> sentences = new ArrayList<>();
    StringReader source = new StringReader(input);
    for (List<HasWord> tokens : new DocumentPreprocessor(source)) {
        String original = SentenceUtils.listToOriginalTextString(tokens);
        sentences.add(original);
    }
    return sentences;
}
示例11: sentenceSplitter
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the input into sentences, rendering each one with {@code Sentence.listToString}.
 *
 * @param input text to split
 * @return sentence strings in iteration order
 */
public List<String> sentenceSplitter(String input){
    List<String> sentences = new ArrayList<String>();
    Reader source = new StringReader(input);
    for (List<HasWord> tokens : new DocumentPreprocessor(source)) {
        sentences.add(Sentence.listToString(tokens));
    }
    return sentences;
}
示例12: ParseTree
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Constructs a parse tree using the Stanford NLP parser. Only the first sentence
 * of the text is used, and dependency labels (tags) are omitted.
 *
 * <p>Node 0 is an artificial {@code ROOT}; node {@code i + 1} holds the word and
 * tag of token {@code i}.
 *
 * @param text input text
 * @param parser supplies the tagger and the dependency parser
 * @throws IllegalArgumentException if no sentence can be extracted from {@code text}
 */
public ParseTree(String text, NLParser parser) {
    // Pre-process the input text, keeping only the first sentence.
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    List<HasWord> sentence = null;
    for (List<HasWord> sentenceHasWord : tokenizer) {
        sentence = sentenceHasWord;
        break;
    }
    // Fail with a clear message instead of a NullPointerException further down.
    if (sentence == null) {
        throw new IllegalArgumentException("No sentence found in input text: \"" + text + "\"");
    }
    // Part-of-speech tagging.
    List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
    // Dependency syntax parsing.
    GrammaticalStructure gs = parser.parser.predict(tagged);
    // Read the parsed sentence into nodes; slot 0 is reserved for ROOT.
    int nodeCount = sentence.size() + 1;
    Node[] nodes = new Node[nodeCount];
    root = new Node(0, "ROOT", "ROOT");
    nodes[0] = root;
    for (int i = 0; i < nodeCount - 1; i++) {
        nodes[i + 1] = new Node(i + 1,
                sentence.get(i).word(), tagged.get(i).tag());
    }
    // Link every dependent to its governor; the relation label is not stored.
    for (TypedDependency typedDep : gs.allTypedDependencies()) {
        int from = typedDep.gov().index();
        int to = typedDep.dep().index();
        nodes[to].parent = nodes[from];
        nodes[from].children.add(nodes[to]);
    }
}
示例13: main
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags and dependency-parses a fixed example sentence and logs the result.
 *
 * @param args optional flag/value pairs: {@code -tagger <path>} overrides the tagger
 *     model and {@code -com.dukenlidb.nlidb.model <path>} overrides the parser model
 * @throws RuntimeException if an unknown flag is given
 * @throws IllegalArgumentException if a flag is not followed by a value
 */
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";
    for (int argIndex = 0; argIndex < args.length;) {
        String flag = args[argIndex];
        switch (flag) {
            case "-tagger":
            // NOTE(review): this flag name looks like a package rename leaked into
            // the literal (presumably "-model" originally) - confirm before changing.
            case "-com.dukenlidb.nlidb.model":
                break;
            default:
                throw new RuntimeException("Unknown argument " + flag);
        }
        // A trailing flag without a value used to fail with ArrayIndexOutOfBoundsException.
        if (argIndex + 1 >= args.length) {
            throw new IllegalArgumentException("Missing value for argument " + flag);
        }
        String value = args[argIndex + 1];
        if ("-tagger".equals(flag)) {
            taggerPath = value;
        } else {
            modelPath = value;
        }
        argIndex += 2;
    }
    String text = "Return authors who have more papers than Bob in VLDB after 2000";
    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);
        // Print typed dependencies
        log.info(gs);
    }
}
示例14: getSentences
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Splits the turn text into sentences rebuilt from each token's original text.
 *
 * <p>Sentences that are empty after processing, or that contain no ASCII letter,
 * are dropped.
 *
 * @param removeSignatures if {@code true}, each sentence is cut at the first match of
 *     every entry of {@code signaturepatternsPTB} that it contains
 * @return list of sentences representing the turn text
 */
public List<String> getSentences(boolean removeSignatures) {
    List<String> sentences = new ArrayList<>();
    DocumentPreprocessor prep = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : prep) {
        StringBuilder sb = new StringBuilder();
        for (HasWord word : sentence) {
            CoreLabel cl = (CoreLabel) word;
            sb.append(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
            sb.append(' ');
        }
        String resSentence = sb.toString().trim();
        if (removeSignatures) {
            for (String pattern : signaturepatternsPTB) {
                if (resSentence.contains(pattern)) {
                    // A limit of 2 keeps trailing empty strings, so the array always has
                    // an element 0. With the default limit, a sentence made up only of
                    // the pattern splits into an empty array and [0] throws.
                    resSentence = resSentence.split(pattern, 2)[0];
                }
            }
        }
        if (!resSentence.trim().isEmpty() && resSentence.matches(".*[a-zA-Z]+.*")) {
            sentences.add(resSentence);
        }
    }
    return sentences;
}
示例15: process
import edu.stanford.nlp.process.DocumentPreprocessor; //导入依赖的package包/类
/**
 * Tags a text and collects word, tag and lemma statistics.
 *
 * <p>Counts every tagged word, tallies occurrences per tag, and for words tagged
 * {@code NN} or {@code NNS} tallies the stemmed form in a per-tag word bag. A
 * summary line is written to standard error.
 *
 * @param etextNo identifier passed through to the result and the summary line
 * @param text source to read, tokenized with {@code tokenizerFactory}
 * @return result holding the identifier, tag counts, word bags and total word count
 */
@Override
public TaggerResult process(Integer etextNo, Reader text) {
    final DocumentPreprocessor sentences = new DocumentPreprocessor(text);
    sentences.setTokenizerFactory(tokenizerFactory);

    final Map<String,Double> tagCounts = new TreeMap<String,Double>();
    final Map<String,Map<String,Integer>> wordBags = new HashMap<>();
    int words = 0;

    for (List<HasWord> sentence : sentences) {
        for (TaggedWord word : tagger.tagSentence(sentence)) {
            words++;

            final String tag = word.tag();
            tagCounts.merge(tag, 1.0, Double::sum);

            // Only noun tags get a word bag (verb tags "VB*" are currently not included).
            final boolean tracked = "NN".equals(tag) || "NNS".equals(tag);
            if (!tracked) {
                continue;
            }

            // Base form of the word, falling back to the word's own string form.
            String lemma = morphology.stem(word).toString();
            if (lemma == null) {
                lemma = word.toString();
            }

            // Create the bag for this tag on first use, then bump the lemma's count.
            wordBags.computeIfAbsent(tag, key -> new HashMap<>())
                    .merge(lemma, 1, Integer::sum);
        }
    }

    System.err.println("Processed: " + etextNo + " " + words + " words");
    return new TaggerResult(etextNo, tagCounts, wordBags, words);
}