This article collects typical usage examples of the Java method edu.stanford.nlp.process.DocumentPreprocessor.setTokenizerFactory. If you are wondering how DocumentPreprocessor.setTokenizerFactory is used in practice, or what it is for, the curated code examples below may help. You can also read more about the enclosing class, edu.stanford.nlp.process.DocumentPreprocessor.
The following 10 code examples of DocumentPreprocessor.setTokenizerFactory are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
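Before the collected examples, here is a minimal, self-contained sketch of the pattern they all share: wrap a Reader in a DocumentPreprocessor, install a tokenizer factory via setTokenizerFactory, and iterate over sentences. The class name, input string, and factory options here are illustrative assumptions, not taken from the examples below.

import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.DocumentPreprocessor;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

public class SetTokenizerFactoryDemo {
  public static void main(String[] args) {
    // Build a PTB tokenizer factory; invertible=true keeps character offsets.
    TokenizerFactory<CoreLabel> factory =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");
    DocumentPreprocessor dp = new DocumentPreprocessor(
        new StringReader("Hello world. This is a second sentence."));
    dp.setTokenizerFactory(factory);
    // The preprocessor yields one List<HasWord> per detected sentence.
    for (List<HasWord> sentence : dp) {
      System.out.println(sentence);
    }
  }
}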
Example 1: getIDFMapForDocument

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

/**
 * Get an IDF map for the given document string.
 *
 * @param document the raw document text
 * @return a counter mapping noun tokens to their counts in the document
 */
private static Counter<String> getIDFMapForDocument(String document) {
  // Clean up -- remove some Gigaword patterns that slow things down
  // and don't help anything
  document = headingSeparator.matcher(document).replaceAll("");

  DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
  preprocessor.setTokenizerFactory(tokenizerFactory);

  Counter<String> idfMap = new ClassicCounter<String>();
  for (List<HasWord> sentence : preprocessor) {
    if (sentence.size() > MAX_SENTENCE_LENGTH)
      continue;

    List<TaggedWord> tagged = tagger.tagSentence(sentence);

    for (TaggedWord w : tagged) {
      if (w.tag().startsWith("n"))
        idfMap.incrementCount(w.word());
    }
  }
  return idfMap;
}
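Example 1 assumes several static members that the snippet does not show: headingSeparator, tokenizerFactory, tagger, and MAX_SENTENCE_LENGTH. The declarations below are a hypothetical reconstruction for the enclosing class; the heading pattern and the model path in particular are guesses, not the original values.

// Hypothetical members -- the names come from Example 1, the values are assumed.
private static final Pattern headingSeparator =
    Pattern.compile("<HEADLINE>.*?</HEADLINE>", Pattern.DOTALL); // assumed Gigaword markup
private static final int MAX_SENTENCE_LENGTH = 100;              // assumed length cutoff
private static final TokenizerFactory<Word> tokenizerFactory =
    PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("");
private static final MaxentTagger tagger =
    new MaxentTagger("models/english-left3words-distsim.tagger"); // assumed model path

(The enclosing class would import java.util.regex.Pattern, edu.stanford.nlp.ling.Word, edu.stanford.nlp.process.PTBTokenizer, edu.stanford.nlp.process.TokenizerFactory, and edu.stanford.nlp.tagger.maxent.MaxentTagger.)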
Example 2: applyPTBTokenizer

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

private static List<String> applyPTBTokenizer(DocumentPreprocessor dp, boolean tokenizeNLs, boolean ptb3Escaping) {
  PTBTokenizerFactory<Word> tf = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(
      "tokenizeNLs=" + tokenizeNLs + ",ptb3Escaping=" + ptb3Escaping + ",asciiQuotes=true");
  dp.setTokenizerFactory(tf);
  List<String> sentences = new ArrayList<>();
  for (List<HasWord> wordList : dp) {
    String sentence = "";
    for (HasWord word : wordList) {
      sentence += " " + splitCompounds(word.word());
    }
    sentences.add(sentence);
  }
  return sentences;
}
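Note that splitCompounds is a helper defined elsewhere in the source class. A hypothetical call site for this method might look like the following; the input string is illustrative only.

// Hypothetical usage: treat newlines as sentence breaks, skip PTB escaping.
String text = "He saw the house.\nThen he left.";
DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
List<String> sentences = applyPTBTokenizer(dp, true, false); // tokenizeNLs=true, ptb3Escaping=false

As a design note, accumulating each sentence with a StringBuilder rather than String concatenation would avoid quadratic copying on long sentences, though the example is shown as published.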
Example 3: main

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
      "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }
  pw.close();
}
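To try this demo, pass a trained tagger model and a text file on the command line, for example: java TaggerDemo models/english-left3words-distsim.tagger sample.txt. The model name here is just the English model that ships with the Stanford POS Tagger distribution; adjust the path to your installation.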
Example 4: process

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

@Override
public TaggerResult process(Integer etextNo, Reader text) {
  final DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(text);
  documentPreprocessor.setTokenizerFactory(tokenizerFactory);

  int words = 0;
  final Map<String,Double> tagCounts = new TreeMap<String,Double>();
  final Map<String,Map<String,Integer>> wordBags = new HashMap<>();
  for (List<HasWord> sentence : documentPreprocessor) {
    for (TaggedWord word : tagger.tagSentence(sentence)) {
      // word count
      words++;

      // tag counts
      final String tag = word.tag();
      tagCounts.put(tag, tagCounts.getOrDefault(tag, 0.0) + 1.0);

      // noun/verb word bags
      if ("NN".equals(tag) || "NNS".equals(tag) /* || tag.startsWith("VB") */) {
        // get base form of word; check for null before calling toString() to avoid an NPE
        final Object stemmed = morphology.stem(word);
        final String lemma = (stemmed != null) ? stemmed.toString() : word.toString();

        // get bag for words of this POS
        Map<String,Integer> wordBag = wordBags.get(tag);
        if (wordBag == null) {
          wordBag = new HashMap<>();
          wordBags.put(tag, wordBag);
        }
        // increment count
        wordBag.put(lemma, wordBag.getOrDefault(lemma, 0) + 1);
      }
    }
  }
  System.err.println("Processed: " + etextNo + " " + words + " words");
  return new TaggerResult(etextNo, tagCounts, wordBags, words);
}
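Example 4 relies on instance fields (tokenizerFactory, tagger, morphology) and a TaggerResult class that are not part of the snippet. Plausible field declarations, with the tokenizer options and the model path as pure assumptions, could be:

// Hypothetical fields -- the names come from Example 4, the initializers are assumed.
private final TokenizerFactory<Word> tokenizerFactory =
    PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory("untokenizable=noneKeep");
private final MaxentTagger tagger =
    new MaxentTagger("models/english-left3words-distsim.tagger"); // assumed model path
private final Morphology morphology = new Morphology();           // edu.stanford.nlp.process.Morphology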
Example 5: getSentencesFromText

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  StringReader reader = new StringReader(str);
  DocumentPreprocessor dp = new DocumentPreprocessor(reader);
  TokenizerFactory factory = null;
  if (invertible) {
    factory = PTBTokenizer.factory(true, true);
    if (options != null && options.length() > 0)
      options = "invertible=true, " + options;
    else
      options = "invertible=true";
  } else {
    factory = PTBTokenizer.factory();
  }
  // System.out.println("Setting splitter options=" + options);
  factory.setOptions(options);
  dp.setTokenizerFactory(factory);

  Iterator<List<HasWord>> iter = dp.iterator();
  while (iter.hasNext()) {
    List<HasWord> sentence = iter.next();
    sentences.add(sentence);
  }
  return sentences;
}
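A hypothetical call, asking for invertible tokens (which keep enough information to reconstruct the original text), might be:

// Hypothetical usage -- the input string is illustrative.
List<List<HasWord>> sents =
    getSentencesFromText("Dr. Smith arrived. He left early.", true, null);
System.out.println(sents.size()); // typically prints 2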
Example 6: main

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
      "untokenizable=noneKeep");
  BufferedReader r = new BufferedReader(new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
  PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"));
  DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
  documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
  for (List<HasWord> sentence : documentPreprocessor) {
    List<TaggedWord> tSentence = tagger.tagSentence(sentence);
    pw.println(Sentence.listToString(tSentence, false));
  }

  // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
  List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
  List<TaggedWord> taggedSent = tagger.tagSentence(sent);
  for (TaggedWord tw : taggedSent) {
    if (tw.tag().startsWith("JJ")) {
      pw.println(tw.word());
    }
  }
  pw.close();
}
Example 7: getSentences1_old

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

public static List<String> getSentences1_old(String text, Set<String> entities) {
  text = text.trim();
  text = StringEscapeUtils.escapeHtml(text);
  text = text.replaceAll("http:.*…\\z", "");
  String[] toMatch = {"\\ART\\s*@\\S+", "\\AMT\\s*@\\S+"};
  for (String t : toMatch) {
    Pattern pattern = Pattern.compile(t, Pattern.CASE_INSENSITIVE);
    String newTweet = text.trim();
    text = "";
    while (!newTweet.equals(text)) { // each pass cuts off one "RT @XXX" or "#XXX"; it may take a few passes to remove all of them
      text = newTweet;
      Matcher matcher = pattern.matcher(text);
      newTweet = matcher.replaceAll("");
      newTweet = newTweet.trim();
    }
  }
  text = text.replaceAll("-\\s*\\z", "");
  text = text.replaceAll("…\\z", "");
  text = StringEscapeUtils.unescapeHtml(text);
  text = text.trim();

  String[] parts = text.split(Extractor.urlRegExp);
  List<String> sentences = new ArrayList<String>();
  // for (int i = 0; i < parts.length; i++) {
  int limit = 10;
  if (limit > parts.length)
    limit = parts.length;
  for (int i = 0; i < limit; i++) {
    // parts[i] = text.replace("http://*…", "");
    String text_cleaned = extractor.cleanText(parts[i]);
    // List<String> sentences_tmp = new ArrayList<String>();
    Reader reader = new StringReader(text_cleaned);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("ptb3Escaping=false,untokenizable=noneDelete"));
    // prop.setProperty("tokenizerOptions", "untokenizable=noneDelete");
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
      StringBuilder sentenceSb = new StringBuilder();
      List<HasWord> sentence = it.next();
      boolean last_keep = false;
      for (HasWord token : sentence) {
        if ((!token.word().matches("[,:!.;?)]")) && (!token.word().contains("'")) && !last_keep) {
          sentenceSb.append(" ");
        }
        last_keep = false;
        if (token.word().matches("[(\\[]"))
          last_keep = true;
        String next_word = token.toString();
        if ((next_word.toUpperCase().equals(next_word)) && (!next_word.equals("I")) && (!entities.contains(next_word)))
          next_word = next_word.toLowerCase();
        if (next_word.equals("i"))
          next_word = "I";
        sentenceSb.append(next_word);
      }
      String new_sentence = sentenceSb.toString().trim();
      Character fc = new_sentence.charAt(0);
      new_sentence = fc.toString().toUpperCase() + new_sentence.substring(1);
      if (new_sentence.endsWith(":"))
        text = text.substring(0, text.length() - 3) + ".";
      sentences.add(new_sentence);
    }
    // sentences.addAll(sentences_tmp);
  }
  return sentences;
}
Example 8: getSentences1

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

public static List<String> getSentences1(String text, Set<String> entities) {
  // System.out.println(" Text as it is : " + text);
  text = TrendsLabeler.getCleanedTitleMR(text);
  String[] parts = text.split(Extractor.urlRegExp);
  List<String> sentences = new ArrayList<String>();
  // for (int i = 0; i < parts.length; i++) {
  int limit = 10;
  if (limit > parts.length)
    limit = parts.length;
  for (int i = 0; i < limit; i++) {
    String text_cleaned = extr.cleanText(parts[i]);
    // List<String> sentences_tmp = new ArrayList<String>();
    Reader reader = new StringReader(text_cleaned);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    dp.setTokenizerFactory(PTBTokenizerFactory
        .newWordTokenizerFactory("ptb3Escaping=false, untokenizable=noneDelete"));
    // dp.setTokenizerFactory(PTBTokenizerFactory.newWordTokenizerFactory("untokenizable=noneDelete"));
    Iterator<List<HasWord>> it = dp.iterator();
    while (it.hasNext()) {
      StringBuilder sentenceSb = new StringBuilder();
      List<HasWord> sentence = it.next();
      boolean last_keep = false;
      for (HasWord token : sentence) {
        if ((!token.word().matches("[,:!.;?)]"))
            && (!token.word().contains("'")) && !last_keep) {
          sentenceSb.append(" ");
        }
        last_keep = false;
        if (token.word().matches("[(\\[]"))
          last_keep = true;
        String next_word = token.toString();
        if ((next_word.toUpperCase().equals(next_word))
            && (!next_word.equals("I"))
            && (!entities.contains(next_word)))
          next_word = next_word.toLowerCase();
        if (next_word.equals("i"))
          next_word = "I";
        sentenceSb.append(next_word);
      }
      String new_sentence = sentenceSb.toString().trim();
      Character fc = new_sentence.charAt(0);
      new_sentence = fc.toString().toUpperCase()
          + new_sentence.substring(1);
      if (new_sentence.endsWith(":"))
        text = text.substring(0, text.length() - 3) + ".";
      sentences.add(new_sentence);
    }
    // sentences.addAll(sentences_tmp);
  }
  return sentences;
}
Example 9: nearestDelimiter

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

/**
 * Finds the nearest delimiter starting from index start. If <tt>seekDir</tt>
 * is SEEK_FORWARD, finds the nearest delimiter after start. Else, if it is
 * SEEK_BACK, finds the nearest delimiter before start.
 */
private int nearestDelimiter(String text, int start, int seekDir) {
  if (seekDir != SEEK_BACK && seekDir != SEEK_FORWARD) {
    throw new IllegalArgumentException("Unknown seek direction " + seekDir);
  }
  StringReader reader = new StringReader(text);
  DocumentPreprocessor processor = new DocumentPreprocessor(reader);
  TokenizerFactory tf = tlp.getTokenizerFactory();
  processor.setTokenizerFactory(tf);

  List<Integer> boundaries = new ArrayList<Integer>();
  for (List<HasWord> sentence : processor) {
    if (sentence.size() == 0)
      continue;
    if (!(sentence.get(0) instanceof HasOffset)) {
      throw new ClassCastException("Expected HasOffsets from the " +
          "DocumentPreprocessor");
    }
    if (boundaries.size() == 0) {
      boundaries.add(0);
    } else {
      HasOffset first = (HasOffset) sentence.get(0);
      boundaries.add(first.beginPosition());
    }
  }
  boundaries.add(text.length());

  for (int i = 0; i < boundaries.size() - 1; ++i) {
    if (boundaries.get(i) <= start && start < boundaries.get(i + 1)) {
      if (seekDir == SEEK_BACK) {
        return boundaries.get(i) - 1;
      } else if (seekDir == SEEK_FORWARD) {
        return boundaries.get(i + 1) - 1;
      }
    }
  }
  // The cursor position at the end is actually one past the text length.
  // We might as well highlight the last interval in that case.
  if (boundaries.size() >= 2 && start >= text.length()) {
    if (seekDir == SEEK_BACK) {
      return boundaries.get(boundaries.size() - 2) - 1;
    } else if (seekDir == SEEK_FORWARD) {
      return boundaries.get(boundaries.size() - 1) - 1;
    }
  }
  return -1;
}
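This method only works when the tokens carry character offsets, which is what the HasOffset check above enforces. Whether they do depends on the tokenizer factory that tlp.getTokenizerFactory() returns; a factory that is known to produce offset-carrying tokens, offered here as an illustration rather than as what tlp actually returns, would be:

// CoreLabel implements HasOffset; invertible=true preserves character offsets.
TokenizerFactory<CoreLabel> tf =
    PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true");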
Example 10: saveOutput

import edu.stanford.nlp.process.DocumentPreprocessor; // import the package/class this method depends on

/**
 * Saves the results of applying the parser to the current text to
 * the specified filename.
 */
public void saveOutput(String filename) {
  if (filename == null || filename.equals("")) {
    return;
  }
  String text = textPane.getText();
  StringReader reader = new StringReader(text);
  DocumentPreprocessor processor = new DocumentPreprocessor(reader);
  TokenizerFactory tf = tlp.getTokenizerFactory();
  processor.setTokenizerFactory(tf);
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  for (List<HasWord> sentence : processor) {
    sentences.add(sentence);
  }

  JProgressBar progress = new JProgressBar(0, sentences.size());
  JButton cancel = new javax.swing.JButton();
  JDialog dialog = new JDialog(new Frame(), "Parser Progress", true);
  dialog.setSize(300, 150);
  dialog.add(BorderLayout.NORTH,
      new JLabel("Parsing " + sentences.size() + " sentences"));
  dialog.add(BorderLayout.CENTER, progress);
  dialog.add(BorderLayout.SOUTH, cancel);
  // dialog.add(progress);

  final SaveOutputThread thread =
      new SaveOutputThread(filename, progress, dialog, cancel, sentences);

  cancel.setText("Cancel");
  cancel.setToolTipText("Cancel");
  cancel.addActionListener(new java.awt.event.ActionListener() {
    public void actionPerformed(java.awt.event.ActionEvent evt) {
      thread.cancelled = true;
    }
  });

  thread.start();
  dialog.setVisible(true);
}