本文整理汇总了Java中edu.stanford.nlp.process.TokenizerFactory.setOptions方法的典型用法代码示例。如果您正苦于以下问题:Java TokenizerFactory.setOptions方法的具体用法?Java TokenizerFactory.setOptions怎么用?Java TokenizerFactory.setOptions使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类edu.stanford.nlp.process.TokenizerFactory的用法示例。
在下文中一共展示了TokenizerFactory.setOptions方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: getSentencesFromText
import edu.stanford.nlp.process.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Splits a piece of text into sentences using a {@code DocumentPreprocessor}
 * driven by a {@code PTBTokenizer} factory.
 *
 * @param str the text to split into sentences
 * @param invertible if true, the factory is created with
 *     {@code PTBTokenizer.factory(true, true)} and {@code invertible=true} is
 *     prepended to the option string; otherwise the default
 *     {@code PTBTokenizer.factory()} is used
 * @param options tokenizer option string passed to the factory; may be
 *     {@code null} or empty
 * @return the sentences, in the order the preprocessor produced them
 */
public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  StringReader reader = new StringReader(str);
  DocumentPreprocessor dp = new DocumentPreprocessor(reader);

  final TokenizerFactory<? extends HasWord> factory;
  final String effectiveOptions;
  if (invertible) {
    factory = PTBTokenizer.factory(true, true);
    // Keep the invertible flag in the option string that is handed to the factory below.
    if (options != null && options.length() > 0) {
      effectiveOptions = "invertible=true, " + options;
    } else {
      effectiveOptions = "invertible=true";
    }
  } else {
    factory = PTBTokenizer.factory();
    effectiveOptions = options;
  }

  // Only override the factory's options when the caller actually supplied some.
  // NOTE(review): whether setOptions tolerates null is not visible here, so null is never passed.
  if (effectiveOptions != null) {
    factory.setOptions(effectiveOptions);
  }
  dp.setTokenizerFactory(factory);

  for (List<HasWord> sentence : dp) {
    sentences.add(sentence);
  }
  return sentences;
}
示例2: atbFactory
import edu.stanford.nlp.process.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Creates a new Arabic tokenizer factory and applies every property name
 * found in {@code atbOptions} to it, one {@code setOptions} call per name.
 * <p>
 * NOTE(review): {@code atbOptions} is defined outside this snippet; presumably
 * it holds the ATB tokenization settings. Confirm against its declaration.
 *
 * @return the configured tokenizer factory
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  final TokenizerFactory<CoreLabel> configuredFactory = ArabicTokenizerFactory.newTokenizerFactory();
  for (final String optionName : atbOptions.stringPropertyNames()) {
    configuredFactory.setOptions(optionName);
  }
  return configuredFactory;
}
示例3: main
import edu.stanford.nlp.process.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Command-line entry point. Reads text from standard input (decoded as UTF-8),
 * tokenizes it, and writes the tokens to standard output separated by single
 * spaces, starting a new output line at every newline token. A summary with
 * the number of lines and tokens is printed to standard error at the end; the
 * token count includes the newline tokens.
 * <p>
 * A fast, rule-based tokenizer for Modern Standard Arabic (UTF-8 encoding).
 * Performs punctuation splitting and light tokenization by default.
 * Orthographic normalization options are available, and can be enabled with
 * command line options.
 * </p>
 * <p>
 * Currently, this tokenizer does not do line splitting. It normalizes non-printing
 * line separators across platforms and prints the system default line splitter
 * to the output.
 * </p>
 * <p>
 * The following normalization options are provided:
 * </p>
 * <ul>
 * <li><code>useUTF8Ellipsis</code> : Replaces sequences of three or more full stops with \u2026</li>
 * <li><code>normArDigits</code> : Convert Arabic digits to ASCII equivalents</li>
 * <li><code>normArPunc</code> : Convert Arabic punctuation to ASCII equivalents</li>
 * <li><code>normAlif</code> : Change all alif forms to bare alif</li>
 * <li><code>normYa</code> : Map ya to alif maqsura</li>
 * <li><code>removeDiacritics</code> : Strip all diacritics</li>
 * <li><code>removeTatweel</code> : Strip tatweel elongation character</li>
 * <li><code>removeQuranChars</code> : Remove diacritics that appear in the Quran</li>
 * <li><code>removeProMarker</code> : Remove the ATB null pronoun marker</li>
 * <li><code>removeSegMarker</code> : Remove the ATB clitic segmentation marker</li>
 * <li><code>removeMorphMarker</code> : Remove the ATB morpheme boundary markers</li>
 * <li><code>atbEscaping</code> : Replace left/right parentheses with ATB escape characters</li>
 * </ul>
 *
 * @param args command-line options; if the first argument contains "help" the
 *     usage message is printed and the JVM exits. Otherwise the arguments are
 *     parsed into properties and every property name is passed to the
 *     tokenizer factory as an option. The {@code atb} option selects the
 *     factory returned by {@code atbFactory()}.
 * @throws IllegalStateException if the UTF-8 input reader cannot be created
 */
public static void main(String[] args) {
  if (args.length > 0 && args[0].contains("help")) {
    System.err.printf("Usage: java %s [OPTIONS] < file%n", ArabicTokenizer.class.getName());
    System.err.printf("%nOptions:%n");
    System.err.println(" -help : Print this message. See javadocs for all normalization options.");
    System.err.println(" -atb : Tokenization for the parsing experiments in Green and Manning (2010)");
    System.exit(-1);
  }

  // Process normalization options
  final Properties tokenizerOptions = StringUtils.argsToProperties(args);
  final TokenizerFactory<CoreLabel> tf = tokenizerOptions.containsKey("atb") ?
      ArabicTokenizer.atbFactory() : ArabicTokenizer.factory();
  for (String option : tokenizerOptions.stringPropertyNames()) {
    tf.setOptions(option);
  }

  // Replace line separators with a token so that we can
  // count lines
  tf.setOptions("tokenizeNLs");

  // Read the file
  int nLines = 0;
  int nTokens = 0;
  final String encoding = "UTF-8";
  try {
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new InputStreamReader(System.in, encoding));
    boolean printSpace = false;
    while (tokenizer.hasNext()) {
      // Counted before the newline check, so newline tokens are included in nTokens.
      ++nTokens;
      String word = tokenizer.next().word();
      if (word.equals(ArabicLexer.NEWLINE_TOKEN)) {
        ++nLines;
        printSpace = false;
        System.out.println();
      } else {
        if (printSpace) {
          System.out.print(" ");
        }
        System.out.print(word);
        printSpace = true;
      }
    }
  } catch (UnsupportedEncodingException e) {
    // Fail loudly instead of printing a trace and then reporting "Done!" with zero counts.
    throw new IllegalStateException("Unsupported input encoding: " + encoding, e);
  }
  System.err.printf("Done! Tokenized %d lines (%d tokens)%n", nLines, nTokens);
}