当前位置: 首页>>代码示例>>Java>>正文


Java TokenizerFactory类代码示例

本文整理汇总了Java中edu.stanford.nlp.process.TokenizerFactory的典型用法代码示例。如果您正苦于以下问题:Java TokenizerFactory类的具体用法?Java TokenizerFactory怎么用?Java TokenizerFactory使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


TokenizerFactory类属于edu.stanford.nlp.process包,在下文中一共展示了TokenizerFactory类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: init

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Initializes this reader/writer from the given flags, building a
 * CoreLabel tokenizer factory either reflectively (when
 * {@code flags.tokenizerFactory} names a factory class) or from the
 * default PTB tokenizer factory.
 */
public void init(SeqClassifierFlags flags) {
  // Base options; any caller-supplied tokenizer options are appended.
  String tokenizerOptions = "tokenizeNLs=false,invertible=true";
  if (flags.tokenizerOptions != null) {
    tokenizerOptions = tokenizerOptions + "," + flags.tokenizerOptions;
  }
  TokenizerFactory<IN> tokenizerFactory;
  if (flags.tokenizerFactory == null) {
    // Default path: standard PTB tokenizer producing CoreLabels.
    tokenizerFactory = ErasureUtils.uncheckedCast(
        PTBTokenizer.PTBTokenizerFactory.newCoreLabelTokenizerFactory(tokenizerOptions));
  } else {
    // Load the named class and invoke its static
    // newCoreLabelTokenizerFactory(String) method via reflection.
    try {
      Class<TokenizerFactory<? extends HasWord>> factoryClass =
          ErasureUtils.uncheckedCast(Class.forName(flags.tokenizerFactory));
      Method creator = factoryClass.getMethod("newCoreLabelTokenizerFactory", String.class);
      tokenizerFactory = ErasureUtils.uncheckedCast(creator.invoke(null, tokenizerOptions));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  init(flags, tokenizerFactory);
}
 
开发者ID:paulirwin,项目名称:Stanford.NER.Net,代码行数:20,代码来源:PlainTextDocumentReaderAndWriter.java

示例2: runTagger

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Runs the tagger on the text supplied by {@code reader}, writing
 * tagged output to {@code writer}.
 *
 * It takes input from the given <code>reader</code>, applies the
 * tagger to it one sentence at a time (sentences determined by a
 * DocumentPreprocessor configured from this tagger's TaggerConfig),
 * and writes the output to the given <code>writer</code>.
 *
 * <code>tagInside</code> makes the tagger run in XML mode: if set to
 * non-empty, instead of processing the document as one large text
 * blob, each region in between occurrences of the given tag is
 * treated as a separate text blob.
 */
public void runTagger(BufferedReader reader, BufferedWriter writer,
                      String tagInside, OutputStyle outputStyle)
  throws IOException
{
  // "newline" is a symbolic name for the literal line separator.
  String sentenceDelimiter = config.getSentenceDelimiter();
  if ("newline".equals(sentenceDelimiter)) {
    sentenceDelimiter = "\n";
  }
  final TokenizerFactory<? extends HasWord> tokenizerFactory = chooseTokenizerFactory();

  // All input flows through the document preprocessor.
  final DocumentPreprocessor docProcessor;
  if (tagInside.isEmpty()) {
    // Plain-text mode: split on the configured sentence delimiter.
    docProcessor = new DocumentPreprocessor(reader);
    docProcessor.setSentenceDelimiter(sentenceDelimiter);
  } else {
    // XML mode: each region inside tagInside is a separate blob.
    docProcessor = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
    docProcessor.setElementDelimiter(tagInside);
  }
  docProcessor.setTokenizerFactory(tokenizerFactory);

  runTagger(docProcessor, writer, outputStyle);
}
 
开发者ID:benblamey,项目名称:stanford-nlp,代码行数:40,代码来源:MaxentTagger.java

示例3: writeImage

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Parses {@code sentence} with the English PCFG model and renders the
 * resulting parse tree as an image written to {@code outFile}.
 *
 * If the parser model cannot be loaded, an error is printed to stderr
 * and the method returns without writing anything.
 */
public static void writeImage(String sentence, String outFile, int scale) throws Exception {

    final LexicalizedParser parser;
    try {
        parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        return;
    }

    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});

    // Tokenize the sentence into CoreLabels, parse, then render.
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens =
            tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    writeImage(parser.apply(tokens), outFile, scale);
}
 
开发者ID:awaisathar,项目名称:dependensee,代码行数:19,代码来源:Main.java

示例4: testWriteImage

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Test of writeImage method, of class Main: parses a sample sentence,
 * writes its collapsed typed dependencies to image.png, and asserts
 * the file exists afterwards.
 */

@Test
public void testWriteImage() throws Exception {
    final String text = "A quick brown fox jumped over the lazy dog.";

    TreebankLanguagePack languagePack = new PennTreebankLanguagePack();
    GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();

    LexicalizedParser parser = LexicalizedParser.loadModel();
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});

    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();

    Tree parse = parser.apply(tokens);
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(parse);
    Collection<TypedDependency> dependencies = structure.typedDependenciesCollapsed();

    Main.writeImage(dependencies, "image.png", 3);
    assert (new File("image.png").exists());
}
 
开发者ID:awaisathar,项目名称:dependensee,代码行数:21,代码来源:MainTest.java

示例5: main

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Tags the contents of a file with a MaxentTagger model and prints the
 * tagged sentences, one per line, to standard output.
 *
 * usage: java TaggerDemo modelFile fileToTag
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  // untokenizable=noneKeep: keep characters the tokenizer cannot handle.
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
  						   "untokenizable=noneKeep");
  // Fix: the original never closed the reader and leaked the writer if
  // tagging threw; try-with-resources closes both deterministically
  // (closing pw flushed/closed stdout before, too, so behavior matches).
  try (BufferedReader r = new BufferedReader(
           new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
       PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }
  }
}
 
开发者ID:jaimeguzman,项目名称:data_mining,代码行数:19,代码来源:TaggerDemo2.java

示例6: getResult

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Tokenizes {@code sentence} with the default PTB tokenizer.
 *
 * @param sentence the text to tokenize; must be non-null and non-empty
 * @return the tokens of the sentence, in order
 * @throws InvalidInputException if {@code sentence} is null or empty
 */
public List<Word> getResult(String sentence) throws InvalidInputException {
    if (sentence == null || sentence.length() == 0)
        throw new InvalidInputException();

    // Fix: the original assigned null and immediately null-checked the
    // same local — the check was always true, so the factory is now
    // created directly.
    TokenizerFactory<Word> tf = PTBTokenizer.factory();

    return tf.getTokenizer(new StringReader(sentence)).tokenize();
}
 
开发者ID:cipriancus,项目名称:FakeTwitterDetection,代码行数:13,代码来源:Tokenizer.java

示例7: getSentencesFromText

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Splits raw text into tokenized sentences using a PTB tokenizer.
 *
 * @param str        the raw text to split into sentences
 * @param invertible if true, use an invertible tokenizer (the original
 *                   text offsets are recoverable from the tokens)
 * @param options    extra tokenizer options; may be null or empty
 * @return one List of tokens per sentence
 */
public static List<List<HasWord>> getSentencesFromText(String str, boolean invertible, String options) {
    List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
    StringReader reader = new StringReader(str);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    // Fix: bounded wildcard instead of the original raw TokenizerFactory
    // type (both branches produce a factory of some HasWord subtype).
    TokenizerFactory<? extends HasWord> factory = null;

    if( invertible ) {
      factory = PTBTokenizer.factory(true, true);
      if( options != null && options.length() > 0 ) 
        options = "invertible=true, " + options;
      else 
        options = "invertible=true";
    } else {
      factory = PTBTokenizer.factory();
    }

//    System.out.println("Setting splitter options=" + options);
    factory.setOptions(options);
    dp.setTokenizerFactory(factory);

    // DocumentPreprocessor is Iterable, so the manual Iterator loop is
    // replaced with a for-each; iteration order is unchanged.
    for (List<HasWord> sentence : dp) {
      sentences.add(sentence);
    }
    return sentences;
  }
 
开发者ID:nchambers,项目名称:probschemas,代码行数:29,代码来源:Ling.java

示例8: main

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Tags the contents of a file with a MaxentTagger model and prints the
 * tagged sentences to standard output, then demonstrates extracting
 * the adjectives (JJ* tags) from one additional hard-coded sentence.
 *
 * usage: java TaggerDemo2 modelFile fileToTag
 */
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println("usage: java TaggerDemo2 modelFile fileToTag");
    return;
  }
  MaxentTagger tagger = new MaxentTagger(args[0]);
  // untokenizable=noneKeep: keep characters the tokenizer cannot handle.
  TokenizerFactory<CoreLabel> ptbTokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(),
  						   "untokenizable=noneKeep");
  // Fix: the original never closed the reader and leaked the writer if
  // tagging threw; try-with-resources closes both deterministically
  // (closing pw flushed/closed stdout before, too, so behavior matches).
  try (BufferedReader r = new BufferedReader(
           new InputStreamReader(new FileInputStream(args[1]), "utf-8"));
       PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "utf-8"))) {
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(r);
    documentPreprocessor.setTokenizerFactory(ptbTokenizerFactory);
    for (List<HasWord> sentence : documentPreprocessor) {
      List<TaggedWord> tSentence = tagger.tagSentence(sentence);
      pw.println(Sentence.listToString(tSentence, false));
    }

    // print the adjectives in one more sentence. This shows how to get at words and tags in a tagged sentence.
    List<HasWord> sent = Sentence.toWordList("The", "slimy", "slug", "crawled", "over", "the", "long", ",", "green", "grass", ".");
    List<TaggedWord> taggedSent = tagger.tagSentence(sent);
    for (TaggedWord tw : taggedSent) {
      if (tw.tag().startsWith("JJ")) {
        pw.println(tw.word());
      }
    }
  }
}
 
开发者ID:tudarmstadt-lt,项目名称:sentiment,代码行数:29,代码来源:TaggerDemo2.java

示例9: demoAPI

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * demoAPI demonstrates other ways of calling the parser with already
 * tokenized text, or in some cases, raw text that needs to be tokenized as
 * a single sentence. Output is handled with a TreePrint object. Note that
 * the options used when creating the TreePrint can determine what results
 * to print out. Once again, one can capture the output by passing a
 * PrintWriter to TreePrint.printTree.
 * 
 * difference: already tokenized text
 * 
 * 
 */
/**
 * demoAPI demonstrates other ways of calling the parser: with already
 * tokenized text, and with raw text tokenized explicitly as a single
 * sentence. Output is handled with a TreePrint object; the options
 * used when creating the TreePrint determine what results to print.
 * Output can be captured by passing a PrintWriter to
 * TreePrint.printTree.
 */
public static void demoAPI(LexicalizedParser lp) {
	// 1) Parse a list of correctly tokenized words.
	String[] sent = { "This", "is", "an", "easy", "sentence", "." };
	Tree parse = lp.apply(Sentence.toCoreLabelList(sent));
	parse.pennPrint();
	System.out.println();

	// 2) Tokenize raw text with an explicit tokenizer, then parse.
	String sent2 = "Hey @Apple, pretty much all your products are amazing. You blow minds every time you launch a new gizmo."
			+ " that said, your hold music is crap";
	TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer.factory(
			new CoreLabelTokenFactory(), "");
	List<CoreLabel> rawWords2 =
			tokenizerFactory.getTokenizer(new StringReader(sent2)).tokenize();
	parse = lp.apply(rawWords2);

	// Extract CC-processed typed dependencies from the parse.
	TreebankLanguagePack tlp = new PennTreebankLanguagePack();
	GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
	GrammaticalStructure gs = gsf.newGrammaticalStructure(parse);
	System.out.println(gs.typedDependenciesCCprocessed());
	System.out.println();

	// A TreePrint object can also print trees and dependencies.
	TreePrint tp = new TreePrint("penn,typedDependenciesCollapsed");
	tp.printTree(parse);
}
 
开发者ID:opinion-extraction-propagation,项目名称:TASC-Tuples,代码行数:42,代码来源:ParserDemo.java

示例10: getTokenizerFactory

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Returns the tokenizer factory configured on this language pack,
 * falling back to the superclass default when none has been set.
 */
@Override
public TokenizerFactory<? extends HasWord> getTokenizerFactory() {
  // Prefer the locally-set factory; otherwise defer to the superclass.
  return (tf != null) ? tf : super.getTokenizerFactory();
}
 
开发者ID:paulirwin,项目名称:Stanford.NER.Net,代码行数:9,代码来源:ChineseTreebankLanguagePack.java

示例11: atbFactory

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Builds an ATB tokenizer factory, applying each of the preset
 * atbOptions property names to the newly created factory.
 */
public static TokenizerFactory<CoreLabel> atbFactory() {
  TokenizerFactory<CoreLabel> factory = ArabicTokenizerFactory.newTokenizerFactory();
  // Each property name in atbOptions is passed individually as an option.
  for (String optionName : atbOptions.stringPropertyNames()) {
    factory.setOptions(optionName);
  }
  return factory;
}
 
开发者ID:benblamey,项目名称:stanford-nlp,代码行数:8,代码来源:ArabicTokenizer.java

示例12: chooseTokenizerFactory

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Figures out what tokenizer factory might be described by the
 * config.  If it's described by name in the config, uses reflection
 * to get the factory (which may cause an exception, of course...)
 *
 * @return the tokenizer factory selected by this tagger's config
 */
protected TokenizerFactory<? extends HasWord> chooseTokenizerFactory() {
  // Thin wrapper: delegates to the static overload with the four
  // tokenizer-related values from this tagger's config.
  return chooseTokenizerFactory(config.getTokenize(),
                                config.getTokenizerFactory(),
                                config.getTokenizerOptions(),
                                config.getTokenizerInvertible());
}
 
开发者ID:benblamey,项目名称:stanford-nlp,代码行数:12,代码来源:MaxentTagger.java

示例13: tokenizeText

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Reads data from r, tokenizes it with the given tokenizer, and
 * returns a List of Lists of (extends) HasWord objects, which can then
 * be fed into tagSentence.
 *
 * @param r Reader where untokenized text is read
 * @param tokenizerFactory Tokenizer.  This can be <code>null</code> in which case
 *     the default English tokenizer (PTBTokenizerFactory) is used.
 * @return List of tokenized sentences
 */
public static List<List<HasWord>> tokenizeText(Reader r,
               TokenizerFactory<? extends HasWord> tokenizerFactory) {
  DocumentPreprocessor preprocessor = new DocumentPreprocessor(r);
  // A null factory leaves the preprocessor's default tokenizer in place.
  if (tokenizerFactory != null) {
    preprocessor.setTokenizerFactory(tokenizerFactory);
  }
  List<List<HasWord>> sentences = new ArrayList<List<HasWord>>();
  for (List<HasWord> sentence : preprocessor) {
    sentences.add(sentence);
  }
  return sentences;
}
 
开发者ID:benblamey,项目名称:stanford-nlp,代码行数:23,代码来源:MaxentTagger.java

示例14: getGraph

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Parses {@code sentence} with the English PCFG model and converts the
 * parse tree plus its typed dependencies into a Graph.
 *
 * NOTE(review): {@code gsf} is not declared in this method — it is
 * presumably a static GrammaticalStructureFactory field initialized
 * elsewhere in this class; verify before reuse.
 */
public static Graph getGraph(String sentence) throws Exception {
    LexicalizedParser parser = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    parser.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens =
            tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parseTree = parser.apply(tokens);
    Collection<TypedDependency> dependencies =
            gsf.newGrammaticalStructure(parseTree).typedDependencies();
    return getGraph(parseTree, dependencies);
}
 
开发者ID:awaisathar,项目名称:dependensee,代码行数:12,代码来源:Main.java

示例15: initialValue

import edu.stanford.nlp.process.TokenizerFactory; //导入依赖的package包/类
/**
 * Supplies the initial per-thread PTB tokenizer factory for this
 * ThreadLocal holder.
 */
@Override
protected TokenizerFactory<CoreLabel> initialValue() {
    // factory(false, true) — presumably (tokenizeNLs, invertible);
    // confirm against the PTBTokenizer.factory(boolean, boolean) API.
    return PTBTokenizer.factory(false, true);
}
 
开发者ID:marcusklang,项目名称:langforia,代码行数:5,代码来源:StanfordPTBTokenizer.java


注:本文中的edu.stanford.nlp.process.TokenizerFactory类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。