当前位置: 首页>>代码示例>>Java>>正文


Java TokenizerFactory.create方法代码示例

本文整理汇总了Java中org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.create方法的典型用法代码示例。如果您正苦于以下问题:Java TokenizerFactory.create方法的具体用法?Java TokenizerFactory.create怎么用?Java TokenizerFactory.create使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory的用法示例。


在下文中一共展示了TokenizerFactory.create方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testDefaultTokenizer2

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Compares string-based and stream-based tokenization from the default factory.
 *
 * First the same sentence is tokenized from a String and from an InputStream and the
 * tokens are compared pairwise; then the token counts of a classpath resource, read as a
 * String and as a stream, must differ by less than two.
 */
@Test
public void testDefaultTokenizer2() throws Exception {
    final String toTokenize = "Mary had a little lamb.";
    final TokenizerFactory t = new DefaultTokenizerFactory();
    final Tokenizer fromString = t.create(toTokenize);
    final Tokenizer fromStream = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    // The count is requested before iterating; its value is intentionally not used here.
    fromStream.countTokens();
    while (fromString.hasMoreTokens()) {
        // The string tokenizer drives the loop; the stream tokenizer is read in lock-step.
        assertEquals(fromString.nextToken(), fromStream.nextToken());
    }


    System.out.println("-----------------------------------------------");

    final ClassPathResource resource = new ClassPathResource("reuters/5250");
    final String str = FileUtils.readFileToString(resource.getFile());
    final int stringCount = t.create(str).countTokens();
    final int stringCount2 = t.create(resource.getInputStream()).countTokens();
    final int difference = Math.abs(stringCount - stringCount2);

    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
                    + difference);

    assertTrue(difference < 2);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:27,代码来源:DefaulTokenizerTests.java

示例2: windows

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes the given text and builds windows of size {@code windowSize} from the tokens
 * that have a word vector. The window construction itself (including the padding mentioned
 * in the original documentation) is delegated to {@code windows(list, windowSize)}.
 *
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors word vectors used to filter tokens; a token for which
 *                {@code getWordVectorMatrix} returns {@code null} is dropped
 * @return the list of windows for the tokenized string
 * @throws IllegalStateException if no token is left after filtering
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    List<String> known = new ArrayList<>();
    for (Tokenizer tokenizer = tokenizerFactory.create(words); tokenizer.hasMoreTokens();) {
        String candidate = tokenizer.nextToken();

        // A token without a vector cannot be used, so it is skipped.
        if (vectors.getWordVectorMatrix(candidate) == null) {
            continue;
        }
        known.add(candidate);
    }

    if (!known.isEmpty()) {
        return windows(known, windowSize);
    }
    throw new IllegalStateException("No tokens found for windows");
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:26,代码来源:Windows.java

示例3: testDefaultTokenizer1

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes one sentence from a String and from an InputStream, logging and comparing
 * each token pair, then checks that the token counts of a classpath resource read both
 * ways differ by less than two.
 */
@Test
public void testDefaultTokenizer1() throws Exception {
    final String toTokenize = "Mary had a little lamb.";
    final TokenizerFactory t = new DefaultTokenizerFactory();
    final Tokenizer fromString = t.create(toTokenize);
    final Tokenizer fromStream = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    // The stream tokenizer drives the loop; position is 1-based and only used for logging.
    for (int position = 1; fromStream.hasMoreTokens(); position++) {
        final String tok1 = fromString.nextToken();
        final String tok2 = fromStream.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        assertEquals(tok1, tok2);
    }


    final ClassPathResource resource = new ClassPathResource("reuters/5250");
    final String str = FileUtils.readFileToString(resource.getFile());
    final int stringCount = t.create(str).countTokens();
    final int stringCount2 = t.create(resource.getInputStream()).countTokens();
    final int difference = Math.abs(stringCount - stringCount2);
    assertTrue(difference < 2);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:DefaulTokenizerTests.java

示例4: testDefaultStreamTokenizer

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Checks that a stream-backed tokenizer reports five tokens for the sample sentence and
 * that iterating afterwards also yields five tokens.
 */
@Test
public void testDefaultStreamTokenizer() throws Exception {
    final String toTokenize = "Mary had a little lamb.";
    final TokenizerFactory t = new DefaultTokenizerFactory();
    final Tokenizer streamTokenizer = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    assertEquals(5, streamTokenizer.countTokens());

    // Walk the tokens after counting, logging each one and tallying how many were seen.
    int cnt = 0;
    for (; streamTokenizer.hasMoreTokens(); cnt++) {
        log.info(streamTokenizer.nextToken());
    }

    assertEquals(5, cnt);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:18,代码来源:DefaulTokenizerTests.java

示例5: testGetTokens

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Checks that a Japanese tokenizer can be exhausted with {@code nextToken()} while
 * {@code countTokens()} and {@code getTokens()} keep reporting the full token set.
 *
 * Uses the {@code toTokenize} and {@code expect} fixtures declared elsewhere in the class.
 */
@Test
public void testGetTokens() throws Exception {
    TokenizerFactory tf = new JapaneseTokenizerFactory();

    Tokenizer tokenizer = tf.create(toTokenize);

    // Exhaust iterator.
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so a failure message labels the two values correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }

    // Ensure exhausted.
    assertEquals(false, tokenizer.hasMoreTokens());

    // Count doesn't change.
    assertEquals(expect.length, tokenizer.countTokens());

    // getTokens still returns everything.
    List<String> tokens = tokenizer.getTokens();
    assertEquals(expect.length, tokens.size());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:JapaneseTokenizerTest.java

示例6: testKoreanTokenizer

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes a Korean sentence and checks both the token count and each token, in order,
 * against the expected segmentation.
 */
@Test
public void testKoreanTokenizer() throws Exception {
    String toTokenize = "세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다";
    TokenizerFactory t = new KoreanTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    String[] expect = {"세계", "최초", "의", "상용", "수준", "오픈소스", "딥", "러닝", "라이브러리", "입니", "다"};

    assertEquals(expect.length, tokenizer.countTokens());

    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so a failure message labels the two values correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:14,代码来源:KoreanTokenizerTest.java

示例7: testChineseTokenizer

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes the class-level {@code toTokenize} fixture with the Chinese tokenizer and
 * checks the token count and each token, in order, against the {@code expect} fixture.
 */
@Test
public void testChineseTokenizer() {
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so a failure message labels the two values correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:10,代码来源:ChineseTokenizerTest.java

示例8: testNGramTokenizer

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Exercises the n-gram tokenizer factory wrapped around the default factory.
 *
 * With n in [1, 2] the sample sentence must produce nine tokens (five unigrams and four
 * bigrams); with n fixed at 2 it must produce exactly the four bigrams. Two tokenizers
 * created from the same input must also yield the same token sequence.
 */
@Test
public void testNGramTokenizer() throws Exception {
    final String toTokenize = "Mary had a little lamb.";
    final String[] unigrams = {"Mary", "had", "a", "little", "lamb."};
    final String[] bigrams = {"Mary had", "had a", "a little", "little lamb."};

    TokenizerFactory factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 1, 2);
    final Tokenizer first = factory.create(toTokenize);
    final Tokenizer second = factory.create(toTokenize);
    while (first.hasMoreTokens()) {
        assertEquals(first.nextToken(), second.nextToken());
    }

    final int stringCount = factory.create(toTokenize).countTokens();
    List<String> tokens = factory.create(toTokenize).getTokens();
    assertEquals(9, stringCount);

    // Unigrams are checked first, then bigrams.
    for (String unigram : unigrams) {
        assertTrue(tokens.contains(unigram));
    }
    for (String bigram : bigrams) {
        assertTrue(tokens.contains(bigram));
    }

    // Restricting n to exactly 2 must leave only the bigrams.
    factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 2, 2);
    tokens = factory.create(toTokenize).getTokens();
    assertEquals(4, tokens.size());

    for (String bigram : bigrams) {
        assertTrue(tokens.contains(bigram));
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:34,代码来源:NGramTokenizerTest.java

示例9: testDefaultTokenizer3

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes one sentence from a String and from an InputStream and verifies, token by
 * token, that both tokenizers agree; each pair is logged with its 1-based position.
 */
@Test
public void testDefaultTokenizer3() throws Exception {
    final String toTokenize = "Mary had a little lamb.";
    final TokenizerFactory t = new DefaultTokenizerFactory();
    final Tokenizer fromString = t.create(toTokenize);
    final Tokenizer fromStream = t.create(new ByteArrayInputStream(toTokenize.getBytes()));

    // The stream tokenizer drives the loop; the string tokenizer is read in lock-step.
    for (int position = 1; fromStream.hasMoreTokens(); position++) {
        final String tok1 = fromString.nextToken();
        final String tok2 = fromStream.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        assertEquals(tok1, tok2);
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:16,代码来源:DefaulTokenizerTests.java

示例10: testJapaneseTokenizer

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Tokenizes the class-level {@code toTokenize} fixture with the Japanese tokenizer and
 * checks the token count and each token, in order, against the {@code expect} fixture.
 */
@Test
public void testJapaneseTokenizer() throws Exception {
    TokenizerFactory t = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);

    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        // Expected value goes first so a failure message labels the two values correctly.
        assertEquals(expect[i], tokenizer.nextToken());
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:11,代码来源:JapaneseTokenizerTest.java

示例11: testBaseForm

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Checks the first token produced for the {@code toTokenize} and {@code baseString}
 * fixtures when the Japanese factory is constructed with {@code true}.
 *
 * NOTE(review): the test name suggests the flag enables base-form output; confirm
 * against the JapaneseTokenizerFactory constructor.
 */
@Test
public void testBaseForm() throws Exception {
    final TokenizerFactory factory = new JapaneseTokenizerFactory(true);

    // Both tokenizers are created before either one is read.
    final Tokenizer forSample = factory.create(toTokenize);
    final Tokenizer forBaseString = factory.create(baseString);

    final String firstOfSample = forSample.nextToken();
    assertEquals("黒い", firstOfSample);

    final String firstOfBaseString = forBaseString.nextToken();
    assertEquals("驚く", firstOfBaseString);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:11,代码来源:JapaneseTokenizerTest.java

示例12: testDocumentIterator

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
/**
 * Reads the first document from a file-backed document iterator and checks that its
 * leading tokens are "PEARSON CONCENTRATES ON FOUR SECTORS".
 *
 * The document stream is opened in try-with-resources so that it is closed even when an
 * assertion fails, and the number of compared tokens is verified so the test cannot pass
 * vacuously when the tokenizer yields fewer tokens than expected.
 */
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());
    TokenizerFactory t = new DefaultTokenizerFactory();
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");

    try (InputStream doc = iter.nextDocument()) {
        Tokenizer next = t.create(doc);

        // Only the first list.length tokens are compared; the rest of the document is ignored.
        int count = 0;
        while (next.hasMoreTokens() && count < list.length) {
            String token = next.nextToken();
            assertEquals(list[count++], token);
        }

        // Every expected word must have been matched.
        assertEquals(list.length, count);
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:26,代码来源:DefaultDocumentIteratorTest.java


注:本文中的org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.create方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。