This article collects typical usage examples of the TokenizerFactory.create method from the Java class org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory. If you have been wondering what TokenizerFactory.create does, how to use it, or where to find usage examples, the curated code samples below may help. You can also read more about the enclosing class org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.
Twelve code examples of TokenizerFactory.create are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
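Before the individual examples, here is a minimal sketch of the basic call pattern, assuming only the stock DefaultTokenizerFactory implementation that ships with deeplearning4j (the class name TokenizerFactoryCreateSketch and the sample sentence are illustrative):

import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class TokenizerFactoryCreateSketch {
    public static void main(String[] args) {
        TokenizerFactory factory = new DefaultTokenizerFactory();
        // create(String) returns a Tokenizer over an in-memory string;
        // an overload, create(InputStream), tokenizes streamed text instead.
        Tokenizer tokenizer = factory.create("Mary had a little lamb.");
        while (tokenizer.hasMoreTokens()) {
            System.out.println(tokenizer.nextToken());
        }
    }
}

The create(InputStream) overload used by several of the tests below follows the same Tokenizer-iteration pattern.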
Example 1: testDefaultTokenizer2
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testDefaultTokenizer2() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    tokenizer2.countTokens();
    while (tokenizer.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        assertEquals(tok1, tok2);
    }

    System.out.println("-----------------------------------------------");

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
                    + Math.abs(stringCount - stringCount2));

    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Example 2: windows
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
/**
 * Constructs a list of windows of size windowSize.
 * Note that padding for each window is created as well.
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory the tokenizer factory to use
 * @param windowSize the window size to generate
 * @param vectors the word vectors used to filter out-of-vocabulary tokens
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                WordVectors vectors) {
    Tokenizer tokenizer = tokenizerFactory.create(words);
    List<String> list = new ArrayList<>();
    while (tokenizer.hasMoreTokens()) {
        String token = tokenizer.nextToken();

        // if we don't have UNK word defined - we have to skip this word
        if (vectors.getWordVectorMatrix(token) != null)
            list.add(token);
    }

    if (list.isEmpty())
        throw new IllegalStateException("No tokens found for windows");

    return windows(list, windowSize);
}
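For context, a hypothetical call site for the method above might look like the sketch below. The helper name printWindows and the window size are invented for illustration, the WordVectors model is assumed to be trained or loaded elsewhere, and the sketch assumes Window exposes its tokens via getWords():

// Hypothetical helper, not part of the original example: tokenizes a sentence
// and prints the tokens of each resulting window.
public static void printWindows(String sentence, WordVectors vectors) {
    TokenizerFactory factory = new DefaultTokenizerFactory();
    List<Window> windows = windows(sentence, factory, 5, vectors);
    for (Window window : windows) {
        System.out.println(window.getWords());
    }
}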
Example 3: testDefaultTokenizer1
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testDefaultTokenizer1() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }

    ClassPathResource resource = new ClassPathResource("reuters/5250");
    String str = FileUtils.readFileToString(resource.getFile());
    int stringCount = t.create(str).countTokens();
    int stringCount2 = t.create(resource.getInputStream()).countTokens();
    assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Example 4: testDefaultStreamTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testDefaultStreamTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    assertEquals(5, tokenizer2.countTokens());
    int cnt = 0;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer2.nextToken();
        log.info(tok1);
        cnt++;
    }
    assertEquals(5, cnt);
}
Example 5: testGetTokens
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testGetTokens() throws Exception {
    // toTokenize and expect are fixtures defined on the test class
    // (a Japanese sample sentence and its expected tokens).
    TokenizerFactory tf = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = tf.create(toTokenize);

    // Exhaust iterator.
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }

    // Ensure exhausted.
    assertEquals(false, tokenizer.hasMoreTokens());

    // Count doesn't change.
    assertEquals(expect.length, tokenizer.countTokens());

    // getTokens still returns everything.
    List<String> tokens = tokenizer.getTokens();
    assertEquals(expect.length, tokens.size());
}
Example 6: testKoreanTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testKoreanTokenizer() throws Exception {
    String toTokenize = "세계 최초의 상용 수준 오픈소스 딥러닝 라이브러리입니다";
    TokenizerFactory t = new KoreanTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    String[] expect = {"세계", "최초", "의", "상용", "수준", "오픈소스", "딥", "러닝", "라이브러리", "입니", "다"};

    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
Example 7: testChineseTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testChineseTokenizer() {
    // toTokenize and expect are fixtures defined on the test class.
    TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
    Tokenizer tokenizer = tokenizerFactory.create(toTokenize);
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
Example 8: testNGramTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testNGramTokenizer() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 1, 2);
    Tokenizer tokenizer = factory.create(toTokenize);
    Tokenizer tokenizer2 = factory.create(toTokenize);
    while (tokenizer.hasMoreTokens()) {
        assertEquals(tokenizer.nextToken(), tokenizer2.nextToken());
    }

    int stringCount = factory.create(toTokenize).countTokens();
    List<String> tokens = factory.create(toTokenize).getTokens();
    assertEquals(9, stringCount); // 5 unigrams + 4 bigrams

    assertTrue(tokens.contains("Mary"));
    assertTrue(tokens.contains("had"));
    assertTrue(tokens.contains("a"));
    assertTrue(tokens.contains("little"));
    assertTrue(tokens.contains("lamb."));
    assertTrue(tokens.contains("Mary had"));
    assertTrue(tokens.contains("had a"));
    assertTrue(tokens.contains("a little"));
    assertTrue(tokens.contains("little lamb."));

    // With minN = maxN = 2, only the bigrams remain.
    factory = new NGramTokenizerFactory(new DefaultTokenizerFactory(), 2, 2);
    tokens = factory.create(toTokenize).getTokens();
    assertEquals(4, tokens.size());
    assertTrue(tokens.contains("Mary had"));
    assertTrue(tokens.contains("had a"));
    assertTrue(tokens.contains("a little"));
    assertTrue(tokens.contains("little lamb."));
}
Example 9: testDefaultTokenizer3
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testDefaultTokenizer3() throws Exception {
    String toTokenize = "Mary had a little lamb.";
    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
    int position = 1;
    while (tokenizer2.hasMoreTokens()) {
        String tok1 = tokenizer.nextToken();
        String tok2 = tokenizer2.nextToken();
        log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
        position++;
        assertEquals(tok1, tok2);
    }
}
Example 10: testJapaneseTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testJapaneseTokenizer() throws Exception {
    // toTokenize and expect are fixtures defined on the test class.
    TokenizerFactory t = new JapaneseTokenizerFactory();
    Tokenizer tokenizer = t.create(toTokenize);
    assertEquals(expect.length, tokenizer.countTokens());
    for (int i = 0; i < tokenizer.countTokens(); ++i) {
        assertEquals(tokenizer.nextToken(), expect[i]);
    }
}
Example 11: testBaseForm
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testBaseForm() throws Exception {
    // toTokenize and baseString are fixtures defined on the test class.
    // With the base-form flag set, inflected words are reduced to their dictionary form.
    TokenizerFactory tf = new JapaneseTokenizerFactory(true);

    Tokenizer tokenizer1 = tf.create(toTokenize);
    Tokenizer tokenizer2 = tf.create(baseString);

    assertEquals("黒い", tokenizer1.nextToken());
    assertEquals("驚く", tokenizer2.nextToken());
}
Example 12: testDocumentIterator
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class this method depends on
@Test
public void testDocumentIterator() throws Exception {
    ClassPathResource reuters5250 = new ClassPathResource("/reuters/5250");
    File f = reuters5250.getFile();

    DocumentIterator iter = new FileDocumentIterator(f.getAbsolutePath());

    InputStream doc = iter.nextDocument();

    TokenizerFactory t = new DefaultTokenizerFactory();
    Tokenizer next = t.create(doc);
    String[] list = "PEARSON CONCENTRATES ON FOUR SECTORS".split(" ");
    int count = 0;
    while (next.hasMoreTokens() && count < list.length) {
        String token = next.nextToken();
        assertEquals(list[count++], token);
    }

    doc.close();
}