This article collects typical usage examples of the Java class org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory. If you are wondering what TokenizerFactory does, how to use it, or want to see it in real code, the curated examples below may help.
TokenizerFactory belongs to the package org.deeplearning4j.text.tokenization.tokenizerfactory. Fifteen code examples of the class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
Example 1: testFindNamesFromText
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt");
log.info("load is right!");
TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory();
//tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer());
//Generates a word-vector from the dataset stored in resources folder
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42)
.learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build();
vec.fit();
WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt"));
//Next step: train a model that extracts person names from news articles (.txt files), using the word vectors generated above
// WordVectors wordVectors;
//Then test whether the model can find names in previously unseen text
}
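The vectors written above can be read back for the follow-up name-extraction step sketched in the comments; a minimal sketch, assuming a DL4J version that provides WordVectorSerializer.readWord2VecModel (the query word is an arbitrary illustration, not part of the original test):
Word2Vec loaded = WordVectorSerializer.readWord2VecModel(
        new File("src/test/resources/chineseNameWordVector.txt"));
// Nearest neighbours of an arbitrary query word from the training corpus
Collection<String> similar = loaded.wordsNearest("张伟", 5);
System.out.println(similar);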
Example 2: testWord2VecPlot
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
.layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
.tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
Thread.sleep(10000000000L);
fail("Not implemented");
}
Example 3: main
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
public static void main(String[] args) throws Exception {
// Gets Path to Text file
String filePath = "c:/raw_sentences.txt";
log.info("Load & Vectorize Sentences....");
// Strip white space before and after for each line
SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache();
WeightLookupTable table = new InMemoryLookupTable.Builder()
.vectorLength(100)
.useAdaGrad(false)
.cache(cache)
.lr(0.025f).build();
log.info("Building model....");
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(5).iterations(1)
.layerSize(100).lookupTable(table)
.stopWords(new ArrayList<String>())
.vocabCache(cache).seed(42)
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
log.info("Fitting Word2Vec model....");
vec.fit();
log.info("Writing word vectors to text file....");
// Write word vectors
WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");
log.info("Closest Words:");
Collection<String> lst = vec.wordsNearest("man", 5);
System.out.println(lst);
double cosSim = vec.similarity("cruise", "voyage");
System.out.println(cosSim);
}
Example 4: w2vBuilder
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
return new Word2Vec.Builder()
.seed(12345)
.iterate(iter)
.tokenizerFactory(t)
.batchSize(1000)
.allowParallelTokenization(true) // enable parallel tokenization
.epochs(1) // number of epochs (iterations over whole training corpus) for training
.iterations(3) // number of iterations done for each mini-batch during training
.elementsLearningAlgorithm(new SkipGram<>()) // use SkipGram Model. If CBOW: new CBOW<>()
.minWordFrequency(50) // discard words that appear fewer times than this value
.windowSize(5) // set max skip length between words
.learningRate(0.05) // the starting learning rate
.minLearningRate(5e-4) // the learning rate will not drop below this threshold
.negativeSample(10) // number of negative examples
// set threshold for occurrence of words. Those that appear with higher frequency will be
// randomly down-sampled
.sampling(1e-5)
.useHierarchicSoftmax(true) // use hierarchical softmax
.layerSize(300) // size of word vectors
.workers(8) // number of threads
.build();
}
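A minimal usage sketch for the builder above; the corpus path and the query word are illustrative assumptions, and all classes used here already appear in the other examples:
SentenceIterator iter = new BasicLineIterator("raw_sentences.txt"); // assumed corpus path
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = w2vBuilder(iter, t);
vec.fit(); // train the model on the corpus
System.out.println(vec.wordsNearest("day", 10)); // illustrative sanity check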
Example 5: RnnTextEmbeddingDataSetIterator
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
/**
* @param data Instances with documents and labels
* @param wordVectors WordVectors object
* @param tokenFact Tokenizer factory
* @param tpp Token pre processor
* @param stopWords Stop word object
* @param sentenceProvider Labeled sentence provider
* @param batchSize Size of each minibatch for training
* @param truncateLength If documents exceed this length, they are truncated to this many tokens
*/
public RnnTextEmbeddingDataSetIterator(
Instances data,
WordVectors wordVectors,
TokenizerFactory tokenFact,
TokenPreProcess tpp,
AbstractStopwords stopWords,
LabeledSentenceProvider sentenceProvider,
int batchSize,
int truncateLength) {
this.batchSize = batchSize;
this.vectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
this.data = data;
this.wordVectors = wordVectors;
this.truncateLength = truncateLength;
this.tokenizerFactory = tokenFact;
this.tokenizerFactory.setTokenPreProcessor(tpp);
this.stopWords = stopWords;
this.sentenceProvider = sentenceProvider;
}
Example 6: testWord2VecMultiEpoch
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testWord2VecMultiEpoch() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5).epochs(3)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
}
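This test and Example 7 reference an inputFile field that is defined elsewhere in the test class; a minimal sketch of such a fixture, assuming the same raw_sentences corpus used in Example 2:
private File inputFile;

@Before
public void setUp() throws Exception {
    // Assumed fixture: the raw_sentences.txt corpus from the test resources
    inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
}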
Example 7: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
long time1 = System.currentTimeMillis();
Word2Vec vec = WordVectorSerializer.readWord2VecModel(
new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
long time2 = System.currentTimeMillis();
log.info("Model loaded in {} msec", time2 - time1);
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
vec.setTokenizerFactory(t);
vec.setSentenceIterator(iter);
vec.getConfiguration().setUseHierarchicSoftmax(false);
vec.getConfiguration().setNegative(5.0);
vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
vec.fit();
}
Example 8: before
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Before
public void before() throws Exception {
if (vec == null) {
ClassPathResource resource = new ClassPathResource("/labeled/");
File file = resource.getFile();
SentenceIterator iter = UimaSentenceIterator.createWithPath(file.getAbsolutePath());
new File("cache.ser").delete();
TokenizerFactory t = new UimaTokenizerFactory();
vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100)
.stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter)
.tokenizerFactory(t).build();
vec.fit();
}
}
Example 9: getTokenizerFactory
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
protected static TokenizerFactory getTokenizerFactory(VectorsConfiguration configuration) {
if (configuration == null)
return null;
if (configuration != null && configuration.getTokenizerFactory() != null
&& !configuration.getTokenizerFactory().isEmpty()) {
try {
TokenizerFactory factory =
(TokenizerFactory) Class.forName(configuration.getTokenizerFactory()).newInstance();
if (configuration.getTokenPreProcessor() != null && !configuration.getTokenPreProcessor().isEmpty()) {
TokenPreProcess preProcessor =
(TokenPreProcess) Class.forName(configuration.getTokenPreProcessor()).newInstance();
factory.setTokenPreProcessor(preProcessor);
}
return factory;
} catch (Exception e) {
log.error("Can't instantiate saved TokenizerFactory: {}", configuration.getTokenizerFactory());
}
}
return null;
}
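A usage sketch for the helper above, assuming VectorsConfiguration stores the tokenizer factory and token pre-processor as fully qualified class names (the concrete classes chosen here are illustrative):
VectorsConfiguration configuration = new VectorsConfiguration();
configuration.setTokenizerFactory(DefaultTokenizerFactory.class.getCanonicalName());
configuration.setTokenPreProcessor(CommonPreprocessor.class.getCanonicalName());
TokenizerFactory factory = getTokenizerFactory(configuration);
Tokenizer tokenizer = factory.create("Mary had a little lamb."); // factory now applies the configured pre-processor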
Example 10: windows
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
/**
* Constructs a list of windows of size windowSize over the tokenized input.
* Note that padding for each window is created as well.
* @param words the words to tokenize and construct windows from
* @param tokenizerFactory tokenizer factory to use
* @param windowSize the window size to generate
* @param vectors word vectors used to skip tokens that are not in the vocabulary
* @return the list of windows for the tokenized string
*/
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
WordVectors vectors) {
Tokenizer tokenizer = tokenizerFactory.create(words);
List<String> list = new ArrayList<>();
while (tokenizer.hasMoreTokens()) {
String token = tokenizer.nextToken();
// if we don't have UNK word defined - we have to skip this word
if (vectors.getWordVectorMatrix(token) != null)
list.add(token);
}
if (list.isEmpty())
throw new IllegalStateException("No tokens found for windows");
return windows(list, windowSize);
}
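A usage sketch for windows(), assuming vec is a previously trained Word2Vec model (see the examples above); the sentence and window size are illustrative:
TokenizerFactory tf = new DefaultTokenizerFactory();
List<Window> contexts = windows("the quick brown fox jumps over the lazy dog", tf, 5, vec);
for (Window w : contexts) {
    System.out.println(w); // each window holds a token and its padded context
}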
Example 11: testGoogleModelForInference
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
ParagraphVectors pv =
new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
.trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
.negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();
INDArray vec1 = pv.inferVector("This text is pretty awesome");
INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
Example 12: testDefaultTokenizer1
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testDefaultTokenizer1() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer = t.create(toTokenize);
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
int position = 1;
while (tokenizer2.hasMoreTokens()) {
String tok1 = tokenizer.nextToken();
String tok2 = tokenizer2.nextToken();
log.info("Position: [" + position + "], token1: '" + tok1 + "', token 2: '" + tok2 + "'");
position++;
assertEquals(tok1, tok2);
}
ClassPathResource resource = new ClassPathResource("reuters/5250");
String str = FileUtils.readFileToString(resource.getFile());
int stringCount = t.create(str).countTokens();
int stringCount2 = t.create(resource.getInputStream()).countTokens();
assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Example 13: testDefaultTokenizer2
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testDefaultTokenizer2() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer = t.create(toTokenize);
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
tokenizer2.countTokens();
while (tokenizer.hasMoreTokens()) {
String tok1 = tokenizer.nextToken();
String tok2 = tokenizer2.nextToken();
assertEquals(tok1, tok2);
}
System.out.println("-----------------------------------------------");
ClassPathResource resource = new ClassPathResource("reuters/5250");
String str = FileUtils.readFileToString(resource.getFile());
int stringCount = t.create(str).countTokens();
int stringCount2 = t.create(resource.getInputStream()).countTokens();
log.info("String tok: [" + stringCount + "], Stream tok: [" + stringCount2 + "], Difference: "
+ Math.abs(stringCount - stringCount2));
assertTrue(Math.abs(stringCount - stringCount2) < 2);
}
Example 14: testDefaultStreamTokenizer
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testDefaultStreamTokenizer() throws Exception {
String toTokenize = "Mary had a little lamb.";
TokenizerFactory t = new DefaultTokenizerFactory();
Tokenizer tokenizer2 = t.create(new ByteArrayInputStream(toTokenize.getBytes()));
assertEquals(5, tokenizer2.countTokens());
int cnt = 0;
while (tokenizer2.hasMoreTokens()) {
String tok1 = tokenizer2.nextToken();
log.info(tok1);
cnt++;
}
assertEquals(5, cnt);
}
Example 15: testGetTokens
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the required package/class
@Test
public void testGetTokens() throws Exception {
TokenizerFactory tf = new JapaneseTokenizerFactory();
Tokenizer tokenizer = tf.create(toTokenize);
// Exhaust iterator.
assertEquals(expect.length, tokenizer.countTokens());
for (int i = 0; i < tokenizer.countTokens(); ++i) {
assertEquals(tokenizer.nextToken(), expect[i]);
}
// Ensure exhausted.
assertEquals(false, tokenizer.hasMoreTokens());
// Count doesn't change.
assertEquals(expect.length, tokenizer.countTokens());
// getTokens still returns everything.
List<String> tokens = tokenizer.getTokens();
assertEquals(expect.length, tokens.size());
}
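The test references toTokenize and expect fields defined elsewhere in the test class; a minimal sketch of such a fixture (the sentence and its expected segmentation are assumptions for illustration, not the original test data):
// Hypothetical fixture: a Japanese sentence and its expected tokenization
private String toTokenize = "黒い瞳の綺麗な女の子";
private String[] expect = {"黒い", "瞳", "の", "綺麗", "な", "女の子"};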