This page collects representative usage examples of the Java method org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.writeWord2VecModel. If you are wondering what WordVectorSerializer.writeWord2VecModel does, how to call it, or what it looks like in practice, the curated examples below should help. You can also explore the enclosing class, org.deeplearning4j.models.embeddings.loader.WordVectorSerializer, for further usage examples.
Six code examples of WordVectorSerializer.writeWord2VecModel are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code examples.
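Before diving into the examples, here is a minimal sketch of the save/load round trip they all build on. Only the two WordVectorSerializer calls are the API under discussion; `vec` is assumed to be an already-trained Word2Vec instance and the file name is an arbitrary placeholder:

import java.io.File;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

// Persist the full model (vocabulary, weights and configuration together),
// then restore it; the restored model can be queried or trained further.
File modelFile = new File("w2v-model.bin"); // hypothetical path
WordVectorSerializer.writeWord2VecModel(vec, modelFile);
Word2Vec restored = WordVectorSerializer.readWord2VecModel(modelFile);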
Example 1: testW2VnegativeOnRestore
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
@Test
public void testW2VnegativeOnRestore() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1)
            .windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());

    log.info("Fit 1");
    vec.fit();

    File tmpFile = File.createTempFile("temp", "file");
    tmpFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tmpFile);

    iter.reset();

    Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);

    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());

    log.info("Fit 2");
    restoredVec.fit();

    iter.reset();

    restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);

    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);

    log.info("Fit 3");
    restoredVec.fit();
}
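Note the second argument to readWord2VecModel in this test: judging by the assertions, passing true restores the extended model (the configured FlatModelUtils and the parallel-tokenization flag survive the round trip), while passing false loads a lighter representation that falls back to the default BasicModelUtils. Either way, the restored model can be fitted again.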
Example 2: testUnknown1
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
@Test
public void testUnknown1() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
            .unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    assertTrue(vec.hasWord("PEWPEW"));
    assertTrue(vec.getVocab().containsWord("PEWPEW"));

    INDArray unk = vec.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unk);

    File tempFile = File.createTempFile("temp", "file");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    log.info("Original configuration: {}", vec.getConfiguration());

    Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);

    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));

    INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");
    assertEquals(unk, unk_restored);

    // now we look up some junk word; it should resolve to the unknown element
    INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");

    log.info("Restored configuration: {}", restored.getConfiguration());

    assertEquals(unk, random);
    assertEquals(unk, randomRestored);
}
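Because the model was built with useUnknown(true) and unknownElement(new VocabWord(1.0, "PEWPEW")), any out-of-vocabulary lookup returns the "PEWPEW" vector, and the final assertions confirm that this behavior survives the writeWord2VecModel/readWord2VecModel round trip.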
Example 3: testOutputStream
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
            .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
            .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
            .vocabCache(cache).seed(42)
            // .workers(6)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());

    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // Plain-text word vectors written to an OutputStream...
    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));
    // ...can be read back with loadTxtVectors and should match exactly.
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();

    // Smoke test: the full model should also round-trip through writeWord2VecModel.
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
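This test exercises two different serialization paths: writeWordVectors emits plain-text vectors that loadTxtVectors can read back, while writeWord2VecModel persists the full model for readWord2VecModel. The first path is verified to preserve the vectors exactly (day1 equals day2); the second is only smoke-tested here.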
Example 4: main
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    // a 66.6 MB sample extracted from the roughly 13 GB Wikipedia dataset
    String filePath = new ClassPathResource("data/raw_sentences.txt").getFile().getAbsolutePath();

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    // CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    // So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    // Additionally, it forces lower case for all tokens.
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = Word2VecTrainer.w2vBuilder(iter, t);

    log.info("Fitting Word2Vec model....");
    vec.fit();
    log.info("done...");

    // Write word vectors to file
    log.info("Writing word vectors to file....");
    WordVectorSerializer.writeWord2VecModel(vec, "src/main/resources/word2vec_dl4j_model.bin");
    log.info("done...");

    // Load word vectors back into a Word2Vec instance
    log.info("Load word vectors from file");
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel("src/main/resources/word2vec_dl4j_model.bin");

    log.info("Testing result:");
    Collection<String> lst = w2v.wordsNearest("man", 10);
    log.info("Closest Words--10 Words closest to \"man\": " + lst);

    double cosSim = w2v.similarity("man", "woman");
    log.info("Cosine Similarity between \"man\" and \"woman\": " + cosSim);
}
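Word2VecTrainer.w2vBuilder is a project-specific helper that is not shown on this page. Based on the builder calls used in the other examples here, it plausibly looks something like the following sketch (an assumption for illustration, not the actual implementation):

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class Word2VecTrainer {
    // Hypothetical helper: centralizes the Word2Vec configuration in one place.
    public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
        return new Word2Vec.Builder()
                .minWordFrequency(5)   // ignore very rare tokens
                .iterations(1)
                .layerSize(100)        // vector dimensionality
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();
    }
}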
Example 5: main
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
    // Get the path to the text file
    String filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
     * CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
     * So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
     * Additionally, it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(iter)
            .tokenizerFactory(t)
            .learningRate(0.025)
            .minLearningRate(1e-3)
            //.negativeSample(10)
            .build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    // Write the model to file
    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWord2VecModel(vec, "src/main/resources/W2VModel.txt");

    // Load the model back into a Word2Vec instance
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel("src/main/resources/W2VModel.txt");

    // Print the 10 words closest to "day" -- an example of what to do with these word vectors.
    log.info("Closest Words:");
    //Collection<String> lst = vec.wordsNearest("day", 10);
    Collection<String> lst = w2v.wordsNearest("day", 10);
    System.out.println("10 Words closest to 'day': " + lst);

    double cosSim = w2v.similarity("day", "night");
    System.out.println(cosSim);
}
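One caveat worth noting: despite the .txt extension used here, writeWord2VecModel does not write a plain text file; in recent DL4J versions it writes a compressed archive containing the weights, vocabulary and configuration. The extension is cosmetic, and readWord2VecModel detects the format when loading.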
Example 6: saveModel
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class the method depends on
public void saveModel(Word2Vec model, Language language) {
    String dir = "./src/main/resources/word2vec";
    String path = (learningAlgorithm instanceof SkipGram)
            ? dir + "/" + language.getName() + "_model.txt"
            : dir + "/" + language.getName() + "_model_" + learningAlgorithm.getCodeName() + ".txt";
    WordVectorSerializer.writeWord2VecModel(model, path);
}
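Language and the learningAlgorithm field belong to the enclosing class, which this page does not show: SkipGram models are saved as "<language>_model.txt", while other algorithms get their code name appended. A minimal sketch of how the helper might be invoked, with those missing pieces assumed (hypothetical names, for illustration only):

// Hypothetical usage; Language is assumed to be an enum exposing getName().
Word2Vec model = new Word2Vec.Builder()
        .elementsLearningAlgorithm(new SkipGram<VocabWord>())
        .iterate(iter)
        .tokenizerFactory(t)
        .build();
model.fit();
saveModel(model, Language.ENGLISH); // e.g. ./src/main/resources/word2vec/English_model.txt for SkipGram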