This article collects typical usage examples of the Java method org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.readWord2VecModel. If you are wondering what WordVectorSerializer.readWord2VecModel does, how to call it, or where to find real-world examples, the curated code samples below may help. You can also explore further usage of its declaring class, org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.
The 15 code examples of WordVectorSerializer.readWord2VecModel shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Java code samples.
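Before the examples, here is a minimal, self-contained sketch of the two overloads the examples rely on: readWord2VecModel(File) and readWord2VecModel(File, boolean). The model path below is a placeholder, not a file shipped with the library:
import java.io.File;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;

public class ReadWord2VecModelSketch {
    public static void main(String[] args) {
        // "path/to/vectors.bin" is a placeholder; the unified loader accepts
        // Google-style binary models, plain-text vectors, and dl4j model archives.
        File modelFile = new File("path/to/vectors.bin");

        // One-argument form: load the model with default settings.
        Word2Vec vectors = WordVectorSerializer.readWord2VecModel(modelFile);
        System.out.println(vectors.similarity("day", "night"));

        // Two-argument form: true additionally restores the trainable state
        // (syn1/negative tables) when the file contains it, so the model can
        // be fitted further; false yields a lighter, inference-only model.
        Word2Vec trainable = WordVectorSerializer.readWord2VecModel(modelFile, true);
        System.out.println(trainable.getConfiguration());
    }
}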
Example 1: testWord2VecGoogleModelUptraining
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
    vec.fit();
}
Example 2: testUnifiedLoaderArchive1
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();
    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);
    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");
    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
Example 3: testUnifiedLoaderArchive2
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();
    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);
    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");
    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
    assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example 4: testUnifiedLoaderText
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
/**
 * This method tests CSV file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);
    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");
    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
    // we're requesting the EXTENDED model, but the file doesn't have syn1/huffman info,
    // so it should be silently degraded to the simplified model
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example 5: main
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
public static void main(String[] args) throws FileNotFoundException {
    // download GoogleNews-vectors-negative300.bin.gz first
    // load Google News vectors for measurements
    log.info("load word2vec model");
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel(
                    new File("/Users/zhanghao/Documents/Files/GoogleNews-vectors-negative300.bin"));
    log.info("done.");
    log.info("Semantic Property Task...");
    // 1. TOEFL test
    log.info("|********************load TOEFL data********************|");
    List<Word2VecTOEFLTest.TFLNode> tflList = loadTOEFLData();
    log.info("run the test");
    TOEFLTest(tflList, w2v);
    log.info("|*************************done.*************************|");
    // 2. Analogy test -- "king - queen = man - woman"
    log.info("|*******************load Syn_Sem data*******************|");
    Map<String, List<Word2VecAnalogyTest.SynSemNode>> anaMap = loadSynSemData();
    log.info("run the test");
    AnalogyTest(anaMap, w2v);
    log.info("|*************************done.*************************|");
    // 3. WS353 test
    log.info("|********************load WS353 data********************|");
    LinkedList<Word2VecWS353Test.WS353Node> wsList = loadWS353Data("ws/ws353.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListRel = loadWS353Data("ws/ws353_relatedness.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListSim = loadWS353Data("ws/ws353_similarity.txt");
    log.info("done.");
    log.info("run the test");
    WS353Test(w2v, wsList, "WS353");
    WS353Test(w2v, wsListRel, "WS353 Relatedness");
    WS353Test(w2v, wsListSim, "WS353 Similarity");
    log.info("|*************************done.*************************|");
}
Example 6: getStructure
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Override
public Instances getStructure() throws IOException {
    if (m_sourceFile == null) {
        throw new IOException("No source has been specified.");
    }
    if (m_structure == null) {
        setSource(m_sourceFile);
        this.vec = WordVectorSerializer.readWord2VecModel(m_sourceFile);
        this.setStructure();
    }
    return m_structure;
}
Example 7: testUnifiedLoaderBinary
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
/**
 * This method tests binary file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    WordVectors vectorsLive = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    WordVectors vectorsStatic = WordVectorSerializer.readWord2VecModel(binaryFile, false);
    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");
    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
Example 8: testW2VnegativeOnRestore
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testW2VnegativeOnRestore() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1)
            .windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
    assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());
    log.info("Fit 1");
    vec.fit();
    File tmpFile = File.createTempFile("temp", "file");
    tmpFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tmpFile);
    iter.reset();
    Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);
    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());
    log.info("Fit 2");
    restoredVec.fit();
    iter.reset();
    restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);
    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);
    log.info("Fit 3");
    restoredVec.fit();
}
Example 9: testUnknown1
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testUnknown1() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
            .unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
            .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
            .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
            .useHierarchicSoftmax(true).allowParallelTokenization(true)
            .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
    vec.fit();
    assertTrue(vec.hasWord("PEWPEW"));
    assertTrue(vec.getVocab().containsWord("PEWPEW"));
    INDArray unk = vec.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unk);
    File tempFile = File.createTempFile("temp", "file");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    log.info("Original configuration: {}", vec.getConfiguration());
    Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);
    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));
    INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");
    assertEquals(unk, unk_restored);
    // now we're querying some junk word
    INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    log.info("Restored configuration: {}", restored.getConfiguration());
    assertEquals(unk, random);
    assertEquals(unk, randomRestored);
}
Example 10: testOutputStream
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
            .cache(cache).lr(0.025f).build();
    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
            .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
            .vocabCache(cache).seed(42)
            // .workers(6)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();
    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    INDArray day1 = vec.getWordVectorMatrix("day");
    WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file));
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);
    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
}
Example 11: testCnnSentenceDataSetIteratorNoTokensEdgeCase
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testCnnSentenceDataSetIteratorNoTokensEdgeCase() throws Exception {
    WordVectors w2v = WordVectorSerializer
                    .readWord2VecModel(new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile());
    int vectorSize = w2v.lookupTable().layerSize();
    List<String> sentences = new ArrayList<>();
    // First 2 sentences - no valid words
    sentences.add("NOVALID WORDSHERE");
    sentences.add("!!!");
    sentences.add("these balance Database model");
    sentences.add("into same THISWORDDOESNTEXIST are");
    int maxLength = 4;
    List<String> s1 = Arrays.asList("these", "balance", "Database", "model");
    List<String> s2 = Arrays.asList("into", "same", "are");
    List<String> labelsForSentences = Arrays.asList("Positive", "Negative", "Positive", "Negative");
    INDArray expLabels = Nd4j.create(new double[][] {{0, 1}, {1, 0}}); // Order of labels: alphabetic. Positive -> [0,1]
    LabeledSentenceProvider p = new CollectionLabeledSentenceProvider(sentences, labelsForSentences, null);
    CnnSentenceDataSetIterator dsi = new CnnSentenceDataSetIterator.Builder().sentenceProvider(p).wordVectors(w2v)
            .maxSentenceLength(256).minibatchSize(32).sentencesAlongHeight(false).build();
    // System.out.println("alongHeight = " + alongHeight);
    DataSet ds = dsi.next();
    INDArray expectedFeatures = Nd4j.create(2, 1, vectorSize, maxLength);
    INDArray expectedFeatureMask = Nd4j.create(new double[][] {{1, 1, 1, 1}, {1, 1, 1, 0}});
    for (int i = 0; i < 4; i++) {
        expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0), NDArrayIndex.all(),
                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s1.get(i)));
    }
    for (int i = 0; i < 3; i++) {
        expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0), NDArrayIndex.all(),
                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s2.get(i)));
    }
    assertArrayEquals(expectedFeatures.shape(), ds.getFeatures().shape());
    assertEquals(expectedFeatures, ds.getFeatures());
    assertEquals(expLabels, ds.getLabels());
    assertEquals(expectedFeatureMask, ds.getFeaturesMaskArray());
    assertNull(ds.getLabelsMaskArray());
}
Example 12: testCnnSentenceDataSetIteratorNoValidTokensNextEdgeCase
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
@Test
public void testCnnSentenceDataSetIteratorNoValidTokensNextEdgeCase() throws Exception {
    // Case: 2 minibatches, of size 2
    // First minibatch: OK
    // Second minibatch: would be empty
    // Therefore: after first minibatch is returned, .hasNext() should return false
    WordVectors w2v = WordVectorSerializer
                    .readWord2VecModel(new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile());
    int vectorSize = w2v.lookupTable().layerSize();
    List<String> sentences = new ArrayList<>();
    sentences.add("these balance Database model");
    sentences.add("into same THISWORDDOESNTEXIST are");
    // Last 2 sentences - no valid words
    sentences.add("NOVALID WORDSHERE");
    sentences.add("!!!");
    int maxLength = 4;
    List<String> s1 = Arrays.asList("these", "balance", "Database", "model");
    List<String> s2 = Arrays.asList("into", "same", "are");
    List<String> labelsForSentences = Arrays.asList("Positive", "Negative", "Positive", "Negative");
    INDArray expLabels = Nd4j.create(new double[][] {{0, 1}, {1, 0}}); // Order of labels: alphabetic. Positive -> [0,1]
    LabeledSentenceProvider p = new CollectionLabeledSentenceProvider(sentences, labelsForSentences, null);
    CnnSentenceDataSetIterator dsi = new CnnSentenceDataSetIterator.Builder().sentenceProvider(p).wordVectors(w2v)
            .maxSentenceLength(256).minibatchSize(2).sentencesAlongHeight(false).build();
    assertTrue(dsi.hasNext());
    DataSet ds = dsi.next();
    assertFalse(dsi.hasNext());
    INDArray expectedFeatures = Nd4j.create(2, 1, vectorSize, maxLength);
    INDArray expectedFeatureMask = Nd4j.create(new double[][] {{1, 1, 1, 1}, {1, 1, 1, 0}});
    for (int i = 0; i < 4; i++) {
        expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0), NDArrayIndex.all(),
                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s1.get(i)));
    }
    for (int i = 0; i < 3; i++) {
        expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0), NDArrayIndex.all(),
                NDArrayIndex.point(i)).assign(w2v.getWordVectorMatrix(s2.get(i)));
    }
    assertArrayEquals(expectedFeatures.shape(), ds.getFeatures().shape());
    assertEquals(expectedFeatures, ds.getFeatures());
    assertEquals(expLabels, ds.getLabels());
    assertEquals(expectedFeatureMask, ds.getFeaturesMaskArray());
    assertNull(ds.getLabelsMaskArray());
}
Example 13: main
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    // a 66.6 MB sample extracted from the roughly 13 GB Wikipedia dataset
    String filePath = new ClassPathResource("data/raw_sentences.txt").getFile().getAbsolutePath();
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    // CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    // So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    // Additionally it forces lower case for all tokens.
    t.setTokenPreProcessor(new CommonPreprocessor());
    log.info("Building model....");
    Word2Vec vec = Word2VecTrainer.w2vBuilder(iter, t);
    log.info("Fitting Word2Vec model....");
    vec.fit();
    log.info("done...");
    // Write word vectors to file
    log.info("Writing word vectors to file....");
    WordVectorSerializer.writeWord2VecModel(vec, "src/main/resources/word2vec_dl4j_model.bin");
    log.info("done...");
    // Load word vectors back into a Word2Vec instance
    log.info("Load word vectors from file");
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel("src/main/resources/word2vec_dl4j_model.bin");
    log.info("Testing result:");
    Collection<String> lst = w2v.wordsNearest("man", 10);
    log.info("Closest Words--10 Words closest to \"man\": " + lst);
    double cosSim = w2v.similarity("man", "woman");
    log.info("Cosine Similarity between \"man\" and \"woman\": " + String.valueOf(cosSim));
}
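A side note on the CommonPreprocessor behavior described in the comments above; a quick hedged sketch (the sample tokens are made up, not part of the original example):
// CommonPreprocessor strips characters matching the regex above and lower-cases the rest.
CommonPreprocessor pre = new CommonPreprocessor();
System.out.println(pre.preProcess("Hello,"));  // -> "hello"
System.out.println(pre.preProcess("(2017)"));  // -> "" (digits and brackets removed)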
Example 14: main
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
public static void main(String[] args) throws Exception {
    // Get the path to the text file
    String filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(filePath);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    /*
     * CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
     * So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
     * Additionally it forces lower case for all tokens.
     */
    t.setTokenPreProcessor(new CommonPreprocessor());
    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(iter)
            .tokenizerFactory(t)
            .learningRate(0.025)
            .minLearningRate(1e-3)
            //.negativeSample(10)
            .build();
    log.info("Fitting Word2Vec model....");
    vec.fit();
    // Write word vectors to file
    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWord2VecModel(vec, "src/main/resources/W2VModel.txt");
    // Load word vectors back into a Word2Vec instance
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel("src/main/resources/W2VModel.txt");
    // Print the 10 words closest to "day" - an example of what to do with these word vectors.
    log.info("Closest Words:");
    //Collection<String> lst = vec.wordsNearest("day", 10);
    Collection<String> lst = w2v.wordsNearest("day", 10);
    System.out.println("10 Words closest to 'day': " + lst);
    double cosSim = w2v.similarity("day", "night");
    System.out.println(cosSim);
}
Example 15: readModelFromFile
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; // import the package/class this method depends on
public Word2Vec readModelFromFile(Language language) {
    String path = (learningAlgorithm instanceof SkipGram) ?
            language.getName() + "_model.txt" :
            language.getName() + "_model_" + learningAlgorithm.getCodeName() + ".txt";
    URL resource = Pan15Word2Vec.class.getClassLoader()
            .getResource("word2vec/" + path);
    try {
        return WordVectorSerializer.readWord2VecModel(Paths.get(resource.toURI()).toFile());
    } catch (URISyntaxException e) {
        e.printStackTrace();
    }
    return null;
}