当前位置: 首页>>代码示例>>Java>>正文


Java WordVectorSerializer.writeWord2VecModel方法代码示例

本文整理汇总了Java中org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.writeWord2VecModel方法的典型用法代码示例。如果您正苦于以下问题:Java WordVectorSerializer.writeWord2VecModel方法的具体用法?Java WordVectorSerializer.writeWord2VecModel怎么用?Java WordVectorSerializer.writeWord2VecModel使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.deeplearning4j.models.embeddings.loader.WordVectorSerializer的用法示例。


在下文中一共展示了WordVectorSerializer.writeWord2VecModel方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testW2VnegativeOnRestore

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
@Test
public void testW2VnegativeOnRestore() throws Exception {
    // One sentence per line; the iterator trims leading/trailing whitespace.
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());
    // Whitespace tokenizer with the common normalizing preprocessor.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // SkipGram model trained with negative sampling; hierarchical softmax is
    // explicitly disabled so we can verify the flag survives serialization.
    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(3)
                    .batchSize(64)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .seed(42)
                    .learningRate(0.025)
                    .minLearningRate(0.001)
                    .sampling(0)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    .negativeSample(10)
                    .epochs(1)
                    .windowSize(5)
                    .useHierarchicSoftmax(false)
                    .allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>())
                    .iterate(sentences)
                    .tokenizerFactory(tokenizerFactory)
                    .build();

    assertEquals(false, model.getConfiguration().isUseHierarchicSoftmax());

    log.info("Fit 1");
    model.fit();

    // Persist the full model to a throwaway file so it can be restored below.
    File serialized = File.createTempFile("temp", "file");
    serialized.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(model, serialized);

    sentences.reset();

    // First restore: extended info included (readExtendedTables = true).
    Word2Vec restored = WordVectorSerializer.readWord2VecModel(serialized, true);
    restored.setTokenizerFactory(tokenizerFactory);
    restored.setSentenceIterator(sentences);

    // Configuration flags and model utils must survive the round trip.
    assertEquals(false, restored.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restored.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restored.getConfiguration().isAllowParallelTokenization());

    log.info("Fit 2");
    restored.fit();

    sentences.reset();

    // Second restore: without extended tables the default BasicModelUtils is used.
    restored = WordVectorSerializer.readWord2VecModel(serialized, false);
    restored.setTokenizerFactory(tokenizerFactory);
    restored.setSentenceIterator(sentences);

    assertEquals(false, restored.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restored.getModelUtils() instanceof BasicModelUtils);

    log.info("Fit 3");
    restored.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:52,代码来源:Word2VecTests.java

示例2: testUnknown1

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
@Test
public void testUnknown1() throws Exception {
    // One sentence per line; the iterator trims surrounding whitespace.
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());
    // Whitespace tokenizer with the common normalizing preprocessor.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    // CBOW model with an explicit UNK element ("PEWPEW"); words below the
    // frequency cutoff — and unseen words — should all map to that element.
    Word2Vec model = new Word2Vec.Builder()
                    .minWordFrequency(10)
                    .useUnknown(true)
                    .unknownElement(new VocabWord(1.0, "PEWPEW"))
                    .iterations(1)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .seed(42)
                    .learningRate(0.025)
                    .minLearningRate(0.001)
                    .sampling(0)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>())
                    .epochs(1)
                    .windowSize(5)
                    .useHierarchicSoftmax(true)
                    .allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>())
                    .iterate(sentences)
                    .tokenizerFactory(tokenizerFactory)
                    .build();

    model.fit();

    // The UNK token must be present in the trained vocabulary.
    assertTrue(model.hasWord("PEWPEW"));
    assertTrue(model.getVocab().containsWord("PEWPEW"));

    INDArray unknownVector = model.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unknownVector);

    // Round-trip the model through the full serializer.
    File serialized = File.createTempFile("temp", "file");
    serialized.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(model, serialized);

    log.info("Original configuration: {}", model.getConfiguration());

    Word2Vec restored = WordVectorSerializer.readWord2VecModel(serialized);

    // UNK element and its vector must survive serialization unchanged.
    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));
    INDArray restoredUnknown = restored.getWordVectorMatrix("PEWPEW");
    assertEquals(unknownVector, restoredUnknown);

    // A word that was never seen should resolve to the UNK vector,
    // both before and after the round trip.
    INDArray junkVector = model.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray junkVectorRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");

    log.info("Restored configuration: {}", restored.getConfiguration());

    assertEquals(unknownVector, junkVector);
    assertEquals(unknownVector, junkVectorRestored);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:50,代码来源:Word2VecTests.java

示例3: testOutputStream

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
@Test
public void testOutputStream() throws Exception {
    // Target file for the text-format word vectors.
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    // Pre-built lookup cache/table wired into the model explicitly.
    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    //                .workers(6)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // FIX: the original leaked the FileOutputStream; close it deterministically.
    try (FileOutputStream fos = new FileOutputStream(file)) {
        WordVectorSerializer.writeWordVectors(vec, fos);
    }

    // Reload from the text format and verify a known vector survives the trip.
    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);
    INDArray day2 = vec2.getWordVectorMatrix("day");
    assertEquals(day1, day2);

    // Round-trip through the full-model serializer as well.
    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    // FIX: the original read the model back but asserted nothing about it,
    // so a broken full-model round trip would have passed silently.
    Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile);
    assertEquals(day1, vec3.getWordVectorMatrix("day"));
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:42,代码来源:WordVectorSerializerTest.java

示例4: main

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
public static void main (String[] args) throws Exception {
    // Sample corpus: a 66.6MB extract from a ~13GB Wikipedia dataset.
    String corpusPath = new ClassPathResource("data/raw_sentences.txt").getFile().getAbsolutePath();
    log.info("Load & Vectorize Sentences....");
    // One sentence per line; surrounding whitespace is stripped.
    SentenceIterator lineIterator = new BasicLineIterator(corpusPath);
    // Whitespace tokenizer. CommonPreprocessor strips digits, punctuation and
    // some special characters from each token and lowercases the remainder.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec model = Word2VecTrainer.w2vBuilder(lineIterator, tokenizerFactory);

    log.info("Fitting Word2Vec model....");
    model.fit();
    log.info("done...");

    // Persist the trained model to disk.
    log.info("Writing word vectors to file....");
    WordVectorSerializer.writeWord2VecModel(model, "src/main/resources/word2vec_dl4j_model.bin");
    log.info("done...");

    // Reload the model and sanity-check it with nearest-word and similarity queries.
    log.info("Load word vectors from file");
    Word2Vec restored = WordVectorSerializer.readWord2VecModel("src/main/resources/word2vec_dl4j_model.bin");
    log.info("Testing result:");
    Collection<String> nearest = restored.wordsNearest("man", 10);
    log.info("Closest Words--10 Words closest to \"man\": " + nearest);
    double cosSim = restored.similarity("man", "woman");
    log.info("Cosine Similarity between \"man\" and \"woman\": " + String.valueOf(cosSim));
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:31,代码来源:DL4JWord2VecExample.java

示例5: main

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
public static void main(String[] args) throws Exception {
    // Locate the raw corpus on the classpath.
    String corpusPath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();
    log.info("Load & Vectorize Sentences....");
    // One sentence per line; surrounding whitespace is stripped.
    SentenceIterator lineIterator = new BasicLineIterator(corpusPath);
    // Whitespace tokenizer. CommonPreprocessor strips digits, punctuation and
    // some special characters from each token and lowercases the remainder.
    TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory();
    tokenizerFactory.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .seed(42)
            .windowSize(5)
            .iterate(lineIterator)
            .tokenizerFactory(tokenizerFactory)
            .learningRate(0.025)
            .minLearningRate(1e-3)
            .build();

    log.info("Fitting Word2Vec model....");
    model.fit();

    // Persist the trained model, then reload it to exercise the round trip.
    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWord2VecModel(model, "src/main/resources/W2VModel.txt");
    Word2Vec restored = WordVectorSerializer.readWord2VecModel("src/main/resources/W2VModel.txt");

    // Example queries against the restored model.
    log.info("Closest Words:");
    Collection<String> lst = restored.wordsNearest("day", 10);
    System.out.println("10 Words closest to 'day': " + lst);

    double cosSim = restored.similarity("day", "night");
    System.out.println(cosSim);
}
 
开发者ID:IsaacChanghau,项目名称:NeuralNetworksLite,代码行数:44,代码来源:Word2VecRawTextExample.java

示例6: saveModel

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
public void saveModel(Word2Vec model, Language language) {
    // Models are written under the project's resources directory; SkipGram
    // models use the plain name, other algorithms get a code-name suffix.
    String dir = "./src/main/resources/word2vec";
    String path;
    if (learningAlgorithm instanceof SkipGram) {
        path = dir + "/" + language.getName() + "_model.txt";
    } else {
        path = dir + "/" + language.getName() + "_model_" + learningAlgorithm.getCodeName() + ".txt";
    }
    WordVectorSerializer.writeWord2VecModel(model, path);
}
 
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:8,代码来源:Pan15Word2Vec.java


注:本文中的org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.writeWord2VecModel方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。