当前位置: 首页>>代码示例>>Java>>正文


Java WordVectorSerializer.readWord2VecModel方法代码示例

本文整理汇总了Java中org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.readWord2VecModel方法的典型用法代码示例。如果您正苦于以下问题:Java WordVectorSerializer.readWord2VecModel方法的具体用法?Java WordVectorSerializer.readWord2VecModel怎么用?Java WordVectorSerializer.readWord2VecModel使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.deeplearning4j.models.embeddings.loader.WordVectorSerializer的用法示例。


在下文中一共展示了WordVectorSerializer.readWord2VecModel方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testWord2VecGoogleModelUptraining

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Loads the GoogleNews binary vectors from a hard-coded local path (non-extended mode), logs how
 * long the load took, then continues training the loaded model on {@code inputFile} using CBOW
 * with negative sampling and hierarchic softmax switched off.
 *
 * <p>The test is annotated {@code @Ignore}; presumably because the model file only exists on the
 * original author's machine.
 *
 * @throws Exception if loading or training fails
 */
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    File googleModel = new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz");

    long loadStart = System.currentTimeMillis();
    Word2Vec model = WordVectorSerializer.readWord2VecModel(googleModel, false);
    long loadEnd = System.currentTimeMillis();
    log.info("Model loaded in {} msec", loadEnd - loadStart);

    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    // Tokens are produced by the default tokenizer and passed through CommonPreprocessor
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    model.setTokenizerFactory(tokenizer);
    model.setSentenceIterator(sentences);

    // Switch the restored configuration to negative sampling before resuming training
    model.getConfiguration().setUseHierarchicSoftmax(false);
    model.getConfiguration().setNegative(5.0);
    model.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    model.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:Word2VecTests.java

示例2: testUnifiedLoaderArchive1

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Reads the same archived model once through {@code readWord2Vec} and once through the unified
 * {@code readWord2VecModel} loader (second argument {@code false}), then checks that both give an
 * equal vector for "night" and that the unified model's lookup table exposes neither syn1 nor
 * syn1Neg.
 *
 * @throws Exception if the resource cannot be resolved or loaded
 */
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File modelArchive = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors reference = WordVectorSerializer.readWord2Vec(modelArchive);
    WordVectors unified = WordVectorSerializer.readWord2VecModel(modelArchive, false);

    INDArray expectedNight = reference.getWordVectorMatrix("night");
    INDArray actualNight = unified.getWordVectorMatrix("night");

    assertNotEquals(null, expectedNight);
    assertEquals(expectedNight, actualNight);

    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unified.lookupTable();
    assertEquals(null, unifiedTable.getSyn1());
    assertEquals(null, unifiedTable.getSyn1Neg());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:19,代码来源:WordVectorSerializerTest.java

示例3: testUnifiedLoaderArchive2

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Reads the same archived model once through {@code readWord2Vec} and once through the unified
 * {@code readWord2VecModel} loader (second argument {@code true}), then checks that both give an
 * equal vector for "night" and that the unified model's lookup table does expose syn1.
 *
 * @throws Exception if the resource cannot be resolved or loaded
 */
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File modelArchive = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors reference = WordVectorSerializer.readWord2Vec(modelArchive);
    WordVectors unified = WordVectorSerializer.readWord2VecModel(modelArchive, true);

    INDArray expectedNight = reference.getWordVectorMatrix("night");
    INDArray actualNight = unified.getWordVectorMatrix("night");

    assertNotEquals(null, expectedNight);
    assertEquals(expectedNight, actualNight);

    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unified.lookupTable();
    assertNotEquals(null, unifiedTable.getSyn1());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:18,代码来源:WordVectorSerializerTest.java

示例4: testUnifiedLoaderText

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Checks that the unified loader handles the text vector file {@code textFile}: the vector for
 * "Morgan_Freeman" must equal the one produced by {@code loadTxtVectors}, and syn1 must be absent
 * even though the extended model was requested.
 *
 * @throws Exception if either loader fails
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors reference = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors unified = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray expectedVector = reference.getWordVectorMatrix("Morgan_Freeman");
    INDArray actualVector = unified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, expectedVector);
    assertEquals(expectedVector, actualVector);

    // The EXTENDED model was requested, but the file carries no syn1/huffman info,
    // so the loader is expected to fall back silently to the simplified model.
    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unified.lookupTable();
    assertEquals(null, unifiedTable.getSyn1());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:WordVectorSerializerTest.java

示例5: main

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Loads the GoogleNews vectors from a hard-coded local path and runs three evaluations against
 * them in sequence: the TOEFL test, the analogy test and the WS353 tests (full, relatedness and
 * similarity lists).
 *
 * <p>GoogleNews-vectors-negative300.bin.gz has to be downloaded beforehand.
 *
 * @param args unused
 * @throws FileNotFoundException declared by the original entry point (raised by callees)
 */
public static void main (String[] args) throws FileNotFoundException {
    log.info("load word2vec model");
    File googleNewsModel = new File("/Users/zhanghao/Documents/Files/GoogleNews-vectors-negative300.bin");
    Word2Vec model = WordVectorSerializer.readWord2VecModel(googleNewsModel);
    log.info("done.");

    log.info("Semantic Property Task...");

    // 1. TOEFL test
    log.info("|********************load TOEFL data********************|");
    List<Word2VecTOEFLTest.TFLNode> toeflQuestions = loadTOEFLData();
    log.info("run the test");
    TOEFLTest(toeflQuestions, model);
    log.info("|*************************done.*************************|");

    // 2. Analogy test -- "king - queen = man - woman"
    log.info("|*******************load Syn_Sem data*******************|");
    Map<String, List<Word2VecAnalogyTest.SynSemNode>> analogies = loadSynSemData();
    log.info("run the test");
    AnalogyTest(analogies, model);
    log.info("|*************************done.*************************|");

    // 3. WS353 test: all three lists are loaded first, then evaluated in the same order
    log.info("|********************load WS353 data********************|");
    LinkedList<Word2VecWS353Test.WS353Node> ws353All = loadWS353Data("ws/ws353.txt");
    LinkedList<Word2VecWS353Test.WS353Node> ws353Relatedness = loadWS353Data("ws/ws353_relatedness.txt");
    LinkedList<Word2VecWS353Test.WS353Node> ws353Similarity = loadWS353Data("ws/ws353_similarity.txt");
    log.info("done.");
    log.info("run the test");
    WS353Test(model, ws353All, "WS353");
    WS353Test(model, ws353Relatedness, "WS353 Relatedness");
    WS353Test(model, ws353Similarity, "WS353 Similarity");
    log.info("|*************************done.*************************|");
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:36,代码来源:DL4JWord2VecSemanticExample.java

示例6: getStructure

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Returns the dataset structure, building it on first use: the source file is (re)set, the
 * Word2Vec model is read from it into {@code vec}, and {@code setStructure()} is invoked.
 * Later calls return the cached {@code m_structure}.
 *
 * @return the cached or freshly built structure
 * @throws IOException if no source file has been specified, or if a callee raises it
 */
@Override
public Instances getStructure() throws IOException {
  if (m_sourceFile == null) {
    throw new IOException("No source has been specified.");
  }

  // Already initialised: nothing to load
  if (m_structure != null) {
    return m_structure;
  }

  setSource(m_sourceFile);
  this.vec = WordVectorSerializer.readWord2VecModel(m_sourceFile);
  this.setStructure();

  return m_structure;
}
 
开发者ID:Waikato,项目名称:wekaDeeplearning4j,代码行数:15,代码来源:Word2VecLoader.java

示例7: testUnifiedLoaderBinary

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Checks that the unified loader handles the binary file {@code binaryFile}: the vector for
 * "Morgan_Freeman" must equal the one produced by {@code loadGoogleModel}.
 *
 * @throws Exception if either loader fails
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {

    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors reference = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    WordVectors unified = WordVectorSerializer.readWord2VecModel(binaryFile, false);

    INDArray expectedVector = reference.getWordVectorMatrix("Morgan_Freeman");
    INDArray actualVector = unified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, expectedVector);
    assertEquals(expectedVector, actualVector);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:20,代码来源:WordVectorSerializerTest.java

示例8: testW2VnegativeOnRestore

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Trains a skip-gram model with negative sampling (hierarchic softmax off), writes it to a
 * temporary file and restores it twice: once with the second loader argument {@code true} and
 * once with {@code false}. After each restore the configuration and model utils are checked and
 * the restored model is fitted again.
 *
 * @throws Exception if training, serialization or restoring fails
 */
@Test
public void testW2VnegativeOnRestore() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec trained = new Word2Vec.Builder()
                    .minWordFrequency(1)
                    .iterations(3)
                    .batchSize(64)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .seed(42)
                    .learningRate(0.025)
                    .minLearningRate(0.001)
                    .sampling(0)
                    .elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    .negativeSample(10)
                    .epochs(1)
                    .windowSize(5)
                    .useHierarchicSoftmax(false)
                    .allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>())
                    .iterate(sentences)
                    .tokenizerFactory(tokenizer)
                    .build();

    assertEquals(false, trained.getConfiguration().isUseHierarchicSoftmax());

    log.info("Fit 1");
    trained.fit();

    File modelFile = File.createTempFile("temp", "file");
    modelFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(trained, modelFile);

    // First restore: second loader argument is true
    sentences.reset();
    Word2Vec restoredExtended = WordVectorSerializer.readWord2VecModel(modelFile, true);
    restoredExtended.setTokenizerFactory(tokenizer);
    restoredExtended.setSentenceIterator(sentences);

    assertEquals(false, restoredExtended.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredExtended.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restoredExtended.getConfiguration().isAllowParallelTokenization());

    log.info("Fit 2");
    restoredExtended.fit();

    // Second restore: second loader argument is false
    sentences.reset();
    Word2Vec restoredSimple = WordVectorSerializer.readWord2VecModel(modelFile, false);
    restoredSimple.setTokenizerFactory(tokenizer);
    restoredSimple.setSentenceIterator(sentences);

    assertEquals(false, restoredSimple.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredSimple.getModelUtils() instanceof BasicModelUtils);

    log.info("Fit 3");
    restoredSimple.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:52,代码来源:Word2VecTests.java

示例9: testUnknown1

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Trains a CBOW model with the unknown-element feature enabled (element "PEWPEW"), writes it to
 * a temporary file and reads it back. Both the original and the restored model must contain
 * "PEWPEW", return equal vectors for it, and return that same vector for a word that is not in
 * the vocabulary.
 *
 * @throws Exception if training, serialization or restoring fails
 */
@Test
public void testUnknown1() throws Exception {
    SentenceIterator sentences = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec trained = new Word2Vec.Builder()
                    .minWordFrequency(10)
                    .useUnknown(true)
                    .unknownElement(new VocabWord(1.0, "PEWPEW"))
                    .iterations(1)
                    .layerSize(100)
                    .stopWords(new ArrayList<String>())
                    .seed(42)
                    .learningRate(0.025)
                    .minLearningRate(0.001)
                    .sampling(0)
                    .elementsLearningAlgorithm(new CBOW<VocabWord>())
                    .epochs(1)
                    .windowSize(5)
                    .useHierarchicSoftmax(true)
                    .allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>())
                    .iterate(sentences)
                    .tokenizerFactory(tokenizer)
                    .build();

    trained.fit();

    assertTrue(trained.hasWord("PEWPEW"));
    assertTrue(trained.getVocab().containsWord("PEWPEW"));

    INDArray unknownVector = trained.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unknownVector);

    File modelFile = File.createTempFile("temp", "file");
    modelFile.deleteOnExit();
    WordVectorSerializer.writeWord2VecModel(trained, modelFile);

    log.info("Original configuration: {}", trained.getConfiguration());

    Word2Vec restored = WordVectorSerializer.readWord2VecModel(modelFile);

    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));
    INDArray restoredUnknownVector = restored.getWordVectorMatrix("PEWPEW");

    assertEquals(unknownVector, restoredUnknownVector);

    // A junk word must be answered with the unknown element's vector by both models
    INDArray junkFromTrained = trained.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray junkFromRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");

    log.info("Restored configuration: {}", restored.getConfiguration());

    assertEquals(unknownVector, junkFromTrained);
    assertEquals(unknownVector, junkFromRestored);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:50,代码来源:Word2VecTests.java

示例10: testOutputStream

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Trains a small Word2Vec model, writes its vectors to a file through an {@code OutputStream},
 * reloads them with {@code loadTxtVectors} and checks that the vector for "day" is preserved.
 * Finally the full model is written with {@code writeWord2VecModel} and read back with
 * {@code readWord2VecModel}; that round trip only has to complete without throwing.
 *
 * @throws Exception if training, writing or reading fails
 */
@Test
public void testOutputStream() throws Exception {
    File file = File.createTempFile("tmp_ser", "ssa");
    file.deleteOnExit();

    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile);
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache(false);
    WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0)
                    .cache(cache).lr(0.025f).build();

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5)
                    .vocabCache(cache).seed(42)
                    .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();

    INDArray day1 = vec.getWordVectorMatrix("day");

    // Own the stream here so it is always closed (and the file handle released) before the
    // file is read back, regardless of whether the serializer closes it or throws.
    try (FileOutputStream vectorsOut = new FileOutputStream(file)) {
        WordVectorSerializer.writeWordVectors(vec, vectorsOut);
    }

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file);

    INDArray day2 = vec2.getWordVectorMatrix("day");

    assertEquals(day1, day2);

    File tempFile = File.createTempFile("tetsts", "Fdfs");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    // The result is not inspected: this step only verifies that the model can be read back.
    WordVectorSerializer.readWord2VecModel(tempFile);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:42,代码来源:WordVectorSerializerTest.java

示例11: testCnnSentenceDataSetIteratorNoTokensEdgeCase

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Edge case for {@code CnnSentenceDataSetIterator}: the first two of four sentences are listed
 * as containing no valid words. The single minibatch returned is expected to hold only the two
 * remaining sentences, with features built from the word vectors of their known tokens, a
 * feature mask marking the shorter sentence's padding, the matching labels, and no label mask.
 *
 * @throws Exception if the word vectors cannot be loaded
 */
@Test
public void testCnnSentenceDataSetIteratorNoTokensEdgeCase() throws Exception {

    File vectorFile = new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile();
    WordVectors wordVectors = WordVectorSerializer.readWord2VecModel(vectorFile);

    int embeddingSize = wordVectors.lookupTable().layerSize();

    List<String> sentences = new ArrayList<>();
    //First 2 sentences - no valid words
    sentences.add("NOVALID WORDSHERE");
    sentences.add("!!!");
    sentences.add("these balance Database model");
    sentences.add("into same THISWORDDOESNTEXIST are");

    int maxLength = 4;
    // Tokens of the last two sentences that the expected features are built from
    List<String> firstTokens = Arrays.asList("these", "balance", "Database", "model");
    List<String> secondTokens = Arrays.asList("into", "same", "are");

    List<String> sentenceLabels = Arrays.asList("Positive", "Negative", "Positive", "Negative");

    //Order of labels: alphabetic. Positive -> [0,1]
    INDArray expectedLabels = Nd4j.create(new double[][] {{0, 1}, {1, 0}});

    LabeledSentenceProvider provider = new CollectionLabeledSentenceProvider(sentences, sentenceLabels, null);
    CnnSentenceDataSetIterator iterator = new CnnSentenceDataSetIterator.Builder()
                    .sentenceProvider(provider)
                    .wordVectors(wordVectors)
                    .maxSentenceLength(256)
                    .minibatchSize(32)
                    .sentencesAlongHeight(false)
                    .build();

    DataSet batch = iterator.next();

    INDArray expectedFeatures = Nd4j.create(2, 1, embeddingSize, maxLength);
    INDArray expectedFeatureMask = Nd4j.create(new double[][] {{1, 1, 1, 1}, {1, 1, 1, 0}});

    // Example 0: one word vector per position for the four-token sentence
    for (int pos = 0; pos < firstTokens.size(); pos++) {
        INDArray column = expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0),
                        NDArrayIndex.all(), NDArrayIndex.point(pos));
        column.assign(wordVectors.getWordVectorMatrix(firstTokens.get(pos)));
    }

    // Example 1: three tokens; the last position is left untouched
    for (int pos = 0; pos < secondTokens.size(); pos++) {
        INDArray column = expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0),
                        NDArrayIndex.all(), NDArrayIndex.point(pos));
        column.assign(wordVectors.getWordVectorMatrix(secondTokens.get(pos)));
    }

    assertArrayEquals(expectedFeatures.shape(), batch.getFeatures().shape());
    assertEquals(expectedFeatures, batch.getFeatures());
    assertEquals(expectedLabels, batch.getLabels());
    assertEquals(expectedFeatureMask, batch.getFeaturesMaskArray());
    assertNull(batch.getLabelsMaskArray());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:51,代码来源:TestCnnSentenceDataSetIterator.java

示例12: testCnnSentenceDataSetIteratorNoValidTokensNextEdgeCase

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Edge case for {@code CnnSentenceDataSetIterator} with minibatch size 2 and four sentences, the
 * last two of which are listed as containing no valid words. The first minibatch is fine, the
 * second would be empty, so {@code hasNext()} must report {@code false} once the first minibatch
 * has been returned. The contents of that first minibatch are verified as well.
 *
 * @throws Exception if the word vectors cannot be loaded
 */
@Test
public void testCnnSentenceDataSetIteratorNoValidTokensNextEdgeCase() throws Exception {
    File vectorFile = new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile();
    WordVectors wordVectors = WordVectorSerializer.readWord2VecModel(vectorFile);

    int embeddingSize = wordVectors.lookupTable().layerSize();

    List<String> sentences = new ArrayList<>();
    sentences.add("these balance Database model");
    sentences.add("into same THISWORDDOESNTEXIST are");
    //Last 2 sentences - no valid words
    sentences.add("NOVALID WORDSHERE");
    sentences.add("!!!");

    int maxLength = 4;
    // Tokens of the first two sentences that the expected features are built from
    List<String> firstTokens = Arrays.asList("these", "balance", "Database", "model");
    List<String> secondTokens = Arrays.asList("into", "same", "are");

    List<String> sentenceLabels = Arrays.asList("Positive", "Negative", "Positive", "Negative");

    //Order of labels: alphabetic. Positive -> [0,1]
    INDArray expectedLabels = Nd4j.create(new double[][] {{0, 1}, {1, 0}});

    LabeledSentenceProvider provider = new CollectionLabeledSentenceProvider(sentences, sentenceLabels, null);
    CnnSentenceDataSetIterator iterator = new CnnSentenceDataSetIterator.Builder()
                    .sentenceProvider(provider)
                    .wordVectors(wordVectors)
                    .maxSentenceLength(256)
                    .minibatchSize(2)
                    .sentencesAlongHeight(false)
                    .build();

    assertTrue(iterator.hasNext());
    DataSet batch = iterator.next();

    // Nothing usable remains after the first minibatch
    assertFalse(iterator.hasNext());

    INDArray expectedFeatures = Nd4j.create(2, 1, embeddingSize, maxLength);
    INDArray expectedFeatureMask = Nd4j.create(new double[][] {{1, 1, 1, 1}, {1, 1, 1, 0}});

    // Example 0: one word vector per position for the four-token sentence
    for (int pos = 0; pos < firstTokens.size(); pos++) {
        INDArray column = expectedFeatures.get(NDArrayIndex.point(0), NDArrayIndex.point(0),
                        NDArrayIndex.all(), NDArrayIndex.point(pos));
        column.assign(wordVectors.getWordVectorMatrix(firstTokens.get(pos)));
    }

    // Example 1: three tokens; the last position is left untouched
    for (int pos = 0; pos < secondTokens.size(); pos++) {
        INDArray column = expectedFeatures.get(NDArrayIndex.point(1), NDArrayIndex.point(0),
                        NDArrayIndex.all(), NDArrayIndex.point(pos));
        column.assign(wordVectors.getWordVectorMatrix(secondTokens.get(pos)));
    }

    assertArrayEquals(expectedFeatures.shape(), batch.getFeatures().shape());
    assertEquals(expectedFeatures, batch.getFeatures());
    assertEquals(expectedLabels, batch.getLabels());
    assertEquals(expectedFeatureMask, batch.getFeaturesMaskArray());
    assertNull(batch.getLabelsMaskArray());
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:58,代码来源:TestCnnSentenceDataSetIterator.java

示例13: main

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * End-to-end example: trains a Word2Vec model on the bundled sentence file, writes it to
 * {@code src/main/resources/word2vec_dl4j_model.bin}, reads it back, and logs the ten words
 * nearest to "man" plus the similarity between "man" and "woman".
 *
 * @param args unused
 * @throws Exception if the corpus cannot be located or any training/serialization step fails
 */
public static void main (String[] args) throws Exception {
    // Single definition of the location that is written to and then read from
    String modelPath = "src/main/resources/word2vec_dl4j_model.bin";

    // a 66.6MB sample data extract from around 13GB wikipedia dataset
    String corpusPath = new ClassPathResource("data/raw_sentences.txt").getFile().getAbsolutePath();

    log.info("Load & Vectorize Sentences....");
    SentenceIterator sentences = new BasicLineIterator(corpusPath);

    // CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
    // So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
    // Additionally it forces lower case for all tokens.
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();
    tokenizer.setTokenPreProcessor(new CommonPreprocessor());

    log.info("Building model....");
    Word2Vec trained = Word2VecTrainer.w2vBuilder(sentences, tokenizer);
    log.info("Fitting Word2Vec model....");
    trained.fit();
    log.info("done...");

    log.info("Writing word vectors to file....");
    WordVectorSerializer.writeWord2VecModel(trained, modelPath);
    log.info("done...");

    log.info("Load word vectors from file");
    Word2Vec reloaded = WordVectorSerializer.readWord2VecModel(modelPath);

    log.info("Testing result:");
    Collection<String> nearestToMan = reloaded.wordsNearest("man", 10);
    log.info("Closest Words--10 Words closest to \"man\": " + nearestToMan);
    double manWomanSimilarity = reloaded.similarity("man", "woman");
    log.info("Cosine Similarity between \"man\" and \"woman\": " + manWomanSimilarity);
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:31,代码来源:DL4JWord2VecExample.java

示例14: main

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * End-to-end example: trains a Word2Vec model on {@code raw_sentences.txt}, writes it to
 * {@code src/main/resources/W2VModel.txt}, reads it back, and prints the ten words nearest to
 * "day" plus the similarity between "day" and "night".
 *
 * @param args unused
 * @throws Exception if the corpus cannot be located or any training/serialization step fails
 */
public static void main(String[] args) throws Exception {
	// Single definition of the location that is written to and then read from
	String modelPath = "src/main/resources/W2VModel.txt";

	// Resolve the training corpus to an absolute path
	String corpusPath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();

	log.info("Load & Vectorize Sentences....");
	SentenceIterator sentences = new BasicLineIterator(corpusPath);

	/*
	 * CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
	 * So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
	 * Additionally it forces lower case for all tokens.
	 */
	TokenizerFactory tokenizer = new DefaultTokenizerFactory();
	tokenizer.setTokenPreProcessor(new CommonPreprocessor());

	log.info("Building model....");
	Word2Vec.Builder builder = new Word2Vec.Builder()
			.minWordFrequency(5)
			.iterations(1)
			.layerSize(100)
			.seed(42)
			.windowSize(5)
			.iterate(sentences)
			.tokenizerFactory(tokenizer)
			.learningRate(0.025)
			.minLearningRate(1e-3);
	Word2Vec trained = builder.build();

	log.info("Fitting Word2Vec model....");
	trained.fit();

	log.info("Writing word vectors to text file....");
	WordVectorSerializer.writeWord2VecModel(trained, modelPath);

	// The queries below run against the reloaded model, not the in-memory one
	Word2Vec reloaded = WordVectorSerializer.readWord2VecModel(modelPath);

	log.info("Closest Words:");
	Collection<String> nearestToDay = reloaded.wordsNearest("day", 10);
	System.out.println("10 Words closest to 'day': " + nearestToDay);

	double dayNightSimilarity = reloaded.similarity("day", "night");
	System.out.println(dayNightSimilarity);
}
 
开发者ID:IsaacChanghau,项目名称:NeuralNetworksLite,代码行数:44,代码来源:Word2VecRawTextExample.java

示例15: readModelFromFile

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer; //导入方法依赖的package包/类
/**
 * Loads the Word2Vec model for the given language from the classpath folder {@code word2vec/}.
 *
 * <p>The file name is {@code <language>_model.txt} when the configured learning algorithm is
 * {@code SkipGram}, otherwise {@code <language>_model_<algorithm code name>.txt}.
 *
 * @param language language whose name prefixes the model file name
 * @return the loaded model, or {@code null} if the resource URL cannot be converted to a URI
 * @throws IllegalStateException if no such model file exists on the classpath
 */
public  Word2Vec readModelFromFile(Language language) {
    String path = (learningAlgorithm instanceof SkipGram) ?
            language.getName() + "_model.txt" : language.getName() + "_model_" + learningAlgorithm.getCodeName() + ".txt";
    String resourceName = "word2vec/" + path;
    URL resource = Pan15Word2Vec.class.getClassLoader()
            .getResource(resourceName);
    // getResource returns null for a missing file; fail with the resource name instead of an
    // anonymous NullPointerException on resource.toURI().
    if (resource == null) {
        throw new IllegalStateException("Word2Vec model resource not found on classpath: " + resourceName);
    }
    try {
        return WordVectorSerializer.readWord2VecModel(Paths.get(resource.toURI()).toFile());
    } catch (URISyntaxException e) {
        // Existing contract kept for callers: report the problem and fall through to null.
        e.printStackTrace();
    }
    return null;
}
 
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:13,代码来源:Pan15Word2Vec.java


注:本文中的org.deeplearning4j.models.embeddings.loader.WordVectorSerializer.readWord2VecModel方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。