当前位置: 首页>>代码示例>>Java>>正文


Java Word2Vec类代码示例

本文整理汇总了Java中org.deeplearning4j.models.word2vec.Word2Vec的典型用法代码示例。如果您正苦于以下问题:Java Word2Vec类的具体用法?Java Word2Vec怎么用?Java Word2Vec使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


Word2Vec类属于org.deeplearning4j.models.word2vec包,在下文中一共展示了Word2Vec类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testWriteWordVectorsFromWord2Vec

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Round-trips a Google-format model through the text serializer and
 * verifies that known word vectors survive the write/read cycle.
 */
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    // Load the binary Google model, then persist it in text format.
    WordVectors original = WordVectorSerializer.loadGoogleModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) original, pathToWriteto);

    // Reload from the freshly written text file.
    WordVectors reloaded = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray freeman = reloaded.getWordVectorMatrix("Morgan_Freeman");
    INDArray montalbano = reloaded.getWordVectorMatrix("JA_Montalbano");

    // Reloaded vectors must match the originals and stay 300-dimensional.
    assertEquals(original.getWordVectorMatrix("Morgan_Freeman"), freeman);
    assertEquals(original.getWordVectorMatrix("JA_Montalbano"), montalbano);
    assertTrue(freeman.length() == 300);
    assertTrue(montalbano.length() == 300);
    assertEquals(freeman.getDouble(0), 0.044423, 1e-3);
    assertEquals(montalbano.getDouble(0), 0.051964, 1e-3);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:17,代码来源:WordVectorSerializerTest.java

示例2: testFindNamesFromText

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Trains word vectors over a Chinese-name corpus with the Chinese
 * tokenizer and writes the resulting vectors to a text file.
 */
@Ignore
@Test
public void testFindNamesFromText() throws IOException {
    SentenceIterator sentences = new BasicLineIterator("src/test/resources/chineseName.txt");

    log.info("load is right!");
    TokenizerFactory tokenizers = new ChineseTokenizerFactory();

    // Generate word vectors from the dataset stored in the resources folder.
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(2)
            .iterations(5)
            .layerSize(100)
            .seed(42)
            .learningRate(0.1)
            .windowSize(20)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();
    model.fit();
    WordVectorSerializer.writeWordVectors(model, new File("src/test/resources/chineseNameWordVector.txt"));

    // TODO: train a name-recognition model on top of these vectors and
    // verify it can find names in previously unseen text.
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:ChineseTokenizerTest.java

示例3: resetWeights

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * (Re)initializes the lookup-table weights, the per-word bias vector and
 * their AdaGrad accumulators.
 *
 * @param reset when true, existing state is discarded and re-created;
 *              when false, only missing (null) state is initialized
 */
@Override
public void resetWeights(boolean reset) {
    // Lazily obtain a RNG so initialization is reproducible via Nd4j's seed.
    if (rng == null)
        this.rng = Nd4j.getRandom();

    // NOTE(review): the original comment claimed "+2" for the unk word and
    // the bias, but the code allocates vocab.numWords() + 1 rows — confirm
    // which is intended. The extra row holds the unknown-word vector.
    if (syn0 == null || reset) {
        // Uniform weights in [-0.5/vectorLength, 0.5/vectorLength].
        syn0 = Nd4j.rand(new int[] {vocab.numWords() + 1, vectorLength}, rng).subi(0.5).divi((double) vectorLength);
        INDArray randUnk = Nd4j.rand(1, vectorLength, rng).subi(0.5).divi(vectorLength);
        putVector(Word2Vec.DEFAULT_UNK, randUnk);
    }
    if (weightAdaGrad == null || reset) {
        weightAdaGrad = new AdaGrad(new int[] {vocab.numWords() + 1, vectorLength}, lr.get());
    }


    // One bias entry per row of syn0 (vocab words plus the unk row).
    if (bias == null || reset)
        bias = Nd4j.create(syn0.rows());

    if (biasAdaGrad == null || reset) {
        biasAdaGrad = new AdaGrad(bias.shape(), lr.get());
    }


}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:27,代码来源:GloveWeightLookupTable.java

示例4: asExampleArray

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Converts a window (each word in a multi-word context) into a single
 * concatenated feature vector.
 *
 * Each word in the window is looked up in the passed-in model and its
 * vector (optionally normalized) is copied into the corresponding slice
 * of the output row.
 *
 * @param window    the window to take in
 * @param vec       the model to use as a lookup table
 * @param normalize whether to use normalized word vectors
 * @return a concatenated 1-row array of length layerSize * windowSize
 *         containing the numbers for each word in the window
 */
public static INDArray asExampleArray(Window window, Word2Vec vec, boolean normalize) {
    int length = vec.lookupTable().layerSize();
    List<String> words = window.getWords();
    int windowSize = vec.getWindow();
    assert words.size() == vec.getWindow();
    INDArray ret = Nd4j.create(length * windowSize);

    for (int i = 0; i < words.size(); i++) {
        String word = words.get(i);
        INDArray n = normalize ? vec.getWordVectorMatrixNormalized(word) : vec.getWordVectorMatrix(word);
        // Reuse the hoisted layer size instead of re-querying the lookup
        // table three times per iteration (as the original did); copy this
        // word's vector into its slice [i*length, (i+1)*length).
        ret.put(new INDArrayIndex[] {NDArrayIndex.interval(i * length, (i + 1) * length)}, n);
    }

    return ret;
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:34,代码来源:WindowConverter.java

示例5: testWord2VecPlot

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Manual smoke test: trains a Word2Vec model on the raw_sentences corpus
 * so its vocabulary can be inspected/plotted through the UI server.
 */
@Test
public void testWord2VecPlot() throws Exception {
    File corpus = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator sentences = new BasicLineIterator(corpus.getAbsolutePath());

    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(2)
            .batchSize(1000)
            .learningRate(0.025)
            .layerSize(100)
            .seed(42)
            .sampling(0)
            .negativeSample(0)
            .windowSize(5)
            .modelUtils(new BasicModelUtils<VocabWord>())
            .useAdaGrad(false)
            .iterate(sentences)
            .workers(10)
            .tokenizerFactory(tokenizers)
            .build();

    model.fit();

    // The UiServer plotting hook is intentionally disabled; keep the JVM
    // alive long enough for manual inspection of the UI.
    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:ManualTests.java

示例6: main

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * End-to-end example: reads sentences from a local text file, trains a
 * Word2Vec model over them, writes the vectors to disk, then prints a
 * nearest-words query and a similarity score.
 */
public static void main(String[] args) throws Exception {
    // Path to the raw text corpus.
    String corpusPath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Iterate the file sentence by sentence, trimming each line.
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusPath);
    // Split each line on whitespace to obtain tokens.
    TokenizerFactory tokenizers = new DefaultTokenizerFactory();
    tokenizers.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache vocabCache = new InMemoryLookupCache();
    WeightLookupTable lookupTable = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(vocabCache)
            .lr(0.025f)
            .build();

    log.info("Building model....");
    Word2Vec model = new Word2Vec.Builder()
            .minWordFrequency(5)
            .iterations(1)
            .layerSize(100)
            .lookupTable(lookupTable)
            .stopWords(new ArrayList<String>())
            .vocabCache(vocabCache)
            .seed(42)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();

    log.info("Fitting Word2Vec model....");
    model.fit();

    log.info("Writing word vectors to text file....");
    WordVectorSerializer.writeWordVectors(model, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> nearest = model.wordsNearest("man", 5);
    System.out.println(nearest);
    double cosSim = model.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
 
开发者ID:PacktPublishing,项目名称:Java-Data-Science-Cookbook,代码行数:41,代码来源:Word2VecRawTextExample.java

示例7: w2vBuilder

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Builder preset for training on a large corpus: skip-gram with negative
 * sampling, hierarchical softmax and 300-dimensional vectors.
 *
 * @param iter sentence source for training
 * @param t    tokenizer used to split sentences into words
 * @return an untrained Word2Vec configured with the preset hyperparameters
 */
public static Word2Vec w2vBuilder(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .seed(12345)
            .iterate(iter)
            .tokenizerFactory(t)
            .batchSize(1000)
            .allowParallelTokenization(true)             // tokenize mini-batches in parallel
            .epochs(1)                                   // passes over the whole training corpus
            .iterations(3)                               // iterations per mini-batch
            .elementsLearningAlgorithm(new SkipGram<>()) // swap for new CBOW<>() if desired
            .minWordFrequency(50)                        // drop words seen fewer than 50 times
            .windowSize(5)                               // max skip length between words
            .learningRate(0.05)                          // starting learning rate
            .minLearningRate(5e-4)                       // floor for learning-rate decay
            .negativeSample(10)                          // negative examples per sample
            .sampling(1e-5)                              // down-sample very frequent words
            .useHierarchicSoftmax(true)                  // use hierarchical softmax
            .layerSize(300)                              // word-vector dimensionality
            .workers(8)                                  // training threads
            .build();
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:24,代码来源:Word2VecTrainer.java

示例8: before

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Lazily trains the shared Word2Vec fixture from the labeled resource
 * corpus; once trained, subsequent tests reuse the same model.
 */
@Before
public void before() throws Exception {
    if (vec != null) {
        return; // already trained by a previous test
    }
    ClassPathResource resource = new ClassPathResource("/labeled/");
    File corpusDir = resource.getFile();
    SentenceIterator sentences = UimaSentenceIterator.createWithPath(corpusDir.getAbsolutePath());
    // Drop any stale serialized cache from a previous run.
    new File("cache.ser").delete();

    TokenizerFactory tokenizers = new UimaTokenizerFactory();

    vec = new Word2Vec.Builder()
            .minWordFrequency(1)
            .iterations(5)
            .layerSize(100)
            .stopWords(new ArrayList<String>())
            .useUnknown(true)
            .windowSize(5)
            .iterate(sentences)
            .tokenizerFactory(tokenizers)
            .build();
    vec.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:18,代码来源:Word2VecIteratorTest.java

示例9: fromPair

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Builds a read-only word-vectors implementation backed by the given
 * lookup table and vocabulary.
 *
 * @param pair the lookup table (first) and vocab cache (second)
 * @return a {@link Word2Vec} wrapping the supplied table and vocab
 */
public static Word2Vec fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
    Word2Vec result = new Word2Vec();
    result.setLookupTable(pair.getFirst());
    result.setVocab(pair.getSecond());
    result.setModelUtils(new BasicModelUtils());
    return result;
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:15,代码来源:WordVectorSerializer.java

示例10: w2vBuilder4SmallCorpus

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Builder preset tuned for a small corpus: low frequency cutoff, a
 * single training iteration and modest 100-dimensional vectors.
 *
 * @param iter sentence source for training
 * @param t    tokenizer used to split sentences into words
 * @return an untrained Word2Vec configured for small corpora
 */
@SuppressWarnings("unused")
public static Word2Vec w2vBuilder4SmallCorpus(SentenceIterator iter, TokenizerFactory t) {
    return new Word2Vec.Builder()
            .minWordFrequency(5)    // drop words seen fewer than 5 times
            .iterations(1)          // single pass per mini-batch
            .layerSize(100)         // word-vector dimensionality
            .seed(42)               // reproducible initialization
            .windowSize(5)          // max skip length between words
            .iterate(iter)
            .tokenizerFactory(t)
            .learningRate(0.025)    // starting learning rate
            .minLearningRate(1e-3)  // floor for learning-rate decay
            .build();
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:15,代码来源:Word2VecTrainer.java

示例11: main

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Runs three semantic-evaluation tasks (TOEFL synonym test, analogy test
 * and WS353 word-similarity test) against pretrained Google News vectors.
 *
 * NOTE(review): requires GoogleNews-vectors-negative300.bin at the
 * hard-coded local path below — download and unpack it first.
 */
public static void main (String[] args) throws FileNotFoundException {
    // download GoogleNews-vectors-negative300.bin.gz first
    // load google news vectors for measurements
    log.info("load word2vec model");
    Word2Vec w2v = WordVectorSerializer.readWord2VecModel(
            new File("/Users/zhanghao/Documents/Files/GoogleNews-vectors-negative300.bin"));
    log.info("done.");

    log.info("Semantic Property Task...");
    // 1. TOEFL synonym test: pick the closest choice for each question word
    log.info("|********************load TOEFL data********************|");
    List<Word2VecTOEFLTest.TFLNode> tflList = loadTOEFLData();
    log.info("run the test");
    TOEFLTest(tflList, w2v);
    log.info("|*************************done.*************************|");

    // 2. Analogy test -- "king - queen = man - woman"
    log.info("|*******************load Syn_Sem data*******************|");
    Map<String, List<Word2VecAnalogyTest.SynSemNode>> anaMap = loadSynSemData();
    log.info("run the test");
    AnalogyTest(anaMap, w2v);
    log.info("|*************************done.*************************|");

    // 3. WS353 word-similarity test: full set plus relatedness/similarity splits
    log.info("|********************load WS353 data********************|");
    LinkedList<Word2VecWS353Test.WS353Node> wsList = loadWS353Data("ws/ws353.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListRel = loadWS353Data("ws/ws353_relatedness.txt");
    LinkedList<Word2VecWS353Test.WS353Node> wsListSim = loadWS353Data("ws/ws353_similarity.txt");
    log.info("done.");
    log.info("run the test");
    WS353Test(w2v, wsList, "WS353");
    WS353Test(w2v, wsListRel, "WS353 Relatedness");
    WS353Test(w2v, wsListSim, "WS353 Similarity");
    log.info("|*************************done.*************************|");
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:36,代码来源:DL4JWord2VecSemanticExample.java

示例12: TOEFLTest

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Runs the TOEFL synonym test: for every question word, picks the answer
 * choice with the highest cosine similarity and tallies accuracy.
 *
 * @param tflList TOEFL question nodes (question word, choices, gold answer)
 * @param w2v     word vectors used to score choice similarity
 */
private static void TOEFLTest(List<Word2VecTOEFLTest.TFLNode> tflList, Word2Vec w2v) {
    int accuracy = 0;
    int ignore = 0;
    for (int i = 0; i < tflList.size(); i++) {
        Word2VecTOEFLTest.TFLNode node = tflList.get(i);
        int bestId = -1;
        // BUG FIX: was Double.MIN_VALUE — the smallest POSITIVE double — so
        // any choice with similarity <= 0 could never be selected and the
        // question was wrongly counted as ignored. NEGATIVE_INFINITY lets
        // every real similarity win, while NaN (out-of-vocabulary words)
        // still never compares greater and leaves bestId == -1.
        double cosValue = Double.NEGATIVE_INFINITY;
        for (int k = 0; k < node.choices.length; k++) {
            double cosSim = w2v.similarity(node.ques, node.choices[k]);
            if (cosSim > cosValue) {
                bestId = k;
                cosValue = cosSim;
            }
        }
        // Reuse the node reference instead of repeated tflList.get(i) calls.
        node.setPredict(bestId);
        log.info((i + 1) + "--" + node.toFileString() + "\n");
        if (node.predict == -1)
            ignore++; // no choice could be scored (e.g. all words unknown)
        if (node.ans == node.predict)
            accuracy += 1;
    }
    log.info("Total Questions: " + tflList.size() + ", Ignore: " + ignore + ", Accuracy: " +
            String.format("%.2f", (1.0 * accuracy) / tflList.size() * 100.0) + "%(" + accuracy + "/" +
            tflList.size() + ")");
}
 
开发者ID:IsaacChanghau,项目名称:Word2VecfJava,代码行数:26,代码来源:DL4JWord2VecSemanticExample.java

示例13: getWord2Vec

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Trains one Word2Vec model per configured language, saves each model
 * and caches it in the language-to-model map.
 */
private void getWord2Vec() {
    t.setTokenPreProcessor(new CommonPreprocessor());

    for (Language language : languages.keySet()) {
        List<String> sentences = getSentencesFromLanguage(language);

        SentenceIterator sentenceIterator = new CollectionSentenceIterator(PREPROCESSOR, sentences);
        Word2Vec model = new Word2Vec.Builder()
                .elementsLearningAlgorithm(learningAlgorithm)
                .minWordFrequency(6)
                .iterations(15)
                .layerSize(VEC_LENGTH)
                .seed(42)
                .windowSize(5)
                .iterate(sentenceIterator)
                .tokenizerFactory(t)
                .build();

        model.fit();
        saveModel(model, language);
        languageWord2VecMap.put(language, model);
    }
}
 
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:25,代码来源:Pan15Word2Vec.java

示例14: main

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Visualization demo: extracts the bundled train.tsv resource, trains a
 * Word2Vec model on its sentences, prints nearest-word queries and plots
 * the vocabulary with t-SNE.
 */
public static void main(String[] args) throws Exception {
    ClassPathResource r = new ClassPathResource("/train.tsv");
    if (r.exists()) {
        // Copy the classpath resource to a local file. try-with-resources
        // guarantees both streams are closed even if the copy throws —
        // the original leaked them on exception.
        try (InputStream is = r.getInputStream();
             BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(new File("train.tsv")))) {
            IOUtils.copy(is, bos);
            bos.flush();
        }
    }
    SentenceIterator docIter = new CollectionSentenceIterator(new SentenceToPhraseMapper(new File("train.tsv")).sentences());
    TokenizerFactory factory = new DefaultTokenizerFactory();
    Word2Vec vec = new Word2Vec.Builder().iterate(docIter)
            .tokenizerFactory(factory).batchSize(10000)
            .learningRate(2.5e-2).sampling(5).learningRateDecayWords(10000)
            .iterations(3).minWordFrequency(1)
            .layerSize(300).windowSize(5).build();
    vec.fit();
    // Dump the learned vocabulary for inspection.
    FileUtils.writeLines(new File("vocab.csv"), vec.getCache().words());


    String word = "amusing";
    String otherWord = "turd";
    System.out.println("Words nearest  " + word +  " " + vec.wordsNearest(word,10));
    System.out.println("Words nearest  " + otherWord +  " " + vec.wordsNearest(otherWord,10));


    // Project the vocabulary to 2-D with t-SNE and plot it.
    Tsne t = new Tsne.Builder()
            .setMaxIter(100).stopLyingIteration(20).build();

    vec.getCache().plotVocab(t);

}
 
开发者ID:ihuerga,项目名称:deeplearning4j-nlp-examples,代码行数:36,代码来源:Visualization.java

示例15: writeWord2VecModel

import org.deeplearning4j.models.word2vec.Word2Vec; //导入依赖的package包/类
/**
 * Saves a FULL Word2Vec model — including syn0 AND syn1 — into a
 * compressed zip file and sends it to the output stream.
 *
 * @param vectors the model to persist
 * @param file    destination file
 * @throws RuntimeException wrapping any failure during serialization
 */
public static void writeWord2VecModel(Word2Vec vectors, File file) {
    try (BufferedOutputStream stream = new BufferedOutputStream(new FileOutputStream(file))) {
        // Delegate to the stream-based overload; closing the buffered
        // stream also closes the underlying file stream.
        writeWord2VecModel(vectors, stream);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:14,代码来源:WordVectorSerializer.java


注:本文中的org.deeplearning4j.models.word2vec.Word2Vec类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。