当前位置: 首页>>代码示例>>Java>>正文


Java TokenizerFactory.setTokenPreProcessor方法代码示例

本文整理汇总了Java中org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.setTokenPreProcessor方法的典型用法代码示例。如果您正苦于以下问题:Java TokenizerFactory.setTokenPreProcessor方法的具体用法?Java TokenizerFactory.setTokenPreProcessor怎么用?Java TokenizerFactory.setTokenPreProcessor使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory的用法示例。


在下文中一共展示了TokenizerFactory.setTokenPreProcessor方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: testWord2VecPlot

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testWord2VecPlot() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
                    .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
                    .tokenizerFactory(t).build();

    vec.fit();

    //        UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();

    //        vec.getLookupTable().plotVocab(100, connectionInfo);

    Thread.sleep(10000000000L);
    fail("Not implemented");
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:23,代码来源:ManualTests.java

示例2: main

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
public static void main(String[] args) throws Exception {

        // Gets Path to Text file
        String filePath = "c:/raw_sentences.txt";

        log.info("Load & Vectorize Sentences....");
        // Strip white space before and after for each line
        SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
        // Split on white spaces in the line to get words
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        InMemoryLookupCache cache = new InMemoryLookupCache();
        WeightLookupTable table = new InMemoryLookupTable.Builder()
                .vectorLength(100)
                .useAdaGrad(false)
                .cache(cache)
                .lr(0.025f).build();

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5).iterations(1)
                .layerSize(100).lookupTable(table)
                .stopWords(new ArrayList<String>())
                .vocabCache(cache).seed(42)
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");
        // Write word
        WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

        log.info("Closest Words:");
        Collection<String> lst = vec.wordsNearest("man", 5); 
        System.out.println(lst);
        double cosSim = vec.similarity("cruise", "voyage");
        System.out.println(cosSim);
    }
 
开发者ID:PacktPublishing,项目名称:Java-Data-Science-Cookbook,代码行数:41,代码来源:Word2VecRawTextExample.java

示例3: testWord2VecGoogleModelUptraining

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
    long time1 = System.currentTimeMillis();
    Word2Vec vec = WordVectorSerializer.readWord2VecModel(
                    new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
    long time2 = System.currentTimeMillis();
    log.info("Model loaded in {} msec", time2 - time1);
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    vec.setTokenizerFactory(t);
    vec.setSentenceIterator(iter);
    vec.getConfiguration().setUseHierarchicSoftmax(false);
    vec.getConfiguration().setNegative(5.0);
    vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());

    vec.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:22,代码来源:Word2VecTests.java

示例4: getTokenizerFactory

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
protected static TokenizerFactory getTokenizerFactory(VectorsConfiguration configuration) {
    if (configuration == null)
        return null;

    if (configuration != null && configuration.getTokenizerFactory() != null
                    && !configuration.getTokenizerFactory().isEmpty()) {
        try {
            TokenizerFactory factory =
                            (TokenizerFactory) Class.forName(configuration.getTokenizerFactory()).newInstance();

            if (configuration.getTokenPreProcessor() != null && !configuration.getTokenPreProcessor().isEmpty()) {
                TokenPreProcess preProcessor =
                                (TokenPreProcess) Class.forName(configuration.getTokenPreProcessor()).newInstance();
                factory.setTokenPreProcessor(preProcessor);
            }

            return factory;

        } catch (Exception e) {
            log.error("Can't instantiate saved TokenizerFactory: {}", configuration.getTokenizerFactory());
        }
    }
    return null;
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:25,代码来源:WordVectorSerializer.java

示例5: testGoogleModelForInference

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
    WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
                    new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    ParagraphVectors pv =
                    new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
                                    .trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
                                    .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

    INDArray vec1 = pv.inferVector("This text is pretty awesome");
    INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

    log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:20,代码来源:ParagraphVectorsTest.java

示例6: loadParagraphVectors

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
private static ParagraphVectors loadParagraphVectors() {
    ParagraphVectors paragraphVectors = null;
    try {
        paragraphVectors = WordVectorSerializer.readParagraphVectors((PARAGRAPHVECTORMODELPATH));
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());
        paragraphVectors.setTokenizerFactory(t);
        paragraphVectors.getConfiguration().setIterations(10); // please note, we set iterations to 1 here, just to speedup inference

    } catch (IOException e) {
        e.printStackTrace();
    }
    return paragraphVectors;
}
 
开发者ID:gizemsogancioglu,项目名称:biosses,代码行数:15,代码来源:SentenceVectorsBasedSimilarity.java

示例7: trainParagraghVecModel

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
    ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
    File file = resource.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    /*
         if you don't have LabelAwareIterator handy, you can use synchronized labels generator
          it will be used to label each document/sequence/line with it's own label.
          But if you have LabelAwareIterator ready, you can can provide it, for your in-house labels
    */
    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(100)
            .epochs(1)
            .layerSize(50)
            .learningRate(0.02)
            .labelsSource(source)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(true)
            .vocabCache(cache)
            .tokenizerFactory(t)
            .sampling(0)
            .build();

    vec.fit();

    WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
 
开发者ID:gizemsogancioglu,项目名称:biosses,代码行数:34,代码来源:SentenceVectorsBasedSimilarity.java

示例8: testSparkW2VonBiggerCorpus

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
    SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
                    .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
                    .set("spark.executor.memory", "8g");

    // Set SparkContext
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Path of data part-00000
    //String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath();
    //        String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
    String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();

    // Read in data
    JavaRDD<String> corpus = sc.textFile(dataPath);

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new LowCasePreProcessor());

    Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
                    //     .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
                    //     .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
                    //     .setRemoveStop(false)
                    .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
                    .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
                    .useUnknown(true).build();

    word2Vec.train(corpus);


    sc.stop();

    WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:37,代码来源:Word2VecTest.java

示例9: testWord2VecCBOW

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testWord2VecCBOW() throws Exception {
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
                    .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
                    .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
                    .tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();

    vec.fit();

    Collection<String> lst = vec.wordsNearest("day", 10);
    log.info(Arrays.toString(lst.toArray()));

    //   assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));
    assertTrue(sim > 0.65f);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:28,代码来源:Word2VecTests.java

示例10: main

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
public static void main(String[] args) throws Exception {
    ClassPathResource srcFile = new ClassPathResource("/raw_sentences.txt");
    File file = srcFile.getFile();
    SentenceIterator iter = new BasicLineIterator(file);
    
    TokenizerFactory tFact = new DefaultTokenizerFactory();
    tFact.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource labelFormat = new LabelsSource("LINE_");

    ParagraphVectors vec = new ParagraphVectors.Builder()
            .minWordFrequency(1)
            .iterations(5)
            .epochs(1)
            .layerSize(100)
            .learningRate(0.025)
            .labelsSource(labelFormat)
            .windowSize(5)
            .iterate(iter)
            .trainWordVectors(false)
            .tokenizerFactory(tFact)
            .sampling(0)
            .build();

    vec.fit();

    double similar1 = vec.similarity("LINE_9835", "LINE_12492");
    out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);


    double similar2 = vec.similarity("LINE_3720", "LINE_16392");
    out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);

    double similar3 = vec.similarity("LINE_6347", "LINE_3720");
    out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);

    double dissimilar1 = vec.similarity("LINE_3720", "LINE_9852");
    out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);
    
    double dissimilar2 = vec.similarity("LINE_3720", "LINE_3719");
    out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
    
    
    
}
 
开发者ID:PacktPublishing,项目名称:Machine-Learning-End-to-Endguide-for-Java-developers,代码行数:46,代码来源:ClassifyBySimilarity.java

示例11: shouldLoadAndCreateSameWord2Vec

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void shouldLoadAndCreateSameWord2Vec() {
    //given
    Pan15Parser parser = new Pan15Parser();
    HashMap<String, Pan15Author> english = parser.parseCSVCorpus().get(Language.ENGLISH);
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());
    List<String> englishSentences = english.values().stream().map(Author::getDocuments)
            .collect(Collectors.toList())
            .stream().flatMap(List::stream).collect(Collectors.toList());

    SentenceIterator englishIter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), englishSentences);
    // when
    Word2Vec englishVec = new Word2Vec.Builder()
            .minWordFrequency(6)
            .iterations(15)
            .layerSize(250)
            .seed(42)
            .windowSize(5)
            .iterate(englishIter)
            .tokenizerFactory(t)
            .build();

    englishVec.fit();
    Word2Vec loadedEnglishVec1 = new Pan15Word2Vec(new SkipGram<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec2 = new Pan15Word2Vec(new CBOW<>()).readModelFromFile(Language.ENGLISH);
    Word2Vec loadedEnglishVec3 = new Pan15Word2Vec(new GloVe<>()).readModelFromFile(Language.ENGLISH);
    loadedEnglishVec1.setTokenizerFactory(t);
    loadedEnglishVec1.setSentenceIterator(englishIter);
    loadedEnglishVec2.setTokenizerFactory(t);
    loadedEnglishVec2.setSentenceIterator(englishIter);
    loadedEnglishVec3.setTokenizerFactory(t);
    loadedEnglishVec3.setSentenceIterator(englishIter);

    //then
    Assert.assertNotNull(loadedEnglishVec1);
    System.out.println(englishVec.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec1.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec2.wordsNearest("home", 15));
    System.out.println(loadedEnglishVec3.wordsNearest("home", 15));
}
 
开发者ID:madeleine789,项目名称:dl4j-apr,代码行数:42,代码来源:Pan15Word2VecTest.java

示例12: testRunWord2Vec

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testRunWord2Vec() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());


    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
                    //.negativeSample(10)
                    .epochs(1).windowSize(5).allowParallelTokenization(true)
                    .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    assertEquals(new ArrayList<String>(), vec.getStopWords());
    vec.fit();
    File tempFile = File.createTempFile("temp", "temp");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
    Collection<String> lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertEquals(10, lst.size());

    double sim = vec.similarity("day", "night");
    log.info("Day/night similarity: " + sim);

    assertTrue(sim < 1.0);
    assertTrue(sim > 0.4);


    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    assertFalse(lst.contains(null));


    lst = vec.wordsNearest("day", 10);
    //log.info(Arrays.toString(lst.toArray()));
    printWords("day", lst, vec);

    assertTrue(lst.contains("week"));
    assertTrue(lst.contains("night"));
    assertTrue(lst.contains("year"));

    new File("cache.ser").delete();

    ArrayList<String> labels = new ArrayList<>();
    labels.add("day");
    labels.add("night");
    labels.add("week");

    INDArray matrix = vec.getWordVectors(labels);
    assertEquals(matrix.getRow(0), vec.getWordVectorMatrix("day"));
    assertEquals(matrix.getRow(1), vec.getWordVectorMatrix("night"));
    assertEquals(matrix.getRow(2), vec.getWordVectorMatrix("week"));

    WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:65,代码来源:Word2VecTests.java

示例13: testW2VnegativeOnRestore

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testW2VnegativeOnRestore() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());


    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1)
                    .windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();


    assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());

    log.info("Fit 1");
    vec.fit();

    File tmpFile = File.createTempFile("temp", "file");
    tmpFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tmpFile);

    iter.reset();

    Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);

    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
    assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());

    log.info("Fit 2");
    restoredVec.fit();


    iter.reset();
    restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
    restoredVec.setTokenizerFactory(t);
    restoredVec.setSentenceIterator(iter);

    assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
    assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);

    log.info("Fit 3");
    restoredVec.fit();
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:52,代码来源:Word2VecTests.java

示例14: testUnknown1

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testUnknown1() throws Exception {
    // Strip white space before and after for each line
    SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
                    .unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
                    .sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
                    .useHierarchicSoftmax(true).allowParallelTokenization(true)
                    .modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    assertTrue(vec.hasWord("PEWPEW"));
    assertTrue(vec.getVocab().containsWord("PEWPEW"));

    INDArray unk = vec.getWordVectorMatrix("PEWPEW");
    assertNotEquals(null, unk);

    File tempFile = File.createTempFile("temp", "file");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWord2VecModel(vec, tempFile);

    log.info("Original configuration: {}", vec.getConfiguration());

    Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);

    assertTrue(restored.hasWord("PEWPEW"));
    assertTrue(restored.getVocab().containsWord("PEWPEW"));
    INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");

    assertEquals(unk, unk_restored);



    // now we're getting some junk word
    INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
    INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");

    log.info("Restored configuration: {}", restored.getConfiguration());

    assertEquals(unk, random);
    assertEquals(unk, randomRestored);
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:50,代码来源:Word2VecTests.java

示例15: testIndexPersistence

import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; //导入方法依赖的package包/类
@Test
public void testIndexPersistence() throws Exception {
    File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
    SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
    // Split on white spaces in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
                    .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
                    .iterate(iter).tokenizerFactory(t).build();

    vec.fit();

    VocabCache orig = vec.getVocab();

    File tempFile = File.createTempFile("temp", "w2v");
    tempFile.deleteOnExit();

    WordVectorSerializer.writeWordVectors(vec, tempFile);

    WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

    VocabCache rest = vec2.vocab();

    assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

    for (VocabWord word : vec.getVocab().vocabWords()) {
        INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
        INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());

        assertEquals(array1, array2);
    }
}
 
开发者ID:deeplearning4j,项目名称:deeplearning4j,代码行数:35,代码来源:WordVectorSerializerTest.java


注:本文中的org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.setTokenPreProcessor方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。