This article collects typical usage examples of the Java method org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory.setTokenPreProcessor. If you are wondering what TokenizerFactory.setTokenPreProcessor does, how to call it, or want to see real usage examples, the curated code samples below may help. You can also explore the enclosing class org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory for further details.
The following presents 15 code examples of TokenizerFactory.setTokenPreProcessor, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Java code examples.
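Before the examples, here is a minimal, self-contained sketch of the pattern they all share: create a DefaultTokenizerFactory, attach a CommonPreprocessor through setTokenPreProcessor, and tokenize a sentence. The sample sentence and the expected output in the comments are illustrative only.
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

public class TokenPreProcessorSketch {
    public static void main(String[] args) {
        // Split input on whitespace to get raw tokens
        TokenizerFactory factory = new DefaultTokenizerFactory();
        // Every token produced by this factory is now lower-cased and stripped of punctuation
        factory.setTokenPreProcessor(new CommonPreprocessor());

        Tokenizer tokenizer = factory.create("Hello, Deep Learning World!");
        while (tokenizer.hasMoreTokens()) {
            System.out.println(tokenizer.nextToken()); // hello / deep / learning / world
        }
    }
}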
Example 1: testWord2VecPlot
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testWord2VecPlot() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025)
.layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10)
.tokenizerFactory(t).build();
vec.fit();
// UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo();
// vec.getLookupTable().plotVocab(100, connectionInfo);
// Keep the JVM alive so the (commented-out) UI plotting above could be inspected manually;
// the test is deliberately left failing as a reminder that plotting is not implemented here.
Thread.sleep(10000000000L);
fail("Not implemented");
}
Example 2: main
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
// Gets Path to Text file
String filePath = "c:/raw_sentences.txt";
log.info("Load & Vectorize Sentences....");
// Iterate over sentences in the file using UIMA-based sentence segmentation
SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
InMemoryLookupCache cache = new InMemoryLookupCache();
WeightLookupTable table = new InMemoryLookupTable.Builder()
.vectorLength(100)
.useAdaGrad(false)
.cache(cache)
.lr(0.025f).build();
log.info("Building model....");
Word2Vec vec = new Word2Vec.Builder()
.minWordFrequency(5).iterations(1)
.layerSize(100).lookupTable(table)
.stopWords(new ArrayList<String>())
.vocabCache(cache).seed(42)
.windowSize(5).iterate(iter).tokenizerFactory(t).build();
log.info("Fitting Word2Vec model....");
vec.fit();
log.info("Writing word vectors to text file....");
// Write word
WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");
log.info("Closest Words:");
Collection<String> lst = vec.wordsNearest("man", 5);
System.out.println(lst);
double cosSim = vec.similarity("cruise", "voyage");
System.out.println(cosSim);
}
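A short follow-up to Example 2: the word2vec.txt file written above can be read back with the same serializer, using the loadTxtVectors call that also appears in Example 15 below. A minimal sketch:
WordVectors loaded = WordVectorSerializer.loadTxtVectors(new File("word2vec.txt"));
System.out.println(loaded.wordsNearest("man", 5)); // should roughly match the list printed during training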
Example 3: testWord2VecGoogleModelUptraining
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Ignore
@Test
public void testWord2VecGoogleModelUptraining() throws Exception {
long time1 = System.currentTimeMillis();
Word2Vec vec = WordVectorSerializer.readWord2VecModel(
new File("C:\\Users\\raver\\Downloads\\GoogleNews-vectors-negative300.bin.gz"), false);
long time2 = System.currentTimeMillis();
log.info("Model loaded in {} msec", time2 - time1);
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); // inputFile is a field of the enclosing test class (not shown in this snippet)
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
vec.setTokenizerFactory(t);
vec.setSentenceIterator(iter);
vec.getConfiguration().setUseHierarchicSoftmax(false);
vec.getConfiguration().setNegative(5.0);
vec.setElementsLearningAlgorithm(new CBOW<VocabWord>());
vec.fit();
}
Example 4: getTokenizerFactory
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
protected static TokenizerFactory getTokenizerFactory(VectorsConfiguration configuration) {
if (configuration == null)
return null;
if (configuration.getTokenizerFactory() != null && !configuration.getTokenizerFactory().isEmpty()) {
try {
TokenizerFactory factory =
(TokenizerFactory) Class.forName(configuration.getTokenizerFactory()).newInstance();
if (configuration.getTokenPreProcessor() != null && !configuration.getTokenPreProcessor().isEmpty()) {
TokenPreProcess preProcessor =
(TokenPreProcess) Class.forName(configuration.getTokenPreProcessor()).newInstance();
factory.setTokenPreProcessor(preProcessor);
}
return factory;
} catch (Exception e) {
log.error("Can't instantiate saved TokenizerFactory: {}", configuration.getTokenizerFactory());
}
}
return null;
}
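A hypothetical way Example 4 might be exercised, assuming the call is made from the same class and that VectorsConfiguration exposes String setters for the tokenizer factory and preprocessor class names (mirroring the getters used above):
VectorsConfiguration configuration = new VectorsConfiguration();
// Class names are stored as Strings and instantiated reflectively by getTokenizerFactory()
configuration.setTokenizerFactory(DefaultTokenizerFactory.class.getName());
configuration.setTokenPreProcessor(CommonPreprocessor.class.getName());

TokenizerFactory restored = getTokenizerFactory(configuration);
// restored is equivalent to a DefaultTokenizerFactory with a CommonPreprocessor attached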
Example 5: testGoogleModelForInference
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
WordVectors googleVectors = WordVectorSerializer.loadGoogleModelNonNormalized(
new File("/ext/GoogleNews-vectors-negative300.bin.gz"), true, false);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
ParagraphVectors pv =
new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
.trainWordVectors(false).useExistingWordVectors(googleVectors)
.negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();
INDArray vec1 = pv.inferVector("This text is pretty awesome");
INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");
log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
Example 6: loadParagraphVectors
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
private static ParagraphVectors loadParagraphVectors() {
ParagraphVectors paragraphVectors = null;
try {
paragraphVectors = WordVectorSerializer.readParagraphVectors(PARAGRAPHVECTORMODELPATH);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
paragraphVectors.setTokenizerFactory(t);
paragraphVectors.getConfiguration().setIterations(10); // number of inference iterations; lower values speed up inference at some cost in accuracy
} catch (IOException e) {
e.printStackTrace();
}
return paragraphVectors;
}
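A possible follow-up once loadParagraphVectors() returns (assuming it is called from the same class): because the tokenizer factory and preprocessor have been set, inferVector can be called on unseen text. The sample sentence is illustrative only.
ParagraphVectors pv = loadParagraphVectors();
// Inference relies on the TokenizerFactory configured inside loadParagraphVectors()
INDArray inferred = pv.inferVector("This is a previously unseen document");
System.out.println("Inferred vector length: " + inferred.length());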
Example 7: trainParagraghVecModel
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
public void trainParagraghVecModel(String locationToSave) throws FileNotFoundException {
ClassPathResource resource = new ClassPathResource("/paragraphVectors/paragraphVectorTraining.txt");
File file = resource.getFile();
SentenceIterator iter = new BasicLineIterator(file);
AbstractCache<VocabWord> cache = new AbstractCache<VocabWord>();
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
/*
If you don't have a LabelAwareIterator handy, you can use the synchronized labels generator below;
it will be used to label each document/sequence/line with its own label.
If you do have a LabelAwareIterator ready, you can provide it instead to use your in-house labels.
*/
LabelsSource source = new LabelsSource("DOC_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(100)
.epochs(1)
.layerSize(50)
.learningRate(0.02)
.labelsSource(source)
.windowSize(5)
.iterate(iter)
.trainWordVectors(true)
.vocabCache(cache)
.tokenizerFactory(t)
.sampling(0)
.build();
vec.fit();
WordVectorSerializer.writeParagraphVectors(vec, locationToSave);
}
Example 8: testSparkW2VonBiggerCorpus
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Ignore
@Test
public void testSparkW2VonBiggerCorpus() throws Exception {
SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest")
.set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g")
.set("spark.executor.memory", "8g");
// Set SparkContext
JavaSparkContext sc = new JavaSparkContext(sparkConf);
// Path of data part-00000
//String dataPath = new ClassPathResource("/big/raw_sentences.txt").getFile().getAbsolutePath();
// String dataPath = "/ext/Temp/SampleRussianCorpus.txt";
String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath();
// Read in data
JavaRDD<String> corpus = sc.textFile(dataPath);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new LowCasePreProcessor());
Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1)
// .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory")
// .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor")
// .setRemoveStop(false)
.tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5)
.learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5)
.useUnknown(true).build();
word2Vec.train(corpus);
sc.stop();
WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt");
}
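Note that Example 8 attaches a LowCasePreProcessor instead of the CommonPreprocessor used elsewhere. As a rough sketch of the difference (based on their usual behaviour: LowCasePreProcessor only lower-cases, while CommonPreprocessor also strips punctuation; the sample token and outputs are indicative):
TokenPreProcess lowerOnly = new LowCasePreProcessor();
TokenPreProcess common = new CommonPreprocessor();

System.out.println(lowerOnly.preProcess("Word,")); // "word," - case folded, punctuation kept
System.out.println(common.preProcess("Word,"));    // "word"  - case folded, punctuation stripped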
Example 9: testWord2VecCBOW
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testWord2VecCBOW() throws Exception {
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150)
.seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5)
.modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8)
.tokenizerFactory(t).elementsLearningAlgorithm(new CBOW<VocabWord>()).build();
vec.fit();
Collection<String> lst = vec.wordsNearest("day", 10);
log.info(Arrays.toString(lst.toArray()));
// assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertTrue(sim > 0.65f);
}
Example 10: main
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
public static void main(String[] args) throws Exception {
ClassPathResource srcFile = new ClassPathResource("/raw_sentences.txt");
File file = srcFile.getFile();
SentenceIterator iter = new BasicLineIterator(file);
TokenizerFactory tFact = new DefaultTokenizerFactory();
tFact.setTokenPreProcessor(new CommonPreprocessor());
LabelsSource labelFormat = new LabelsSource("LINE_");
ParagraphVectors vec = new ParagraphVectors.Builder()
.minWordFrequency(1)
.iterations(5)
.epochs(1)
.layerSize(100)
.learningRate(0.025)
.labelsSource(labelFormat)
.windowSize(5)
.iterate(iter)
.trainWordVectors(false)
.tokenizerFactory(tFact)
.sampling(0)
.build();
vec.fit();
double similar1 = vec.similarity("LINE_9835", "LINE_12492");
out.println("Comparing lines 9836 & 12493 ('This is my house .'/'This is my world .') Similarity = " + similar1);
double similar2 = vec.similarity("LINE_3720", "LINE_16392");
out.println("Comparing lines 3721 & 16393 ('This is my way .'/'This is my work .') Similarity = " + similar2);
double similar3 = vec.similarity("LINE_6347", "LINE_3720");
out.println("Comparing lines 6348 & 3721 ('This is my case .'/'This is my way .') Similarity = " + similar3);
double dissimilar1 = vec.similarity("LINE_3720", "LINE_9852");
out.println("Comparing lines 3721 & 9853 ('This is my way .'/'We now have one .') Similarity = " + dissimilar1);
double dissimilar2 = vec.similarity("LINE_3720", "LINE_3719");
out.println("Comparing lines 3721 & 3720 ('This is my way .'/'At first he says no .') Similarity = " + dissimilar2);
}
Developer: PacktPublishing, Project: Machine-Learning-End-to-Endguide-for-Java-developers, Lines of code: 46, Source file: ClassifyBySimilarity.java
Example 11: shouldLoadAndCreateSameWord2Vec
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void shouldLoadAndCreateSameWord2Vec() {
//given
Pan15Parser parser = new Pan15Parser();
HashMap<String, Pan15Author> english = parser.parseCSVCorpus().get(Language.ENGLISH);
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
List<String> englishSentences = english.values().stream().map(Author::getDocuments)
.collect(Collectors.toList())
.stream().flatMap(List::stream).collect(Collectors.toList());
SentenceIterator englishIter = new CollectionSentenceIterator(new Pan15SentencePreProcessor(), englishSentences);
// when
Word2Vec englishVec = new Word2Vec.Builder()
.minWordFrequency(6)
.iterations(15)
.layerSize(250)
.seed(42)
.windowSize(5)
.iterate(englishIter)
.tokenizerFactory(t)
.build();
englishVec.fit();
Word2Vec loadedEnglishVec1 = new Pan15Word2Vec(new SkipGram<>()).readModelFromFile(Language.ENGLISH);
Word2Vec loadedEnglishVec2 = new Pan15Word2Vec(new CBOW<>()).readModelFromFile(Language.ENGLISH);
Word2Vec loadedEnglishVec3 = new Pan15Word2Vec(new GloVe<>()).readModelFromFile(Language.ENGLISH);
loadedEnglishVec1.setTokenizerFactory(t);
loadedEnglishVec1.setSentenceIterator(englishIter);
loadedEnglishVec2.setTokenizerFactory(t);
loadedEnglishVec2.setSentenceIterator(englishIter);
loadedEnglishVec3.setTokenizerFactory(t);
loadedEnglishVec3.setSentenceIterator(englishIter);
//then
Assert.assertNotNull(loadedEnglishVec1);
System.out.println(englishVec.wordsNearest("home", 15));
System.out.println(loadedEnglishVec1.wordsNearest("home", 15));
System.out.println(loadedEnglishVec2.wordsNearest("home", 15));
System.out.println(loadedEnglishVec3.wordsNearest("home", 15));
}
Example 12: testRunWord2Vec
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testRunWord2Vec() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>())
//.negativeSample(10)
.epochs(1).windowSize(5).allowParallelTokenization(true)
.modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
assertEquals(new ArrayList<String>(), vec.getStopWords());
vec.fit();
File tempFile = File.createTempFile("temp", "temp");
tempFile.deleteOnExit();
WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath());
Collection<String> lst = vec.wordsNearest("day", 10);
//log.info(Arrays.toString(lst.toArray()));
printWords("day", lst, vec);
assertEquals(10, lst.size());
double sim = vec.similarity("day", "night");
log.info("Day/night similarity: " + sim);
assertTrue(sim < 1.0);
assertTrue(sim > 0.4);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
assertFalse(lst.contains(null));
lst = vec.wordsNearest("day", 10);
//log.info(Arrays.toString(lst.toArray()));
printWords("day", lst, vec);
assertTrue(lst.contains("week"));
assertTrue(lst.contains("night"));
assertTrue(lst.contains("year"));
new File("cache.ser").delete();
ArrayList<String> labels = new ArrayList<>();
labels.add("day");
labels.add("night");
labels.add("week");
INDArray matrix = vec.getWordVectors(labels);
assertEquals(matrix.getRow(0), vec.getWordVectorMatrix("day"));
assertEquals(matrix.getRow(1), vec.getWordVectorMatrix("night"));
assertEquals(matrix.getRow(2), vec.getWordVectorMatrix("week"));
WordVectorSerializer.writeWordVectors(vec, pathToWriteto);
}
Example 13: testW2VnegativeOnRestore
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testW2VnegativeOnRestore() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(3).batchSize(64).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()).negativeSample(10).epochs(1)
.windowSize(5).useHierarchicSoftmax(false).allowParallelTokenization(true)
.modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
assertEquals(false, vec.getConfiguration().isUseHierarchicSoftmax());
log.info("Fit 1");
vec.fit();
File tmpFile = File.createTempFile("temp", "file");
tmpFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tmpFile);
iter.reset();
Word2Vec restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, true);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof FlatModelUtils);
assertTrue(restoredVec.getConfiguration().isAllowParallelTokenization());
log.info("Fit 2");
restoredVec.fit();
iter.reset();
restoredVec = WordVectorSerializer.readWord2VecModel(tmpFile, false);
restoredVec.setTokenizerFactory(t);
restoredVec.setSentenceIterator(iter);
assertEquals(false, restoredVec.getConfiguration().isUseHierarchicSoftmax());
assertTrue(restoredVec.getModelUtils() instanceof BasicModelUtils);
log.info("Fit 3");
restoredVec.fit();
}
Example 14: testUnknown1
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testUnknown1() throws Exception {
// Strip white space before and after for each line
SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10).useUnknown(true)
.unknownElement(new VocabWord(1.0, "PEWPEW")).iterations(1).layerSize(100)
.stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001)
.sampling(0).elementsLearningAlgorithm(new CBOW<VocabWord>()).epochs(1).windowSize(5)
.useHierarchicSoftmax(true).allowParallelTokenization(true)
.modelUtils(new FlatModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build();
vec.fit();
assertTrue(vec.hasWord("PEWPEW"));
assertTrue(vec.getVocab().containsWord("PEWPEW"));
INDArray unk = vec.getWordVectorMatrix("PEWPEW");
assertNotEquals(null, unk);
File tempFile = File.createTempFile("temp", "file");
tempFile.deleteOnExit();
WordVectorSerializer.writeWord2VecModel(vec, tempFile);
log.info("Original configuration: {}", vec.getConfiguration());
Word2Vec restored = WordVectorSerializer.readWord2VecModel(tempFile);
assertTrue(restored.hasWord("PEWPEW"));
assertTrue(restored.getVocab().containsWord("PEWPEW"));
INDArray unk_restored = restored.getWordVectorMatrix("PEWPEW");
assertEquals(unk, unk_restored);
// now we're getting some junk word
INDArray random = vec.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
INDArray randomRestored = restored.getWordVectorMatrix("hhsd7d7sdnnmxc_SDsda");
log.info("Restored configuration: {}", restored.getConfiguration());
assertEquals(unk, random);
assertEquals(unk, randomRestored);
}
Example 15: testIndexPersistence
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory; // import the package/class the method depends on
@Test
public void testIndexPersistence() throws Exception {
File inputFile = new ClassPathResource("/big/raw_sentences.txt").getFile();
SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());
// Split on white spaces in the line to get words
TokenizerFactory t = new DefaultTokenizerFactory();
t.setTokenPreProcessor(new CommonPreprocessor());
Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
.stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
.iterate(iter).tokenizerFactory(t).build();
vec.fit();
VocabCache orig = vec.getVocab();
File tempFile = File.createTempFile("temp", "w2v");
tempFile.deleteOnExit();
WordVectorSerializer.writeWordVectors(vec, tempFile);
WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);
VocabCache rest = vec2.vocab();
assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());
for (VocabWord word : vec.getVocab().vocabWords()) {
INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());
assertEquals(array1, array2);
}
}
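All of the examples above pass one of the stock preprocessors to setTokenPreProcessor. When none of them fits, the TokenPreProcess interface is small enough to implement directly. A minimal sketch follows; the class name and the trim-then-lower-case rule are invented for illustration.
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

// Hypothetical preprocessor: trims surrounding whitespace and lower-cases every token
public class TrimLowerCasePreProcessor implements TokenPreProcess {
    @Override
    public String preProcess(String token) {
        return token == null ? null : token.trim().toLowerCase();
    }

    public static void main(String[] args) {
        // Attach the custom preprocessor exactly as the examples above attach CommonPreprocessor
        TokenizerFactory factory = new DefaultTokenizerFactory();
        factory.setTokenPreProcessor(new TrimLowerCasePreProcessor());
        System.out.println(factory.create("  Hello  WORLD  ").nextToken()); // "hello"
    }
}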