本文整理汇总了 Java 中 com.optimaize.langdetect.text.CommonTextObjectFactories 类的典型用法代码示例。如果您正苦于以下问题：Java CommonTextObjectFactories 类的具体用法？Java CommonTextObjectFactories 怎么用？Java CommonTextObjectFactories 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
CommonTextObjectFactories类属于com.optimaize.langdetect.text包,在下文中一共展示了CommonTextObjectFactories类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: main
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Command-line entry point: builds a language profile from a plain-text file and writes it
 * into the current working directory.
 *
 * <p>Exactly three arguments are expected. When the argument count differs, a usage line is
 * printed and the JVM exits with status 1.
 *
 * @param args {@code <languageCode> <plainTextFile> <minimalFrequency>}
 * @throws IOException if the input file cannot be read or the profile cannot be written
 * @throws NumberFormatException if {@code <minimalFrequency>} is not a valid integer
 */
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        System.out.println("Usage: " + LanguageDetectionTrainer.class.getName() + " <languageCode> <plainTextFile> <minimalFrequency>");
        System.exit(1);
    }

    String langCode = args[0];
    String fileName = args[1];
    int minimalFrequency = Integer.parseInt(args[2]);

    // Read the whole file, making sure the reader is released even if reading fails.
    // NOTE(review): FileReader decodes with the platform default charset on older JDKs;
    // consider an explicit charset if training files are always UTF-8.
    String text;
    try (FileReader reader = new FileReader(fileName)) {
        text = IOUtils.toString(reader);
    }

    TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexingCleanText();
    TextObject inputText = textObjectFactory.create().append(text);

    LanguageProfile languageProfile = new LanguageProfileBuilder(langCode)
            .ngramExtractor(NgramExtractors.standard())
            .minimalFrequency(minimalFrequency)
            .addText(inputText)
            .build();

    // The profile is written into the directory named by the "user.dir" system property.
    File outputDir = new File(System.getProperty("user.dir"));
    new LanguageProfileWriter().writeToDirectory(languageProfile, outputDir);
    System.out.println("Language profile written to " + new File(outputDir, langCode).getAbsolutePath());
}
示例2: initLanguageDetector
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Builds the language detector from all built-in language profiles and selects the text
 * object factory intended for large text.
 *
 * @throws IOException if the built-in language profiles cannot be read
 */
private void initLanguageDetector() throws IOException {
    List<LanguageProfile> builtInProfiles = new LanguageProfileReader().readAllBuiltIn();

    LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard());
    languageDetector = detectorBuilder.withProfiles(builtInProfiles).build();

    textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
}
示例3: getLanguageFrom
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Detects the language of {@code source} and maps the detected language code onto a
 * {@link Language} constant by comparing it with each constant's abbreviation.
 *
 * @param source the text whose language should be detected
 * @return the first {@link Language} whose abbreviation matches the detected code
 *         (ignoring case), or {@code null} when nothing was detected or nothing matches
 */
public static Language getLanguageFrom(String source) {
    TextObject textObject = CommonTextObjectFactories.forDetectingOnLargeText().forText(source);
    Optional<LdLocale> detected = getLanguageDetector().detect(textObject);
    if (!detected.isPresent()) {
        return null;
    }

    String detectedCode = detected.get().getLanguage();
    for (Language candidate : Language.values()) {
        // equalsIgnoreCase is locale-independent; the no-arg toLowerCase() used before
        // follows the default locale and can mis-map letters such as 'I' (e.g. Turkish).
        if (detectedCode.equalsIgnoreCase(candidate.getAbrev())) {
            return candidate;
        }
    }
    return null;
}
示例4: QALanguageDetector
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
public QALanguageDetector() throws IOException {
languageProfiles = new LanguageProfileReader().readAllBuiltIn();
//build language detector:
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.withProfiles(languageProfiles)
.build();
//create a text object factory
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
}
示例5: doInitialize
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Initializes the language detector from all built-in profiles and selects the text object
 * factory intended for large text.
 *
 * @param aContext the UIMA context supplied by the framework (not read by this method)
 * @throws ResourceInitializationException wrapping the {@link IOException} raised when the
 *         built-in profiles cannot be read
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    try {
        List<LanguageProfile> builtInProfiles = new LanguageProfileReader().readAllBuiltIn();

        LanguageDetectorBuilder detectorBuilder = LanguageDetectorBuilder.create(NgramExtractors.standard());
        languageDetector = detectorBuilder.withProfiles(builtInProfiles).build();

        textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
    } catch (IOException cause) {
        throw new ResourceInitializationException(cause);
    }
}
示例6: testLanguageDetectorErrorRate
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Runs the detector over every line returned by {@code EuroParlUtils.readLines()} and prints
 * the percentage of lines whose top-ranked language differs from the expected one.
 *
 * <p>Each line is split at the first tab: the part before it is the expected language, the
 * part after it is the text to classify.
 *
 * @throws IOException if profiles or test lines cannot be read
 */
@Test
public void testLanguageDetectorErrorRate() throws IOException {
    // Restrict the detector to the target languages.
    List<LanguageProfile> profiles = new LanguageProfileReader().read(Arrays.asList(TARGET_LANGUAGES_FOR_YALDER));
    com.optimaize.langdetect.LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .withProfiles(profiles)
            .build();

    // Short-clean-text factory; forDetectingOnLargeText() is the alternative.
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingShortCleanText();

    SummaryStatistics stats = new SummaryStatistics();
    int hits = 0;
    int misses = 0;
    for (String line : EuroParlUtils.readLines()) {
        String[] fields = line.split("\t", 2);
        String expected = fields[0];
        List<DetectedLanguage> ranked = detector.getProbabilities(factory.forText(fields[1]));

        // A hit requires at least one candidate whose top entry matches the expected code.
        boolean correct = !ranked.isEmpty()
                && ranked.get(0).getLocale().getLanguage().equals(expected);
        if (correct) {
            hits++;
        } else {
            misses++;
        }
    }

    double missPercentage = 100.0 * (double) misses / (double) (misses + hits);
    stats.addValue(missPercentage);

    String totalLine = String.format("Total miss ratio = %.2f%%", missPercentage);
    System.out.println(totalLine);
    String summaryLine = String.format("Min = %.2f%%, max = %.2f%%, mean = %.2f%%, std deviation = %f",
            stats.getMin(), stats.getMax(), stats.getMean(), stats.getStandardDeviation());
    System.out.println(summaryLine);
}
示例7: detectLang
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Runs language detection on every file named in {@code arglist} and prints the file name
 * followed by the detected language probabilities (the {@code --detectlang} option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * @throws IOException if the detector cannot be created or a file cannot be opened or read
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();

    for (String path : arglist) {
        // Each file is decoded as UTF-8 and closed as soon as it has been consumed.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path), "utf-8"))) {
            TextObject content = factory.create().append(reader);
            List<DetectedLanguage> ranked = detector.getProbabilities(content);
            System.out.println(path + ":" + ranked);
        }
    }
}
示例8: detect
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Detects the language of {@code text} and returns the first detected candidate that
 * {@code FoxParameter.Langs.fromString} can map.
 *
 * @param text the text to analyse
 * @return the first mappable language in the order returned by the detector, or
 *         {@code null} if no candidate can be mapped
 */
public FoxParameter.Langs detect(final String text) {
    final TextObject wrapped = CommonTextObjectFactories.forDetectingOnLargeText().forText(text);

    for (final DetectedLanguage candidate : languageDetector.getProbabilities(wrapped)) {
        final FoxParameter.Langs mapped = FoxParameter.Langs.fromString(candidate.getLanguage());
        if (mapped != null) {
            return mapped;
        }
    }
    return null;
}
示例9: LanguageIdentifier
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
public LanguageIdentifier() {
try {
List<LanguageProfile> profiles = loadProfiles(getLanguageCodes());
languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
.minimalConfidence(MINIMAL_CONFIDENCE)
.withProfiles(profiles)
.build();
textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
} catch (IOException e) {
throw new RuntimeException("Could not set up language identifier", e);
}
}
示例10: lazyInitialize
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Initializes the language detector on first use; does nothing when it already exists.
 *
 * <p>Loads the built-in profiles, then adds the custom profiles found under the
 * {@code /languageProfiles} classpath resource. Loading errors are logged rather than thrown,
 * and the detector is built afterwards from whatever {@code languageProfiles} holds.
 */
private void lazyInitialize() {
    if (languageDetector != null) {
        return;
    }
    logger.debug("Initializing language detector...");

    try {
        // read built-in profiles
        this.languageProfiles = new LanguageProfileReader().readAllBuiltIn();

        // read custom profiles
        List<String> profileFileNames = new ArrayList<>();
        final Map<String, String> env = new HashMap<>();
        final URI profilesUri = this.getClass().getResource("/languageProfiles").toURI();
        final String[] array = profilesUri.toString().split("!");

        // When the URI contains '!' a dedicated file system is opened for the part before it
        // (prevents FileSystemNotFoundException). try-with-resources accepts a null resource,
        // so the plain-directory case simply skips the close.
        try (FileSystem fs = array.length > 1 ? FileSystems.newFileSystem(URI.create(array[0]), env) : null) {
            Path path = fs != null ? fs.getPath(array[1]) : Paths.get(profilesUri);

            // Files.walk holds open directory handles; close the stream once consumed.
            try (java.util.stream.Stream<Path> files = Files.walk(path)) {
                files.filter(file -> !Files.isDirectory(file))
                        .forEach(file -> profileFileNames.add(file.getFileName().toString()));
            }
        }

        // according to the documentation LanguageProfileReader#readAll should not be used for files within the .jar.
        this.languageProfiles.addAll(new LanguageProfileReader().read("languageProfiles", profileFileNames));
    } catch (IOException | URISyntaxException e) {
        logger.error("Error loading language profiles", e);
    }

    // NOTE(review): if loading failed above, languageProfiles may be unset or only partially
    // filled at this point - confirm that building the detector anyway is intended.
    this.languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles).build();
    this.textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
    logger.debug("... language detector initialized");
}
示例11: checkIfNonEnglish
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Tries to determine the language of a post's text.
 *
 * <p>The text is obtained via {@code stripTags(stripBody(post))} with all runs of punctuation
 * removed. The optimaize detector is asked first; when it returns no language and the text is
 * longer than 50 characters, the Tika detector is used as a fallback.
 *
 * @param post the post to inspect
 * @return the language code from the optimaize detector; otherwise the non-empty language
 *         reported by Tika for texts longer than 50 characters; {@code "Gibberish"} for a
 *         text of exactly 50 characters for which {@code checkIfNoCodeBlock(post)} is true;
 *         {@code null} in every other case, including when an {@link IOException} occurs
 */
public static String checkIfNonEnglish(Post post) {
    String dataToCheck = stripTags(stripBody(post)).replaceAll("\\p{Punct}+", "");
    try {
        // NOTE(review): profiles and detector are rebuilt on every call; consider caching.
        List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
        com.optimaize.langdetect.LanguageDetector optimaizeDetector =
                LanguageDetectorBuilder.create(NgramExtractors.standard())
                        .withProfiles(languageProfiles)
                        .build();

        TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
        TextObject textObject = textObjectFactory.forText(dataToCheck);
        Optional<LdLocale> lang = optimaizeDetector.detect(textObject);
        if (lang.isPresent()) {
            return lang.get().getLanguage();
        }

        if (dataToCheck.length() > 50) {
            // Fallback: ask Tika, closing the writer even if appending fails.
            org.apache.tika.language.detect.LanguageDetector tikaDetector = new OptimaizeLangDetector().loadModels();
            String tikaLang;
            try (LanguageWriter writer = new LanguageWriter(tikaDetector)) {
                writer.append(dataToCheck);
                LanguageResult result = writer.getLanguage();
                tikaLang = result.getLanguage();
            }
            return tikaLang.isEmpty() ? null : tikaLang;
        }
        if (dataToCheck.length() < 50) {
            return null;
        }

        // NOTE(review): only a text of exactly 50 characters reaches this point; the
        // boundaries above (> 50 / < 50) look accidental - confirm the intended thresholds.
        if (checkIfNoCodeBlock(post)) {
            return "Gibberish";
        }
        return null;
    } catch (IOException e) {
        // Best effort: a failure to load models or profiles is reported and treated as "unknown".
        e.printStackTrace();
    }
    return null;
}
示例12: testLanguageDetectorPerformance
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Times ten detection passes over the lines from {@code EuroParlUtils.readLines()} and prints
 * each pass's duration and error rate, followed by the shortest duration.
 *
 * <p>Each line is split at the first tab: the part before it is the expected language, the
 * part after it is the text to classify.
 *
 * @throws IOException if profiles or test lines cannot be read
 */
@Test
public void testLanguageDetectorPerformance() throws IOException {
    // Turn off logging by language-detector.
    System.setProperty("logging.root.level", "INFO");
    Logger.getRootLogger().setLevel(Level.INFO);

    // Restrict the detector to the target languages.
    List<LanguageProfile> profiles = new LanguageProfileReader().read(Arrays.asList(TARGET_LANGUAGES_FOR_YALDER));
    com.optimaize.langdetect.LanguageDetector detector = LanguageDetectorBuilder.create(NgramExtractors.standard())
            .withProfiles(profiles)
            .build();

    // Short-clean-text factory; forDetectingOnLargeText() is the alternative.
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingShortCleanText();

    List<String> lines = EuroParlUtils.readLines();

    // Ten passes; the fastest one is reported at the end.
    long bestDuration = Long.MAX_VALUE;
    for (int run = 1; run <= 10; run++) {
        int hits = 0;
        int misses = 0;
        long startedAt = System.currentTimeMillis();

        for (String line : lines) {
            String[] fields = line.split("\t", 2);
            String expected = fields[0];
            List<DetectedLanguage> ranked = detector.getProbabilities(factory.forText(fields[1]));

            boolean correct = !ranked.isEmpty()
                    && ranked.get(0).getLocale().getLanguage().equals(expected);
            if (correct) {
                hits++;
            } else {
                misses++;
            }
        }

        long duration = System.currentTimeMillis() - startedAt;
        double errorRate = 100.0 * (double) misses / (double) (misses + hits);
        System.out.println(String.format("Run #%d duration = %dms", run, duration));
        System.out.println(String.format("Run #%d error rate = %f%%", run, errorRate));
        if (duration < bestDuration) {
            bestDuration = duration;
        }
    }

    System.out.println(String.format("Best duration = %dms", bestDuration));
}
示例13: shortCleanText
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Wraps {@code text} with the factory returned by
 * {@code CommonTextObjectFactories.forDetectingShortCleanText()}.
 *
 * @param text the text to wrap
 * @return the result of the factory's {@code forText} call, exposed as a {@link CharSequence}
 */
private CharSequence shortCleanText(CharSequence text) {
    CharSequence prepared = CommonTextObjectFactories.forDetectingShortCleanText().forText(text);
    return prepared;
}
示例14: largeText
import com.optimaize.langdetect.text.CommonTextObjectFactories; //导入依赖的package包/类
/**
 * Wraps {@code text} with the factory returned by
 * {@code CommonTextObjectFactories.forDetectingOnLargeText()}.
 *
 * @param text the text to wrap
 * @return the result of the factory's {@code forText} call, exposed as a {@link CharSequence}
 */
private CharSequence largeText(CharSequence text) {
    CharSequence prepared = CommonTextObjectFactories.forDetectingOnLargeText().forText(text);
    return prepared;
}