本文整理汇总了Java中com.optimaize.langdetect.text.TextObject类的典型用法代码示例。如果您正苦于以下问题:Java TextObject类的具体用法?Java TextObject怎么用?Java TextObject使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
TextObject类属于com.optimaize.langdetect.text包,在下文中一共展示了TextObject类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: generate
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Builds a language profile from the contents of a text file.
 * <p>
 * The file is decoded as UTF-8 and consumed line by line; a file whose name ends in
 * {@code .gz} is decompressed on the fly. Every line is wrapped in one leading and one
 * trailing space before it is turned into a {@link TextObject} and added to the profile.
 *
 * @param lang target language name.
 * @param textFile input text file.
 * @return the language profile populated from the file
 * @throws RuntimeException if reading the file fails; the original {@link IOException} is the cause
 */
public static LangProfile generate(String lang, File textFile) {
    LangProfile result = new LangProfile(lang);
    InputStream source = null;
    try {
        source = new BufferedInputStream(new FileInputStream(textFile));
        boolean gzipped = textFile.getName().endsWith(".gz");
        if (gzipped) {
            source = new GZIPInputStream(source);
        }
        InputStreamReader decoder = new InputStreamReader(source, Charset.forName("UTF-8"));
        BufferedReader lines = new BufferedReader(decoder);
        for (String current = lines.readLine(); current != null; current = lines.readLine()) {
            Util.addCharSequence(result, textObjectFactory.forText(" " + current + " "));
        }
    } catch (IOException e) {
        throw new RuntimeException("Can't open training database file '" + textFile.getName() + "'", e);
    } finally {
        // Closing the outermost stream also releases the wrapped file stream.
        IOUtils.closeQuietly(source);
    }
    return result;
}
示例2: partition
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Splits {@code inputSample} into a list of text objects.
 * <p>
 * When {@code breakWords} is false, the sample is cut at whitespace boundaries into pieces of
 * at most {@code inputSample.length() / (k - 1)} characters. When {@code breakWords} is true,
 * the sample is cut into consecutive fixed-length pieces of {@code k} characters.
 *
 * @return the pieces, each wrapped in a freshly created {@link TextObject}
 */
private List<TextObject> partition() {
    List<TextObject> result = new ArrayList<>(this.k);
    if (!breakWords) {
        // Clamp both values to at least 1: k == 1 would otherwise divide by zero, and a sample
        // shorter than k - 1 characters would yield maxLength == 0 and the invalid regex
        // quantifier {1,0}.
        int divisor = Math.max(1, this.k - 1);
        int maxLength = Math.max(1, this.inputSample.length() / divisor);
        // \G anchors every match at the end of the previous one, so matching stops at the first
        // position where no whitespace boundary (or end of input) lies within maxLength characters.
        Pattern p = Pattern.compile("\\G\\s*(.{1," + maxLength + "})(?=\\s|$)", Pattern.DOTALL);
        Matcher m = p.matcher(this.inputSample);
        while (m.find()) {
            result.add(textObjectFactory.create().append(m.group(1)));
        }
    } else {
        Splitter splitter = Splitter.fixedLength(this.k);
        for (String token : splitter.split(this.inputSample.toString())) {
            result.add(textObjectFactory.create().append(token));
        }
    }
    return result;
}
示例3: filter
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Records the detected language(s) of a parsed document in its metadata.
 * <p>
 * Does nothing when the document text is blank or the detector returns no candidates.
 * Otherwise the language code of every candidate whose probability is at least
 * {@code minProb} is added to the metadata under {@code mdKey}.
 *
 * @param url     key used to look up the document's parse data in {@code parse}
 * @param content not read by this method
 * @param doc     not read by this method
 * @param parse   parse result whose text is analysed and whose metadata is updated
 */
@Override
public void filter(String url, byte[] content, DocumentFragment doc,
        ParseResult parse) {
    String text = parse.get(url).getText();
    if (StringUtils.isBlank(text)) {
        return;
    }

    TextObject textObject = textObjectFactory.forText(text);
    // Detection and the metadata update both run while holding the detector's monitor.
    synchronized (languageDetector) {
        List<DetectedLanguage> candidates = languageDetector.getProbabilities(textObject);
        if (candidates == null || candidates.isEmpty()) {
            return;
        }
        for (DetectedLanguage candidate : candidates) {
            boolean confident = candidate.getProbability() >= minProb;
            if (!confident) {
                continue;
            }
            String code = candidate.getLocale().getLanguage();
            parse.get(url).getMetadata().addValue(mdKey, code);
        }
    }
}
示例4: main
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Command-line entry point: builds a language profile from a plain-text file and writes it
 * to the current working directory.
 * <p>
 * Expects exactly three arguments: the language code, the path of the plain-text training
 * file, and the minimal n-gram frequency. Prints a usage message and exits with status 1
 * when the argument count is wrong.
 *
 * @param args {@code <languageCode> <plainTextFile> <minimalFrequency>}
 * @throws IOException if the training file cannot be read or the profile cannot be written
 * @throws NumberFormatException if {@code <minimalFrequency>} is not an integer
 */
public static void main(String[] args) throws IOException {
    if (args.length != 3) {
        System.out.println("Usage: " + LanguageDetectionTrainer.class.getName() + " <languageCode> <plainTextFile> <minimalFrequency>");
        System.exit(1);
    }
    String langCode = args[0];
    String fileName = args[1];
    int minimalFrequency = Integer.parseInt(args[2]);

    // Close the reader deterministically; previously it was left open after reading.
    // NOTE(review): FileReader(String) decodes with the JVM's default charset - confirm the
    // training files are stored in that encoding.
    String text;
    try (FileReader reader = new FileReader(fileName)) {
        text = IOUtils.toString(reader);
    }

    TextObjectFactory textObjectFactory = CommonTextObjectFactories.forIndexingCleanText();
    TextObject inputText = textObjectFactory.create().append(text);
    LanguageProfile languageProfile = new LanguageProfileBuilder(langCode)
            .ngramExtractor(NgramExtractors.standard())
            .minimalFrequency(minimalFrequency)
            .addText(inputText)
            .build();

    File outputDir = new File(System.getProperty("user.dir")); // current dir
    new LanguageProfileWriter().writeToDirectory(languageProfile, outputDir);
    System.out.println("Language profile written to " + new File(outputDir, langCode).getAbsolutePath());
}
示例5: identifyLanguage
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Detects the language of the given text.
 *
 * @param text text to analyse
 * @return the detected language code, or the literal string {@code "Can't detect"} when the
 *         detector returns no result
 */
@Override
public String identifyLanguage(String text) {
    Optional<LdLocale> detected = languageDetector.detect(textObjectFactory.forText(text));
    if (!detected.isPresent()) {
        return "Can't detect";
    }
    return detected.get().getLanguage();
}
示例6: detectLanguage
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Detects the language of the given text.
 *
 * @param inputText text to analyse
 * @return the detected language code, or an empty {@code Optional} when the detector
 *         returns no result
 */
@Override
public Optional<String> detectLanguage(String inputText) {
    lazyInitialize();
    //query:
    TextObject textObject = this.textObjectFactory.forText(inputText);
    com.google.common.base.Optional<LdLocale> lang = this.languageDetector.detect(textObject);
    // Calling get() on an absent Guava Optional throws IllegalStateException, so translate
    // "absent" into an empty result instead.
    // NOTE(review): assumes the unqualified Optional is java.util.Optional (the Guava type is
    // spelled out in full above) - confirm against the file's imports.
    if (!lang.isPresent()) {
        return Optional.empty();
    }
    return Optional.of(lang.get().getLanguage());
}
示例7: detectVagueLanguage
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Returns the first entry of the detector's probability list as a language/probability pair.
 *
 * @param inputText text to analyse
 * @return the language code and probability of the first detected language
 * @throws IndexOutOfBoundsException if the detector returns an empty list
 */
@Override
public ProbabilisticLocale detectVagueLanguage(String inputText) {
    lazyInitialize();
    List<DetectedLanguage> ranked =
            this.languageDetector.getProbabilities(this.textObjectFactory.forText(inputText));
    DetectedLanguage first = ranked.get(0);
    String languageCode = first.getLocale().getLanguage();
    return new ProbabilisticLocale(languageCode, first.getProbability());
}
示例8: getLanguageFrom
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Maps the detected language of a text to the matching {@code Language} constant.
 * <p>
 * A constant matches when its abbreviation, lower-cased, equals the detected language code.
 *
 * @param source text to analyse
 * @return the matching {@code Language}, or {@code null} when no language is detected or no
 *         constant matches the detected code
 */
public static Language getLanguageFrom(String source){
    TextObject textObject = CommonTextObjectFactories.forDetectingOnLargeText().forText(source);
    Optional<LdLocale> lang = getLanguageDetector().detect(textObject);
    if (!lang.isPresent()) {
        return null;
    }
    String detectedCode = lang.get().getLanguage();
    for (Language lg : Language.values()) {
        // Lower-case with Locale.ROOT: the no-arg toLowerCase() follows the default locale and
        // would, for example, turn "I" into a dotless "ı" under a Turkish locale.
        if (detectedCode.equals(lg.getAbrev().toLowerCase(java.util.Locale.ROOT))) {
            return lg;
        }
    }
    return null;
}
示例9: doProcess
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Sets the document language of the CAS to the language detected in its document text.
 * The CAS is left untouched when the detector returns no result.
 *
 * @param aJCas CAS whose document text is analysed and whose document language may be set
 * @throws AnalysisEngineProcessException declared by the overridden method; not thrown directly here
 */
@Override
public void doProcess(JCas aJCas) throws AnalysisEngineProcessException {
    Optional<LdLocale> detected =
            languageDetector.detect(textObjectFactory.forText(aJCas.getDocumentText()));
    if (!detected.isPresent()) {
        return;
    }
    aJCas.setDocumentLanguage(detected.get().getLanguage());
}
示例10: testLanguageDetectorErrorRate
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Runs the detector over every line returned by {@code EuroParlUtils.readLines()} and prints
 * the share of lines whose top-ranked language differs from the expected one.
 * <p>
 * Each line is expected to be {@code <language>\t<text>}. The method only prints statistics;
 * it makes no assertions.
 */
@Test
public void testLanguageDetectorErrorRate() throws IOException {
    // Detector restricted to the target languages.
    List<LanguageProfile> profiles = new LanguageProfileReader().read(Arrays.asList(TARGET_LANGUAGES_FOR_YALDER));
    com.optimaize.langdetect.LanguageDetector detector =
            LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .withProfiles(profiles)
                    .build();

    // CommonTextObjectFactories.forDetectingOnLargeText() is the alternative factory to try here.
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingShortCleanText();

    SummaryStatistics stats = new SummaryStatistics();
    int hits = 0;
    int misses = 0;
    for (String row : EuroParlUtils.readLines()) {
        String[] fields = row.split("\t", 2);
        String expected = fields[0];
        List<DetectedLanguage> ranked = detector.getProbabilities(factory.forText(fields[1]));
        boolean hit = !ranked.isEmpty() && ranked.get(0).getLocale().getLanguage().equals(expected);
        if (hit) {
            hits++;
        } else {
            misses++;
        }
    }

    double missPercentage = 100.0 * (double) misses / (double) (misses + hits);
    stats.addValue(missPercentage);

    String total = String.format("Total miss ratio = %.2f%%", missPercentage);
    System.out.println(total);
    String summary = String.format("Min = %.2f%%, max = %.2f%%, mean = %.2f%%, std deviation = %f",
            stats.getMin(), stats.getMax(), stats.getMean(), stats.getStandardDeviation());
    System.out.println(summary);
}
示例11: detectLang
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Language detection test for each file (--detectlang option).
 *
 * <pre>
 * usage: --detectlang -d [profile directory] -a [alpha] -s [seed] [test file(s)]
 * </pre>
 *
 * For every file name in {@code arglist}, the file is read as UTF-8 and one line of the form
 * {@code <filename>:<probabilities>} is printed to standard output.
 *
 * @throws IOException if a file cannot be opened or read
 */
public void detectLang() throws IOException {
    LanguageDetector detector = makeDetector();
    TextObjectFactory factory = CommonTextObjectFactories.forDetectingOnLargeText();
    for (String filename : arglist) {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename), "utf-8"))) {
            TextObject fileText = factory.create().append(reader);
            List<DetectedLanguage> ranked = detector.getProbabilities(fileText);
            System.out.println(filename + ":" + ranked);
        }
    }
}
示例12: detect
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Detects the language of a text and maps it to a {@code FoxParameter.Langs} value.
 * <p>
 * The detector's candidates are tried in the order returned; the first one that
 * {@code FoxParameter.Langs.fromString} maps to a non-null value wins.
 *
 * @param text text to analyse
 * @return the first mappable language, or {@code null} when no candidate can be mapped
 */
public FoxParameter.Langs detect(final String text) {
    final TextObject textObject = CommonTextObjectFactories.forDetectingOnLargeText().forText(text);
    for (final DetectedLanguage candidate : languageDetector.getProbabilities(textObject)) {
        final FoxParameter.Langs mapped = FoxParameter.Langs.fromString(candidate.getLanguage());
        if (mapped != null) {
            return mapped;
        }
    }
    return null;
}
示例13: detectLanguageCode
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Detects the language code of the given text.
 *
 * @param text text to analyse
 * @return language or {@code null} if language could not be identified
 */
@Nullable
private String detectLanguageCode(String text) {
    TextObject textObject = textObjectFactory.forText(text);
    // For debugging, print languageDetector.getProbabilities(textObject) here.
    Optional<LdLocale> detected = languageDetector.detect(textObject);
    return detected.isPresent() ? detected.get().getLanguage() : null;
}
示例14: detect
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Runs language detection on the given text.
 *
 * @param text text to analyse
 * @return the detector's result for the text, passed through unchanged
 */
public Optional<LdLocale> detect(String text) {
    return languageDetector.detect(textObjectFactory.forText(text));
}
示例15: checkIfNonEnglish
import com.optimaize.langdetect.text.TextObject; //导入依赖的package包/类
/**
 * Determines the language of a post's text.
 * <p>
 * The text is obtained via {@code stripTags(stripBody(post))} with all punctuation runs
 * removed. The optimaize detector is tried first; if it yields no result and the text is
 * longer than 50 characters, Tika's detector is used as a fallback.
 *
 * @param post post to analyse
 * @return the detected language code; {@code "Gibberish"} when nothing was detected, the text
 *         is exactly 50 characters long and {@code checkIfNoCodeBlock(post)} is true;
 *         otherwise {@code null} (including when an {@link IOException} occurs)
 */
public static String checkIfNonEnglish(Post post){
    String dataToCheck = stripTags(stripBody(post)).replaceAll("\\p{Punct}+", "");
    try {
        // NOTE(review): the built-in profiles are reloaded and the detector rebuilt on every
        // call - consider caching them if this method is called frequently.
        List<LanguageProfile> languageProfiles = new LanguageProfileReader().readAllBuiltIn();
        com.optimaize.langdetect.LanguageDetector optimaizeDetector =
                LanguageDetectorBuilder.create(NgramExtractors.standard())
                        .withProfiles(languageProfiles)
                        .build();
        TextObjectFactory textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText();
        TextObject textObject = textObjectFactory.forText(dataToCheck);
        Optional<LdLocale> lang = optimaizeDetector.detect(textObject);
        if (lang.isPresent()) {
            return lang.get().getLanguage();
        }

        if (dataToCheck.length() > 50) {
            org.apache.tika.language.detect.LanguageDetector tikaDetector =
                    new OptimaizeLangDetector().loadModels();
            String tikaLang;
            // try-with-resources closes the writer even when append/getLanguage throws.
            try (LanguageWriter writer = new LanguageWriter(tikaDetector)) {
                writer.append(dataToCheck);
                LanguageResult result = writer.getLanguage();
                tikaLang = result.getLanguage();
            }
            return tikaLang.isEmpty() ? null : tikaLang;
        }
        if (dataToCheck.length() < 50) {
            return null;
        }

        // NOTE(review): only texts of exactly 50 characters reach this point, because both
        // other length ranges return above. This looks unintended - confirm the threshold logic.
        if (checkIfNoCodeBlock(post)) {
            return "Gibberish";
        }
        return null;
    }
    catch (IOException e){
        // Best effort: report the failure and fall through to "unknown".
        e.printStackTrace();
    }
    return null;
}