当前位置: 首页>>代码示例>>Java>>正文


Java LanguageIdentifier类代码示例

本文整理汇总了Java中org.apache.tika.language.LanguageIdentifier的典型用法代码示例。如果您正苦于以下问题:Java LanguageIdentifier类的具体用法?Java LanguageIdentifier怎么用?Java LanguageIdentifier使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。


LanguageIdentifier类属于org.apache.tika.language包,在下文中一共展示了LanguageIdentifier类的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: detectLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Detects the language of a document from the text returned by {@code concatFields(doc)}.
 *
 * @param doc the Solr input document to analyse
 * @return a list holding the single detected language and its certainty, or an empty list
 *         when the concatenated text is empty
 */
@Override
protected List<DetectedLanguage> detectLanguage(SolrInputDocument doc) {
  List<DetectedLanguage> detected = new ArrayList<>();
  String text = concatFields(doc);
  if (text.length() == 0) {
    log.debug("No input text to detect language from, returning empty list");
    return detected;
  }

  LanguageIdentifier identifier = new LanguageIdentifier(text);
  // FIXME: Hack - the distance is parsed back out of toString() to derive our own certainty score
  String description = identifier.toString();
  double distance = Double.parseDouble(tikaSimilarityPattern.matcher(description).replaceFirst("$1"));
  // certainty = 1 - 5 * distance, floored at 0 (e.g. 0.04 => 0.8, 0.1 => 0.5);
  // used instead of isReasonablyCertain()
  double score = 1 - (5 * distance);
  if (score < 0) {
    score = 0;
  }
  Double certainty = score;

  DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty);
  detected.add(language);
  log.debug("Language detected as "+language+" with a certainty of "+language.getCertainty()+" (Tika distance="+description+")");
  return detected;
}
 
开发者ID:europeana,项目名称:search,代码行数:20,代码来源:TikaLanguageIdentifierUpdateProcessor.java

示例2: getCurrentLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Returns the language to work with.
 *
 * <p>When the auto-detect box is not selected, the language of the currently selected
 * entry of {@code languageBox} is returned. Otherwise the language of the text area content
 * is detected, falling back to {@code Language.ENGLISH} when no {@code Language} is found
 * for the detected short name, and every entry of {@code languageBox} whose string form
 * equals the translated language name is selected.
 *
 * @return the chosen or detected language
 */
private Language getCurrentLanguage() {
	if (!autoDetectBox.isSelected()) {
		final I18nLanguage chosen = (I18nLanguage) languageBox.getSelectedItem();
		return chosen.getLanguage();
	}

	final LanguageIdentifier detector = new LanguageIdentifier(textArea.getText());
	final Language byShortName = Language.getLanguageForShortName(detector.getLanguage());
	final Language detected = (byShortName != null) ? byShortName : Language.ENGLISH;

	// No early exit: every matching entry is selected in turn, so the last match wins.
	int index = 0;
	while (index < languageBox.getItemCount()) {
		final I18nLanguage entry = (I18nLanguage) languageBox.getItemAt(index);
		final String entryName = entry.toString();
		if (entryName.equals(detected.getTranslatedName(messages))) {
			languageBox.setSelectedIndex(index);
		}
		index++;
	}
	return detected;
}
 
开发者ID:markkohdev,项目名称:oStorybook,代码行数:23,代码来源:LangToolMain.java

示例3: detectLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Detects the language of the given text.
 *
 * @param content the text to analyse
 * @return a list holding the single detected language and its certainty, or an empty list
 *         when the text is empty after trimming
 */
@Override
protected List<DetectedLanguage> detectLanguage(String content) {
  List<DetectedLanguage> result = new ArrayList<DetectedLanguage>();
  if (content.trim().isEmpty()) {
    log.debug("No input text to detect language from, returning empty list");
    return result;
  }

  LanguageIdentifier identifier = new LanguageIdentifier(content);
  // FIXME: Hack - the distance is parsed back out of toString() to derive our own certainty score
  String summary = identifier.toString();
  double distance = Double.parseDouble(tikaSimilarityPattern.matcher(summary).replaceFirst("$1"));
  // certainty = 1 - 5 * distance, floored at 0 (e.g. 0.04 => 0.8, 0.1 => 0.5);
  // used instead of isReasonablyCertain()
  double raw = 1 - (5 * distance);
  Double certainty = (raw < 0) ? 0 : raw;

  DetectedLanguage language = new DetectedLanguage(identifier.getLanguage(), certainty);
  result.add(language);
  log.debug("Language detected as "+language+" with a certainty of "+language.getCertainty()+" (Tika distance="+summary+")");
  return result;
}
 
开发者ID:pkarmstr,项目名称:NYBC,代码行数:19,代码来源:TikaLanguageIdentifierUpdateProcessor.java

示例4: identifyLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Uses statistical language identification on a page's title and text.
 *
 * @param parse the parsed page; may be {@code null}
 * @return the detected language, or {@code null} when {@code parse} is {@code null} or when
 *         {@code onlyCertain} is set and the identifier is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
  if (parse == null) {
    return null;
  }

  // Title first, then a space and the body text; either part may be absent.
  StringBuilder buffer = new StringBuilder();
  String title = parse.getData().getTitle();
  if (title != null) {
    buffer.append(title);
  }
  String body = parse.getText();
  if (body != null) {
    buffer.append(" ").append(body);
  }

  // A contentMaxlength of -1 disables truncation.
  String sample = buffer.toString();
  boolean tooLong = this.contentMaxlength != -1 && sample.length() > this.contentMaxlength;
  if (tooLong) {
    sample = sample.substring(0, contentMaxlength);
  }

  LanguageIdentifier identifier = new LanguageIdentifier(sample);
  if (onlyCertain && !identifier.isReasonablyCertain()) {
    return null;
  }
  return identifier.getLanguage();
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:34,代码来源:HTMLLanguageParser.java

示例5: getLanguageFromText

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Tries to identify the language a text is written in (through Tika). When the detected
 * language is not contained in {@code MyAnalyzer.languagesSet}, Italian ("it") is returned.
 *
 * @param text the text whose language should be identified
 * @return the language of the document, passed through {@code intern}
 */
public String getLanguageFromText(String text) {
    String detected = new LanguageIdentifier(text).getLanguage();
    String lang = MyAnalyzer.languagesSet.contains(detected) ? detected : "it";
    return (String) intern.intern(lang);
}
 
开发者ID:fiohol,项目名称:theSemProject,代码行数:16,代码来源:DocumentParser.java

示例6: identifyLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Identifies the page language statistically from the parse's title and text.
 *
 * @param parse the parsed page; may be {@code null}
 * @return the detected language, or {@code null} when {@code parse} is {@code null} or when
 *         {@code onlyCertain} is set and the identifier is not reasonably certain
 */
private String identifyLanguage(Parse parse) {
    if (parse == null) {
        return null;
    }

    StringBuilder combined = new StringBuilder();

    String pageTitle = parse.getData().getTitle();
    if (pageTitle != null) {
        combined.append(pageTitle);
    }

    String pageText = parse.getText();
    if (pageText != null) {
        combined.append(" ");
        combined.append(pageText);
    }

    // Truncate to contentMaxlength characters unless the limit is -1 (unlimited).
    String titleAndContent = combined.toString();
    if (this.contentMaxlength != -1 && titleAndContent.length() > this.contentMaxlength) {
        titleAndContent = titleAndContent.substring(0, contentMaxlength);
    }

    LanguageIdentifier identifier = new LanguageIdentifier(titleAndContent);
    boolean accept = !onlyCertain || identifier.isReasonablyCertain();
    return accept ? identifier.getLanguage() : null;
}
 
开发者ID:yahoo,项目名称:anthelion,代码行数:34,代码来源:HTMLLanguageParser.java

示例7: main

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Detects the language of a fixed sample sentence and prints it to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sample = "This is an example string of text that will be detected as English.";
    final String detected = new LanguageIdentifier(sample).getLanguage();
    System.out.println("The String is in " + detected + ".");
}
 
开发者ID:tbpalsulich,项目名称:TikaExamples,代码行数:9,代码来源:DetectingExample.java

示例8: detectTest

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/** Builds a language profile from a short text and prints the language identified for it. */
@Test
public void detectTest() {
    LanguageIdentifier identifier = new LanguageIdentifier(new LanguageProfile("this is text"));
    String detected = identifier.getLanguage();
    System.out.println("Language:  " + detected);
}
 
开发者ID:CoEIA,项目名称:DEM,代码行数:8,代码来源:NewEmptyJUnitTest.java

示例9: detectLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Detects the language of a text and reports the result as a JSON object.
 *
 * <p>The JSON object has two members: {@code "language"} (the detected language) and
 * {@code "reasonablyCertain"} (the identifier's {@code isReasonablyCertain()} flag).
 *
 * @param text the text to analyse
 * @return the JSON serialisation of the detection result
 */
public static String detectLanguage(String text) {
	LanguageIdentifier identifier = new LanguageIdentifier(text);
	// Parameterised map instead of the raw type; values are a String and a Boolean.
	Map<String, Object> language = new HashMap<String, Object>();

	language.put("language", identifier.getLanguage());
	language.put("reasonablyCertain", identifier.isReasonablyCertain());

	return new Gson().toJson(language);
}
 
开发者ID:ICIJ,项目名称:node-tika,代码行数:10,代码来源:NodeTika.java

示例10: identifyLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * <p>Identifies the language of a string. The result can be wrong; "en" is returned when
 * the input is {@code null}, when no language is reported, or when the identifier is not
 * reasonably certain.</p>
 * <p>NOTE: we may want to simply truncate the string to some
 * short max length.</p>
 * @param string the text to analyse; may be {@code null}
 * @return the detected language, or "en" as the default
 */
public static String identifyLanguage(String string) {
	if (string == null) {
		return "en";
	}
	LanguageIdentifier identifier = new LanguageIdentifier(trimString(string));
	String detected = identifier.getLanguage();
	boolean certain = identifier.isReasonablyCertain();
	// Use the detection only when it is non-empty and reasonably certain.
	boolean usable = !detected.isEmpty() && certain;
	return usable ? detected : "en";
}
 
开发者ID:agibsonccc,项目名称:solrsherlock-maven,代码行数:18,代码来源:LanguageUtil.java

示例11: execute

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Parses the bytes stored in {@code field} with Tika and writes the requested properties
 * (content, language, date, title, author, keywords, content type, content length) as a
 * map into {@code targetField}.
 *
 * @param ingestDocument the document to read the source field from and write the result to
 * @throws IllegalArgumentException if the field value is {@code null} and
 *         {@code ignoreMissing} is not set
 * @throws ElasticsearchParseException if parsing or property extraction fails
 */
@Override
public void execute(IngestDocument ingestDocument) {
    byte[] rawBytes = ingestDocument.getFieldValueAsBytes(field, ignoreMissing);
    if (rawBytes == null) {
        if (ignoreMissing) {
            return;
        }
        throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
    }

    Map<String, Object> extracted = new HashMap<>();
    try {
        Metadata meta = new Metadata();
        String text = TikaImpl.parse(rawBytes, meta, indexedChars);
        boolean hasText = Strings.hasLength(text);

        if (properties.contains(Property.CONTENT) && hasText) {
            // somehow tika seems to append a newline at the end automatically, so strip it again
            extracted.put(Property.CONTENT.toLowerCase(), text.trim());
        }

        if (properties.contains(Property.LANGUAGE) && hasText) {
            String detected = new LanguageIdentifier(text).getLanguage();
            extracted.put(Property.LANGUAGE.toLowerCase(), detected);
        }

        if (properties.contains(Property.DATE)) {
            String created = meta.get(TikaCoreProperties.CREATED);
            if (created != null) {
                extracted.put(Property.DATE.toLowerCase(), created);
            }
        }

        if (properties.contains(Property.TITLE)) {
            String docTitle = meta.get(TikaCoreProperties.TITLE);
            if (Strings.hasLength(docTitle)) {
                extracted.put(Property.TITLE.toLowerCase(), docTitle);
            }
        }

        if (properties.contains(Property.AUTHOR)) {
            String docAuthor = meta.get("Author");
            if (Strings.hasLength(docAuthor)) {
                extracted.put(Property.AUTHOR.toLowerCase(), docAuthor);
            }
        }

        if (properties.contains(Property.KEYWORDS)) {
            String docKeywords = meta.get("Keywords");
            if (Strings.hasLength(docKeywords)) {
                extracted.put(Property.KEYWORDS.toLowerCase(), docKeywords);
            }
        }

        if (properties.contains(Property.CONTENT_TYPE)) {
            String mimeType = meta.get(Metadata.CONTENT_TYPE);
            if (Strings.hasLength(mimeType)) {
                extracted.put(Property.CONTENT_TYPE.toLowerCase(), mimeType);
            }
        }

        if (properties.contains(Property.CONTENT_LENGTH)) {
            // Prefer the length reported in the metadata; otherwise use the parsed text length.
            String reported = meta.get(Metadata.CONTENT_LENGTH);
            long length = Strings.hasLength(reported) ? Long.parseLong(reported) : text.length();
            extracted.put(Property.CONTENT_LENGTH.toLowerCase(), length);
        }
    } catch (Exception e) {
        throw new ElasticsearchParseException("Error parsing document in field [{}]", e, field);
    }

    ingestDocument.setFieldValue(targetField, extracted);
}
 
开发者ID:justor,项目名称:elasticsearch_my,代码行数:79,代码来源:AttachmentProcessor.java

示例12: identifyLanguage

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Identifies the language of a text.
 *
 * @param text the text to check
 * @return the language reported by the identifier
 */
protected String identifyLanguage(String text) {
	return new LanguageIdentifier(text).getLanguage();
}
 
开发者ID:mkalus,项目名称:tika-page-extractor,代码行数:10,代码来源:TikaPageExtractor.java

示例13: extractTika

import org.apache.tika.language.LanguageIdentifier; //导入依赖的package包/类
/**
 * Enriches a JSON document with Tika output for the HTML stored under {@code htmlField}.
 *
 * <p>When the document has a {@code "_source"} object containing {@code htmlField}, the HTML
 * is parsed and three members are accumulated on {@code "_source"}: {@code "tikametadata"}
 * (every metadata name mapped to an array of its values), {@code "raw_text"} (the extracted
 * text with blank lines removed) and {@code "rawtextdetectedlanguage"}. Parsing failures are
 * logged and leave the document unenriched.
 *
 * @param contents the JSON text of the document
 * @return the parsed JSON object, enriched when possible
 */
private JSONObject extractTika(String contents){
	JSONObject jObj = (JSONObject)JSONSerializer.toJSON(contents);

	if(!jObj.containsKey("_source"))
	{
		return jObj;
	}

	JSONObject jObjSource = jObj.getJSONObject("_source");
	if(!jObjSource.containsKey(htmlField))
	{
		return jObj;
	}

	String rawHtml = jObjSource.getString(htmlField);

	// Encode explicitly as UTF-8: the no-argument getBytes() uses the platform default
	// charset, which varies between machines and can lose characters.
	ByteArrayInputStream bIs = new ByteArrayInputStream(
			rawHtml.getBytes(java.nio.charset.StandardCharsets.UTF_8));
	Metadata metadata = new Metadata();
	AutoDetectParser adp = new AutoDetectParser();
	ContentHandler handler = new BodyContentHandler(10*1024*1024);

	try {
		adp.parse(bIs, handler, metadata);

		// Copy every metadata entry as name -> array of values.
		JSONObject jObjMetadata = new JSONObject();
		for(String metadataName : metadata.names())
		{
			JSONArray jArray = new JSONArray();
			for(String mValue : metadata.getValues(metadataName))
			{
				jArray.add(mValue);
			}
			jObjMetadata.accumulate(metadataName, jArray);
		}

		//remove empty lines from the text
		String rawTextAdjusted = handler.toString().replaceAll("(?m)^[ \t]*\r?\n", "");

		//detect language
		LanguageIdentifier li = new LanguageIdentifier(rawTextAdjusted);

		jObjSource.accumulate("tikametadata", jObjMetadata);
		jObjSource.accumulate("raw_text", rawTextAdjusted);
		jObjSource.accumulate("rawtextdetectedlanguage", li.getLanguage());

	} catch (Exception e) {
		// Best effort: a document that cannot be parsed is returned without the extra members.
		LOG.error("Error:",e);
	}

	return jObj;
}
 
开发者ID:usc-isi-i2,项目名称:dig-elasticsearch,代码行数:64,代码来源:ScanAndScroll.java


注:本文中的org.apache.tika.language.LanguageIdentifier类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。