本文整理汇总了Java中com.ibm.icu.text.CharsetMatch.getConfidence方法的典型用法代码示例。如果您正苦于以下问题:Java CharsetMatch.getConfidence方法的具体用法?Java CharsetMatch.getConfidence怎么用?Java CharsetMatch.getConfidence使用的例子?那么,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类com.ibm.icu.text.CharsetMatch的用法示例。
在下文中一共展示了CharsetMatch.getConfidence方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: detectEncoding
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Detects the character encoding of an input stream with icu4j's
 * {@code CharsetDetector}. Only textual streams can be detected meaningfully.
 * (icu4j is used here in place of juniversalchardet.)
 *
 * <p>If {@code in} does not support mark/reset it is wrapped in a
 * {@code BufferedInputStream} for detection; in that case the bytes read
 * by the detector are consumed from the caller's stream.
 *
 * @param in the stream to inspect
 * @return the detected charset, or {@code null} when no charset could be detected
 * @throws IOException if reading from the stream fails
 * @throws java.nio.charset.UnsupportedCharsetException if the detected charset
 *         name is not supported by this JVM (raised by {@code Charset.forName})
 */
public static Charset detectEncoding(InputStream in) throws IOException {
    // CharsetDetector.setText(InputStream) marks the stream and resets it
    // afterwards, so a stream without mark support would fail on reset().
    final InputStream source = in.markSupported() ? in : new java.io.BufferedInputStream(in);

    final CharsetDetector detector = new CharsetDetector();
    detector.setText(source);
    final CharsetMatch charsetMatch = detector.detect();
    if (charsetMatch == null) {
        log.info("Cannot detect source charset.");
        return null;
    }

    // Confidence is an integer from 0 to 100; the higher the value, the
    // more trustworthy the detection result.
    final int confidence = charsetMatch.getConfidence();
    final String name = charsetMatch.getName();
    log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence);

    // To inspect every candidate encoding instead of only the best one,
    // call detector.detectAll().
    return Charset.forName(name);
}
示例2: fileAnyEncodingToString
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Reads a text file, detecting its encoding as described at
 * http://userguide.icu-project.org/conversion/detection, and returns the
 * file contents as a String with two line separators appended.
 *
 * @param f the file to read
 * @return the decoded file contents followed by two platform line separators
 * @throws IOException if the file cannot be opened or read
 */
public static String fileAnyEncodingToString(File f) throws IOException {
    final byte[] byteData;
    // Close the file handle even when reading fails.
    try (FileInputStream fileStream = new FileInputStream(f)) {
        byteData = IOUtils.toByteArray(fileStream);
    }

    CharsetDetector detector = new CharsetDetector();
    String unicodeData = detector.getString(byteData, null);
    if (unicodeData == null) {
        // getString yields null when no charset matches; without this guard
        // the concatenation below would produce the literal text "null".
        unicodeData = new String(byteData, java.nio.charset.StandardCharsets.UTF_8);
    }

    // Add two newlines at the end of the file, otherwise the subtitle parser
    // library can get confused by EOF.
    unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");

    CharsetMatch match = detector.detect();
    if (match != null && match.getConfidence() > 60) {
        LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
        if (match.getLanguage() != null) {
            LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
        }
    }
    return unicodeData;
}
示例3: toReader
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Creates a Reader for the given stream, decoding with the charset detected
 * by {@code CharsetDetector} when the detection confidence is above 50, and
 * with the platform default charset otherwise.
 *
 * @param input the stream to read; wrapped in a {@code BufferedInputStream}
 *              if it does not support mark/reset
 * @return a Reader over the stream contents
 * @throws IOException if reading from the stream fails during detection
 */
public static Reader toReader(InputStream input) throws IOException {
    if (!input.markSupported()) {
        input = new BufferedInputStream(input);
    }

    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(input);
    CharsetMatch m = charsetDetector.detect();

    // detect() returns null when nothing matches; treat that the same as a
    // low-confidence match instead of failing with a NullPointerException.
    if (m != null && m.getConfidence() > 50) {
        return m.getReader();
    }
    return new InputStreamReader(input);
}
示例4: suggestEncoding
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Suggests a charset name for the given bytes using {@code CharsetDetector},
 * logging the best match and its confidence.
 *
 * @param bytes the raw data to inspect
 * @return the name of the best matching charset, or {@code null} when the
 *         detector finds no match
 */
protected String suggestEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);

    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // No candidate at all: report it rather than dereferencing null.
        logger.info("Cannot detect source charset.");
        return null;
    }

    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    return charSet;
}
示例5: autoDetectEncoding
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Detects the charset of the given bytes using {@code CharsetDetector}, logs
 * the best match with its confidence, and passes the detected name to
 * {@code setSelectedItem}.
 *
 * @param bytes the raw data to inspect
 * @return the name of the best matching charset, or {@code null} when the
 *         detector finds no match (the selection is left untouched then)
 */
public String autoDetectEncoding(final byte[] bytes) {
    final CharsetDetector cd = new CharsetDetector();
    cd.setText(bytes);

    final CharsetMatch charsetMatch = cd.detect();
    if (charsetMatch == null) {
        // No candidate at all: keep the current selection and report it
        // rather than dereferencing null.
        logger.info("Cannot detect source charset.");
        return null;
    }

    final String charSet = charsetMatch.getName();
    final int confidence = charsetMatch.getConfidence();
    logger.info("CharsetMatch: {} ({}% confidence)", charSet, confidence);
    setSelectedItem(charSet);
    return charSet;
}
示例6: conversionImplementation
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Converts a plain-text payload: determines an encoding for the bytes and
 * decides whether the buffer needs to be converted.
 *
 * <ul>
 * <li>ASCII-only data is accepted as is ({@code do_convert = false}).</li>
 * <li>Data detected as {@code ConvertedDocument.OUTPUT_ENCODING} (UTF-8
 *     according to the original note) is also accepted as is.</li>
 * <li>CAVEAT: data shorter than {@code IGNORE_THRESHOLD_SIZE} whose detection
 *     confidence is below {@code IGNORE_THRESHOLD_CONF} is not converted
 *     either; it is treated as a plain text file.</li>
 * <li>Otherwise {@code do_convert} is left untouched.</li>
 * </ul>
 *
 * @param in  source stream; closed by this method when non-null. When it is
 *            null no data is read and the returned document carries no text.
 * @param doc source file; when both {@code in} and {@code doc} are non-null
 *            the bytes are read from {@code doc} rather than from {@code in}
 * @return the document populated with encoding and decoded text
 * @throws IOException if reading fails, no charset can be detected, or the
 *         detected charset is not supported for decoding
 */
@Override
protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException {
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    byte[] data = null;

    if (in != null) {
        try {
            // Get byte data from input stream or file
            if (doc != null) {
                data = FileUtility.readBytesFrom(doc);
            } else {
                data = IOUtils.toByteArray(in);
            }
        } finally {
            // Release the stream even when reading throws.
            in.close();
        }
    }
    if (data == null) {
        return textdoc;
    }

    // Encoding heuristics here.....
    //
    // Objective: mark small plain text payloads with unknown character set
    // as not worthy of conversion. Leave them as plain/text;
    // indeed they might even be straight Unicode.
    //
    // Test for ASCII only first, otherwise try to detect the best charset for the text.
    //
    textdoc.is_plaintext = true;
    if (TextUtils.isASCII(data)) {
        textdoc.do_convert = false;
        textdoc.setEncoding("ASCII");
        // Decode explicitly as ASCII instead of relying on the platform default charset.
        textdoc.setText(new String(data, java.nio.charset.StandardCharsets.US_ASCII));
        return textdoc;
    }

    chardet.setText(data);
    CharsetMatch cs = chardet.detect();
    if (cs == null) {
        // Without a match there is no charset to decode with; fail with the
        // declared exception type instead of a NullPointerException.
        throw new IOException("Unable to detect a character set for the supplied text data");
    }

    if (ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(cs.getName())) {
        textdoc.do_convert = false;
    } else if (data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF) {
        textdoc.do_convert = false;
    }
    textdoc.setEncoding(cs.getName());
    textdoc.setText(new String(data, cs.getName()));
    return textdoc;
}
示例7: isTextFile
import com.ibm.icu.text.CharsetMatch; //导入方法依赖的package包/类
/**
 * Decides whether the content behind a stream looks like text rather than
 * binary data.
 *
 * <p>
 * This method does not close the supplied stream, and the stream position
 * is the same on return as it was on entry.
 * </p>
 *
 * @param is
 *            input stream of the file; may be <code>null</code>
 * @return <code>true</code> if the content is more likely text than binary,
 *         <code>false</code> otherwise (including for a <code>null</code>
 *         stream)
 * @throws IOException
 *             if reading from the stream fails
 */
public static boolean isTextFile(InputStream is) throws IOException {
    if (is != null) {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(is);
        CharsetMatch best = charsetDetector.detect();
        if (best != null) {
            // Content counts as text once some charset is detected with a
            // confidence of 10 or more (out of 100). Empty files have
            // confidence 10 for UTF-8.
            return best.getConfidence() >= 10;
        }
    }
    return false;
}