本文整理汇总了Java中com.ibm.icu.text.CharsetMatch类的典型用法代码示例。如果您正苦于以下问题:Java CharsetMatch类的具体用法?Java CharsetMatch怎么用?Java CharsetMatch使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
CharsetMatch类属于com.ibm.icu.text包,在下文中一共展示了CharsetMatch类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: checkCharset
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Feeds the given stream to an ICU {@code CharsetDetector} and returns its best match.
 *
 * <p>If handing the stream to the detector fails with an {@link IOException}, the stream is
 * closed, the failure is printed to standard error (preceded by any failure of the close
 * itself), and detection is still attempted afterwards.
 *
 * @param input the stream whose encoding should be guessed
 * @return whatever {@code CharsetDetector.detect()} reports for the stream
 */
public static CharsetMatch checkCharset(InputStream input) {
    final CharsetDetector detector = new CharsetDetector();
    try {
        detector.setText(input);
    } catch (IOException readFailure) {
        // Release the stream first, then report why reading it failed.
        try {
            input.close();
        } catch (IOException closeFailure) {
            closeFailure.printStackTrace();
        }
        readFailure.printStackTrace();
    }
    return detector.detect();
}
示例2: detectEncoding
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Detects the character encoding of an input stream with icu4j (juniversalchardet is no
 * longer used). Only textual streams can be probed meaningfully.
 *
 * @param in the stream to probe; it is passed as-is to {@code CharsetDetector.setText}
 * @return the detected charset, or {@code null} when no charset could be detected or the
 *         detected name is not a charset this JVM can provide
 * @throws IOException if the detector fails to read from {@code in}
 */
public static Charset detectEncoding(InputStream in) throws IOException {
    final CharsetDetector detector = new CharsetDetector();
    detector.setText(in);
    final CharsetMatch charsetMatch = detector.detect();
    if (charsetMatch == null) {
        log.info("Cannot detect source charset.");
        return null;
    }
    // Confidence is an integer from 0 to 100; the higher the value, the more reliable the match.
    final int confidence = charsetMatch.getConfidence();
    final String name = charsetMatch.getName();
    log.info("CharsetMatch: {} ({}% 相似度,相似度小于 50% 时,可能编码无法判断。)", name, confidence);
    try {
        return Charset.forName(name);
    } catch (IllegalArgumentException e) {
        // Covers IllegalCharsetNameException and UnsupportedCharsetException: the detector can
        // report names that java.nio.charset does not know. Treat that like "not detected"
        // instead of surfacing an unchecked exception to the caller.
        log.info("Detected charset {} is not available in this JVM.", name);
        return null;
    }
}
示例3: getText
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Extracts the text to be indexed from a document stream.
 *
 * <p>For {@code text/*} content without a declared encoding, the encoding is guessed with
 * ICU's {@code CharsetDetector} before the extractor registered for the MIME type runs.
 *
 * @param mimeType  MIME type used to look up the text extractor
 * @param encoding  declared encoding, or {@code null} to have it detected for {@code text/*}
 * @param isContent document content; it is closed quietly before this method returns or throws
 * @return the value produced by the extractor
 * @throws IOException if no extractor is registered for {@code mimeType} or reading fails
 */
public static String getText(String mimeType, String encoding, InputStream isContent) throws IOException {
    BufferedInputStream bis = new BufferedInputStream(isContent);
    try {
        TextExtractor te = engine.get(mimeType);
        if (te == null) {
            throw new IOException("Full text indexing of '" + mimeType + "' is not supported");
        }
        if (mimeType.startsWith("text/") && encoding == null) {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(bis);
            CharsetMatch cm = detector.detect();
            // detect() yields null when nothing matches; keep the encoding undeclared in that case.
            if (cm != null) {
                encoding = cm.getName();
            }
        }
        return te.extractText(bis, mimeType, encoding);
    } finally {
        // Runs on every exit path so the stream is not leaked when extraction fails.
        IOUtils.closeQuietly(bis);
    }
}
示例4: showEncode
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Builds a display string describing the detected encoding of a document's text.
 *
 * <p>The document text is converted to a stream, probed with ICU's {@code CharsetDetector},
 * optionally re-checked through {@code verifyPossibleUtf8} when {@code isPoorMatch} flags the
 * confidence, and finally suffixed with the result of {@code showByteOfMark}. Those helpers
 * are defined elsewhere in the class.
 *
 * @param doc the document whose text is inspected
 * @return the assembled description; an empty or partially built string if reading the
 *         document or the stream fails (the failure is reported via {@code Exceptions})
 */
protected String showEncode(Document doc) {
    String charsetName = "";
    try {
        String convertedPlainText = doc.getText(0, doc.getLength());
        try (InputStream is = convertStringToStream(convertedPlainText)) {
            CharsetMatch charsetMatch = new CharsetDetector().setText(is).detect();
            // detect() returns null when no charset matches; handle that like a match without a
            // name and with zero confidence instead of failing with a NullPointerException.
            int confidence = 0;
            charsetName = null;
            if (charsetMatch != null) {
                charsetName = charsetMatch.getName();
                confidence = charsetMatch.getConfidence();
            }
            if (charsetName == null) {
                charsetName = "NULL";
            }
            if (isPoorMatch(confidence)) {
                charsetName = verifyPossibleUtf8(charsetName, is);
            }
            charsetName += showByteOfMark(is);
        }
    } catch (BadLocationException | IOException ex) {
        Exceptions.printStackTrace(ex);
    }
    return charsetName;
}
示例5: fileAnyEncodingToString
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Reads a text file, detecting its encoding as described at
 * http://userguide.icu-project.org/conversion/detection, and returns the contents as a String.
 *
 * <p>Two line separators are appended to the decoded text. When the detector cannot produce a
 * decoded string, the bytes are decoded as UTF-8 instead.
 *
 * @param f the file to read
 * @return the decoded file contents followed by two line separators
 * @throws IOException if the file cannot be opened or read
 */
public static String fileAnyEncodingToString(File f) throws IOException {
    byte[] byteData;
    // Release the file handle as soon as the bytes are in memory.
    try (FileInputStream in = new FileInputStream(f)) {
        byteData = IOUtils.toByteArray(in);
    }
    CharsetDetector detector = new CharsetDetector();
    String unicodeData = detector.getString(byteData, null);
    if (unicodeData == null) {
        // getString() gives null when detection or decoding fails; without this fallback the
        // concatenation below would yield the literal text "null".
        unicodeData = new String(byteData, java.nio.charset.StandardCharsets.UTF_8);
    }
    // Add two newlines at the end of the file, otherwise the subtitle parser library can get confused by EOF
    unicodeData += System.getProperty("line.separator") + System.getProperty("line.separator");
    CharsetMatch match = detector.detect();
    if (match != null && match.getConfidence() > 60) {
        LOGGER.debug("{} has a detected encoding: {}", f.getName(), match.getName());
        if (match.getLanguage() != null) {
            LOGGER.debug("{} has a detected language: {}", f.getName(), match.getLanguage());
        }
    }
    return unicodeData;
}
示例6: getFileCharset
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 *
 * @param file the file for which to detect charset/encoding.
 * @return The detected {@link Charset} or {@code null} if not detected,
 *         or if the detected name is illegal or unsupported by the JVM.
 * @throws IOException If an IO error occurs during the operation.
 */
@Nullable
public static Charset getFileCharset(@Nullable File file) throws IOException {
    if (file == null) {
        return null;
    }
    CharsetMatch match = getFileCharsetMatch(file);
    // No match means nothing was detected; skip straight to the "no matching charset" path
    // rather than dereferencing null.
    if (match != null) {
        try {
            if (Charset.isSupported(match.getName())) {
                LOGGER.debug("Detected charset \"{}\" in file \"{}\"", match.getName(), file.getAbsolutePath());
                return Charset.forName(match.getName());
            }
            LOGGER.debug(
                "Detected charset \"{}\" in file \"{}\", but cannot use it because it's not supported by the Java Virual Machine",
                match.getName(),
                file.getAbsolutePath()
            );
            return null;
        } catch (IllegalCharsetNameException e) {
            LOGGER.debug("Illegal charset \"{}\" deteceted in file \"{}\"", match.getName(), file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
    return null;
}
示例7: getFileCharsetName
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Detects charset/encoding for given file. Not 100% accurate for
 * non-Unicode files.
 *
 * @param file the file for which to detect charset/encoding.
 * @return The upper-cased name of the detected charset or {@code null} if
 *         not detected, or if the detected name is illegal or unsupported
 *         by the JVM.
 * @throws IOException If an IO error occurs during the operation.
 */
@Nullable
public static String getFileCharsetName(@Nullable File file) throws IOException {
    if (file == null) {
        return null;
    }
    CharsetMatch match = getFileCharsetMatch(file);
    // No match means nothing was detected; skip straight to the "no matching charset" path
    // rather than dereferencing null.
    if (match != null) {
        try {
            if (Charset.isSupported(match.getName())) {
                LOGGER.debug("Detected charset \"{}\" in file \"{}\"", match.getName(), file.getAbsolutePath());
                return match.getName().toUpperCase(Locale.ROOT);
            }
            LOGGER.debug(
                "Detected charset \"{}\" in file \"{}\", but cannot use it because it's not supported by the Java Virual Machine",
                match.getName(),
                file.getAbsolutePath()
            );
            return null;
        } catch (IllegalCharsetNameException e) {
            LOGGER.debug("Illegal charset \"{}\" deteceted in file \"{}\"", match.getName(), file.getAbsolutePath());
        }
    }
    LOGGER.debug("Found no matching charset for file \"{}\"", file.getAbsolutePath());
    return null;
}
示例8: guessEncoding
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Guesses the charset encoding of a byte array.
 *
 * <p>juniversalchardet is consulted first. When it reports nothing, or reports
 * {@code MACCYRILLIC} (which it wrongly returns for windows-1256 content), ICU4J is asked
 * instead, and {@code UTF-8} is used when ICU4J has no match either.
 *
 * @param bytes the byte array to detect the encoding from
 * @return the name of the guessed charset encoding
 */
public static String guessEncoding(byte[] bytes) {
    UniversalDetector universalDetector = new UniversalDetector(null);
    universalDetector.handleData(bytes, 0, bytes.length);
    universalDetector.dataEnd();
    String firstGuess = universalDetector.getDetectedCharset();
    universalDetector.reset();

    boolean trustworthy = firstGuess != null && !"MACCYRILLIC".equals(firstGuess);
    if (trustworthy) {
        return firstGuess;
    }

    // Second opinion from ICU4J for the missing or unreliable juniversalchardet answer.
    CharsetMatch icuMatch = new CharsetDetector().setText(bytes).detect();
    return icuMatch == null ? "UTF-8" : icuMatch.getName();
}
示例9: getEncoding
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Detects the encoding of a file with ICU's {@code CharsetDetector}, using the required
 * encoding as the detector's declared-encoding hint.
 *
 * @param requiredEncoding encoding passed to the detector as the declared encoding
 * @param file the file to inspect
 * @param log not used by this method
 * @return the name of the first charset reported by {@code detectAll()}, or {@code null}
 *         when the detector reports no candidates
 * @throws IOException if the file cannot be opened or read
 */
protected String getEncoding( String requiredEncoding, File file, Log log )
    throws IOException
{
    FileInputStream fis = null;
    try
    {
        fis = new FileInputStream( file );
        CharsetDetector detector = new CharsetDetector();
        detector.setDeclaredEncoding( requiredEncoding );
        detector.setText( new BufferedInputStream( fis ) );
        CharsetMatch[] charsets = detector.detectAll();
        // An empty result array must be treated like "nothing detected"; indexing it would
        // throw ArrayIndexOutOfBoundsException.
        if ( charsets == null || charsets.length == 0 )
        {
            return null;
        }
        return charsets[0].getName();
    }
    finally
    {
        IOUtil.close( fis );
    }
}
示例10: detectCharset
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Detects the charset of the given bytes with ICU, defaulting to UTF-8.
 *
 * @param bytes the raw data to inspect
 * @return the detected charset, or {@link StandardCharsets#UTF_8} when ICU has no match or
 *         the matched name cannot be turned into a {@link Charset}
 */
public static Charset detectCharset(byte[] bytes) {
    final CharsetMatch bestMatch = new com.ibm.icu.text.CharsetDetector().setText(bytes).detect();
    if (bestMatch != null) {
        try {
            return Charset.forName(bestMatch.getName());
        } catch (IllegalArgumentException unusableName) {
            // NOTE(review): names carrying an _ltr/_rtl suffix presumably end up here - confirm.
        }
    }
    return StandardCharsets.UTF_8;
}
示例11: guessCharset
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Guesses the charset of a file with ICU's {@code CharsetDetector}.
 *
 * <p>Files of {@code Integer.MAX_VALUE} bytes or more are delegated to
 * {@code guessCharsetChardet}, as are files for which ICU reports no match.
 *
 * @param file the file to inspect
 * @param charset forwarded unchanged to {@code guessCharsetChardet}
 * @return the charset named by ICU's best match, or the result of {@code guessCharsetChardet}
 * @throws IOException if the file cannot be read
 * @throws IllegalArgumentException if ICU reports a charset name the JVM rejects or does not
 *         support (propagated from {@link Charset#forName(String)})
 */
private Charset guessCharset(Path file, Charset charset) throws IOException {
    if (Files.size(file) >= Integer.MAX_VALUE) {
        return guessCharsetChardet(file, charset);
    }
    // readAllBytes keeps reading until end of file, whereas a single channel read may stop
    // early and leave the tail of the buffer zero-filled.
    byte[] data = Files.readAllBytes(file);
    CharsetDetector detector = new CharsetDetector();
    detector.setText(data);
    CharsetMatch match = detector.detect();
    if (match == null) {
        // ICU found nothing plausible; fall back to the other detector instead of failing.
        return guessCharsetChardet(file, charset);
    }
    return Charset.forName(match.getName());
}
示例12: sniff
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Sniffs the encoding of this stream with ICU's {@code CharsetDetector}.
 *
 * <p>The detected encoding is replaced by its actual HTML encoding when one is defined. The
 * result is only reported when it is not windows-1252 and is an ASCII superset.
 *
 * @return the sniffed encoding, or {@code null} when it is rejected by the checks above or
 *         any exception occurs during detection
 * @throws IOException declared by the signature; exceptions raised inside are turned into a
 *         {@code null} result
 */
public Encoding sniff() throws IOException {
    try {
        CharsetMatch best = new CharsetDetector().setText(this).detect();
        Encoding detected = Encoding.forName(best.getName());
        Encoding htmlEncoding = detected.getActualHtmlEncoding();
        Encoding effective = (htmlEncoding != null) ? htmlEncoding : detected;
        if (effective == Encoding.WINDOWS1252 || !effective.isAsciiSuperset()) {
            return null;
        }
        return effective;
    } catch (Exception e) {
        return null;
    }
}
示例13: parseContent
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Parses the limited stream as text and stores it in a new parser result item.
 *
 * <p>The text is decoded through ICU's best charset match; when there is no match, a fresh
 * stream is read as UTF-8 instead. Language detection is then run on the content field.
 *
 * @param streamLimiter source of the input streams
 * @param lang not used by this method
 * @throws IOException if reading or decoding the stream fails
 */
@Override
protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang)
        throws IOException {
    BufferedInputStream buffered = null;
    try {
        buffered = new BufferedInputStream(streamLimiter.getNewInputStream());
        CharsetMatch best = new CharsetDetector().setText(buffered).detect();
        String text = (best != null)
                ? best.getString()
                : IOUtils.toString(streamLimiter.getNewInputStream(), "UTF-8");
        ParserResultItem item = getNewParserResultItem();
        item.addField(ParserFieldEnum.content, text);
        item.langDetection(10000, ParserFieldEnum.content);
    } finally {
        IOUtils.close(buffered);
    }
}
示例14: getCharsetFromText
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Last-resort charset guess: lets a third-party library (ICU) infer the charset from the
 * raw bytes.
 *
 * @param content the bytes to analyse
 * @param declaredCharset optional hint handed to the detector; ignored when {@code null}
 * @param maxLengthCharsetDetection maximum number of leading bytes to analyse, or {@code -1}
 *        to analyse everything
 * @return the outcome of {@code validateCharset} for the detected name, or {@code null} when
 *         detection or validation throws
 */
private static String getCharsetFromText(byte[] content,
        String declaredCharset, int maxLengthCharsetDetection) {
    CharsetDetector detector = new CharsetDetector();
    // Have the detector filter out HTML tags before analysing the text.
    detector.enableInputFilter(true);
    // Pass the declared charset along as a hint when one is available.
    if (declaredCharset != null) {
        detector.setDeclaredEncoding(declaredCharset);
    }

    // Only the leading part of oversized content is used for detection.
    boolean truncate = maxLengthCharsetDetection != -1
            && content.length > maxLengthCharsetDetection;
    byte[] sample = truncate
            ? Arrays.copyOfRange(content, 0, maxLengthCharsetDetection)
            : content;
    detector.setText(sample);

    try {
        return validateCharset(detector.detect().getName());
    } catch (Exception e) {
        return null;
    }
}
示例15: toReader
import com.ibm.icu.text.CharsetMatch; //导入依赖的package包/类
/**
 * Wraps an input stream in a {@link Reader}, decoding with the charset ICU detects when the
 * detection is confident enough.
 *
 * <p>Streams without mark/reset support are wrapped in a {@link BufferedInputStream} first.
 * When there is no match, the confidence is 50 or lower, or the match cannot supply a reader,
 * the stream is decoded with the platform default charset instead.
 *
 * @param input the stream to decode
 * @return a reader over the stream's content; never {@code null}
 * @throws IOException if the detector fails to read from the stream
 */
public static Reader toReader(InputStream input) throws IOException {
    if (!input.markSupported()) {
        input = new BufferedInputStream(input);
    }
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(input);
    CharsetMatch m = charsetDetector.detect();
    Reader reader = null;
    // detect() returns null when nothing matches; treat that like a low-confidence match.
    if (m != null && m.getConfidence() > 50) {
        // NOTE(review): getReader() appears to return null when it cannot open the stream
        // with the detected charset name - confirm against the ICU4J version in use.
        reader = m.getReader();
    }
    if (reader == null) {
        // Single-argument InputStreamReader decodes with the platform default charset.
        reader = new InputStreamReader(input);
    }
    return reader;
}