本文整理匯總了Java中org.mozilla.universalchardet.UniversalDetector類的典型用法代碼示例。如果您正苦於以下問題:Java UniversalDetector類的具體用法?Java UniversalDetector怎麼用?Java UniversalDetector使用的例子?那麼,這裏精選的類代碼示例或許可以為您提供幫助。
UniversalDetector類屬於org.mozilla.universalchardet包,在下文中一共展示了UniversalDetector類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Java代碼示例。
示例1: getFileCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the character encoding of a file by feeding its bytes to a
 * {@link UniversalDetector}.
 *
 * @param file the file whose encoding should be detected
 * @return the charset name reported by the detector (may be {@code null}
 *         when the detector reports nothing)
 * @throws IOException if the file cannot be opened or read
 */
public static String getFileCharset(File file) throws IOException {
    final UniversalDetector universalDetector = new UniversalDetector(null);
    final byte[] buf = new byte[4096];
    // try-with-resources closes the stream even when read() throws;
    // previously the stream leaked on any I/O error.
    try (BufferedInputStream bufferedInputStream = new BufferedInputStream(
            new FileInputStream(file))) {
        int numberOfBytesRead;
        // Stop at end of file or as soon as the detector signals it is done.
        while ((numberOfBytesRead = bufferedInputStream.read(buf)) > 0
                && !universalDetector.isDone()) {
            universalDetector.handleData(buf, 0, numberOfBytesRead);
        }
    }
    universalDetector.dataEnd();
    String encoding = universalDetector.getDetectedCharset();
    universalDetector.reset();
    return encoding;
}
示例2: isBinary
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Heuristically decides whether a stream holds binary content.
 *
 * <p>The stream is rewound with {@code mark}/{@code reset} before returning,
 * so it must support marking.
 *
 * @param in the stream to inspect
 * @return {@code true} if a zero byte is found within the scanned window,
 *         {@code false} otherwise
 * @throws IOException if reading or resetting the stream fails
 */
public static boolean isBinary(InputStream in) throws IOException {
    // Peek at the first four bytes, then rewind.
    final byte[] head = new byte[4];
    in.mark(5);
    final int headLength = in.read(head);
    in.reset();

    // If the detector reaches a verdict from the prefix alone, treat the
    // stream as text (presumably a recognised byte-order mark - confirm).
    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(head, 0, headLength);
    if (detector.isDone()) {
        return false;
    }

    // Otherwise scan forward for a zero byte, which marks the data as binary.
    in.mark(LOOKAHEAD);
    final int limit = LOOKAHEAD - 192;
    int scanned = 0;
    for (int next = in.read(); next != -1 && scanned < limit; next = in.read()) {
        scanned++;
        if (next == 0) {
            in.reset();
            return true;
        }
    }
    in.reset();
    return false;
}
示例3: determineCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Determines the charset of the given bytes using the detector supplied by
 * {@code charsetDetector}.
 *
 * @param bytes the data to analyse
 * @return the detected charset, or {@code null} when the detector reports none
 */
@Override
public Charset determineCharset(byte[] bytes) {
    final UniversalDetector sniffer = charsetDetector.get();
    try {
        sniffer.handleData(bytes, 0, bytes.length);
        sniffer.dataEnd();
        final String name = sniffer.getDetectedCharset();
        return (name == null) ? null : Charset.forName(name);
    } finally {
        // Always reset: the detector instance comes from charsetDetector and
        // is handed out again on later calls.
        sniffer.reset();
    }
}
示例4: detectCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the charset of the data available from a stream.
 *
 * <p>The stream is not closed by this method.
 *
 * @param fis the stream to read from
 * @return the charset name reported by the detector (may be {@code null})
 * @throws IOException if reading from the stream fails
 */
public static String detectCharset(InputStream fis) throws IOException {
    final UniversalDetector detector = new UniversalDetector(null);
    final byte[] chunk = new byte[4096];

    // Feed chunks until the stream is exhausted or the detector is done.
    for (int count = fis.read(chunk);
            count > 0 && !detector.isDone();
            count = fis.read(chunk)) {
        detector.handleData(chunk, 0, count);
    }

    // Signal end of data, fetch the result, then reset the detector.
    detector.dataEnd();
    final String detected = detector.getDetectedCharset();
    detector.reset();
    return detected;
}
示例5: guessEncoding
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Guesses the charset encoding of a byte array.
 *
 * <p>juniversalchardet is consulted first. When it reports nothing, or reports
 * {@code MACCYRILLIC}, ICU4J's detector is used instead; if that also yields
 * no match the result is {@code "UTF-8"}.
 *
 * @param bytes the byte array to detect encoding from
 * @return the name of the guessed charset encoding, never {@code null}
 */
public static String guessEncoding(byte[] bytes) {
    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(bytes, 0, bytes.length);
    detector.dataEnd();
    final String primary = detector.getDetectedCharset();
    detector.reset();

    if (primary != null && !"MACCYRILLIC".equals(primary)) {
        return primary;
    }

    // juniversalchardet incorrectly detects windows-1256 as MACCYRILLIC,
    // so for MACCYRILLIC (or no result at all) fall back to ICU4J.
    final CharsetMatch fallback = new CharsetDetector().setText(bytes).detect();
    return (fallback != null) ? fallback.getName() : "UTF-8";
}
示例6: createFromEventBody
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Builds an {@code EnrichedEventBodyGeneric} from a raw event payload.
 *
 * <p>An enriched payload is deserialized through {@code JSONStringSerializer}.
 * Otherwise the payload's charset is detected (using {@code DEFAULT_CHARSET}
 * when detection yields nothing) and the decoded text is wrapped in a new body.
 *
 * @param payload the raw event bytes
 * @param isEnriched whether the payload is already a serialized enriched body
 * @param clazz the message class passed to the serializer / constructor
 * @return the resulting enriched event body
 * @throws IOException if deserialization or decoding fails
 */
public static <T> EnrichedEventBodyGeneric createFromEventBody(byte[] payload, boolean isEnriched, Class<T> clazz) throws IOException {
    if (isEnriched) {
        final JavaType bodyType = JSONStringSerializer.getJavaType(EnrichedEventBodyGeneric.class, clazz);
        return (EnrichedEventBodyGeneric) JSONStringSerializer.fromBytes(payload, bodyType);
    }

    // Plain payload: detect its charset before decoding it to text.
    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(payload, 0, payload.length);
    detector.dataEnd();
    final String detected = detector.getDetectedCharset();
    detector.reset();

    final String charsetName = (detected != null) ? detected : DEFAULT_CHARSET;
    return new EnrichedEventBodyGeneric(new String(payload, charsetName), clazz);
}
示例7: createFromEventBody
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Builds an {@code EnrichedEventBody} from a raw event payload.
 *
 * <p>An enriched payload is deserialized through {@code JSONStringSerializer}.
 * Otherwise the payload's charset is detected (using {@code DEFAULT_CHARSET}
 * when detection yields nothing) and the decoded text becomes the body.
 *
 * @param payload the raw event bytes
 * @param isEnriched whether the payload is already a serialized enriched body
 * @return the resulting enriched event body
 * @throws IOException if deserialization or decoding fails
 */
public static EnrichedEventBody createFromEventBody(byte[] payload, boolean isEnriched) throws IOException {
    if (isEnriched) {
        return JSONStringSerializer.fromBytes(payload, EnrichedEventBody.class);
    }

    // Plain payload: sniff the charset so the bytes are decoded correctly.
    final UniversalDetector sniffer = new UniversalDetector(null);
    sniffer.handleData(payload, 0, payload.length);
    sniffer.dataEnd();
    String charsetName = sniffer.getDetectedCharset();
    sniffer.reset();

    if (charsetName == null) {
        charsetName = DEFAULT_CHARSET;
    }
    final String text = new String(payload, charsetName);
    return new EnrichedEventBody(text);
}
示例8: testEventCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Verifies that an event body built from a non-UTF-8 fixture file is emitted
 * in a form the detector recognises as UTF-8.
 */
@Test
public void testEventCharset() throws IOException {
    // Fixture containing text that is not UTF-8 encoded (per its file name).
    final byte[] payload = Files.readAllBytes(Paths.get("src/test/resources/notUTFString.txt"));

    final byte[] output = EnrichedEventBody.createFromEventBody(payload, false).buildEventBody();

    final UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(output, 0, output.length);
    detector.dataEnd();
    final String outputCharset = detector.getDetectedCharset();
    detector.reset();

    // NOTE(review): argument order is (actual, expected, message), which
    // matches TestNG's Assert - confirm the test framework in use.
    Assert.assertEquals(outputCharset, StandardCharsets.UTF_8.name(), "Invalid charset");
}
示例9: detectEncoding
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the encoding of the data read from the supplied stream.
 *
 * <p>The stream is consumed until it ends or the detector is done; it is not
 * closed by this method.
 *
 * @param stream the stream to analyse
 * @return the charset name reported by the detector (may be {@code null})
 * @throws IOException if reading from the stream fails
 * @see <a href="https://code.google.com/p/juniversalchardet/">Original</a>
 * @see <a href="https://github.com/amake/juniversalchardet">Fork</a>
 */
public static String detectEncoding(InputStream stream) throws IOException {
    final byte[] chunk = new byte[4096];
    final UniversalDetector sniffer = new UniversalDetector(null);

    while (true) {
        final int count = stream.read(chunk);
        // Stop on end of stream, or once the detector has made up its mind.
        if (count <= 0 || sniffer.isDone()) {
            break;
        }
        sniffer.handleData(chunk, 0, count);
    }

    sniffer.dataEnd();
    final String result = sniffer.getDetectedCharset();
    sniffer.reset();
    return result;
}
示例10: detectCharacterSet
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the character set of the input text.
 *
 * @param input The input text as a byte array; may be {@code null}.
 * @return The character set of the input text, or null if the input is null
 *         or no character set can be detected.
 * @throws RuntimeException if the detected character set is not supported by
 *         this JVM; the original {@link UnsupportedCharsetException} is
 *         attached as the cause.
 */
public static Charset detectCharacterSet(byte[] input) {
    if (input == null) {
        return null;
    }
    Charset charset = null;
    // Work on a private copy of the caller's array
    // (presumably defensive against concurrent modification - confirm).
    input = input.clone();
    UniversalDetector detector = new UniversalDetector(null);
    detector.handleData(input, 0, input.length);
    detector.dataEnd();
    String detectedCharset = detector.getDetectedCharset();
    if (StringUtils.hasText(detectedCharset)) {
        try {
            charset = Charset.forName(detectedCharset);
        } catch (UnsupportedCharsetException e) {
            // Chain the cause so the original stack trace is not lost.
            throw new RuntimeException("Detected unsupported character set " + detectedCharset, e);
        }
    }
    return charset;
}
示例11: detect
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the encoding of a stream using the shared detector obtained from
 * {@code Charset.getSingleton().getCharsetDetector()}, then closes the stream.
 *
 * <p>The detector is reset and the stream is closed even when reading fails,
 * so a failed call does not leave stale state in the shared detector.
 * NOTE(review): the detector is shared; nothing here guards against
 * concurrent callers - confirm how callers use it.
 *
 * @param inputStream the stream to analyse; it is closed before returning
 * @return the detected charset name, or {@code UTF8} when none is detected
 * @throws IOException if reading from or closing the stream fails
 */
public static String detect(InputStream inputStream) throws IOException {
    String encoding;
    // try-with-resources closes the stream on both the success and error paths.
    try (InputStream in = inputStream) {
        UniversalDetector detector = Charset.getSingleton()
                .getCharsetDetector();
        try {
            byte[] buf = new byte[4096];
            int nread;
            while ((nread = in.read(buf)) > 0 && !detector.isDone()) {
                detector.handleData(buf, 0, nread);
            }
            detector.dataEnd();
            encoding = detector.getDetectedCharset();
        } finally {
            // The detector is shared, so it must be reset even if read() throws.
            detector.reset();
        }
    }
    if (encoding == null) {
        // If no encoding is detected, we assume UTF-8
        encoding = UTF8;
    }
    return encoding;
}
示例12: detectFileCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the text encoding of a file.
 *
 * <p>{@code detectLength} is used as the size of the read buffer; the file is
 * read in chunks of that size until it ends or the detector is done.
 *
 * @param file the file to analyse
 * @param detectLength size in bytes of the buffer used for each read
 * @return the charset name reported by the detector (may be {@code null})
 * @throws IOException if the file cannot be opened, read or closed
 */
public static String detectFileCharset(File file, int detectLength) throws IOException {
    FileInputStream input = null;
    try {
        final byte[] chunk = new byte[detectLength];
        input = new FileInputStream(file);
        final UniversalDetector sniffer = new UniversalDetector(null);

        for (int count = input.read(chunk);
                count > 0 && !sniffer.isDone();
                count = input.read(chunk)) {
            sniffer.handleData(chunk, 0, count);
        }

        sniffer.dataEnd();
        final String detected = sniffer.getDetectedCharset();
        sniffer.reset();
        return detected;
    } finally {
        // input is still null if the buffer allocation or the open failed.
        if (input != null) {
            input.close();
        }
    }
}
示例13: guessCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Guesses the charset of the named file and logs the outcome.
 *
 * @param fileName path of the file to analyse
 * @return the charset name reported by the detector (may be {@code null})
 * @throws IOException if the file cannot be opened or read
 */
public static String guessCharset(String fileName) throws IOException{
    byte[] buf = new byte[4096];
    UniversalDetector detector = new UniversalDetector(null);
    // try-with-resources: the original never closed this stream, leaking a
    // file handle on every call.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
    }
    detector.dataEnd();
    String encoding = detector.getDetectedCharset();
    if (encoding != null) {
        Log.d("ConvertUtil",fileName+" detected encoding = " + encoding);
    } else {
        Log.d("ConvertUtil","No encoding detected = " + encoding);
    }
    detector.reset();
    return encoding;
}
示例14: extractCharset
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the charset of the content read from a stream.
 *
 * <p>When no charset is detected, or the detected one is rejected by
 * {@code CrawlUtils.isValidCharset}, {@code DEFAULT_CHARSET} is returned.
 * The stream is not closed by this method.
 *
 * @param is the stream to analyse
 * @return the detected charset name, or {@code DEFAULT_CHARSET} as a fallback
 * @throws java.io.IOException if reading from the stream fails
 */
public static String extractCharset(InputStream is) throws java.io.IOException {
    final UniversalDetector sniffer = new UniversalDetector(null);
    final byte[] chunk = new byte[4096];

    for (int count = is.read(chunk);
            count > 0 && !sniffer.isDone();
            count = is.read(chunk)) {
        sniffer.handleData(chunk, 0, count);
    }
    sniffer.dataEnd();

    final String detected = sniffer.getDetectedCharset();
    if (detected == null) {
        LOGGER.debug("No encoding detected.");
    } else {
        LOGGER.debug("Detected encoding = " + detected);
    }
    sniffer.reset();

    final boolean usable = detected != null && CrawlUtils.isValidCharset(detected);
    return usable ? detected : DEFAULT_CHARSET;
}
示例15: detect
import org.mozilla.universalchardet.UniversalDetector; //導入依賴的package包/類
/**
 * Detects the charset of a file on a best-effort basis.
 *
 * @param file the file to analyse
 * @return the detected charset, or {@link Charset#defaultCharset()} when
 *         nothing is detected or any error occurs while reading or resolving
 */
public static Charset detect(File file) {
    FileInputStream fis = null;
    UniversalDetector detector = new UniversalDetector(null);
    try {
        byte[] buf = new byte[BUFFER_SIZE];
        fis = new FileInputStream(file);
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        detector.dataEnd();
        String detected = detector.getDetectedCharset();
        if (detected == null) {
            // Explicit fallback: previously this case only reached the default
            // through the exception thrown by Charset.forName(null).
            return Charset.defaultCharset();
        }
        return Charset.forName(detected);
    } catch (Exception e) {
        // Deliberate best-effort: any I/O or charset-lookup failure yields
        // the platform default instead of propagating.
        return Charset.defaultCharset();
    } finally {
        Closeables.closeQuitely(fis);
    }
}