本文整理汇总了Python中ngram.NGram.classify方法的典型用法代码示例。如果您正苦于以下问题：Python NGram.classify方法的具体用法？Python NGram.classify怎么用？Python NGram.classify使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类ngram.NGram的用法示例。
在下文中一共展示了NGram.classify方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: LanguageIdentifier
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import classify [as 别名]
class LanguageIdentifier(object):
    """Identify the language of a piece of text with an n-gram classifier.

    On construction the language models in a model directory are handed to
    C{NGram}, and a configuration file in that directory is parsed into a
    mapping from model names to language codes. Classification results are
    translated through that mapping before being returned.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to the model directory, C{MODEL_DIR} by default).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Load the language-code configuration and the n-gram models.

        @type model_dir: str
        @param model_dir: Directory holding the language models; defaults to
            C{MODEL_DIR}.
        @type conf_file: str
        @param conf_file: Name of the configuration file, resolved relative
            to C{model_dir}; defaults to C{CONF_FILE}.
        @raises ValueError: If C{model_dir} is not an existing directory or
            the resolved configuration file does not exist.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Every line is split on whitespace: the first field is taken as the
        model file name and the second as the language code. Blank lines,
        comment lines (first field starting with C{#}) and lines without a
        second field are ignored. The result is stored in
        C{self._lang_codes}.

        @type conf_file: str
        @param conf_file: Path of the configuration file to read.
        """
        # Read through a context manager so the handle is closed right away
        # instead of whenever the garbage collector gets to it.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        self._lang_codes = {}
        for line in lines:
            parts = line.split()
            # Skip empty lines, comments (including indented ones) and
            # malformed lines that carry a name but no code.
            if len(parts) < 2 or parts[0].startswith('#'):
                continue

            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            # Remove the extension, if there is one
            if extsep in lname:
                lname = lname[:lname.rindex(extsep)]

            # Remove trailing '[_-]-utf8' from code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        @param text: The text to classify.
        @returns: C{None} for empty text; otherwise the language code mapped
            to the classifier's result, or the raw classifier result when no
            mapping exists for it.
        """
        if not text:
            return None

        result = self.ngram.classify(text)
        # Fall back to the classifier's own answer when it has no known code.
        return self._lang_codes.get(result, result)

    def _identify_from_units(self, instore, attr, limit):
        """Classify the joined C{attr} text of the first C{limit} units.

        Shared implementation of L{identify_source_lang} and
        L{identify_target_lang}.

        @param instore: Translation store, or list/tuple of units.
        @type attr: str
        @param attr: Name of the unit attribute holding the text
            (C{'source'} or C{'target'}).
        @type limit: int
        @param limit: Maximum number of leading units to inspect.
        @returns: The identified language's code or C{None}.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): presumably a TranslationStore keeps its units in a
        # ``units`` sequence and is not itself sliceable - confirm. Objects
        # without that attribute (lists, tuples) are sliced directly, exactly
        # as before.
        units = getattr(instore, 'units', instore)

        fragments = []
        for unit in units[:limit]:
            if not unit.istranslatable():
                continue
            value = getattr(unit, attr)
            if value:
                fragments.append(value)

        text = u' '.join(fragments)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        Only the first 50 units are considered.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract source text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_from_units(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        Only the first 200 units are considered.

        @type instore: C{TranslationStore} or list or tuple of
            C{TranslationUnit}s.
        @param instore: The translation store to extract target text from.
        @returns: The identified language's code or C{None} if the language
            could not be identified."""
        return self._identify_from_units(instore, 'target', 200)