本文整理汇总了Python中kenlm.LanguageModel方法的典型用法代码示例。如果您正苦于以下问题:Python kenlm.LanguageModel方法的具体用法?Python kenlm.LanguageModel怎么用?Python kenlm.LanguageModel使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类kenlm的用法示例。
在下文中一共展示了kenlm.LanguageModel方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
             min_substitutions=2, num_candidates=5, concatenate_corpora=True):
    """Store the configuration, prepare the filtered corpora and load the language model.

    Every argument is kept on the instance under its own name. Two corpus
    paths, ``filtered_sick.txt`` and ``noscore_sick.txt``, are derived inside
    ``target_directory``; if either file is missing, ``self.filter_sick()``
    is called before the language model is looked at.

    Raises:
        ValueError: if ``lm_path`` is None (raised after the filtering step).
    """
    # Keep every constructor argument on the instance.
    self.sick_path, self.target_directory, self.lm_path = sick_path, target_directory, lm_path
    self.wsd_algorithm, self.sampling_parameter = wsd_algorithm, sampling_parameter
    self.min_substitutions, self.num_candidates = min_substitutions, num_candidates
    self.concatenate_corpora = concatenate_corpora

    # Both derived corpus files live inside the target directory.
    self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
    self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')

    # Filter the original SICK corpus to match the expected format, and create
    # the file for LM training, unless both outputs are already on disk.
    corpora_ready = os.path.exists(self.filtered_path) and os.path.exists(self.noscore_path)
    if not corpora_ready:
        self.filter_sick()

    # The check comes after filtering so the noscore corpus named in the
    # error message exists by the time the caller is told to train on it.
    if self.lm_path is None:
        raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
    self.language_model = kenlm.LanguageModel(self.lm_path)
示例2: score_generated_sentences
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def score_generated_sentences(generated_text_file_path, language_model_path):
    """Return the mean model score and the mean perplexity over the lines of a file.

    Each line of ``generated_text_file_path`` is passed through ``clean_text``
    and then handed to the ``score`` and ``perplexity`` methods of the KenLM
    model loaded from ``language_model_path``.

    Returns:
        A ``(mean score, mean perplexity)`` tuple computed with ``statistics.mean``.
    """
    import kenlm

    sentence_scores, sentence_perplexities = [], []
    lm = kenlm.LanguageModel(language_model_path)
    with open(generated_text_file_path) as lines:
        for raw_line in lines:
            text = clean_text(raw_line)
            sentence_scores.append(lm.score(text))
            sentence_perplexities.append(lm.perplexity(text))

    mean_score = statistics.mean(sentence_scores)
    mean_perplexity = statistics.mean(sentence_perplexities)
    return mean_score, mean_perplexity
示例3: __init__
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, labels=None, transliteration=False):
    """Load the decision trees, language models and emoticon set for ``labels``.

    Args:
        labels: language tags to load resources for. Defaults to
            ``['hin', 'eng']`` when omitted or None.
        transliteration: flag stored unchanged as ``self.flag``.

    All resources are read from sub-directories (``decision_trees``,
    ``blm_models``, ``extras``) of the directory containing this module.
    """
    # Build the default per call: a list literal in the signature would be one
    # shared object stored as ``self.labels`` on every default-constructed instance.
    if labels is None:
        labels = ['hin', 'eng']
    self.flag = transliteration
    self.labels = labels
    self.wxp = wxilp(order="wx2utf")
    # dirname() instead of splitting on '/', so the module directory is also
    # found where the path separator is not '/'.
    path = os.path.dirname(os.path.abspath(__file__))
    self.tag_dct = {tag: i for i, tag in enumerate(labels)}
    self.tree, self.queue, self.blm_wp, self.blm_sp = [], [], [], []
    # load decision trees
    for tag in self.labels:
        if tag == "eng":
            # No tree file is read for "eng"; the placeholder keeps the
            # positions in self.tree aligned with self.labels.
            self.tree.append("_")
            continue
        # "kan" is served by the "mal" decision tree file.
        tree_tag = "mal" if tag == "kan" else tag
        with open(os.path.join(path, 'decision_trees', 'eng-%s.json' % tree_tag)) as fp:
            self.tree.append(json.load(fp))
    # load language-models (two per label: the .tk.blm and the .ts.blm file)
    models_dir = os.path.join(path, 'blm_models')
    for tag in self.labels:
        self.blm_wp.append(kenlm.LanguageModel(os.path.join(models_dir, '{}.tk.blm'.format(tag))))
        self.blm_sp.append(kenlm.LanguageModel(os.path.join(models_dir, '{}.ts.blm'.format(tag))))
    # load emoticon set (tab-separated entries)
    with open(os.path.join(path, 'extras', 'emoticons.txt')) as fp:
        self.emoticons = set(fp.read().split('\t'))
    self.reg = re.compile(r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
示例4: _test_log_p
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def _test_log_p(queries):
    """Check that every parser's n-gram log-probabilities agree with KenLM.

    For each parser in ``PARSERS`` the ARPA test file is loaded and each n-gram
    in ``queries`` is scored with ``log_p``. The value is compared, after
    rounding the difference to 4 decimals, with the score KenLM's
    ``full_scores`` reports for the last word of the same n-gram.
    """
    reference = kenlm.LanguageModel(TEST_ARPA)
    for parser in PARSERS:
        candidate = arpa.loadf(TEST_ARPA, parser=parser)[0]
        score_pairs = []
        for ngram in queries:
            ours = candidate.log_p(ngram)
            # full_scores yields one entry per word; the last entry's first
            # field is the one compared against log_p.
            per_word = list(reference.full_scores(' '.join(ngram), False, False))
            theirs = per_word[-1][0]
            score_pairs.append((ours, theirs))
        assert all(round(ours - theirs, 4) == 0 for ours, theirs in score_pairs)
示例5: _test_log_s
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def _test_log_s(sentences, sos, eos):
    """Check that every parser's sentence scores agree with KenLM.

    For each parser in ``PARSERS`` the ARPA test file is loaded and each
    sentence is scored with ``log_s`` using the given ``sos``/``eos`` settings.
    The value is compared, after rounding the difference to 2 decimals, with
    KenLM's ``score`` called with the same settings coerced to booleans.
    """
    reference = kenlm.LanguageModel(TEST_ARPA)
    for parser in PARSERS:
        candidate = arpa.loadf(TEST_ARPA, parser=parser)[0]
        score_pairs = []
        for sentence in sentences:
            ours = candidate.log_s(sentence, sos=sos, eos=eos)
            theirs = reference.score(sentence, bool(sos), bool(eos))
            score_pairs.append((ours, theirs))
        assert all(round(ours - theirs, 2) == 0 for ours, theirs in score_pairs)
示例6: load_lm
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def load_lm(lm):
    """Build and return a ``kenlm.LanguageModel`` from ``lm``."""
    model = kenlm.LanguageModel(lm)
    return model
示例7: __init__
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, ngram_model, token_list):
    """Set up the n-gram scorer.

    Args:
        ngram_model: ngram model path
        token_list: token list from dict or model.json
    """
    # Every "<eos>" entry is stored as "</s>" instead
    # (presumably the end-of-sentence symbol the n-gram model uses; confirm).
    self.chardict = ["</s>" if token == "<eos>" else token for token in token_list]
    self.charlen = len(self.chardict)
    self.lm = kenlm.LanguageModel(ngram_model)
    # Scratch KenLM state object kept on the instance.
    self.tmpkenlmstate = kenlm.State()
示例8: load_lm
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def load_lm(self, lm_path:str):
    """Record ``lm_path`` on the instance and load the KenLM model from it into ``self.lm``."""
    self.lm_path = lm_path
    model = kenlm.LanguageModel(self.lm_path)
    self.lm = model
示例9: generateSummaries
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def generateSummaries(sentences, length=100, mode = "Extractive", ranker = rankingModes['TR']):
    '''
    This is where the ILP works to select the best sentences and form the summary.

    sentences -- input sentences; in "Abstractive" mode they are POS-tagged here
    length    -- handed to the ILP solver as l_max
    mode      -- "Abstractive" or "Extractive"; any other value does nothing
    ranker    -- ranking mode handed to the ILP solver

    The summary is printed, not returned. The only path that returns a value
    is the "Abstractive" early exit taken when at most one sentence was
    generated: it returns the first element of each generated pair.
    NOTE(review): that early exit is inconsistent with the other paths, which
    return None -- confirm what callers expect before unifying.
    '''
    if mode == "Abstractive":
        import kenlm
        lm = kenlm.LanguageModel(RESOURCES_DIR+'/lm-3g.klm')
        # Here sentences should have POS tagged format
        taggedsentences = []
        for sent in sentences:
            # Only byte strings are decoded; text strings have no usable
            # .decode() (none at all on Python 3) and are taken as they are.
            if isinstance(sent, bytes):
                sent = sent.decode('utf-8', 'ignore')
            tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))
            # "word/POS" items separated by single spaces.
            tagged_sent = ' '.join(word + "/" + pos for word, pos in tagged_tokens)
            taggedsentences.append(tagged_sent.strip())
        sentences = bigramTweetGenerator(taggedsentences)
        genSentences, _ = wg.retrieveNewSentences(sentences, stopwords)
        if len(genSentences) <= 1:
            return [k for k, v in genSentences]
        finalSentencesRetained = wg.solveILPFactBased(genSentences,
                                                      lm,
                                                      stopwords,
                                                      ranker,
                                                      intraGenSimThreshold=0.5,
                                                      l_max=length,
                                                      mode="Abstractive"
                                                      )
        summary = txtFromSents(finalSentencesRetained)
        # Single pre-formatted argument so the call prints the same text on
        # Python 2 and 3. No space follows the newline, matching what the
        # Python 2 print statement emitted after a trailing "\n".
        print("=======Summary:===== \n%s" % (summary,))
    if mode == "Extractive":
        lm = []  # No need of language model in Extractive
        print(sentences)
        finalSentencesRetained = wg.solveILPFactBased(sentences,
                                                      lm,
                                                      stopwords,
                                                      ranker,
                                                      intraGenSimThreshold=0.7,
                                                      l_max=length,
                                                      mode="Extractive"
                                                      )
        print('Final sentences, %s' % (finalSentencesRetained,))
        summary = txtFromSents(finalSentencesRetained)
        print("=======Summary:===== \n%s" % (summary,))
示例10: train_lm
# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def train_lm(self, text_path:str):
    """Train a KenLM model on the text in ``text_path`` and load it into ``self.lm``.

    Steps:
      1. Split every input line into sentences, tokenize each and write one
         tokenized sentence per line to a temporary file.
      2. Run placeholder replacement on each tokenized line into a second
         temporary file.
      3. Estimate the model into a third temporary file via
         ``self._estimate_kenlm`` and load it with ``kenlm.LanguageModel``.

    On success ``self.lm_path`` names the model file, which is kept on disk,
    and ``self.lm`` holds the loaded model. The two intermediate files are
    always removed. If any step raises, the model file is removed as well and
    ``self.lm_path`` / ``self.lm`` keep their previous values.

    Args:
        text_path: path of the raw training text, one or more sentences per line.
    """
    tokenized_f = NamedTemporaryFile("w", delete=False)
    placeholderized_f = NamedTemporaryFile("w", delete=False)
    lm_file = None
    trained = False
    try:
        # Tokenize text
        with tokenized_f, open(text_path) as input_f:
            for line in input_f:
                for sentence in self._sentence_split(line.rstrip("\n")):
                    tokenized_f.write(self._tokenize(sentence))
                    tokenized_f.write("\n")
        # Perform placeholder replacement if needed
        with placeholderized_f, open(tokenized_f.name) as tokenized_ff:
            for line in tokenized_ff:
                with_placeholders = self._introduce_placeholders(line.rstrip("\n"))
                logging.debug("Processed training example: {}".format(with_placeholders))
                placeholderized_f.write(with_placeholders)
                placeholderized_f.write("\n")
        # Estimate LM
        lm_file = NamedTemporaryFile(delete=False)
        lm_file.close()
        # Character- and word-level models use the same estimation settings.
        params = "-o 7 --discount_fallback"
        self._estimate_kenlm(placeholderized_f.name, lm_file.name, params)
        # Load first, publish afterwards: a failed load must not leave
        # self.lm_path pointing at a file that self.lm was never built from.
        model = kenlm.LanguageModel(lm_file.name)
        self.lm_path = lm_file.name
        self.lm = model
        trained = True
    finally:
        # Remove temporary files, also when a step above raised.
        tokenized_f.close()
        placeholderized_f.close()
        leftovers = [tokenized_f.name, placeholderized_f.name]
        if lm_file is not None and not trained:
            leftovers.append(lm_file.name)
        for leftover in leftovers:
            try:
                os.remove(leftover)
            except OSError:
                # Best-effort cleanup; never mask the original exception.
                pass