当前位置: 首页>>代码示例>>Python>>正文


Python kenlm.LanguageModel方法代码示例

本文整理汇总了Python中kenlm.LanguageModel方法的典型用法代码示例。如果您正苦于以下问题:Python kenlm.LanguageModel方法的具体用法?Python kenlm.LanguageModel怎么用?Python kenlm.LanguageModel使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在kenlm的用法示例。


在下文中一共展示了kenlm.LanguageModel方法的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
                 min_substitutions=2, num_candidates=5, concatenate_corpora=True):
        """Record the configuration, make sure the filtered corpora exist, and load the language model.

        Every argument is stored unchanged on the instance under the same name.
        Two derived paths, ``filtered_path`` and ``noscore_path``, point at
        ``filtered_sick.txt`` and ``noscore_sick.txt`` inside ``target_directory``.

        Raises:
            ValueError: if ``lm_path`` is None. This is raised only after the
                corpus files have been checked (and ``filter_sick`` run if
                either was missing).
        """
        # Plain configuration, copied verbatim from the arguments.
        self.sick_path = sick_path
        self.target_directory = target_directory
        self.lm_path = lm_path
        self.wsd_algorithm = wsd_algorithm
        self.sampling_parameter = sampling_parameter
        self.min_substitutions = min_substitutions
        self.num_candidates = num_candidates
        self.concatenate_corpora = concatenate_corpora

        # Locations of the two corpus files derived from the original SICK corpus.
        self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
        self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')

        # Filter the original SICK corpus (and create the file used for LM
        # training) unless both output files are already on disk.
        corpora_present = os.path.exists(self.filtered_path) and os.path.exists(self.noscore_path)
        if not corpora_present:
            self.filter_sick()

        # A language model is mandatory; without a path there is nothing to load.
        if self.lm_path is None:
            raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
        self.language_model = kenlm.LanguageModel(self.lm_path)
开发者ID:demelin,项目名称:Sentence-similarity-classifier-for-pyTorch,代码行数:21,代码来源:sick_extender.py

示例2: score_generated_sentences

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def score_generated_sentences(generated_text_file_path, language_model_path):
    """Return the mean KenLM score and mean perplexity over the lines of a text file.

    Each line of ``generated_text_file_path`` is passed through ``clean_text``
    and then scored with the model loaded from ``language_model_path``.

    Returns:
        A ``(mean_score, mean_perplexity)`` tuple. ``statistics.mean`` raises
        ``StatisticsError`` when the file has no lines.
    """
    import kenlm

    lm = kenlm.LanguageModel(language_model_path)

    # One (score, perplexity) pair per input line, in file order.
    per_sentence = []
    with open(generated_text_file_path) as handle:
        for raw_line in handle:
            text = clean_text(raw_line)
            per_sentence.append((lm.score(text), lm.perplexity(text)))

    mean_log_prob = statistics.mean(score for score, _ in per_sentence)
    mean_perplexity = statistics.mean(ppl for _, ppl in per_sentence)
    return mean_log_prob, mean_perplexity
开发者ID:vineetjohn,项目名称:linguistic-style-transfer,代码行数:15,代码来源:language_fluency.py

示例3: __init__

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, labels=None, transliteration=False):
        """Load the per-language resources stored next to this module.

        For every label, two KenLM models are loaded from ``blm_models``
        (``<tag>.tk.blm`` and ``<tag>.ts.blm``). For every label except
        ``"eng"``, a decision tree is loaded from
        ``decision_trees/eng-<tag>.json``; ``"kan"`` uses the ``"mal"`` tree.
        The emoticon set is read from ``extras/emoticons.txt`` (tab separated).

        Args:
            labels: language tags to load resources for. ``None`` (the
                default) means ``['hin', 'eng']``.
            transliteration: stored unchanged as ``self.flag``.
        """
        # Build the default per call: a list literal in the signature would be
        # one shared object stored on every instance that uses the default.
        if labels is None:
            labels = ['hin', 'eng']

        self.flag = transliteration
        self.labels = labels
        self.wxp = wxilp(order="wx2utf")
        # Directory containing this module; dirname is portable, unlike
        # splitting the absolute path on '/'.
        path = os.path.dirname(os.path.abspath(__file__))
        self.tag_dct = {tag: i for i, tag in enumerate(labels)}
        self.tree, self.queue, self.blm_wp, self.blm_sp = list(), list(), list(), list()

        # load decision trees
        for tag in self.labels:
            if tag == "eng":
                # No tree file is read for "eng"; a placeholder keeps
                # self.tree index-aligned with self.labels.
                self.tree.append("_")
                continue
            # "kan" reuses the "mal" decision tree.
            tree_tag = "mal" if tag == "kan" else tag
            tree_file = os.path.join(path, 'decision_trees', 'eng-%s.json' % tree_tag)
            with open(tree_file) as fp:
                self.tree.append(json.load(fp))

        # load language-models
        models_dir = os.path.join(path, 'blm_models')
        for tag in self.labels:
            self.blm_wp.append(kenlm.LanguageModel(os.path.join(models_dir, '{}.tk.blm'.format(tag))))
            self.blm_sp.append(kenlm.LanguageModel(os.path.join(models_dir, '{}.ts.blm'.format(tag))))

        # load emoticon set
        with open(os.path.join(path, 'extras', 'emoticons.txt')) as fp:
            self.emoticons = set(fp.read().split('\t'))

        self.reg = re.compile(r"(^[^a-zA-Z0-9]+|[^-'a-zA-Z0-9]+|[^a-zA-Z0-9]+$)")
开发者ID:irshadbhat,项目名称:litcm,代码行数:30,代码来源:litran.py

示例4: _test_log_p

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def _test_log_p(queries):
    """Assert that every ARPA parser agrees with KenLM on the log-probability of each query.

    ``queries`` is an iterable of n-grams (sequences of tokens); each is
    joined with spaces before being handed to KenLM. Values must match to
    four decimal places.
    """
    reference_lm = kenlm.LanguageModel(TEST_ARPA)
    for parser in PARSERS:
        candidate_lm = arpa.loadf(TEST_ARPA, parser=parser)[0]
        score_pairs = []
        for ngram in queries:
            ours = candidate_lm.log_p(ngram)
            # Compare against the first field of the last item yielded by
            # full_scores. NOTE(review): the two False flags presumably turn
            # off the sentence-boundary markers; confirm against the kenlm API.
            kenlm_items = list(reference_lm.full_scores(' '.join(ngram), False, False))
            score_pairs.append((ours, kenlm_items[-1][0]))
        assert all(round(ours - theirs, 4) == 0 for ours, theirs in score_pairs)
开发者ID:sfischer13,项目名称:python-arpa,代码行数:14,代码来源:test_arpa_kenlm.py

示例5: _test_log_s

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def _test_log_s(sentences, sos, eos):
    """Assert that every ARPA parser agrees with KenLM on whole-sentence scores.

    ``sos`` and ``eos`` are forwarded as-is to ``log_s`` and coerced to
    ``bool`` for ``kenlm.LanguageModel.score``. Scores must match to two
    decimal places.
    """
    reference_lm = kenlm.LanguageModel(TEST_ARPA)
    for parser in PARSERS:
        candidate_lm = arpa.loadf(TEST_ARPA, parser=parser)[0]
        score_pairs = []
        for sentence in sentences:
            ours = candidate_lm.log_s(sentence, sos=sos, eos=eos)
            theirs = reference_lm.score(sentence, bool(sos), bool(eos))
            score_pairs.append((ours, theirs))
        assert all(round(ours - theirs, 2) == 0 for ours, theirs in score_pairs)
开发者ID:sfischer13,项目名称:python-arpa,代码行数:14,代码来源:test_arpa_kenlm.py

示例6: load_lm

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def load_lm(lm):
    """Construct a ``kenlm.LanguageModel`` from ``lm`` and return it."""
    model = kenlm.LanguageModel(lm)
    return model
开发者ID:jiaeyan,项目名称:Jiayan,代码行数:4,代码来源:__init__.py

示例7: __init__

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def __init__(self, ngram_model, token_list):
        """Initialize Ngrambase.

        Args:
            ngram_model: ngram model path
            token_list: token list from dict or model.json

        """
        # Copy the token list, replacing the "<eos>" symbol with "</s>".
        symbols = []
        for token in token_list:
            if token != "<eos>":
                symbols.append(token)
            else:
                symbols.append("</s>")
        self.chardict = symbols
        self.charlen = len(self.chardict)

        self.lm = kenlm.LanguageModel(ngram_model)
        # A kenlm.State object created once and kept on the instance.
        self.tmpkenlmstate = kenlm.State()
开发者ID:espnet,项目名称:espnet,代码行数:14,代码来源:ngram.py

示例8: load_lm

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def load_lm(self, lm_path:str):
        """Remember ``lm_path`` on the instance and load the KenLM model it points to into ``self.lm``."""
        # The path is recorded first, so it is set even if loading raises.
        self.lm_path = lm_path
        loaded_model = kenlm.LanguageModel(self.lm_path)
        self.lm = loaded_model
开发者ID:bitextor,项目名称:bicleaner,代码行数:5,代码来源:lm.py

示例9: generateSummaries

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def generateSummaries(sentences, length=100, mode = "Extractive", ranker = rankingModes['TR']):
    
        
    '''
    Select sentences with the ILP solver and print the resulting summary.

    sentences -- input sentences. In "Abstractive" mode each one is decoded
                 as UTF-8, tokenized and POS-tagged before further processing;
                 in "Extractive" mode they are handed to the solver as-is.
    length    -- forwarded to the solver as l_max.
    mode      -- "Abstractive" or "Extractive"; any other value does nothing.
    ranker    -- forwarded to wg.solveILPFactBased. The default,
                 rankingModes['TR'], is looked up once, when the function
                 is defined.

    Returns the first element of each generated pair when "Abstractive" mode
    yields at most one generated sentence; in every other case the summary is
    only printed and the function returns None.

    NOTE(review): this is Python 2 code (print statements, str.decode), and
    wg, stopwords, bigramTweetGenerator, txtFromSents, RESOURCES_DIR and nltk
    must be provided by the enclosing module.
    '''
    if mode == "Abstractive":
        # kenlm is imported here, so it is only required for the abstractive path.
        import kenlm
        lm = kenlm.LanguageModel(RESOURCES_DIR+'/lm-3g.klm')
        '''
        Here sentences should have POS tagged format
        '''
        # Rewrite every sentence as space-separated "word/POS" tokens.
        taggedsentences=[]
        for sent in sentences: 
            # Decode as UTF-8, dropping bytes that cannot be decoded.
            sent=sent.decode('utf-8','ignore')
            tagged_sent=''
            tagged_tokens=nltk.pos_tag(nltk.word_tokenize(sent))
            for token in tagged_tokens:
                word, pos=token
                tagged_sent=tagged_sent+' '+word+"/"+pos
            # strip() removes the leading space added before the first token.
            taggedsentences.append(tagged_sent.strip())
            
        # From here on `sentences` holds the output of bigramTweetGenerator,
        # not the caller's original input.
        sentences=bigramTweetGenerator(taggedsentences)
        genSentences, svolist=wg.retrieveNewSentences(sentences, stopwords)
    
        # With at most one generated sentence the solver is skipped and the
        # first element of each (k, v) pair is returned without printing.
        if len(genSentences) <= 1:
            return [k for k, v in genSentences]
        finalSentencesRetained=wg.solveILPFactBased(genSentences,
                                            lm,                                             
                                            stopwords, 
                                            ranker,
                                            intraGenSimThreshold=0.5, 
                                            l_max=length,
                                            mode="Abstractive"
                                            )
    
        
        summary=txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary           
    
    if mode == "Extractive":
        lm=[] #No need of language model in Extractive
        #if len(sentences) <= 2:
        #    summary=txtFromSents(sentences)
        #    print "Summary: ", summary 
        #    return 
        
        # The raw input is printed before solving. This path uses a
        # different intraGenSimThreshold (0.7) than the abstractive one (0.5).
        print sentences
        finalSentencesRetained=wg.solveILPFactBased(sentences,
                                            lm,                                            
                                            stopwords, 
                                            ranker,
                                            intraGenSimThreshold=0.7, 
                                            l_max=length,
                                            mode="Extractive"
                                            )
        
        print 'Final sentences,', finalSentencesRetained
        summary=txtFromSents(finalSentencesRetained)
        print "=======Summary:===== \n", summary 
开发者ID:StevenLOL,项目名称:AbTextSumm,代码行数:62,代码来源:Example.py

示例10: train_lm

# 需要导入模块: import kenlm [as 别名]
# 或者: from kenlm import LanguageModel [as 别名]
def train_lm(self, text_path:str):
        """Estimate a KenLM model from the text at ``text_path`` and load it into ``self.lm``.

        The input is processed in two passes through temporary files:
        sentence-split and tokenized first, then re-read line by line for
        placeholder replacement. The second file is handed to
        ``self._estimate_kenlm`` together with a freshly reserved output path.

        On success ``self.lm_path`` names the estimated model file. That file
        is not deleted here, because ``self.lm`` is loaded from it. The two
        intermediate files are always removed, even when a step raises.

        Args:
            text_path: path of the plain-text training corpus.
        """
        tokenized_path = None
        placeholderized_path = None
        try:
            # Tokenize text. `with` closes the temp file even if
            # splitting or tokenizing raises.
            with NamedTemporaryFile("w", delete=False) as tokenized_f:
                tokenized_path = tokenized_f.name
                with open(text_path) as input_f:
                    for line in input_f:
                        line = line.rstrip("\n")
                        for sentence in self._sentence_split(line):
                            tokenized_f.write(self._tokenize(sentence))
                            tokenized_f.write("\n")

            # Perform placeholder replacement if needed
            with NamedTemporaryFile("w", delete=False) as placeholderized_f:
                placeholderized_path = placeholderized_f.name
                with open(tokenized_path) as tokenized_ff:
                    for line in tokenized_ff:
                        line = line.rstrip("\n")
                        with_placeholders = self._introduce_placeholders(line)
                        logging.debug("Processed training example: {}".format(with_placeholders))
                        placeholderized_f.write(with_placeholders)
                        placeholderized_f.write("\n")

            # Estimate LM. The temp file is created and closed at once; only
            # its name is needed, as the output location.
            lm_file = NamedTemporaryFile(delete=False)
            lm_file.close()

            # The same estimation parameters are used for every LM type.
            params = "-o 7 --discount_fallback"

            self._estimate_kenlm(placeholderized_path, lm_file.name, params)
            self.lm_path = lm_file.name

            self.lm = kenlm.LanguageModel(self.lm_path)
        finally:
            # Remove temporary files, whether or not training succeeded.
            for tmp_path in (tokenized_path, placeholderized_path):
                if tmp_path is not None:
                    os.remove(tmp_path)
开发者ID:bitextor,项目名称:bicleaner,代码行数:44,代码来源:lm.py


注:本文中的kenlm.LanguageModel方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。