This article collects typical usage examples of the Python method langid.classify. If you have been wondering what exactly langid.classify does and how to use it, the curated code examples below may help. You can also explore further usage examples from the langid module itself.
The following presents 13 code examples of the langid.classify method, sorted by popularity by default.
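Before the examples, a minimal sketch of the method's contract may help: langid.classify takes a string and returns a (language, score) tuple, where the language is an ISO 639-1 code and, by default, the score is an unnormalized log-probability rather than a normalized confidence.

import langid

lang, score = langid.classify("This is a test sentence.")
print(lang)   # 'en'
print(score)  # unnormalized log-probability by default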
Example 1: search
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def search(self, q):
    # Also requires: random, requests, and urlencode/quote_plus from urllib.parse
    payload = {
        "q": q,
        "api_key": self.api_key,
        "limit": self.limit,
        # Detect the query's language so Giphy returns localized results
        "lang": langid.classify(q)[0],
    }
    query = urlencode(payload, quote_via=quote_plus)
    r = requests.get(f"{self.base_url}search?{query}")
    if r.status_code == 200:
        result = r.json()["data"]
        if len(result) == 0:
            return None
        chosen_gif = random.choice(result)
        url = chosen_gif["images"]["downsized"]["url"]
        attachments = MsgTemplate.make_giphy_template(q, url)
        self.slackbot.send_message(attachments=attachments)
        return True
    else:
        return None
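Note that the detected ISO 639-1 code is passed straight through as Giphy's lang query parameter, so the search results are localized to the language of the query text itself.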
Example 2: language_filter
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def language_filter(temp_text, temp_comments, target_language, record_id, url):
    '''Run external component (if installed) for language identification'''
    # sanity check on language
    if target_language is not None:
        if LANGID_FLAG is True:
            # comments
            if len(temp_comments) > len(temp_text):
                langtest = temp_comments
            # default
            else:
                langtest = temp_text
            langresult = langid.classify(langtest)
            if langresult[0] != target_language:
                LOGGER.warning('wrong language: %s %s %s', langresult, record_id, url)
                LOGGER.debug('wrong language: %s %s', langresult, temp_text)
                return True
        else:
            LOGGER.warning('langid not installed, no language detection run')
    return False
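When the target language is known in advance, langid.py can also be constrained to a fixed candidate set with langid.set_languages, which makes misclassification of short strings less likely. A minimal sketch (the language list here is illustrative):

import langid

# Restrict the classifier to a known candidate set
langid.set_languages(['de', 'fr', 'it'])
print(langid.classify("Das ist ein Test."))  # ('de', score)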
Example 3: clean_pair
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def clean_pair(pair):
    bytes_en, bytes_zh = pair
    line_en = str(bytes_en, 'utf-8')
    line_zh = str(bytes_zh, 'utf-8')
    en_words_list = line_en.split()
    en_words = len(en_words_list)
    if en_words < 2: return None
    if en_words > 130 or len(line_en) > 1000: return None
    zh_chars = count_hanzi(line_zh)
    if zh_chars < 2: return None
    if zh_chars > 200 or len(line_zh) > 300: return None
    ratio = zh_chars / en_words
    # Empirically determined from the 1st and 99th percentiles in a sample of
    # the UN corpus
    if ratio < 0.6 or ratio > 2.6: return None
    #if not detect(line_en) == 'en': return None
    #if not detect(line_zh)[:2] == 'zh': return None
    if not langid.classify(line_en)[0] == 'en': return None
    if not langid.classify(line_zh)[0] == 'zh': return None
    return ' '.join(en_words_list), ' '.join(line_zh.split())
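count_hanzi is not defined in this excerpt. A plausible, hypothetical implementation would count characters in the CJK Unified Ideographs block:

def count_hanzi(s):
    # Hypothetical helper: count CJK Unified Ideographs (U+4E00..U+9FFF)
    return sum(1 for ch in s if '\u4e00' <= ch <= '\u9fff')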
Example 4: _get_sent_laser
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def _get_sent_laser(
    hypothesis: List[str], references: List[List[str]],
    extra_args: Optional[Dict[str, str]] = None
) -> List[float]:
    set_up()
    import laserembeddings
    import langid
    import logging
    logging.getLogger('langid').setLevel(logging.WARNING)
    # Detect the language of each side from a sentence in the middle of the corpus
    n_samples = len(hypothesis)
    mid_idx = n_samples // 2
    hypo_lang = langid.classify(hypothesis[mid_idx])[0]
    ref_lang = langid.classify(references[0][mid_idx])[0]
    laser = laserembeddings.Laser()
    hypo_emb = laser.embed_sentences(hypothesis, lang=hypo_lang)
    ref_emb = laser.embed_sentences(references[0], lang=ref_lang)
    # Cosine similarity per pair (np refers to numpy imported at module level)
    inner_product = np.sum(hypo_emb * ref_emb, axis=1)
    hypo_l2 = np.linalg.norm(hypo_emb, axis=1)
    ref_l2 = np.linalg.norm(ref_emb, axis=1)
    return (inner_product / (hypo_l2 * ref_l2)).tolist()
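The value returned for each pair is the cosine similarity of the two LASER sentence embeddings, so scores lie in [-1, 1], with higher values indicating semantically closer sentences.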
Example 5: tag_lang_pair
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def tag_lang_pair(cls, src: str, ref: Optional[str]) -> List[str]:
    machine_tags = []
    src_lang = langid.classify(src)[0]
    ref_lang = None if ref is None else langid.classify(ref)[0]
    if ref is not None and src_lang == ref_lang:
        machine_tags.append(f'lang: {ref_lang}')
    else:
        machine_tags.append(f'src_lang: {src_lang}')
        if ref is not None:
            machine_tags.append(f'trg_lang: {ref_lang}')
    if ref_lang is not None \
            and ref_lang in cls.POTENTIAL_UNSEGMENTED_LANGUAGES \
            and ref.find(' ') == -1:
        machine_tags.append('unsegmented_trg')
    return machine_tags
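POTENTIAL_UNSEGMENTED_LANGUAGES presumably lists languages written without spaces between words, such as Chinese or Japanese: if the detected target language is one of them and the reference contains no space character, the pair is tagged as having an unsegmented target.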
Example 6: language
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def language():
    import langid
    data = dict(default_data)
    data['message'] = "Language Detection API - Usage: 'text' GET/POST parameter"
    data['langid'] = {}

    params = {}
    if request.method == 'GET':
        params['text'] = request.args.get('text')
    elif request.method == 'POST':
        params = request.form  # postdata
    else:
        data['error'] = 'Invalid request method'
        return jsonify(data)

    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)
    if not params.get('text'):
        data['error'] = '[text] parameter not found'
        return jsonify(data)

    lang_data = langid.classify(params['text'])
    data['langid']['language'] = lang_data[0]
    data['langid']['score'] = lang_data[1]
    data['message'] = "Detected Language: " + data['langid']['language']
    return jsonify(data)
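Assuming the function is registered under a route such as /language (the route decorator is not part of this excerpt), a client call might look like the following sketch:

import requests

# Hypothetical host and route; adjust to the actual Flask app configuration
r = requests.get("http://localhost:5000/language", params={"text": "Bonjour le monde"})
print(r.json())  # e.g. {'langid': {'language': 'fr', 'score': ...}, ...}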
Example 7: __init__
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def __init__(self, channel=None, user=None, input_text=None):
    self.slacker = Slacker(Config.slack.get("TOKEN", "<TOKEN>"))
    self.channel = channel
    self.data_handler = DataHandler()
    self.user = user
    # Fall back to the configured language when there is no text to classify
    if input_text is None:
        self.lang_code = Config.bot.get("LANG_CODE", "en")
    else:
        self.lang_code = langid.classify(input_text)[0]
Example 8: __init__
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def __init__(self, text):
    self.text = text
    # Dispatch to a language-specific handler based on the detected language
    self.lang_code = langid.classify(text)[0]
    if self.lang_code == "ko":
        self.instance = KorDisintegrator()
    elif self.lang_code == "en":
        self.instance = EngDisintegrator()
    else:
        self.instance = None
Example 9: score
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def score(
    self, hypothesis: List[str], references: List[List[str]],
    tags: Optional[List[List[str]]] = None
) -> VizSeqScore:
    corpus_score, sent_scores, group_scores = None, None, None
    import bert_score as bs
    import langid
    import logging
    logging.getLogger('pytorch_pretrained_bert').setLevel(logging.WARNING)
    logging.getLogger('langid').setLevel(logging.WARNING)
    # BERTScore needs the target language; detect it from the first reference
    lang = langid.classify(references[0][0])[0]
    sent_scores = bs.score(
        hypothesis, references[0], nthreads=self.n_workers, lang=lang,
        verbose=self.verbose
    )[2].tolist()
    if self.corpus_level:
        corpus_score = np.mean(sent_scores)
    if tags is not None:
        tag_set = self._unique(tags)
        group_scores = {}
        for t in tag_set:
            indices = [i for i, cur in enumerate(tags) if t in cur]
            group_scores[t] = np.mean([sent_scores[i] for i in indices])
    return VizSeqScore.make(
        corpus_score=corpus_score, sent_scores=sent_scores,
        group_scores=group_scores
    )
Example 10: tag_lang
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def tag_lang(cls, ref: str) -> str:
    return langid.classify(ref)[0]
Example 11: process
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def process(line, dataset, essay_id):
    # Assumes: json, re, xml.etree.ElementTree as ET, parsed args, clean_sentence
    changes = 0
    # Strip control characters before parsing the JSON record
    row = json.loads(re.sub(r'[\x00-\x1F]+', '', line))
    if args.language != row[2]:
        return False, 0
    map = []
    num = 0
    match = 0
    correction = False
    # First pass: classify each source sentence and look for a real correction
    for i in range(len(row[4])):
        row[4][i] = re.sub(r'\s+', ' ', row[4][i].strip())
        row[4][i] = clean_sentence(row[4][i])
        if len(row[4][i]):
            num += 1
            s_language, _ = langid.classify(row[4][i])
            if s_language == args.id:
                match += 1
            if not correction:
                for each in row[5][i]:
                    if each:
                        each = re.sub(r'\s+', ' ', each.strip())
                        each = clean_sentence(each)
                        if len(each):
                            t_language, _ = langid.classify(each)
                            if t_language == args.id and row[4][i] != each:
                                correction = True
                                break
            map.append(s_language)
        else:
            map.append('null')
    if match < 2 or not correction:
        return False, 0
    # Second pass: emit the essay as XML with per-sentence language tags
    essay = ET.SubElement(dataset, 'essay', attrib={'id': str(essay_id), 'journal_id': row[0], 'user_id': row[1], 'learning_language': row[2], 'native_language': row[3]})
    sentence_id = 0
    for i in range(len(row[4])):
        if len(row[4][i]):
            sentence = ET.SubElement(essay, 'sentence', attrib={'id': str(sentence_id)})
            source = ET.SubElement(sentence, 'source')
            source.text = row[4][i]
            source.set("langid", map[i])
            for each in row[5][i]:
                if each:
                    each = re.sub(r'\s+', ' ', each.strip())
                    each = clean_sentence(each)
                    if len(each):
                        target = ET.SubElement(sentence, 'target')
                        target.text = each
                        t_language, _ = langid.classify(target.text)
                        target.set("langid", t_language)
                        if t_language == args.id and source.text != target.text:
                            changes += 1
            sentence_id += 1
    return True, changes
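The returned pair is (accepted, changes): an essay is accepted only if at least two source sentences are classified as the target language and at least one correction genuinely differs from its source sentence.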
Example 12: detect_lang
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def detect_lang(main, file):
    text = ''
    try:
        with open(file['path'], 'r', encoding=file['encoding']) as f:
            if main.settings_custom['auto_detection']['detection_settings']['number_lines_no_limit']:
                for line in f:
                    text += line
            else:
                for i, line in enumerate(f):
                    if i < main.settings_custom['auto_detection']['detection_settings']['number_lines']:
                        text += line
                    else:
                        break

        lang_code_639_1 = langid.classify(text)[0]
        # Chinese (Simplified) & Chinese (Traditional)
        if lang_code_639_1 == 'zh':
            lang_code_639_1 = 'zh_cn'
            for lang in sorted(langdetect.detect_langs(text), key=lambda item: -item.prob):
                if lang.lang in ['zh-cn', 'zh-tw']:
                    lang_code_639_1 = lang.lang.replace('-', '_')
                    break
        # Norwegian Bokmål
        elif lang_code_639_1 == 'no':
            lang_code_639_1 = 'nb'
        # Serbian (Cyrillic)
        elif lang_code_639_1 == 'sr':
            lang_code_639_1 = 'sr_cyrl'

        lang = wl_conversion.to_iso_639_3(main, lang_code_639_1)
        success = True
    except:
        lang = main.settings_custom['auto_detection']['default_settings']['default_lang']
        success = False
    return lang, success
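Because langid.py reports only a single zh label and does not separate Simplified from Traditional Chinese, the snippet falls back to langdetect for that case. A minimal sketch of the refinement step:

import langdetect

# langdetect distinguishes zh-cn from zh-tw; pick the most probable candidate
candidates = langdetect.detect_langs('中文文本示例')
best = max(candidates, key=lambda item: item.prob)
print(best.lang, best.prob)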
Example 13: extract_corrections
# Required module: import langid [as alias]
# Or: from langid import classify [as alias]
def extract_corrections(file):
    """Search for sentences and corrections from Lang-8 HTML"""
    import os.path
    import re
    import langid
    fname = os.path.basename(file)
    if not os.path.isfile(file):
        return
    re_sentences = re.compile(r'.*",\["(.*)"\],\[\[.*')
    re_corrections = re.compile(r'.*\],\[\[(.*)\]\]\]')
    sentences = "[]"
    corrections = "[]"
    language = ""
    for line in open(file, 'r'):
        line = line.rstrip()
        journal_id = line.split(',')[0][2:-1]
        author_id = line.split(',')[1][1:-1]
        language = line.split(',')[3][1:-1]
        m = re_sentences.match(line)
        if m:
            sentences = re.split('","', m.group(1))
        m = re_corrections.match(line)
        if m:
            corrections = re.split(r'\],\[', m.group(1))
        if len(sentences) != len(corrections):
            continue
        # Skip entries whose corrections are all empty
        empty = False
        for x in range(len(corrections)):
            if len(corrections[x]) > 0:
                break
            if x == len(corrections) - 1:
                empty = True
        if empty:
            continue
        for x in range(len(corrections)):
            lang, prob = langid.classify(sentences[x])
            if lang != 'en':
                continue
            if len(corrections[x]) > 0:
                corrections[x] = corrections[x][1:-1]
                if corrections[x] == '':
                    # No visible change: emit the sentence as its own correction
                    print('\t'.join([journal_id, author_id, language, sentences[x], sentences[x]]))
                else:
                    for corr in re.split('","', corrections[x]):
                        lang, prob = langid.classify(corr)
                        if lang != 'en':
                            continue
                        # Skip corrections far longer than the source sentence
                        if len(corr.split(" ,.")) - len(sentences[x].split(" ,.")) > 5:
                            continue
                        print('\t'.join([journal_id, author_id, language, sentences[x], corr]))