This page collects typical usage examples of the Python method pymystem3.Mystem.analyze. If you have been asking yourself how exactly Mystem.analyze works, how to call it, or what real-world uses look like, the curated examples below should help. You can also read further about its containing class, pymystem3.Mystem.
Below, 15 code examples of Mystem.analyze are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
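Before the examples, a minimal sketch of the structure Mystem.analyze returns may help (ours, not taken from any project below): a list of token dicts, each with a text field and, for recognized words, an analysis list whose entries carry the lemma (lex) and grammar string (gr). The sample sentence is arbitrary:

    from pymystem3 import Mystem

    m = Mystem()
    for token in m.analyze("мама мыла раму"):
        analysis = token.get('analysis')  # absent or [] for spaces and punctuation
        if analysis:
            # e.g. text='мама', lex='мама', gr='S,жен,од=им,ед'
            print(token['text'], analysis[0]['lex'], analysis[0]['gr'])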
Example 1: without_pronouns
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            # keep a term unless its first parse is a pronoun (SPRO) or pronominal adjective (APRO)
            analysis = m.analyze(term)[0].get(u'analysis')
            if analysis:
                if not analysis[0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
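Each m.analyze(term) call above crosses into the mystem process, so analyzing term by term is slow on large corpora. A sketch of the same SPRO/APRO pronoun filter done in a single pass over the whole text (our variant, not from the project):

    def without_pronouns_fast(text, m):
        # m is a pymystem3.Mystem instance; analyze the full text once
        kept = []
        for token in m.analyze(text):
            analysis = token.get('analysis')
            if analysis and analysis[0]['gr'].startswith(('SPRO', 'APRO')):
                continue  # drop pronouns
            word = token['text'].strip()
            if word:
                kept.append(word)
        return ' '.join(kept)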
Example 2: lmtze
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def lmtze(textfile):
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            result.append(x['text'])
                        else:
                            # emit lemma_POS, e.g. "мыть_V"
                            result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue
                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
    print(len(result_full), 'sentences parsed')
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
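The gr value packs the POS tag and its features into one string, so the chained splits above peel off the bare tag regardless of whether the POS is followed by a comma or an equals sign:

    gr = 'V,несов,пе=прош,ед,изъяв,жен'   # sample grammar string
    pos = gr.split(',')[0].split('=')[0]  # -> 'V'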
Example 3: extract
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def extract(self):
    try:
        # collect the files sitting in the input directory
        input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
        output_data = {}
        list_of_all_terms = {}
        m = Mystem()
        # iterate over the documents
        for file in input_files:
            with open(self.input_directory + '/' + file) as data_file:
                data = json.load(data_file)
            list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
            text = " ".join(["%s" % term for term in list_of_terms])
            list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
            my_list = list_of_terms
            list_of_terms = []
            for term in my_list:
                # drop service parts of speech, but always keep the negation u'не'
                analysis = m.analyze(term)[0].get(u'analysis')
                if analysis:
                    if not analysis[0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                        list_of_terms.append(term)
                    if term == u'не':
                        list_of_terms.append(term)
                else:
                    list_of_terms.append(term)
            output_data[file] = {}
            output_data[file]['id'] = data['id']
            output_data[file]['positive'] = data['positive']
            output_data[file]['sarcasm'] = data['sarcasm']
            output_data[file]['terms'] = {}
            # collapse repeated words into per-document counts
            for term in list_of_terms:
                if term not in output_data[file]['terms']:
                    output_data[file]['terms'][term] = 1
                else:
                    output_data[file]['terms'][term] += 1
            for term in output_data[file]['terms'].keys():
                if term not in list_of_all_terms:
                    list_of_all_terms[term] = 1
                else:
                    list_of_all_terms[term] += 1
                # tf computation
                count_of_terms = output_data[file]['terms'][term]
                output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                    'count': count_of_terms}
        for file in input_files:
            # idf computation
            for term in output_data[file]['terms'].keys():
                output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
            # write out the result
            with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                json.dump(output_data[file], output_file)
    except Exception:
        return False
    else:
        return True
Developer ID: pombredanne, Project: senty, Lines: 58, Source: standard_extractor_with_mystem_without_service_parts_of_speech.py
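For reference, the weights computed above are the textbook definitions tf(t, d) = count(t, d) / |d| and idf(t) = ln(N / df(t)), with N documents in the collection. A compact standalone sketch of the same computation:

    import math
    from collections import Counter

    def tf_idf(docs):
        # docs: list of token lists, one per document
        df = Counter(term for doc in docs for term in set(doc))
        n = len(docs)
        return [{term: (count / float(len(doc))) * math.log(n / float(df[term]))
                 for term, count in Counter(doc).items()}
                for doc in docs]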
Example 4: set
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    # fragment: opens mid-function (presumably the tail of the parse_gr generator used below)
    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr

lines = set([])
with open("data/test.txt", "r") as input_file:
    logging.info("file opened")
    for line in input_file:
        for w in m.analyze(line):
            if 'analysis' in w:
                for item in w['analysis']:
                    for gramm_info in parse_gr(item['gr']):
                        lines.add("\t".join(
                            [gramm_info, item['lex'], w['text'].lower()]).encode("utf-8") + "\n")
with open("data/pairs_with_grammar.tsv", "w+") as f:
    for line in lines:
        f.write(line)
dict = {}
for line in open("data/pairs_with_grammar.tsv", "r+"):
    if line.strip():
Example 5: index
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def index(name=None):
    if request.args:
        story = request.args['joke']
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        # pick lemmas whose grammar marks them animate ("од=" present, "неод=" absent)
        for i in gramm:
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        file = open("corp.txt", 'r', encoding="UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        file = open("ans.txt", 'w', encoding="UTF-8")
        for i in f:
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            if characters <= set(words):
                file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
            content = f.read().split('\n\n')
        return render_template("index.html", content=content)
    return render_template('index.html')
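Matching "од=" against str(i), the repr of the whole token dict, works but is fragile; the same animacy test can be read straight from the analysis structure. A sketch (our variant, not from the project):

    def animate_lemmas(analyzed):
        # lemmas whose grammar string marks animacy ('од=' present, 'неод=' absent)
        lemmas = set()
        for token in analyzed:
            for a in token.get('analysis', []):
                gr = a.get('gr', '')
                if 'од=' in gr and 'неод=' not in gr:
                    lemmas.add(a['lex'])
        return lemmas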
Example 6: __init__
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def __init__(self, path):
    self.text = open(path).read().lower()
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
    self.pos_data = []
    m = Mystem()
    counter = [0, 0, 0, 0, 0]
    for sentence in self.sentences:
        # parse with mystem
        # count adjectives A, nouns S, verbs V, adverbs ADV, pronouns PR
        # note: these are ordered substring checks on the whole gr string, so e.g. SPRO also matches 'S'
        data = m.analyze(sentence)
        for word in data:
            analysis = word.get('analysis', None)
            if analysis:
                best = analysis[0]
                gr = best['gr']
                if 'S' in gr:
                    counter[3] += 1
                elif 'ADV' in gr:
                    counter[1] += 1
                elif 'A' in gr:
                    counter[0] += 1
                elif 'V' in gr:
                    counter[4] += 1
                elif 'PR' in gr:
                    counter[2] += 1
        self.pos_data.append(counter)
        counter = [0, 0, 0, 0, 0]
    self.data = np.array(self.pos_data)
Example 7: __init__
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def __init__(self, path, doc_id, limit):
    """
    :param doc_id: numerical id of a document, pass manually
    """
    self.text = open(path).read().lower().replace('\n', '.')
    # need a better regex
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if sentence and len(sentence.split()) > 2]
    self.pos_data = []
    self.testing_data = []
    self.id = doc_id
    m = Mystem()
    counter = Counter(DEFAULTS)
    if not limit or limit > len(self.sentences):
        limit = len(self.sentences)
    for sentence in self.sentences[:limit]:
        # parse with mystem
        data = m.analyze(sentence)
        # get POS and count for each sentence
        pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
               for word in data if word.get('analysis', None)]
        counter.update(pos)
        # append to dataset
        self.pos_data.append([counter[key] for key in sorted(counter)])
        # reset counter
        counter = Counter(DEFAULTS)
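DEFAULTS is defined elsewhere in that project; for the sorted(counter) feature vector to have a fixed length and column order, it is presumably a zero-initialized mapping over the expected POS tags, something like this (an assumption, not the project's actual definition):

    # assumed: one zero slot per mystem POS tag of interest, so every
    # sentence yields a vector of the same length and column order
    DEFAULTS = {pos: 0 for pos in ('A', 'ADV', 'CONJ', 'INTJ', 'NUM',
                                   'PART', 'PR', 'S', 'SPRO', 'V')}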
Example 8: with_not
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def with_not(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        # handle "не" + (word); note: assumes u'не' never ends the term list
        nums_of_bigrams = []
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        for i in range(0, len(list_of_terms)):
            if list_of_terms[i] == u'не' and list_of_terms[i+1] not in helping_words:
                analysis = m.analyze(list_of_terms[i+1])[0].get(u'analysis')
                if analysis and not analysis[0][u'gr'].startswith(u'S,'):
                    nums_of_bigrams.append((i, i+1))
            elif list_of_terms[i] == u'не' and list_of_terms[i+1] in helping_words:
                analysis = m.analyze(list_of_terms[i+2])[0].get(u'analysis')
                if analysis and not analysis[0][u'gr'].startswith(u'S,'):
                    nums_of_bigrams.append((i, i+2))
        # merge each found pair into a single "не<word>" token
        for i in range(0, len(nums_of_bigrams)):
            if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1]] = ''
            elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                list_of_terms[nums_of_bigrams[i][1]] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)
        text_of_output = ' '.join(['%s' % term for term in list_of_terms])
        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output
        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Example 9: build_pos
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def build_pos(self):
    m = Mystem()
    counter = Counter(DEFAULTS)
    for doc in self.documents:
        # parse with mystem
        data = m.analyze(doc.text)
        # get POS and count for each sentence
        pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
               for word in data if word.get('analysis', None)]
        counter.update(pos)
        # append to dataset
        self.pos_data.append([counter[key] for key in sorted(counter)])
        # reset counter
        counter = Counter(DEFAULTS)
Example 10: produce_lemmas
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()
    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()
    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`) ' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
        connection.commit()
    cursor.close()
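The INSERT above is assembled with % string formatting; with MySQLdb-style drivers the value slots can be parameterized instead (identifiers such as table names still have to be interpolated). A sketch, assuming prepare_content returns the (id, lex, gr, abbr) values in that order:

    query = ('INSERT INTO `' + outputTableName + '` '
             '(`' + tableName + '_id`, `word_class_id`, `lex`, `gr`) '
             'SELECT %s, `id`, %s, %s FROM `word_classes` WHERE `abbr` = %s')
    inserter.execute(query, prepare_content(id, analysis))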
Example 11: fill_mystem
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def fill_mystem():
    # Python 2 example (print statements)
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s %s ?" % (text, text))
                continue
            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')
            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s %s" % (text, ' '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
Example 12: Mystem
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        loc_list = loc.split('\\')  # creates list in order to remove path content
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '')  # removes path ending
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))  # adds new path ending for json docs
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))  # adds new path ending for xml docs
        new_name = name.replace('.txt', '')
        if not os.path.exists(dir_marks):  # makes necessary dirs if not present
            os.makedirs(dir_marks)
        if not os.path.exists(dir_xml):
            os.makedirs(dir_xml)
        with open(loc, "r", encoding='utf-8') as doc:
            text_doc = doc.read()
            lines = doc.readlines()  # note: pointer is at EOF after read(), so this is always [] (unused)
        info = json.dumps(m.analyze(text_doc), ensure_ascii=False)  # creates text file with gram and lem info
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding='utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(info).decode('utf-8')  # converts json to xml
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding='utf-8') as doc_xml:
            doc_xml.write(xml)
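One quirk worth noting: dicttoxml is handed the JSON string info, so the resulting XML wraps a single string value rather than the analysis tree. If structured XML is the goal, passing the analyzed list itself should work (a sketch, untested against this corpus layout):

    analysis = m.analyze(text_doc)
    info = json.dumps(analysis, ensure_ascii=False)      # JSON output, as before
    xml = dicttoxml.dicttoxml(analysis).decode('utf-8')  # XML built from the structure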
Example 13: Mystem
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
# coding: utf-8
# Python 2 example (print statements)
from pymystem3 import Mystem

# text = "Голкипер «Нью-Йорк Айлендерс» а-б г-н ваыва-ыфвафыа Выступая на пресс-конференции в Лондоне, он подчеркнул, что опубликованные необработанные отчеты с мест боевых действий не содержат имен или информации, которая может повредить каким-либо лицам или организациям. Красивая, — 123.2 latin мама 4,5 7:8 красиво мыла раму"
text = "слив воды"
m = Mystem()
# lemmas = m.lemmatize(text)
# print(''.join(lemmas))
lemmas = m.analyze(text)
for lemma in lemmas:
    print '#"%s"' % lemma['text']
    a = lemma.get('analysis')
    # print a
    if a:
        for b in a:
            print 'lex=', b.get('lex', '-')
            print 'gr=', b.get('gr', '-')
    print
Example 14: MystemOCTagger
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
class MystemOCTagger(object):
    def __init__(self):
        self.mystem_inst = Mystem()

    def run_and_convert(self, input_file, output_file, strict_match=False):
        f_in = open(input_file, 'rb')
        f_out = open(output_file, 'w+')
        context = etree.iterparse(f_in, tag='sentence')
        for event, sentence_elem in context:
            sentence = sentence_elem.find('source')
            analyzed = self.analyze_sentence(sentence.text)
            tokens_tree = sentence_elem.find('tokens')
            tokens = self.extract_tokens(tokens_tree)
            matched = self.match_analyzed_tokens(tokens, analyzed, strict_match)
            result = self.analyzed_to_csv_list(matched)
            for s in result:
                f_out.write(s + '\n')
            sentence_elem.clear()

    def analyze_sentence(self, sentence):
        return self.mystem_inst.analyze(sentence)

    # builds word-index mapping, indices sorted in order of appearance
    def extract_tokens(self, tokens_tree):
        tokens_dict = {}
        for t in tokens_tree.iter('token'):
            idx = t.get('id')
            token = t.get('text')
            token = strip_word(token)
            if (len(token) > 0):
                if token in tokens_dict:
                    tokens_dict.get(token).append(idx)
                else:
                    tokens_dict[token] = [idx]
        return tokens_dict

    # matches analysis with original tokens indices
    def match_analyzed_tokens(self, tokens_index, analyzed, strict_match=False):
        analysis_indexed = {}
        unindexed = []
        for t in analyzed:
            t_text = t.get('text')
            t_text = strip_word(t_text)
            if len(t_text) > 0:
                if t_text in tokens_index:
                    idx = tokens_index.get(t_text).pop(0)
                    if (len(tokens_index.get(t_text)) == 0):
                        tokens_index.pop(t_text)
                    analysis_indexed[idx] = t.get('analysis')
                else:
                    unindexed.append(t)
        if (not strict_match):
            analysis_not_strict = {}
            if len(tokens_index) > 0:
                analysis_not_strict = self.match_not_strict(tokens_index, unindexed)
            analysis_indexed.update(analysis_not_strict)
        not_analyzed = []
        if len(tokens_index) > 0:
            for t in tokens_index:
                not_analyzed.append(t)
        # if len(not_analyzed) > 0:
        #     f_unindexed = open('mismatch.txt', 'a+')
        #     f_unindexed.write('oc ')
        #     f_unindexed.write(str(not_analyzed)+' ')
        #
        # if len(unindexed) > 0:
        #     f_unindexed = open('mismatch.txt', 'a+')
        #     for u in unindexed:
        #         f_unindexed.write(' ')
        #         f_unindexed.write(str(u.get('text')))
        #     f_unindexed.write('\n')
        return analysis_indexed

    def match_not_strict(self, tokens_index, analyzed):
        analysis_indexed = {}
        for t_indexed, idx_list in tokens_index.items():
            for idx in idx_list:
                for i in range(0, len(analyzed)):
                    t_analyzed = analyzed[i]
                    if t_indexed.endswith(t_analyzed.get('text')):
                        analysis_indexed[idx] = t_analyzed.get('analysis')
                        # print(t_analyzed.get('text')+' '+t_indexed)
                        analyzed.pop(i)
                        idx_list.remove(idx)
                        break
        idx_copy = tokens_index.copy()
        for t, i in idx_copy.items():
# ......... the rest of this code is omitted .........
Example 15: main
# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def main(argv):
    with open(argv[1], encoding='utf-8') as f:
        s = re.sub(r'\s+', ' ', f.read(), flags=re.M)
    f = re.split(r'(?<=[.!?…]) ', s)
    sentens = []
    for i, t in enumerate(f):
        sentens.append(t)
        print(str(i), " ", t)
    morph = pymorphy2.MorphAnalyzer()
    ZnakiP = [",", "!", "/n", ".", ":", ";", '"', "'", "\n", "...", "?", "!", "(", ")", "-", " ", " "]
    t = Mystem()
    PARS = []
    for sent in sentens:
        input_file = open("input.txt", "w", encoding="utf-8")
        input_file.write(sent)
        input_file.close()
        # run syntactic analysis of the sentence and extract the grammatical cores
        process = subprocess.Popen('tomitaparser.exe config.proto', stdout=subprocess.PIPE, shell=True)
        process.communicate()
        process.wait()
        predicate = []
        Nouns = []
        DOP = []
        DOP.append({})
        OPR = []
        with open("pretty.html", encoding='utf8') as fp:
            soup = BeautifulSoup(fp, "html.parser")
            par_f = soup.find_all('table')
            for table in par_f:
                th = table.find('th')
                if (th.text == "Noun1"):
                    slovo = th.find_parent("table").find('a').text
                    Nouns.append(slovo)
                if (th.text == "Verb1"):
                    slovo = th.find_parent("table").find('a').text
                    predicate.append(slovo)
                if (th.text == "OPR1"):
                    sl = th.find_parent("table").find_all('a')
                    for slovo in sl:
                        OPR.append(slovo.text)
                if (th.text == "DOP1"):
                    sl = th.find_parent("table").find_all('a')
                    for slovo in sl:
                        DOP[0][slovo.text.lower()] = slovo.next_element.next_element.next_element.next_element
        TREE = {}
        TREE[Nouns[0]] = {}
        for v in predicate:
            TREE[Nouns[0]][v] = {}
        if (OPR != []):
            for temp in OPR:
                for noun in TREE:
                    if (len(re.split(r"[,' ']", temp)) == 1):
                        TREE[Nouns[0]][temp] = t.analyze(temp)[0]['analysis'][0]['gr']
                    else:
                        m2 = []
                        for f in re.split(r"[,' ']", temp):
                            if (f != ''):
                                m2.append(f)
                        if (noun in m2):
                            mk = t.analyze(temp)
                            wsp = []
                            for tr in mk:
                                if (not tr['text'] in ZnakiP):
                                    if (not 'CONJ' in tr['analysis'][0]['gr']):
                                        wsp.append(tr['text'])
                            for tl in wsp:
                                if (tl != noun):
                                    TREE[Nouns[0]][tl] = t.analyze(tl)[0]['analysis'][0]['gr']
        for temp in TREE[Nouns[0]]:
            if (temp in DOP[0].values()):
                for sp in DOP[0]:
                    if (DOP[0][sp] == temp):
                        m2 = []
                        for f in re.split(r"[,' ']", sp):
                            if (f != ''):
                                m2.append(f)
                        for rg in m2:
                            TREE[Nouns[0]][temp][rg] = {}
                            for _opr in OPR:
                                reg = re.split(r"[,' ']", temp)
                                if (noun in reg):
                                    mk = t.analyze(_opr)
                                    wsp = []
                                    for tr in mk:
                                        if (not tr['text'] in ZnakiP):
                                            if (not 'CONJ' in tr['analysis'][0]['gr']):
                                                wsp.append(tr['text'])
# ......... the rest of this code is omitted .........