本文整理汇总了Python中pymystem3.Mystem类的典型用法代码示例。如果您正苦于以下问题:Python Mystem类的具体用法?Python Mystem怎么用?Python Mystem使用的例子?那么,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Mystem类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
def __init__(self, path):
    """Read a text file and count parts of speech per sentence.

    The lower-cased file content is stored in ``self.text``, the sentences
    obtained by splitting it in ``self.sentences``, one five-element count
    row per sentence in ``self.pos_data`` and the same rows as a NumPy
    array in ``self.data``.

    :param path: path of the text file to read
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as source:
        self.text = source.read().lower()
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
    self.pos_data = []
    # Column of each counted tag inside a row (same layout as before).
    # NOTE(review): the original comment called PR "pronouns"; in the MyStem
    # tag set PR appears to be the preposition tag - confirm the intent.
    columns = {'A': 0, 'ADV': 1, 'PR': 2, 'S': 3, 'V': 4}
    m = Mystem()
    for sentence in self.sentences:
        counter = [0, 0, 0, 0, 0]
        for word in m.analyze(sentence):
            analysis = word.get('analysis', None)
            if not analysis:
                continue
            # Compare the leading tag exactly.  Substring tests such as
            # ``'A' in gr`` also fire for PART, ANUM or APRO, and ``'S' in gr``
            # fires for SPRO, which skewed the counts.
            tag = analysis[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
            column = columns.get(tag)
            if column is not None:
                counter[column] += 1
        self.pos_data.append(counter)
    self.data = np.array(self.pos_data)
示例2: mystem_using_with_considering_of_multiple_letters
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
    """Lemmatize the ``text`` field of every JSON file in a directory.

    Before lemmatization, any term that the lemmatizer returns unchanged has
    its runs of identical consecutive characters collapsed to one character.
    For each input file a JSON object with the keys ``id``, ``positive``,
    ``sarcasm`` (copied from the input) and ``text`` (space-joined lemmas) is
    written under the same file name in ``output_directory``.

    :param input_directory: directory with the source JSON files; names
        ending in ``~`` are skipped
    :param output_directory: directory that receives the processed files
    """
    # Raw literal: the non-raw original relied on invalid string escapes
    # (``\.``, ``\?``, ``\*``); the compiled pattern is the same.
    splitter = re.compile(r''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''')
    # One pass instead of the hand-written character loop.
    repeated = re.compile(r'(.)\1+', re.DOTALL)
    m = Mystem()
    # Iterate over the documents.
    for input_file in os.listdir(input_directory):
        if input_file.endswith('~'):
            continue
        with open(os.path.join(input_directory, input_file)) as data_file:
            data = json.load(data_file)
        terms = []
        for term in splitter.split(data['text']):
            if term == '':
                continue
            if term == m.lemmatize(term)[0]:
                term = repeated.sub(r'\1', term)
            terms.append(term)
        lemmas = [lemma for lemma in m.lemmatize(' '.join(terms)) if lemma not in ('', ' ', '\n')]
        record = {
            'id': data['id'],
            'positive': data['positive'],
            'sarcasm': data['sarcasm'],
            'text': ' '.join(lemmas),
        }
        with open(os.path.join(output_directory, input_file), 'w') as output_file:
            json.dump(record, output_file)
示例3: without_pronouns
def without_pronouns(directory):
    """Remove pronoun terms from the ``text`` field of JSON files, in place.

    Every file in ``directory`` whose name does not end in ``~`` is loaded,
    its ``text`` is split on single spaces, and each term whose first
    analysis has a grammar string starting with ``SPRO`` or ``APRO`` is
    dropped.  Terms without an analysis are kept.  The file is then
    overwritten with the keys ``id``, ``positive``, ``sarcasm`` and the
    filtered ``text``.

    :param directory: directory whose files are rewritten
    """
    m = Mystem()
    # Iterate over the documents.
    for input_file in os.listdir(directory):
        if input_file.endswith('~'):
            continue
        path = os.path.join(directory, input_file)
        with open(path) as data_file:
            data = json.load(data_file)
        kept_terms = []
        for term in data['text'].split(' '):
            if term in ('', ' ', '\n'):
                continue
            # Analyse once per term; the original ran the analyzer twice.
            parsed = m.analyze(term)
            analysis = parsed[0].get(u'analysis') if parsed else None
            if analysis and analysis[0][u'gr'].startswith((u'SPRO', u'APRO')):
                continue
            kept_terms.append(term)
        record = {
            'id': data['id'],
            'positive': data['positive'],
            'sarcasm': data['sarcasm'],
            'text': ' '.join(kept_terms),
        }
        with open(path, 'w') as output_file:
            json.dump(record, output_file)
示例4: __init__
def __init__(self, path, doc_id, limit):
    """Read a document and build one part-of-speech count row per sentence.

    :param path: path of the text file to read
    :param doc_id: numerical id of a document, pass manually
    :param limit: maximum number of sentences to process; a falsy value or
        a value larger than the number of sentences means "all of them"
    """
    # Close the file deterministically instead of leaking the handle.
    with open(path) as source:
        self.text = source.read().lower().replace('\n', '.')
    # need a better regex
    self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text)
                      if sentence and len(sentence.split()) > 2]
    self.pos_data = []
    self.testing_data = []
    self.id = doc_id
    m = Mystem()
    if not limit or limit > len(self.sentences):
        limit = len(self.sentences)
    for sentence in self.sentences[:limit]:
        # Fresh counter per sentence, seeded with DEFAULTS.
        counter = Counter(DEFAULTS)
        # Leading tag of the first analysis of every analysed token.
        pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
               for word in m.analyze(sentence) if word.get('analysis', None)]
        counter.update(pos)
        # Counts ordered by sorted key.
        self.pos_data.append([counter[key] for key in sorted(counter)])
示例5: index
def index(name = None):
    """Render ``index.html``, optionally with corpus texts matching a story.

    When the request has query arguments, the ``joke`` argument is analysed
    and the lemmas of words whose grammar marks them as animate ("од=" present,
    "неод=" absent) are collected.  Every text from ``corp.txt`` (blocks
    separated by a blank line) whose lower-cased, punctuation-free words
    include all of those lemmas is written to ``ans.txt``; the content of
    that file, split on blank lines, is passed to the template as ``content``.

    :param name: unused by this function
    """
    if request.args:
        story = request.args['joke']
        mystem = Mystem()
        characters = set()
        for token in mystem.analyze(story):
            analyses = token.get('analysis') or []
            grammar = [analysis.get('gr', '') for analysis in analyses]
            # Read the analysis dicts directly; the original searched the
            # ``str()`` of the dict, which breaks when a lemma has a quote.
            animate = any('од=' in gr for gr in grammar)
            inanimate = any('неод=' in gr for gr in grammar)
            if animate and not inanimate and 'lex' in analyses[0]:
                characters.add(analyses[0]['lex'])
        with open("corp.txt", 'r', encoding="UTF-8") as corpus:
            # The first character is dropped, as in the original.
            # NOTE(review): presumably a BOM - confirm against corp.txt.
            texts = corpus.read()[1:].split('\n\n')
        punctuation = re.compile(r'[,.?!—\-():;]')
        with open("ans.txt", 'w', encoding="UTF-8") as answers:
            for text in texts:
                words = punctuation.sub('', text).lower().split(' ')
                if characters <= set(words):
                    answers.write(text + '\n\n')
        with open("ans.txt", "r", encoding='utf-8') as answers:
            content = answers.read().split('\n\n')
        return render_template("index.html", content=content)
    return render_template('index.html')
示例6: extract
def extract(self):
    """Compute tf-idf data for every JSON document in ``self.input_directory``.

    Each document's ``text`` is split, lemmatized and counted.  Newline
    tokens are removed only when the document has fewer than
    ``self.threshold_of_rows_count`` of them.  For every input file a
    ``<name>_tf-idf`` JSON file is written to ``self.output_directory`` with
    the keys ``id``, ``positive``, ``sarcasm`` and ``terms`` (term ->
    ``{'tf', 'idf', 'count'}``).

    :return: ``True`` on success, ``False`` if any exception occurred
    """
    try:
        # Materialise as lists: the results are measured with len(), indexed
        # and iterated twice, which a lazy ``filter`` object does not allow.
        input_files = [name for name in os.listdir(self.input_directory) if not name.endswith('~')]
        output_data = {}
        list_of_all_terms = {}
        m = Mystem()
        splitter = re.compile(r""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""")
        # Iterate over the documents.
        for file_name in input_files:
            with open(os.path.join(self.input_directory, file_name)) as data_file:
                data = json.load(data_file)
            raw_terms = [term for term in splitter.split(data['text']) if term != ""]
            list_of_terms = [term for term in m.lemmatize(" ".join(raw_terms)) if term != " "]
            # Count line breaks and normalise ' \n' tokens to '\n'.
            count_of_rows = 0
            for i, term in enumerate(list_of_terms):
                if term in ('\n', ' \n'):
                    count_of_rows += 1
                    list_of_terms[i] = '\n'
            if count_of_rows < self.threshold_of_rows_count:
                list_of_terms = [term for term in list_of_terms if term != '\n']
            # Collapse repeated words into per-document counts.
            counts = {}
            for term in list_of_terms:
                counts[term] = counts.get(term, 0) + 1
            terms = {}
            for term, count_of_terms in counts.items():
                # Document frequency of the term across the corpus.
                list_of_all_terms[term] = list_of_all_terms.get(term, 0) + 1
                # tf computation
                terms[term] = {'tf': float(count_of_terms) / len(list_of_terms), 'idf': 0,
                               'count': count_of_terms}
            output_data[file_name] = {
                'id': data['id'],
                'positive': data['positive'],
                'sarcasm': data['sarcasm'],
                'terms': terms,
            }
        for file_name in input_files:
            # idf computation
            terms = output_data[file_name]['terms']
            for term in terms:
                terms[term]['idf'] = math.log(float(len(input_files)) / list_of_all_terms[term])
            # Write the result.
            with open(os.path.join(self.output_directory, file_name + '_tf-idf'), 'w') as output_file:
                json.dump(output_data[file_name], output_file)
    except Exception:
        # The boolean contract is kept, but the failure is no longer silent.
        import logging
        logging.getLogger(__name__).exception("tf-idf extraction failed")
        return False
    else:
        return True
示例7: lmtze
def lmtze(textfile):
    """Lemmatize an XML-per-line file and write the result next to it.

    Each line of ``textfile`` is parsed as one XML element.  Its text nodes
    are rendered as ``lemma_POS`` tokens and its child elements as
    ``<tag>lemma ...</tag>``.  Lines that raise any exception while being
    processed are skipped.  The output goes to the input path with its last
    ``txt`` replaced by ``lem.txt``.

    :param textfile: path of the UTF-8 input file
    """
    m = Mystem()

    def render(fragment, with_pos):
        """Return the output tokens for one analysed text fragment."""
        rendered = []
        for token in m.analyze(fragment):
            analysis = token.get('analysis')
            if analysis is None:
                # Tokens without an 'analysis' key are dropped.
                continue
            if analysis == []:
                rendered.append(token['text'])
            elif with_pos:
                best = analysis[0]
                rendered.append(best['lex'] + '_' + best['gr'].split(',')[0].split('=')[0])
            else:
                rendered.append(analysis[0]['lex'])
        return rendered

    with open(textfile, encoding='utf-8') as source:
        text = source.readlines()

    # Replace only the last "txt".  ``str.replace`` rewrote every occurrence
    # in the path and, with no occurrence at all, made the output path equal
    # to the input path so the input was overwritten.
    head, found, tail = textfile.rpartition('txt')
    target = head + 'lem.txt' + tail if found else textfile + '.lem.txt'

    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                result.extend(render(text_.pop(0), True))
                if text_:
                    # A child element sits between two text nodes.
                    e = entities.pop(0)
                    inner = render(e.text, False)
                    result.append('<' + e.tag + '>')
                    result.extend(inner)
                    result.append('</' + e.tag + '>')
        except Exception:
            # Best effort: a line that cannot be processed is skipped.
            continue
        result.append('</sent>')
        result_full.append(result)
    print(len(result_full), ' разобралось')

    with open(target, 'w', encoding='utf-8') as newfile:
        for sent in result_full:
            prev = ''
            for x in sent:
                if '<' in x and '/' not in x:
                    # Opening tag: the next token follows without a space.
                    newfile.write(prev + x)
                    prev = ''
                elif '_' in x or x.isalpha():
                    newfile.write(prev + x)
                    prev = ' '
                else:
                    newfile.write(x)
            newfile.write('\n')
示例8: extract
def extract(self):
    """Compute tf-idf data while dropping service parts of speech.

    Each document's ``text`` is split and lemmatized.  A lemma with an
    analysis is kept when its grammar string does not start with any prefix
    in ``self.service_parts_of_speech`` and it is longer than one character;
    the particle "не" is additionally appended whenever it has an analysis.
    Lemmas without an analysis are kept.  For every input file a
    ``<name>_tf-idf`` JSON file is written to ``self.output_directory``.

    :return: ``True`` on success, ``False`` if any exception occurred
    """
    try:
        # Materialise as a list: it is measured with len() and iterated
        # twice, which a lazy ``filter`` object does not allow.
        input_files = [name for name in os.listdir(self.input_directory) if not name.endswith('~')]
        output_data = {}
        list_of_all_terms = {}
        m = Mystem()
        splitter = re.compile(r""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""")
        # Iterate over the documents.
        for file_name in input_files:
            with open(os.path.join(self.input_directory, file_name)) as data_file:
                data = json.load(data_file)
            raw_terms = [term for term in splitter.split(data['text']) if term != ""]
            lemmas = [term for term in m.lemmatize(" ".join(raw_terms)) if term not in (" ", "\n")]
            list_of_terms = []
            for term in lemmas:
                # Analyse once per term; the original ran the analyzer twice.
                parsed = m.analyze(term)
                analysis = parsed[0].get(u'analysis') if parsed else None
                if not analysis:
                    list_of_terms.append(term)
                    continue
                gr = analysis[0][u'gr']
                if not gr.startswith(self.service_parts_of_speech) and len(term) > 1:
                    list_of_terms.append(term)
                if term == u'не':
                    list_of_terms.append(term)
            # Collapse repeated words into per-document counts.
            counts = {}
            for term in list_of_terms:
                counts[term] = counts.get(term, 0) + 1
            terms = {}
            for term, count_of_terms in counts.items():
                # Document frequency of the term across the corpus.
                list_of_all_terms[term] = list_of_all_terms.get(term, 0) + 1
                # tf computation
                terms[term] = {'tf': float(count_of_terms) / len(list_of_terms), 'idf': 0,
                               'count': count_of_terms}
            output_data[file_name] = {
                'id': data['id'],
                'positive': data['positive'],
                'sarcasm': data['sarcasm'],
                'terms': terms,
            }
        for file_name in input_files:
            # idf computation
            terms = output_data[file_name]['terms']
            for term in terms:
                terms[term]['idf'] = math.log(float(len(input_files)) / list_of_all_terms[term])
            # Write the result.
            with open(os.path.join(self.output_directory, file_name + '_tf-idf'), 'w') as output_file:
                json.dump(output_data[file_name], output_file)
    except Exception:
        # The boolean contract is kept, but the failure is no longer silent.
        import logging
        logging.getLogger(__name__).exception("tf-idf extraction failed")
        return False
    else:
        return True
开发者ID:pombredanne,项目名称:senty,代码行数:56,代码来源:standard_extractor_with_mystem_without_service_parts_of_speech.py
示例9: extract
def extract(self):
    """Compute tf-idf data over word n-grams of sizes 1 to ``self.n``.

    Each document's ``text`` is split and lemmatized, and every contiguous
    run of 1..``self.n`` lemmas is joined with spaces into an n-gram string.
    For every input file a ``<name>_tf-idf`` JSON file is written to
    ``self.output_directory`` with the keys ``id``, ``positive``,
    ``sarcasm`` and ``terms`` (n-gram -> ``{'tf', 'idf', 'count'}``).

    :return: ``True`` on success, ``False`` if any exception occurred
    """
    try:
        # Materialise as lists: the results are sliced, measured with len()
        # and iterated twice, which a lazy ``filter`` object does not allow.
        input_files = [name for name in os.listdir(self.input_directory) if not name.endswith('~')]
        output_data = {}
        list_of_all_n_grams = {}
        m = Mystem()
        splitter = re.compile(r""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""")
        # Iterate over the documents.
        for file_name in input_files:
            with open(os.path.join(self.input_directory, file_name)) as data_file:
                data = json.load(data_file)
            raw_terms = [term for term in splitter.split(data['text']) if term != ""]
            list_of_terms = [term for term in m.lemmatize(" ".join(raw_terms)) if term not in (" ", "\n")]
            # All unigrams first, then all bigrams, and so on up to self.n.
            list_of_n_grams_strings = []
            for size in range(1, self.n + 1):
                for start in range(len(list_of_terms) - size + 1):
                    list_of_n_grams_strings.append(" ".join(list_of_terms[start:start + size]))
            # Collapse repeated n-grams into per-document counts.
            counts = {}
            for gram in list_of_n_grams_strings:
                counts[gram] = counts.get(gram, 0) + 1
            terms = {}
            for gram, count_of_n_grams in counts.items():
                # Document frequency of the n-gram across the corpus.
                list_of_all_n_grams[gram] = list_of_all_n_grams.get(gram, 0) + 1
                # tf computation
                terms[gram] = {'tf': float(count_of_n_grams) / len(list_of_n_grams_strings), 'idf': 0,
                               'count': float(count_of_n_grams)}
            output_data[file_name] = {
                'id': data['id'],
                'positive': data['positive'],
                'sarcasm': data['sarcasm'],
                'terms': terms,
            }
        for file_name in input_files:
            # idf computation
            terms = output_data[file_name]['terms']
            for gram in terms:
                terms[gram]['idf'] = math.log(float(len(input_files)) / list_of_all_n_grams[gram])
            # Write the result.
            with open(os.path.join(self.output_directory, file_name + '_tf-idf'), 'w') as output_file:
                json.dump(output_data[file_name], output_file)
    except Exception:
        # The boolean contract is kept, but the failure is no longer silent.
        import logging
        logging.getLogger(__name__).exception("n-gram tf-idf extraction failed")
        return False
    else:
        return True
示例10: with_not
def with_not(directory):
    """Glue the particle "не" to the word it negates, rewriting files in place.

    For every file in ``directory`` whose name does not end in ``~`` the
    ``text`` field is split on single spaces.  Each "не" is concatenated with
    the following word, or with the word after an intensifier from
    ``helping_words`` (the intensifier is then dropped), provided that word
    has an analysis whose grammar string does not start with ``S,``.  The
    file is overwritten with ``id``, ``positive``, ``sarcasm`` and the new
    ``text``.

    :param directory: directory whose files are rewritten
    """
    helping_words = [u'совсем', u'очень', u'слишком', u'самый']
    m = Mystem()

    def can_be_negated(word):
        """True when ``word`` has an analysis that is not tagged ``S,``."""
        parsed = m.analyze(word)
        analysis = parsed[0].get(u'analysis') if parsed else None
        return bool(analysis) and not analysis[0][u'gr'].startswith(u'S,')

    # Iterate over the documents.
    for input_file in os.listdir(directory):
        if input_file.endswith('~'):
            continue
        path = os.path.join(directory, input_file)
        with open(path) as data_file:
            data = json.load(data_file)
        # A real list is required: the terms are indexed and modified below.
        list_of_terms = [term for term in data['text'].split(' ') if term not in ('', ' ', '\n')]
        # Handle "не" + (word).
        nums_of_bigrams = []
        last = len(list_of_terms) - 1
        for i, term in enumerate(list_of_terms):
            if term != u'не' or i == last:
                # A trailing "не" has nothing to attach to; the original
                # raised IndexError here.
                continue
            target = i + 2 if list_of_terms[i + 1] in helping_words else i + 1
            if target > last:
                # "не" + intensifier at the very end of the text.
                continue
            if can_be_negated(list_of_terms[target]):
                nums_of_bigrams.append((i, target))
        for first, second in nums_of_bigrams:
            list_of_terms[first] = list_of_terms[first] + list_of_terms[second]
            # Blank the merged word and any intensifier in between.
            for position in range(first + 1, second + 1):
                list_of_terms[position] = ''
        record = {
            'id': data['id'],
            'positive': data['positive'],
            'sarcasm': data['sarcasm'],
            'text': ' '.join(term for term in list_of_terms if term != ''),
        }
        with open(path, 'w') as output_file:
            json.dump(record, output_file)
示例11: Runner
class Runner(object):
    """Read a text file into punctuation-delimited word lists and run ``Lid``.

    NOTE: this class targets Python 2 (``raw_input``, two-argument
    ``str.translate``, ``str.decode``); those calls are kept as they were.
    """

    def __init__(self, input_text):
        """Ask whether to lemmatize, tokenize ``input_text`` and run ``Lid``.

        Words accepted by ``is_cyrillic`` are collected into lists; a
        punctuation token closes the current list.  The lists are split into
        train and test parts and passed to ``Lid``.

        :param input_text: path of the text file to process
        """
        self.lemmatize = None
        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print("You should wait for a while")
                self.lemmatize = True
                self.stemmer = Mystem()
                break
            elif response == "no":
                self.lemmatize = False
                break
        self.word_lists = list()
        with open(input_text, "r") as f:
            for line in f:
                # A trailing period guarantees the last word list is flushed.
                line += "."
                # list of words not separated by punctuation marks
                word_list = list()
                if self.lemmatize:
                    for lexeme in self.stemmer.lemmatize(line):
                        lexeme = lexeme.strip()
                        if not lexeme:
                            continue
                        # check that the lexeme is not a punctuation mark
                        if lexeme.translate(None, '.,?!:;()"\' -\t\n'):
                            lexeme = lexeme.decode("utf-8")
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            # Punctuation closes the current list.  Empty
                            # lists are skipped, as in the other branch; the
                            # original appended them here.
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()
                else:
                    # Surround punctuation with spaces so split() isolates it.
                    for mark in (".", ",", ":", ";", "?", "!", "(", ")", "--"):
                        line = line.replace(mark, " " + mark + " ")
                    for lexeme in line.split():
                        # check that the lexeme is not a punctuation mark
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()
        train, test = self.split()
        self.lid = Lid(train, test)
        self.lid.run()

    def split(self):
        """Return ``(train, test)``: the first 9/10 of the lists and the rest."""
        n = len(self.word_lists)
        # Floor division keeps the index an integer under true division too.
        border = n * 9 // 10
        train = self.word_lists[:border]
        test = self.word_lists[border:]
        return train, test
示例12: mystem_using
def mystem_using(input_directory, output_directory):
    """Lemmatize the ``text`` field of every JSON file in a directory.

    For each file in ``input_directory`` whose name does not end in ``~``
    the text is split on the separator pattern, re-joined with spaces and
    lemmatized.  A JSON object with the keys ``id``, ``positive``,
    ``sarcasm`` (copied from the input) and ``text`` (space-joined lemmas)
    is written under the same name in ``output_directory``.

    :param input_directory: directory with the source JSON files
    :param output_directory: directory that receives the processed files
    """
    # Raw literal: the non-raw original relied on invalid string escapes
    # (``\.``, ``\?``, ``\*``); the compiled pattern is the same.
    splitter = re.compile(r''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''')
    m = Mystem()
    for input_file in os.listdir(input_directory):
        if input_file.endswith('~'):
            continue
        with open(os.path.join(input_directory, input_file)) as data_file:
            data = json.load(data_file)
        text = " ".join(term for term in splitter.split(data['text']) if term != '')
        lemmas = [lemma for lemma in m.lemmatize(text) if lemma not in ('', ' ', '\n')]
        record = {
            'id': data['id'],
            'positive': data['positive'],
            'sarcasm': data['sarcasm'],
            'text': ' '.join(lemmas),
        }
        with open(os.path.join(output_directory, input_file), 'w') as output_file:
            json.dump(record, output_file)
示例13: Index
class Index(object):
    """Inverted index over the lines of a text file (one document per line).

    NOTE: document reading targets Python 2 (two-argument ``str.translate``,
    ``str.decode``); those calls are kept as they were.
    """

    def __init__(self, input_file):
        """Tokenize every line of ``input_file`` and build the index.

        :param input_file: path of the file; line ``i`` (1-based) becomes
            document ``i``
        """
        self.stemmer = Mystem()
        self.documents = dict()
        self.tokens = list()
        self.terms = dict()
        self.index = list()
        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))
        self._build_index()

    def _build_index(self):
        """Fill ``self.terms`` and ``self.index`` from ``self.tokens``."""
        # Sort by lower-cased token, then by document id.  Terms are
        # lower-cased, so a case-sensitive sort could separate "A" from "a"
        # and produce duplicate or bogus entries.
        self.tokens.sort(key=lambda tup: (tup[0].lower(), tup[1]))
        current_term = None
        doc_ids = []
        for token, doc_id in self.tokens:
            term = token.lower()
            if term != current_term:
                if current_term is not None:
                    self._add_posting(current_term, doc_ids)
                current_term = term
                doc_ids = [doc_id]
            elif doc_id != doc_ids[-1]:
                doc_ids.append(doc_id)
        # Flush the last term; nothing to flush when there were no tokens
        # (the original raised IndexError on empty input).
        if current_term is not None:
            self._add_posting(current_term, doc_ids)

    def _add_posting(self, term, doc_ids):
        """Record ``term`` with its document count and document id list."""
        self.terms[term] = (len(doc_ids), doc_ids)
        self.index.append((term, len(doc_ids), doc_ids))

    def print_to_file(self):
        """Write one ``term, count, doc_ids`` line per term to ``result.txt``."""
        with open("result.txt", "w") as f:
            for term, count, doc_ids in self.index:
                f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids))

    def print_statistics(self):
        """Print the number of terms and their average length."""
        terms_num = len(self.terms)
        terms_len = 0.
        for term in self.terms:
            terms_len += len(term)
        # An empty index reports 0.0 instead of raising ZeroDivisionError.
        average = terms_len / terms_num if terms_num else 0.
        print("***********************")
        print("Number of terms = {}".format(terms_num))
        print("Average term length = {}".format(average))
        print("***********************")
示例14: search
def search():
    """Look up the submitted word in ``static/articles.xml`` and render it.

    The ``search`` form field is lower-cased and lemmatized, and its first
    lemma is compared with the start of each ``entry`` element's text.  The
    first matching entry is rendered as HTML from its category children,
    with the markers ``* # $ % @ +`` replaced by formatting tags.  The result
    is passed to ``search.html`` as ``cn``.
    """
    categ = {
        'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии',
        'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения',
        'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы',
        'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе',
        'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты',
        'adict': 'Данные академических словарей', 'doc': 'Нормативные документы',
        'etim': 'Этимология', 'ill': 'Иллюстрации'
    }
    # Marker -> HTML tag, applied in this order (same order as the original).
    markup = (
        ('*', '<b>'), ('#', '</b>'), ('$', '<i>'), ('%', '</i>'),
        ('@', '<font color="#696969">'), ('+', '</font>'),
    )
    with codecs.open('static/articles.xml', 'r', 'utf-8') as file:
        tree = lxml.etree.fromstring(file.read())
    res = tree.xpath('entry')
    ms = Mystem()
    # Surrounding whitespace would otherwise become the first "lemma".
    lemmas = ms.lemmatize(request.form['search'].strip().lower())
    # No lemma at all (empty query) must not raise IndexError.
    wordsearch = lemmas[0].strip() if lemmas else ''
    if wordsearch == '':
        cn = 'Пустой запрос'
    else:
        # Default answer, also used when the XML has no entries (the
        # original rendered the string "None" in that case).
        cn = 'По Вашему запросу ничего не найдено. <br>' \
             'Попробуйте использовать "Поиск по тегу" или измените запрос.'
        for i in res:
            if not (i.text or '').lower().startswith(wordsearch):
                continue
            arr = []
            for j in i.iter():
                if j.tag in categ and j.text != 'null':
                    arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text))
            # The first collected section is not shown, as in the original.
            text = '<br><br>'.join(arr[1:])
            for marker, tag in markup:
                text = text.replace(marker, tag)
            cn = '<strong><big>' + i.text + '</big></strong><br><br>' + text.replace('\n', '<br>')
            break
    return render_template('search.html', cn=Markup(cn))
示例15: build_pos
def build_pos(self):
    """Append one part-of-speech count row per document to ``self.pos_data``.

    Each document's ``text`` is analysed; for every token that has an
    analysis, the leading tag of the first analysis is counted in a counter
    seeded with ``DEFAULTS``.  The row holds the counts ordered by sorted key.
    """
    analyzer = Mystem()
    tally = Counter(DEFAULTS)
    for document in self.documents:
        # parse with mystem
        tags = []
        for token in analyzer.analyze(document.text):
            readings = token.get('analysis', None)
            if not readings:
                continue
            # keep only the tag before any '(', ',' or '='
            grammar = readings[0]['gr']
            tags.append(grammar.split('(')[0].split(',')[0].split('=')[0])
        tally.update(tags)
        # append to dataset
        row = [tally[key] for key in sorted(tally)]
        self.pos_data.append(row)
        # start the next document from a fresh counter
        tally = Counter(DEFAULTS)