当前位置: 首页>>代码示例>>Python>>正文


Python Mystem.lemmatize方法代码示例

本文整理汇总了Python中pymystem3.Mystem.lemmatize方法的典型用法代码示例。如果您正苦于以下问题:Python Mystem.lemmatize方法的具体用法?Python Mystem.lemmatize怎么用?Python Mystem.lemmatize使用的例子?那么, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pymystem3.Mystem的用法示例。


在下文中一共展示了Mystem.lemmatize方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: mystem_using_with_considering_of_multiple_letters

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
def mystem_using_with_considering_of_multiple_letters(input_directory, output_directory):
    """Lemmatize every JSON document in *input_directory* and write the result
    to *output_directory* under the same file name.

    For tokens that already equal their Mystem lemma, runs of the same
    consecutive letter are collapsed to a single letter (elongation cleanup)
    before the final lemmatization pass.  Each output file holds a JSON
    object with the keys ``id``, ``positive``, ``sarcasm`` and ``text``.
    """
    stemmer = Mystem()
    results = {}
    # Skip editor backup files ('~' suffix).
    for doc_name in filter(lambda x: not x.endswith('~'), os.listdir(input_directory)):
        with open(input_directory + '/' + doc_name) as source:
            document = json.load(source)
        tokens = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', document['text']))
        squashed_tokens = []
        for token in tokens:
            if token == stemmer.lemmatize(token)[0]:
                # Token is already in dictionary form: collapse every run of
                # a repeated letter down to one occurrence.
                squashed = token[0]
                for ch in token[1:]:
                    if ch != squashed[-1]:
                        squashed += ch
                squashed_tokens.append(squashed)
            else:
                squashed_tokens.append(token)
        rebuilt = ' '.join(['%s' % t for t in squashed_tokens])
        lemmas = filter(lambda x: x not in ('', ' ', '\n'), stemmer.lemmatize(rebuilt))
        results[doc_name] = {}
        results[doc_name]['id'] = document['id']
        results[doc_name]['positive'] = document['positive']
        results[doc_name]['sarcasm'] = document['sarcasm']
        results[doc_name]['text'] = ' '.join(['%s' % t for t in lemmas])
        with open(output_directory + '/' + doc_name, 'w') as target:
            json.dump(results[doc_name], target)
开发者ID:pombredanne,项目名称:senty,代码行数:36,代码来源:features.py

示例2: extract

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
    def extract(self):
        """Compute tf-idf statistics for every JSON document in the input directory.

        Each document's text is tokenized, lemmatized with Mystem and — when the
        document contains fewer than ``self.threshold_of_rows_count`` newline
        tokens — the newlines are stripped out entirely.  One
        ``<name>_tf-idf`` JSON file per input document is written to
        ``self.output_directory``.

        Returns True on success, False if any exception occurred (the exception
        itself is swallowed).

        NOTE(review): Python 2 code — ``filter()`` must return a list here,
        since the result is indexed and measured with ``len()``.
        """
        try:
            # Count how many files the input directory holds ('~' editor
            # backups are skipped).
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            # Document frequency of each term across the whole corpus.
            list_of_all_terms = {}
            m = Mystem()
            # Walk over the documents.
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                # Split on punctuation/whitespace and drop empty tokens.
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x != " ", m.lemmatize(text))
                count_of_rows = 0
                # Count newline tokens, normalizing ' \n' to '\n' on the way.
                for i in range(0, len(list_of_terms)):
                    if list_of_terms[i] == '\n' or list_of_terms[i] == ' \n':
                        count_of_rows += 1
                    if list_of_terms[i] == ' \n':
                        list_of_terms[i] = '\n'
                # Short documents: remove the newline tokens altogether.
                if count_of_rows < self.threshold_of_rows_count:
                    first_list_of_terms = list_of_terms
                    list_of_terms = []
                    for i in range(0, len(first_list_of_terms)):
                        if first_list_of_terms[i] != '\n':
                            list_of_terms.append(first_list_of_terms[i])
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # Collapse repeated words into per-term occurrence counts.
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # tf computation.
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # idf computation (needs the full corpus counts, hence a
                # second pass).
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # Write the result.
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
开发者ID:pombredanne,项目名称:senty,代码行数:60,代码来源:standard_extractor_with_counting_number_of_rows.py

示例3: extract

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
    def extract(self):
        """Compute tf-idf statistics, dropping service parts of speech.

        Each JSON document in ``self.input_directory`` is tokenized and
        lemmatized with Mystem; terms whose grammatical tag starts with one of
        ``self.service_parts_of_speech`` are discarded (except the negation
        particle u'не', which is always kept).  One ``<name>_tf-idf`` JSON
        file per document is written to ``self.output_directory``.

        Returns True on success, False if any exception occurred (the
        exception itself is swallowed).
        """
        try:
            # Count how many files the input directory holds ('~' editor
            # backups are skipped).
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            # Document frequency of each term across the whole corpus.
            list_of_all_terms = {}
            m = Mystem()
            # Walk over the documents.
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    # Call m.analyze() once per term instead of twice: every
                    # call round-trips through the external mystem process.
                    analysis = m.analyze(term)[0].get(u'analysis')
                    if analysis:
                        # Keep the term unless it is a service part of speech
                        # or a single character.
                        if not analysis[0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        # The negation particle is always significant.
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        # No analysis available (e.g. unknown word): keep it.
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # Collapse repeated words into per-term occurrence counts.
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # tf computation.
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # idf computation (needs the full corpus counts, hence a
                # second pass).
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # Write the result.
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
开发者ID:pombredanne,项目名称:senty,代码行数:58,代码来源:standard_extractor_with_mystem_without_service_parts_of_speech.py

示例4: Runner

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
class Runner(object):
    """Interactive driver: read a text file, optionally lemmatize it with
    Mystem, split it into punctuation-free word lists and run a ``Lid``
    model on a 90/10 train/test split.

    NOTE(review): Python 2 code (``print`` statements, ``raw_input``,
    ``str.translate(None, ...)``).  ``Lid`` and ``is_cyrillic`` are defined
    elsewhere in this module.
    """
    def __init__(self, input_text):
        # Ask the user whether the input should be lemmatized first.
        self.lemmatize = None
        while True:
            response = raw_input("Do you want to lemmatize text first? (yes/no)\n").lower()
            if response == "yes":
                print "You should wait for a while"
                self.lemmatize = True
                self.stemmer = Mystem()
                break
            elif response == "no":
                self.lemmatize = False
                break

        self.word_lists = list()
        with open(input_text, "r") as f:
            for line in f:
                # Terminate each line so the final word group is flushed.
                line += "."
                if self.lemmatize:
                    lexemes = self.stemmer.lemmatize(line)
                    word_list = list()  # words not separated by punctuation marks
                    for lexeme in lexemes:
                        lexeme = lexeme.strip()
                        if lexeme:
                            if lexeme.translate(None, '.,?!:;()"\' -\t\n'):  # check that the lexeme is not a punctuation mark
                                lexeme = lexeme.decode("utf-8")
                                if is_cyrillic(lexeme):
                                    word_list.append(lexeme)
                            else:  # otherwise flush the collected words and start a new empty list
                                self.word_lists.append(word_list)
                                word_list = list()
                else:
                    # Not lemmatizing: pad punctuation with spaces so it
                    # splits into separate tokens.
                    line = line.replace(".", " . ").replace(",", " , ").replace(":", " : ").replace(";", " ; ")\
                        .replace("?", " ? ").replace("!", " ! ").replace("(", " ( ").replace(")", " ) ")\
                        .replace("--", " -- ").replace(".", " . ")
                    word_list = list()
                    for lexeme in line.split():
                        # Check that the lexeme is not a punctuation mark.
                        lexeme = lexeme.translate(None, '.,?!:;()"\'').replace("--", "").decode("utf-8").strip().lower()
                        if lexeme:
                            if is_cyrillic(lexeme):
                                word_list.append(lexeme)
                        else:
                            if word_list:
                                self.word_lists.append(word_list)
                            word_list = list()

        train, test = self.split()
        self.lid = Lid(train, test)
        self.lid.run()

    def split(self):
        """Split the collected word lists 90/10 into (train, test).

        NOTE: relies on Python 2 integer division in ``n*9/10``.
        """
        n = len(self.word_lists)
        train = self.word_lists[:n*9/10]
        test = self.word_lists[n*9/10:]
        return train, test

示例5: Index

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
class Index(object):
    """In-memory inverted index over a file of documents (one document per
    line), with terms normalized by Mystem lemmatization and lowercasing.

    NOTE(review): Python 2 code (``print`` statements,
    ``str.translate(None, ...)``, bytes/unicode mixing).
    """

    def __init__(self, input_file):
        # Mystem instance used to lemmatize every document line.
        self.stemmer = Mystem()
        self.documents = dict()  # doc_id -> original line (unicode)
        self.tokens = list()     # (token, doc_id) pairs, later sorted
        self.terms = dict()      # term -> (doc_count, [doc_ids])
        self.index = list()      # [(term, doc_count, [doc_ids])] in term order

        # reading documents, making tokenization
        with open(input_file, "r") as f:
            for i, line in enumerate(f, start=1):
                self.documents[i] = line.decode("utf-8")
                for word in self.stemmer.lemmatize(line):
                    # Strip punctuation; empty results are skipped.
                    token = word.translate(None, '.,?!:;()"\'-').decode("utf-8").strip()
                    if token:
                        self.tokens.append((token, i))

        # sorting by tokens first, then by frequency
        self.tokens.sort(key=lambda tup: (tup[0], tup[1]))

        # terminization and building index: walk the sorted (token, doc_id)
        # pairs, accumulating the posting list of each distinct term.
        current_term = self.tokens[0][0]
        current_doc_id = self.tokens[0][1]
        doc_ids = [current_doc_id]
        for token, doc_id in self.tokens:
            term = token.lower()
            if term == current_term:
                # Same term: extend its posting list, deduplicating doc ids
                # (pairs are sorted, so duplicates are adjacent).
                if doc_id != current_doc_id:
                    doc_ids.append(doc_id)
                    current_doc_id = doc_id
            else:
                # Term changed: flush the finished posting list.
                self.terms[current_term] = (len(doc_ids), doc_ids)
                self.index.append((current_term, len(doc_ids), doc_ids))
                current_term = term
                current_doc_id = doc_id
                doc_ids = [doc_id]
        # Flush the final term.
        self.terms[current_term] = (len(doc_ids), doc_ids)
        self.index.append((current_term, len(doc_ids), doc_ids))

    def print_to_file(self):
        """Dump the index to ``result.txt`` as 'term, doc_count, doc_ids' rows."""
        with open("result.txt", "w") as f:
            for term, count, doc_ids in self.index:
                f.write("{},\t{},\t{}\n".format(term.encode("utf-8"), count, doc_ids))

    def print_statistics(self):
        """Print the number of distinct terms and their average length."""
        terms_num = len(self.terms)
        terms_len = 0.
        for term in self.terms:
            terms_len += len(term)

        print "***********************"
        print "Number of terms = {}".format(terms_num)
        print "Average term length = {}".format(terms_len / terms_num)
        print "***********************"

示例6: extract

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
 def extract(self):
     """Compute tf-idf statistics over word n-grams (1..self.n) for every
     JSON document in ``self.input_directory``.

     All n-grams from length 1 up to ``self.n`` are generated from the
     lemmatized token stream and counted; one ``<name>_tf-idf`` JSON file
     per document is written to ``self.output_directory``.

     Returns True on success, False if any exception occurred (the
     exception itself is swallowed).
     """
     try:
         # Count how many files the input directory holds ('~' backups skipped).
         input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
         output_data = {}
         # Document frequency of each n-gram across the whole corpus.
         list_of_all_n_grams = {}
         m = Mystem()
         # Walk over the documents.
         for file in input_files:
             with open(self.input_directory + '/' + file) as data_file:
                 data = json.load(data_file)
             list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
             text = " ".join(["%s" % term for term in list_of_terms])
             list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
             # Build all (j+1)-gram tuples via the shifted-zip idiom.
             list_of_n_grams_tuples = {}
             for j in range(0, self.n):
                 list_of_n_grams_tuples[j] = zip(*[list_of_terms[i:] for i in range(j + 1)])
             # Flatten tuples into space-joined n-gram strings.
             list_of_n_grams_strings = []
             for j in range(0, self.n):
                 for gram_tuple in list_of_n_grams_tuples[j]:
                     string_of_n_gram = " ".join(["%s" % term for term in gram_tuple])
                     list_of_n_grams_strings.append(string_of_n_gram)
             output_data[file] = {}
             output_data[file]['id'] = data['id']
             output_data[file]['positive'] = data['positive']
             output_data[file]['sarcasm'] = data['sarcasm']
             output_data[file]['terms'] = {}
             # Collapse repeated n-grams into per-gram occurrence counts.
             for gram in list_of_n_grams_strings:
                 if gram not in output_data[file]['terms']:
                     output_data[file]['terms'][gram] = 1
                 else:
                     output_data[file]['terms'][gram] += 1
             for gram in output_data[file]['terms'].keys():
                 if gram not in list_of_all_n_grams:
                     list_of_all_n_grams[gram] = 1
                 else:
                     list_of_all_n_grams[gram] += 1
                 # tf computation.
                 count_of_n_grams = output_data[file]['terms'][gram]
                 output_data[file]['terms'][gram] = {'tf': float(count_of_n_grams)/len(list_of_n_grams_strings), 'idf': 0,
                                                     'count': float(count_of_n_grams)}
         for file in input_files:
             # idf computation (needs the full corpus counts, hence a second pass).
             for gram in output_data[file]['terms'].keys():
                 output_data[file]['terms'][gram]['idf'] = math.log(float(len(input_files))/list_of_all_n_grams[gram])
             # Write the result.
             with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                 json.dump(output_data[file], output_file)
     except Exception:
         return False
     else:
         return True
开发者ID:pombredanne,项目名称:senty,代码行数:55,代码来源:more_than_n_gram_extractor_with_mystem.py

示例7: mystem_using

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
def mystem_using(input_directory, output_directory):
    """Lemmatize every JSON document in *input_directory* with Mystem and
    write a JSON object with the keys ``id``, ``positive``, ``sarcasm`` and
    ``text`` (the lemmatized text) to *output_directory* under the same name.
    """
    analyzer = Mystem()
    collected = {}
    # Skip editor backup files ('~' suffix).
    for name in filter(lambda x: not x.endswith('~'), os.listdir(input_directory)):
        with open(input_directory + '/' + name) as source:
            document = json.load(source)
        raw_tokens = filter(lambda x: x != '', re.split(''' |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +''', document['text']))
        rebuilt = " ".join(["%s" % token for token in raw_tokens])
        lemmas = filter(lambda x: x not in ('', ' ', '\n'), analyzer.lemmatize(rebuilt))
        record = {}
        record['id'] = document['id']
        record['positive'] = document['positive']
        record['sarcasm'] = document['sarcasm']
        record['text'] = ' '.join(['%s' % token for token in lemmas])
        collected[name] = record

        with open(output_directory + '/' + name, 'w') as target:
            json.dump(record, target)
开发者ID:pombredanne,项目名称:senty,代码行数:21,代码来源:features.py

示例8: search

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
def search():
    """Flask view: look up the lemmatized search query among the dictionary
    entries in ``static/articles.xml`` and render the match as HTML.

    NOTE(review): relies on names imported elsewhere in this module
    (``codecs``, ``lxml``, ``re``, ``Mystem``, Flask's ``request``,
    ``render_template`` and ``Markup``).
    """
    cn = None
    file = codecs.open('static/articles.xml', 'r', 'utf-8')
    rfile = file.read()
    tree = lxml.etree.fromstring(rfile)
    res = tree.xpath('entry')
    # XML tag name -> human-readable (Russian) section heading.
    categ = {
        'cat': 'Категория', 'wgroup': 'Группа слов с близким значением', 'comm': 'Комментарии',
        'stdiff': 'Стилистические различия', 'overlap': 'Совпадающая часть значения',
        'dom': 'Доминанта группы', 'diffmark': 'Различительные признаки, релевантные для данной группы',
        'diff': 'Смысловые различия', 'rare': 'Редкие слова, примыкающие к группе',
        'anmean': 'Другие значения слов, входящих в группу', 'comb': 'Сочетаемость', 'reg': 'Региональные варианты',
        'adict': 'Данные академических словарей', 'doc': 'Нормативные документы',
        'etim': 'Этимология', 'ill': 'Иллюстрации'
    }
    file.close()
    ms = Mystem()
    # First lemma of the lowercased query string.
    wordsearch = ms.lemmatize(request.form['search'].lower())[0]

    for i in res:
        if wordsearch == '':
            cn = 'Пустой запрос'
        elif i.text.lower().startswith(wordsearch):
            # Collect every known section of the matched entry.
            arr = []
            for j in i.iter():
                for k in dict.keys(categ):
                    if j.tag == k:
                        if j.text != 'null':
                            arr.append('<font size="4"><b>' + str(categ[j.tag]) + '</b></font><br>' + str(j.text))
                # Join the sections and expand the source's lightweight
                # markup (* # = bold, $ % = italic, @ + = grey) into HTML.
                text = '<br><br>'.join([j for j in arr[1:]])
                text = re.sub('\*', '<b>', text)
                text = re.sub('\#', '</b>', text)
                text = re.sub('\$', '<i>', text)
                text = re.sub('\%', '</i>', text)
                text = re.sub('\@', '<font color="#696969">', text)
                text = re.sub('\+', '</font>', text)
                cn = '<strong><big>' + i.text + '</big></strong><br><br>' + re.sub('\n', '<br>', text)
            break
        else:
            cn = 'По Вашему запросу ничего не найдено. <br>' \
                 'Попробуйте использовать "Поиск по тегу" или измените запрос.'
    return render_template('search.html', cn=Markup(cn))
开发者ID:piskunova,项目名称:everydayobjectsdictionary,代码行数:44,代码来源:dictionary.py

示例9: Mystem

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
from pymystem3 import Mystem
# Module-level Mystem instance shared by the functions below.
m = Mystem()

# Smoke test: lemmatize a sample Russian sentence.
t = 'Чайника, сегодня не было'
lemma = m.lemmatize(t)  # list of lemmas interleaved with separator tokens


def lemmas(text):
    """Return *text* lemmatized with the module-level Mystem instance ``m``.

    Punctuation characters are stripped before lemmatization; space and
    newline tokens emitted by Mystem are dropped from the result.

    NOTE(review): dropping the separator tokens concatenates the remaining
    lemmas without spaces — confirm this is the intended output format.
    """
    punc = list('.?!-;:",')
    text = [i for i in text if i not in punc]
    text = ''.join(text)
    text = m.lemmatize(text)
    textn = ''
    for w in text:
        # BUG FIX: the original test `w is not ' ' or '\n'` was always true
        # (`or '\n'` is truthy, and `is` compares identity, not value), so
        # nothing was ever filtered.  Compare by value instead.
        if w != ' ' and w != '\n':
            textn += w
    return textn


from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Russian stop-word list from NLTK.
s_w = stopwords.words('russian')
sw = [i for i in s_w]

v = TfidfVectorizer(stop_words=sw) # remove stop-words
#v = TfidfVectorizer() # keep stop-words

# Corpora filled in further down the script (chunk is truncated here).
totalCorpus = []
suspenseCorpus = ''

示例10: lemma

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
def lemma(text):
    """Return *text* with every token replaced by its Mystem lemma.

    Mystem emits the original separator tokens alongside the lemmas, so the
    joined result preserves the spacing of the input.
    """
    # Reuse a single Mystem instance across calls: each Mystem() spawns an
    # external mystem process, which is expensive to do per call.
    m = getattr(lemma, '_mystem', None)
    if m is None:
        m = lemma._mystem = Mystem()
    lemmas = m.lemmatize(text)
    titleStemmed = ''.join(lemmas)
    return titleStemmed
开发者ID:IvankovCL,项目名称:new-repository,代码行数:7,代码来源:crawler.py

示例11: open

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
# Normalize the parsed sentences: for every line of <PREFIX>parsed.txt, write
# "* <cleaned line> **;* <lemmatized line> **" to <PREFIX>norm_sentences.txt,
# and accumulate the raw and normalized token lists in memory.
# NOTE(review): Python 2 code; ``PREFIX`` and ``normalizer`` (presumably a
# Mystem instance) are defined elsewhere in this module — confirm.
with open("../data/" + PREFIX + "norm_sentences.txt", "w") as writer:
    count = 0
    raw = []
    normalized = []

    for line in open("../data/" + PREFIX + "parsed.txt"):

        # Progress indicator, once per 1000 lines.
        if count % 1000 == 0:
            print count

        # Replace non-word/digit runs with spaces, collapse whitespace, lowercase.
        line = re.sub("[\W\d]+", " ", line.strip().decode("utf-8").strip(), 0, re.UNICODE)
        line = re.sub("\s+", " ", line.strip(), 0, re.UNICODE).lower()
        raw.extend(line.split(" "))
        writer.write("* " + line.encode("utf-8") + " **;")
        # Lemmatize twice: first to normalize the line, then to collect the
        # individual non-empty lemmas.
        line = " ".join(normalizer.lemmatize(line))
        line = re.sub("\s+", " ", line, 0, re.UNICODE)
        lemmatized = filter(lambda x: len(x.strip()) > 0, normalizer.lemmatize(line))
        normalized.extend(lemmatized)
        writer.write("* " + " ".join(lemmatized).encode("utf-8") + " **\n")

        count += 1

# print 'saving raw'
#
# with open("../data/raw_terms.txt", "w") as f:
#     for term in set(raw):
#         f.write(term.encode("utf-8") + "\n")
#
# print 'saving norm'

示例12: poehali

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]

#.........这里部分代码省略.........
						read_and_clean_xml = openindosug_xml.read()
						xml_data = amixml(read_and_clean_xml)
						#print(xml_data[2])
						openindosug_xml.close()
						'''
						Созидание директории для plain текста
						'''
						
						create_folder(path, year, transpose_month(month), "plain")
						forplain = path+"plain/"+year+"/"+transpose_month(month)+"/"+dest_html
						forplain_dir = path+"plain/"+year+"/"+transpose_month(month)+"/"
						shutil.copy(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html, forplain)
						print("FILE "+str(i)+" HB COPIED TO PLAIN")
						openindosug = open(forplain, "r")

						dates = re.sub("\">", "", dates)


						'''
						wri = лист для генерации ИНФО о статьи
						'''

						wri = ["briansk.ru", str(xml_data[1]), toddmmyyy(dates), "", row['url']]


						page2_txt = open(str(forplain_dir)+str(plain), 'w')
						for datline in openindosug:
							page2_txt.write(str(make_it_clean(datline)))
						
						page2_txt.close()
						print("PLAIN FOR "+str(i)+" HB CREATED")

						'''
						Окончательная очистка plain файла; оставляем только текст статьи или текст + ИНФО
						'''
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_new), wri, "extra")
						provide_clean_file(forplain_dir+str(plain),forplain_dir+str(plain_stem), wri, "mystem")
						os.remove(forplain_dir+str(plain))
						os.remove(forplain)
						openindosug.close()

						'''
						xml_data[0] -- content
						xml_data[1] -- headerTag
						xml_data[2] -- content date
						'''

						'''
						Генерация XML
						'''
						pageEtree = etree.Element('html')
						doc = etree.ElementTree(pageEtree)
						infoTag = etree.SubElement(pageEtree, "body")
						dateTag = etree.SubElement(infoTag, "h1")
						dateTag.text = str(xml_data[2])
						headerTag = etree.SubElement(infoTag, "h2")
						headerTag.text = str(xml_data[1])
						mainTag = etree.SubElement(infoTag, "h3")
						contentTag = etree.SubElement(infoTag, "h4")
						contentTag.text = str(xml_data[0])
						outFile = open(str(forxml_dir)+str(i)+".xml", 'wb')
						doc.write(outFile, xml_declaration=True, encoding='utf-16') 
						outFile.close()
						print("FILE "+str(i)+" HB CODED TO XML")

						writer.writerow([str(path+"html/"+year+"/"+transpose_month(month)+"/"+dest_html) , "briansk.ru" , "" , "" , str(xml_data[1]) , toddmmyyy(dates), 'публицистика' , "" , "" , "категория" , "" , "нейтральный" , "н-возраст" , "н-уровень" , "городская" , str(row['url']) , "брянск.ru" , "" , str(year) , "газета" , "Россия" , "БРЯНСК" , "ru"])
						os.remove(forxml)


						input_plain = forplain_dir + plain_stem
						output_plain = forplain_dir + output_plain_stem


						'''
						pystem
						mystem 

						'''
						
						with open(input_plain) as file:
						    text = file.read()
						

						lemmas = m.lemmatize(text)
						with open(input_plain, 'w') as file:
							file.write(''.join(lemmas))

						os.system(r'/home/haniani/Загрузки/mystem -icd '+ input_plain + ' ' + output_plain)
						os.system(r'/home/haniani/Загрузки/mystem -icd --format xml '+ input_plain +' '+ xml_stem)
						

						print("MYSTEM'ed "+str(i))
						break

				i += 1
				print("PASSED ; NEXT: "+str(i)+"\n")
	csv_file.close()
	        
	for file in glob.glob(path+"*.html"):
		os.remove(file)
开发者ID:haniani,项目名称:hse_zadanie3,代码行数:104,代码来源:3parts.py

示例13: open

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
    with open(file_in) as parsed_in, \
         open("..\\data\\stemmed\\" + name + "_mystem.tsv", "wb") as mystem_out:
         # open("..\\data\\stemmed\\" + name + "_porter.tsv", "wb") as porter_out, \

        parsed_in = csv.reader(parsed_in, delimiter='\t')
        mystem_out = csv.writer(mystem_out, delimiter='\t') #, quoting=csv.QUOTE_NONE

        mystem = Mystem()
        prep_counter = 0

        for row in parsed_in:
            exclude = ['\'', '\"', '.', ',', '!', '?', u'«', u'»']
            s = ''.join(ch for ch in row[1].decode("utf-8") if ch not in exclude)

            stemmed_tokens = m.lemmatize(s)
            stemmed_tokens = [token if emoticon_re.search(token) else token.lower() for token in stemmed_tokens]

            # punctuation = list(string.punctuation.decode("utf-8"))
            # stop = punctuation
            # stop = ['!', '"', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
            #         ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'] #'@',
            stop = ['rt', 'via', '...', "…".decode("utf-8")]
            stemmed_tokens = [token if token not in stop else '' for token in stemmed_tokens]

            stemmed_str = "".join([token for token in stemmed_tokens])
            mystem_out.writerow([row[0], stemmed_str.encode("utf-8").replace('\n', ' ')])

            # Print a status message every 1000th review
            if prep_counter % 100. == 0.:
                print "Lemmatize %d strings" % (prep_counter)
开发者ID:Semen52,项目名称:FSA2,代码行数:32,代码来源:preprocessing.py

示例14: extract

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
    def extract(self):
        """Compute tf-idf statistics with repeated-letter collapsing.

        Each JSON document in ``self.input_directory`` is tokenized and
        lemmatized with Mystem; in every resulting term, runs of the same
        consecutive letter are collapsed to one letter (elongation cleanup).
        One ``<name>_tf-idf`` JSON file per document is written to
        ``self.output_directory``.

        Returns True on success, False if any exception occurred (the
        exception itself is swallowed).
        """
        try:
            # Count how many files the input directory holds ('~' editor
            # backups are skipped).
            input_files = filter(lambda x: not x.endswith("~"), os.listdir(self.input_directory))
            output_data = {}
            # Document frequency of each term across the whole corpus.
            list_of_all_terms = {}
            m = Mystem()
            # Walk over the documents.
            for file in input_files:
                with open(self.input_directory + "/" + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data["text"]))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                # Collapse runs of a repeated letter in every term.
                my_list_of_terms = []
                for term in list_of_terms:
                    my_term = term
                    term = u""
                    prev_letter = my_term[0]
                    term += my_term[0]
                    for i in range(1, len(my_term)):
                        if my_term[i] != prev_letter:
                            term += my_term[i]
                        prev_letter = my_term[i]
                    my_list_of_terms.append(term)
                list_of_terms = my_list_of_terms
                output_data[file] = {}
                output_data[file]["id"] = data["id"]
                output_data[file]["positive"] = data["positive"]
                output_data[file]["sarcasm"] = data["sarcasm"]
                output_data[file]["terms"] = {}
                # Collapse repeated words into per-term occurrence counts.
                for term in list_of_terms:
                    if term not in output_data[file]["terms"]:
                        output_data[file]["terms"][term] = 1
                    else:
                        output_data[file]["terms"][term] += 1
                for term in output_data[file]["terms"].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # tf computation.
                    count_of_terms = output_data[file]["terms"][term]
                    output_data[file]["terms"][term] = {
                        "tf": float(count_of_terms) / len(list_of_terms),
                        "idf": 0,
                        "count": count_of_terms,
                    }

            for file in input_files:
                # idf computation (needs the full corpus counts, hence a
                # second pass).
                for term in output_data[file]["terms"].keys():
                    output_data[file]["terms"][term]["idf"] = math.log(
                        float(len(input_files)) / list_of_all_terms[term]
                    )
                # Write the result.
                with open(self.output_directory + "/" + file + "_tf-idf", "w") as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
开发者ID:pombredanne,项目名称:senty,代码行数:65,代码来源:standard_extractor_with_mystem_and_considering_multiple_letters.py

示例15: Mystem

# 需要导入模块: from pymystem3 import Mystem [as 别名]
# 或者: from pymystem3.Mystem import lemmatize [as 别名]
# Lemmatize the text given as the first command-line argument with pymystem3
# and print the result to stdout.
import sys
from pymystem3 import Mystem

text = sys.argv[1]
m = Mystem()

lemmas = m.lemmatize(text)

# Mystem keeps the original separator tokens, so joining reproduces spacing.
print(''.join(lemmas))
开发者ID:pased,项目名称:katabasia-twitter-sna,代码行数:12,代码来源:lemmatize.py


注:本文中的pymystem3.Mystem.lemmatize方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。