

Python Mystem.analyze Method Code Examples

This article collects typical usage examples of the Python method pymystem3.Mystem.analyze from open-source projects. If you are wondering what Mystem.analyze does, how to call it, or how it is used in practice, the curated examples below should help. You can also explore further usage examples of the containing class, pymystem3.Mystem.


The following presents 15 code examples of the Mystem.analyze method, sorted by popularity by default.
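Before reading the examples, it helps to know the shape of the data Mystem.analyze returns. The following is a minimal sketch (it assumes pymystem3 and its bundled mystem binary are installed; the exact grammar strings vary with the mystem version):

from pymystem3 import Mystem

m = Mystem()
# analyze() returns a list of token dicts. Every token has a 'text' key;
# word tokens also carry an 'analysis' list whose entries hold the lemma
# ('lex') and a grammar string ('gr'), e.g. 'S,жен,од=им,ед' for a noun.
for token in m.analyze("мама мыла раму"):
    print(token["text"], token.get("analysis"))

Most of the examples below branch on whether 'analysis' is present and non-empty, because punctuation and whitespace tokens carry no analysis.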

Example 1: without_pronouns

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def without_pronouns(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))
        my_list = list_of_terms
        list_of_terms = []
        for term in my_list:
            result = m.analyze(term)[0]  # analyze each term only once
            if result.get(u'analysis'):
                if not result[u'analysis'][0][u'gr'].startswith((u'SPRO', u'APRO')):
                    list_of_terms.append(term)
            else:
                list_of_terms.append(term)
        text_of_output = ' '.join(list_of_terms)

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 29 | Source: features.py

Example 2: lmtze

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def lmtze(textfile):
    m = Mystem()
    text = open(textfile, encoding='utf-8').readlines()
    newfile = open(textfile.replace('txt', 'lem.txt'), 'w', encoding='utf-8')
    result_full = []
    for line in text:
        try:
            element = etree.fromstring(line.strip('\n'))
            text_ = element.xpath('text()')
            entities = element.xpath('*')
            result = ['<sent>']
            while text_:
                l = text_.pop(0)
                # open('temp.txt', 'w', encoding='utf-8').write(l)
                # subprocess.call(['C:\\Mystem\\mystem', 'i'])
                l = m.analyze(l)
                # print(l)
                for x in l:
                    if x.get('analysis') is not None:
                        if x.get('analysis') == []:
                            result.append(x['text'])
                        else:
                            result.append(x['analysis'][0]['lex'] + '_' + x['analysis'][0]['gr'].split(',')[0].split('=')[0])
                    else:
                        continue

                if text_:
                    e = entities.pop(0)
                    e_ = m.analyze(e.text)
                    result.append('<' + e.tag + '>')
                    for x in e_:
                        if x.get('analysis') is not None:
                            if x.get('analysis') == []:
                                result.append(x['text'])
                            else:
                                result.append(x['analysis'][0]['lex'])
                        else:
                            continue
                    result.append('</' + e.tag + '>')
        except Exception:
            continue
        result.append('</sent>')
        result_full.append(result)
        result = []
        print(len(result_full), ' sentences parsed')
    for sent in result_full:
        prev = ''
        for x in sent:
            if '<' in x and '/' not in x:
                newfile.write(prev + x)
                prev = ''
            elif '_' in x or x.isalpha():
                newfile.write(prev + x)
                prev = ' '
            else:
                newfile.write(x)
        newfile.write('\n')
    newfile.close()
Author: mannefedov | Project: Relext | Lines: 59 | Source: lmtze.py

Example 3: extract

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    def extract(self):
        try:
            # count how many files are in the directory
            input_files = filter(lambda x: not x.endswith('~'), os.listdir(self.input_directory))
            output_data = {}
            list_of_all_terms = {}
            m = Mystem()
            # iterate over the documents
            for file in input_files:
                with open(self.input_directory + '/' + file) as data_file:
                    data = json.load(data_file)
                list_of_terms = filter(lambda x: x != "", re.split(""" |\.|,|:|\?|"|\n|<|>|\*|!|@|_ +""", data['text']))
                text = " ".join(["%s" % term for term in list_of_terms])
                list_of_terms = filter(lambda x: x not in (" ", "\n"), m.lemmatize(text))
                my_list = list_of_terms
                list_of_terms = []
                for term in my_list:
                    result = m.analyze(term)[0]  # analyze each term only once
                    if result.get(u'analysis'):
                        if not result[u'analysis'][0][u'gr'].startswith(self.service_parts_of_speech) and len(term) > 1:
                            list_of_terms.append(term)
                        if term == u'не':
                            list_of_terms.append(term)
                    else:
                        list_of_terms.append(term)
                output_data[file] = {}
                output_data[file]['id'] = data['id']
                output_data[file]['positive'] = data['positive']
                output_data[file]['sarcasm'] = data['sarcasm']
                output_data[file]['terms'] = {}
                # deduplicate words by counting occurrences
                for term in list_of_terms:
                    if term not in output_data[file]['terms']:
                        output_data[file]['terms'][term] = 1
                    else:
                        output_data[file]['terms'][term] += 1
                for term in output_data[file]['terms'].keys():
                    if term not in list_of_all_terms:
                        list_of_all_terms[term] = 1
                    else:
                        list_of_all_terms[term] += 1
                    # compute tf
                    count_of_terms = output_data[file]['terms'][term]
                    output_data[file]['terms'][term] = {'tf': float(count_of_terms)/len(list_of_terms), 'idf': 0,
                                                        'count': count_of_terms}

            for file in input_files:
                # compute idf
                for term in output_data[file]['terms'].keys():
                    output_data[file]['terms'][term]['idf'] = math.log(float(len(input_files))/list_of_all_terms[term])
                # write the result
                with open(self.output_directory + '/' + file + '_tf-idf', 'w') as output_file:
                    json.dump(output_data[file], output_file)
        except Exception:
            return False
        else:
            return True
Author: pombredanne | Project: senty | Lines: 58 | Source: standard_extractor_with_mystem_without_service_parts_of_speech.py

Example 4: set

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    if options:
        title = options.group(1)
        for stuff in title.split('|'):
            yield gr.replace("(" + title + ")", stuff)
    else:
        yield gr


lines = set()

with open("data/test.txt", "r") as input_file:
    logging.info("file opened")

    for line in input_file:
        for w in m.analyze(line):

            if 'analysis' in w:
                for item in w['analysis']:
                    for gramm_info in parse_gr(item['gr']):
                        lines.add("\t".join(
                            [gramm_info, item['lex'], w['text'].lower()]).encode("utf-8") + "\n")

with open("data/pairs_with_grammar.tsv", "w+") as f:
    for line in lines:
        f.write(line)

dict = {}

for line in open("data/pairs_with_grammar.tsv", "r+"):
    if line.strip():
Author: alexeyev | Project: nm | Lines: 32 | Source: data_preparation.py

Example 5: index

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def index(name = None):
    if request.args:
        story = request.args['joke'] 
        mystem = Mystem()
        gramm = mystem.analyze(story)
        characters = set()
        for i in gramm:
            # crude animacy check on the token dict's repr: keep lexemes marked animate ("од=") but not inanimate ("неод=")
            if (str(i).find("од=") != -1) and (str(i).find("неод=") == -1):
                s1 = str(i)[str(i).find("'lex': '") + 8:]
                characters.add(s1[:s1.find("'")])
        
        file = open("corp.txt", 'r', encoding = "UTF-8")
        f = file.read()[1:].split('\n\n')
        file.close()
        
        file = open("ans.txt", 'w', encoding = "UTF-8")
        for i in f:
            words = ((re.sub('[,\.\?\!\—\-\(\)\:\;]', '', i)).lower()).split(' ')
            if characters <= set(words):
                file.write(i + '\n\n')
        file.close()
        with open("ans.txt", "r", encoding='utf-8') as f:
                content = f.read().split('\n\n')
        return render_template("index.html", content=content)        
    return render_template('index.html')
Author: polinadyakonova | Project: homeworks | Lines: 27 | Source: project.py

Example 6: __init__

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    def __init__(self, path):

        self.text = open(path).read().lower()
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if len(sentence) > 1]
        self.pos_data = []

        m = Mystem()
        counter = [0, 0, 0, 0, 0]

        for sentence in self.sentences:

            # parse with mystem
            # count adjectives A, adverbs ADV, pronouns PR, nouns S, verbs V
            # (counter slots are alphabetical: [A, ADV, PR, S, V])
            data = m.analyze(sentence)
            for word in data:
                analysis = word.get('analysis', None)
                if analysis:
                    best = analysis[0]
                    gr = best['gr']
                    if 'S' in gr:
                        counter[3] += 1
                    elif 'ADV' in gr:
                        counter[1] += 1
                    elif 'A' in gr:
                        counter[0] += 1
                    elif 'V' in gr:
                        counter[4] += 1
                    elif 'PR' in gr:
                        counter[2] += 1

            self.pos_data.append(counter)
            counter = [0, 0, 0, 0, 0]

        self.data = np.array(self.pos_data)
Author: Sereni | Project: assignments | Lines: 36 | Source: genre_by_pos.py

Example 7: __init__

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    def __init__(self, path, doc_id, limit):
        """
        :param doc_id: numerical id of a document, pass manually
        """

        self.text = open(path).read().lower().replace('\n', '.')
        # need a better regex
        self.sentences = [sentence for sentence in re.split(r'(?:[.]\s*){3}|[.?!]', self.text) if sentence and len(sentence.split()) > 2]
        self.pos_data = []
        self.testing_data = []
        self.id = doc_id

        m = Mystem()
        counter = Counter(DEFAULTS)

        if not limit or limit > len(self.sentences):
            limit = len(self.sentences)

        for sentence in self.sentences[:limit]:

            # parse with mystem
            data = m.analyze(sentence)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Author: Sereni | Project: assignments | Lines: 35 | Source: ageeva_learning.py

Example 8: with_not

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def with_not(directory):
    input_files = filter(lambda x: not x.endswith('~'), os.listdir(directory))
    output_data = {}
    m = Mystem()
    # iterate over the documents
    for input_file in input_files:
        with open(directory + '/' + input_file) as data_file:
            data = json.load(data_file)
        list_of_terms = filter(lambda x: x not in ('', ' ', '\n'), data['text'].split(' '))


        # merge 'не' (not) with the following word
        nums_of_bigrams = []
        # intensifiers that may stand between 'не' and the negated word
        helping_words = [u'совсем', u'очень', u'слишком', u'самый']
        for i in range(0, len(list_of_terms)):
            if list_of_terms[i] != u'не':
                continue
            # bounds guards protect against 'не' at the end of the text
            if i + 1 < len(list_of_terms) and list_of_terms[i+1] not in helping_words:
                if m.analyze(list_of_terms[i+1])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+1])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+1))
            elif i + 2 < len(list_of_terms) and list_of_terms[i+1] in helping_words:
                if m.analyze(list_of_terms[i+2])[0].get(u'analysis'):
                    if not m.analyze(list_of_terms[i+2])[0][u'analysis'][0][u'gr'].startswith(u'S,'):
                        nums_of_bigrams.append((i, i+2))
        for i in range(0, len(nums_of_bigrams)):
            if nums_of_bigrams[i][0] + 1 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1]] = ''
            elif nums_of_bigrams[i][0] + 2 == nums_of_bigrams[i][1]:
                list_of_terms[nums_of_bigrams[i][0]] = list_of_terms[nums_of_bigrams[i][0]] + list_of_terms[nums_of_bigrams[i][1]]
                list_of_terms[nums_of_bigrams[i][1] - 1] = ''
                list_of_terms[nums_of_bigrams[i][1]] = ''
        list_of_terms = filter(lambda x: x != '', list_of_terms)


        text_of_output = ' '.join(list_of_terms)

        output_data[input_file] = {}
        output_data[input_file]['id'] = data['id']
        output_data[input_file]['positive'] = data['positive']
        output_data[input_file]['sarcasm'] = data['sarcasm']
        output_data[input_file]['text'] = text_of_output

        with open(directory + '/' + input_file, 'w') as output_file:
            json.dump(output_data[input_file], output_file)
Author: pombredanne | Project: senty | Lines: 46 | Source: features.py

Example 9: build_pos

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
    def build_pos(self):

        m = Mystem()
        counter = Counter(DEFAULTS)

        for doc in self.documents:

            # parse with mystem
            data = m.analyze(doc.text)

            # get POS and count for each sentence
            pos = [word.get('analysis', None)[0]['gr'].split('(')[0].split(',')[0].split('=')[0]
                   for word in data if word.get('analysis', None)]
            counter.update(pos)

            # append to dataset
            self.pos_data.append([counter[key] for key in sorted(counter)])

            # reset counter
            counter = Counter(DEFAULTS)
Author: Sereni | Project: assignments | Lines: 22 | Source: learning_news.py

Example 10: produce_lemmas

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def produce_lemmas(connection, tableName, outputTableName):
    mystem = Mystem()
    cursor = connection.cursor()
    inserter = connection.cursor()

    query = 'DELETE FROM `%s`' % outputTableName
    inserter.execute(query)
    connection.commit()

    query = 'SELECT * FROM `%s`' % tableName
    cursor.execute(query)
    query = 'INSERT INTO `' + outputTableName + '` (`' + tableName + '_id`, `word_class_id`, `lex`, `gr`) ' \
            'SELECT %i, `id`, "%s", "%s" FROM `word_classes` WHERE `abbr`="%s"'
    for id, concept, scheme in cursor:
        lemmas = mystem.analyze(concept)
        for lemma in lemmas:
            for analysis in lemma.get('analysis', []):
                inserter.execute(query % prepare_content(id, analysis))
    connection.commit()

    cursor.close()
Author: Brinit | Project: nlp | Lines: 23 | Source: lemma.py

Example 11: fill_mystem

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def fill_mystem():
    from pymystem3 import Mystem
    m = Mystem()
    for sentence in get_sentences(1):
        lemmas = m.analyze(sentence.source)
        items = list()
        for lemma in lemmas:
            text = lemma['text']
            analysis = lemma.get('analysis')
            if not analysis:
                text = text.strip()
                if not len(text):
                    print 'spaces = "%s"' % text
                    continue
                if ' ' in text:
                    for item in re.split('\s+', text):
                        items.append("%s   %s ?" % (item, item))
                    print 'several =', "|".join(re.split('\s+', text))
                    continue
                print 'delimiter = "%s"' % text
                items.append("%s   %s ?" % (text, text))
                continue

            if not len(text.strip()):
                raise Exception('Impossible')
            if ' ' in text:
                raise Exception('Impossible')

            lexemes = list()
            for lexeme in analysis:
                print 'lex=', lexeme.get('lex', '-')
                print 'gr=', lexeme.get('gr', '-')
                lexemes.append("%s %s" % (lexeme['lex'], lexeme['gr']))
            items.append("%s   %s" % (text, '  '.join(lexemes)))
        sentence.mystem = '\n'.join(items)
        sentence.save()
Author: 2vitalik | Project: collocations | Lines: 38 | Source: other.py

Example 12: Mystem

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
import os, json, dicttoxml
from pymystem3 import Mystem

m = Mystem()
top = 'C:\\Users\\John\\Desktop\\py_files\\питон\\korpus\\no_marks'
for root, dirs, files in os.walk(top):
    for name in files:
        loc = os.path.join(root, name)
        loc_list = loc.split('\\')  #creates list in order to remove path content
        new_root = loc.replace('\\no_marks\\{0}\\{1}\\{2}'.format(loc_list[8], loc_list[9], loc_list[10]), '') #removes path ending
        dir_marks = os.path.join(new_root + '\\marks\\{0}\\{1}'.format(loc_list[8], loc_list[9]))   #adds new path ending for json.docs
        dir_xml = os.path.join(new_root + '\\xml\\{0}\\{1}'.format(loc_list[8], loc_list[9]))       #adds new path ending for xml docs
        new_name = name.replace('.txt', '')
        if not os.path.exists(dir_marks):   #makes necessary dirs if not present
            os.makedirs(dir_marks)
        if not os.path.exists(dir_xml):
            os.makedirs(dir_xml)
        with open(loc, "r", encoding = 'utf-8') as doc:
            text_doc = doc.read()
            info = json.dumps(m.analyze(text_doc), ensure_ascii = False)  #creates a json string with gram and lem info
        with open("{0}\\{1}.json".format(dir_marks, new_name), 'w', encoding = 'utf-8') as doc_marks:
            doc_marks.write(info)
        xml = dicttoxml.dicttoxml(info).decode('utf-8')     #converts json to xml
        with open("{0}\\{1}.xml".format(dir_xml, new_name), 'w', encoding = 'utf-8') as doc_xml:
            doc_xml.write(xml)


Author: lylax47 | Project: Homework-and-Stuffs | Lines: 28 | Source: Stemmer.py

Example 13: Mystem

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
# coding: utf-8

from pymystem3 import Mystem
# text = "Голкипер «Нью-Йорк Айлендерс» а-б г-н ваыва-ыфвафыа Выступая на пресс-конференции в Лондоне, он подчеркнул, что опубликованные необработанные отчеты с мест боевых действий не содержат имен или информации, которая может повредить каким-либо лицам или организациям. Красивая, — 123.2 latin мама 4,5 7:8 красиво мыла раму"
text = "слив воды"
m = Mystem()
# lemmas = m.lemmatize(text)
# print(''.join(lemmas))
lemmas = m.analyze(text)
for lemma in lemmas:
    print '#"%s"' % lemma['text']
    a = lemma.get('analysis')
    # print a
    if a:
        for b in a:
            print 'lex=', b.get('lex', '-')
            print 'gr=', b.get('gr', '-')
    print
Author: 2vitalik | Project: collocations | Lines: 20 | Source: _helloworld.py

Example 14: MystemOCTagger

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
class MystemOCTagger(object):
	def __init__(self):
		self.mystem_inst = Mystem()


	def run_and_convert(self, input_file, output_file, strict_match = False):
		f_in = open(input_file, 'rb')
		f_out = open(output_file, 'w+')
		context = etree.iterparse(f_in, tag='sentence')
		for event, sentence_elem in context:
			sentence = sentence_elem.find('source')
			analyzed = self.analyze_sentence(sentence.text)
			tokens_tree = sentence_elem.find('tokens')
			tokens = self.extract_tokens(tokens_tree)
			matched = self.match_analyzed_tokens(tokens, analyzed, strict_match)

			result = self.analyzed_to_csv_list(matched)
			for s in result:
				f_out.write(s+'\n')

			sentence_elem.clear()

	def analyze_sentence(self, sentence):
		return self.mystem_inst.analyze(sentence)

	# builds word-index mapping, indices sorted in order of appearance
	def extract_tokens(self, tokens_tree):
		tokens_dict = {}
		for t in tokens_tree.iter('token'):
			idx = t.get('id')
			token = t.get('text')
			token = strip_word(token)
			if (len(token) > 0):
				if token in tokens_dict:
					tokens_dict.get(token).append(idx)
				else:
					tokens_dict[token] = [idx]

		return tokens_dict


	# matches analysis with original tokens indices   
	def match_analyzed_tokens(self, tokens_index, analyzed, strict_match = False):
		analysis_indexed = {}
		unindexed = []
		for t in analyzed:
			t_text = t.get('text')
			t_text = strip_word(t_text)
			if len(t_text) > 0:
				if t_text in tokens_index:
					idx = tokens_index.get(t_text).pop(0)
					if (len(tokens_index.get(t_text)) == 0):
						tokens_index.pop(t_text)
					analysis_indexed[idx] = t.get('analysis')
				else:
					unindexed.append(t)

		if (not strict_match):
			analysis_not_strict = {}
			if len(tokens_index) > 0:
				analysis_not_strict = self.match_not_strict(tokens_index, unindexed)

			analysis_indexed.update(analysis_not_strict)

		not_analyzed = []
		if len(tokens_index) > 0:
			for t in tokens_index:
				not_analyzed.append(t)

#		if len(not_analyzed) > 0:
#			f_unindexed = open('mismatch.txt', 'a+')
#			f_unindexed.write('oc ')
#			f_unindexed.write(str(not_analyzed)+'  ')
#
#			if len(unindexed) > 0:
#				f_unindexed = open('mismatch.txt', 'a+')
#				for u in unindexed:
#					f_unindexed.write(' ')
#					f_unindexed.write(str(u.get('text')))

#			f_unindexed.write('\n')


		return analysis_indexed

	def match_not_strict(self, tokens_index, analyzed):
		analysis_indexed = {}
		for t_indexed, idx_list in tokens_index.items():
			for idx in idx_list:
				for i in range(0, len(analyzed)):
					t_analyzed = analyzed[i]
					if t_indexed.endswith(t_analyzed.get('text')):
						analysis_indexed[idx] = t_analyzed.get('analysis')
						#print(t_analyzed.get('text')+' '+t_indexed)
						analyzed.pop(i)
						idx_list.remove(idx)
						break

		idx_copy = tokens_index.copy()
		for t, i in idx_copy.items():
#.........some code omitted here.........
Author: arkhycat | Project: oc_other_taggers | Lines: 103 | Source: mystem_oc.py

Example 15: main

# Required import: from pymystem3 import Mystem [as alias]
# Or: from pymystem3.Mystem import analyze [as alias]
def main(argv):			
		with open(argv[1], encoding='utf-8') as f:
		    s = re.sub(r'\s+', ' ', f.read(), flags=re.M)
		f=re.split(r'(?<=[.!?…]) ',s)
		sentens=[]
		for i,t in enumerate(f):
		    sentens.append(t)
		    print(str(i)," ",t)




		morph = pymorphy2.MorphAnalyzer() 

		ZnakiP=[",","!",".",":",";",'"',"'","\n","...","?","!","(",")","-"," ","  "]  # punctuation and whitespace tokens to skip
		t = Mystem()
		PARS=[]
		for sent in sentens:
		    input_file=open("input.txt","w",encoding="utf-8")
		    input_file.write(sent)
		    input_file.close()
		    
		    # Parse the sentence syntactically to find the grammatical cores
		    process = subprocess.Popen('tomitaparser.exe config.proto', stdout=subprocess.PIPE,shell=True) 
		    process.communicate()
		    process.wait()
		    
		    predicate=[]
		    Nouns=[]
		    DOP=[]
		    DOP.append({})
		    OPR=[]
		    with open("pretty.html",encoding='utf8') as fp:
		            soup = BeautifulSoup(fp,"html.parser")    
		    par_f=soup.find_all('table')
		    for table in par_f:
		        th=table.find('th')    
		        if(th.text=="Noun1"):
		            slovo=th.find_parent("table").find('a').text
		            Nouns.append(slovo)
		        if(th.text=="Verb1"):
		            slovo=th.find_parent("table").find('a').text
		            predicate.append(slovo)
		        if(th.text=="OPR1"):
		            sl=th.find_parent("table").find_all('a')
		            for slovo in sl:
		                OPR.append(slovo.text)
		        if(th.text=="DOP1"):
		            sl=th.find_parent("table").find_all('a')
		            for slovo in sl:
		                DOP[0][slovo.text.lower()]=slovo.next_element.next_element.next_element.next_element
		    TREE={}
		    TREE[Nouns[0]]={} 

		    

		    for v in predicate:
		        TREE[Nouns[0]][v]={}
		    if(OPR!=[]):
		            for temp in OPR:
		                for noun in TREE:
		                    if(len(re.split(r"[,' ']",temp))==1):
		                        TREE[Nouns[0]][temp]=t.analyze(temp)[0]['analysis'][0]['gr']
		                    else:
		                            m2=[]
		                            for f in re.split(r"[,' ']",temp):
		                                if(f!=''):
		                                    m2.append(f)
		                            if(noun in m2):
		                                mk=t.analyze(temp)
		                                wsp=[]
		                                for tr in mk:
		                                    if(not tr['text'] in ZnakiP):
		                                        if(not 'CONJ' in tr['analysis'][0]['gr']):
		                                            wsp.append(tr['text'])
		                                for tl in wsp:
		                                    if(tl!=noun):
		                                        TREE[Nouns[0]][tl]=t.analyze(tl)[0]['analysis'][0]['gr']



		    for temp in TREE[Nouns[0]]:
		        if(temp in DOP[0].values()):
		            for sp in DOP[0]:
		                if(DOP[0][sp]==temp):
		                    m2=[]
		                    for f in re.split(r"[,' ']",sp):
		                        if(f!=''):
		                            m2.append(f)                         
		                    for rg in m2:                    
		                        TREE[Nouns[0]][temp][rg]={}
		                        for _opr in OPR:
		                            reg=re.split(r"[,' ']",temp)                        
		                            if(noun in reg):
		                                mk=t.analyze(_opr)
		                                wsp=[]
		                                for tr in mk:
		                                    if(not tr['text'] in ZnakiP):
		                                        if(not 'CONJ' in tr['analysis'][0]['gr']):
		                                            wsp.append(tr['text'])
#.........some code omitted here.........
Author: mforv | Project: Signify | Lines: 103 | Source: denotat.py


Note: The pymystem3.Mystem.analyze method examples on this page were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult each project's license before distributing or using the code; do not reproduce without permission.