

Python nltk.RegexpParser Method Code Examples

This article collects typical usage examples of the nltk.RegexpParser method in Python. If you are unsure what nltk.RegexpParser does, how to call it, or where it is useful, the curated examples below should help. You can also explore other usage examples from the nltk package.


The following presents 13 code examples of the nltk.RegexpParser method, drawn from open-source projects and sorted by popularity by default.
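All of the examples below share one basic pattern: compile a chunk grammar with nltk.RegexpParser, then apply it to a POS-tagged sentence. As a warm-up, here is a minimal sketch; the grammar and sentence are illustrative and not taken from any of the examples:

import nltk

# Optional determiner, any number of adjectives, then a noun.
cp = nltk.RegexpParser(r"NP: {<DT>?<JJ>*<NN>}")

# RegexpParser operates on (token, POS-tag) pairs, not raw text.
sentence = [("the", "DT"), ("little", "JJ"), ("dog", "NN"),
            ("barked", "VBD"), ("at", "IN"), ("the", "DT"), ("cat", "NN")]
print(cp.parse(sentence))
# (S (NP the/DT little/JJ dog/NN) barked/VBD at/IN (NP the/DT cat/NN))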

Example 1: text_to_num

# Required import: import nltk
# Or: from nltk import RegexpParser
# Note: `t2n` below is a text-to-number helper imported elsewhere in the source file
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)

    for subtree in chunkedData.subtrees(filter=lambda t: t.label() == "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
        print("text2num -> ", text)
    return text 
Author: abhi007tyagi | Project: JARVIS | Lines: 23 | Source: math_expression_calculator.py
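A hypothetical call to the function above; the input string is illustrative, and the exact output depends on NLTK's POS tags and on the `t2n` helper:

result = text_to_num("add twenty five and seven")
# might yield something like "add 25 and 7", depending on how the tokens are chunked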

Example 2: __init__

# Required import: import nltk
# Or: from nltk import RegexpParser
def __init__(self):
        self.grammar = r"""
                        VP: {<ADJ_SIM><V_PRS>}
                        VP: {<ADJ_INO><V.*>}
                        VP: {<V_PRS><N_SING><V_SUB>}
                        NP: {<N_SING><ADJ.*><N_SING>}
                        NP: {<N.*><PRO>}
                        VP: {<N_SING><V_.*>}
                        VP: {<V.*>+}
                        NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
                        DNP: {<DET><NP>}
                        PP: {<ADJ_CMPR><P>}
                        PP: {<ADJ_SIM><P>}
                        PP: {<P><N_SING>}
                        PP: {<P>*}
                        DDNP: {<NP><DNP>}
                        NPP: {<PP><NP>+}
                        """

        self.cp = nltk.RegexpParser(self.grammar) 
Author: ICTRC | Project: Parsivar | Lines: 22 | Source: chunker.py

Example 3: myParser

# Required import: import nltk
# Or: from nltk import RegexpParser
def myParser():
    grammar = '\n'.join([
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    ])
    return nltk.RegexpParser(grammar) 
Author: PacktPublishing | Project: Natural-Language-Processing-with-Python-Cookbook | Lines: 9 | Source: Training.py
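A usage sketch for myParser(), on an illustrative tagged sentence:

sent = [("The", "DT"), ("White", "NNP"), ("House", "NNP"),
        ("issued", "VBD"), ("a", "DT"), ("short", "JJ"), ("statement", "NN")]
print(myParser().parse(sent))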

Example 4: test_baseline

# Required import: import nltk
# Or: from nltk import RegexpParser
# Also needed here: from nltk.corpus import conll2000
def test_baseline():
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    # print(len(test_sents[0]))
    # print(test_sents[0])
    print(cp.evaluate(test_sents)) 
Author: PacktPublishing | Project: Natural-Language-Processing-with-Python-Cookbook | Lines: 8 | Source: Training.py

Example 5: test_regexp

# Required import: import nltk
# Or: from nltk import RegexpParser
# Also needed here: from nltk.corpus import conll2000
def test_regexp():
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents)) 
Author: PacktPublishing | Project: Natural-Language-Processing-with-Python-Cookbook | Lines: 7 | Source: Training.py
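Run together, these two functions contrast an empty grammar against the one-rule chunker; each prints an NLTK ChunkScore (IOB accuracy, precision, recall, F-measure). A sketch:

if __name__ == '__main__':
    test_baseline()  # empty grammar: no chunks are proposed at all
    test_regexp()    # one-rule grammar: markedly better chunking scores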

Example 6: extract_experience

# Required import: import nltk
# Or: from nltk import RegexpParser
# Also needed here: from nltk.stem import WordNetLemmatizer; from nltk.corpus import stopwords
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    # word tokenization 
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize  
    filtered_sentence = [w for w in word_tokens if w not in stop_words and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    
    test = []
    
    for vp in list(cs.subtrees(filter=lambda x: x.label()=='P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))

    # Search for 'experience' in each chunk and keep the text that follows it ('experience' is 10 characters long)
    x = [x[x.lower().index('experience') + 10:] for i, x in enumerate(test) if x and 'experience' in x.lower()]
    return x 
Author: OmkarPathak | Project: ResumeParser | Lines: 34 | Source: utils.py
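A hypothetical call with a toy input; real resume text is much longer, and the result depends on the POS tags NLTK assigns:

text = "Jane Doe. Experience Software Engineer at Acme Corporation since 2019."
print(extract_experience(text))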

Example 7: setup_extractor

# Required import: import nltk
# Or: from nltk import RegexpParser
# Note: PunktSentenceSplitter is defined in the StrepHit codebase
def setup_extractor(self):
        self.splitter = PunktSentenceSplitter(self.language)
        grammar = self.grammars.get(self.language)
        if grammar:
            self.parser = RegexpParser(grammar)
        else:
            raise ValueError(
                "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                    self.language, self.grammars.keys())
            )

        for lemma, match_tokens in self.lemma_to_token.items():
            self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
Author: Wikidata | Project: StrepHit | Lines: 15 | Source: extract_sentences.py

Example 8: get_parse_info

# Required import: import nltk
# Or: from nltk import RegexpParser
# Also needed here: from collections import OrderedDict; from nltk import Tree
def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR:
            {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
            {<V.*>}  # terminated with Verbs
            NP:
            {<NBAR>}
            {<NBAR><APPR><NBAR>}  # Above, connected with in/of/etc...
        """
    elif language == 'english':
        #Taken from Su Nam Kim Paper...
        grammar = r"""
            NBAR:
            {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
            {<V.*>}  # terminated with Verbs
            NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
    
    chunker = RegexpParser(grammar)
    
    postoks = []
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))
       
    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [ ' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list 
Author: UKPLab | Project: acl2017-interactive_summarizer | Lines: 39 | Source: data_helpers.py

Example 9: nltk_parse_clause

# Required import: import nltk
# Or: from nltk import RegexpParser
def nltk_parse_clause(sentence):
  """
  Natural Language Toolkit: code_cascaded_chunker
  http://www.nltk.org/book/ch07.html#code-cascaded-chunker
  """
  grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
  cp = nltk.RegexpParser(grammar)
  #sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),  ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
  parsed_sentence = cp.parse(sentence)
  #print('parsed_sentence=', parsed_sentence)
  return parsed_sentence
Author: nicolashernandez | Project: PyRATA | Lines: 17 | Source: do_benchmark.py
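With the return value added above, the commented-out sample sentence can be fed straight in:

sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(nltk_parse_clause(sentence))
# (S (NP Mary/NN) saw/VBD (CLAUSE (NP the/DT cat/NN) (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))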

Example 10: extract_candidates

# Required import: import nltk
# Or: from nltk import RegexpParser
def extract_candidates(text_obj, no_subset=False):
    """
    Based on part of speech, return a list of candidate phrases.
    The language is taken from text_obj.lang (currently en, fr and de are supported).
    :param text_obj: Input text representation; see @InputTextObj
    :param no_subset: if True, drop any candidate that is a subset of another candidate
    :return: list of candidate phrases (strings)
    """

    keyphrase_candidate = set()

    np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
    trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence

    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each nounphrase
            # Concatenate the token with a space
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))

    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}

    if no_subset:
        keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
    else:
        keyphrase_candidate = list(keyphrase_candidate)

    return keyphrase_candidate 
Author: swisscom | Project: ai-research-keyphrase-extraction | Lines: 29 | Source: extractor.py

Example 11: generate_tree

# Required import: import nltk
# Or: from nltk import RegexpParser
# Also needed here: from unidecode import unidecode; a module-level `grammar` is defined elsewhere in the source file
def generate_tree(text):
    text = text.replace('“', '"')  # to preserve quotes in the text, primarily news content
    text = text.replace('”', '"')
    text = text.replace('’', "'")
    text = unidecode(text)
    chunker = nltk.RegexpParser(grammar)
    tokenized_text = nltk.tokenize.word_tokenize(text)
    postoks = nltk.tag.pos_tag(tokenized_text)
    tree = chunker.parse(postoks)
    return tree 
Author: manasRK | Project: word2vec-recommender | Lines: 12 | Source: phrases_extractor.py
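generate_tree relies on a module-level grammar that the snippet does not show; a sketch assuming a simple noun-phrase grammar in its place:

grammar = r"NP: {<JJ>*<NN.*>+}"  # assumed for illustration; the real grammar lives elsewhere in phrases_extractor.py
print(generate_tree("Machine learning has transformed modern search engines."))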

Example 12: keywords_syntax_nltk

# Required import: import nltk
# Or: from nltk import RegexpParser
# Note: NLP_sent, NLP_word and get_terms are helpers from the Valx project
def keywords_syntax_nltk(sentence):
	global text_terms
	terms = []
	phrases = NLP_sent.phrase_splitting(sentence)		
	for phrase in phrases:
		if len(phrase) <= 2: # e.g.'ii'
			continue
		if phrase in text_terms:
			phrase_terms = text_terms[phrase]
		else:
			#-------------------POS tagging output
			words = NLP_word.word_splitting(phrase.lower())
			pos_tags = NLP_word.word_pos_tagging(words)
	
			#-------------------parsed tree
			grammar = r"""
				NBAR:
					# Nouns and Adjectives, terminated with Nouns
					{<NN.*|JJ>*<NN.*>}
			
				NP:
					{<NBAR>}
					# Above, connected with in/of/etc...
					{<NBAR><IN><NBAR>}
			"""
		
			cp = nltk.RegexpParser(grammar, loop=2)
			cp_tree = cp.parse(pos_tags)
			phrase_terms = get_terms(cp_tree)
			text_terms[phrase] = phrase_terms

		terms += phrase_terms 

	keywords = []
	for term in terms:
		if len(term) > 0:
			keywords.append(' '.join(term))
	return keywords


# Ref to https://gist.github.com/879414
#from nltk.stem.wordnet import WordNetLemmatizer 
Author: Tony-Hao | Project: Valx | Lines: 44 | Source: sentence_keywords.py

Example 13: fetch_all_organizations

# Required import: import nltk
# Or: from nltk import RegexpParser
# Note: `utilities` is a helper module from the cvscan project
def fetch_all_organizations(resume_text):
  organizations = set()
  tokenized_sentences = nltk.sent_tokenize(resume_text)

  # Custom grammar with NLTK
  # NP - Noun Phrase
  # NN - Noun
  # NNP - Proper Noun
  # V - Verb
  # JJ - Adjective

  # In a sentence that contains NN NNP V NN NN JJ NN,
  # The noun-phrases fetched are:
  # NP: NN NNP
  # NP: NN NN
  # NP: NN

  # Ex, "Application Developer at Delta Force"
  # => ["Application Developer", "Delta Force"]

  grammar = r"""NP: {<NN|NNP>+}"""
  parser = nltk.RegexpParser(grammar)

  avoid_organizations = utilities.get_avoid_organizations()

  for sentence in tokenized_sentences:

    # tags all parts of speech in the tokenized sentences
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # then chunks with the custom grammar
    # np_chunks are instances of class nltk.tree.Tree
    np_chunks = parser.parse(tagged_words)
    noun_phrases = []

    for np_chunk in np_chunks:
      if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
        # if np_chunk is of grammar 'NP', create a space-separated string of all leaves under the 'NP' tree
        noun_phrase = ""
        for (org, tag) in np_chunk.leaves():
          noun_phrase += org + ' '

        noun_phrases.append(noun_phrase.rstrip())

    # Using name entity chunker to get all the organizations
    chunks = nltk.ne_chunk(tagged_words)
    for chunk in chunks:
      if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
        (organization, tag) = chunk[0]

        # if the organization appears in a noun_phrase, that noun_phrase very likely contains the employer name;
        # e.g., "Delta Force" is added to organizations even if only "Delta" is recognized as an organization, because "Delta Force" is a noun-phrase
        for noun_phrase in noun_phrases:
          if organization in noun_phrase and organization not in avoid_organizations:
            organizations.add(noun_phrase.capitalize())

  return organizations 
Author: skcript | Project: cvscan | Lines: 59 | Source: language_parser.py


Note: The nltk.RegexpParser method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many programmers; copyright in the source code remains with the original authors, and distribution and use are subject to each project's license. Do not reproduce without permission.