This article collects typical usage examples of the nltk.RegexpParser method in Python. If you are wondering what exactly nltk.RegexpParser does and how to use it, the curated code examples below may help. You can also explore further usage examples of nltk, the module this method belongs to.
The following presents 14 code examples of nltk.RegexpParser, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
Example 1: text_to_num
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def text_to_num(text):
    tokenized = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokenized)
    print(tags)
    chunkPattern = r""" Chunk0: {((<NN|CD.?|RB>)<CD.?|VBD.?|VBP.?|VBN.?|NN.?|RB.?|JJ>*)<NN|CD.?>} """
    chunkParser = nltk.RegexpParser(chunkPattern)
    chunkedData = chunkParser.parse(tags)
    print(chunkedData)
    for subtree in chunkedData.subtrees(filter=lambda t: t.label() == "Chunk0"):
        exp = ""
        for l in subtree.leaves():
            exp += str(l[0]) + " "
        exp = exp[:-1]
        print(exp)
        try:
            # t2n is assumed to be a words-to-number helper imported elsewhere
            text = text.replace(exp, str(t2n.text2num(exp)))
        except Exception as e:
            print("error text2num ->", e.args)
    print("text2num -> ", text)
    return text
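A minimal usage sketch (not part of the original source): it assumes the NLTK tokenizer and tagger data are installed, and that t2n is bound to a words-to-number helper such as the text2num package.

import nltk
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Hypothetical input; "twenty five" should be chunked and, if t2n
# resolves it, rewritten as "25".
print(text_to_num("I walked twenty five miles in two days"))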
Example 2: __init__
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def __init__(self):
    self.grammar = r"""
        VP: {<ADJ_SIM><V_PRS>}
        VP: {<ADJ_INO><V.*>}
        VP: {<V_PRS><N_SING><V_SUB>}
        NP: {<N_SING><ADJ.*><N_SING>}
        NP: {<N.*><PRO>}
        VP: {<N_SING><V_.*>}
        VP: {<V.*>+}
        NP: {<ADJ.*>?<N.*>+ <ADJ.*>?}
        DNP: {<DET><NP>}
        PP: {<ADJ_CMPR><P>}
        PP: {<ADJ_SIM><P>}
        PP: {<P><N_SING>}
        PP: {<P>*}
        DDNP: {<NP><DNP>}
        NPP: {<PP><NP>+}
    """
    self.cp = nltk.RegexpParser(self.grammar)
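The tag names (N_SING, V_PRS, ADJ_SIM, ...) suggest a Persian tagset. A hypothetical call on a hand-tagged sentence; the class name, tokens, and tags below are illustrative assumptions, not taken from the original project.

chunker = Chunker()  # hypothetical name for the class this __init__ belongs to
tagged = [("او", "PRO"), ("کتاب", "N_SING"), ("را", "P"), ("خواند", "V_PRS")]
print(chunker.cp.parse(tagged))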
Example 3: myParser
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def myParser():
    grammar = '\n'.join([
        'NP: {<DT>*<NNP>}',
        'NP: {<JJ>*<NN>}',
        'NP: {<NNP>+}',
    ])
    return nltk.RegexpParser(grammar)
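A quick check of the three NP rules (the tagged sentence is made up for illustration):

cp = myParser()
tagged = [("The", "DT"), ("Eiffel", "NNP"), ("Tower", "NNP"),
          ("is", "VBZ"), ("a", "DT"), ("tall", "JJ"), ("structure", "NN")]
print(cp.parse(tagged))
# Rules are tried in order: <DT>*<NNP> fires before <JJ>*<NN> and <NNP>+.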
Example 4: test_baseline
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from nltk.corpus import conll2000
def test_baseline():
    # An empty grammar proposes no chunks at all; it serves as a baseline
    # for the evaluation below.
    cp = nltk.RegexpParser("")
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    # print(len(test_sents[0]))
    # print(test_sents[0])
    print(cp.evaluate(test_sents))
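The two commented prints hint at what the corpus looks like; a short inspection sketch (requires nltk.download('conll2000')):

import nltk
from nltk.corpus import conll2000

nltk.download('conll2000', quiet=True)
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
print(test_sents[0])       # an nltk.Tree whose NP subtrees are the gold chunks
print(len(test_sents[0]))  # number of top-level nodes (chunks plus loose tokens)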
Example 5: test_regexp
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from nltk.corpus import conll2000
def test_regexp():
    # Chunk any run of tags beginning with C, D, J, N or P (CD, DT, JJ, NN, PRP, ...)
    grammar = r"NP: {<[CDJNP].*>+}"
    cp = nltk.RegexpParser(grammar)
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    print(cp.evaluate(test_sents))
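For orientation, the NLTK book reports that the empty baseline of Example 4 scores roughly 43% IOB accuracy with 0% precision and recall, while this one-line grammar reaches roughly 87% IOB accuracy and an F-measure near 69%; exact figures vary with the NLTK version. Both evaluations can be run back to back:

import nltk
from nltk.corpus import conll2000

nltk.download('conll2000', quiet=True)
test_baseline()  # prints the ChunkParse score of the do-nothing chunker
test_regexp()    # prints the (much better) score of the regexp grammar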
Example 6: extract_experience
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from nltk.stem import WordNetLemmatizer
#                from nltk.corpus import stopwords
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)
    # remove stop words and lemmatize
    filtered_sentence = [w for w in word_tokens
                         if w not in stop_words
                         and wordnet_lemmatizer.lemmatize(w) not in stop_words]
    sent = nltk.pos_tag(filtered_sentence)
    # chunk runs of proper nouns
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    test = []
    for vp in list(cs.subtrees(filter=lambda x: x.label() == 'P')):
        test.append(" ".join([i[0] for i in vp.leaves() if len(vp.leaves()) >= 2]))
    # Search for the word 'experience' in each chunk and keep the text after it
    # (the +10 offset is len('experience'))
    x = [x[x.lower().index('experience') + 10:]
         for i, x in enumerate(test)
         if x and 'experience' in x.lower()]
    return x
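A hypothetical invocation (the resume line is made up; the punkt, tagger, stopwords, and wordnet data must be downloaded first):

import nltk
for pkg in ('punkt', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(pkg, quiet=True)

print(extract_experience(
    "Experience Senior Software Engineer At Google Since 2015"))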
Example 7: setup_extractor
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def setup_extractor(self):
    # PunktSentenceSplitter, self.grammars and self.lemma_to_token
    # are defined by the enclosing project.
    self.splitter = PunktSentenceSplitter(self.language)
    grammar = self.grammars.get(self.language)
    if grammar:
        self.parser = RegexpParser(grammar)
    else:
        raise ValueError(
            "Invalid or unsupported language: '%s'. Please use one of the currently supported ones: %s" % (
                self.language, self.grammars.keys())
        )
    # iteritems marks this as Python 2 code; on Python 3 use .items()
    for lemma, match_tokens in self.lemma_to_token.iteritems():
        self.lemma_to_token[lemma] = set([match.lower() for match in match_tokens])
Example 8: get_parse_info
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from collections import OrderedDict
#                from nltk import Tree
def get_parse_info(parsestr, stemmer, language, stoplist):
    hash_token_pos = OrderedDict()
    if language == 'german':
        grammar = r"""
            NBAR:
                {<N.*|ADJ.*>*<N.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><APPR><NBAR>}  # Above, connected with in/of/etc...
        """
    if language == 'english':
        # Taken from the Su Nam Kim paper...
        grammar = r"""
            NBAR:
                {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
            VP:
                {<V.*>}  # terminated with Verbs
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
        """
    chunker = RegexpParser(grammar)
    postoks = []
    for i in Tree.fromstring(parsestr).subtrees():
        if i.height() == 2:  # height-2 subtrees are (POS word) preterminals
            word, pos = i[0], i.label()
            hash_token_pos[stemmer.stem(word)] = word + u"::" + pos
            postoks.append((word, pos))
    chunk_tree = chunker.parse(postoks)
    phrases = get_terms(chunk_tree, stemmer, stoplist)
    phrase_list = [' '.join(term) for term in phrases if term]
    return hash_token_pos, phrase_list
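A sketch of a call; it assumes get_terms is the phrase-extraction helper from the same source module, and the bracketed parse string is a made-up Penn-style constituency parse:

from nltk.stem.porter import PorterStemmer

parsestr = "(S (NP (DT the) (JJ quick) (NN fox)) (VP (VBD jumped)))"
tokens, phrases = get_parse_info(parsestr, PorterStemmer(), 'english',
                                 stoplist={'the'})
print(tokens)   # OrderedDict mapping stems to "word::POS"
print(phrases)  # noun phrases that survive the stoplist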
Example 9: nltk_parse_clause
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def nltk_parse_clause(sentence):
    """
    Natural Language Toolkit: code_cascaded_chunker
    http://www.nltk.org/book/ch07.html#code-cascaded-chunker
    """
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}               # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
        CLAUSE: {<NP><VP>}           # Chunk NP, VP
    """
    cp = nltk.RegexpParser(grammar)
    # sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
    #             ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
    parsed_sentence = cp.parse(sentence)
    # print('parsed_sentence=', parsed_sentence)
    return parsed_sentence
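Run on the NLTK book's example sentence (now that the tree is returned), a single pass produces the partially nested structure shown in the book:

import nltk

sentence = [("Mary", "NN"), ("saw", "VBD"), ("the", "DT"), ("cat", "NN"),
            ("sit", "VB"), ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(nltk_parse_clause(sentence))
# (S
#   (NP Mary/NN)
#   saw/VBD
#   (CLAUSE
#     (NP the/DT cat/NN)
#     (VP sit/VB (PP on/IN (NP the/DT mat/NN)))))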
Example 10: extract_candidates
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def extract_candidates(text_obj, no_subset=False):
    """
    Based on part of speech, return a list of candidate phrases
    :param text_obj: Input text representation, see @InputTextObj
    :param no_subset: if True, won't include a candidate that is a subset of another candidate
    :return: list of candidate phrases (strings)
    """
    keyphrase_candidate = set()
    np_parser = nltk.RegexpParser(get_grammar(text_obj.lang))  # Noun phrase parser
    trees = np_parser.parse_sents(text_obj.pos_tagged)  # Generator with one tree per sentence
    for tree in trees:
        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):  # For each noun phrase
            # Concatenate the tokens with spaces
            keyphrase_candidate.add(' '.join(word for word, tag in subtree.leaves()))
    keyphrase_candidate = {kp for kp in keyphrase_candidate if len(kp.split()) <= 5}
    if no_subset:
        keyphrase_candidate = unique_ngram_candidates(keyphrase_candidate)
    else:
        keyphrase_candidate = list(keyphrase_candidate)
    return keyphrase_candidate
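get_grammar and unique_ngram_candidates belong to the surrounding project and are not shown; a plausible stand-in for the English grammar, modeled on the NBAR pattern used in Example 8 (an assumption, not the project's actual grammar):

def get_grammar(lang):
    # Hypothetical stub: adjectives and nouns terminated by a noun.
    grammars = {'en': "NP: {<JJ.*>*<NN.*>+}"}
    return grammars[lang]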
Example 11: generate_tree
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from unidecode import unidecode
def generate_tree(text):
    # normalize curly quotes to preserve quoted spans, primarily in news content
    text = text.replace('“', '"')
    text = text.replace('”', '"')
    text = text.replace('’', "'")
    text = unidecode(text)
    # `grammar` is a module-level chunk grammar defined elsewhere in the source project
    chunker = nltk.RegexpParser(grammar)
    tokenized_text = nltk.tokenize.word_tokenize(text)
    postoks = nltk.tag.pos_tag(tokenized_text)
    tree = chunker.parse(postoks)
    return tree
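The module-level grammar is outside the excerpt; a hedged demo that supplies one (any NP grammar works for illustration):

import nltk
from unidecode import unidecode

grammar = "NP: {<DT>?<JJ>*<NN.*>+}"  # illustrative assumption

print(generate_tree('The “quick” fox jumped over the lazy dog.'))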
Example 12: extract_experience
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
# Also requires: from nltk.stem import WordNetLemmatizer
#                from nltk.corpus import stopwords
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text
    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)
    # remove stop words and lemmatize
    filtered_sentence = [
        w for w in word_tokens
        if w not in stop_words
        and wordnet_lemmatizer.lemmatize(w) not in stop_words
    ]
    sent = nltk.pos_tag(filtered_sentence)
    # chunk runs of proper nouns
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cs = cp.parse(sent)
    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)
    test = []
    for vp in list(
        cs.subtrees(filter=lambda x: x.label() == 'P')
    ):
        test.append(" ".join([
            i[0] for i in vp.leaves()
            if len(vp.leaves()) >= 2])
        )
    # Search for the word 'experience' in each chunk and
    # keep the text after it
    x = [
        x[x.lower().index('experience') + 10:]
        for i, x in enumerate(test)
        if x and 'experience' in x.lower()
    ]
    return x
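The +10 offset in the final slice equals len('experience'); a two-line illustration on a made-up chunk:

chunk = "Experience Google"
print(chunk[chunk.lower().index('experience') + 10:])  # prints " Google"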
Example 13: keywords_syntax_nltk
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def keywords_syntax_nltk(sentence):
    # NLP_sent, NLP_word, get_terms and text_terms are helpers and state
    # from the enclosing project.
    global text_terms
    terms = []
    phrases = NLP_sent.phrase_splitting(sentence)
    for phrase in phrases:
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # ------------------- POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # ------------------- parsed tree
            grammar = r"""
                NBAR:
                    # Nouns and Adjectives, terminated with Nouns
                    {<NN.*|JJ>*<NN.*>}
                NP:
                    {<NBAR>}
                    # Above, connected with in/of/etc...
                    {<NBAR><IN><NBAR>}
            """
            # loop=2 applies the cascaded grammar twice per parse
            cp = nltk.RegexpParser(grammar, loop=2)
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms
    keywords = []
    for term in terms:
        if len(term) > 0:
            keywords.append(' '.join(term))
    return keywords

# Ref: https://gist.github.com/879414
# from nltk.stem.wordnet import WordNetLemmatizer
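The loop=2 argument reapplies the entire cascade a second time, so later rules can consume chunks built on the first pass. The NLTK book demonstrates this with the clause grammar of Example 9:

import nltk

grammar = r"""
    NP: {<DT|JJ|NN.*>+}
    PP: {<IN><NP>}
    VP: {<VB.*><NP|PP|CLAUSE>+$}
    CLAUSE: {<NP><VP>}
"""
cp = nltk.RegexpParser(grammar, loop=2)
sentence = [("John", "NNP"), ("thinks", "VBZ"), ("Mary", "NN"),
            ("saw", "VBD"), ("the", "DT"), ("cat", "NN"), ("sit", "VB"),
            ("on", "IN"), ("the", "DT"), ("mat", "NN")]
print(cp.parse(sentence))
# On the second pass the VP rule for "saw" absorbs the CLAUSE built on
# the first pass, yielding fully nested clauses.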
Example 14: fetch_all_organizations
# Required import: import nltk [as alias]
# Or: from nltk import RegexpParser [as alias]
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective
    # In a sentence that contains "NN NNP V NN NN JJ NN",
    # the noun phrases fetched are:
    # NP: NN NNP
    # NP: NN NN
    # NP: NN
    # E.g. "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)
    avoid_organizations = utilities.get_avoid_organizations()
    for sentence in tokenized_sentences:
        # tag all parts of speech in the tokenized sentence
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        # then chunk with the custom grammar;
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []
        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is an 'NP' chunk, create a space-separated
                # string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())
        # Use the named-entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]
                # If the organization appears in a noun phrase, there is a high
                # chance the noun phrase contains the employer name: "Delta Force"
                # is added even if only "Delta" is recognized as an organization,
                # because "Delta Force" is a noun phrase.
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())
    return organizations
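A hedged usage sketch: utilities.get_avoid_organizations comes from the surrounding project, so a stub stands in for it here, and the NE chunker needs the maxent_ne_chunker and words data in addition to punkt and the tagger:

import nltk

for pkg in ('punkt', 'averaged_perceptron_tagger',
            'maxent_ne_chunker', 'words'):
    nltk.download(pkg, quiet=True)

class utilities:  # stand-in for the project's utilities module
    @staticmethod
    def get_avoid_organizations():
        return set()

print(fetch_all_organizations(
    "I worked as an Application Developer at Delta Force."))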