This article compiles typical usage examples of Python's nltk.ne_chunk function. If you have been wondering what exactly ne_chunk does and how to use it, the curated code examples below may help.
The following presents 15 code examples of the ne_chunk function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
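As a quick orientation before the examples: nltk.ne_chunk expects POS-tagged tokens, so tokenization and tagging always come first. A minimal self-contained sketch (the sample sentence is ours, not taken from any example below):

import nltk

sentence = "Mark Pedersen works at Google in New York."
tokens = nltk.word_tokenize(sentence)   # split into word tokens
tagged = nltk.pos_tag(tokens)           # attach part-of-speech tags
tree = nltk.ne_chunk(tagged)            # group tagged tokens into NE subtrees
print(tree)                             # an nltk.Tree; NE subtrees carry labels like PERSON, GPE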
Example 1: extract_entities2
def extract_entities2(text):
    entities = []
    """t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    t2.evaluate(test_sents)"""
    for sentence in sent_tokenize(text):
        print(sentence)
        # POS-tag the tokens, then re-tag with the custom tagger before chunking.
        tags = pos_tag(nltk.word_tokenize(sentence))
        tags = tagear(tags)
        chunks = ne_chunk(tags)
        for chunk in chunks:
            if hasattr(chunk, 'node'):
                print(chunk)
        entities.extend([chunk for chunk in chunks if hasattr(chunk, 'node')])
    return entities
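A note on the hasattr(chunk, 'node') test used above: it targets the pre-3.0 NLTK tree API, where NE subtrees exposed a .node attribute. On NLTK 3.x that check no longer works reliably; a sketch of the equivalent filter (our adaptation, not part of the original example):

from nltk import Tree

def named_entity_chunks(chunks):
    # On NLTK 3.x, NE subtrees are nltk.Tree instances with a .label()
    # such as PERSON or ORGANIZATION; plain tokens are (word, tag) tuples.
    return [chunk for chunk in chunks if isinstance(chunk, Tree)]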
Example 2: test_nltkNERParsing
def test_nltkNERParsing(self):
    testString = 'Natural Sciences and Engineering Research Council of Canada'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    getGPEs = []
    for treeBranch in chunked:
        if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
            getGPEs.append(str(treeBranch))
    self.assertEqual(1, len(getGPEs))

    testString = 'Milwaukee Foundation'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
    # returns (S (GPE New/NNP) (ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

    testString = 'New England Board of Higher Education'
    unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
    posTagged = nltk.pos_tag(unigrams)
    chunked = nltk.ne_chunk(posTagged)
Example 3: extractNE
def extractNE(sentence, withClass):
    # Tokenize the sentence into words before tagging and chunking.
    words = nltk.word_tokenize(sentence)
    if withClass:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=False)
        return extractNEwithClass(tree)
    else:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=True)
        return extractNEwithoutClass(tree)
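The helpers extractNEwithClass and extractNEwithoutClass are not shown in the source. One plausible implementation, written against the NLTK 3.x tree API (an assumption, since the originals are not included):

from nltk import Tree

def extractNEwithClass(tree):
    # Yield (label, entity) pairs, e.g. ('PERSON', 'Mark Pedersen').
    return [(subtree.label(), ' '.join(word for word, tag in subtree.leaves()))
            for subtree in tree if isinstance(subtree, Tree)]

def extractNEwithoutClass(tree):
    # With binary=True every entity subtree is labelled 'NE'; keep only the text.
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree if isinstance(subtree, Tree)]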
Example 4: nameEntityExtract
def nameEntityExtract(document):
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    print(sentences[0])
    print("the length of sentences is: " + str(len(sentences)))
    sent = sentences[0]
    print(nltk.ne_chunk(sent, binary=True))
Example 5: English_NER
def English_NER(sentence):
    # With binary=True, named entities are tagged simply as NE.
    print('Named entities tagged only as NE:')
    print(nltk.ne_chunk(sentence, binary=True))
    # Without it, entities carry type labels such as PERSON, ORGANIZATION, GPE.
    print('Named entities with type labels such as PERSON, ORGANIZATION, GPE:')
    print(nltk.ne_chunk(sentence))
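Note that the sentence argument must already be POS-tagged, since nltk.ne_chunk operates on tagged tokens. A brief usage sketch (the sample sentence is our own):

tagged = nltk.pos_tag(nltk.word_tokenize("Antonio joined Microsoft in Seattle."))
English_NER(tagged)
# binary=True wraps each entity in a single NE node; without it,
# the nodes carry labels such as PERSON, ORGANIZATION, GPE.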
Example 6: main
def main():
    sent = nltk.corpus.treebank.tagged_sents()[22]
    print("sent (nltk):", sent)
    # print(nltk.ne_chunk(sent, binary=True))
    # print(nltk.ne_chunk(sent))
    sent = ie_preprocess("""Injured personnel consisting of six Schlum employees were immediately transported
        to nearby hospitals and most of them (were)
        discharged after having received treatment""")
    print(sent)
    print(nltk.ne_chunk(sent[0]))
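ie_preprocess is not defined in this excerpt. A plausible definition, following the standard preprocessing pipeline from the NLTK book (an assumption about what the author used; it returns one POS-tagged token list per sentence, which is why sent[0] is chunked above):

def ie_preprocess(document):
    # Split into sentences, tokenize each, then POS-tag each token list.
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    return [nltk.pos_tag(sent) for sent in sentences]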
Example 7: process_contents
def process_contents():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged)               # typed chunks: "White", "House" tagged separately
            namedEnt = nltk.ne_chunk(tagged, binary=True)  # binary: "White House" as one NE chunk
            namedEnt.draw()
    except Exception as e:
        print(str(e))
Example 8: process_content
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt1 = nltk.ne_chunk(tagged)               # all named entities, with category labels
            namedEnt2 = nltk.ne_chunk(tagged, binary=True)  # named entities without category labels
            namedEnt2.draw()
    except Exception as e:
        print(str(e))
Example 9: process_content
def process_content():
    for i in custom_tokenized[5:]:
        words = word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged)
        print(namedEnt)
Example 10: get_entities
def get_entities(self, sentences):
    """Return a dictionary with the results of the named entity
    recognition analysis.

    Args:
        sentences: the list of sentences.
    Returns:
        dictionary mapping each entity label to a sorted list of entities.
    """
    entities = dict([])
    # Tokenization
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    # Part-of-speech tagging
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    # Chunking
    chunked_nes = [nltk.ne_chunk(c) for c in pos_tagged_tokens]
    for tree in chunked_nes:
        # Height-2 subtrees are the NE chunks; skip the sentence root 'S'.
        for s in tree.subtrees(lambda t: t.height() == 2):
            if s.label() != 'S':
                entity = ' '.join(i[0] for i in s.leaves())
                if s.label() in entities.keys():
                    if entity not in entities[s.label()]:
                        entities[s.label()].append(entity)
                        entities[s.label()].sort()
                else:
                    entities[s.label()] = [entity]
    return entities
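A quick usage sketch; the sentences, the analyzer object hosting the method, and the exact output are illustrative assumptions:

sentences = ["Barack Obama visited Paris.", "Angela Merkel met Obama in Berlin."]
print(analyzer.get_entities(sentences))
# e.g. {'GPE': ['Berlin', 'Paris'], 'PERSON': ['Angela Merkel', 'Barack Obama', 'Obama']}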
Example 11: parse_questions
def parse_questions():
    print("Parsing Questions...")
    parsed_questions = {}
    with open(DIR + '/questions.txt', 'r') as f:
        data = f.read()
    questions = re.split(r'[\s]*</top>[\s]*', data)
    if len(questions[-1].strip()) == 0:
        questions.pop()
    qc = QuestionClassifier.QuestionClassifier()
    for question in questions:
        question_number = int(re.search(r"<num>[\s]*Number:[\s]*([0-9]+)", question).group(1))
        question = re.search(r"<desc>[\s]*Description:[\s]*([a-zA-Z0-9\-\?\'\. ]+)", question).group(1)
        question_words = nltk.word_tokenize(question)
        question_pos = nltk.pos_tag(question_words)
        question_nes = nltk.ne_chunk(question_pos)
        question_tree = Chunker.chunker.parse(question_pos)
        question_classification = qc.classify(question)
        qwords, nouns, nes = [], [], []
        for part in question_nes:
            try:
                # NE subtrees expose .node on the old (pre-3.0) NLTK tree API.
                nes.append((part.node, part.leaves()[0][0]))
            except:
                if part[1] == 'WP' or part[1] == 'WRB':
                    qwords.append(part[0])
                elif part[1] == 'NN' or part[1] == 'NNP':
                    nouns.append(part[0])
        parsed_questions[question_number] = {
            "question": question,
            "pos": question_pos,
            "ne": question_nes,
            "parse_tree": question_tree,
            "question_classification": question_classification,
            "question_words": qwords,
            "nouns": nouns,
            "ne_words": nes,
        }
    with open(DIR + '/parsed_questions.txt', 'wb') as f:
        pickle.dump(parsed_questions, f)
Example 12: extract_normal_ne
def extract_normal_ne(self, text):
    result = []
    for sent in sent_tokenize(text) if text else []:
        for chunk in ne_chunk(pos_tag(word_tokenize(sent))):
            if hasattr(chunk, "node"):
                result.append(" ".join([c[0] for c in chunk.leaves()]))
    return result
Example 13: extract_concepts
def extract_concepts(text):
    """
    Uses the NLTK natural language processing library to
    extract from a text the essential terms that appear in it.
    """
    concepts = []
    try:
        ignored_words = corpus.stopwords.words('english')
        ignored_words.append("n't")
        appeared = {}
        tokenized = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokenized)
        named_entities = nltk.ne_chunk(tagged)
        for ne in named_entities.leaves():
            # if ne[1] in ('NNS', 'NNP', 'NN'):
            if (len(ne[0]) > 2 and ne[0].lower() not in ignored_words
                    and not (ne[0].startswith("http") or ne[0].startswith("//"))):
                name = ne[0]
                if name in appeared:
                    continue
                concepts.append(name)
                appeared[name] = True
    except:
        print("extract concepts failed:", sys.exc_info())
    return concepts
Example 14: ne_tag
def ne_tag(sentences):
    tagged = raw_trigram_tag(sentences, tagger_file="tagger.pkl")[1]
    fin = []
    for tagged_sent in tagged:
        fin.append(nltk.ne_chunk(tagged_sent))
    return fin
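raw_trigram_tag is not defined in this excerpt; judging from the [1] indexing, it appears to return the tagger together with the tagged sentences. A hypothetical sketch consistent with that usage (names and behavior are assumptions):

import pickle

def raw_trigram_tag(sentences, tagger_file="tagger.pkl"):
    # Load a previously trained tagger (e.g. a trigram tagger with backoff)
    # and apply it to each tokenized sentence.
    with open(tagger_file, "rb") as f:
        tagger = pickle.load(f)
    tagged = [tagger.tag(nltk.word_tokenize(s)) for s in sentences]
    return tagger, tagged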
Example 15: processor
def processor(data):
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        # Pull NE words out of the tree's string form, and adjectives out of the tags.
        entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
        descriptives = re.findall(r'\(\'(\w*)\'.\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        elif str(entities) == '_blank':
            pass
        else:
            print('Named: ', entities[0])
            print('Description: ')
            for eachDesc in descriptives:
                print(eachDesc)
                currentTime = time.time()
                dateStamp = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
                namedEntity = entities[0]
                relatedWord = eachDesc
                c.execute("INSERT INTO knowledgeBase (unix, dateStamp, namedEntity, relatedWord) VALUES (?,?,?,?)",
                          (currentTime, dateStamp, namedEntity, relatedWord))
                conn.commit()
    except Exception as e:
        print('failed in the first try of processor')
        print(str(e))