This article collects typical usage examples of the nltk.batch_ne_chunk function in Python. If you are wondering what batch_ne_chunk does, how to call it, or want to see it used in real code, the hand-picked examples below should help.
A total of 15 batch_ne_chunk code examples are shown, ordered by popularity.
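Most of the examples below share the same pipeline: sentence tokenization, word tokenization, POS tagging, then batch named-entity chunking. The minimal sketch below shows that pipeline end to end; it assumes an NLTK 2.x install with the punkt tokenizer, POS tagger and NE chunker models downloaded, and the sample sentence and printed output are illustrative only. (In NLTK 3 the same call is spelled nltk.ne_chunk_sents.)
import nltk

text = "Barack Obama visited Berlin in 2013."           # illustrative input, not taken from any example below
sentences = nltk.sent_tokenize(text)                     # split raw text into sentences
tokenized = [nltk.word_tokenize(s) for s in sentences]   # split each sentence into word tokens
tagged = [nltk.pos_tag(s) for s in tokenized]            # part-of-speech tag each token list
chunked = nltk.batch_ne_chunk(tagged, binary=True)       # one nltk Tree per sentence; NE subtrees labelled "NE"

for tree in chunked:
    for subtree in tree.subtrees():
        if subtree.node == "NE":                         # with binary=False the labels are PERSON, GPE, etc.
            print ' '.join(word for word, pos in subtree.leaves())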
Example 1: __init__
def __init__(self, query_string):
    self.query_string = query_string
    sentences = nltk.sent_tokenize(query_string)
    self.tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    self.tagged_sentences = [nltk.pos_tag(sentence) for sentence in self.tokenized_sentences]
    self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=True)
    self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=False)
    self.temporal_sentences = timex.ground(timex.tag(query_string), mx.DateTime.gmt())
Example 2: extract_entities
def extract_entities(shorttext_rows, site):
    # { short text id -> (noun entities, named entities) }
    shorttext_entities = {}
    # nltk entity classes
    nltk_entity_types = __get_nltk_entity_types__()
    for shorttext_row in shorttext_rows:
        shorttext_id = shorttext_row[0]
        shorttext_str = shorttext_row[1]
        noun_entities = []
        named_entities = []
        sentences = nltk.sent_tokenize(shorttext_str)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
        for tree in chunked_sentences:
            __extract_valid_entities__(tree, (noun_entities, named_entities), nltk_entity_types)
        shorttext_entities[shorttext_id] = (noun_entities, named_entities)
    # Cache extracted entities
    pkl_util.write_pickle(__output_str__, shorttext_entities, __get_nltk_entities_cache_path__(site))
Example 3: _nltk_ner
def _nltk_ner(self, text, searched_entity, question):
    # Entity Classification
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
    tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
    ne_chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
    # Entity Extraction
    entities = []
    all_entities = []
    for tree in ne_chunked_sentences:
        for child in tree:
            if isinstance(child, Tree):
                entity = " ".join([word for (word, pos) in child.leaves()])
                if child.node == searched_entity:
                    entities.append(entity)
                all_entities.append(entity)
    if 'OTHER' == searched_entity:
        entities += self._other_recognition(tagged_sentences, all_entities, question)
    if 'NUMBER' == searched_entity:
        entities += self._number_recognition(text, tagged_sentences, all_entities)
    return entities
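As a hedged illustration of the subtree filtering used in _nltk_ner, the sketch below prints each extracted entity together with its label; the default multiclass chunker emits labels such as PERSON, ORGANIZATION, GPE and LOCATION, which is what searched_entity is compared against. The sample sentence and the expected output are illustrative only.
import nltk
from nltk.tree import Tree

tagged = [nltk.pos_tag(nltk.word_tokenize("Angela Merkel spoke in Paris."))]  # illustrative sentence
for tree in nltk.batch_ne_chunk(tagged, binary=False):
    for child in tree:
        if isinstance(child, Tree):
            print child.node, '->', ' '.join(word for word, pos in child.leaves())
# expected output along the lines of:
#   PERSON -> Angela Merkel
#   GPE -> Paris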
Example 4: obtenerNEs
def obtenerNEs(lista):
    listaGeneral = []
    for (tweet, listaPalabras, clasificacion, diferenciaProbabilidad) in lista:
        # Only evaluate the tweets classified as positive
        print clasificacion
        if clasificacion == 'positive':
            sentences = nltk.tokenize.sent_tokenize(tweet)
            # Split instead of tokenizing so that user mentions can be extracted;
            # word_tokenize separates the @, so the mentions could not be filtered
            nuevaSentences = []
            for s in sentences:
                subLista = quitarExcedenteSimple(s.split())
                nuevaSentences.append(' '.join(subLista))
            tokens = [nltk.tokenize.word_tokenize(s) for s in nuevaSentences]
            pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
            ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens, binary=True)
            listaNEs = []
            for subArbol in ne_chunks:
                traverse(subArbol, listaNEs, False)
            if listaNEs:
                listaGeneral.append((tweet, listaPalabras, listaNEs))
    web.debug('Tweets with NEs:' + str(len(listaGeneral)))
    return listaGeneral
Example 5: process_entities
def process_entities(sentence):
    words = []
    #print sentence
    # now break the sentence into tokens
    tokens = nltk.word_tokenize(sentence)
    #print tokens
    # A bit of POS tagging
    pos_tagged_tokens = [nltk.pos_tag(tokens)]
    # Chunk extraction time
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]
    # Entity extraction
    # Code from Mining the Social Web: https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/blogs_and_nlp__extract_entities.py
    post = {}
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    #print pos_tagged_tokens
    for (token, pos) in pos_tagged_tokens:
        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)
        elif pos.startswith('NN'):
            if current_entity_chunk != []:
                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration
                all_entity_chunks.append((' '.join(current_entity_chunk), pos))
            current_entity_chunk = [token]
        previous_pos = pos
    # Store the chunks as an index for the document
    # and account for frequency while we're at it...
    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1
    # For example, we could display just the title-cased entities
    proper_nouns = []
    for (entity, pos) in post['entities']:
        if entity.istitle():
            proper_nouns.append(entity)
            #print '\t%s (%s)' % (entity, post['entities'][(entity, pos)])
            #print entity
            #[(entity, pos)]
    return proper_nouns
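A hedged usage sketch for process_entities; the sentence and the printed result are only illustrative. Note that, as written, a noun chunk is only copied into all_entity_chunks when a later NN* chunk begins, so a noun phrase at the very end of the input ("Geneva" below) would be silently dropped.
proper_nouns = process_entities("Tim Berners-Lee founded the World Wide Web Consortium in Geneva.")
print proper_nouns   # e.g. ['Tim Berners-Lee', 'World Wide Web Consortium'] -- exact output depends on the tagger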
Example 6: nlp_extract_tags
def nlp_extract_tags(text, lang=None):
    """
    Return a list of tags extracted from provided text.
    """
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []
        if hasattr(t, "node") and t.node:
            if t.node == "NE":
                entity_names.append(" ".join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))
        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    result = {"tags": list(set(entity_names))}
    return jsonp({"status": "ok", "result": result})
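The recursive extract_entity_names helper above can also be expressed with Tree.subtrees(); below is a hedged equivalent sketch (NLTK 2.x, where subtree labels are exposed as .node), not part of the original example.
from nltk.tree import Tree

def extract_entity_names_flat(tree):
    # Collect the words under every subtree labelled "NE" (binary chunking)
    return [' '.join(word for word, pos in subtree.leaves())
            for subtree in tree.subtrees()
            if subtree.node == "NE"]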
Example 7: get_named_entities
def get_named_entities(self, text):
    sentences = nltk.sent_tokenize(text)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]  # takes 3ish seconds
    nes = nltk.batch_ne_chunk(sentences, binary=False)  # takes 2ish seconds
    named_entities = {}
    stop_names = ['Mr.']
    # Loop through the tagged sentences, looking for named entities, and put their "leaves" together,
    # e.g. "White" + " " + "House"
    for i in nes:
        for j in i:
            if re.search('PERSON|ORGANIZATION|LOCATION|GPE|FACILITY', str(j)):
                name = ' '.join(c[0] for c in j.leaves())
                # Attempt to merge people names if you've seen them before,
                # e.g. Ms. Clinton gets merged into Hillary Clinton
                if not (name in stop_names):
                    regex = re.compile(r'^' + name.split(' ')[-1] + '|\s' + name.split(' ')[-1] + '$')
                    regex_match = filter(regex.search, named_entities.keys())
                    if name in named_entities:
                        named_entities[name] += 1
                    elif len(regex_match) > 0 and re.search('PERSON', str(j)) != None:
                        named_entities[regex_match[0]] += 1
                    else:
                        named_entities[name] = 1
    # Sort named entities by count and take the first 8
    sorted_names = sorted(named_entities.iteritems(), key=operator.itemgetter(1), reverse=True)
    names = []
    for name in sorted_names[:8]:
        names.append(name[0].lower())
    return names
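The surname-merging regex in get_named_entities is easier to see in isolation. The sketch below, with invented names and counts, shows how a bare last name gets folded into an already-seen full name:
import re

named_entities = {'Hillary Clinton': 3}                  # counts accumulated so far
name = 'Clinton'                                         # a newly extracted PERSON entity
last = name.split(' ')[-1]
regex = re.compile(r'^' + last + r'|\s' + last + r'$')   # matches "Clinton" or "... Clinton"
regex_match = filter(regex.search, named_entities.keys())
print regex_match                                        # ['Hillary Clinton'] -- its count is incremented instead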
Example 8: extractchunk
def extractchunk(tweettuple):
    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a, w) in tweettuple]
    cid = [str(a) for (a, w) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return dict(zip(cid, ne_chunks))
Example 9: extract_entities
def extract_entities(sample):
    print 'extracting entities'
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    # Create a map entity -> count, representing the number of occurrences of each entity
    entity_count = {}
    for entity in entity_names:
        if entity in entity_count:
            entity_count[entity] += 1
        else:
            entity_count[entity] = 1
    sorted_occurences = sorted(entity_count.iteritems(), reverse=True, key=operator.itemgetter(1))
    #return OrderedDict(entity_count)
    # Print unique entity names
    #print set(entity_names)
    return sorted_occurences
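The occurrence counting at the end of Example 9 could equivalently be written with collections.Counter; a hedged drop-in sketch (the entity list is invented):
from collections import Counter

entity_names = ['NLTK', 'Python', 'NLTK']            # hypothetical extracted entities
sorted_occurences = Counter(entity_names).most_common()
print sorted_occurences                              # [('NLTK', 2), ('Python', 1)]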
Example 10: extract_chunked_sentences
def extract_chunked_sentences(raw):
    """Tokenize raw text into sentences, POS tag them, and return the NE-chunked trees."""
    sentences = nltk.sent_tokenize(raw)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    return chunked_sentences
Example 11: extractNamedEntities
def extractNamedEntities(sentences):
    tok_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tag_sentences = [nltk.pos_tag(sentence) for sentence in tok_sentences]
    cnk_sentences = nltk.batch_ne_chunk(tag_sentences, binary=True)
    all_named_entities = []
    for tree in cnk_sentences:
        named_entities = extractNamedEntitiesFromChunkSentence(tree)
        all_named_entities.extend(named_entities)
    return list(set(all_named_entities))
Example 12: extractchunk
def extractchunk(tweettuple):
    # Break each tweet into groups of sentences and words,
    # then run them through the standard NLTK POS tagger and chunker functions
    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(c))) for (a, w, c) in tweettuple]
    cid = [str(a) for (a, w, c) in tweettuple]
    tnum = [w for (a, w, c) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return zip(cid, tnum, ne_chunks)
Example 13: get_entities
def get_entities(sentences):
    #sentences = nltk.sent_tokenize(doc)  # some nltk preprocessing: tokenize, tag, chunk, NER
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entities = []
    for t in chunked_sentences:
        entities.append(entitify(t))
    return entities
Example 14: get_entities3
def get_entities3(text):
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))
    return filter_entities(entity_names)
Example 15: gen_ners
def gen_ners(self, sample):
    """ returns the NERs found in the given sample as a list """
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(self._extract_entity_names(tree))
    unique_ners = list(set(entity_names))
    return unique_ners
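All fifteen examples assume NLTK 2.x, where batch_ne_chunk still exists; in NLTK 3.0 the function was renamed ne_chunk_sents. A small hedged compatibility shim (not part of any example above) makes the calls work on either version:
import nltk

try:
    batch_ne_chunk = nltk.batch_ne_chunk        # NLTK 2.x
except AttributeError:
    batch_ne_chunk = nltk.ne_chunk_sents        # NLTK 3.x replacement with the same call shape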