

Python nltk.batch_ne_chunk Function Code Examples

This article collects typical usage examples of the nltk.batch_ne_chunk function in Python. If you are wondering what exactly batch_ne_chunk does, how to call it, or what real-world usage looks like, the curated code examples below should help.


A total of 15 code examples of the batch_ne_chunk function are shown below, sorted by popularity by default.
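
All 15 examples share the same basic pipeline: split the text into sentences, tokenize each sentence, POS-tag the tokens, and hand the list of tagged sentences to batch_ne_chunk, which returns one chunk tree per sentence with named-entity subtrees. A minimal sketch of that pipeline (assuming NLTK 2.x and Python 2, the environment these examples were written for; the sample text is hypothetical) might look like this:

import nltk

text = "Barack Obama visited the White House in Washington."          # hypothetical sample text
sentences = nltk.sent_tokenize(text)                                   # split into sentences
tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]       # tokenize each sentence
tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]      # POS-tag every token
chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True) # one NE-chunked tree per sentence

for tree in chunked_sentences:
    for subtree in tree:
        # with binary=True every named entity is labelled simply 'NE'
        if isinstance(subtree, nltk.Tree) and subtree.node == 'NE':
            print ' '.join(word for (word, pos) in subtree.leaves())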

Example 1: __init__

 def __init__(self, query_string):
     self.query_string = query_string
     sentences = nltk.sent_tokenize(query_string)
     self.tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     self.tagged_sentences = [nltk.pos_tag(sentence) for sentence in self.tokenized_sentences]
     self.binary_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=True)
     self.multiclass_chunked_sentences = nltk.batch_ne_chunk(self.tagged_sentences, binary=False)
     self.temporal_sentences = timex.ground(timex.tag(query_string), mx.DateTime.gmt())
Author: summera, Project: python-natural-language-search, Lines: 8, Source: text_search.py

Example 2: extract_entities

def extract_entities(shorttext_rows, site):

    # { short text id -> (noun entities, named entities) }
    shorttext_entities = {}
    
    # nltk entity classes
    nltk_entity_types = __get_nltk_entity_types__()
    
    for shorttext_row in shorttext_rows:
        
        shorttext_id = shorttext_row[0]
        shorttext_str = shorttext_row[1]
        
        noun_entities = []
        named_entities = []
        
        sentences = nltk.sent_tokenize(shorttext_str)
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)
        for tree in chunked_sentences:
            __extract_valid_entities__(tree, (noun_entities, named_entities), nltk_entity_types)    
            
        shorttext_entities[shorttext_id] = (noun_entities, named_entities)
        
    # Cache extracted entities
    pkl_util.write_pickle(__output_str__, shorttext_entities, __get_nltk_entities_cache_path__(site))
Author: Big-Data, Project: reslve, Lines: 27, Source: nltk_extraction_dataset_mgr.py

Example 3: _nltk_ner

    def _nltk_ner(self, text, searched_entity, question):
        # Entity Classification
        sentences = nltk.sent_tokenize(text)
        tokenized_sentences = [nltk.word_tokenize(s) for s in sentences]
        tagged_sentences = [nltk.pos_tag(s) for s in tokenized_sentences]
        ne_chunked_sentences = nltk.batch_ne_chunk(tagged_sentences)

        # Entity Extraction
        entities = []
        all_entities = []
        for tree in ne_chunked_sentences:
            for child in tree:
                if isinstance(child, Tree):
                    entity = " ".join([word for (word, pos) in child.leaves()])
                    if child.node == searched_entity:
                        entities.append(entity)
                    all_entities.append(entity)

        if 'OTHER' == searched_entity:
            entities += self._other_recognition(tagged_sentences, all_entities, question)

        if 'NUMBER' == searched_entity:
            entities += self._number_recognition(text, tagged_sentences, all_entities)

        return entities
Author: danigarabato, Project: qa, Lines: 25, Source: answer.py
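
A note on NLTK versions: this example, like the others on this page, targets the NLTK 2.x API. In NLTK 3 batch_ne_chunk was renamed to ne_chunk_sents, and Tree.node was replaced by the Tree.label() method, so code like child.node above will not run unchanged on newer releases. A small compatibility shim along the following lines (my own sketch, not taken from any of the projects quoted here) can bridge the two:

import nltk

# Use the old name when it exists (NLTK 2.x), otherwise fall back to the NLTK 3 equivalent.
batch_ne_chunk = getattr(nltk, 'batch_ne_chunk', None) or nltk.ne_chunk_sents

def chunk_label(subtree):
    """Return the label of a chunk subtree on both NLTK 3.x (.label()) and 2.x (.node)."""
    try:
        return subtree.label()
    except AttributeError:
        return subtree.node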

Example 4: obtenerNEs

def obtenerNEs(lista):

    listaGeneral = []

    for (tweet, listaPalabras, clasificacion, diferenciaProbabilidad) in lista:
        # Only evaluate the tweets classified as positive
        print clasificacion
        if clasificacion == 'positive':
            sentences = nltk.tokenize.sent_tokenize(tweet)
            # We split() instead of tokenizing so that we can extract user mentions.
            # word_tokenize splits off the @, so we would not be able to filter them.
            nuevaSentences = []
            for s in sentences:
                subLista = quitarExcedenteSimple(s.split())
                nuevaSentences.append(' '.join(subLista))

            tokens = [nltk.tokenize.word_tokenize(s) for s in nuevaSentences]

            pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
            ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens, binary=True)

            listaNEs = []
            for subArbol in ne_chunks:
                traverse(subArbol, listaNEs, False)

            if listaNEs:
                listaGeneral.append((tweet, listaPalabras, listaNEs))

    web.debug('Tweets con NEs:' + str(len(listaGeneral)))
    return listaGeneral
Author: JavierOgg, Project: proyectoFinal, Lines: 30, Source: funciones.py

Example 5: process_entities

def process_entities(sentence):  
    words = []
    #print sentence

    #now break sentences into tokens
    tokens = nltk.word_tokenize(sentence)
    #print tokens

    #A bit of POS tagging
    pos_tagged_tokens = [nltk.pos_tag(tokens)]

    #Chunk extraction time
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)

    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')
    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]

    #Entity extraction

    #Code from Mining data from the social web: https://github.com/ptwobrussell/Mining-the-Social-Web/blob/master/python_code/blogs_and_nlp__extract_entities.py
    post = {}
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    #print pos_tagged_tokens
    for (token, pos) in pos_tagged_tokens:

        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)
        elif pos.startswith('NN'):
            if current_entity_chunk != []:

                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration

                all_entity_chunks.append((' '.join(current_entity_chunk), pos))
            current_entity_chunk = [token]

        previous_pos = pos

    # Store the chunks as an index for the document
    # and account for frequency while we're at it...

    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1

    # For example, we could display just the title-cased entities


    proper_nouns = []
    for (entity, pos) in post['entities']:
        if entity.istitle():
            proper_nouns.append(entity)
            #print '\t%s (%s)' % (entity, post['entities'][(entity, pos)])
            #print entity
            #[(entity, pos)]
    return proper_nouns
Author: carriercomm, Project: scraperwiki-scraper-vault, Lines: 60, Source: entities_speedcamera.py

Example 6: nlp_extract_tags

def nlp_extract_tags(text, lang=None):
    """
    Return a list of tags extracted from provided text.
    """

    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

    def extract_entity_names(t):
        entity_names = []

        if hasattr(t, "node") and t.node:
            if t.node == "NE":
                entity_names.append(" ".join([child[0] for child in t]))
            else:
                for child in t:
                    entity_names.extend(extract_entity_names(child))

        return entity_names

    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    result = {"tags": list(set(entity_names))}

    return jsonp({"status": "ok", "result": result})
Author: rautarchana9, Project: hascore, Lines: 29, Source: nlp.py

Example 7: get_named_entities

	def get_named_entities(self,text):
		sentences = nltk.sent_tokenize(text)
		sentences = [nltk.word_tokenize(sent) for sent in sentences]
		sentences = [nltk.pos_tag(sent) for sent in sentences] #takes 3ish seconds
		nes = nltk.batch_ne_chunk(sentences,binary=False) #takes 2ish seconds
		named_entities = {}
		stop_names = ['Mr.']
		
		# Loop through the tagged sentences, looking for named entities, and put their "leaves" together
		# e.g. "White" + " " + "House"
		#
		for i in nes:
			for j in i:
				if re.search('PERSON|ORGANIZATION|LOCATION|GPE|FACILITY',str(j)):
					name = ' '.join(c[0] for c in j.leaves())
					
					# Attempt to merge people names if you've seen them before
					# e.g. Ms. Clinton gets merged into Hillary Clinton
					if not (name in stop_names):
						regex = re.compile(r'^'+name.split(' ')[-1]+'|\s'+name.split(' ')[-1]+'$')
						regex_match = filter(regex.search,named_entities.keys())
						if (name in named_entities):
							named_entities[name]+=1
						elif  (len(regex_match)>0 and re.search('PERSON',str(j))!=None):
							named_entities[regex_match[0]]+=1
						else:
							named_entities[name] = 1
		
		# Sort named entities by count and take first 8
		sorted_names = sorted(named_entities.iteritems(), key=operator.itemgetter(1), reverse=True)
		names=[]
		for name in sorted_names[:8]:
			names.append(name[0].lower())		
		return names
Author: visbe, Project: long-view, Lines: 34, Source: keyword_getter.py

Example 8: extractchunk

 def extractchunk(tweettuple):
     sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(w))) for (a,w) in tweettuple]
     cid = [str(a) for (a,w) in tweettuple]
     tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
     pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
     ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
     return dict(zip(cid, ne_chunks))
Author: hkilter, Project: bullwhip_effect, Lines: 7, Source: iterate_couchdb__extract_timelinecomparisons.py

Example 9: extract_entities

def extract_entities(sample):

    print 'extracting entities'
    sentences = nltk.sent_tokenize(sample)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
     
    entity_names = []
    for tree in chunked_sentences:
        entity_names.extend(extract_entity_names(tree))

    # create a map of entity -> count representing
    # the number of occurrences of an entity
    entity_count = {}
    for entity in entity_names:
        if entity in entity_count:
            entity_count[entity] += 1
        else:
            entity_count[entity] = 1

    sorted_occurences = sorted(entity_count.iteritems(), reverse=True, key=operator.itemgetter(1))
    #return OrderedDict(entity_count)

    # Print unique entity names
    #print set(entity_names)
    return sorted_occurences
Author: ebegoli, Project: Agatha, Lines: 27, Source: agatha.py

Example 10: extract_chunked_sentences

def extract_chunked_sentences( raw ):
    """Sentence-tokenize raw text, POS-tag it, and return binary NE-chunked trees."""
    sentences = nltk.sent_tokenize(raw)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
    chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)    
    return chunked_sentences
Author: ebegoli, Project: AffectiveNLP, Lines: 8, Source: recognizer.py

Example 11: extractNamedEntities

def extractNamedEntities(sentences):
    tok_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    tag_sentences = [nltk.pos_tag(sentence) for sentence in tok_sentences]
    cnk_sentences = nltk.batch_ne_chunk(tag_sentences, binary=True)
    all_named_entities = []
    for tree in cnk_sentences:      
        named_entities = extractNamedEntitiesFromChunkSentence(tree)
        all_named_entities.extend(named_entities)
    return list(set(all_named_entities))
Author: Kevinwenya, Project: textmining-3, Lines: 9, Source: simple-nltk-webservice.py

Example 12: extractchunk

def extractchunk(tweettuple):
    #Break each tweet into groups of sentences and words
    #Run through the nltk standard pos tag and chunker functions

    sentences = [nltk.tokenize.sent_tokenize(nltk.clean_html(str(c))) for (a,w,c) in tweettuple]
    cid = [str(a) for (a,w, c) in tweettuple]
    tnum =[w for (a,w,c) in tweettuple]
    tokens = [nltk.tokenize.word_tokenize(str(s)) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
    ne_chunks = nltk.batch_ne_chunk(pos_tagged_tokens)
    return zip(cid, tnum, ne_chunks)
Author: hkilter, Project: bullwhip_effect, Lines: 11, Source: couchdb__extract_searchcomparisons.py

Example 13: get_entities

    def get_entities(sentences):
        #sentences = nltk.sent_tokenize(doc) # some nltk preprocessing: tokenize, tag, chunk, NER
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
        chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)

        entities = []
        for t in chunked_sentences:
            entities.append(entitify(t))

        return entities
Author: W4ngatang, Project: DocumentSummarizer, Lines: 11, Source: build.py

Example 14: get_entities3

def get_entities3(text):
  sentences = nltk.sent_tokenize(text)
  tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
  tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
  chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
  
  entity_names=[]
  for tree in chunked_sentences:
    entity_names.extend(extract_entity_names(tree))

  return filter_entities(entity_names)
Author: bstewartny, Project: Political-News, Lines: 11, Source: feeds.py

Example 15: gen_ners

 def gen_ners(self,sample):
     """ returns NERS in the sample given as a list """
     sentences = nltk.sent_tokenize(sample)
     tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
     tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
     chunked_sentences = nltk.batch_ne_chunk(tagged_sentences, binary=True)
     entity_names = []
     for tree in chunked_sentences:
         entity_names.extend(self._extract_entity_names(tree))
     unique_ners = list(set(entity_names))
     return unique_ners
Author: digitaltracer, Project: info-beanstalk, Lines: 11, Source: consumer_threads.py


Note: The nltk.batch_ne_chunk examples in this article were collected by 纯净天空 from open-source code and documentation hosted on GitHub, MSDocs, and other platforms. The snippets are taken from open-source projects contributed by their respective developers, and copyright remains with the original authors. Please follow the license of the corresponding project when using or redistributing this code; do not republish without permission.