当前位置: 首页>>代码示例>>Python>>正文


Python StanfordNERTagger.tag方法代码示例

本文整理汇总了Python中nltk.tag.StanfordNERTagger.tag方法的典型用法代码示例。如果您正苦于以下问题：Python StanfordNERTagger.tag方法的具体用法？Python StanfordNERTagger.tag怎么用？Python StanfordNERTagger.tag使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk.tag.StanfordNERTagger的用法示例。


在下文中一共展示了StanfordNERTagger.tag方法的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: trial2

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def trial2():
    """
    Score the Stanford 3-class NER model as a location extractor.

    Reads the annotated-cities file one JSON object per line, tags the
    tokenized ``high_recall_readability_text`` of each object, and compares
    the lower-cased tokens tagged ``LOCATION`` with the set returned by
    ``_build_locations_true_positives_set`` for the ``correct_cities``,
    ``correct_states`` and ``correct_cities_title`` keys. The true-positive,
    false-positive and false-negative counts, summed over all lines, are
    printed at the end.

    :return: None
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    # Only the 3-class model contributes to the score. The MUC 7-class and
    # CoNLL 4-class models (english.muc.7class / english.conll.4class in the
    # same classifiers/ directory) were previously loaded and run on every
    # line without their output being read, so they are no longer built here.
    st_3class = StanfordNERTagger(pretrained_model_path + 'classifiers/english.all.3class.distsim.crf.ser.gz',
                                  pretrained_model_path + 'stanford-ner.jar',
                                  encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'
    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            tokenized_text = word_tokenize(obj['high_recall_readability_text'])
            # A set, so a location mentioned several times is counted once.
            tagged_locations = set(
                token.lower()
                for token, label in st_3class.tag(tokenized_text)
                if label == 'LOCATION')
            correct_locations = _build_locations_true_positives_set(obj, ['correct_cities','correct_states','correct_cities_title'])

            # Compute the overlap once and derive all three counts from it.
            matched = len(tagged_locations.intersection(correct_locations))
            TP += matched
            FP += len(tagged_locations) - matched
            FN += len(correct_locations) - matched
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('TP, FP, FN are...')
    print(TP)
    print(FP)
    print(FN)
开发者ID:mayankkejriwal,项目名称:pycharm-projects-ubuntu,代码行数:57,代码来源:StanfordNER.py

示例2: extract_named_entities

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def extract_named_entities(threadName,output_collection,fetchedTweets):
    """
    Attach named entities to each tweet and write the tweets out in batches.

    For every tweet in ``fetchedTweets`` the whitespace-split
    ``cleaned_text`` is tagged, and the ``(token, tag)`` pairs tagged
    PERSON, ORGANIZATION or LOCATION are stored on the tweet under the
    ``named_entities`` key. Tweets are passed to ``write_mongo`` in batches
    of 100, followed by one final partial batch if any tweets remain.

    :param threadName: label used in the progress log line and passed to
        ``write_mongo``
    :param output_collection: passed through to ``write_mongo`` unchanged
    :param fetchedTweets: iterable of mutable mappings with a
        ``cleaned_text`` entry
    :return: None. On any exception the error is printed and ``sys.exit``
        is called with a non-zero status.
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    wanted_tags = ('PERSON', 'ORGANIZATION', 'LOCATION')
    try:
        mongo_list = []
        for counter, fetchedTweet in enumerate(fetchedTweets, 1):
            neList = st.tag(fetchedTweet['cleaned_text'].split())
            fetchedTweet['named_entities'] = [
                (ne[0], ne[1]) for ne in neList if ne[1] in wanted_tags]

            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName,output_collection,mongo_list)
                mongo_list = []
        # Write whatever is left over after the last full batch.
        if mongo_list:
            write_mongo(threadName,output_collection,mongo_list)
    except Exception as e:
        # "except X as e" is valid on Python 2.6+ and Python 3; the older
        # "except X, e" form is a syntax error on Python 3.
        print(e)
        # Exit with a failure status so the error is not reported as success.
        sys.exit(1)
开发者ID:anammari,项目名称:optimum_repo,代码行数:28,代码来源:NerWMTweetsMongoIntraDaysMTv3.py

示例3: pretag

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
	def pretag(self):
		"""Tag ``self.text`` and collect candidate entity tokens.

		Each element of ``self.text`` is converted with ``str`` and the
		results are joined with spaces; the whitespace-split tokens are then
		tagged with the Stanford 3-class model. Three candidate lists are
		concatenated, in this order:

		1. tokens tagged ORGANIZATION or PERSON, or shorter than 11
		   characters with ``count_upper(token) >= 2``
		   (presumably a count of upper-case characters; confirm against
		   ``count_upper``);
		2. tokens whose lower-cased form is in ``dict_1m`` or which are in
		   ``dict_apps``;
		3. tokens whose lower-cased form is in ``dict_tech``.

		Side effects: sets ``self.badlist``, ``self.symlist``,
		``self.badlist_stem`` and ``self.pretag``.

		NOTE(review): the result is stored in ``self.pretag``, which shadows
		this method on the instance after the first call.
		"""
		text=self.text
		st = StanfordNERTagger(
			"/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
			"/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
		paragraphs_string=' '.join(str(x) for x in text)
		tagging=st.tag(paragraphs_string.split())
		symlist=[ 'company','corporation','multinational', 'Corporation','open-source','social', 'network','software','system']
		badlist=['integrated','first','check','computer','linear', 'solution','services','limited','tech','solutions','technology','open','model','on','applied','network', 'pricing','customers','social','big','subscribe','social','sign','monitor','software','machine','learning','compute','management','up']
		self.badlist=badlist
		self.symlist=symlist
		self.badlist_stem=[stemmer.stem(word) for word in badlist]
		# Built once here rather than once per token inside the comprehension.
		entity_labels=("ORGANIZATION","PERSON")
		pretag1=[tag for (tag,label) in tagging if label in entity_labels or (count_upper(tag)>=2 and len(tag)<11)]
		pretag2=[tag for (tag,label) in tagging if tag.lower() in dict_1m or tag in dict_apps]
		pretag3=[tag for (tag,label) in tagging if tag.lower() in dict_tech]
		self.pretag=pretag1+pretag2+pretag3
开发者ID:victorstorchan,项目名称:NER,代码行数:27,代码来源:extract_named_entities.py

示例4: test_model_in_mem

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """
    Tag one sentence with a Stanford NER model and record character spans.

    The sentence is split on whitespace. Trailing ``,.;:`` characters are
    stripped from each token before tagging, but the recorded span still
    covers the whole whitespace-delimited match, including that punctuation.
    The tokens are passed through ``strip_sec_headers_tokenized_text``
    before tagging.

    :param stanford_ner_path: path to the Stanford NER jar
    :param model_name: path to the model (classifier) file
    :param sent_obj: object with a ``sentence`` string and a
        ``tok_sent_with_crf_predicted_attribs`` mapping; it is modified in
        place
    :param type: key under which the result is stored in
        ``sent_obj.tok_sent_with_crf_predicted_attribs``
    :return: ``sent_obj``, with a list of ``(token, tag, start, end)``
        tuples stored under ``type``
    """
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    tokenized_text = []
    spans = []
    # Raw string: "\S" in a plain literal is an invalid escape sequence.
    for match in re.finditer(r"\S+", sent_obj.sentence):
        tokenized_text.append(match.group(0).rstrip(",.;:"))
        spans.append((match.start(), match.end()))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # If fewer tagged tokens come back than spans were recorded, the missing
    # ones are taken to be headers stripped from the front, so the spans are
    # read at that offset.
    # NOTE(review): a token made only of ",.;:" becomes "" after rstrip; how
    # the tagger counts such tokens affects this offset. Confirm.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = []
    for idx, tagged in enumerate(classified_text):
        start, end = spans[idx + len_diff]
        final_class_and_span.append((tagged[0], tagged[1], start, end))

    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
开发者ID:abbottLane,项目名称:substance_abuse_extractor,代码行数:31,代码来源:EntityExtractor.py

示例5: get_location

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def get_location(loc):
    """
    Return the location mentioned in ``loc``, or None if there is none.

    ``loc`` is passed directly to the tagger, so it is presumably an
    already tokenized sequence of words (TODO confirm against callers).
    All tokens tagged LOCATION are joined with single spaces, which glues
    multi-word names such as 'New York' back together.

    Currently working only on the original author's computer (the model and
    jar paths are relative).
    English model:
        english.muc.7class.distsim.crf.ser.gz
    German models:
        german.dewac_175m_600.crf.ser.gz
        german.hgc_175m_600.crf.ser.gz
    The tag to look for is 'LOCATION' for English and 'I-LOC' for German.
    """
    # Named Entity Recognizer: recognizes named entities and assigns types like location, person, organization to the entity
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    # Compare the tag field only: "'LOCATION' in item" also matched a token
    # whose text happened to be "LOCATION".
    location_tokens = [item[0] for item in loc_ner if item[1] == 'LOCATION']
    if not location_tokens:
        # no location is specified
        return None
    return ' '.join(location_tokens)
开发者ID:phucdev,项目名称:weatherbot,代码行数:31,代码来源:extractor.py

示例6: ner

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def ner():
	"""Set the Stanford tool environment variables and print NER tags.

	Each element of the module-level ``content`` is split on whitespace,
	tagged with the English 3-class model, and the tagged result is printed.
	"""
	ner_jar = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
	model_file = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz'

	os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	# CLASSPATH and STANFORD_POSTAGGER both point at the NER jar.
	os.environ['CLASSPATH'] = ner_jar
	os.environ['STANFORD_POSTAGGER'] = ner_jar

	tagger = StanfordNERTagger(model_file)
	for sentence in content:
		tokens = sentence.split()
		print(tagger.tag(tokens))
开发者ID:choon94,项目名称:choon94.github.io,代码行数:11,代码来源:newsTest.py

示例7: getEntityCount

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def getEntityCount(tweet):
    """Return how many tokens of ``tweet`` have a tag containing "PERSON".

    The tweet is tokenized with ``word_tokenize`` and tagged with the
    Stanford 3-class model. Each tagged token counts once, so a name made
    of two tokens counts as two.
    """
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tagged_tokens = tagger.tag(word_tokenize(tweet))
    return sum(1 for tagged in tagged_tokens if "PERSON" in tagged[1])
开发者ID:RohithEngu,项目名称:Opinion-Summarizer,代码行数:13,代码来源:Attributes.py

示例8: NERTagging

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def NERTagging(text):
    """
    Tag ``text`` with the Stanford 3-class model, then log and print it.

    The list of ``(token, tag)`` pairs is appended to
    ``Dump/log/Main_output.txt`` and printed to stdout.

    :param text: raw text; it is tokenized with ``word_tokenize``
    :return: None
    """
    # The log is opened first, as before, so a missing Dump/log directory
    # fails before tagging starts. The "with" block closes the file even
    # when tagging raises.
    with open("Dump/log/Main_output.txt", "a") as log_file:
        st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                               'resources/ner/stanford-ner.jar',
                               encoding='utf-8')
        classified_text = st.tag(word_tokenize(text))
        log_file.write('NER \n %s \n' % classified_text)
        print(classified_text)
开发者ID:MoizRauf,项目名称:OQuant_Wiki_Clustering,代码行数:13,代码来源:NLPHelper.py

示例9: nltk_ner

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def nltk_ner(remainders):
	"""Look for a person name in the first element of ``remainders``.

	The element is split on whitespace and tagged. Tokens tagged PERSON are
	title-cased and concatenated, each followed by a single space.

	Returns ``(found, name, item)``: ``found`` is True when at least one
	PERSON token was seen, ``name`` is the concatenated name (empty when
	nothing was found) and ``item`` is the element that was examined.
	Returns None implicitly when ``remainders`` is empty.
	"""
	tagger = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar')
	for item in remainders:
		# NOTE(review): both outcomes return from inside the loop, so only
		# the first item is ever examined. Confirm this is intended.
		person_parts = [entity[0].title() + ' '
				for entity in tagger.tag(item.split())
				if entity[1] == u'PERSON']
		name = "".join(person_parts)
		return bool(name), name, item
开发者ID:mwcurry,项目名称:tracker,代码行数:14,代码来源:parser.py

示例10: get_namedentities

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def get_namedentities(text):
  """
  Return the tokens of ``text`` that the caseless CoNLL 4-class model tags
  as anything other than 'O'.

  The text is lower-cased and split on whitespace before tagging. Each
  selected token is passed through ``remove_punctuation``, and tokens that
  come back empty are dropped.
  """
  tagger = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz','utils/stanford-ner.jar')
  tagged_tokens = tagger.tag(text.lower().split())

  cleaned = [remove_punctuation(tagged[0]) for tagged in tagged_tokens if tagged[1] != 'O']
  return [entity for entity in cleaned if entity]
开发者ID:veryluckyxyz,项目名称:keywordfinder,代码行数:17,代码来源:features.py

示例11: trial1

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def trial1():
    """
    Smoke test: tag one fixed English sentence with the annotated-cities
    model and print the resulting (token, tag) pairs.
    :return: None
    """
    model_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz'
    jar_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar'
    tagger = StanfordNERTagger(model_file, jar_file, encoding='utf-8')

    sample = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tokens = word_tokenize(sample)

    print(tagger.tag(tokens))
开发者ID:mayankkejriwal,项目名称:pycharm-projects-ubuntu,代码行数:17,代码来源:StanfordNER.py

示例12: classify_text

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def classify_text(text):
    """Tag every word of ``text`` with the 3-class Stanford NER model.

    The text is tokenized with ``word_tokenize``. The tagger's output is
    returned as is: one (token, tag) pair per token, where the tag is
    PERSON, LOCATION, ORGANIZATION or O (for other).
    """
    base_dir = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    model_path = os.path.normpath(base_dir + "classifiers/english.all.3class.distsim.crf.ser.gz")
    jar_path = os.path.normpath(base_dir + "stanford-ner.jar")

    tagger = StanfordNERTagger(model_path, jar_path, encoding='utf-8')
    return tagger.tag(word_tokenize(text))
开发者ID:liameabbott,项目名称:named_entity_recognition,代码行数:18,代码来源:namedEntityRecognition.py

示例13: stanford_entities

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.

    Consecutive tokens whose tag is not 'O' are joined with spaces into one
    entity. The entity is filed under 'locations' when its first token is
    tagged LOCATION and under 'other' otherwise. Adjacent entities of
    different types are therefore kept as a single chunk, labelled by the
    first token.

    :param model: path to the tagging model
    :param jar: path to the Stanford NER jar
    :param fileids: file ids to process; defaults to ``corpus.fileids()``
    :param corpus: corpus reader providing ``fileids()`` and ``words()``
    :param section: if given, the text of each file comes from
        ``sectpull([fileid], section=section)`` instead of ``corpus.words``
    :return: nested defaultdict ``results[fileid][key]`` mapping to a list
        of entity strings, where ``key`` is 'locations' or 'other'
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger  = StanfordNERTagger(model, jar)

    def _flush(fileid, chunk):
        # Record one finished chunk under the key chosen by its first tag.
        etext = " ".join([c[0] for c in chunk])
        key = 'locations' if chunk[0][1] == 'LOCATION' else 'other'
        results[fileid][key].append(etext)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text  = corpus.words(fileid)

        chunk = []

        for token, tag in tagger.tag(text):
            if tag != 'O':
                # Build chunk from tags
                chunk.append((token, tag))
            elif chunk:
                # An 'O' token ends the current chunk
                _flush(fileid, chunk)
                chunk = []

        # An entity that runs up to the last token has no trailing 'O' to
        # trigger the flush above, so it is flushed here.
        if chunk:
            _flush(fileid, chunk)

    return results
开发者ID:goldin2008,项目名称:Research_in_NLP,代码行数:49,代码来源:extract_NER.py

示例14: html_ner

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def html_ner(content):
    """
    Extract named entities from the visible text of an HTML document.

    ``script``, ``style`` and ``sup`` elements are removed. Each remaining
    stripped string is tokenized with ``wordpunct_tokenize`` and tagged with
    the Stanford 3-class model. Within each string, consecutive tokens that
    share a non-'O' tag are joined with spaces into one entity.

    :param content: HTML markup to parse
    :return: list of ``(tag, entity_text)`` tuples in document order, where
        ``entity_text`` is the UTF-8 encoded entity
    """
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for script in soup(["script", "style", "sup"]):
        script.extract()
    tokenised_sents = list(soup.stripped_strings)
    tokenised_words = [wordpunct_tokenize(sent) for sent in tokenised_sents]
    if not tokenised_words:
        # Nothing to tag; the tagger is not called at all, as before.
        return []
    # Tag all strings in one batched call instead of one tag() call per
    # string. The result is still one tagged list per input string.
    tagged_sents = st.tag_sents(tokenised_words)

    result = list()

    for sent in tagged_sents:
        # groupby merges only adjacent tokens with the same tag.
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
开发者ID:Sinderella,项目名称:OSINT,代码行数:20,代码来源:ners.py

示例15: sanitize_result

# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
    def sanitize_result(self, text):
        """
        Return the last person name found in ``text``, or None.

        The text is passed through ``self.capitalize_first_letter``,
        tokenized and tagged with the Stanford 3-class model, and then
        grouped into entities by ``self.get_continuous_chunks``. Consecutive
        entities tagged PERSON are joined with spaces into one name. If
        there are several separate runs of PERSON entities, the last run
        wins.

        :param text: raw text to search
        :return: the name string, or None when no PERSON entity was found
            (previously this case raised UnboundLocalError)
        """
        # Raw strings: same path values, without relying on "\P", "\s",
        # "\c" and "\e" being left alone as invalid escape sequences.
        st = StanfordNERTagger(r'C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
                               r'C:\Python27\stanford_ner\stanford-ner.jar',
                               encoding='utf-8')
        tokenized_text = word_tokenize(self.capitalize_first_letter(text))
        classified_text = st.tag(tokenized_text)

        named_entities = self.get_continuous_chunks(classified_text)
        # One (entity text, tag of the first token) pair per entity.
        named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]

        name = None
        for tag, chunk in groupby(named_entities_str_tag, lambda x:x[1]):
            if tag == "PERSON":
                name = " ".join(w for w, t in chunk)

        return name
开发者ID:tseg,项目名称:online_img_search,代码行数:22,代码来源:stanford.py


注:本文中的nltk.tag.StanfordNERTagger.tag方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。