本文整理汇总了 Python 中 nltk.tag.StanfordNERTagger.tag 方法的典型用法代码示例。如果您正苦于以下问题：Python StanfordNERTagger.tag 方法的具体用法？Python StanfordNERTagger.tag 怎么用？Python StanfordNERTagger.tag 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 nltk.tag.StanfordNERTagger 的用法示例。
在下文中一共展示了StanfordNERTagger.tag方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: trial2
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def trial2():
    """
    Score the Stanford 3-class NER model on the annotated-cities file.

    Every line of the input file is parsed as a JSON object. Its
    'high_recall_readability_text' field is tokenized and tagged, and the
    lower-cased tokens labelled LOCATION are compared with the gold set
    returned by _build_locations_true_positives_set. The true-positive,
    false-positive and false-negative totals over the whole file are printed.

    Only the 3-class model feeds the scores, so it is the only tagger that is
    built and run; the MUC and CoNLL taggers were tagged with but never read.
    :return: None
    """
    pretrained_model_path = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/www-experiments/stanford-ner-2015-12-09/'
    all3class = pretrained_model_path+'classifiers/english.all.3class.distsim.crf.ser.gz'
    st_3class = StanfordNERTagger(all3class,
                                  pretrained_model_path + 'stanford-ner.jar',
                                  encoding='utf-8')
    annotated_cities_file = '/Users/mayankkejriwal/datasets/memex-evaluation-november/annotated-cities/ann_city_title_state_1_50.txt'
    gold_fields = ['correct_cities','correct_states','correct_cities_title']
    TP = 0
    FP = 0
    FN = 0
    with codecs.open(annotated_cities_file, 'r', 'utf-8') as f:
        for line in f:
            obj = json.loads(line)
            tokenized_text = word_tokenize(obj['high_recall_readability_text'])
            tagged_locations = set()
            for token, label in st_3class.tag(tokenized_text):
                if str(label) == 'LOCATION':
                    tagged_locations.add(token.lower())
            correct_locations = _build_locations_true_positives_set(obj, gold_fields)
            # Compute the overlap once; all three counters derive from it.
            hits = len(tagged_locations.intersection(correct_locations))
            TP += hits
            FP += len(tagged_locations) - hits
            FN += len(correct_locations) - hits
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('TP, FP, FN are...')
    print(TP)
    print(FP)
    print(FN)
示例2: extract_named_entities
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def extract_named_entities(threadName,output_collection,fetchedTweets):
    """
    Attach named entities to tweets and hand them to write_mongo in batches.

    For every tweet, 'cleaned_text' is split on whitespace and tagged. The
    (token, label) pairs labelled PERSON, ORGANIZATION or LOCATION are stored
    under the tweet's 'named_entities' key. Tweets are passed to write_mongo
    after every 100th tweet, and once more for any remainder.

    :param threadName: label used in the progress log line and passed to write_mongo
    :param output_collection: passed through unchanged to write_mongo
    :param fetchedTweets: iterable of dict-like tweets with a 'cleaned_text'
        entry; each tweet is modified in place
    :return: None. Any exception is printed and the function then raises
        SystemExit with status 1.
    """
    wanted_labels = ('PERSON', 'ORGANIZATION', 'LOCATION')
    batch_size = 100
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    try:
        mongo_list = []
        for counter, fetchedTweet in enumerate(fetchedTweets, 1):
            tokens = fetchedTweet['cleaned_text'].split()
            fetchedTweet['named_entities'] = [
                (ne[0], ne[1]) for ne in st.tag(tokens) if ne[1] in wanted_labels
            ]
            mongo_list.append(fetchedTweet)
            if counter % batch_size == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName,output_collection,mongo_list)
                # Rebind instead of clearing: write_mongo may still hold the old list.
                mongo_list = []
        if mongo_list:
            write_mongo(threadName,output_collection,mongo_list)
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        print(e)
        # Exit with a non-zero status so the failure is not reported as success.
        sys.exit(1)
示例3: pretag
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def pretag(self):
    """
    Tag self.text and store the candidate tokens on the instance.

    The items of self.text are converted with str(), joined by single spaces,
    split on whitespace and tagged. Three candidate lists are concatenated:
      1. tokens labelled ORGANIZATION or PERSON, or tokens shorter than 11
         characters for which count_upper(token) >= 2;
      2. tokens whose lower-cased form is in dict_1m, or which are in dict_apps;
      3. tokens whose lower-cased form is in dict_tech.
    A token may appear more than once in the result.

    Sets self.badlist, self.symlist, self.badlist_stem and self.pretag.
    :return: None
    """
    text = self.text
    st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
                           "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
    paragraphs_string = ' '.join(str(x) for x in text)
    tagging = st.tag(paragraphs_string.split())
    self.badlist = ['integrated','first','check','computer','linear', 'solution','services','limited','tech','solutions','technology','open','model','on','applied','network', 'pricing','customers','social','big','subscribe','social','sign','monitor','software','machine','learning','compute','management','up']
    self.symlist = [ 'company','corporation','multinational', 'Corporation','open-source','social', 'network','software','system']
    self.badlist_stem = [stemmer.stem(word) for word in self.badlist]
    # Built once here instead of once per token inside the comprehension.
    entity_labels = frozenset(("ORGANIZATION", "PERSON"))
    pretag1 = [tag for (tag, label) in tagging
               if label in entity_labels or (count_upper(tag) >= 2 and len(tag) < 11)]
    pretag2 = [tag for (tag, label) in tagging if tag.lower() in dict_1m or tag in dict_apps]
    pretag3 = [tag for (tag, label) in tagging if tag.lower() in dict_tech]
    # NOTE: this instance attribute shadows the method of the same name, so
    # after the first call ``obj.pretag`` is a list, not a callable.
    self.pretag = pretag1 + pretag2 + pretag3
示例4: test_model_in_mem
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """
    Tag sent_obj.sentence and store (token, label, start, end) tuples on it.

    The sentence is split into runs of non-whitespace characters. Trailing
    ',', '.', ';' and ':' are removed from each token before tagging, but the
    recorded span is that of the unstripped run, so it still covers the
    removed punctuation.

    :param stanford_ner_path: passed as the second StanfordNERTagger argument
    :param model_name: passed as the first StanfordNERTagger argument
    :param sent_obj: object with a ``sentence`` string and a
        ``tok_sent_with_crf_predicted_attribs`` mapping
    :param type: key under which the result list is stored in that mapping
    :return: the same sent_obj, updated in place
    """
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')
    tokenized_text = []
    spans = []
    # Raw string: "\S" in a normal literal is an invalid escape sequence.
    for match in re.finditer(r"\S+", sent_obj.sentence):
        tokenized_text.append(match.group(0).rstrip(",.;:"))
        spans.append(match.span())
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)
    # Header stripping may have shortened the token list; shift span lookups by
    # the number of missing tokens.
    # NOTE(review): this presumes tokens are only removed from the front - confirm
    # against strip_sec_headers_tokenized_text.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = [
        (tagged[0], tagged[1]) + spans[idx + len_diff]
        for idx, tagged in enumerate(classified_text)
    ]
    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
示例5: get_location
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def get_location(loc):
    """
    Return the tokens of ``loc`` tagged LOCATION, joined by single spaces.

    The model and jar are loaded from paths relative to the working
    directory, so this currently works only where that Stanford NER
    distribution is unpacked ("currently working only on my computer").

    English model:
        english.muc.7class.distsim.crf.ser.gz
    German models:
        german.dewac_175m_600.crf.ser.gz
        german.hgc_175m_600.crf.ser.gz

    :param loc: passed unchanged to StanfordNERTagger.tag
        (NOTE(review): presumably a list of tokens; a plain string would be
        iterated character by character - confirm against callers)
    :return: the joined location tokens, or None when no token is tagged
        LOCATION. All LOCATION tokens are joined, adjacent or not.
    """
    # Named Entity Recognizer: recognizes named entities and assigns types like
    # location, person, organization to the entity
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
                           'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    # The label is 'LOCATION' for the English model ('I-LOC' for the German ones).
    # Compare the label field only: testing membership in the whole
    # (token, label) tuple would also match a token spelled "LOCATION".
    loc_tokens = [item[0] for item in loc_ner if item[1] == 'LOCATION']
    if not loc_tokens:
        # no location is specified
        return None
    # glue multi-token names such as 'New York' back together
    return ' '.join(loc_tokens)
示例6: ner
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def ner():
    """
    Set the Stanford-related environment variables, build a 3-class English
    tagger and print the tagged tokens for every entry of the module-level
    ``content``. Each entry is split on whitespace before tagging.
    """
    classpath = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
    settings = (
        ('STANFORD_NER_PATH', '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'),
        ('STANFORD_POSTAGGER_PATH', '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'),
        ('CLASSPATH', classpath),
        # STANFORD_POSTAGGER deliberately mirrors CLASSPATH.
        ('STANFORD_POSTAGGER', classpath),
    )
    for variable, value in settings:
        os.environ[variable] = value
    model = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz'
    eng_tagger = StanfordNERTagger(model)
    for x in content:
        tokens = x.split()
        print(eng_tagger.tag(tokens))
示例7: getEntityCount
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def getEntityCount(tweet):
    """
    Count the tokens of ``tweet`` whose NER tag contains "PERSON".

    :param tweet: text passed to word_tokenize
    :return: number of matching tokens (0 when there are none)
    """
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tagged_tokens = tagger.tag(word_tokenize(tweet))
    return sum(1 for entry in tagged_tokens if "PERSON" in entry[1])
示例8: NERTagging
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def NERTagging(text):
    """
    Tag ``text`` with the 3-class model, append the result to the log file and
    print it.

    The log file is opened first, as before, so a missing log directory fails
    before any tagging work is done. The ``with`` block closes the file even
    if tokenizing or tagging raises.

    :param text: text passed to word_tokenize
    :return: None
    """
    with open("Dump/log/Main_output.txt", "a") as log_file:
        st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                               'resources/ner/stanford-ner.jar',
                               encoding='utf-8')
        classified_text = st.tag(word_tokenize(text))
        log_file.write('NER \n %s \n' % classified_text)
        print(classified_text)
示例9: nltk_ner
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def nltk_ner(remainders):
    """
    Collect the PERSON tokens of an entry of ``remainders``.

    The entry is split on whitespace and tagged. Every token labelled PERSON
    is title-cased and appended to the name, followed by one space, so a
    non-empty name ends with a trailing space.

    :param remainders: iterable of strings
    :return: (found, name, item), where ``found`` is True when ``name`` is
        non-empty; None implicitly when ``remainders`` is empty
    """
    tagger = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar')
    for item in remainders:
        person_tokens = [token for token, label in tagger.tag(item.split()) if label == u'PERSON']
        name = "".join(token.title() + ' ' for token in person_tokens)
        # NOTE(review): this returns on both outcomes, so only the first item is
        # ever examined - confirm whether later items were meant to be tried.
        return bool(name), name, item
示例10: get_namedentities
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def get_namedentities(text):
    """
    Return the named-entity tokens of ``text`` found by StanfordNERTagger.

    The text is lower-cased and split on whitespace before tagging with the
    caseless 4-class model. Tokens tagged anything other than 'O' are passed
    through remove_punctuation, and results that end up empty are dropped.
    """
    tagger = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz','utils/stanford-ner.jar')
    tagged = tagger.tag(text.lower().split())
    cleaned = [remove_punctuation(entry[0]) for entry in tagged if entry[1] != 'O']
    return [token for token in cleaned if token]
示例11: trial1
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def trial1():
    """
    Sanity check: tag one fixed English sentence with the annotated-cities
    model and print the tagged tokens.
    :return:
    """
    model_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz'
    jar_file = '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar'
    sample = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tagger = StanfordNERTagger(model_file, jar_file, encoding='utf-8')
    print(tagger.tag(word_tokenize(sample)))
示例12: classify_text
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def classify_text(text):
    """Tokenize ``text`` and tag every token with the 3-class Stanford Named
    Entity Recognition model (PERSON, LOCATION, ORGANIZATION, or O for other).

    Returns the tagger's output for the tokenized text."""
    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    relative_files = ("classifiers/english.all.3class.distsim.crf.ser.gz", "stanford-ner.jar")
    # Normalize both paths so the separators suit the host platform.
    path_to_model, path_to_tagger = [
        os.path.normpath(directory + relative) for relative in relative_files
    ]
    st = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')
    return st.tag(word_tokenize(text))
示例13: stanford_entities
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.

    Consecutive tokens tagged anything other than 'O' form one entity, joined
    by single spaces. The entity is filed under 'locations' when its first
    token is tagged LOCATION and under 'other' otherwise.
    NOTE(review): neighbouring tokens with different labels end up in the same
    chunk and take the first token's label - confirm this is intended.

    :param model: path to the tagging model
    :param jar: path to the Stanford NER jar
    :param fileids: file ids to process; corpus.fileids() when falsy
    :param corpus: corpus providing fileids() and words(fileid)
    :param section: when not None, the text is taken from
        sectpull([fileid], section=section) instead of corpus.words(fileid)
    :return: nested defaultdict mapping fileid -> 'locations' / 'other' ->
        list of entity strings
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)

    def flush(fileid, chunk):
        # Record one finished chunk under the key chosen by its first tag.
        etext = " ".join([c[0] for c in chunk])
        key = 'locations' if chunk[0][1] == 'LOCATION' else 'other'
        results[fileid][key].append(etext)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text = corpus.words(fileid)
        chunk = []
        for token, tag in tagger.tag(text):
            if tag != 'O':
                # Build chunk from tags
                chunk.append((token, tag))
            elif chunk:
                flush(fileid, chunk)
                chunk = []
        if chunk:
            # An entity that ends the document has no following 'O' token to
            # trigger the flush; without this it would be silently dropped.
            flush(fileid, chunk)
    return results
示例14: html_ner
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def html_ner(content):
    """
    Extract (tag, text) named-entity pairs from an HTML document.

    <script>, <style> and <sup> elements are removed. Every remaining stripped
    text fragment is tokenized with wordpunct_tokenize and tagged separately.
    Within a fragment, consecutive tokens sharing a tag other than 'O' are
    joined by single spaces, UTF-8 encoded and stripped.

    :param content: HTML markup passed to BeautifulSoup
    :return: list of (tag, encoded_text) tuples in document order
    """
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for unwanted in soup(["script", "style", "sup"]):
        unwanted.extract()
    fragments = list(soup.stripped_strings)
    token_lists = [wordpunct_tokenize(fragment) for fragment in fragments]
    # NOTE(review): the tagger is called once per fragment; batching through
    # tag_sents may be possible - confirm the output is equivalent first.
    tagged_sents = [st.tag(tokens) for tokens in token_lists]
    result = []
    for sent in tagged_sents:
        # Each group is joined as soon as it is produced, before groupby advances.
        result.extend(
            (tag, ' '.join(w for w, t in chunk).encode('utf-8').strip())
            for tag, chunk in groupby(sent, lambda x: x[1])
            if tag != 'O'
        )
    return result
示例15: sanitize_result
# 需要导入模块: from nltk.tag import StanfordNERTagger [as 别名]
# 或者: from nltk.tag.StanfordNERTagger import tag [as 别名]
def sanitize_result(self, text):
    """
    Return the first PERSON name found in ``text``.

    The text goes through self.capitalize_first_letter, is tokenized and
    tagged, and the tagged tokens are grouped by self.get_continuous_chunks.
    Each chunk becomes a (joined tokens, label of first token) pair. The first
    run of consecutive PERSON pairs is joined by single spaces and returned.

    :param text: text to search for a person name
    :return: the name string, or None when no chunk is labelled PERSON
    """
    # Raw strings keep the Windows backslashes literal; in normal literals
    # sequences such as "\P" and "\s" are invalid escape sequences.
    st = StanfordNERTagger(r'C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
                           r'C:\Python27\stanford_ner\stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(self.capitalize_first_letter(text))
    classified_text = st.tag(tokenized_text)
    named_entities = self.get_continuous_chunks(classified_text)
    named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]
    for tag, chunk in groupby(named_entities_str_tag, lambda x:x[1]):
        if tag == "PERSON":
            return " ".join(w for w, t in chunk)
    return None