This article collects typical usage examples of the Python class nltk.tag.StanfordNERTagger. If you have been wondering what StanfordNERTagger is for, how to use it, or where to find usage examples, the curated class code examples below should help.
The following presents 15 code examples of the StanfordNERTagger class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
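All fifteen examples share the same basic pattern: construct a tagger from a classifier model plus the Stanford NER jar, then pass it a list of tokens. Here is a minimal sketch of that pattern (the paths are placeholders for your own Stanford NER download, and a Java runtime must be installed):

    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize

    # Placeholder paths: point these at your own Stanford NER download.
    st = StanfordNERTagger('/path/to/english.all.3class.distsim.crf.ser.gz',
                           '/path/to/stanford-ner.jar',
                           encoding='utf-8')

    tokens = word_tokenize('Barack Obama visited Paris.')
    print(st.tag(tokens))  # e.g. [('Barack', 'PERSON'), ('Obama', 'PERSON'), ...]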
Example 1: pretag
def pretag(self):
    # count_upper, stemmer, dict_1m, dict_apps, and dict_tech are assumed to
    # be defined elsewhere in the module.
    text = self.text
    st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
                           "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
    paragraphs = []
    for x in text:
        paragraphs.append(str(x))
    paragraphs_string = ' '.join(paragraphs)
    tagging = st.tag(paragraphs_string.split())
    symlist = ['company', 'corporation', 'multinational', 'Corporation', 'open-source',
               'social', 'network', 'software', 'system']
    badlist = ['integrated', 'first', 'check', 'computer', 'linear', 'solution', 'services',
               'limited', 'tech', 'solutions', 'technology', 'open', 'model', 'on', 'applied',
               'network', 'pricing', 'customers', 'social', 'big', 'subscribe', 'social',
               'sign', 'monitor', 'software', 'machine', 'learning', 'compute', 'management', 'up']
    self.badlist = badlist
    self.symlist = symlist
    badlist_stem = [stemmer.stem(word) for word in badlist]
    self.badlist_stem = badlist_stem
    # Candidate tags: NER hits, short words with repeated capitals, and dictionary hits.
    pretag1 = [tag for (tag, label) in tagging
               if label in {"ORGANIZATION", "PERSON"} or (count_upper(tag) >= 2 and len(tag) < 11)]
    pretag2 = [tag for (tag, label) in tagging if tag.lower() in dict_1m or tag in dict_apps]
    pretag3 = [tag for (tag, label) in tagging if tag.lower() in dict_tech]
    pretag = pretag1 + pretag2 + pretag3
    domain2synsets = defaultdict(list)  # unused in this excerpt
    synset2domains = defaultdict(list)  # unused in this excerpt
    self.pretag = pretag
Example 2: test_model_in_mem
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.sentence
    tokenized_text = list()
    spans = list()
    # Recover the character span of each whitespace-delimited token.
    for match in re.finditer(r"\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:"))
        spans.append((start, end))

    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand each (token, label) tuple to carry its span as well. Section
    # headers were stripped above, so if that occurred we have to account
    # for the offset.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = list()
    for idx, tup in enumerate(classified_text):
        combined = (tup[0], tup[1], spans[idx + len_diff][0], spans[idx + len_diff][1])
        final_class_and_span.append(combined)

    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
Example 3: extract_named_entities
def extract_named_entities(threadName, output_collection, fetchedTweets):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    try:
        counter = 0
        mongo_list = []
        for fetchedTweet in fetchedTweets:
            counter += 1
            named_entities = []
            sentence = fetchedTweet['cleaned_text']
            neList = st.tag(sentence.split())
            for ne in neList:
                if ne[1] in ['PERSON', 'ORGANIZATION', 'LOCATION']:
                    named_entities.append((ne[0], ne[1]))
            fetchedTweet['named_entities'] = named_entities
            mongo_list.append(fetchedTweet)
            # Flush to MongoDB in batches of 100 tweets.
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName, output_collection, mongo_list)
                mongo_list = []
        # Write any remaining tweets after the loop.
        if len(mongo_list) > 0:
            write_mongo(threadName, output_collection, mongo_list)
            mongo_list = []
    except Exception as e:  # was 'except Exception, e' (Python 2 syntax)
        print(e)
        sys.exit()
Example 4: get_location
def get_location(loc):
    """
    Currently working only on my computer.

    English model:  english.muc.7class.distsim.crf.ser.gz
    German models:  german.dewac_175m_600.crf.ser.gz
                    german.hgc_175m_600.crf.ser.gz
    """
    # Named entity recognizer: recognizes named entities and assigns types
    # such as location, person, and organization to each entity.
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
                           'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    # The label is 'LOCATION' for English models and 'I-LOC' for German ones;
    # scanning from back to front might be faster.
    # Glue multi-token named entities like 'New York' back together.
    loc_tuples = [item[0] for item in loc_ner if 'LOCATION' in item]
    try:
        location = loc_tuples[0]
        if len(loc_tuples) > 1:
            for i in range(1, len(loc_tuples)):
                location += ' ' + loc_tuples[i]
    except IndexError:
        # No location was specified.
        return None
    return location
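The gluing step above joins every LOCATION token in the sentence into one string, even when the tokens belong to two different places. A minimal sketch of an alternative that only joins contiguous LOCATION tokens, using itertools.groupby (the function name and example input are illustrative, not part of the original example):

    from itertools import groupby

    def glue_locations(tagged):
        """Join each run of contiguous LOCATION tokens into one entity string."""
        entities = []
        for label, group in groupby(tagged, key=lambda pair: pair[1]):
            if label == 'LOCATION':
                entities.append(' '.join(token for token, _ in group))
        return entities

    # glue_locations([('New', 'LOCATION'), ('York', 'LOCATION'),
    #                 ('and', 'O'), ('Paris', 'LOCATION')])
    # -> ['New York', 'Paris']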
Example 5: ner
def ner():
    os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
    os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
    os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
    os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']
    eng_tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
    for x in content:
        print(eng_tagger.tag(x.split()))
Example 6: getEntityCount
def getEntityCount(tweet):
    # Use the Stanford NER tagger; with no jar path given, the jar is
    # located via the CLASSPATH environment variable.
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    # Tokenize the tweet, tag it, and count the PERSON entities.
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    countPerson = 0
    for text in classified_text:
        if "PERSON" in text[1]:
            countPerson += 1
    return countPerson
Example 7: NERTagging
def NERTagging(text):
    log_file = open("Dump/log/Main_output.txt", "a")
    st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                           'resources/ner/stanford-ner.jar',
                           encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    log_file.write('NER \n %s \n' % classified_text)
    print(classified_text)
    log_file.close()
    return
Example 8: nltk_ner
def nltk_ner(remainders):
    st = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz',
                           '../stanford-ner/stanford-ner.jar')
    # Note: both branches return inside the loop, so only the first item in
    # remainders is ever examined.
    for item in remainders:
        name = ""
        tagged = st.tag(item.split())
        for entity in tagged:
            if entity[1] == u'PERSON':
                name += (entity[0].title() + ' ')
        if name:
            return True, name, item
        else:
            return False, name, item
Example 9: trial1
def trial1():
    """
    Just to make sure we're not screwing everything up.
    :return:
    """
    st = StanfordNERTagger('/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz',
                           '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar',
                           encoding='utf-8')
    text = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    print(classified_text)
Example 10: get_namedentities
def get_namedentities(text):
    """
    Returns named entities in text using StanfordNERTagger.
    """
    st = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz',
                           'utils/stanford-ner.jar')
    ner_tagged = st.tag(text.lower().split())
    named_entities = []
    if len(ner_tagged) > 0:
        for n in ner_tagged:
            if n[1] != 'O':
                named_entities.append(remove_punctuation(n[0]))
    # Drop tokens that became empty after punctuation removal.
    named_entities = [n for n in named_entities if n]
    return named_entities
Example 11: classify_text
def classify_text(text):
    """Using the 3-class Stanford Named Entity Recognition model, classify each
    word in the input text as PERSON, LOCATION, ORGANIZATION, or O (for other)."""
    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    mod = "classifiers/english.all.3class.distsim.crf.ser.gz"
    tag = "stanford-ner.jar"
    path_to_model = os.path.normpath(directory + mod)
    path_to_tagger = os.path.normpath(directory + tag)
    st = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    return classified_text
Example 12: __init__
def __init__(self, use_stanford=False, NER_model=None, NER_tagger=None, POS_model=None, POS_tagger=None):
    """The initializer of the class.

    :param use_stanford: boolean, whether to use Stanford NER and POS tagging
    :param NER_model: NER model path
    :param NER_tagger: NER tagger (jar) path
    :param POS_model: POS model path
    :param POS_tagger: POS tagger (jar) path
    """
    self.NER_model = NER_model
    self.NER_tagger = NER_tagger
    self.POS_model = POS_model
    self.POS_tagger = POS_tagger
    self.use_stanford = use_stanford
    if use_stanford:
        if NER_model is None or NER_tagger is None or POS_model is None or POS_tagger is None:
            sys.exit("tagging initialization: Stanford models and taggers have to be provided!")
        else:
            self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
            self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
    else:
        # Fall back to NLTK's built-in taggers.
        self.post = nltk.pos_tag
        self.nert = nltk.ne_chunk
Example 13: stanford_entities
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section=None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)
    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)
        chunk = []
        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk
                    etext = " ".join([c[0] for c in chunk])
                    etag = chunk[0][1]
                    chunk = []
                    # A fuller mapping could also route PERSON to 'persons'
                    # and ORGANIZATION to 'organizations'.
                    if etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'
                    results[fileid][key].append(etext)
            else:
                # Build the chunk from consecutive non-O tags
                chunk.append((token, tag))
    return results
Example 14: main
def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar,
                            path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
                           path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # cur_tag is a project-specific wrapper; an interface for pre-tokenized
    # sentences still needs to be written
    # (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized).
    ne_tuple = st.cur_tag(sent)
    print(ne_tuple)
    print(next(parser.raw_parse(raw_sent)))
    return

    # Everything below is unreachable because of the return above.
    # Find the first PERSON named entity.
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)

    init_file(main_tree)
    # Open issues: 1. how to extract the NP; 2. is there a quicker way to find PERSON?
    # Try the head to decide who/what questions.
    pattern = "S < NP=np"
    head = check_output(['bash',  # the explicit 'bash' is required here
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print(head)

def get_main_verbs(tree):
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    main_verbs = check_output(['bash',
                               tregex_path,
                               '-s',
                               pattern,
                               init_tree_file])
    print(main_verbs)
    main_verbs = main_verbs.split('\n')[:-1]
    main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
    return main_verbs
Example 15: html_ner
def html_ner(content):
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    # Drop script, style, and superscript elements before extracting text.
    for script in soup(["script", "style", "sup"]):
        script.extract()
    tokenised_sents = list(soup.stripped_strings)
    tokenised_words = [wordpunct_tokenize(sent) for sent in tokenised_sents]
    tagged_sents = [st.tag(sent) for sent in tokenised_words]
    result = list()
    for sent in tagged_sents:
        # Group contiguous tokens that share a tag into a single entity.
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
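A hypothetical invocation of html_ner, assuming the classifier and jar exist at the relative paths used above (the URL and the fetching code are illustrative only, not part of the original example):

    import urllib.request

    with urllib.request.urlopen('https://example.com') as resp:  # illustrative URL
        html = resp.read().decode('utf-8')

    for tag, entity in html_ner(html):
        print(tag, entity)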