当前位置: 首页>>代码示例>>Python>>正文


Python tag.StanfordNERTagger类代码示例

本文整理汇总了Python中nltk.tag.StanfordNERTagger的典型用法代码示例。如果您正苦于以下问题:Python StanfordNERTagger类的具体用法?Python StanfordNERTagger怎么用?Python StanfordNERTagger使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了StanfordNERTagger类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: pretag

	def pretag(self):
		text=self.text
		st = StanfordNERTagger("/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",\
	"/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
		paragraphs = []
		paragraphs_string=''
		for x in text:
			paragraphs.append(str(x))
		paragraphs_string=' '.join(paragraphs)
		tagging=st.tag(paragraphs_string.split())
		symlist=[ 'company','corporation','multinational', 'Corporation','open-source','social', 'network','software','system']
		badlist=['integrated','first','check','computer','linear', 'solution','services','limited','tech','solutions','technology','open','model','on','applied','network', 'pricing','customers','social','big','subscribe','social','sign','monitor','software','machine','learning','compute','management','up']
		badlist_stem=[]
		self.badlist=badlist
		self.symlist=symlist
		for i in range(len(badlist)):
			badlist_stem.append(stemmer.stem(badlist[i]))
		self.badlist_stem=badlist_stem
		pretag1= [tag for (tag,label) in tagging if label in set(("ORGANIZATION","PERSON")) or (count_upper(tag)>=2 and len(tag)<11 ) ]
		pretag2=[tag for (tag,label) in tagging if tag.lower() in dict_1m or tag in dict_apps]
		pretag3=[tag for (tag,label) in tagging if tag.lower() in dict_tech]
		pretag= pretag1+pretag2+pretag3
		domain2synsets = defaultdict(list)
		synset2domains = defaultdict(list)
		self.pretag=pretag
开发者ID:victorstorchan,项目名称:NER,代码行数:25,代码来源:extract_named_entities.py

示例2: test_model_in_mem

def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')

    text = sent_obj.sentence
    tokenized_text = list()
    spans = list()
    #Recover spans here
    for match in re.finditer("\S+", text):
        start = match.start()
        end = match.end()
        word = match.group(0)
        tokenized_text.append(word.rstrip(",.;:"))
        spans.append((start,end))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)

    # Expand tuple to have span as well
    len_diff = len(spans) - len(classified_text) #Headers were stripped, so if this occured in the previous step, we have t account for the offset
    final_class_and_span = list()
    for idx,tup in enumerate(classified_text):
        combined = (classified_text[idx][0],classified_text[idx][1],spans[idx+len_diff][0],spans[idx+len_diff][1])
        final_class_and_span.append(combined)

    #print(classified_text)
    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
开发者ID:abbottLane,项目名称:substance_abuse_extractor,代码行数:29,代码来源:EntityExtractor.py

示例3: extract_named_entities

def extract_named_entities(threadName,output_collection,fetchedTweets):
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    try:
        counter = 0
        mongo_list = []
        for fetchedTweet in fetchedTweets:
            counter += 1
            named_entities = []
            sentence = fetchedTweet['cleaned_text']
            neList = st.tag(sentence.split())
            for ne in neList:
                if ne[1] in ['PERSON', 'ORGANIZATION', 'LOCATION']:
                    named_entities.append((ne[0], ne[1]))
            fetchedTweet['named_entities'] = named_entities
            
            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName,output_collection,mongo_list)
                mongo_list = []
        if len(mongo_list) > 0:
            write_mongo(threadName,output_collection,mongo_list)
            mongo_list = []
    except Exception, e:
        print(e)
        sys.exit()
开发者ID:anammari,项目名称:optimum_repo,代码行数:26,代码来源:NerWMTweetsMongoIntraDaysMTv3.py

示例4: get_location

def get_location(loc):
    """
    currently working only on my computer
    english Model
        english.muc.7class.distsim.crf.ser.gz
    german Models
        german.dewac_175m_600.crf.ser.gz
        german.hgc_175m_600.crf.ser.gz
    """
    # Named Entity Recognizer: recognizes named entities and assigns types like location, person, organization to the entity
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
    'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    """
    might be faster starting from back to front
        'LOCATION' for English
        'I-LOC' for German
    """
    # code that glues named entities like 'New York' back together
    loc_tuples = [item[0] for item in loc_ner if 'LOCATION' in item]
    try:
        location = loc_tuples[0]
        if len(loc_tuples) > 1:
            for i in range(1,len(loc_tuples)):
                location += ' ' + loc_tuples[i]
    except IndexError:
        # if no location is specified
        return None
    return location
开发者ID:phucdev,项目名称:weatherbot,代码行数:29,代码来源:extractor.py

示例5: ner

def ner():
	os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
	os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
	os.environ['CLASSPATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
	os.environ['STANFORD_POSTAGGER'] = os.environ['CLASSPATH']

	eng_tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
	for x in content:
		print(eng_tagger.tag(x.split()))
开发者ID:choon94,项目名称:choon94.github.io,代码行数:9,代码来源:newsTest.py

示例6: getEntityCount

def getEntityCount(tweet):
    # Use the Stanford NER Tagger
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') 
    # tokenize the tweet
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    countPerson =0
    for text in classified_text:
        if "PERSON" in text[1]:
            countPerson+=1 
    return countPerson
开发者ID:RohithEngu,项目名称:Opinion-Summarizer,代码行数:11,代码来源:Attributes.py

示例7: NERTagging

def NERTagging(text):
    log_file = open("Dump/log/Main_output.txt", "a")
    st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
					   'resources/ner/stanford-ner.jar',
					   encoding='utf-8')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    log_file.write('NER \n %s \n' % classified_text)
    print(classified_text)
    log_file.close()
    return
开发者ID:MoizRauf,项目名称:OQuant_Wiki_Clustering,代码行数:11,代码来源:NLPHelper.py

示例8: nltk_ner

def nltk_ner(remainders):
	st = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar') 
	for item in remainders:
		name = ""
		tagged = st.tag(item.split())
		for entity in tagged:
			if entity[1] == u'PERSON':
				name += (entity[0].title() + ' ')
		if name: 
			return True, name, item
		else:
			return False, name, item
开发者ID:mwcurry,项目名称:tracker,代码行数:12,代码来源:parser.py

示例9: trial1

def trial1():
    """
    Just to make sure we're not screwing everything up.
    :return:
    """
    st = StanfordNERTagger('/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz',
                           '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar',
                           encoding='utf-8')

    text = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    print(classified_text)
开发者ID:mayankkejriwal,项目名称:pycharm-projects-ubuntu,代码行数:15,代码来源:StanfordNER.py

示例10: get_namedentities

def get_namedentities(text):
  """
  Returns named entities in text using StanfordNERTagger
  """
  st = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz','utils/stanford-ner.jar')   
  ner_tagged = st.tag(text.lower().split())     
  
  named_entities = []
  if len(ner_tagged) > 0:
    for n in ner_tagged:
      if n[1]!='O':
        named_entities.append(remove_punctuation(n[0]))

  named_entities = [n for n in named_entities if n] 
  return named_entities
开发者ID:veryluckyxyz,项目名称:keywordfinder,代码行数:15,代码来源:features.py

示例11: classify_text

def classify_text(text):
    """Using the 3-class Stanford Named Entity Recognition model, classify each
       word in the input text as a PERSON, LOCATION, ORGANIZATION, or O (for
       other)."""

    directory = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    mod = "classifiers/english.all.3class.distsim.crf.ser.gz"
    tag = "stanford-ner.jar"
    path_to_model = os.path.normpath(directory + mod)
    path_to_tagger = os.path.normpath(directory + tag)
    st = StanfordNERTagger(path_to_model, path_to_tagger, encoding='utf-8')

    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)

    return classified_text
开发者ID:liameabbott,项目名称:named_entity_recognition,代码行数:16,代码来源:namedEntityRecognition.py

示例12: __init__

    def __init__(self, use_stanford=False, NER_model=None, NER_tagger=None, POS_model=None, POS_tagger=None):
        """The initializer of the class

        :param NER_model: NER model path
        :param NER_tagger: NER tagger path
        :param POS_model: POS model path
        :param POS_tagger: POS tagger path
        :param use_stanford: boolean, if using stanford NER and POS tagging

        """
        self.NER_model = NER_model
        self.NER_tagger = NER_tagger
        self.POS_model = POS_model
        self.POS_tagger = POS_tagger
        self.use_stanford = use_stanford

        if use_stanford:
            if NER_model is None or NER_tagger is None or POS_model is None or POS_tagger is None:
                sys.exit("tagging initialization: Stanford models and taggers" " have to be provided!")
            else:
                self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
                self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
        else:
            self.post = nltk.pos_tag
            self.nert = nltk.ne_chunk
开发者ID:CDIPSDataScience2016,项目名称:AmazonTrend,代码行数:25,代码来源:NLP_tagging.py

示例13: stanford_entities

def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """
    Extract entities using the Stanford NER tagger.
    Must pass in the path to the tagging model and jar as downloaded from the
    Stanford Core NLP website.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger  = StanfordNERTagger(model, jar)
    section = section

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid],section=section))[0][1])
        else:
            text  = corpus.words(fileid)

        chunk = []

        for token, tag in tagger.tag(text):
            if tag == 'O':
                if chunk:
                    # Flush the current chunk
                    etext =  " ".join([c[0] for c in chunk])
                    etag  = chunk[0][1]
                    chunk = []

                    # if etag == 'PERSON':
                    #     key = 'persons'
                    # elif etag == 'ORGANIZATION':
                    #     key = 'organizations'
                    # elif etag == 'LOCATION':
                    #     key = 'locations'
                    # else:
                    #     key = 'other'

                    if etag == 'LOCATION':
                        key = 'locations'
                    else:
                        key = 'other'
                    results[fileid][key].append(etext)

            else:
                # Build chunk from tags
                chunk.append((token, tag))

    return results
开发者ID:goldin2008,项目名称:Research_in_NLP,代码行数:47,代码来源:extract_NER.py

示例14: main

def main():
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar, path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    ne_tuple = st.cur_tag(sent)  # ##need write interface for tokenized sent (http://nlp.stanford.edu/software/crf-faq.shtml#tokenized)
    print ne_tuple
    
    print parser.raw_parse(raw_sent).next()

    return
    # find name entity
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list

    init_file(main_tree)
                    ####### my issue here: 1. don't know how to get NP. 2. is there a quicker way to find PERON ?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  ###add bash !!!!
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print head

    def get_main_verbs(tree):
        pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
        main_verbs = check_output(['bash',  ###add bash !!!!
                                   tregex_path,
                                   '-s',
                                   pattern,
                                   init_tree_file])
        print main_verbs
        main_verbs = main_verbs.split('\n')[:-1]
        main_verbs = [Tree.fromstring(main_verb) for main_verb in main_verbs]
        return main_verbs
开发者ID:DerrickZhu1,项目名称:11611teamproject-YenYuan-,代码行数:45,代码来源:2.py

示例15: html_ner

def html_ner(content):
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for script in soup(["script", "style", "sup"]):
        script.extract()
    tokenised_sents = list(soup.stripped_strings)
    tokenised_words = [wordpunct_tokenize(sent) for sent in tokenised_sents]
    tagged_sents = [st.tag(sent) for sent in tokenised_words]

    result = list()

    for sent in tagged_sents:
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
开发者ID:Sinderella,项目名称:OSINT,代码行数:18,代码来源:ners.py


注:本文中的nltk.tag.StanfordNERTagger类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。