

Python tag.pos_tag Function Code Examples

This article collects typical usage examples of the nltk.tag.pos_tag function in Python. If you are wondering how the pos_tag function is used in practice, what it does, or what real-world examples look like, the hand-picked code samples below should help.


The following presents 15 code examples of the pos_tag function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
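
Before diving into the examples, here is a minimal usage sketch of pos_tag itself. It is an illustration only and assumes NLTK is installed and the 'punkt' and 'averaged_perceptron_tagger' resources have been downloaded:

from nltk import word_tokenize
from nltk.tag import pos_tag

# Tokenize a sentence, then tag each token with its Penn Treebank part of speech.
tokens = word_tokenize("The quick brown fox jumps over the lazy dog")
print(pos_tag(tokens))
# Expected output is along the lines of:
# [('The', 'DT'), ('quick', 'JJ'), ('brown', 'JJ'), ('fox', 'NN'), ('jumps', 'VBZ'), ...]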

Example 1: make_pos

        def make_pos(target_tag, edit_rev):
            tags, srcs, dsts = edit_rev  # unpack the (tags, srcs, dsts) triple of the edit revision

            # target_tag: the tag present in the sentence
            # Before assigning POS tags, remove the positions holding delete/add tags from the sentence
            if target_tag == del_tag:
                sentence = dsts
            elif target_tag == add_tag:
                sentence = srcs

            if target_tag in tags:
                tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
                trimed = sentence
                for tag_index in tag_indexes:
                    trimed = trimed[:tag_index] + trimed[tag_index+1:]

                posed = pos_tag(trimed)
                pos = [w[1] for w in posed]
                for tag_index in tag_indexes:
                    pos.insert(tag_index, u'')

                #debug
                None_indexes = [i for i, x in enumerate(pos) if x == u'']
                if tag_indexes != None_indexes:
                    print >>sys.stderr, tag_indexes
                    print >>sys.stderr, None_indexes
                    print >>sys.stderr, tags
                    print >>sys.stderr, pos
            else:
                posed = pos_tag(u' '.join(sentence).split())
                pos = [w[1] for w in posed]

            return pos
Developer: keisks, Project: epair, Lines: 33, Source: englishword_edit_distance.py

Example 2: number_of_exact_word_match

def number_of_exact_word_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
                    if token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
                    if token.lower().strip(punctuation) not in stop_words]
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
Developer: Majestic12, Project: supervised_news_summarization, Lines: 9, Source: word_analysis.py

Example 3: number_of_noun_match

def number_of_noun_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a \
                    if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b \
                    if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    # Calculate Jaccard similarity
    #ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
    #return (ratio > 0.66)
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
Developer: Majestic12, Project: supervised_news_summarization, Lines: 12, Source: word_analysis.py

Example 4: keep_nouns

def keep_nouns(tf):
    n_tf = {}
    for k in tf:
        if pos_tag([k])[0][1].find('N') == 0:
            n_tf[k] = tf[k]

    return n_tf 
Developer: catsdogone, Project: video-indexing, Lines: 7, Source: utils.py

Example 5: title_permutations

def title_permutations(title_expanded):
    title_tagged = pos_tag(title_expanded.split())
    st = PorterStemmer()
    title_pos = [st.stem(word) for word, pos in title_tagged if pos != 'IN']

    title_perms = list(map("*".join, permutations(title_pos)))
    return title_perms
Developer: DivyaKarthikeyan, Project: NLPCareerTrajectory, Lines: 7, Source: JobTitleNormalization.py

Example 6: extract

def extract(query):
    sentence = query
    tagged_sent = pos_tag(sentence.split())
    propernouns = [word for word,pos in tagged_sent if pos == 'NN']   
    return propernouns

#extract("I want to buy a car and a dog and plane")
Developer: ljsou, Project: QueryAnalyzer, Lines: 7, Source: chunker_noun.py

Example 7: test_run

    def test_run():
        results = {}
        nouns = []
        product_list = {}
        for p in Post.query.all():
            tagged_sent = pos_tag(p.story.split())
            propernouns = [word for word,pos in tagged_sent if pos == 'NNP']
            for n in propernouns:
                if n == "I’m" or n == "It’s" or n == "Can’t":
                    continue
                results[n.replace('.', '')] = True

        for r in results.keys():
            nouns.append(r)

        for i in range(10):
            noun = random.choice(nouns)
            # print('Using "%s"', (noun,))
            for k in test_keywords:
                try:
                    products = amazon.search(Keywords=noun, SearchIndex=k)
                    for product in products:
                        product_list[product.title] = True
                except:
                    continue

        for p in product_list.keys():
            print("     Found title: %s" % (p,))
Developer: davidmaignan, Project: giftsmarts, Lines: 28, Source: test_nltk.py

Example 8: analiseSentimento

def analiseSentimento(resposta):
	texto = resposta['corpo']
	frases = sentencesTokenizer.tokenize(texto)
	palavras = []
	for frase in frases:
		palavras.extend(wordsTokenizer.tokenize(frase))
	posTags = pos_tag(palavras)
	positivo = 0
	negativo = 0
	for palavra, tag in posTags:
		synsets = None
		if tag.startswith('J'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
		elif tag.startswith('V'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
		elif tag.startswith('N'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
		elif tag.startswith('R'):
			synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
		else:
			synsets = sentiwordnet.senti_synsets(palavra, '')
		if synsets != None:
			synsets = list(synsets)
			if len(synsets) > 0:
				synset = synsets[0]
				positivo = positivo + synset.pos_score()
				negativo = negativo + synset.neg_score()
	if positivo > negativo:
		return (resposta, 'positivo')
	elif negativo > positivo:
		return (resposta, 'negativo')
	else:
		return (resposta, 'neutro')
Developer: vbozelli, Project: Sentiment-Analysis, Lines: 33, Source: analise_sentimento_sentiwordnet_com_stopwords.py

Example 9: _process_simpleHash

 def _process_simpleHash(self, simpleHash):
     # Extract entities from keys resulting from SimpleExtractor process_*
     entityHash = {}
     for data in simpleHash:
         occs = simpleHash[data]['occurences']
         proxLoc = simpleHash[data]['proxLoc']
         # Tokenize sentences
         for sent in tokenize_sentences(data):
             # Tokenize words
             tokens = tokenize_words(sent)
             # Tag words with Parts of Speech
             tagged = pos_tag(tokens)
             # Identify named entities
             entities = ne_chunk(tagged)
             for ent in entities:
                 if isinstance(ent, NLTKParseTree):
                     # Is it a wanted type?
                     if ent.node in self.types:
                         # Should we keep the PoS tag?
                         if self.keepPos:
                             txts = ['/'.join(token) for token in ent.leaves()]
                         else:
                             txts = [token[0] for token in ent.leaves()]
                         txt = ' '.join(txts)
                         new = {txt: {'text': txt,
                                      'occurences': occs,
                                      'proxLoc': proxLoc[:]}}
                         entityHash = self._mergeHash(entityHash, new)
     return entityHash
Developer: digging-into-data-berkeley, Project: cheshire3, Lines: 29, Source: extractor.py

Example 10: process_raw_text

def process_raw_text(text):
    """
        First some code to standardize the formatting, then basic nlp.
    """
    # Remove breaks and tabs
    for char in ["\t", "\n"]:
        text = text.replace(char, " ")
    text = text.replace('."', '".')
    text = text.replace(".'", "'.")
    # Split special characters from words
    for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
        text = text.replace(char, " " + char + " ")
    # Magic to remove all multi-spaces
    text = ' '.join(text.split())

    # get the words, sentences, POS tags, and chunks.
    chunks = [ tuple([ c.type for c in t.chunks ]) for t in parsetree(text) ]
    sentences = sent_tokenize(text)
    sentences = [ word_tokenize(s) for s in sentences ]
    sentences_tags = [ tuple([ (w, simplify_tag(t)) for w, t in pos_tag(s) ]) for s in sentences ]

    sentences = [ tuple([ w for w, _ in s]) for s in sentences_tags ]
    tags = [ tuple([ t for _, t in s]) for s in sentences_tags ]
    words = flatten(sentences)

    return tuple(words), tuple(sentences), tuple(tags), tuple(chunks)
Developer: RemideZ, Project: Stylometry, Lines: 26, Source: create_Datasets.py

Example 11: GetContractPage

def GetContractPage(x):
    url = 'http://www.defense.gov/contracts/contract.aspx?contractid=%d' % x
    html = urllib.urlopen(url).read()
    if re.search("The Official Home of the Department of Defense", html):
        return
    soup = BeautifulSoup(html)
    p_tags = soup.findAll("p")
    p_tags_text_list = [tag.text for tag in p_tags]

    tokenized_list = []

    for text in p_tags_text_list:
        tokenized_list = tokenize.word_tokenize(text)
        tokenized_list.append(nltk_tag.pos_tag(tokenized_list))
    tagged_list = tokenized_list[-1]

    data = {
        "url": url}

    for token in tagged_list[1:]:
        if token[1]=="NNP":
            data['entity'] = token[0]
            break
    

    for token in tagged_list[1:]:
        if token[1]=="CD":
            data['Amount'] = token[0]
            break

    print data
Developer: carriercomm, Project: scraperwiki-scraper-vault, Lines: 31, Source: dod_with_nltk.py

Example 12: processoFeatures

def processoFeatures(resposta):
    frases = tokenizerFrases.tokenize(resposta["corpo"])
    palavras = []
    palavrasTexto = {}
    for frase in frases:
        palavrasTemp = tokenizerPalavras.tokenize(frase)
        palavras.extend(palavrasTemp)  # collect the tokens so pos_tag below actually receives them
        for palavra in palavrasTemp:
            palavrasTexto[palavra] = True
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith("J"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith("V"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith("N"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith("R"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, "")
        if synsets != None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (palavrasTexto, "positivo")
    elif negativo > positivo:
        return (palavrasTexto, "negativo")
    else:
        return (palavrasTexto, "neutro")
Developer: vbozelli, Project: Sentiment-Analysis, Lines: 35, Source: criar_classificador_com_stopwords.py

Example 13: count_words_unigram_pos

def count_words_unigram_pos(input_filename, output_path=''):

    txt = get_file_text(input_filename)

    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    total_words = 0.

    matches = re.findall(word_regex, txt, re.M + re.S + re.U)
    for m in matches:
        word_frequency[m] = word_frequency.get(m, 0.) + 1.
        total_words+=1.

    sorted_words = sorted(word_frequency.iteritems(), key=operator.itemgetter(1))

    word_analysis = []
    for word in sorted_words:
        pos = pos_tag([word[0]])
        word_analysis.append([word[0], word[1], pos[0][1]])

    o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))

    o_file.close()
Developer: bchoatejr, Project: religion, Lines: 26, Source: nlp_word_tools.py

Example 14: extract_pos

def extract_pos(tokens, simple=True):
	"""
	Simple parts of speech of speech are:
	VERB - verbs (all tenses and modes)
	NOUN - nouns (common and proper)
	PRON - pronouns
	ADJ - adjectives
	ADV - adverbs
	ADP - adpositions (prepositions and postpositions)
	CONJ - conjunctions
	DET - determiners
	NUM - cardinal numbers
	PRT - particles or other function words
	X - other: foreign words, typos, abbreviations
	. - punctuation
	:param tokens:
	:return:
	"""
	tokens_pos = pos_tag(tokens)
	pos = [p for t, p in tokens_pos]
	if simple:
		# translate larger set of part of speech tags into small, simpler set
		pos_dict = nltk.tagset_mapping('en-ptb', 'universal')
		pos = [pos_dict[p] for p in pos]
	return pos
Developer: robert-giaquinto, Project: sentence_boundary_detection, Lines: 25, Source: extract_features.py
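
Example 14 relies on nltk.tagset_mapping to collapse Penn Treebank tags into the universal tagset listed in its docstring. As a small standalone sketch of that step (assuming NLTK's 'universal_tagset', 'punkt', and 'averaged_perceptron_tagger' resources have been downloaded), the mapping can be applied directly to pos_tag output:

import nltk
from nltk import word_tokenize
from nltk.tag import pos_tag

# Map fine-grained Penn Treebank tags ('en-ptb') onto the coarse universal tagset.
ptb_to_universal = nltk.tagset_mapping('en-ptb', 'universal')
tagged = pos_tag(word_tokenize("The dogs barked loudly"))
print([(word, ptb_to_universal[tag]) for word, tag in tagged])
# Expected output is along the lines of:
# [('The', 'DET'), ('dogs', 'NOUN'), ('barked', 'VERB'), ('loudly', 'ADV')]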

Example 15: lda_train

def lda_train(raw):
    stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    text_array = []
    for i in range(len(raw)):
        text = raw[i].lower()
        text = text.replace('\r\n', ' ')
        text = re.sub("[^a-z0-9]", " ", text)
        # Tokenization segments a document into its atomic elements.
        words = text.split()
        # Stop words
        # Certain parts of English speech, like (for, or) or the word the are meaningless to a topic model.
        # These terms are called stop words and need to be removed from our token list.
        words = [j for j in words if j not in stop]
        tokenized = nltk.word_tokenize(text)
        tagged_sent = pos_tag(words)
        words = [word for word,pos in tagged_sent if pos == 'NN']
        # Stemming words is another common NLP technique to reduce topically similar words to their root.
        # stemming reduces those terms to stem. This is important for topic modeling, which would otherwise view those terms as separate entities and reduce their importance in the model.
        #words = [p_stemmer.stem(s) for s in words]
        text_array.append(words)
    dictionary = corpora.Dictionary(text_array)
    dictionary.save('dictionary.dic')
    corpus = [dictionary.doc2bow(text) for text in text_array]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=20)
    filename = 'finalized_model_15.sav'
    joblib.dump(ldamodel, filename)
    print(ldamodel.print_topics(num_topics=15, num_words=6))
    return ldamodel,dictionary
Developer: ZhenqiWangC, Project: models, Lines: 31, Source: lda_train.py


Note: The nltk.tag.pos_tag function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by various developers, and the copyright of the source code belongs to the original authors; please consult each project's license before distributing or using it. Do not reproduce this article without permission.