

Python nltk.pos_tag Function Code Examples

This article collects and summarizes typical usage examples of Python's nltk.pos_tag function. If you have been wondering how to use pos_tag, what its behavior is, or what real calls to it look like in practice, the curated examples below should help.


Below are 15 code examples of the pos_tag function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
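Before the project examples, here is a minimal self-contained sketch of the typical call pattern: tokenize the text first, then pass the token list to nltk.pos_tag (passing a raw string would tag each character). It assumes the punkt and averaged_perceptron_tagger data packages have already been downloaded.

import nltk

# One-time setup if the data packages are missing (uncomment as needed):
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

tokens = nltk.word_tokenize("And now for something completely different")
print(nltk.pos_tag(tokens))
# Output (may vary slightly across NLTK versions):
# [('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'),
#  ('completely', 'RB'), ('different', 'JJ')]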

Example 1: create_synonyms

def create_synonyms(orig_word):
    '''
    Function for creating synonyms for the given word.
    '''
    try:
        headers = {
            "X-Mashape-Key": "aIder4iWr4msh5Scn073WRoddmAEp1qA0I3jsnSR8lfJwtyzpg",
            "Accept": "application/json"}

        response = requests.get("https://wordsapiv1.p.mashape.com/words/{}/synonyms".format(orig_word), headers=headers)
        if response.status_code == 200:
            json = response.json()
            synonyms = json['synonyms']
            # synonyms = nltk.word_tokenize(synonyms)
            synonyms = nltk.pos_tag(synonyms)
            word = nltk.word_tokenize(orig_word)
            word = nltk.pos_tag(word)[0]
            print(synonyms)
            good_syns = []
            for syn in synonyms:
                print(word[1], syn[1])
                if word[1] == syn[1]:
                    print('*')
                    good_syns.append(syn[0])
            word, _ = Word.objects.get_or_create(word=orig_word)  # get_or_create returns a (object, created) tuple
            for syn in good_syns[:2]:
                try:
                    new_word = Word.objects.create(word=syn.lower(), is_synonym=True)
                except Exception:
                    new_word = Word.objects.get(word=syn.lower())  # the word already exists, so fetch it instead
                syn = Synonym.objects.create(word=new_word)
                syn.synonym_to.add(word)
            return good_syns
    except Exception as e:
        print(e)
Developer: Dambre, Project: social_advisor, Lines of code: 35, Source file: dictionary.py

Example 2: extract_pos_pair

def extract_pos_pair(event_mention_1, event_mention_2):
    trigger1=""
    extent1=""
    trigger2=""
    extent2=""
    for one_anchor in event_mention_1.findall("anchor"):
        trigger1=one_anchor[0].text
    for one_anchor in event_mention_2.findall("anchor"):
        trigger2=one_anchor[0].text
    for one_extent in event_mention_1.findall("extent"):
        extent1=one_extent[0].text
    for one_extent in event_mention_2.findall("extent"):
        extent2=one_extent[0].text
    text1 = nltk.word_tokenize(extent1)
    dict1 = nltk.pos_tag(text1)
    for one_pair in dict1:
        if one_pair[0] in trigger1 or trigger1 in one_pair[0]:
            pos1=one_pair[1]
            break
    text2 = nltk.word_tokenize(extent2)
    dict2 = nltk.pos_tag(text2)
    for one_pair in dict2:
        if one_pair[0] in trigger2 or trigger2 in one_pair[0]:
            pos2=one_pair[1]
            break
    return (pos1, pos2)
Developer: wtl-zju, Project: KBP2015, Lines of code: 26, Source file: coref_feature_extraction.py

Example 3: writeOut

def writeOut(lsummary_out, allwordsphrases=[],  outputpath='.', gridset=''):    
 
    # Write data out for the last folder (gridset) encountered - MUST BE A BETTER WAY THAN THIS?
    uWordsPhrases = uniqueSet(allwordsphrases)              # Set of unique words.
    uwords =[]
    uphrases = []
    words = []
    phrases =[]
    wordtypes = []
    total_wordsphrases = total_uwordsphrases = total_words = total_phrases = 0

    ldata_out = UnicodeWriter(open(outputpath + '/'+ gridset +'/language-data.csv', 'wb'), delimiter=',', quotechar='"')
    ldata_out.writerow(["WORD", "NUMBER OF WORDS", "COUNT", "TYPE"])
    
    # Output metrics to file.
    for item in uWordsPhrases:
       num_words = len(item.split())
       item_count = allwordsphrases.count(item)
       if num_words == 1:                          # Single word
          word_type = nltk.pos_tag([item])[-1][-1]  # tag the single word (pass a list, not a raw string, or each character gets tagged)
          #word_type_help = nltk.help.upenn_tagset(word_type)
# MAYBE CONVERT TAGS INTO MORE USEFUL WORDS?!
          ldata_out.writerow([item, str(num_words), str(item_count), word_type])
          uwords.append(item)
          wordtypes.append(word_type)
       elif num_words > 1:                         # Phrase
          nltk_words = nltk.word_tokenize(item)
          word_pos = nltk.pos_tag(nltk_words) ### HOW TO DEAL WITH PHRASES???
          word_types = [x[1] for x in word_pos]
          ldata_out.writerow([item, str(num_words), str(item_count), " ,".join(word_types)])
# HOW TO OUTPUT EACH POS TO A COLUMN???
          uphrases.append(item)

    for item in allwordsphrases:
        num_words = len(item.split())
        if num_words == 1:
            words.append(item)
        elif num_words > 1:
            phrases.append(item)
        
    uword_types = countDuplicatesInList(wordtypes)
    
    total_wordsphrases = len(allwordsphrases)
    total_uwordsphrases = len(uWordsPhrases)
    total_uwords = len(uwords)
    total_uphrases = len(uphrases)

    total_words = len(words)
    total_phrases = len(phrases)
    
    #["File Name", "Total Words or Phrases", "Total Unique Words or Phrases", "Total Words", "Total Phrases", "Total Unique Words", "Total Unique Phrases", "Types of Word"])
    lsummary_out.writerow([gridset, str(total_wordsphrases), str(total_uwordsphrases), str(total_words), str(total_phrases), str(total_uwords), str(total_uphrases), ', '.join(map(str, uword_types))])

    raw_words_out = open(outputpath + '/'+ gridset +'/raw-unique-words.text', 'wb')
    raw_words_out.writelines('\n'.join(uWordsPhrases).encode('utf-8'))
    raw_phrases_out = open(outputpath + '/'+ gridset +'/raw-unique-phrases.txt', 'wb')
    raw_phrases_out.writelines('\n'.join(uphrases).encode('utf-8'))
    raw_words_out = open(outputpath + '/'+ gridset +'/raw-wordsphrases.text', 'wb')
    raw_words_out.writelines('\n'.join(allwordsphrases).encode('utf-8'))
Developer: simonjudge, Project: AAC-Tools, Lines of code: 60, Source file: wordlistMetrics.py

Example 4: nltk_filter

def nltk_filter(sent):
  b1, b2 = sent.split(blockSeparator)
  b2 = b2.rstrip()

  b1            = b1.lower()
  tokens        = word_tokenize(b1)
  pos_tags      = pos_tag(tokens)
  filtered_sent = ' '
  for token in tokens:
    filtered_sent += '1'+token + ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '1' + stemmer.stem(pos_t[0]) + ' '

#note: 1 concat stemmer(word) == stemmer(1 concat word)

  b2            = b2.lower()
  tokens        = word_tokenize(b2)
  pos_tags      = pos_tag(tokens)
  # filtered_sent = ' '
  # for pos_t in pos_tags:
  #   if pos_t[1] in filterList:
  #     #filtered_sent += stemmer.stem(pos_t[0]) + ' '
  #     filtered_sent += '2' + stemmer.stem(pos_t[0]) + ' '

  for token in tokens:
    filtered_sent += '2' + token + ' '

  return filtered_sent
Developer: gthandavam, Project: Recipes, Lines of code: 30, Source file: builder.py

Example 5: load_data

def load_data(path):
    sentences_pos = []
    r1 = re.compile(r'\<([^ ]+)\>')
    r2 = re.compile(r'\$US(\d)')
    for l in open(path):
        if not l.strip():
            continue
        l = l.decode('utf-8')
        l = l.replace(u'’', "'")
        l = l.replace(u'``', '"')
        l = l.replace(u"''", '"')
        l = l.replace(u"—", '--')
        l = l.replace(u"–", '--')
        l = l.replace(u"´", "'")
        l = l.replace(u"-", " ")
        l = l.replace(u"/", " ")
        l = r1.sub(r'\1', l)
        l = r2.sub(r'$\1', l)
        s = l.strip().split('\t')
        sa, sb = tuple(nltk.word_tokenize(s)
                          for s in l.strip().split('\t') if s) # ignore double \t
        sa, sb = ([x.encode('utf-8') for x in sa],
                  [x.encode('utf-8') for x in sb])

        for s in (sa, sb):
            for i in xrange(len(s)):
                if s[i] == "n't":
                    s[i] = "not"
                elif s[i] == "'m":
                    s[i] = "am"
        sa, sb = fix_compounds(sa, sb), fix_compounds(sb, sa)
        sentences_pos.append((nltk.pos_tag(sa), nltk.pos_tag(sb)))
    return sentences_pos
Developer: STS-NTNU, Project: STS13, Lines of code: 33, Source file: simpfeats.py

Example 6: replace_proper_nouns

    def replace_proper_nouns(self, o_sent, n_sent):
        proper_nouns = []
        p_pnouns = []

        o_tagged = pos_tag(word_tokenize(o_sent))
        n_tagged = pos_tag(word_tokenize(n_sent))
        # print("\nTransforming the output:")
        # print("Input sentence:", o_sent)
        # print("Found sentence:", n_sent)
        # print("Input sentence tagged:", o_tagged)
        # print("Found sentence tagged:", n_tagged)

        for o in o_tagged:
            if o[1] == 'NNP' and o not in proper_nouns:
                proper_nouns.append(o)

        for n in n_tagged:
            if (n[1] == 'PRP' or n[1] == 'PRP$' or n[1] == 'NNP') and n not in p_pnouns:
                p_pnouns.append(n)

        # print("")

        if (len(proper_nouns) == 1) and (len(p_pnouns) > 0):
            n_sent = sub(r"\b%s\b" %p_pnouns[0][0] , proper_nouns[0][0], n_sent, 1)
            gender = self.gp.classify(proper_nouns[0][0])
            # print(proper_nouns[0][0], "is classified as", gender)
            for pnoun in p_pnouns:
                n_pnoun = self.change_gender(pnoun[0], gender)
                n_sent = sub(r"\b%s\b" %pnoun[0] , n_pnoun, n_sent)
        elif len(proper_nouns) < 1:
            print("No proper nouns to replace")
        else:
            print("Not yet implemented, :P")

        return n_sent
Developer: theopak, Project: storytellingbot, Lines of code: 35, Source file: Extrapolate.py

Example 7: normalize_word

def normalize_word(word, lowercase=True, lemmatize=True):
    "Normalize word by stripping plural nouns"
    global NORMWORD_CACHE
    global NORMWORD_POS
    if NORMWORD_WNL is None:
        init_normword_wnl()
    if lowercase:
        word = word.lower()
    if word in NORMWORD_CACHE:
        return NORMWORD_CACHE[word]
    if not lemmatize:
        return word
    treebank_tag = nltk.pos_tag([word])[0][1]
    newword = word
    if ( len(newword) > 4 ) and ( treebank_tag == 'NNS' ):
        #  Only lemmatize plural nouns, leave verbs alone
        wnpos = get_wordnet_pos(treebank_tag)
        if wnpos:
            newword = NORMWORD_WNL.lemmatize(newword, wnpos)
        if newword != word:
            LOGGER.debug('Changing %s to %s' % (word, newword))
        NORMWORD_POS[newword] = nltk.pos_tag([newword])[0][1]
    else:
        NORMWORD_POS[word] = treebank_tag
    NORMWORD_CACHE[word] = newword
    return newword
Developer: markgraves, Project: sanal, Lines of code: 26, Source file: sautil.py

Example 8: test_nltkNERParsing

    def test_nltkNERParsing(self):
        testString = 'Natural Sciences and Engineering Research Council of Canada'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        getGPEs = []

        for treeBranch in chunked:
            if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
                getGPEs.append(str(treeBranch))

        self.assertEqual(1, len(getGPEs))

        testString = 'Milwaukee Foundation'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (GPE New/NNP)(ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
Developer: kyajmiller, Project: Cerebro, Lines of code: 29, Source file: TestClassifyBadScholarships.py

Example 9: printer

def printer(sentencescorelist, sentenceList, wordscorelist, wordList):
    outFile = open('./tldr/outFile.txt', 'w')
    for s in range(0, len(sentenceList)):
        if s in sentencescorelist:
            printsentence(sentenceList[s], outFile)
    outFile.write("Topics to research: ")

    topics = []
    numtopics = 3
    poswords = nltk.pos_tag(wordList)
    poskeep = ["NN", "NNS", "NNP", "NNPS"]

    while numtopics > 0:
        temp = max(wordscorelist.iteritems(), key=operator.itemgetter(1))[0]
        templist = [temp]
        templist = nltk.pos_tag(templist)
        if templist[0][1] in poskeep:
            numtopics -= 1
            topics.append(temp)
        del wordscorelist[temp]
    for i in range(0, len(topics)):
        if i != len(topics) - 1:
            outFile.write(topics[i] + ", ")
        else:
            outFile.write(topics[i])
    outFile.close()
Developer: fernandest, Project: TLDR_Twist, Lines of code: 26, Source file: main.py

Example 10: parse_stock_name

    def parse_stock_name(self, stockname):
        p = engine()

        instruction_set = stockname.split(',')
        word_list = instruction_set[0].split(' ')
        index = 1
        categories_ignored = ['RB', 'TO']
        tokens = word_tokenize(instruction_set[0])
        tags = pos_tag(tokens)
        i=0
        while i < len(tags):
            if tags[i][1] in categories_ignored:
                index += 1
                i+= 1
            else:
                break

        quantity = word_list[index-1]
        disallowed = ['g', 'ml', 'x', 'kg', 'cups', 'cup', 'grams', 'can', 'tbsp', 'tsp', 'tbsps', 'tsps',
                 'small', 'bunch', 'piece', 'handful', 'pack', 'chopped', 'large', 'a', 'pinch',
                 'fresh', 'dried', 'heaped', 'thick', 'slices', 'slice', 'of', 'about']
        while index < len(word_list):
            if word_list[index] not in disallowed:
                break
            else:
                index+=1
        sentence = " ".join(word_list[index:])
        tokens = word_tokenize(sentence)
        categories = pos_tag(tokens)
        words = []
        for category in categories:
            if category[1] not in ['NNS', 'VBN', 'VBG']:
                words.append(category[0])
        word = " ".join(words)
        return quantity, word, None
Developer: Godley, Project: MealPlanner, Lines of code: 35, Source file: pipelines.py

Example 11: test

def test(ws,wf,s,pf,wm,alfa2):
    f1=open('test_data.data','rb')
    f2=open('test.csv','rb')
    val_text=f1.read()
    comt=f2.read().splitlines()
    val_lines=val_text.splitlines()
    acc=0
    lc=0
    for line in val_lines:
        token = line.split(' | ')
        token[2]="<S> "+token[2]+" <E>"
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S> ":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!=" <E>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<E>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff,alfa2)
        if val_label==comt[lc].split(",")[1]:
            acc+=1
        lc+=1
    print float(acc)/len(val_lines)
    f1.close()
    f2.close()
Developer: saumyakb, Project: CS4740-NLP-Superwised-WSD, Lines of code: 27, Source file: trainer2.py

Example 12: m_surrounding

 def m_surrounding(self):
    D = {}
    sent = self.sentence["form"]
    l = len(sent)
    #print sent 
    K = self.index
    '''
    for k in range(l):
        if sent[k] == self.word:
            K = k
            break
    '''
    #print K, l
    tagp = tagn = ""
    if (K+1) < l:
        tagn = nt.word_tokenize(sent[K+1])
        tagn = nt.pos_tag(tagn)     
    if (K-1) >=0:
        tagp = nt.word_tokenize(sent[K-1])
        tagp = nt.pos_tag(tagp)        
        
    if tagp != "":
        D["ptag"] = tagp[0][1]
    else: 
        D["ptag"] = ""
    if tagn != "":    
        D["ntag"] = tagn[0][1]
    else:
        D["ntag"] = ""
        
    print D
    return D 
Developer: korlev91, Project: CWI---complex-word, Lines of code: 32, Source file: WordFeatures.py

Example 13: score_glove_pos

def score_glove_pos(src, dst, numpy_arrays, labels_array, g, normalize=True):
	b1 = []
	b2 = []
	lines = 0
	with open(src) as p:
		for i, line in enumerate(p):
			s = line.split('\t')
			b1.append(s[0])
			b2.append(s[1][:-1]) #remove \n
			lines = i + 1

	b1_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b1]
	b2_pos = [nltk.pos_tag(nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))) for text in b2]

	res = []
	for i in range(lines):
		tags1 = [tag[0] for tag in b1_pos[i] if tag[1] in NOUN]
		tags2 = [tag[0] for tag in b2_pos[i] if tag[1] in NOUN]
		r = [1 - spatial.distance.cosine(g[tag1], g[tag2]) for tag1 in tags1 for tag2 in tags2 if tag1 in labels_array and tag2 in labels_array]
		if len(r) == 0:
			res.append(0)
		else:
			res.append(round(5*max(r), 2))

	if normalize:
		res = normarlize_score(res)
			
	with open(dst, 'w') as thefile:
		thefile.write("\n".join(str(i) for i in res))
	print src + ' finished!'
Developer: wintor12, Project: SemEval2015, Lines of code: 30, Source file: run.py

Example 14: test

def test(ws,wf,s,pf):
    f1=open('validation_data.data','rb')
    #f2=open('test_data.csv','w')
    val_text=f1.read()
    val_lines=val_text.splitlines()
    acc=0

    for line in val_lines:
        token = line.split(' | ')
        t_t =token[2].split(' %% ')
        if t_t[0]!="<S>":
            bff = nltk.pos_tag(t_t[0].split(".")[-1].split(" "))[-1][1]
        else:
            bff="<S>"
        if t_t[2]!="<\S>":
            aff = nltk.pos_tag(t_t[2].split(".")[0].split(" "))[0][1]
        else:
            aff="<\S>"
        val_label = nb(ws,wf,s,token[0],pf,aff,bff)
        #f2.write(token[0]+" | "+val_label+" | "+token[2])
    #f1.close()
    #f2.close()
    #print "Done"
    
        

        if val_label==token[1]:
            acc+=1
    print float(acc)/len(val_lines)
Developer: saumyakb, Project: CS4740-NLP-Superwised-WSD, Lines of code: 29, Source file: MakeDictKaggle.py

Example 15: expand_with_wordnet

def expand_with_wordnet(query):
    """
    This function expands every contentful word in the query with its wordnet
    definition. The word itself is not removed. Stop words are removed from the
    word definition as well.
    (Contentful means that it is not a stopword or punctuation sign)

    INPUT:
        query   --  user query that is a simple string
    OUTPUT:
        expanded_query  --  user query + definitions of contentful words
    """
    stop = stopwords.words("english")
    stop += EXCLUDED
    contentful_tokens = [tok for tok in query.split() if tok not in stop]
    # take the first definition for the current word
    defs = []
    for token in contentful_tokens:
        syn1 = wn.synsets(token, pos=wn.ADJ)[:1]
        syn2 = wn.synsets(token, pos=wn.NOUN)[:1]
        # prefer the adjective definition; otherwise fall back to the noun definition
        if syn1:
            defs.append(token)
            def_tokenized = word_tokenize(syn1[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
        elif syn2:
            defs.append(token)
            def_tokenized = word_tokenize(syn2[0].definition())
            [defs.append(t[0]) for t in pos_tag(def_tokenized) if t[1] in ["NN", "JJ"]]
    # expansion can add some EXCLUDED words back in the query
    defs = set(defs) - set(EXCLUDED)  # removing again
    expanded = " ".join(defs)
    return expanded
Developer: tastyminerals, Project: cocktail_bot, Lines of code: 33, Source file: cocktail_ir.py


Note: The nltk.pos_tag function examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult the corresponding project's License before redistributing or using the code; do not reproduce without permission.