當前位置: 首頁>>代碼示例>>Python>>正文


Python tag.UnigramTagger類代碼示例

本文整理匯總了Python中nltk.tag.UnigramTagger的典型用法代碼示例。如果您正苦於以下問題:Python UnigramTagger類的具體用法?Python UnigramTagger怎麼用?Python UnigramTagger使用的例子?那麼,這裡精選的類代碼示例或許可以為您提供幫助。


在下文中一共展示了UnigramTagger類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。

示例1: pos_tag

def pos_tag(pos_type, tokenized_sent):
    """Tag a tokenized sentence with the requested part-of-speech tagger.

    Args:
        pos_type: ``'unigram'`` trains a ``UnigramTagger`` on the pickled
            training data stored in ``res/brown_train.pkl``; ``'max_pos'``
            delegates to ``nltk.pos_tag``.
        tokenized_sent: The tokens to tag.

    Returns:
        The selected tagger's output for ``tokenized_sent``, or ``None``
        when ``pos_type`` is neither of the two supported values.
    """
    if pos_type == 'unigram':
        # Close the pickle file deterministically instead of leaking the
        # handle until garbage collection.
        # NOTE(review): pickle.load must only be used on trusted files.
        with open('res/brown_train.pkl', 'rb') as train_file:
            brown_train = pickle.load(train_file)
        return UnigramTagger(brown_train).tag(tokenized_sent)
    if pos_type == 'max_pos':
        return nltk.pos_tag(tokenized_sent)
    # Unknown tagger kinds keep the original behavior: no result.
    return None
開發者ID:merkhofer,項目名稱:parsel,代碼行數:7,代碼來源:nltk_magic.py

示例2: tag_unigrams_by_topic

 def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
     """Tag every topic's sentences with a unigram tagger.

     The tagger is trained on the first 5000 tagged sentences of the
     ``mac_morpho`` corpus.

     Args:
         dict_of_sentences_by_topic: Mapping of topic key to the
             sentences belonging to that topic.

     Returns:
         A new dict with the same keys whose values are the tagger's
         output for the corresponding sentences.
     """
     train_sents = mac_morpho.tagged_sents()[:5000]
     tagger = UnigramTagger(train_sents)
     # NLTK 3 renamed batch_tag() to tag_sents(); prefer the new name and
     # fall back to the old one so older installations keep working.
     tag_sentences = getattr(tagger, 'tag_sents', None) or tagger.batch_tag
     return {
         topic: tag_sentences(sentences)
         for topic, sentences in dict_of_sentences_by_topic.items()
     }
開發者ID:EduardoCarvalho,項目名稱:nltkPhraseDetector,代碼行數:7,代碼來源:extractPhrases.py

示例3: trainUniTnT

 def trainUniTnT(self):
     """Train unigram and TnT taggers separately, without a DefaultTagger.

     Calls ``self.split_into_folds()`` and then, once per fold, trains
     both taggers on all but the last entry of ``self.foldlist``, tags the
     last entry, and accumulates the results on ``self.tnt_tagged``,
     ``self.uni_tagged`` and ``self.org_tagged``. Afterwards it stores the
     taggers from the final iteration, the overall accuracies and the
     per-tag precision/recall on ``self``, then resets ``self.org_tagged``
     and rebuilds ``self.foldlist``.
     """
     self.split_into_folds()
     for k in range(1, (self.folds + 1)):
         # Training data: every fold except the last one, flattened into a
         # single list of sentences.
         train_sents = sum(self.foldlist[: (self.folds - 1)], [])
         tnt_tagger = tnt.TnT(N=100)
         tnt_tagger.train(train_sents)
         # The message says "evaluated", but at this point the tagger has
         # only been trained.
         print(str(k) + " fold: tnt evaluated")
         unigram = UnigramTagger(train_sents)
         print(str(k) + " fold: unigram evaluated")
         # The last fold is the held-out data: strip its tags, tag it with
         # both taggers, and keep the original as the reference.
         to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
         self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
         self.uni_tagged += unigram.tag_sents(to_tag)
         self.org_tagged += self.foldlist[self.folds - 1]
         # Rotate: move the last fold to the front so the next iteration
         # holds out a different fold.
         self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
     # Only the taggers trained in the final iteration are kept.
     # NOTE(review): if self.folds < 1 the loop never runs and these names
     # are unbound.
     self.tnt = tnt_tagger
     self.unigram = unigram
     # Accuracy is computed over all folds' sentences flattened into one
     # token-level sequence.
     self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
     self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
     print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
     print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
     (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
         tnt_tagger, self.tnt_tagged, self.org_tagged
     )
     (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
         unigram, self.uni_tagged, self.org_tagged
     )
     # Reset the following values so that trainRegexp starts from the
     # initial values.
     # NOTE(review): self.tnt_tagged and self.uni_tagged are not reset
     # here - confirm that is intended.
     self.org_tagged = []
     self.foldlist = []
     for i in range(1, self.folds + 1):
         self.foldlist.append(self.create_fold(i))
開發者ID:Batene,項目名稱:Bamanankan,代碼行數:32,代碼來源:CrossValidation.py

示例4: tag_words

	def tag_words(self, words, sents):
		"""Train a unigram tagger on the treebank corpus and print a score.

		Tags the first sentence in ``sents`` and prints the result of
		scoring the tagger against that tagged sentence.

		Args:
			words: Unused by this method.
			sents: Sequence of tokenized sentences; only ``sents[0]`` is used.
		"""
		train_sents = treebank.tagged_sents()
		tagger = UnigramTagger(train_sents)
		tagged_sent = tagger.tag(sents[0])
		# evaluate() expects a list of tagged sentences, so the single tagged
		# sentence is wrapped in a list instead of being passed bare.
		# NOTE(review): the reference tags are the tagger's own output, so
		# this score cannot reveal real tagging errors - confirm whether
		# held-out gold data was intended here.
		print(tagger.evaluate([tagged_sent]))
開發者ID:jayvachon,項目名稱:managerisk-reflection-search,代碼行數:10,代碼來源:sentiment-analysis.py

示例5: baseline

def baseline(tagged_sentences):
    """Train and score a unigram tagger with a most-frequent-tag backoff.

    Every token is lowercased and tokens whose tag is not in ``_UNI`` are
    dropped. The last 30% of the sentences are held out as the test set;
    the rest train a ``UnigramTagger`` (``cutoff=5``) that backs off to a
    ``DefaultTagger`` for the most common training tag. Corpus statistics
    and the final score are printed; nothing is returned.

    Args:
        tagged_sentences: Sequence of sentences, each a sequence of
            (word, tag) pairs.
    """
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # Normalize: lowercase every word first, then keep only the tokens
    # whose tag is in _UNI.
    normalized = []
    for sentence in tagged_sentences:
        lowered = [(pair[0].lower(), pair[1]) for pair in sentence]
        normalized.append([pair for pair in lowered if pair[1] in _UNI])
    tagged_sentences = normalized

    # Corpus statistics.
    doc_count = len(tagged_sentences)
    token_count = sum(len(sentence) for sentence in tagged_sentences)
    print('Corpus size: {} docs'.format(doc_count))
    print('Corpus size: {} tokens'.format(token_count))

    # The trailing 30% of the documents form the test set.
    held_out_fraction = 0.3
    split_at = doc_count - int(doc_count * held_out_fraction)
    train_set = tagged_sentences[:split_at]
    test_set = tagged_sentences[split_at:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))
    print('Test set: {} tokens'.format(sum(len(sentence) for sentence in test_set)))

    # Most frequent tag in the training data (the original author expected
    # this to be 'NOUN').
    tag_counts = Counter(pair[1] for sentence in train_set for pair in sentence)
    most_common = tag_counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Build the model and report its score on the held-out sentences.
    fallback = DefaultTagger(most_common)
    model = UnigramTagger(train=train_set, backoff=fallback, cutoff=5)
    print('Baseline: {}'.format(model.evaluate(test_set)))
開發者ID:lrei,項目名稱:xlime_twitter_corpus,代碼行數:50,代碼來源:experiment.py

示例6: getUnigramTaggerAccuracy

def getUnigramTaggerAccuracy(trainingSet, testingSet):
    """Train a UnigramTagger and return its accuracy on ``testingSet``.

    Args:
        trainingSet: Tagged sentences used to train the tagger.
        testingSet: Tagged sentences (sequences of (word, tag) pairs)
            whose tags serve as the gold standard.

    Returns:
        Whatever ``calculateAccuracy`` returns for the gold tags and the
        predicted tags.
    """
    def pick(sentences, index):
        # Project every (word, tag) pair of every sentence onto one field.
        return [[pair[index] for pair in sentence] for sentence in sentences]

    # Split the test data into bare words and gold tags.
    plainSentences = pick(testingSet, 0)
    goldTags = pick(testingSet, 1)

    # Train, tag the bare sentences, and keep only the predicted tags.
    tagger = UnigramTagger(trainingSet)
    predictedTags = pick(tagger.tag_sents(plainSentences), 1)

    return calculateAccuracy(goldTags, predictedTags)
開發者ID:kyajmiller,項目名稱:LING-539,代碼行數:16,代碼來源:q2.py

示例7: tag_penn

def tag_penn(words):
    """
    Tags words with a unigram tagger trained on the tagged sentences of
    the ``treebank`` corpus. A new tagger is trained on every call.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """
    return UnigramTagger(treebank.tagged_sents()).tag(words)
開發者ID:nwngeek212,項目名稱:NaturalLanguageProcessing,代碼行數:17,代碼來源:helper.py

示例8: contextual_rules

def contextual_rules(wikicorpus_dir, context_file):
    """Learn Brill contextual rules from the wikicorpus and save them.

    Every token tagged "NP" is replaced in place by the placeholder word
    "anonymous" before training. A unigram tagger trained on the
    sentences is the initial tagger for a ``BrillTaggerTrainer`` (at most
    100 rules), and each learned rule is written to ``context_file`` on
    its own line.

    Args:
        wikicorpus_dir: Passed to ``wikicorpus`` as the corpus location.
        context_file: Path of the rule file to write.
    """
    placeholder = "anonymous"
    corpus = wikicorpus(wikicorpus_dir, words=1000000)

    # Per the original author's note, "NP" is the proper-noun tag in the
    # Parole tagset; those words are anonymized.
    for sentence in corpus:
        for position, (_, tag) in enumerate(sentence):
            if tag == "NP":
                sentence[position] = (placeholder, "NP")

    templates = fntbl37()
    initial_tagger = UnigramTagger(corpus)
    trainer = BrillTaggerTrainer(initial_tagger, templates, trace=0)
    brill_tagger = trainer.train(corpus, max_rules=100)

    with open(context_file, "w") as out:
        out.writelines("%s\n" % rule for rule in brill_tagger.rules())
開發者ID:jgsogo,項目名稱:lingwars,代碼行數:20,代碼來源:pattern_wikicorpus.py

示例9: make_pos_model

def make_pos_model(model_type):
    """Train a POS tagger of the requested kind and pickle it to disk.

    The training data is read from ``greek_training_set.pos`` in the
    current directory, and the model is written below
    ``~/greek_models_cltk/taggers/pos``.

    Args:
        model_type: One of ``'unigram'``, ``'bigram'``, ``'trigram'``,
            ``'backoff'`` (trigram -> bigram -> unigram chain) or
            ``'tnt'``.

    Returns:
        None. For an unsupported ``model_type`` the message
        ``'Invalid model_type.'`` is printed and nothing is trained or
        written.
    """
    # Validate before doing any work. The original printed the error and
    # then crashed on the unbound ``tagger``/``file`` variables.
    supported = ('unigram', 'bigram', 'trigram', 'backoff', 'tnt')
    if model_type not in supported:
        print('Invalid model_type.')
        return

    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file_name = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file_name = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file_name = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file_name = '123grambackoff.pickle'
    else:  # model_type == 'tnt'
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file_name = 'tnt.pickle'

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file_name)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
開發者ID:wencanluo,項目名稱:greek_treebank_perseus,代碼行數:32,代碼來源:make_pos_models.py

示例10: tag_linked

def tag_linked(words, default_tag='INFO'):
    """
    Tags words with a unigram tagger trained on the tagged sentences of
    the ``treebank`` corpus, backed off to a DefaultTagger that assigns
    "default_tag" to any word the unigram tagger cannot tag.

    Parameters
    ----------
    words: A list of strings.
    default_tag: Tag given to words the unigram tagger has no tag for.

    Returns
    -------
    A list of tuples of (str, str)
    """

    default_tagger = DefaultTagger(default_tag)
    # Chain the fallback through the public ``backoff`` argument instead
    # of overwriting the tagger's private ``_taggers`` list.
    pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)

    return pt_tagger.tag(words)
開發者ID:nwngeek212,項目名稱:NaturalLanguageProcessing,代碼行數:23,代碼來源:helper.py

示例11: PyTenseShift

class PyTenseShift(object):
    """Base class for shifting sentences from present to past tense.

    The tagger used for tokenizing is chosen at construction time. The
    original author's note says callers may supply their own tagger
    implementation (following the project's tagger interface) to improve
    the present-to-past results.
    """

    def __init__(self, corpus, isPl):
        """Set up the tagger.

        When ``isPl`` is true a ``FirstTagger`` built from ``corpus`` is
        stored on ``self.tagger``. Otherwise a ``UnigramTagger`` trained
        on ``corpus.tagged_sents()`` with a ``DefaultTagger("NN")``
        backoff is stored on a private attribute.
        """
        if isPl:
            self.tagger = FirstTagger(corpus)
            return
        fallback = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=fallback)

    def _tokenize(self, tense, isPl):
        """Tag the input sentence, giving a form that is easier to evaluate.

        With ``isPl`` true the raw input goes straight to ``self.tagger``;
        otherwise it is first split by ``tokenize`` and tagged by the
        unigram tagger.
        """
        if not isPl:
            return self.__utag.tag(tokenize(tense))
        return self.tagger.tag(tense)

    def getPastTense(self, tense):
        """Translate a sentence given in present tense into past tense.

        Args:
            tense (str): Sentence to translate.
        Returns:
            str. Sentence in past tense.
        Raises:
            NotImplementedError: Always; subclasses must override.
        """
        raise NotImplementedError("abstract method")
開發者ID:perfidia,項目名稱:pytenseshift,代碼行數:36,代碼來源:__init__.py

示例12: write_word_list

from nltk.corpus import brown
from nltk.tag import UnigramTagger
import cPickle as pickle

# Path of the pickled word collection that the __main__ section loads.
INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"

def write_word_list(filename, word_list):
    """Write the words to ``filename``, one per line.

    Args:
        filename: Path of the text file to create or overwrite.
        word_list: Iterable of strings to write.

    The output is the words joined by newlines followed by one trailing
    newline, which is what the original ``print >> out_fp`` produced.
    """
    # The context manager flushes and closes the file; the original left
    # the handle open. ``write`` also works on both Python 2 and 3,
    # unlike the ``print >>`` statement.
    with open(filename, "w") as out_fp:
        out_fp.write("\n".join(word_list) + "\n")

if __name__ == '__main__':
    # Load the candidate words, closing the input file as soon as it has
    # been read (the original never closed it).
    # NOTE(review): pickle.load must only be used on trusted files.
    with open(INPUT_FILE, "rb") as in_fp:
        words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        # Skip words the tagger has no tag for, and words whose tag
        # contains "NP".
        if tag is None or "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
開發者ID:viveksck,項目名稱:langchange,代碼行數:23,代碼來源:brown_words.py

示例13:

import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
# Train a unigram tagger on at most the first 7000 tagged treebank sentences.
training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
# Show the first untagged sentence, then the tagger's output for it.
first_sentence = treebank.sents()[0]
print(first_sentence)
print(unitagger.tag(first_sentence))
開發者ID:PacktPublishing,項目名稱:Mastering-Natural-Language-Processing-with-Python,代碼行數:7,代碼來源:ch4_16.py

示例14:

import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank
# Split the corpus once at index 2000 so the evaluation sentences are
# never seen during training. The original slices ([:7000] for training,
# [2000:] for testing) overlapped, so the tagger was scored on sentences
# it had been trained on.
tagged_sents = treebank.tagged_sents()
training = tagged_sents[:2000]
testing = tagged_sents[2000:]
# Words the unigram tagger has no tag for fall back to 'NN'.
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
開發者ID:xenron,項目名稱:sandbox-da-python,代碼行數:9,代碼來源:ch4_19.py

示例15: UnigramTagger

import nltk
import json

from nltk.corpus import brown
from nltk.tag import UnigramTagger
# Unigram tagger trained on the Brown corpus with the 'universal' tagset.
tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
    if tag == "VERB":
        print(word, '->', tag)


# Write every word tagged VERB in the questions file to the output file,
# one per line. Both files are managed by ``with`` so they are flushed
# and closed (the original never closed the output file). The output is
# opened first, as in the original.
with open("../assets/inputText/verbs_tagged_questions.txt", 'w+') as verbs_tagged, \
        open("../assets/inputText/all_questions.txt", 'r') as all_lines:
    for line in all_lines:
        # split() with no argument drops the trailing newline and any
        # empty tokens, which split(' ') passed on to the tagger.
        for word, tag in tagger.tag(line.split()):
            if tag == "VERB":
                verbs_tagged.write(word + "\n")
				



開發者ID:diana-wang,項目名稱:NLP_Research,代碼行數:20,代碼來源:unigramTagging.py


注:本文中的nltk.tag.UnigramTagger類示例由純淨天空整理自Github/MSDocs等開源代碼及文檔管理平台,相關代碼片段篩選自各路編程大神貢獻的開源項目,源碼版權歸原作者所有,傳播和使用請參考對應項目的License;未經允許,請勿轉載。