

Python nltk.corpus Code Examples

This article collects typical code examples of using nltk.corpus in Python. If you have been wondering what nltk.corpus is for, how to use it, or what real-world code that uses it looks like, the hand-picked examples below should help. You can also browse further usage examples from the nltk package that provides it.


The following presents 15 code examples of nltk.corpus, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
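Before diving into the examples, here is a minimal orientation sketch of how nltk.corpus is typically imported and used. It is a hedged illustration, not taken from any of the projects below, and assumes the relevant corpora have already been fetched with nltk.download().

# Minimal sketch of typical nltk.corpus usage (assumes nltk.download('wordnet'),
# nltk.download('brown') and nltk.download('stopwords') have been run).
from nltk.corpus import wordnet, brown, stopwords

print(wordnet.synsets('dog')[:3])                     # WordNet synsets for a word
print(brown.tagged_sents(categories='news')[0][:5])   # first tagged sentence of the Brown 'news' section
print(stopwords.words('english')[:10])                # English stopword list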

Example 1: closure

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]

        """
        from nltk.util import breadth_first
        synset_offsets = []
        for synset in breadth_first(self, rel, depth):
            if synset._offset != self._offset:
                if synset._offset not in synset_offsets:
                    synset_offsets.append(synset._offset)
                    yield synset 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 26, Source file: wordnet.py

Example 2: res_similarity

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node).

        :type  other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects.
            Synsets whose LCS is the root node of the taxonomy will have a
            score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return lcs_ic 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 20, Source file: wordnet.py

Example 3: lin_similarity

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Return a score denoting how similar two word senses are, based on the
        Information Content (IC) of the Least Common Subsumer (most specific
        ancestor node) and that of the two input Synsets. The relationship is
        given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` that this ``Synset`` is being compared to.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :return: A float score denoting the similarity of the two ``Synset`` objects,
            in the range 0 to 1.
        """

        ic1, ic2, lcs_ic = _lcs_ic(self, other, ic)
        return (2.0 * lcs_ic) / (ic1 + ic2) 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 20, Source file: wordnet.py
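For context, the two similarity methods above are normally reached through the WordNet corpus reader together with an information-content file. The following is a hedged usage sketch, assuming the 'wordnet' and 'wordnet_ic' data packages are installed.

# Hedged usage sketch for res_similarity and lin_similarity
# (assumes nltk.download('wordnet') and nltk.download('wordnet_ic') have been run).
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic

brown_ic = wordnet_ic.ic('ic-brown.dat')    # information content estimated from the Brown corpus
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')

print(dog.res_similarity(cat, brown_ic))    # Resnik: IC of the least common subsumer
print(dog.lin_similarity(cat, brown_ic))    # Lin: 2 * IC(lcs) / (IC(s1) + IC(s2)), in [0, 1]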

Example 4: get_pos_tagger

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def get_pos_tagger(self):
        from nltk.corpus import brown
        regexp_tagger = RegexpTagger(
            [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
             (r'(The|the|A|a|An|an)$', 'AT'),   # articles
             (r'.*able$', 'JJ'),                # adjectives
             (r'.*ness$', 'NN'),                # nouns formed from adjectives
             (r'.*ly$', 'RB'),                  # adverbs
             (r'.*s$', 'NNS'),                  # plural nouns
             (r'.*ing$', 'VBG'),                # gerunds
             (r'.*ed$', 'VBD'),                 # past tense verbs
             (r'.*', 'NN')                      # nouns (default)
        ])
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        #Override particular words
        main_tagger = RegexpTagger(
            [(r'(A|a|An|an)$', 'ex_quant'),
             (r'(Every|every|All|all)$', 'univ_quant')
        ], backoff=trigram_tagger)

        return main_tagger 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 27, Source file: glue.py
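The method above builds a backoff chain: regexp patterns as the last resort, then unigram, bigram and trigram taggers trained on the Brown 'news' section, with a small regexp tagger for quantifiers on top. Below is a hedged, standalone sketch of the same idea; the nltk.tag imports are an assumption about what the original module pulls in, and the pattern list is shortened for brevity.

# Hedged sketch of a backoff tagger chain like the one in get_pos_tagger.
from nltk.corpus import brown
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger

regexp_tagger = RegexpTagger([(r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*', 'NN')])
train = brown.tagged_sents(categories='news')
unigram = UnigramTagger(train, backoff=regexp_tagger)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)

print(trigram.tag(['Every', 'dog', 'barks']))   # unseen words fall back through the chain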

Example 5: ieer_headlines

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def ieer_headlines():

    from nltk.corpus import ieer
    from nltk.tree import Tree
    
    print("IEER: First 20 Headlines")
    print("=" * 45)  
    
    trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)]
    for tree in trees[:20]:
        print()
        print("%s:\n%s" % tree)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
############################################# 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 20, Source file: relextract.py

Example 6: conllesp

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def conllesp():
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    rels = [rel for doc in conll2002.chunked_sents('esp.train')
            for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)]
    for r in rels[:10]: print(clause(r, relsym='DE'))
    print() 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 21, Source file: relextract.py

Example 7: evaluate

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
# Requires scikit-learn metrics:
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluate(trueValues, predicted, decimals, note):
	print(note)
	label = 1
	avg = 'weighted'
	a = accuracy_score(trueValues, predicted)
	p = precision_score(trueValues, predicted, pos_label=label, average=avg)
	r = recall_score(trueValues, predicted, pos_label=label, average=avg)
	avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
	fclasses = f1_score(trueValues, predicted, average=None)
	f1c1 = fclasses[0]; f1c2 = fclasses[1]
	fw = (f1c1 + f1c2)/2.0

	print('accuracy:\t', str(round(a, decimals)))
	print('precision:\t', str(round(p, decimals)))
	print('recall:\t', str(round(r, decimals)))
	print('avg f1:\t', str(round(avg_f1, decimals)))
	print('c1 f1:\t', str(round(f1c1, decimals)))
	print('c2 f1:\t', str(round(f1c2, decimals)))
	print('avg(c1,c2):\t', str(round(fw, decimals)))
	print('------------')

###################################################################################


# split a parallel or comparable corpus into two parts 
Developer ID: motazsaad, Project: comparable-text-miner, Lines of code: 27, Source file: textpro.py
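A hypothetical call to the evaluate helper above, with toy binary labels; it assumes scikit-learn is installed and the metrics functions are imported as noted in the snippet.

# Hypothetical usage of evaluate() with made-up labels.
gold      = [1, 0, 1, 1, 0, 1]
predicted = [1, 0, 0, 1, 0, 1]
evaluate(gold, predicted, 3, 'toy run')   # prints accuracy, precision, recall and per-class F1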

Example 8: build_lsi_model

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def build_lsi_model(corpus_name, corpus_path, topics=300):
	logging.info( 'building lsi model for %s corpus', corpus_name )
	dictFile = corpus_path + corpus_name + '.dict'
	corpus_tfidf_file = corpus_path + corpus_name + '.tfidf.mm'
	
	logging.info( 'loading dictionary ...' )
	dictionary = corpora.Dictionary.load(dictFile)
	logging.info( 'loading tfidf corpus ...' )
	corpus_tfidf = corpora.MmCorpus(corpus_tfidf_file)
	logging.info( 'building lsi model' )
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=topics)
	logging.info( 'saving lsi' )
	lsiFile = corpus_path + corpus_name + '.lsi'
	lsi.save(lsiFile)
	logging.info( 'lsi model is ready' )
################################################################################## 
Developer ID: motazsaad, Project: comparable-text-miner, Lines of code: 18, Source file: textpro.py
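A hedged usage sketch for build_lsi_model; the corpus name and path below are made up, and it assumes the project has already produced the .dict dictionary and .tfidf.mm matrix files that gensim loads.

# Hypothetical usage of build_lsi_model (corpus name and path are placeholders).
from gensim import models

build_lsi_model('enwiki', '/data/corpora/', topics=300)

lsi = models.LsiModel.load('/data/corpora/enwiki.lsi')   # load the saved model back
print(lsi.show_topic(0, topn=5))                         # top terms of the first LSI topic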

Example 9: aligning_doc_by_interlanguage_links

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def aligning_doc_by_interlanguage_links(source_doc, target_corpus, source_language, target_language, output_path):
	
	source = None
	target = None
	
	source_title = get_title_from_interlanguage_links(source_doc, source_language)			
		
	for d in target_corpus:
		target_title = get_title_from_interlanguage_links(d, target_language)
		if source_title == target_title:
			source = source_doc
			target = d
			
	return source, target		
				
##################################################################################

# takes a wikipedia corpus (extracted by WikiExtractor.py) and splits the corpus into documents and clean them 
Developer ID: motazsaad, Project: comparable-text-miner, Lines of code: 20, Source file: textpro.py

Example 10: pos_tag_convert_penn_to_wn

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def pos_tag_convert_penn_to_wn(tag):
    """
    Convert POS tag from Penn tagset to WordNet tagset.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    if tag in ['JJ', 'JJR', 'JJS']:
        return wn.ADJ
    elif tag in ['RB', 'RBR', 'RBS']:
        return wn.ADV
    elif tag in ['NN', 'NNS', 'NNP', 'NNPS']:
        return wn.NOUN
    elif tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
        return wn.VERB
    return None 
Developer ID: WZBSocialScienceCenter, Project: tmtoolkit, Lines of code: 20, Source file: _common.py
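A common reason for this Penn-to-WordNet conversion is POS-aware lemmatization. The snippet below is a small hedged sketch combining the function above with NLTK's WordNetLemmatizer; it assumes the 'wordnet' data package is installed.

# Hedged sketch: lemmatize with the part of speech returned by pos_tag_convert_penn_to_wn.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wn_tag = pos_tag_convert_penn_to_wn('VBG')               # gerund -> wn.VERB
if wn_tag is not None:
    print(lemmatizer.lemmatize('running', pos=wn_tag))   # -> 'run'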

Example 11: documents

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def documents(self, fold=None, train=False, test=False):
        """
        A generator of documents being streamed from disk. Each document is
        a list of paragraphs, which are a list of sentences, which in turn is
        a list of tuples of (token, tag) pairs. All preprocessing is done by
        NLTK and the CorpusReader object this object wraps.

        If a fold is specified (should be an integer between 0 and folds),
        then the loader will return documents from that fold. Further, train
        or test must be specified to split the fold correctly. This method
        allows us to maintain the generator properties of document reads.
        """
        for fileid in self.fileids(fold, train, test):
            yield list(self.corpus.tagged(fileids=fileid))


##########################################################################
## Normalize Transformer
########################################################################## 
Developer ID: DistrictDataLabs, Project: partisan-discourse, Lines of code: 21, Source file: learn.py

Example 12: __init__

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
                 min_substitutions=2, num_candidates=5, concatenate_corpora=True):
        self.sick_path = sick_path
        self.target_directory = target_directory
        self.lm_path = lm_path
        self.wsd_algorithm = wsd_algorithm
        self.sampling_parameter = sampling_parameter
        self.min_substitutions = min_substitutions
        self.num_candidates = num_candidates
        self.concatenate_corpora = concatenate_corpora
        self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
        self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')
        # Filter the original SICK corpus to match the expected format, and create file for LM training
        if not os.path.exists(self.filtered_path) or not os.path.exists(self.noscore_path):
            self.filter_sick()
        if self.lm_path is None:
            raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
        else:
            self.language_model = kenlm.LanguageModel(self.lm_path) 
Developer ID: demelin, Project: Sentence-similarity-classifier-for-pyTorch, Lines of code: 21, Source file: sick_extender.py

Example 13: filter_sick

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def filter_sick(self):
        """ Processes the original S.I.C.K. corpus into a format where each line contains the two compared sentences
        followed by their relatedness score. """
        # Filter the SICK dataset for sentences and relatedness score only
        df_origin = pd.read_table(self.sick_path)
        df_classify = df_origin.loc[:, ['sentence_A', 'sentence_B', 'relatedness_score']]
        # Scale relatedness score to lie ∈ [0, 1] for training of the classifier
        df_classify['relatedness_score'] = df_classify['relatedness_score'].apply(
            lambda x: "{:.4f}".format(float(x)/5.0))

        df_noscore = df_origin.loc[:, ['sentence_A', 'sentence_B']]
        df_noscore = df_noscore.stack()

        # Write the filtered set to a .csv file
        df_classify.to_csv(self.filtered_path, sep='\t', index=False, header=False)
        print('Filtered corpus saved to %s.' % self.filtered_path)

        # Write a score-free set to a .csv file to be used in the training of the KN language model
        df_noscore.to_csv(self.noscore_path, index=False, header=False)
        print('Filtered corpus saved to %s.' % self.noscore_path) 
Developer ID: demelin, Project: Sentence-similarity-classifier-for-pyTorch, Lines of code: 22, Source file: sick_extender.py

Example 14: line_prep

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup. """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)
        # Assign part of speech tags
        s1_penn_pos = nltk.pos_tag(s1_tokens)
        s2_penn_pos = nltk.pos_tag(s2_tokens)
        # Convert to WordNet POS tags and store word position in sentence for replacement
        # Each tuple contains (word, WordNet_POS_tag, position)
        s1_wn_pos = list()
        s2_wn_pos = list()
        for idx, item in enumerate(s1_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s1_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))  # use enumerate index; list.index() would misplace repeated tokens
        for idx, item in enumerate(s2_penn_pos):
            if self.get_wordnet_pos(item[1]) != 'OTHER':
                s2_wn_pos.append((item[0], self.get_wordnet_pos(item[1]), idx))  # use enumerate index; list.index() would misplace repeated tokens

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score 
Developer ID: demelin, Project: Sentence-similarity-classifier-for-pyTorch, Lines of code: 25, Source file: sick_extender.py
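For illustration, a hypothetical call to line_prep with a made-up SICK-style line (two sentences and a relatedness score separated by tabs); extender stands in for an instance of the class above and is an assumption.

# Hypothetical input in the SICK format: sentence_A <TAB> sentence_B <TAB> relatedness_score
line = 'A dog is running in a field\tA dog runs through a field\t4.5'
pairs, score = extender.line_prep(line)   # extender: an instance of the surrounding class (assumed)
print(score)                              # '4.5'
print(pairs[0][0][:3])                    # first (word, WordNet_POS, position) tuples for sentence A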

Example 15: __init__

# Required module: import nltk [as alias]
# Or: from nltk import corpus [as alias]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        try:
            from nltk.corpus import wordnet
            import nltk
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires nltk to be installed."
            )

        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(
                "WordNet-based data augmentation requires spaCy and a language "
                "model to be installed (for part of speech tagging)."
            )

        if not skip_download_check:
            nltk.download("wordnet")

        self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
        Token.set_extension("replacement", default=None, force=True) 
Developer ID: RTIInternational, Project: gobbli, Lines of code: 27, Source file: wordnet.py
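Once the augmenter above has a WordNet handle in self.wn, the usual next step is looking up synonym candidates for replaceable tokens. The function below is a hedged, standalone sketch of that lookup, not the gobbli project's actual augmentation logic.

# Hedged sketch of WordNet synonym lookup for data augmentation (illustrative only).
from nltk.corpus import wordnet

def synonym_candidates(word, pos=None):
    """Collect distinct lemma names across all synsets of a word."""
    candidates = set()
    for synset in wordnet.synsets(word, pos=pos):
        for lemma in synset.lemma_names():
            if lemma.lower() != word.lower():
                candidates.add(lemma.replace('_', ' '))
    return sorted(candidates)

print(synonym_candidates('quick', pos=wordnet.ADJ))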


Note: The nltk.corpus examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their respective developers, and the copyright of the source code belongs to the original authors. Please consult each project's license before using or redistributing the code; do not republish without permission.