当前位置: 首页>>代码示例>>Python>>正文


Python nltk.corpus方法代码示例

本文整理汇总了Python中nltk.corpus方法的典型用法代码示例。如果您正苦于以下问题:Python nltk.corpus方法的具体用法?Python nltk.corpus怎么用?Python nltk.corpus使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在nltk的用法示例。


在下文中一共展示了nltk.corpus方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: closure

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def closure(self, rel, depth=-1):
        """Return the transitive closure of source under the rel
        relationship, breadth-first

            >>> from nltk.corpus import wordnet as wn
            >>> dog = wn.synset('dog.n.01')
            >>> hyp = lambda s:s.hypernyms()
            >>> list(dog.closure(hyp))
            [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
            Synset('carnivore.n.01'), Synset('animal.n.01'),
            Synset('placental.n.01'), Synset('organism.n.01'),
            Synset('mammal.n.01'), Synset('living_thing.n.01'),
            Synset('vertebrate.n.01'), Synset('whole.n.02'),
            Synset('chordate.n.01'), Synset('object.n.01'),
            Synset('physical_entity.n.01'), Synset('entity.n.01')]

        :param rel: callable applied to a synset to obtain its related
            synsets (e.g. ``lambda s: s.hypernyms()`` as in the example).
        :param depth: forwarded unchanged to ``nltk.util.breadth_first``;
            the default of -1 presumably means "no depth limit" -- confirm
            against that helper.
        :return: a generator yielding each reachable synset once, in the
            order the breadth-first traversal produces them, skipping any
            synset whose offset equals this synset's own offset.
        """
        from nltk.util import breadth_first

        own_offset = self._offset
        # A set gives constant-time "already yielded?" checks; the yield
        # order is still dictated by the traversal, not by this container.
        # Assumes offsets are hashable (plain ints in WordNet) -- TODO confirm.
        seen_offsets = set()
        for synset in breadth_first(self, rel, depth):
            offset = synset._offset
            if offset == own_offset or offset in seen_offsets:
                continue
            seen_offsets.add(offset)
            yield synset
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:26,代码来源:wordnet.py

示例2: res_similarity

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def res_similarity(self, other, ic, verbose=False):
        """
        Resnik Similarity:
        Score how alike two word senses are using the Information Content
        (IC) of their Least Common Subsumer, i.e. the most specific ancestor
        node the two senses share.

        :type  other: Synset
        :param other: The ``Synset`` to compare this ``Synset`` against.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :param verbose: accepted as part of the signature; not read by this method.
        :return: A float score denoting the similarity of the two ``Synset`` objects.
            Synsets whose LCS is the root node of the taxonomy will have a
            score of 0 (e.g. N['dog'][0] and N['table'][0]).
        """
        # The helper's result is unpacked as three values; only the last one
        # (the IC of the least common subsumer) is the Resnik score.
        _self_ic, _other_ic, subsumer_ic = _lcs_ic(self, other, ic)
        return subsumer_ic
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:20,代码来源:wordnet.py

示例3: lin_similarity

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def lin_similarity(self, other, ic, verbose=False):
        """
        Lin Similarity:
        Score how alike two word senses are by relating the Information
        Content (IC) of their Least Common Subsumer (most specific shared
        ancestor node) to the IC of the two senses themselves, using
        2 * IC(lcs) / (IC(s1) + IC(s2)).

        :type other: Synset
        :param other: The ``Synset`` to compare this ``Synset`` against.
        :type ic: dict
        :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``).
        :param verbose: accepted as part of the signature; not read by this method.
        :return: A float score denoting the similarity of the two ``Synset`` objects,
            in the range 0 to 1.
        """
        self_ic, other_ic, subsumer_ic = _lcs_ic(self, other, ic)
        numerator = 2.0 * subsumer_ic
        # NOTE(review): no guard here for a zero denominator; whether
        # _lcs_ic can return two zero IC values is not visible from this
        # block -- confirm.
        denominator = self_ic + other_ic
        return numerator / denominator
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:20,代码来源:wordnet.py

示例4: get_pos_tagger

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def get_pos_tagger(self):
        """Build and return the part-of-speech tagger.

        The result is a regexp tagger that labels a few quantifier words
        directly and otherwise backs off through trigram, bigram and unigram
        taggers (each constructed from the Brown corpus 'news' category) down
        to a suffix-based regexp tagger whose last rule tags everything 'NN'.
        """
        from nltk.corpus import brown

        # NOTE(review): the '.' in the cardinal-number pattern is unescaped,
        # so it matches any character, not only a decimal point. Left as-is
        # because it also lets forms like "3,000" through -- confirm intent.
        suffix_rules = [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),   # articles
            (r'.*able$', 'JJ'),                # adjectives
            (r'.*ness$', 'NN'),                # nouns formed from adjectives
            (r'.*ly$', 'RB'),                  # adverbs
            (r'.*s$', 'NNS'),                  # plural nouns
            (r'.*ing$', 'VBG'),                # gerunds
            (r'.*ed$', 'VBD'),                 # past tense verbs
            (r'.*', 'NN'),                     # nouns (default)
        ]
        tagger = RegexpTagger(suffix_rules)

        # Stack the n-gram taggers, each one falling back to the previous.
        training_sents = brown.tagged_sents(categories='news')
        for ngram_tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
            tagger = ngram_tagger_cls(training_sents, backoff=tagger)

        # Quantifier words are tagged by these rules before the chain above
        # is consulted.
        quantifier_rules = [
            (r'(A|a|An|an)$', 'ex_quant'),
            (r'(Every|every|All|all)$', 'univ_quant'),
        ]
        return RegexpTagger(quantifier_rules, backoff=tagger)
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:27,代码来源:glue.py

示例5: ieer_headlines

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def ieer_headlines():
    """Print the document number and headline of the first 20 IEER documents.

    Documents are visited file by file in the order ``ieer.fileids()``
    reports them. Parsing stops as soon as 20 headlines have been printed,
    so the remaining corpus files are never parsed.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Lazy stream of (docno, headline) pairs; nothing is parsed until the
    # loop below pulls from it.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)



#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
############################################# 
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:20,代码来源:relextract.py

示例6: conllesp

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def conllesp():
    """Print the first 10 ORG/LOC relations from the Spanish CoNLL2002
    training sentences that match the "de"/"del" filler pattern."""
    from nltk.corpus import conll2002

    # Verbose-mode regex: anything followed by "de/SP" or "del/SP".
    de_verbose = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    de_regex = re.compile(de_verbose, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Collect every relation from every sentence, then show only ten.
    relations = []
    for sentence in conll2002.chunked_sents('esp.train'):
        relations.extend(
            extract_rels('ORG', 'LOC', sentence, corpus='conll2002', pattern=de_regex)
        )
    for relation in relations[:10]:
        print(clause(relation, relsym='DE'))
    print()
开发者ID:rafasashi,项目名称:razzy-spinner,代码行数:21,代码来源:relextract.py

示例7: evaluate

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def evaluate(trueValues, predicted, decimals, note):
	"""Print a small classification report for `predicted` against `trueValues`.

	The report starts with `note`, then lists accuracy, weighted precision,
	weighted recall, weighted F1, the F1 of the first two classes and the
	plain mean of those two F1 values, each rounded to `decimals` places,
	followed by a separator line.

	The metric functions are names from the enclosing module (presumably
	sklearn.metrics -- confirm). At least two classes are expected, because
	the first two per-class F1 entries are read by index.

	Every line is emitted through a single-argument print(...) call, which
	is valid both as a Python 2 print statement and as a Python 3 function
	call; each value is appended directly after the tab of its caption.
	"""
	print(note)
	label = 1
	avg = 'weighted'
	a = accuracy_score(trueValues, predicted)
	p = precision_score(trueValues, predicted, pos_label=label, average=avg)
	r = recall_score(trueValues, predicted, pos_label=label, average=avg)
	avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
	fclasses = f1_score(trueValues, predicted, average=None)
	f1c1 = fclasses[0]
	f1c2 = fclasses[1]
	fw = (f1c1 + f1c2) / 2.0

	report = [
		('accuracy:\t', a),
		('precision:\t', p),
		('recall:\t', r),
		('avg f1:\t', avg_f1),
		('c1 f1:\t', f1c1),
		('c2 f1:\t', f1c2),
		('avg(c1,c2):\t', fw),
	]
	for caption, value in report:
		print(caption + str(round(value, decimals)))
	print('------------')

###################################################################################


# split a parallel or comparable corpus into two parts 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:27,代码来源:textpro.py

示例8: build_lsi_model

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def build_lsi_model(corpus_name, corpus_path, topics=300):
	"""Build an LSI model for a corpus and save it next to the corpus files.

	All file names are formed by plain string concatenation of
	`corpus_path` + `corpus_name` + an extension, so `corpus_path` must
	already end with a path separator if it names a directory.

	Reads `<prefix>.dict` and `<prefix>.tfidf.mm`, writes `<prefix>.lsi`.
	`topics` is passed as the model's `num_topics`. `corpora` and `models`
	come from the enclosing module (presumably gensim -- confirm).
	"""
	logging.info('building lsi model for %s corpus', corpus_name)
	artifact_prefix = corpus_path + corpus_name

	logging.info('loading dictionary ...')
	id2word = corpora.Dictionary.load(artifact_prefix + '.dict')

	logging.info('loading tfidf corpus ...')
	tfidf_corpus = corpora.MmCorpus(artifact_prefix + '.tfidf.mm')

	logging.info('building lsi model')
	lsi_model = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=topics)

	logging.info('saving lsi')
	lsi_model.save(artifact_prefix + '.lsi')
	logging.info('lsi model is ready')
################################################################################## 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:18,代码来源:textpro.py

示例9: aligning_doc_by_interlanguage_links

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def aligning_doc_by_interlanguage_links(source_doc, target_corpus, source_language, target_language, output_path):
	"""Pair `source_doc` with the document of `target_corpus` that carries
	the same interlanguage-link title.

	The whole corpus is scanned; when several target documents match, the
	last one wins. Returns `(source_doc, matching_target)` on a match and
	`(None, None)` otherwise. `output_path` is accepted but not used here.
	"""
	wanted_title = get_title_from_interlanguage_links(source_doc, source_language)

	found = False
	aligned_target = None
	for candidate in target_corpus:
		# NOTE(review): titles are compared with plain equality; what the
		# helper returns for a document without links is not visible here.
		if get_title_from_interlanguage_links(candidate, target_language) == wanted_title:
			found = True
			aligned_target = candidate

	if not found:
		return None, None
	return source_doc, aligned_target
				
##################################################################################

# takes a Wikipedia corpus (extracted by WikiExtractor.py), splits it into documents and cleans them 
开发者ID:motazsaad,项目名称:comparable-text-miner,代码行数:20,代码来源:textpro.py

示例10: pos_tag_convert_penn_to_wn

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def pos_tag_convert_penn_to_wn(tag):
    """
    Map a Penn Treebank POS tag onto the matching WordNet POS constant.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    # (name of the constant on `wn`, Penn tags that map to it), checked in
    # order. The constant is read from `wn` only once a group matches.
    penn_groups = (
        ('ADJ', ['JJ', 'JJR', 'JJS']),
        ('ADV', ['RB', 'RBR', 'RBS']),
        ('NOUN', ['NN', 'NNS', 'NNP', 'NNPS']),
        ('VERB', ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
    )
    for wn_constant, penn_tags in penn_groups:
        if tag in penn_tags:
            return getattr(wn, wn_constant)
    return None
开发者ID:WZBSocialScienceCenter,项目名称:tmtoolkit,代码行数:20,代码来源:_common.py

示例11: documents

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def documents(self, fold=None, train=False, test=False):
        """
        Lazily yield one document per file id.

        The file ids come from ``self.fileids(fold, train, test)``; for each
        of them the wrapped corpus reader's ``tagged(fileids=...)`` result is
        materialised into a list and yielded. Per the original description,
        a document is a list of paragraphs, each a list of sentences, each a
        list of (token, tag) pairs, with all preprocessing done by NLTK and
        the wrapped CorpusReader.

        If a fold is given (an integer between 0 and folds), only documents
        from that fold are produced, and train or test must be set so the
        fold is split correctly. Being a generator, nothing is read until
        iteration starts.
        """
        selected_ids = self.fileids(fold, train, test)
        for selected_id in selected_ids:
            tagged_paragraphs = self.corpus.tagged(fileids=selected_id)
            yield list(tagged_paragraphs)


##########################################################################
## Normalize Transformer
########################################################################## 
开发者ID:DistrictDataLabs,项目名称:partisan-discourse,代码行数:21,代码来源:learn.py

示例12: __init__

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
                 min_substitutions=2, num_candidates=5, concatenate_corpora=True):
        """Store the configuration, prepare the filtered corpora, load the LM.

        All arguments are kept as same-named attributes. Two output paths,
        'filtered_sick.txt' and 'noscore_sick.txt', are derived inside
        `target_directory`; if either file is missing, `self.filter_sick()`
        is run to create them.

        Raises ValueError when `lm_path` is None. This check comes after the
        filtering step, so the score-free corpus named in the error message
        has already been written by then. Otherwise the language model is
        loaded from `lm_path` with kenlm.
        """
        # Paths first
        self.sick_path = sick_path
        self.target_directory = target_directory
        self.lm_path = lm_path
        # Tunables
        self.wsd_algorithm = wsd_algorithm
        self.sampling_parameter = sampling_parameter
        self.min_substitutions = min_substitutions
        self.num_candidates = num_candidates
        self.concatenate_corpora = concatenate_corpora
        # Derived output locations
        self.filtered_path = os.path.join(target_directory, 'filtered_sick.txt')
        self.noscore_path = os.path.join(target_directory, 'noscore_sick.txt')

        # Filter the original SICK corpus to match the expected format, and create file for LM training
        both_present = os.path.exists(self.filtered_path) and os.path.exists(self.noscore_path)
        if not both_present:
            self.filter_sick()

        if self.lm_path is None:
            raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
        self.language_model = kenlm.LanguageModel(self.lm_path)
开发者ID:demelin,项目名称:Sentence-similarity-classifier-for-pyTorch,代码行数:21,代码来源:sick_extender.py

示例13: filter_sick

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def filter_sick(self):
        """ Derive two files from the original S.I.C.K. corpus at `self.sick_path`.

        * `self.filtered_path`: tab-separated, one sentence pair per line
          followed by its relatedness score divided by 5 and formatted to
          four decimals.
        * `self.noscore_path`: the sentences alone, one per line (sentence_A
          then sentence_B of each pair), for language-model training.

        Neither file gets a header or an index column. """
        sentence_columns = ['sentence_A', 'sentence_B']
        score_column = 'relatedness_score'

        def rescale(raw_score):
            # Scale the relatedness score to lie in [0, 1] for classifier training
            return "{:.4f}".format(float(raw_score)/5.0)

        sick_table = pd.read_table(self.sick_path)

        # Sentences plus rescaled score
        scored_pairs = sick_table.loc[:, sentence_columns + [score_column]]
        scored_pairs[score_column] = scored_pairs[score_column].apply(rescale)

        # Sentences only, flattened so each sentence gets its own row
        bare_sentences = sick_table.loc[:, sentence_columns].stack()

        # Write the filtered set to a .csv file
        scored_pairs.to_csv(self.filtered_path, sep='\t', index=False, header=False)
        print('Filtered corpus saved to %s.' % self.filtered_path)

        # Write a score-free set to a .csv file to be used in the training of the KN language model
        bare_sentences.to_csv(self.noscore_path, index=False, header=False)
        print('Filtered corpus saved to %s.' % self.noscore_path)
开发者ID:demelin,项目名称:Sentence-similarity-classifier-for-pyTorch,代码行数:22,代码来源:sick_extender.py

示例14: line_prep

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def line_prep(self, line):
        """ Tokenizes and POS-tags a line from the SICK corpus to be compatible with WordNet synset lookup.

        `line` must consist of exactly three tab-separated fields: sentence
        one, sentence two and the similarity score (otherwise the unpacking
        raises ValueError).

        Returns `([(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score)`
        where each `*_wn_pos` is a list of `(word, WordNet_POS_tag, position)`
        tuples for the tokens whose tag `self.get_wordnet_pos` does not map
        to 'OTHER', and `position` is the token's index in its sentence.
        `sim_score` is returned as the raw third field of the line.
        """
        # Split line into sentences + score
        s1, s2, sim_score = line.split('\t')
        # Tokenize
        s1_tokens = word_tokenize(s1)
        s2_tokens = word_tokenize(s2)

        def wordnet_tagged(tokens):
            """Return (word, WordNet_POS_tag, position) for each taggable token."""
            tagged = []
            for position, (word, penn_tag) in enumerate(nltk.pos_tag(tokens)):
                wn_tag = self.get_wordnet_pos(penn_tag)
                if wn_tag != 'OTHER':
                    # Use the running index: looking the pair up with
                    # list.index() would give every repeated (word, tag)
                    # pair the position of its first occurrence.
                    tagged.append((word, wn_tag, position))
            return tagged

        # Convert to WordNet POS tags and store word position in sentence for replacement
        s1_wn_pos = wordnet_tagged(s1_tokens)
        s2_wn_pos = wordnet_tagged(s2_tokens)

        # Each tuple contains (word, WordNet_POS_tag, position); Source sentence provided for use in disambiguation
        return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
开发者ID:demelin,项目名称:Sentence-similarity-classifier-for-pyTorch,代码行数:25,代码来源:sick_extender.py

示例15: __init__

# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
        """Prepare the WordNet corpus handle and the spaCy pipeline.

        :param skip_download_check: when falsy, ``nltk.download("wordnet")``
            is called before the spaCy model is loaded.
        :param spacy_model: name passed to ``spacy.load``.
        :raises ImportError: if nltk (with its WordNet corpus module) or
            spaCy cannot be imported.

        Also registers a ``replacement`` extension attribute (default None)
        on spaCy tokens, overwriting any existing registration.
        """
        nltk_missing_msg = "WordNet-based data augmentation requires nltk to be installed."
        spacy_missing_msg = (
            "WordNet-based data augmentation requires spaCy and a language "
            "model to be installed (for part of speech tagging)."
        )

        try:
            import nltk
            from nltk.corpus import wordnet
        except ImportError:
            raise ImportError(nltk_missing_msg)

        # Assigned before the spaCy import is attempted, as in the original order.
        self.wn = wordnet

        try:
            import spacy
            from spacy.tokens import Token
        except ImportError:
            raise ImportError(spacy_missing_msg)

        if not skip_download_check:
            nltk.download("wordnet")

        pipeline_options = {"parser": False, "tagger": True, "entity": False}
        self.nlp = spacy.load(spacy_model, **pipeline_options)
        Token.set_extension("replacement", default=None, force=True)
开发者ID:RTIInternational,项目名称:gobbli,代码行数:27,代码来源:wordnet.py


注:本文中的nltk.corpus方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。