本文整理汇总了Python中nltk.corpus方法的典型用法代码示例。如果您正苦于以下问题：Python nltk.corpus方法的具体用法？Python nltk.corpus怎么用？Python nltk.corpus使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk的用法示例。
在下文中一共展示了nltk.corpus方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: closure
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def closure(self, rel, depth=-1):
    """Yield the transitive closure of this synset under ``rel``, breadth-first.

    Every synset reached through ``rel`` is yielded at most once, and any
    synset sharing this synset's ``_offset`` (i.e. the starting point itself)
    is skipped.

    :param rel: callable handed to ``nltk.util.breadth_first`` to expand a
        synset into its related synsets (e.g. ``lambda s: s.hypernyms()``).
    :param depth: depth limit handed to ``breadth_first`` unchanged; the
        default of -1 presumably means "no limit" -- confirm against
        ``nltk.util.breadth_first``.
    :return: a generator of synsets.

        >>> from nltk.corpus import wordnet as wn
        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s:s.hypernyms()
        >>> list(dog.closure(hyp))
        [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
        Synset('carnivore.n.01'), Synset('animal.n.01'),
        Synset('placental.n.01'), Synset('organism.n.01'),
        Synset('mammal.n.01'), Synset('living_thing.n.01'),
        Synset('vertebrate.n.01'), Synset('whole.n.02'),
        Synset('chordate.n.01'), Synset('object.n.01'),
        Synset('physical_entity.n.01'), Synset('entity.n.01')]
    """
    from nltk.util import breadth_first

    # A set gives O(1) duplicate checks; the previous list made the whole
    # traversal quadratic in the number of synsets visited.
    seen_offsets = set()
    for synset in breadth_first(self, rel, depth):
        offset = synset._offset
        if offset != self._offset and offset not in seen_offsets:
            seen_offsets.add(offset)
            yield synset
示例2: res_similarity
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    Score how similar two word senses are using the Information Content
    (IC) of their Least Common Subsumer (most specific ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :param verbose: accepted for signature compatibility; not used here.
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
    """
    # _lcs_ic returns (IC of self, IC of other, IC of the LCS); only the
    # last component is the Resnik score.
    _, _, subsumer_ic = _lcs_ic(self, other, ic)
    return subsumer_ic
示例3: lin_similarity
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    Score how similar two word senses are using the Information Content
    (IC) of their Least Common Subsumer (most specific ancestor node) and
    the IC of the two input Synsets, combined as
    2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :param verbose: accepted for signature compatibility; not used here.
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1.
    """
    self_ic, other_ic, subsumer_ic = _lcs_ic(self, other, ic)
    numerator = 2.0 * subsumer_ic
    denominator = self_ic + other_ic
    return numerator / denominator
示例4: get_pos_tagger
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def get_pos_tagger(self):
    """Build and return the part-of-speech tagger chain.

    The returned tagger first applies a few word-level overrides
    (quantifier tags), then backs off through trigram, bigram and unigram
    taggers trained on the Brown corpus 'news' category, and finally to a
    suffix-based regular-expression tagger whose last rule tags anything
    as ``NN``.

    :return: the outermost ``RegexpTagger`` of the backoff chain.
    """
    from nltk.corpus import brown

    regexp_tagger = RegexpTagger(
        # The decimal point is escaped: an unescaped '.' matches any
        # character, which tagged tokens such as '1a2' as numbers.
        [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),   # cardinal numbers
         (r'(The|the|A|a|An|an)$', 'AT'),    # articles
         (r'.*able$', 'JJ'),                 # adjectives
         (r'.*ness$', 'NN'),                 # nouns formed from adjectives
         (r'.*ly$', 'RB'),                   # adverbs
         (r'.*s$', 'NNS'),                   # plural nouns
         (r'.*ing$', 'VBG'),                 # gerunds
         (r'.*ed$', 'VBD'),                  # past tense verbs
         (r'.*', 'NN')                       # nouns (default)
         ])
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words with quantifier tags before consulting the
    # statistical taggers.
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')
         ], backoff=trigram_tagger)

    return main_tagger
示例5: ieer_headlines
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def ieer_headlines():
    """Print the document number and headline of the first 20 IEER documents.

    Documents are read lazily, file by file, so parsing stops as soon as 20
    headlines have been printed instead of loading the entire corpus first.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Generator, not a list: only as many documents as needed get parsed.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for headline in islice(headlines, 20):
        print()
        print("%s:\n%s" % headline)
#############################################
## Dutch CONLL2002: take_on_role(PER, ORG)
#############################################
示例6: conllesp
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def conllesp():
    """Print the first 10 ORG-LOC relations linked by "de"/"del" in CoNLL2002.

    Relations are extracted from the chunked sentences of the Spanish
    training file ('esp.train') and rendered with ``clause``. Extraction is
    lazy, so sentence processing stops once 10 relations have been printed.
    """
    from itertools import islice

    from nltk.corpus import conll2002

    # Verbose-mode pattern: whitespace and newlines inside it are ignored.
    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Generator, not a list: avoids scanning the whole training set when
    # only the first 10 relations are shown.
    rels = (
        rel
        for doc in conll2002.chunked_sents('esp.train')
        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
    )
    for r in islice(rels, 10):
        print(clause(r, relsym='DE'))
    print()
示例7: evaluate
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def evaluate(trueValues, predicted, decimals, note):
    """Print a classification report for ``predicted`` against ``trueValues``.

    Reports accuracy, weighted precision / recall / F1, the per-class F1 of
    the first two classes, and the unweighted mean of those two F1 scores.

    :param trueValues: ground-truth labels.
    :param predicted: predicted labels, aligned with ``trueValues``.
    :param decimals: number of decimal places each score is rounded to.
    :param note: free-form heading printed before the scores.

    NOTE(review): the metric functions are presumably scikit-learn's
    ``sklearn.metrics``; the per-class lines assume at least two classes are
    present -- confirm with callers.
    """
    print(note)
    label = 1
    avg = 'weighted'
    a = accuracy_score(trueValues, predicted)
    p = precision_score(trueValues, predicted, pos_label=label, average=avg)
    r = recall_score(trueValues, predicted, pos_label=label, average=avg)
    avg_f1 = f1_score(trueValues, predicted, pos_label=label, average=avg)
    # average=None yields one F1 score per class.
    fclasses = f1_score(trueValues, predicted, average=None)
    f1c1 = fclasses[0]
    f1c2 = fclasses[1]
    fw = (f1c1 + f1c2) / 2.0

    # print() with two arguments emits the same "label<TAB> value" text the
    # Python 2 print statement produced.
    print('accuracy:\t', str(round(a, decimals)))
    print('precision:\t', str(round(p, decimals)))
    print('recall:\t', str(round(r, decimals)))
    print('avg f1:\t', str(round(avg_f1, decimals)))
    print('c1 f1:\t', str(round(f1c1, decimals)))
    print('c2 f1:\t', str(round(f1c2, decimals)))
    print('avg(c1,c2):\t', str(round(fw, decimals)))
    print('------------')
###################################################################################
# split a parallel or comparable corpus into two parts
示例8: build_lsi_model
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def build_lsi_model(corpus_name, corpus_path, topics=300):
    """Train an LSI model from a saved dictionary and TF-IDF corpus, and save it.

    All file names are formed by plain string concatenation of
    ``corpus_path + corpus_name`` plus a suffix, so ``corpus_path`` must
    already end with whatever separator (or prefix) the caller intends.

    :param corpus_name: base name shared by the input and output files.
    :param corpus_path: prefix prepended to ``corpus_name``.
    :param topics: number of topics passed to the LSI model.

    Reads ``<prefix>.dict`` and ``<prefix>.tfidf.mm``; writes ``<prefix>.lsi``.
    """
    logging.info( 'building lsi model for %s corpus', corpus_name )

    prefix = corpus_path + corpus_name
    dictionary_path = prefix + '.dict'
    tfidf_path = prefix + '.tfidf.mm'

    logging.info( 'loading dictionary ...' )
    id2word = corpora.Dictionary.load(dictionary_path)

    logging.info( 'loading tfidf corpus ...' )
    tfidf_corpus = corpora.MmCorpus(tfidf_path)

    logging.info( 'building lsi model' )
    model = models.LsiModel(tfidf_corpus, id2word=id2word, num_topics=topics)

    logging.info( 'saving lsi' )
    model.save(prefix + '.lsi')

    logging.info( 'lsi model is ready' )
##################################################################################
示例9: aligning_doc_by_interlanguage_links
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def aligning_doc_by_interlanguage_links(source_doc, target_corpus, source_language, target_language, output_path):
    """Find the target-corpus document whose interlanguage title matches ``source_doc``.

    Every document in ``target_corpus`` is examined; when several match, the
    last one wins.

    :param source_doc: document to align.
    :param target_corpus: iterable of candidate documents.
    :param source_language: language passed when extracting the source title.
    :param target_language: language passed when extracting candidate titles.
    :param output_path: not used by this function.
    :return: ``(source_doc, matching_doc)``, or ``(None, None)`` when no
        candidate matches.
    """
    wanted_title = get_title_from_interlanguage_links(source_doc, source_language)

    aligned = (None, None)
    for candidate in target_corpus:
        # No early exit: a later match deliberately replaces an earlier one.
        if wanted_title == get_title_from_interlanguage_links(candidate, target_language):
            aligned = (source_doc, candidate)
    return aligned
##################################################################################
# takes a wikipedia corpus (extracted by WikiExtractor.py) and splits the corpus into documents and clean them
示例10: pos_tag_convert_penn_to_wn
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def pos_tag_convert_penn_to_wn(tag):
    """
    Convert POS tag from Penn tagset to WordNet tagset.

    :param tag: a tag from Penn tagset
    :return: a tag from WordNet tagset or None if no corresponding tag could be found
    """
    from nltk.corpus import wordnet as wn

    # (Penn tags, name of the WordNet POS constant). The constant is looked
    # up on ``wn`` only for the matching group, so an unmatched tag never
    # touches the corpus object.
    penn_groups = (
        (('JJ', 'JJR', 'JJS'), 'ADJ'),
        (('RB', 'RBR', 'RBS'), 'ADV'),
        (('NN', 'NNS', 'NNP', 'NNPS'), 'NOUN'),
        (('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'VERB'),
    )
    for penn_tags, wn_constant in penn_groups:
        if tag in penn_tags:
            return getattr(wn, wn_constant)
    return None
示例11: documents
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def documents(self, fold=None, train=False, test=False):
    """
    Stream documents one at a time.

    The file ids to read are chosen by ``self.fileids(fold, train, test)``;
    for each one, the result of ``self.corpus.tagged(fileids=...)`` is
    materialised into a list and yielded, so only one document is built at
    a time.

    Per the wrapped corpus reader, a document is expected to be a list of
    paragraphs, each a list of sentences, each a list of (token, tag)
    pairs -- confirm against the reader in use.

    :param fold: fold selector forwarded to ``self.fileids``.
    :param train: forwarded to ``self.fileids``.
    :param test: forwarded to ``self.fileids``.
    """
    selected = self.fileids(fold, train, test)
    for name in selected:
        tagged_document = self.corpus.tagged(fileids=name)
        yield list(tagged_document)
##########################################################################
## Normalize Transformer
##########################################################################
示例12: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def __init__(self, sick_path, target_directory, lm_path=None, wsd_algorithm='cosine', sampling_parameter=0.5,
             min_substitutions=2, num_candidates=5, concatenate_corpora=True):
    """Store the configuration, prepare the filtered SICK files and load the language model.

    If either derived file (``filtered_sick.txt`` / ``noscore_sick.txt`` in
    ``target_directory``) is missing, ``filter_sick`` is run to create them.
    This happens before the language-model check.

    :raises ValueError: if ``lm_path`` is None.
    """
    # Plain configuration.
    self.sick_path = sick_path
    self.target_directory = target_directory
    self.lm_path = lm_path
    self.wsd_algorithm = wsd_algorithm
    self.sampling_parameter = sampling_parameter
    self.min_substitutions = min_substitutions
    self.num_candidates = num_candidates
    self.concatenate_corpora = concatenate_corpora

    # Derived output locations.
    self.filtered_path = os.path.join(self.target_directory, 'filtered_sick.txt')
    self.noscore_path = os.path.join(self.target_directory, 'noscore_sick.txt')

    # Filter the original SICK corpus to match the expected format, and
    # create the file used for LM training, unless both already exist.
    derived_files = (self.filtered_path, self.noscore_path)
    if not all(os.path.exists(path) for path in derived_files):
        self.filter_sick()

    if self.lm_path is None:
        raise ValueError('No language model provided! Use the noscore_sick corpus to train an .klm LM, first.')
    self.language_model = kenlm.LanguageModel(self.lm_path)
示例13: filter_sick
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def filter_sick(self):
    """Derive two files from the original S.I.C.K. corpus at ``self.sick_path``.

    * ``self.filtered_path``: tab-separated lines holding sentence A,
      sentence B and the relatedness score divided by 5 and formatted to
      four decimals.
    * ``self.noscore_path``: one sentence per line (A then B for each
      pair), without scores, for language-model training.

    Neither file has a header or an index column.
    """
    def rescale(raw_score):
        # Scores are divided by 5 so that they lie in [0, 1].
        return "{:.4f}".format(float(raw_score) / 5.0)

    sentence_columns = ['sentence_A', 'sentence_B']
    sick_table = pd.read_table(self.sick_path)

    # Sentences plus rescaled relatedness score, for classifier training.
    scored = sick_table.loc[:, sentence_columns + ['relatedness_score']]
    scored = scored.assign(relatedness_score=scored['relatedness_score'].apply(rescale))

    # Sentences only, flattened row by row into a single column.
    unscored = sick_table.loc[:, sentence_columns].stack()

    scored.to_csv(self.filtered_path, sep='\t', index=False, header=False)
    print('Filtered corpus saved to %s.' % self.filtered_path)

    unscored.to_csv(self.noscore_path, index=False, header=False)
    print('Filtered corpus saved to %s.' % self.noscore_path)
示例14: line_prep
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def line_prep(self, line):
    """Tokenize and POS-tag a line from the SICK corpus for WordNet synset lookup.

    :param line: one tab-separated line holding exactly sentence 1,
        sentence 2 and the similarity score.
    :return: ``([(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score)``
        where each ``*_wn_pos`` is a list of
        ``(word, WordNet_POS_tag, position)`` tuples for the tokens whose
        tag is not ``'OTHER'``, ``position`` is the token's index in the
        tokenized sentence, and ``sim_score`` is the third field exactly as
        split from the line (not stripped or converted).
    :raises ValueError: if the line does not split into exactly three fields.
    """
    # Split line into sentences + score
    s1, s2, sim_score = line.split('\t')

    # Tokenize
    s1_tokens = word_tokenize(s1)
    s2_tokens = word_tokenize(s2)

    def wordnet_tagged(tokens):
        """Return (word, WordNet_POS_tag, position) for each usable token."""
        tagged = []
        for position, (word, penn_tag) in enumerate(nltk.pos_tag(tokens)):
            wn_tag = self.get_wordnet_pos(penn_tag)
            if wn_tag != 'OTHER':
                # Use the enumerate index: list.index() returns the first
                # occurrence, which mislabels repeated (word, tag) pairs.
                tagged.append((word, wn_tag, position))
        return tagged

    s1_wn_pos = wordnet_tagged(s1_tokens)
    s2_wn_pos = wordnet_tagged(s2_tokens)

    # Source sentence tokens are returned alongside for use in disambiguation
    return [(s1_wn_pos, s1_tokens), (s2_wn_pos, s2_tokens)], sim_score
示例15: __init__
# 需要导入模块: import nltk [as 别名]
# 或者: from nltk import corpus [as 别名]
def __init__(self, skip_download_check: bool = False, spacy_model="en_core_web_sm"):
    """Set up WordNet access and the spaCy pipeline.

    :param skip_download_check: when False (the default),
        ``nltk.download("wordnet")`` is called before the spaCy model is
        loaded.
    :param spacy_model: name handed to ``spacy.load``.
    :raises ImportError: if nltk or spaCy cannot be imported.

    Also registers a ``replacement`` extension (default ``None``) on spaCy's
    ``Token``, overwriting any existing extension of that name.
    """
    nltk_missing = "WordNet-based data augmentation requires nltk to be installed."
    spacy_missing = (
        "WordNet-based data augmentation requires spaCy and a language "
        "model to be installed (for part of speech tagging)."
    )

    try:
        from nltk.corpus import wordnet
        import nltk
    except ImportError:
        raise ImportError(nltk_missing)

    self.wn = wordnet

    try:
        import spacy
        from spacy.tokens import Token
    except ImportError:
        raise ImportError(spacy_missing)

    if not skip_download_check:
        nltk.download("wordnet")

    self.nlp = spacy.load(spacy_model, parser=False, tagger=True, entity=False)
    Token.set_extension("replacement", default=None, force=True)