This page collects typical usage examples of the data.find method from Python's nltk package: what data.find is for, how to call it, and what real-world uses look like. The curated examples below should help, and you can also read further about the containing module, nltk.data.
The following shows 15 code examples of data.find, ordered by popularity.
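Before the examples, here is a minimal sketch of the pattern most of them follow (see Examples 2 and 9 below): data.find raises LookupError when a resource is not installed locally, and callers typically catch that to trigger nltk.download. The resource name 'tokenizers/punkt' is only an illustrative choice, not something taken from the examples below.

# A minimal sketch, assuming the 'punkt' resource as an example.
from nltk import download
from nltk.data import find

try:
    punkt_path = find('tokenizers/punkt')
except LookupError:
    # Not installed yet: fetch it, then resolve the path again.
    download('punkt')
    punkt_path = find('tokenizers/punkt')

print(punkt_path)  # a path pointer to the installed resource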
Example 1: test_vocabulary_martin_mode
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_vocabulary_martin_mode(self):
    """Tests all words from the test vocabulary provided by M Porter

    The sample vocabulary and output were sourced from:
        http://tartarus.org/martin/PorterStemmer/voc.txt
        http://tartarus.org/martin/PorterStemmer/output.txt
    and are linked to from the Porter Stemmer algorithm's homepage at
        http://tartarus.org/martin/PorterStemmer/
    """
    self._test_against_expected_output(
        PorterStemmer.MARTIN_EXTENSIONS,
        data.find('stemmers/porter_test/porter_martin_output.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )
Example 2: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self):
    """
    Contains various synset related functions.
    """
    try:
        data.find(os.path.join("corpora", "wordnet"))
    except LookupError:
        download("wordnet")

    self.API = ImageNetAPI()
Example 3: build_model
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example 4: _chunk_parse
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs):
    """
    Returns an element tree structure corresponding to a toolbox data file
    parsed according to the chunk grammar.

    :type grammar: str
    :param grammar: Contains the chunking rules used to parse the
        database. See ``chunk.RegExp`` for documentation.
    :type root_label: str
    :param root_label: The node value that should be used for the
        top node of the chunk structure.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text. ``0`` will generate no tracing output;
        ``1`` will generate normal tracing output; and ``2`` or
        higher will generate verbose tracing output.
    :type kwargs: dict
    :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()``
    :rtype: ElementTree._ElementInterface
    """
    from nltk import chunk
    from nltk.tree import Tree

    cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace)
    db = self.parse(**kwargs)
    tb_etree = Element('toolbox_data')
    header = db.find('header')
    tb_etree.append(header)
    for record in db.findall('record'):
        parsed = cp.parse([(elem.text, elem.tag) for elem in record])
        tb_etree.append(self._tree2etree(parsed))
    return tb_etree
Example 5: add_default_fields
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def add_default_fields(elem, default_fields):
    """
    Add blank elements and subelements specified in default_fields.

    :param elem: toolbox data in an elementtree structure
    :type elem: ElementTree._ElementInterface
    :param default_fields: fields to add to each type of element and subelement
    :type default_fields: dict(tuple)
    """
    for field in default_fields.get(elem.tag, []):
        if elem.find(field) is None:
            SubElement(elem, field)
    for child in elem:
        add_default_fields(child, default_fields)
Example 6: demo
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def demo():
    from itertools import islice

    # zip_path = find('corpora/toolbox.zip')
    # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
Example 7: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self, load=True):
    '''
    :param load: Load the pickled model upon instantiation.
    '''
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    if load:
        AP_MODEL_LOC = str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        self.load(AP_MODEL_LOC)
Example 8: __init__
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def __init__(self, load=True):
    '''
    :param load: Load the pickled model upon instantiation.
    '''
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    if load:
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        self.load(AP_MODEL_LOC)
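Examples 7 and 8 differ only in how the resolved location is handed to load(): Example 7 passes the plain filesystem path returned by find(), while Example 8 prefixes it with 'file:' to form a URL-style locator. A small illustration of the two styles follows; the printed paths are only an assumption about a typical install, and the PICKLE value is assumed to match the constant in the original source.

# Illustration of the two location styles above (paths in comments are hypothetical).
from nltk.data import find

PICKLE = 'averaged_perceptron_tagger.pickle'
plain_path = str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
url_style = 'file:' + plain_path

print(plain_path)  # e.g. /home/user/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle
print(url_style)   # e.g. file:/home/user/nltk_data/taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle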
Example 9: nltk_download_corpus
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the corpus data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
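A brief usage sketch of the helper above; the resource names are the ones its own comments mention, and the return values assume the corpora were not previously installed.

# Assuming nltk_download_corpus is defined as in Example 9.
nltk_download_corpus('sentiment/vader_lexicon')  # True only if it actually had to download
nltk_download_corpus('corpora/wordnet')          # the trailing slash is appended internally before find()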
Example 10: build_model
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_{0}.pickle'.format(fmt)
    print('Saving chunker to {0}...'.format(outfilename))
    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example 11: test_corpus_bleu
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Reads the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are located in the second-to-last line of the file.
        # The first and second items in the list are the score and system names.
        mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace tokenize the file.
            # Note: split() automatically strips.
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0 / i,) * i)
                # Check that the BLEU scores difference is less than 0.005.
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005
            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0 / i,) * i,
                                        smoothing_function=chencherry.method3)
                assert abs(mteval_bleu - nltk_bleu) < 0.005
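As the comment in the middle of the test notes, corpus_bleu expects one list of tokenized references per hypothesis. A tiny standalone illustration of that shape (the sentences are invented for the example):

from nltk.translate.bleu_score import corpus_bleu

hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
references = [[['the', 'cat', 'is', 'on', 'the', 'mat'],
               ['there', 'is', 'a', 'cat', 'on', 'the', 'mat']]]  # two references for the one hypothesis
print(corpus_bleu(references, hypotheses, weights=(0.5, 0.5)))  # bigram BLEU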
Example 12: _vocabulary
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _vocabulary(self):
    return (
        data.find('stemmers/porter_test/porter_vocabulary.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )
Example 13: test_vocabulary_original_mode
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def test_vocabulary_original_mode(self):
    # The list of stems for this test was generated by taking the
    # Martin-blessed stemmer from
    # http://tartarus.org/martin/PorterStemmer/c.txt
    # and removing all the --DEPARTURE-- sections from it and
    # running it against Martin's test vocabulary.
    self._test_against_expected_output(
        PorterStemmer.ORIGINAL_ALGORITHM,
        data.find('stemmers/porter_test/porter_original_output.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )
Example 14: demo
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def demo():
    """This assumes the Python module bllipparser is installed."""
    # download and install a basic unified parsing model (Wall Street Journal)
    # sudo python -m nltk.downloader bllip_wsj_no_aux
    from nltk.data import find
    model_dir = find('models/bllip_wsj_no_aux').path

    print('Loading BLLIP Parsing models...')
    # the easiest way to get started is to use a unified model
    bllip = BllipParser.from_unified_model_dir(model_dir)
    print('Done.')

    sentence1 = 'British left waffles on Falklands .'.split()
    sentence2 = 'I saw the man with the telescope .'.split()
    # this sentence is known to fail under the WSJ parsing model
    fail1 = '# ! ? : -'.split()
    for sentence in (sentence1, sentence2, fail1):
        print('Sentence: %r' % ' '.join(sentence))
        try:
            tree = next(bllip.parse(sentence))
            print(tree)
        except StopIteration:
            print("(parse failed)")

    # n-best parsing demo
    for i, parse in enumerate(bllip.parse(sentence1)):
        print('parse %d:\n%s' % (i, parse))

    # using external POS tag constraints
    print("forcing 'tree' to be 'NN':",
          next(bllip.tagged_parse([('A', None), ('tree', 'NN')])))
    print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')])))
    # constraints don't have to make sense... (though on more complicated
    # sentences, they may cause the parse to fail)
    print("forcing 'A' to be 'NNP':",
          next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)])))
Example 15: _get_tagger
# Required import: from nltk import data [as alias]
# Or: from nltk.data import find [as alias]
def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == 'eng':
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger
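A short usage sketch of the helper above, taking the English branch; it assumes the averaged_perceptron_tagger model is installed, and the token list is invented for the example.

# Assuming _get_tagger is defined as in Example 15.
tagger = _get_tagger('eng')
print(tagger.tag(['And', 'now', 'for', 'something', 'completely', 'different']))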