This article collects typical usage examples of the Python function nltk.data.find. If you have been wondering what find does, how to call it, and what real-world usage looks like, the curated examples below should help.
The 15 code examples of find shown below are sorted by popularity by default.
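Before the examples, here is a minimal sketch of the pattern most of them share: find() resolves a resource name to a path under nltk_data and raises LookupError when the resource is missing, so callers typically fall back to nltk.download(). The 'punkt' resource used below is only an illustration.

from nltk.data import find
from nltk import download

try:
    # find() raises LookupError if the resource has not been installed yet.
    punkt_path = find('tokenizers/punkt')
except LookupError:
    # Fetch the missing resource into nltk_data, then resolve it again.
    download('punkt')
    punkt_path = find('tokenizers/punkt')
print(punkt_path)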
Example 1: test_corpus_bleu

def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Read the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are located on the second-to-last line of the file.
        # The first and second items in the list are the score and system names.
        mteval_bleu_scores = list(
            map(float, mteval_fin.readlines()[-2].split()[1:-1])
        )

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace-tokenize the files.
            # Note: split() also strips surrounding whitespace.
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that corpus_bleu expects a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references, hypothesis, weights=(1.0 / i,) * i
                )
                # Check that the BLEU score difference is less than 0.005.
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005
            # With the same smoothing method used in mteval-v13a.pl.
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(
                    references,
                    hypothesis,
                    weights=(1.0 / i,) * i,
                    smoothing_function=chencherry.method3,
                )
                assert abs(mteval_bleu - nltk_bleu) < 0.005
Example 2: demo

def demo():
    from itertools import islice

    # zip_path = find('corpora/toolbox.zip')
    # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
Example 3: build_model

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)
    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp
Example 4: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.
    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the corpus data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded
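A hypothetical call to the helper above, using the resource named in its comments:

# Fetch the VADER sentiment lexicon on first use; returns False if it is
# already present under nltk_data.
if nltk_download_corpus('sentiment/vader_lexicon'):
    print('vader_lexicon was downloaded for the first time')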
Example 5: build_model

def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp
Example 6: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download

    try:
        find('wordnet.zip')
    except LookupError:
        download('wordnet')
Example 7: namedEntityRecognizer

def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    # print content
    if Verbose:
        echo2("Incoming content is " + content)

    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag

    start = time.time()
    # date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    # names.extend(date_time)
    result = {"result": "success", "names": names}
    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units
    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print("NER took " + str(end - start) + " seconds")
    return jsonDoc
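The find()-based tagger loading above can also be exercised outside the web endpoint. The sketch below drops the Flask request handling and the helper functions (echo2, extract_entity_names) and is only an assumption about running the same pieces standalone; it expects the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words resources to be installed under nltk_data.

import nltk
from nltk.data import find
from nltk.tag import PerceptronTagger

# Load the averaged perceptron model explicitly from nltk_data,
# as the endpoint above does, rather than via the default constructor.
PICKLE = 'averaged_perceptron_tagger.pickle'
model_loc = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
tagger = PerceptronTagger(load=False)
tagger.load(model_loc)

tokens = nltk.word_tokenize('Barack Obama visited Paris in 2015.')
print(nltk.ne_chunk(tagger.tag(tokens), binary=True))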
Example 8: _vocabulary

def _vocabulary(self):
    return (
        data.find('stemmers/porter_test/porter_vocabulary.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )
Example 9: _vocabulary

def _vocabulary(self):
    with closing(
        data.find('stemmers/porter_test/porter_vocabulary.txt').open(
            encoding='utf-8'
        )
    ) as fp:
        return fp.read().splitlines()
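Examples 8 and 9 are two variants of the same helper from the Porter stemmer test suite. Here is a hedged sketch of how such a word list might be consumed; the mode argument is a real PorterStemmer option, but the rest of the wiring is illustrative rather than taken from the test suite:

from nltk import data
from nltk.stem.porter import PorterStemmer

# Read the test vocabulary shipped with nltk_data, as the helpers above do.
words = (
    data.find('stemmers/porter_test/porter_vocabulary.txt')
    .open(encoding='utf-8')
    .read()
    .splitlines()
)
stemmer = PorterStemmer(mode=PorterStemmer.NLTK_EXTENSIONS)
print([stemmer.stem(w) for w in words[:10]])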
Example 10: demo

def demo():
    from nltk.data import find

    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus":
                corpus = value
            if key == "Id":
                corpus_id = value
        print('Reading', corpus, corpus_id, ' .....')
        print("words:", childes.words(file)[:7], "...")
        print("words with replaced words:", childes.words(file, replace=True)[:7], " ...")
        print("words with pos tags:", childes.words(file, pos=True)[:7], " ...")
        print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
        print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
        print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
        print("words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ...")
        print("sentence:", childes.sents(file)[:2], " ...")
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print("\tparticipant", participant, key, ":", value)
        print("num of sent:", len(childes.sents(file)))
        print("num of morphemes:", len(childes.words(file, stem=True)))
        print("age:", childes.age(file))
        print("age in month:", childes.age(file, month=True))
        print("MLU:", childes.MLU(file))
        print('\r')
Example 11: test_vocabulary_nltk_mode

def test_vocabulary_nltk_mode(self):
    self._test_against_expected_output(
        PorterStemmer.NLTK_EXTENSIONS,
        data.find('stemmers/porter_test/porter_nltk_output.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines(),
    )
Example 12: _get_tagger

def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger
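RUS_PICKLE is not defined in this snippet; in the NLTK source it is a module-level constant pointing at the Russian averaged perceptron model, roughly as follows (treat the exact path as an assumption):

# Assumed module-level constant used by _get_tagger() above.
RUS_PICKLE = (
    'taggers/averaged_perceptron_tagger_ru/averaged_perceptron_tagger_ru.pickle'
)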
Example 13: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download
    import os

    # Download the wordnet data only if it is not already downloaded
    wordnet_path = None
    if os.name == 'nt':
        wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                    'corpora', 'wordnet.zip')
    else:
        wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                    'corpora', 'wordnet.zip')

    try:
        if not os.path.isfile(wordnet_path):
            find('wordnet.zip')
    except LookupError:
        download('wordnet')
Example 14: demo

def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find

        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()
    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
            downloaded from https://childes.talkbank.org/data-xml/ and saved at
            [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
            demo('/path/to/childes/data-xml/Eng-USA/')
            """
        )
Example 15: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download
    import os

    # Download the punkt data only if it is not already downloaded
    punkt_path = None
    if os.name == 'nt':
        punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                  'tokenizers', 'punkt.zip')
    else:
        punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                  'tokenizers', 'punkt.zip')

    try:
        if not os.path.isfile(punkt_path):
            find('punkt.zip')
    except LookupError:
        download('punkt')