This article collects typical usage examples of the load_corpus_reader function from the Python package nltk_trainer. If you have been wondering how load_corpus_reader works, what it is used for, or how to call it, the curated code examples below may help.
The following shows 6 code examples of the load_corpus_reader function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
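Before looking at the individual examples, here is a minimal sketch of how load_corpus_reader is typically called. The corpus name and reader path below are illustrative assumptions, not taken from any of the examples:

from nltk_trainer import load_corpus_reader

# Load a built-in NLTK corpus by name (assumes the corpus data is installed)...
corpus = load_corpus_reader('movie_reviews')

# ...or load a custom corpus directory with an explicit reader class.
custom = load_corpus_reader('/path/to/corpus',
    reader='nltk.corpus.reader.plaintext.PlaintextCorpusReader')

print(corpus.fileids()[:5])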
Example 1: print
eval_group = parser.add_argument_group('Tagger Evaluation',
    'Evaluation metrics for part-of-speech taggers')
eval_group.add_argument('--no-eval', action='store_true', default=False,
    help="don't do any evaluation")
# TODO: word coverage of test words, how many get a tag != '-NONE-'

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
    print('loading %s' % args.corpus)

tagged_corpus = load_corpus_reader(args.corpus, reader=args.reader, fileids=args.fileids)
fileids = args.fileids
kwargs = {}

# all other corpora are assumed to support simplify_tags kwarg
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard', 'pl196x']:
    kwargs['simplify_tags'] = True
# these corpora do not support simplify_tags, and have no known workaround
elif simplify_wsj_tag and args.simplify_tags and args.corpus in ['pl196x']:
    raise ValueError('%s does not support simplify_tags' % args.corpus)
elif not simplify_wsj_tag and args.tagset:
    kwargs['tagset'] = args.tagset

    if args.trace:
        print('using %s tagset' % args.tagset)
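Example 1 switches between the older simplify_tags keyword and the newer tagset keyword depending on which NLTK version is installed (simplify_wsj_tag only exists in older NLTK releases). For reference, this is a small sketch of what the tagset keyword does on a modern NLTK 3 tagged corpus; it assumes the treebank and universal_tagset data packages have been downloaded:

from nltk.corpus import treebank

# Original Penn Treebank tags vs. the coarse universal tagset mapping.
print(treebank.tagged_words()[:3])
print(treebank.tagged_words(tagset='universal')[:3])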
Example 2: import_attr
reader_args = []
reader_kwargs = {}

if args.word_tokenizer:
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.source_corpus)

input_corpus = load_corpus_reader(args.source_corpus, args.reader,
    *reader_args, **reader_kwargs)

#################
## translation ##
#################

for fileid in input_corpus.fileids():
    # TODO: use ~/nltk_data/corpora as dir prefix?
    path = os.path.join(args.target_corpus, fileid)
    dirname = os.path.dirname(path)

    if not os.path.exists(dirname):
        if args.trace:
            print('making directory %s' % dirname)

        os.makedirs(dirname)
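Example 2 depends on an import_attr helper that resolves a dotted path such as nltk.tokenize.WordPunctTokenizer into the class it names. nltk_trainer ships its own implementation; a minimal sketch of such a helper using only the standard library might look like this (not nltk_trainer's exact code):

import importlib

def import_attr(path):
    # Split 'pkg.module.Attr' into module and attribute, then resolve both.
    module_name, _, attr_name = path.rpartition('.')
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)

# Usage: instantiate a tokenizer class from its dotted path.
word_tokenizer = import_attr('nltk.tokenize.WordPunctTokenizer')()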
Example 3: import_attr
reader_args.append(args.cat_pattern)
reader_kwargs['cat_pattern'] = re.compile(args.cat_pattern)

if args.word_tokenizer:
    reader_kwargs['word_tokenizer'] = import_attr(args.word_tokenizer)()

if args.sent_tokenizer:
    reader_kwargs['sent_tokenizer'] = nltk.data.LazyLoader(args.sent_tokenizer)

if args.para_block_reader:
    reader_kwargs['para_block_reader'] = import_attr(args.para_block_reader)

if args.trace:
    print('loading %s' % args.corpus)

categorized_corpus = load_corpus_reader(args.corpus, args.reader,
    *reader_args, **reader_kwargs)

if not hasattr(categorized_corpus, 'categories'):
    raise ValueError('%s does not have categories for classification' % args.corpus)

if len(args.labels) > 0:
    labels = args.labels.split(",")
else:
    labels = categorized_corpus.categories()

nlabels = len(labels)

if args.trace:
    print('%d labels: %s' % (nlabels, labels))

if not nlabels:
    raise ValueError('corpus does not have any categories')
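The cat_pattern compiled in Example 3 is a regular expression that NLTK's categorized corpus readers apply to each fileid to derive its category. A hedged sketch using NLTK's CategorizedPlaintextCorpusReader, assuming an illustrative directory layout where files live in per-category subdirectories such as pos/review1.txt and neg/review2.txt:

from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# The corpus root is a placeholder; cat_pattern captures the subdirectory
# name (pos or neg) as the category of each file.
reader = CategorizedPlaintextCorpusReader('/path/to/corpus', r'.*\.txt',
    cat_pattern=r'(\w+)/.*')

print(reader.categories())        # e.g. ['neg', 'pos']
print(reader.fileids('pos')[:3])  # fileids belonging to one category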
Example 4: load_corpus_reader
    help='Full module path to a corpus reader class, defaults to %(default)s.')
corpus_group.add_argument('--fileids', default=None,
    help='Specify fileids to load from corpus')
corpus_group.add_argument('--sent-tokenizer', default='tokenizers/punkt/english.pickle',
    help='Path to pickled sentence tokenizer')
corpus_group.add_argument('--word-tokenizer', default='nltk.tokenize.WordPunctTokenizer',
    help='Full module path to a tokenizer class, defaults to %(default)s.')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, reader=args.reader,
    fileids=args.fileids, encoding='utf-8', sent_tokenizer=args.sent_tokenizer,
    word_tokenizer=args.word_tokenizer)

if not source_corpus:
    raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
    print('loaded %s' % args.source_corpus)

############
## tagger ##
############

# TODO: from analyze_tagger_coverage.py
if args.trace:
    print('loading tagger %s' % args.tagger)
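Example 4 stops just before the tagger is actually loaded. In nltk-trainer, taggers trained with train_tagger.py are saved as pickles under the taggers/ directory of nltk_data, so the step that follows is typically an nltk.data.load call. A sketch with an assumed pickle path:

import nltk.data

# The pickle path is an assumption for illustration; adjust it to wherever
# your trained tagger was saved.
tagger = nltk.data.load('taggers/treebank_aubt.pickle')
print(tagger.tag(['The', 'quick', 'brown', 'fox']))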
Example 5: load_corpus_reader
    help='''The fraction of the corpus to use for training a binary or
    multi-class classifier, the rest will be used for evaluation.
    The default is to use the entire corpus, and to test the classifier
    against the same training data. Any number < 1 will test against
    the remaining fraction.''')

args = parser.parse_args()

###################
## corpus reader ##
###################

if args.trace:
    print('loading corpus %s' % args.corpus)

corpus = load_corpus_reader(args.corpus)

methods = {
    'sents': nltk_trainer.classification.corpus.category_sent_strings,
    'paras': nltk_trainer.classification.corpus.category_para_strings,
    'files': nltk_trainer.classification.corpus.category_file_strings
}

cat_instances = methods[args.instances](corpus)

################
## CSV output ##
################

filename = args.filename
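Example 5 cuts off right before the CSV is written. A minimal sketch of what that output step might look like, assuming cat_instances maps each category to a list of instance strings (the data shape and column layout here are assumptions, not the script's actual format):

import csv

# Stand-ins for the variables built earlier in Example 5.
cat_instances = {'pos': ['great movie'], 'neg': ['terrible plot']}
filename = 'instances.csv'

with open(filename, 'w', newline='') as f:
    writer = csv.writer(f)
    for category, instances in cat_instances.items():
        for text in instances:
            writer.writerow([category, text])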
Example 6: load_corpus_reader
trans_group = parser.add_argument_group('Language Translation')
trans_group.add_argument('--source', default='english', choices=langs, help='source language')
trans_group.add_argument('--target', default=None, choices=langs, help='target language')
trans_group.add_argument('--retries', default=3, type=int,
    help='Number of babelfish retries before quitting')
trans_group.add_argument('--sleep', default=3, type=int,
    help='Sleep time between retries')

args = parser.parse_args()

###################
## corpus reader ##
###################

source_corpus = load_corpus_reader(args.source_corpus, args.reader)

if not source_corpus:
    raise ValueError('%s is an unknown corpus' % args.source_corpus)

if args.trace:
    print('loaded %s' % args.source_corpus)

########################
## text normalization ##
########################

# TODO: copied from analyze_classifier_coverage, so abstract
if args.filter_stopwords == 'no':
    stopset = set()
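The snippet ends at the branch that disables stopword filtering. The usual counterpart, when filtering is enabled, fills stopset from NLTK's stopwords corpus; a sketch assuming the stopwords data package has been downloaded and mirroring the 'english' default of the --source option:

from nltk.corpus import stopwords

stopset = set(stopwords.words('english'))
print(len(stopset), sorted(stopset)[:5])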