本文整理匯總了Python中nltk.corpus.reader.api.CorpusReader類的典型用法代碼示例。如果您正苦於以下問題:Python CorpusReader類的具體用法?Python CorpusReader怎麼用?Python CorpusReader使用的例子?那麼，這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了CorpusReader類的15個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: __init__
def __init__(
    self,
    root,
    fileids,
    sep="/",
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer("\n", gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding="latin1",
):
    """
    Set up an aligned-corpus reader over the documents found under ``root``.

    Example usage:

        >>> root = '/...path to corpus.../'
        >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param sep: Separator value, kept on the instance as ``_sep``.
    :param word_tokenizer: Tokenizer object kept as ``_word_tokenizer``.
    :param sent_tokenizer: Tokenizer object kept as ``_sent_tokenizer``.
    :param alignedsent_block_reader: Callable kept as
        ``_alignedsent_block_reader``.
    :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
    """
    # The base class receives the corpus location and encoding first.
    CorpusReader.__init__(self, root, fileids, encoding)

    # Everything else is simply remembered for later use by the reader.
    self._alignedsent_block_reader = alignedsent_block_reader
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._sep = sep
示例2: __init__
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
             target_language=None, paragraph_separator='\n\n', **kwargs):
    """
    Build the reader, deriving a ``.json`` file pattern when none is given.

    :param root: The file root of the corpus directory.
    :param fileids: The list of file ids to consider, or a wildcard
        expression. When falsy, the pattern ``.*<target_language>\\.json``
        is used instead.
    :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
    :param skip_keywords: A list of words which indicate whole paragraphs
        that should be skipped by the paras and words methods; stored
        unchanged on the instance.
    :param target_language: Fragment inserted into the default file
        pattern (e.g. for files named ``...english.json``); a falsy value
        is treated as the empty string.
    :param paragraph_separator: Character sequence demarcating paragraph
        separation; stored unchanged on the instance.
    :param kwargs: Optional ``sent_tokenizer`` and ``word_tokenizer``
        entries are copied onto the instance; other keys are ignored here.
    """
    language_part = target_language or ''
    selected_ids = fileids or r'.*{}\.json'.format(language_part)

    # Initialize the NLTK corpus reader objects
    CorpusReader.__init__(self, root, selected_ids, encoding)

    # Only overwrite a tokenizer attribute when the caller supplied one.
    for option in ('sent_tokenizer', 'word_tokenizer'):
        if option in kwargs:
            setattr(self, '_' + option, kwargs[option])

    self.skip_keywords = skip_keywords
    self.paragraph_separator = paragraph_separator
示例3: __init__
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Initialize KNBCorpusReader.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param encoding: Encoding, forwarded to ``CorpusReader.__init__``.
    :param morphs2str: Function that converts a morph list to a str for
        the tree representation built by ``_parse()``; stored on the
        instance as ``morphs2str``.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    # Kept as a public attribute so the conversion function can be looked up later.
    self.morphs2str = morphs2str
示例4: __init__
def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
    """
    Initialize the corpus reader from a root and a file-id pattern.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids or pattern; defaults to ``DOC_PATTERN``.
    :param kwargs: Accepted but not used by this constructor.

    NOTE(review): earlier documentation said categorization arguments
    (``cat_pattern``, ``cat_map``, ``cat_file``) are handed to a
    ``CategorizedCorpusReader`` constructor, but no such call is made
    here -- confirm whether that forwarding is missing.
    """
    # Only root and fileids reach the base class; kwargs are dropped.
    CorpusReader.__init__(self, root, fileids)
示例5: __init__
def __init__(self, root, fileids, tone, tag, wrap_etree=False):
    """
    Initialize the reader and reset its per-instance result containers.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids; forwarded to the base class and also stored
        as the ``fileids`` attribute.
    :param tone: Stored as ``option_tone``.
    :param tag: Stored as ``option_tag``.
    :param wrap_etree: Stored as ``_wrap_etree``.
    """
    # NOTE(review): ``fileids``, ``sents``, ``words``, ``tagged_sents`` and
    # ``tagged_words`` are set as plain instance attributes; if the base
    # class defines methods with these names they are shadowed on this
    # instance -- confirm this is intended.
    self._wrap_etree = wrap_etree
    self.fileids = fileids

    CorpusReader.__init__(self, root, fileids)

    # Each container gets its own fresh list.
    self.tagged_sents = []
    self.sents = []
    self.words = []
    self.tagged_words = []

    self.option_tone, self.option_tag = tone, tag
示例6: __init__
def __init__(self, root, fileids,
             syntax_parser=CaboChaParser(),
             word_tokenizer=MeCabTokenizer(),
             sent_tokenizer=jp_sent_tokenizer,
             case_parser=KNPParser(),
             encoding='utf-8'):
    """
    Initialize the reader and remember the supplied parsers/tokenizers.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param syntax_parser: Stored as ``_syntax_parser``.
    :param word_tokenizer: Stored as ``_word_tokenizer``.
    :param sent_tokenizer: Stored as ``_sent_tokenizer``.
    :param case_parser: Stored as ``_case_parser``.
    :param encoding: Encoding, forwarded to ``CorpusReader.__init__``.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Tokenizers first, then the two parsers; nothing is invoked here.
    self._word_tokenizer, self._sent_tokenizer = word_tokenizer, sent_tokenizer
    self._syntax_parser, self._case_parser = syntax_parser, case_parser
示例7: __init__
def __init__(self, root, zipfile, fileids):
    """
    Initialize a reader whose corpus lives inside a zip file under ``root``.

    :param root: A string path or a ``PathPointer``.
    :param zipfile: Name joined onto ``root`` to locate the zip archive.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :raises TypeError: If ``root`` is neither a string nor a
        ``PathPointer``.
    """
    # Reject unsupported root types up front.
    if not isinstance(root, (basestring, PathPointer)):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # Plain strings are wrapped so that ``join`` is available below.
    if isinstance(root, basestring):
        root = FileSystemPathPointer(root)

    # convert to a ZipFilePathPointer
    archive_root = ZipFilePathPointer(root.join(zipfile))
    CorpusReader.__init__(self, archive_root, fileids)
    self._parse_char_replacements()
示例8: __init__
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader, supplying a default category pattern.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids or pattern; defaults to ``PKL_PATTERN``.
    :param kwargs: Categorization options (keys starting with ``cat_``).
        The whole dict is handed to ``CategorizedCorpusReader.__init__``
        as a single positional argument.
    """
    # When the caller passed no ``cat_*`` option, use the default pattern.
    caller_chose_categories = any(name.startswith('cat_') for name in kwargs)
    if not caller_chose_categories:
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
示例9: __init__
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             encoding=None):
    """
    Initialize the reader and store its tokenizing configuration.

    @param root: The root directory for this corpus.
    @param fileids: A list or regexp specifying the fileids in this corpus.
    @param sep: Separator value, stored as C{_sep}.
    @param word_tokenizer: Tokenizer stored as C{_word_tokenizer}.
    @param sent_tokenizer: Tokenizer stored as C{_sent_tokenizer}.
    @param encoding: Encoding forwarded to C{CorpusReader.__init__}.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    # This constructor configures no block reader. The attribute must be a
    # plain None: a stray trailing comma previously stored the tuple
    # ``(None,)``, which is truthy and defeats any ``is None`` check.
    self._alignedsent_block_reader = None
    self._alignedsent_corpus_view = None
示例10: assemble_corpus
def assemble_corpus(corpus_reader: "CorpusReader",
                    types_requested: List[str],
                    type_dirs: Dict[str, List[str]] = None,
                    type_files: Dict[str, List[str]] = None) -> "CorpusReader":
    """
    Create a filtered corpus by narrowing the reader's file ids.

    :param corpus_reader: The reader to filter. It is mutated in place: its
        ``_fileids`` attribute is replaced with the selected ids.
    :param types_requested: a list of string types, which are to be found in
        the type_dirs and type_files mappings
    :param type_dirs: a dict of corpus types to directories; every file id
        starting with ``<directory>/`` is selected (``./`` is removed from
        the directory name first)
    :param type_files: a dict of corpus types to files; listed files are
        selected when the reader actually knows them
    :return: the same ``corpus_reader`` object, whose ``_fileids`` is now a
        tuple of the selected ids sorted by name (empty when nothing
        matched). If an unexpected error occurs it is logged and ``None``
        is returned.
    """
    try:
        all_file_ids = list(corpus_reader.fileids())
        # Set lookup avoids rescanning the full id list for every candidate.
        known_ids = set(all_file_ids)
        clean_ids_types = []  # type: List[Tuple[str, str]]

        if type_files:
            for key, valuelist in type_files.items():
                # Falsy type names are never selected from type_files.
                if not key or key not in types_requested:
                    continue
                clean_ids_types.extend(
                    (value, key) for value in valuelist if value in known_ids)

        if type_dirs:
            for key, valuelist in type_dirs.items():
                if key not in types_requested:
                    continue
                for value in valuelist:
                    corrected_dir = '{}/'.format(value.replace('./', ''))
                    clean_ids_types.extend(
                        (name, key) for name in all_file_ids
                        if name and name.startswith(corrected_dir))

        clean_ids_types.sort(key=lambda pair: pair[0])
        # Build the tuple directly instead of unpacking ``zip(*pairs)``:
        # with no matches the unpacking raised and an empty selection was
        # misreported as a failure.
        corpus_reader._fileids = tuple(name for name, _ in clean_ids_types)
        return corpus_reader
    except Exception:
        # Best-effort contract: never propagate, just log with traceback.
        LOG.exception('failure in corpus building')
        return None  # type: ignore
示例11: __init__
def __init__(self, root, fileids=None,
             word_tokenizer=TweetTokenizer(),
             encoding='utf8'):
    """
    Initialize the reader and verify that its corpus files are not empty.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    :param encoding: Encoding forwarded to ``CorpusReader.__init__``.
    :raises ValueError: If a corpus file that is not a
        ``ZipFilePathPointer`` has size zero.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    for corpus_path in self.abspaths(self._fileids):
        if isinstance(corpus_path, ZipFilePathPointer):
            # Paths inside zip archives are not size-checked.
            continue
        if os.path.getsize(corpus_path) == 0:
            raise ValueError("File {} is empty".format(corpus_path))

    self._word_tokenizer = word_tokenizer
示例12: __init__
def __init__(self,
             root,
             fileids,
             column_types=None,
             top_node='S',
             beginning_of_sentence=r'#BOS.+$',
             end_of_sentence=r'#EOS.+$',
             encoding=None):
    """ Construct a new corpus reader for reading NEGRA corpus files.

    @param root: The root directory of the corpus files.
    @param fileids: A list of or regex specifying the files to read from.
    @param column_types: An optional C{list} of columns in the corpus.
        Any value that is not a C{list} (including C{None}) is replaced
        by C{self.COLUMN_TYPES}.
    @param top_node: The top node of parsed sentence trees.
    @param beginning_of_sentence: A regex specifying the start of a sentence
    @param end_of_sentence: A regex specifying the end of a sentence
    @param encoding: The default corpus file encoding.
    @raise ValueError: If C{column_types} is a list containing an entry
        that is not in C{self.COLUMN_TYPES}.
    """
    # Make sure there are no invalid column types
    if isinstance(column_types, list):
        for column_type in column_types:
            if column_type not in self.COLUMN_TYPES:
                # The one-element tuple keeps %-formatting correct even if
                # the offending value is itself a tuple.
                raise ValueError(
                    "Column %r is not supported." % (column_type,))
    else:
        column_types = self.COLUMN_TYPES

    # Define stuff
    self._top_node = top_node
    self._column_types = column_types
    self._fileids = fileids
    self._bos = beginning_of_sentence
    self._eos = end_of_sentence
    # Maps each column name to its position in ``column_types``.
    self._colmap = {column: index
                    for index, column in enumerate(column_types)}

    # Finish constructing by calling the extended class' constructor
    CorpusReader.__init__(self, root, fileids, encoding)
示例13: fileids
def fileids(self, channels=None, domains=None, categories=None):
    """
    Return the corpus file ids, optionally restricted by one filter.

    At most one of the three filters may be given. Each filter accepts a
    single string or a collection of strings.

    :param channels: Channel name(s) to select by.
    :param domains: Domain name(s) to select by.
    :param categories: Category name(s) to select by.
    :return: ``CorpusReader.fileids(self)`` when no filter is given,
        otherwise the result of ``self._list_morph_files_by`` for the
        chosen filter.
    :raises ValueError: If more than one filter is specified.
    """
    filters_given = sum(
        1 for selector in (channels, domains, categories)
        if selector is not None)
    # The filters are mutually exclusive. The old check only fired when all
    # three were passed, so two filters were silently accepted and the
    # lower-priority one was ignored.
    if filters_given > 1:
        raise ValueError('You can specify only one of channels, domains '
                         'and categories parameter at once')
    if filters_given == 0:
        return CorpusReader.fileids(self)

    # Allow a bare string as shorthand for a one-element list.
    if isinstance(channels, basestring):
        channels = [channels]
    if isinstance(domains, basestring):
        domains = [domains]
    if isinstance(categories, basestring):
        categories = [categories]

    if channels:
        return self._list_morph_files_by('channel', channels)
    elif domains:
        return self._list_morph_files_by('domain', domains)
    else:
        return self._list_morph_files_by('keyTerm', categories,
                                         map=self._map_category)
示例14: __init__
def __init__(self, root, fileids):
    """
    Initialize the reader from a corpus root and its file ids.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    """
    # The base constructor's third and fourth positional arguments are
    # explicitly passed as None.
    CorpusReader.__init__(self, root, fileids, None, None)
示例15: __init__
def __init__(self, root, fileids, wrap_etree=False):
    """
    Initialize the reader and record the ``wrap_etree`` option.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param wrap_etree: Flag stored as ``_wrap_etree``; defaults to False.
    """
    # The flag is recorded before the base class is initialized.
    self._wrap_etree = wrap_etree
    CorpusReader.__init__(self, root, fileids)