本文整理汇总了Python中nltk.corpus.reader.api.CorpusReader.__init__方法的典型用法代码示例。如果您正苦于以下问题：Python CorpusReader.__init__方法的具体用法？Python CorpusReader.__init__怎么用？Python CorpusReader.__init__使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.reader.api.CorpusReader的用法示例。
在下文中一共展示了CorpusReader.__init__方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(
    self,
    root,
    fileids,
    sep="/",
    word_tokenizer=WhitespaceTokenizer(),
    sent_tokenizer=RegexpTokenizer("\n", gaps=True),
    alignedsent_block_reader=read_alignedsent_block,
    encoding="latin1",
):
    """
    Set up an aligned-corpus reader over the documents found under ``root``.

    The base ``CorpusReader`` is initialised with ``root``, ``fileids`` and
    ``encoding``; every other argument is simply stored on the instance for
    later use.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param sep: Separator string, stored as ``self._sep``
        (presumably the word/tag separator -- confirm against the readers
        that consume it).
    :param word_tokenizer: Tokenizer object stored as
        ``self._word_tokenizer``.
    :param sent_tokenizer: Tokenizer object stored as
        ``self._sent_tokenizer``.
    :param alignedsent_block_reader: Callable stored as
        ``self._alignedsent_block_reader``.
    :param encoding: Encoding handed to the ``CorpusReader`` constructor.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Plain configuration; nothing below depends on the order of assignment.
    self._alignedsent_block_reader = alignedsent_block_reader
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._sep = sep
示例2: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
             target_language=None, paragraph_separator='\n\n', **kwargs):
    """
    Initialise a reader over JSON corpus files.

    :param root: The file root of the corpus directory.
    :param fileids: The list of file ids to consider, or a wildcard
        expression. When falsy, a pattern matching ``*<target_language>.json``
        is built instead.
    :param encoding: Encoding passed to the ``CorpusReader`` constructor.
    :param skip_keywords: Stored as ``self.skip_keywords``; a list of words
        marking whole paragraphs that should be skipped.
    :param target_language: Inserted into the default ``fileids`` pattern;
        a falsy value is treated as the empty string.
    :param paragraph_separator: Character sequence demarcating paragraph
        separation; stored as ``self.paragraph_separator``.
    :param kwargs: Only ``sent_tokenizer`` and ``word_tokenizer`` are read;
        when present they are stored as ``self._sent_tokenizer`` and
        ``self._word_tokenizer`` respectively.
    """
    if not fileids:
        # No explicit selection: fall back to "<anything><language>.json".
        fileids = r'.*{}\.json'.format(target_language or '')

    # Initialize the NLTK corpus reader objects
    CorpusReader.__init__(self, root, fileids, encoding)

    # Optional tokenizer overrides are only set when the caller supplied them.
    for option in ('sent_tokenizer', 'word_tokenizer'):
        if option in kwargs:
            setattr(self, '_' + option, kwargs[option])

    self.paragraph_separator = paragraph_separator
    self.skip_keywords = skip_keywords
示例3: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
    """
    Initialize KNBCorpusReader.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param encoding: Encoding, forwarded to ``CorpusReader.__init__``.
    :param morphs2str: Function converting a morph list to a ``str`` for the
        tree representation built by ``_parse()``; kept on the instance as
        ``self.morphs2str``.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Remember the conversion hook for later parsing.
    self.morphs2str = morphs2str
示例4: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
    """
    Initialize the corpus reader.

    ``root`` and ``fileids`` are handed to the ``CorpusReader`` constructor.

    NOTE(review): ``**kwargs`` is accepted but never read in this method, so
    categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
    are not forwarded anywhere from here -- confirm whether that is intended.
    """
    # Only the positional corpus location/selection reaches the base class.
    CorpusReader.__init__(self, root, fileids)
示例5: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids, tone, tag, wrap_etree=False):
    """
    Initialise the reader and its (initially empty) result containers.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids; stored as ``self.fileids`` and also forwarded
        to ``CorpusReader.__init__``.
    :param tone: Stored as ``self.option_tone``.
    :param tag: Stored as ``self.option_tag``.
    :param wrap_etree: Stored as ``self._wrap_etree``.

    NOTE(review): ``fileids``, ``sents``, ``words``, ``tagged_sents`` and
    ``tagged_words`` are set as instance attributes here; they may shadow
    same-named methods of the base reader -- confirm against callers.
    """
    # State recorded before the base constructor runs.
    self._wrap_etree = wrap_etree
    self.fileids = fileids

    CorpusReader.__init__(self, root, fileids)

    # Reader options.
    self.option_tag = tag
    self.option_tone = tone

    # Each container gets its own fresh list.
    self.words, self.sents = [], []
    self.tagged_words, self.tagged_sents = [], []
示例6: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids,
             syntax_parser=CaboChaParser(),
             word_tokenizer=MeCabTokenizer(),
             sent_tokenizer=jp_sent_tokenizer,
             case_parser=KNPParser(),
             encoding='utf-8'):
    """
    Initialise the reader with its parsing and tokenizing components.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param syntax_parser: Stored as ``self._syntax_parser``.
    :param word_tokenizer: Stored as ``self._word_tokenizer``.
    :param sent_tokenizer: Stored as ``self._sent_tokenizer``.
    :param case_parser: Stored as ``self._case_parser``.
    :param encoding: Encoding, forwarded to ``CorpusReader.__init__``.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Tokenizers first, then the parsers; the assignments are independent.
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._case_parser = case_parser
    self._syntax_parser = syntax_parser
示例7: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, zipfile, fileids):
    """
    Initialise a reader whose corpus files live inside a zip archive.

    :param root: A path string or a ``PathPointer`` for the directory that
        contains the archive.
    :param zipfile: Name of the archive, joined onto ``root``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :raise TypeError: If ``root`` is neither a string nor a ``PathPointer``.
    """
    # Reject unsupported root types up front.
    if not isinstance(root, (basestring, PathPointer)):
        raise TypeError('CorpusReader: expected a string or a PathPointer')

    # Plain strings are promoted to a file-system pointer first.
    if isinstance(root, basestring):
        root = FileSystemPathPointer(root)

    # convert to a ZipFilePathPointer
    archive_root = ZipFilePathPointer(root.join(zipfile))
    CorpusReader.__init__(self, archive_root, fileids)
    self._parse_char_replacements()
示例8: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
    """
    Initialize the corpus reader.

    The keyword arguments are handed, as a single dict, to the
    ``CategorizedCorpusReader`` constructor; ``root`` and ``fileids`` go to
    the ``CorpusReader`` constructor.

    If no keyword whose name starts with ``cat_`` was supplied,
    ``cat_pattern`` is set to ``CAT_PATTERN`` before forwarding.
    """
    # Add the default category pattern if not passed into the class.
    if all(not option.startswith('cat_') for option in kwargs):
        kwargs['cat_pattern'] = CAT_PATTERN

    # Categorization first, then the generic reader set-up.
    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids)
示例9: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids,
             sep='/', word_tokenizer=WhitespaceTokenizer(),
             sent_tokenizer=RegexpTokenizer('\n', gaps=True),
             encoding=None,
             alignedsent_block_reader=None):
    """
    Construct a new aligned corpus reader.

    @param root: The root directory for this corpus.
    @param fileids: A list or regexp specifying the fileids in this corpus.
    @param sep: Separator string, stored as C{self._sep}.
    @param word_tokenizer: Tokenizer stored as C{self._word_tokenizer}.
    @param sent_tokenizer: Tokenizer stored as C{self._sent_tokenizer}.
    @param encoding: Encoding passed to the C{CorpusReader} constructor.
    @param alignedsent_block_reader: Optional block reader stored as
        C{self._alignedsent_block_reader}; defaults to C{None}. It is added
        last so existing positional callers are unaffected.
    """
    CorpusReader.__init__(self, root, fileids, encoding)
    self._sep = sep
    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    # The previous code wrote ``self._alignedsent_block_reader=None,`` --
    # the trailing comma stored the one-element tuple ``(None,)`` (which is
    # truthy) and was followed by a no-op self-assignment. Store the real
    # value instead.
    self._alignedsent_block_reader = alignedsent_block_reader
    self._alignedsent_corpus_view = None
示例10: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids=None,
             word_tokenizer=TweetTokenizer(),
             encoding='utf8'):
    """
    Initialise the reader and verify that no corpus file is empty.

    :param root: The root directory for this corpus.
    :param fileids: A list or regexp specifying the fileids in this corpus.
    :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.
    :param encoding: Encoding passed to the ``CorpusReader`` constructor.
    :raise ValueError: If a non-zip corpus file has size zero.
    """
    CorpusReader.__init__(self, root, fileids, encoding)

    # Check that all user-created corpus files are non-empty.
    # Paths inside zip archives are not checked.
    for corpus_path in self.abspaths(self._fileids):
        is_zipped = isinstance(corpus_path, ZipFilePathPointer)
        if not is_zipped and os.path.getsize(corpus_path) == 0:
            raise ValueError("File {} is empty".format(corpus_path))

    self._word_tokenizer = word_tokenizer
示例11: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self,
             root,
             fileids,
             column_types=None,
             top_node='S',
             beginning_of_sentence=r'#BOS.+$',
             end_of_sentence=r'#EOS.+$',
             encoding=None):
    """ Construct a new corpus reader for reading NEGRA corpus files.

    @param root: The root directory of the corpus files.
    @param fileids: A list of or regex specifying the files to read from.
    @param column_types: An optional C{list} of columns in the corpus.
        Anything that is not a C{list} (including C{None}) selects
        C{self.COLUMN_TYPES}.
    @param top_node: The top node of parsed sentence trees.
    @param beginning_of_sentence: A regex specifying the start of a sentence
    @param end_of_sentence: A regex specifying the end of a sentence
    @param encoding: The default corpus file encoding.
    @raise ValueError: If C{column_types} contains an entry that is not in
        C{self.COLUMN_TYPES}.
    """
    # Make sure there are no invalid column types.
    if isinstance(column_types, list):
        for column_type in column_types:
            if column_type not in self.COLUMN_TYPES:
                # The original referenced the undefined name ``columntype``
                # here, turning the intended ValueError into a NameError.
                # The 1-tuple keeps %-formatting safe for tuple values.
                raise ValueError(
                    "Column %r is not supported." % (column_type,))
    else:
        column_types = self.COLUMN_TYPES

    # Define stuff
    self._top_node = top_node
    self._column_types = column_types
    self._fileids = fileids
    self._bos = beginning_of_sentence
    self._eos = end_of_sentence
    # Map each column name to its position in the column list.
    self._colmap = {name: index for index, name in enumerate(column_types)}

    # Finish constructing by calling the extended class' constructor
    CorpusReader.__init__(self, root, fileids, encoding)
示例12: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids):
    """
    Initialise the reader by delegating to ``CorpusReader.__init__``.

    :param root: Corpus root, forwarded unchanged.
    :param fileids: File ids, forwarded unchanged.
    """
    # The third and fourth positional arguments of the base constructor
    # are both passed explicitly as None.
    CorpusReader.__init__(self, root, fileids, None, None)
示例13: __init__
# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 或者: from nltk.corpus.reader.api.CorpusReader import __init__ [as 别名]
def __init__(self, root, fileids, wrap_etree=False):
    """
    Initialise the reader.

    :param root: Corpus root, forwarded to ``CorpusReader.__init__``.
    :param fileids: File ids, forwarded to ``CorpusReader.__init__``.
    :param wrap_etree: Flag stored as ``self._wrap_etree`` before the base
        constructor runs.
    """
    # Record the flag first, then let the base class finish the set-up.
    self._wrap_etree = wrap_etree
    CorpusReader.__init__(self, root, fileids)