当前位置: 首页>>代码示例>>Python>>正文


Python CorpusReader.__init__方法代码示例

本文整理汇总了Python中nltk.corpus.reader.api.CorpusReader.__init__方法的典型用法代码示例。如果您正苦于以下问题:Python CorpusReader.__init__方法的具体用法是什么?Python CorpusReader.__init__怎么用?有哪些Python CorpusReader.__init__的使用例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类nltk.corpus.reader.api.CorpusReader的用法示例。


在下文中一共展示了CorpusReader.__init__方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
    def __init__(self, root, fileids, sep="/",
                 word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer("\n", gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding="latin1"):
        """
        Build an aligned-corpus reader over the documents found under
        ``root``.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator string, kept on the reader as ``_sep``
            (defaults to ``"/"``).
        :param word_tokenizer: Tokenizer object kept as ``_word_tokenizer``.
        :param sent_tokenizer: Tokenizer object kept as ``_sent_tokenizer``.
        :param alignedsent_block_reader: Callable kept as
            ``_alignedsent_block_reader``.
        :param encoding: Encoding handed to ``CorpusReader.__init__``
            (defaults to ``"latin1"``).
        """
        # Let the base class register root, fileids and encoding first.
        CorpusReader.__init__(self, root, fileids, encoding)

        # Then remember the reader-specific collaborators.
        self._alignedsent_block_reader = alignedsent_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._word_tokenizer = word_tokenizer
        self._sep = sep
开发者ID:Reagankm,项目名称:KnockKnock,代码行数:27,代码来源:aligned.py

示例2: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 target_language=None, paragraph_separator='\n\n', **kwargs):
        """
        Configure the reader and initialise the underlying NLTK corpus reader.

        :param root: The file root of the corpus directory
        :param fileids: the list of file ids to consider, or wildcard
            expression; when empty/None a pattern matching
            ``*<target_language>.json`` is used instead
        :param encoding: encoding handed to ``CorpusReader.__init__``
            (default ``'utf8'``)
        :param skip_keywords: a list of words which indicate whole paragraphs
            that should be skipped by the paras and words methods; stored as
            ``self.skip_keywords``
        :param target_language: which files to select when ``fileids`` is not
            given; sometimes a corpus contains English translations, and those
            files are expected to be named ...english.json
        :param paragraph_separator: character sequence demarcating paragraph
            separation; stored as ``self.paragraph_separator``
        :param kwargs: only ``sent_tokenizer`` and ``word_tokenizer`` are
            read here; any other keys are ignored by this method
        """
        if not fileids:
            # No explicit selection: match every JSON file whose name ends
            # with the (possibly empty) target language.
            fileids = r'.*{}\.json'.format(target_language or '')

        # Initialize the NLTK corpus reader objects
        CorpusReader.__init__(self, root, fileids, encoding)

        # Optional tokenizer overrides land on the matching private attribute
        # (``_sent_tokenizer`` / ``_word_tokenizer``) only when supplied.
        for option in ('sent_tokenizer', 'word_tokenizer'):
            if option in kwargs:
                setattr(self, '_' + option, kwargs[option])

        self.paragraph_separator = paragraph_separator
        self.skip_keywords = skip_keywords

示例3: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default):
     """
     Initialize KNBCorpusReader.

     :param root: forwarded unchanged to ``CorpusReader.__init__``.
     :param fileids: forwarded unchanged to ``CorpusReader.__init__``.
     :param encoding: forwarded unchanged to ``CorpusReader.__init__``;
         defaults to ``'utf8'``.
     :param morphs2str: function to convert a morph list to a str for the
         tree representation used by ``_parse()``; stored as
         ``self.morphs2str``.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self.morphs2str = morphs2str

示例4: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids=DOC_PATTERN, **kwargs):
     """
     Initialize the corpus reader.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__``; defaults to
         ``DOC_PATTERN``.
     :param kwargs: accepted for call compatibility but not used by this
         method.

     NOTE(review): an earlier version of this docstring said that
     categorization arguments (``cat_pattern``, ``cat_map``, ``cat_file``)
     go to the ``CategorizedCorpusReader`` constructor and the remaining
     arguments to ``CorpusReader``; the code below forwards neither --
     confirm whether that is intended.
     """
     # Only root and fileids reach the base class; kwargs are dropped here.
     CorpusReader.__init__(self, root, fileids)

示例5: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids, tone, tag, wrap_etree=False):
     """
     Set up the reader state and initialise the base ``CorpusReader``.

     :param root: forwarded to ``CorpusReader.__init__``.
     :param fileids: forwarded to ``CorpusReader.__init__`` and also kept
         as ``self.fileids``.
     :param tone: stored as ``self.option_tone``.
     :param tag: stored as ``self.option_tag``.
     :param wrap_etree: stored as ``self._wrap_etree`` (default ``False``).

     NOTE(review): ``fileids``, ``words``, ``sents``, ``tagged_words`` and
     ``tagged_sents`` are set as plain instance attributes; if the class
     hierarchy also defines methods with these names they would be
     shadowed on the instance -- confirm this is intentional.
     """
     # These two are assigned before the base initialiser runs.
     self.fileids = fileids
     self._wrap_etree = wrap_etree

     CorpusReader.__init__(self, root, fileids)

     # Empty result containers, each a distinct list object.
     self.tagged_sents, self.sents = [], []
     self.words, self.tagged_words = [], []

     # Caller-selected options.
     self.option_tone, self.option_tag = tone, tag

示例6: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids, syntax_parser=CaboChaParser(),
              word_tokenizer=MeCabTokenizer(), sent_tokenizer=jp_sent_tokenizer,
              case_parser=KNPParser(), encoding='utf-8'):
   """
   Initialise the base ``CorpusReader`` and remember the analysis tools.

   :param root: forwarded to ``CorpusReader.__init__``.
   :param fileids: forwarded to ``CorpusReader.__init__``.
   :param syntax_parser: stored as ``self._syntax_parser``.
   :param word_tokenizer: stored as ``self._word_tokenizer``.
   :param sent_tokenizer: stored as ``self._sent_tokenizer``.
   :param case_parser: stored as ``self._case_parser``.
   :param encoding: forwarded to ``CorpusReader.__init__``
       (default ``'utf-8'``).

   The default parser/tokenizer objects are created once, when this
   method is defined, and are therefore shared by every reader that
   relies on the defaults.
   """
   CorpusReader.__init__(self, root, fileids, encoding)

   # Tokenizers first, then the two parsers.
   self._word_tokenizer, self._sent_tokenizer = word_tokenizer, sent_tokenizer
   self._syntax_parser, self._case_parser = syntax_parser, case_parser
开发者ID:miyamofigo,项目名称:Japanese-corpus-and-utility,代码行数:13,代码来源:corpus.py

示例7: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, zipfile, fileids):
     """
     Initialise the reader on a zip archive located under ``root``.

     :param root: either a string path or a ``PathPointer``; a string is
         wrapped in a ``FileSystemPathPointer``.
     :param zipfile: name joined onto ``root`` to locate the archive; the
         joined path is wrapped in a ``ZipFilePathPointer`` that becomes
         the corpus root.
     :param fileids: forwarded to ``CorpusReader.__init__``.
     :raises TypeError: if ``root`` is neither a string nor a
         ``PathPointer``.
     """
     # ``basestring`` only exists on Python 2; fall back to ``str`` so the
     # string check does not raise NameError on Python 3.
     try:
         string_types = basestring
     except NameError:
         string_types = str

     if isinstance(root, string_types):
         root = FileSystemPathPointer(root)
     elif not isinstance(root, PathPointer):
         raise TypeError('CorpusReader: expected a string or a PathPointer')

     # convert to a ZipFilePathPointer
     root = ZipFilePathPointer(root.join(zipfile))

     CorpusReader.__init__(self, root, fileids)

     # Runs only after the base reader has been initialised.
     self._parse_char_replacements()

示例8: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
    def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.

        The keyword arguments are handed, as a single dict, to the
        ``CategorizedCorpusReader`` constructor.  If none of them starts
        with ``cat_`` (e.g. ``cat_pattern``, ``cat_map``, ``cat_file``),
        ``cat_pattern`` is set to ``CAT_PATTERN`` first.

        :param root: forwarded to ``CorpusReader.__init__``.
        :param fileids: forwarded to ``CorpusReader.__init__``; defaults to
            ``PKL_PATTERN``.
        :param kwargs: categorization options; see above.

        NOTE(review): only ``root`` and ``fileids`` are passed to
        ``CorpusReader.__init__``; any non-categorization keyword (such as
        an encoding) is not forwarded -- confirm whether it should be.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # The dict itself (not **kwargs) is passed to the categorized mixin.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

示例9: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids,
              sep='/', word_tokenizer=WhitespaceTokenizer(),
              sent_tokenizer=RegexpTokenizer('\n', gaps=True),
              encoding=None):
     """
     Initialise the reader and its tokenizers.

     @param root: The root directory for this corpus.
     @param fileids: A list or regexp specifying the fileids in this corpus.
     @param sep: Separator string stored as C{self._sep}.
     @param word_tokenizer: Tokenizer stored as C{self._word_tokenizer}.
     @param sent_tokenizer: Tokenizer stored as C{self._sent_tokenizer}.
     @param encoding: Encoding forwarded to C{CorpusReader.__init__}.
     """
     CorpusReader.__init__(self, root, fileids, encoding)
     self._sep = sep
     self._word_tokenizer = word_tokenizer
     self._sent_tokenizer = sent_tokenizer
     # No block reader or corpus view is configured at construction time.
     # A stray trailing comma previously stored the one-element tuple
     # ``(None,)`` here, which is truthy and is not ``None``.
     self._alignedsent_block_reader = None
     self._alignedsent_corpus_view = None

示例10: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
    def __init__(self, root, fileids=None,
                 word_tokenizer=TweetTokenizer(),
                 encoding='utf8'):
        """
        Initialise the reader and verify that its corpus files have content.

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
        smaller units, including but not limited to words.

        :param encoding: Encoding forwarded to ``CorpusReader.__init__``
        (default ``'utf8'``).

        :raises ValueError: if any corpus file that is not addressed through
        a ``ZipFilePathPointer`` has a size of zero bytes.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        # Check that all user-created corpus files are non-empty.
        for path in self.abspaths(self._fileids):
            # Paths inside zip archives are not size-checked.
            if isinstance(path, ZipFilePathPointer):
                continue
            if os.path.getsize(path) == 0:
                raise ValueError("File {} is empty".format(path))

        self._word_tokenizer = word_tokenizer

示例11: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
    def __init__(self,
                 root,
                 fileids,
                 column_types=None,
                 top_node='S',
                 beginning_of_sentence=r'#BOS.+$',
                 end_of_sentence=r'#EOS.+$',
                 encoding=None):
        """ Construct a new corpus reader for reading NEGRA corpus files.
        @param root: The root directory of the corpus files.
        @param fileids: A list of or regex specifying the files to read from.
        @param column_types: An optional C{list} of columns in the corpus.
            Any value that is not a C{list} (including C{None}) selects
            C{self.COLUMN_TYPES}.
        @param top_node: The top node of parsed sentence trees.
        @param beginning_of_sentence: A regex specifying the start of a sentence
        @param end_of_sentence: A regex specifying the end of a sentence
        @param encoding: The default corpus file encoding.
        @raise ValueError: If C{column_types} is a list containing an entry
            that is not in C{self.COLUMN_TYPES}.
        """

        # Make sure there are no invalid column type
        if isinstance(column_types, list):
            for column_type in column_types:
                if column_type not in self.COLUMN_TYPES:
                    # The message must use the loop variable; the previous
                    # misspelled name raised NameError instead of ValueError.
                    raise ValueError("Column %r is not supported." % column_type)
        else:
            column_types = self.COLUMN_TYPES

        # Define stuff
        self._top_node = top_node
        self._column_types = column_types
        self._fileids = fileids
        self._bos = beginning_of_sentence
        self._eos = end_of_sentence
        # Map each column name to its position in ``column_types``.
        self._colmap = {c: i for (i, c) in enumerate(column_types)}

        # Finish constructing by calling the extended class' constructor
        CorpusReader.__init__(self, root, fileids, encoding)

示例12: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids):
     """
     Initialise the reader by delegating to ``CorpusReader.__init__``.

     :param root: forwarded unchanged to ``CorpusReader.__init__``.
     :param fileids: forwarded unchanged to ``CorpusReader.__init__``.
     """
     # The two trailing ``None`` values fill the third and fourth positional
     # parameters of ``CorpusReader.__init__``.
     # NOTE(review): presumably the encoding and a tag-related option --
     # confirm against the installed NLTK version.
     CorpusReader.__init__(self, root, fileids, None, None)

示例13: __init__

# 需要导入模块: from nltk.corpus.reader.api import CorpusReader [as 别名]
# 注意: __init__ 是 CorpusReader 类的方法,不能单独导入;导入 CorpusReader 后以 CorpusReader.__init__(self, ...) 的形式调用
 def __init__(self, root, fileids, wrap_etree=False):
     """
     Initialise the reader.

     :param root: forwarded unchanged to ``CorpusReader.__init__``.
     :param fileids: forwarded unchanged to ``CorpusReader.__init__``.
     :param wrap_etree: stored as ``self._wrap_etree`` (default ``False``).
         NOTE(review): presumably controls whether parsed XML trees are
         wrapped before being returned -- confirm where the flag is read.
     """
     # The flag is recorded before the base initialiser runs.
     self._wrap_etree = wrap_etree
     CorpusReader.__init__(self, root, fileids)


注:本文中的nltk.corpus.reader.api.CorpusReader.__init__方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。