

Python tokenize.WordPunctTokenizer Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.WordPunctTokenizer. If you are wondering what exactly tokenize.WordPunctTokenizer does, how to use it, or would like to see it used in real code, the curated examples below may help. You can also explore other usage examples from the nltk.tokenize module.


The following shows 15 code examples of tokenize.WordPunctTokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
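
Before the numbered examples, here is a minimal sketch (not taken from any of the projects below) of what WordPunctTokenizer itself does: it splits text into runs of alphanumeric characters and runs of punctuation, using the regular expression \w+|[^\w\s]+.

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
# Punctuation becomes separate tokens; contractions are split apart.
print(tokenizer.tokenize("Let's tokenize this, shall we?"))
# ['Let', "'", 's', 'tokenize', 'this', ',', 'shall', 'we', '?']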

Example 1: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: rafasashi, Project: razzy-spinner, Lines of code: 26, Source file: xmldocs.py

Example 2: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Author: foxbook, Project: atap, Lines of code: 19, Source file: reader.py

Example 3: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=DOC_PATTERN,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 encoding='latin-1', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._tags = TAGS 
Author: foxbook, Project: atap, Lines of code: 23, Source file: reader.py

Example 4: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: blackye, Project: luscan-devel, Lines of code: 23, Source file: xmldocs.py

Example 5: __getitem__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __getitem__(self, idx):
        idx, start, end = self.lst[idx]
        dialog = self.raw[idx][start:end]
        source, target = dialog[:-1], dialog[-1]

        spks, utts = list(zip(*[(speaker, WordPunctTokenizer().tokenize(uttr)) for speaker, uttr, _ in source]))

        spks = list(spks)

        while len(spks) < 10:
            spks.append(0)

        source = '|||'.join([' '.join(uttr) for uttr in utts])
        target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))

        return spks, source, target_test, target[0] 
Author: asyml, Project: texar, Lines of code: 18, Source file: sw_loader.py
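
To illustrate the formatting performed above, the tokenized source utterances are joined with '|||' as a separator (the utterances below are made up for demonstration):

utts = [['hello', 'there'], ['how', 'are', 'you', '?']]
source = '|||'.join(' '.join(uttr) for uttr in utts)
# 'hello there|||how are you ?'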

Example 6: getredundantComponents

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []

    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)

        f_point = length_sent // 3
        m_point = length_sent // 2
        index_span = window_size // 2
        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist 
Author: StevenLOL, Project: AbTextSumm, Lines of code: 22, Source file: WGGraph.py
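
A quick, hypothetical call to the helper above (the sentence is invented for illustration); each returned list holds one window per input sentence:

sentences = ["the quick brown fox jumps over the lazy dog near the river bank"]
intros, mids, ends = getredundantComponents(sentences)
# intros -> ['the quick brown fox']   (first window_size=4 words)
# mids   -> ['jumps over the lazy']   (4 words around the midpoint)
# ends   -> ['near the river bank']   (last 4 words)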

Example 7: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines of code: 26, Source file: xmldocs.py

Example 8: data_tockenize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def data_tockenize(text):
  tokenizer = WordPunctTokenizer()
  tokens = tokenizer.tokenize(text)
  return (" ".join(tokens)).strip() 
Author: lambdal, Project: lambda-deep-learning-demo, Lines of code: 6, Source file: preprocess_aclImdb_v1.py
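
For instance, calling the function above on a short string (a hypothetical input) normalizes it to space-separated tokens:

print(data_tockenize("It's well-known!"))
# It ' s well - known !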

Example 9: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Author: husseinmozannar, Project: SOQAL, Lines of code: 12, Source file: fasttext_embedding.py

Example 10: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem) 
Author: husseinmozannar, Project: SOQAL, Lines of code: 12, Source file: tfidf_reader.py

Example 11: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"' 
Author: husseinmozannar, Project: SOQAL, Lines of code: 6, Source file: slidingwindow_distance.py

Example 12: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None, stop_words=self.stopwords)
        if tfidf_matrix is None or vectorizer is None:
            docs_stemmed = self.docs_stem()
            self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Author: husseinmozannar, Project: SOQAL, Lines of code: 15, Source file: TfidfRetriever.py

Example 13: __get_words_boundaries

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __get_words_boundaries(self):
        """
        function to tokenize the words in the document and return the word
        boundaries of each sentence using a tokenizer.
        :return:
        """
        tokenizer = WordPunctTokenizer()
        words = list(tokenizer.span_tokenize(self.text))
        return words 
Author: hadyelsahar, Project: RE-NLG-Dataset, Lines of code: 11, Source file: pipeline.py
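
As a standalone illustration (independent of the pipeline class above), span_tokenize yields (start, end) character offsets into the original string rather than the tokens themselves:

from nltk.tokenize import WordPunctTokenizer

text = "Hello, world"
print(list(WordPunctTokenizer().span_tokenize(text)))
# [(0, 5), (5, 6), (7, 12)]  ->  'Hello', ',', 'world'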

Example 14: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):

        self._sent_analyzer = SIA()
        self._word_tokenizer = WordPunctTokenizer().tokenize
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle'
        ).tokenize
        self._ids = [] 
Author: carlomazzaferro, Project: kryptoflow, Lines of code: 10, Source file: sent_analysis.py

Example 15: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer()
        }

        self.tokenizer = TweetTokenizer() 
Author: spinningbytes, Project: deep-mlsa, Lines of code: 12, Source file: parse_utils.py
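
A minimal sketch of how such a per-language mapping is typically consumed; the get() fallback to the 'default' entry is an assumption, not something shown in the snippet above:

from nltk.tokenize import TweetTokenizer, WordPunctTokenizer

tokenizers = {
    'en': TweetTokenizer(),
    'default': WordPunctTokenizer(),
}

lang = 'fr'  # not in the mapping, so fall back to 'default'
tokenizer = tokenizers.get(lang, tokenizers['default'])
print(tokenizer.tokenize("Bonjour, tout le monde!"))
# ['Bonjour', ',', 'tout', 'le', 'monde', '!']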


Note: The nltk.tokenize.WordPunctTokenizer method examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For distribution and use, please refer to the corresponding project's license; do not reproduce without permission.