

Python tokenize.WordPunctTokenizer Code Examples

This article collects typical usage examples of WordPunctTokenizer from Python's nltk.tokenize module. If you are wondering what WordPunctTokenizer does or how to use it, the curated code examples below should help. You can also explore further usage examples from the nltk.tokenize module.


The following presents 15 code examples of tokenize.WordPunctTokenizer, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
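As a quick orientation before the examples, here is a minimal standalone sketch of WordPunctTokenizer's behaviour (the sample sentence is only illustrative): it splits text on the regular expression \w+|[^\w\s]+, so punctuation comes out as separate tokens.

from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()
# Split the text into alphanumeric runs and punctuation runs.
print(tokenizer.tokenize("Good muffins cost $3.88 in New York."))
# ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.']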

Example 1: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: rafasashi, Project: razzy-spinner, Lines: 26, Source: xmldocs.py

Example 2: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=PKL_PATTERN, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids)

        self._word_tokenizer = WordPunctTokenizer()
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle') 
Author: foxbook, Project: atap, Lines: 19, Source: reader.py

Example 3: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, root, fileids=DOC_PATTERN,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 encoding='latin-1', **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs.keys()):
            kwargs['cat_pattern'] = CAT_PATTERN

        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._tags = TAGS 
Author: foxbook, Project: atap, Lines: 23, Source: reader.py
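For context, a reader like this usually exposes its stored tokenizers through iteration methods. The sketch below is an assumption about how that typically looks, not part of the snippet above; in particular, the docs() method yielding raw document text is hypothetical.

    # Hypothetical companion methods (docs() is assumed to yield raw document text):
    def sents(self, fileids=None, categories=None):
        for doc in self.docs(fileids, categories):
            for sent in self._sent_tokenizer.tokenize(doc):
                yield sent

    def words(self, fileids=None, categories=None):
        for sent in self.sents(fileids, categories):
            for token in self._word_tokenizer.tokenize(sent):
                yield token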

Example 4: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: blackye, Project: luscan-devel, Lines: 23, Source: xmldocs.py

Example 5: __getitem__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __getitem__(self, idx):
        idx, start, end = self.lst[idx]
        dialog = self.raw[idx][start:end]
        source, target = dialog[:-1], dialog[-1]

        spks, utts = list(zip(*[
            (speaker, WordPunctTokenizer().tokenize(uttr))
            for speaker, uttr, _ in source
        ]))

        spks = list(spks)

        while len(spks) < 10:
            spks.append(0)

        source = '|||'.join([' '.join(uttr) for uttr in utts])
        target_test = ' '.join(WordPunctTokenizer().tokenize(target[1]))

        return spks, source, target_test, target[0] 
Author: asyml, Project: texar, Lines: 18, Source: sw_loader.py

Example 6: getredundantComponents

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def getredundantComponents(sentences):
    window_size = 4
    introList = []
    midlist = []
    endlist = []

    for sent in sentences:
        words = WordPunctTokenizer().tokenize(sent)
        length_sent = len(words)

        f_point = length_sent // 3
        m_point = length_sent // 2
        index_span = window_size // 2
        intro = ' '.join(word for word in words[0:window_size])
        mid = ' '.join(word for word in words[m_point - index_span:m_point + index_span])
        end = ' '.join(word for word in words[-window_size:])
        introList.append(intro)
        midlist.append(mid)
        endlist.append(end)
    return introList, midlist, endlist
Author: StevenLOL, Project: AbTextSumm, Lines: 22, Source: WGGraph.py
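A short usage sketch, assuming the function is called with a plain list of sentence strings (the sentences are made up for illustration):

sentences = [
    "Automatic summarization selects the most informative sentences from a document.",
    "Graph based methods rank sentences by their mutual similarity.",
]
intros, mids, ends = getredundantComponents(sentences)
# One window of words per sentence: the first 4 words, ~4 words around
# the middle, and the last 4 words.
print(intros[0])  # 'Automatic summarization selects the'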

Example 7: words

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        iterator = elt.getiterator()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out 
Author: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines: 26, Source: xmldocs.py

Example 8: data_tockenize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def data_tockenize(text):
  tokenizer = WordPunctTokenizer()
  tokens = tokenizer.tokenize(text)
  return (" ".join(tokens)).strip() 
Author: lambdal, Project: lambda-deep-learning-demo, Lines: 6, Source: preprocess_aclImdb_v1.py
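A quick illustration of the normalization this helper performs (the input string is illustrative):

print(data_tockenize("This movie isn't bad...really!"))
# "This movie isn ' t bad ... really !"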

Example 9: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, model_path):
        self.model_path = model_path
        print("loading fastText model ...")
        #self.model = pickle.load(open(self.model_path,"rb"))
        self.model = KeyedVectors.load_word2vec_format(self.model_path, encoding='utf-8', unicode_errors='ignore')
        print("done fastText loading model")
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"'
        self.vocab = self.model.vocab 
Author: husseinmozannar, Project: SOQAL, Lines: 12, Source: fasttext_embedding.py

Example 10: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, P):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = self.get_answer_canditates(P)
        docs_stem = []
        for doc in self.docs:
            docs_stem.append(self.stem_string(doc))
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, 4), norm=None)  # , stop_words=self.stopwords)
        self.tfidf_matrix = self.vectorizer.fit_transform(docs_stem) 
Author: husseinmozannar, Project: SOQAL, Lines: 12, Source: tfidf_reader.py

Example 11: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.SYMBOLS = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\"' 
Author: husseinmozannar, Project: SOQAL, Lines: 6, Source: slidingwindow_distance.py

Example 12: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self, docs, k, ngrams, vectorizer=None, tfidf_matrix=None):
        self.k = k  # number of documents to return
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = ARLSTem()
        self.docs = docs
        self.stopwords = stopwords.words('arabic')
        self.vectorizer = TfidfVectorizer(ngram_range=(1, ngrams), norm=None, stop_words=self.stopwords)
        if tfidf_matrix is None or vectorizer is None:
            docs_stemmed = self.docs_stem()
            self.tfidf_matrix = self.vectorizer.fit_transform(docs_stemmed)
        else:
            self.vectorizer = vectorizer
            self.tfidf_matrix = tfidf_matrix 
Author: husseinmozannar, Project: SOQAL, Lines: 15, Source: TfidfRetriever.py

Example 13: __get_words_boundaries

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __get_words_boundaries(self):
        """
        Tokenize the document text and return the character-offset
        boundaries of each word, as computed by the tokenizer.
        :return: list of (start, end) spans
        """
        tokenizer = WordPunctTokenizer()
        words = list(tokenizer.span_tokenize(self.text))
        return words 
Author: hadyelsahar, Project: RE-NLG-Dataset, Lines: 11, Source: pipeline.py
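For reference, span_tokenize returns (start, end) character offsets rather than the tokens themselves; a minimal standalone example:

from nltk.tokenize import WordPunctTokenizer

text = "Hello, world!"
spans = list(WordPunctTokenizer().span_tokenize(text))
print(spans)                           # [(0, 5), (5, 6), (7, 12), (12, 13)]
print([text[s:e] for s, e in spans])   # ['Hello', ',', 'world', '!']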

Example 14: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):

        self._sent_analyzer = SIA()
        self._word_tokenizer = WordPunctTokenizer().tokenize
        self._sent_tokenizer = nltk.data.LazyLoader(
            'tokenizers/punkt/english.pickle'
        ).tokenize
        self._ids = [] 
Author: carlomazzaferro, Project: kryptoflow, Lines: 10, Source: sent_analysis.py

Example 15: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import WordPunctTokenizer [as alias]
def __init__(self):
        self.tokenizers = {
            'en': TweetTokenizer(),
            'de': WordPunctTokenizer(),
            'it': WordPunctTokenizer(),
            'fr': WordPunctTokenizer(),
            'default': WordPunctTokenizer()
        }

        self.tokenizer = TweetTokenizer() 
Author: spinningbytes, Project: deep-mlsa, Lines: 12, Source: parse_utils.py
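A hedged sketch of how such a per-language mapping is typically consumed: look up the tokenizer by language code and fall back to the 'default' entry. The tokenize method below is an assumption for illustration, not part of the original class:

    def tokenize(self, text, lang='default'):
        # Hypothetical helper: pick the language-specific tokenizer,
        # falling back to the shared WordPunctTokenizer.
        tokenizer = self.tokenizers.get(lang, self.tokenizers['default'])
        return tokenizer.tokenize(text)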


Note: The nltk.tokenize.WordPunctTokenizer examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code, and do not reproduce this article without permission.