

Python tokenize.RegexpTokenizer Method Code Examples

This article collects typical usage examples of the nltk.tokenize.RegexpTokenizer method in Python. If you are wondering what tokenize.RegexpTokenizer does, how to use it, or what it looks like in real code, the curated examples below may help. You can also explore further usage examples from the nltk.tokenize module it belongs to.


The following presents 15 code examples of the tokenize.RegexpTokenizer method, sorted by popularity by default.
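Before the examples, here is a minimal sketch (not taken from any of the projects below; the sample strings are purely illustrative) showing the two ways RegexpTokenizer is typically constructed in them: with a pattern that matches the tokens to keep, or with gaps=True so the pattern matches the separators instead.

from nltk.tokenize import RegexpTokenizer

# Pattern matches the tokens themselves: keep runs of word characters, drop punctuation.
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's', '2024']

# Pattern matches the gaps: split on newlines, as in the corpus-reader examples below.
line_tokenizer = RegexpTokenizer(r'\n', gaps=True)
print(line_tokenizer.tokenize("first line\nsecond line"))
# ['first line', 'second line']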

Example 1: clean_text

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def clean_text(df, wrong_words_dict, autocorrect=True):
    df.fillna("__NA__", inplace=True)
    tokenizer = RegexpTokenizer(r'\w+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokenizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result 
Developer ID: Donskov7, Project: toxic_comments, Lines of code: 18, Source file: preprocessing.py

Example 2: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 22, Source file: aligned.py

Example 3: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding=None):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt')

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Developer ID: blackye, Project: luscan-devel, Lines of code: 22, Source file: aligned.py

Example 4: get_ngram_features_from_map

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features 
Developer ID: MirunaPislar, Project: Sarcasm-Detection, Lines of code: 19, Source file: extract_baseline_features.py

Example 5: vocab_index_descriptions

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def vocab_index_descriptions(vocab_file, vectors_file):
    #load lookups
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i,line in enumerate(vocabfile):
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}
    w2ind = {w:i for i,w in ind2w.items()}
    desc_dict = datasets.load_code_descriptions()
        
    tokenizer = RegexpTokenizer(r'\w+')

    with open(vectors_file, 'w') as of:
        w = csv.writer(of, delimiter=' ')
        w.writerow(["CODE", "VECTOR"])
        for code, desc in tqdm(desc_dict.items()):
            #same preprocessing steps as in get_discharge_summaries
            tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
            inds = [w2ind[t] if t in w2ind.keys() else len(w2ind)+1 for t in tokens]
            w.writerow([code] + [str(i) for i in inds]) 
Developer ID: jamesmullenbach, Project: caml-mimic, Lines of code: 24, Source file: vocab_index_descriptions.py

Example 6: process

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
Developer ID: PacktPublishing, Project: Artificial-Intelligence-with-Python, Lines of code: 22, Source file: topic_modeler.py

Example 7: transform

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def transform(self, texts, y=None):
        tokenizer = RegexpTokenizer(r'[a-z]+|\d+')

        tokenized_texts = []
        stoplist = []

        if self.ignore_stopwords:
            stoplist = stopwords.words('english')

        for text in texts:
            tokenized_text = []
            for word in tokenizer.tokenize(text.lower()):
                if word not in stoplist:
                    tokenized_text.append(word.strip())

            tokenized_texts.append(tokenized_text)
        return tokenized_texts 
Developer ID: itdxer, Project: neupy, Lines of code: 19, Source file: preprocessing.py

Example 8: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Developer ID: V1EngineeringInc, Project: V1EngineeringInc-Docs, Lines of code: 27, Source file: aligned.py

Example 9: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe. 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 9, Source file: ycoe.py

Example 10: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
Developer ID: rafasashi, Project: razzy-spinner, Lines of code: 36, Source file: rte_classify.py

Example 11: get_tokens

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text()) 
Developer ID: 9b, Project: chirp, Lines of code: 7, Source file: helpers.py

Example 12: load_captions

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r") as f:
                captions = f.read().decode('utf8').split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s less than %d'
                          % (filenames[i], cnt))
        return all_captions 
Developer ID: MinfengZhu, Project: DM-GAN, Lines of code: 35, Source file: datasets.py

Example 13: __tokenize

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __tokenize(self, docs):
		output = []
		for doc in docs:
			# keep only tokens that are at least five word characters long
			tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
			output.append(tokenizer.tokenize(doc.lower()))
		return output 
Developer ID: skashyap7, Project: TBBTCorpus, Lines of code: 8, Source file: topic_extractor.py

Example 14: cleanDocuments

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def cleanDocuments(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [i for i in words if not i in en_stop]
            self.cleaned.append(non_stopped_words)
        print("INFO: Clearning {} documents completed".format(len(self.documents))) 
Developer ID: PacktPublishing, Project: Natural-Language-Processing-with-Python-Cookbook, Lines of code: 12, Source file: IdentifyingTopic.py

Example 15: __init__

# Required import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
Developer ID: PacktPublishing, Project: Python-Machine-Learning-Cookbook-Second-Edition, Lines of code: 13, Source file: topic_modeling.py


Note: The nltk.tokenize.RegexpTokenizer examples in this article were compiled by 純淨天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets are selected from open-source projects contributed by their respective developers, and copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or reusing the code. Do not reproduce this article without permission.