

Python tokenize.RegexpTokenizer Code Examples

This page collects typical code examples showing how the nltk.tokenize.RegexpTokenizer class is used in Python. If you are wondering what RegexpTokenizer does, how to call it, or what real-world usage looks like, the curated snippets below may help. You can also explore further usage examples from the containing module, nltk.tokenize.


Fifteen code examples of tokenize.RegexpTokenizer are shown below, sorted by popularity by default.
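
Before diving into the project snippets, here is a minimal orientation sketch (not taken from any of the projects below): the pattern passed to RegexpTokenizer either describes the tokens themselves or, with gaps=True, the separators between them.

from nltk.tokenize import RegexpTokenizer

# Pattern matches the tokens: keep runs of word characters
word_tokenizer = RegexpTokenizer(r'\w+')
print(word_tokenizer.tokenize("Hello, world! It's 2024."))
# ['Hello', 'world', 'It', 's', '2024']

# Pattern matches the gaps: split on whitespace instead
gap_tokenizer = RegexpTokenizer(r'\s+', gaps=True)
print(gap_tokenizer.tokenize("one  two three"))
# ['one', 'two', 'three']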

Example 1: clean_text

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def clean_text(df, wrong_words_dict, autocorrect=True):
    df.fillna("__NA__", inplace=True)
    tokenizer = RegexpTokenizer(r'\w+')
    regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]
    texts = df.tolist()
    result = []
    for text in tqdm(texts):
        tokens = tokenizer.tokenize(text.lower())
        tokens = [split_text_and_digits(token, regexps) for token in tokens]
        tokens = [substitute_repeats(token, 3) for token in tokens]
        text = ' '.join(tokens)
        if autocorrect:
            for wrong, right in wrong_words_dict.items():
                text = text.replace(wrong, right)
        result.append(text)
    return result 
Author: Donskov7 | Project: toxic_comments | Lines: 18 | Source: preprocessing.py
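
The helpers split_text_and_digits and substitute_repeats are defined elsewhere in the same project and are not shown here. A minimal sketch of what the digit-splitting step presumably does with the two compiled regexps (an assumption, not the project's actual implementation):

import re

regexps = [re.compile("([a-zA-Z]+)([0-9]+)"), re.compile("([0-9]+)([a-zA-Z]+)")]

def split_text_and_digits(token, regexps):
    # Hypothetical: insert spaces at letter/digit boundaries, e.g. "year2018" -> "year 2018"
    for regexp in regexps:
        token = regexp.sub(r'\1 \2', token)
    return token

print(split_text_and_digits("year2018", regexps))  # 'year 2018'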

Example 2: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding='latin1'):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Author: rafasashi | Project: razzy-spinner | Lines: 22 | Source: aligned.py
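
The default sent_tokenizer above relies on gaps=True, so the newline pattern marks separators rather than tokens and each line of an aligned-corpus block becomes one "sentence". A small illustration (not part of the NLTK source):

from nltk.tokenize import RegexpTokenizer

sent_tokenizer = RegexpTokenizer('\n', gaps=True)
print(sent_tokenizer.tokenize("first sentence\nsecond sentence\n"))
# ['first sentence', 'second sentence']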

Example 3: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 alignedsent_block_reader=read_alignedsent_block,
                 encoding=None):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt')

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Author: blackye | Project: luscan-devel | Lines: 22 | Source: aligned.py

Example 4: get_ngram_features_from_map

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_ngram_features_from_map(tweets, ngram_map, n):
    regexp_tknzr = RegexpTokenizer(r'\w+')
    tweet_tknzr = TweetTokenizer()
    features = []
    for tweet in tweets:
        feature_list = np.zeros(len(ngram_map))
        tweet = tweet.lower()
        ngram_list = get_ngram_list(tweet_tknzr, tweet, 1)
        if n > 1:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 2)
        if n > 2:
            ngram_list += get_ngram_list(regexp_tknzr, tweet, 3)
        for gram in ngram_list:
            if gram in ngram_map:
                feature_list[ngram_map[gram]] += 1.0
        features.append(feature_list)
    return features 
Author: MirunaPislar | Project: Sarcasm-Detection | Lines: 19 | Source: extract_baseline_features.py
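
get_ngram_list is defined elsewhere in the same project. A hypothetical stand-in, assuming it joins each n-gram into a single space-separated key compatible with ngram_map, might look like this:

from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams

def get_ngram_list(tokenizer, text, n):
    # Assumption: tokenize, then join each n-gram tuple into one string key
    tokens = tokenizer.tokenize(text)
    return [' '.join(gram) for gram in ngrams(tokens, n)]

regexp_tknzr = RegexpTokenizer(r'\w+')
print(get_ngram_list(regexp_tknzr, "the cat sat on the mat", 2))
# ['the cat', 'cat sat', 'sat on', 'on the', 'the mat']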

Example 5: vocab_index_descriptions

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def vocab_index_descriptions(vocab_file, vectors_file):
    #load lookups
    vocab = set()
    with open(vocab_file, 'r') as vocabfile:
        for i,line in enumerate(vocabfile):
            line = line.strip()
            if line != '':
                vocab.add(line)
    ind2w = {i+1:w for i,w in enumerate(sorted(vocab))}
    w2ind = {w:i for i,w in ind2w.items()}
    desc_dict = datasets.load_code_descriptions()
        
    tokenizer = RegexpTokenizer(r'\w+')

    with open(vectors_file, 'w') as of:
        w = csv.writer(of, delimiter=' ')
        w.writerow(["CODE", "VECTOR"])
        for code, desc in tqdm(desc_dict.items()):
            #same preprocessing steps as in get_discharge_summaries
            tokens = [t.lower() for t in tokenizer.tokenize(desc) if not t.isnumeric()]
            inds = [w2ind[t] if t in w2ind else len(w2ind)+1 for t in tokens]
            w.writerow([code] + [str(i) for i in inds]) 
Author: jamesmullenbach | Project: caml-mimic | Lines: 24 | Source: vocab_index_descriptions.py
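
A minimal sketch of the index lookup used above: known words map to their 1-based position in the sorted vocabulary, and unknown words fall back to len(w2ind) + 1 as a shared out-of-vocabulary index.

vocab = {"fracture", "of", "femur"}
ind2w = {i + 1: w for i, w in enumerate(sorted(vocab))}
w2ind = {w: i for i, w in ind2w.items()}

tokens = ["fracture", "of", "tibia"]
inds = [w2ind[t] if t in w2ind else len(w2ind) + 1 for t in tokens]
print(inds)  # [2, 3, 4] -- 'tibia' maps to the OOV index 4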

Example 6: process

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if x not in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed 
Author: PacktPublishing | Project: Artificial-Intelligence-with-Python | Lines: 22 | Source: topic_modeler.py
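
A hedged usage sketch for process() (it assumes the NLTK stopwords corpus has already been downloaded, e.g. via nltk.download('stopwords')):

print(process("Dogs are running faster than the cats were"))
# roughly: ['dog', 'run', 'faster', 'cat']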

Example 7: transform

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def transform(self, texts, y=None):
        tokenizer = RegexpTokenizer(r'[a-z]+|\d+')

        tokenized_texts = []
        stoplist = []

        if self.ignore_stopwords:
            stoplist = stopwords.words('english')

        for text in texts:
            tokenized_text = []
            for word in tokenizer.tokenize(text.lower()):
                if word not in stoplist:
                    tokenized_text.append(word.strip())

            tokenized_texts.append(tokenized_text)
        return tokenized_texts 
Author: itdxer | Project: neupy | Lines: 19 | Source: preprocessing.py
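
The pattern r'[a-z]+|\d+' keeps lowercase alphabetic runs and digit runs as separate tokens; because the text is lowercased first, only punctuation is dropped. An illustration (not from the project):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'[a-z]+|\d+')
print(tokenizer.tokenize("python 3 rocks!".lower()))
# ['python', '3', 'rocks']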

Example 8: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader 
Author: V1EngineeringInc | Project: V1EngineeringInc-Docs | Lines: 27 | Source: aligned.py

Example 9: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, root, items, encoding='utf8'):
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)

#: A list of all documents and their titles in ycoe. 
Author: rafasashi | Project: razzy-spinner | Lines: 9 | Source: ycoe.py

Example 10: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations like U.S. and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words 
Author: rafasashi | Project: razzy-spinner | Lines: 36 | Source: rte_classify.py

Example 11: get_tokens

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def get_tokens(text):
    """Tokenize the input text."""
    soup = BeautifulSoup(text, "html.parser")
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(soup.get_text()) 
Author: 9b | Project: chirp | Lines: 7 | Source: helpers.py
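
A hedged usage sketch (requires beautifulsoup4; the HTML tags are stripped before tokenizing):

html = "<p>Hello <b>world</b>, 42 times!</p>"
print(get_tokens(html))  # ['Hello', 'world', '42', 'times']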

Example 12: load_captions

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def load_captions(self, data_dir, filenames):
        all_captions = []
        for i in range(len(filenames)):
            cap_path = '%s/text/%s.txt' % (data_dir, filenames[i])
            with open(cap_path, "r", encoding='utf8') as f:
                captions = f.read().split('\n')
                cnt = 0
                for cap in captions:
                    if len(cap) == 0:
                        continue
                    cap = cap.replace("\ufffd\ufffd", " ")
                    # picks out sequences of alphanumeric characters as tokens
                    # and drops everything else
                    tokenizer = RegexpTokenizer(r'\w+')
                    tokens = tokenizer.tokenize(cap.lower())
                    # print('tokens', tokens)
                    if len(tokens) == 0:
                        print('cap', cap)
                        continue

                    tokens_new = []
                    for t in tokens:
                        t = t.encode('ascii', 'ignore').decode('ascii')
                        if len(t) > 0:
                            tokens_new.append(t)
                    all_captions.append(tokens_new)
                    cnt += 1
                    if cnt == self.embeddings_num:
                        break
                if cnt < self.embeddings_num:
                    print('ERROR: the captions for %s are fewer than %d'
                          % (filenames[i], cnt))
        return all_captions 
Author: MinfengZhu | Project: DM-GAN | Lines: 35 | Source: datasets.py
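
The encode/decode round trip above is what silently drops non-ASCII characters from each token, for example:

token = "café"
print(token.encode('ascii', 'ignore').decode('ascii'))  # 'caf'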

Example 13: __tokenize

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __tokenize(self, docs):
		# Build the tokenizer once and reuse it for every document
		tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
		output = []
		for doc in docs:
			output.append(tokenizer.tokenize(doc.lower()))
		return output 
Author: skashyap7 | Project: TBBTCorpus | Lines: 8 | Source: topic_extractor.py
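
The pattern \w\w\w\w\w+ matches only runs of five or more word characters, so short function words simply disappear during tokenization. An illustration (not from the project):

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
print(tokenizer.tokenize("the theory of strings is hard"))
# ['theory', 'strings']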

Example 14: cleanDocuments

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def cleanDocuments(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [i for i in words if i not in en_stop]
            self.cleaned.append(non_stopped_words)
        print("INFO: Clearning {} documents completed".format(len(self.documents))) 
Author: PacktPublishing | Project: Natural-Language-Processing-with-Python-Cookbook | Lines: 12 | Source: IdentifyingTopic.py

Example 15: __init__

# Required module import: from nltk import tokenize [as alias]
# Or: from nltk.tokenize import RegexpTokenizer [as alias]
def __init__(self):
        # Create a regular expression tokenizer
        self.tokenizer = RegexpTokenizer(r'\w+')

        # get the list of stop words 
        self.stop_words_english = stopwords.words('english')

        # Create a Snowball stemmer 
        self.stemmer = SnowballStemmer('english')
        
    # Tokenizing, stop word removal, and stemming 
Author: PacktPublishing | Project: Python-Machine-Learning-Cookbook-Second-Edition | Lines: 13 | Source: topic_modeling.py


Note: The nltk.tokenize.RegexpTokenizer examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects contributed by various developers; copyright remains with the original authors, and distribution or use should follow the license of the corresponding project. Do not reproduce without permission.