

Python RegexpTokenizer.tokenize Method Code Examples

This article collects typical usage examples of the Python method nltk.tokenize.RegexpTokenizer.tokenize. If you are unsure what RegexpTokenizer.tokenize does or how to call it, the curated code examples below should help. You can also explore further usage of the containing class, nltk.tokenize.RegexpTokenizer.


The sections below present 15 code examples of the RegexpTokenizer.tokenize method, sorted by popularity by default. You can upvote the examples you find useful; your feedback helps surface better Python code examples.
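
Every example below follows the same two-step pattern: construct a RegexpTokenizer from a regular expression that describes what counts as a token, then call tokenize() on a string to get the list of matching substrings. As a quick orientation, here is a minimal, self-contained sketch of that pattern; the r'\w+' pattern and the sample sentence are illustrative choices, not code taken from any particular example below.

from nltk.tokenize import RegexpTokenizer

# The regex defines the tokens themselves: here, maximal runs of word
# characters, so punctuation and whitespace are simply dropped.
tokenizer = RegexpTokenizer(r'\w+')

print(tokenizer.tokenize("Mr. O'Neill spent $23.00 in the U.S."))
# ['Mr', 'O', 'Neill', 'spent', '23', '00', 'in', 'the', 'U', 'S']

Several examples below instead pass richer patterns, such as ([A-Z]\.)+|\w+|\$[\d\.]+, so that abbreviations and monetary amounts survive as single tokens.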

Example 1: __init__

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
 def __init__(self, rtepair, stop=True, lemmatize=False):
     """
     @param rtepair: a L{RTEPair} from which features should be extracted
     @param stop: if C{True}, stopwords are thrown away.
     @type stop: C{bool}
     """
     self.stop = stop
     self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                           'have', 'is', 'are', 'were', 'and', 'very', '.',','])
     
     self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
     # Try to tokenize so that abbreviations like "U.S." and monetary amounts
     # like "$23.00" are kept as tokens.
     from nltk.tokenize import RegexpTokenizer
     tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')
     
     #Get the set of word types for text and hypothesis
     self.text_tokens = tokenizer.tokenize(rtepair.text)
     self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
     self.text_words = set(self.text_tokens)
     self.hyp_words = set(self.hyp_tokens)
     
     if lemmatize:
         self.text_words = set([lemmatize(token) for token in self.text_tokens])
         self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])
     
     if self.stop:
         self.text_words = self.text_words - self.stopwords
         self.hyp_words = self.hyp_words - self.stopwords
         
     self._overlap = self.hyp_words & self.text_words
     self._hyp_extra = self.hyp_words - self.text_words
     self._txt_extra = self.text_words - self.hyp_words
Developer: LowResourceLanguages, Project: hltdi-l3, Lines of code: 35, Source file: rte_classify.py

Example 2: demo

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def demo():
#    from nltk.corpus import brown
#    from nltk.probability import LidstoneProbDist, WittenBellProbDist
#    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer("[\w']+")
    lm = NgramcModel(5)
    print lm
    
    sent = "Like a bridge over troubled water, I will lay it down."
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)
    
    sent = "over twenty year and he"
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)
    
    sent = "over twenty years and he"
    print sent
    words = tokenizer.tokenize(sent)
    print "Entropy: ", lm.entropy(words)    

    print lm.getBetter(["men" ,"are" ,"imporant" ,"for" ,"the"], ["men" ,"are" ,"important" ,"for" ,"the"])
Developer: tonyqtian, Project: sentence_checker, Lines of code: 28, Source file: ngramc.py

Example 3: parse_questions

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
    def parse_questions(self):
        stemmer = PorterStemmer()
        tokenizer = RegexpTokenizer(r'\w+')
        for questions_key in self.rawSamples:
            # Stem the Question Text
            question_text = self.rawSamples[questions_key][0]
            words_array = tokenizer.tokenize(question_text)
            question_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                word = stemmer.stem(word)
                question_text += (word + " ")
            self.rawSamples[questions_key][0] = question_text

            # Stem the topic names
            topics_text = self.rawSamples[questions_key][2]
            words_array = tokenizer.tokenize(topics_text)
            topics_text = ""
            for word in words_array:
                if word.isnumeric():
                    continue
                if word not in text.ENGLISH_STOP_WORDS:
                    word = stemmer.stem(word)
                word = stemmer.stem(word)
                topics_text += (word + " ")
            self.rawSamples[questions_key][2] = topics_text
Developer: suket22, Project: CS246, Lines of code: 31, Source file: LoadData.py

Example 4: StringSpellchecksFinder

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
class StringSpellchecksFinder(object):
    """
    Compares two strings, finding words that have been replaced by a corrected spelling
    """
    def __init__(self, similarity=0.7):
        self.tokenizer = RegexpTokenizer('[\w-]+')
        self.similarity = similarity

    def find(self, text_before, text_after):
        """
        Finds all spellchecks tuple(mistake, correction) in the given text
        """
        spellchecks = []
        text_before_tokens = map(lambda x: x.lower(), self.tokenizer.tokenize(text_before))
        text_after_tokens = map(lambda x: x.lower(), self.tokenizer.tokenize(text_after))
        diff_matching = SequenceMatcher(None, text_before_tokens, text_after_tokens)
        for difference in filter(lambda x: x[0] == 'replace', diff_matching.get_opcodes()):
            sequence_before = text_before_tokens[difference[1]:difference[2]]
            sequence_after = text_after_tokens[difference[3]:difference[4]]
            spellchecks += self.find_best_match(sequence_before, sequence_after)
        return spellchecks

    def find_best_match(self, sequence_before, sequence_after):
        """
        Finds the best matching of elements pairs that are most probable pairs
        """
        pairs = []
        possibilities = map(lambda element1: map(lambda element2: (element1, element2, SequenceMatcher(None, element1, element2).ratio()) , sequence_after) , sequence_before)
        for possibility in possibilities:
            possibility = filter(lambda p: p[2] >= self.similarity, possibility)
            if possibility:
                possibility.sort(key=lambda p: p[2], reverse=True)
                pairs.append((possibility[0][0], possibility[0][1]))
        return pairs
Developer: agh-glk, Project: spelldiffer, Lines of code: 36, Source file: spelldiffer.py

Example 5: getData

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def getData():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i+1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1+Smatrix2)])
        trainInput.append([np.transpose(Smatrix2+Smatrix1)])

    f.close()

    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0,1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])

    f.close()
    return trainInput, trainClass, testInput, testClass
Developer: metzzo, Project: Paraphrase_Identification, Lines of code: 37, Source file: convolutionalSolution.py

Example 6: __init__

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        @param rtepair: a L{RTEPair} from which features should be extracted
        @param stop: if C{True}, stopwords are thrown away.
        @type stop: C{bool}
        """
        self.stop = stop
        self.stopwords = set(
            ["a", "the", "it", "they", "of", "in", "to", "have", "is", "are", "were", "and", "very", ".", ","]
        )

        self.negwords = set(["no", "not", "never", "failed" "rejected", "denied"])
        # Try to tokenize so that abbreviations like U.S.and monetary amounts
        # like "$23.00" are kept as tokens.
        from nltk.tokenize import RegexpTokenizer

        tokenizer = RegexpTokenizer("([A-Z]\.)+|\w+|\$[\d\.]+")

        # Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set([lemmatize(token) for token in self.text_tokens])
            self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Developer: altaha, Project: ArgoJsonRDBMS, Lines of code: 37, Source file: rte_classify.py

Example 7: categorize_input_query

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
    def categorize_input_query(self,input_query):
        query_category=OrderedDict([])

        input_query=self.replace_punctuation_in_query_string(input_query)
        phrasal_not_tokenizer = RegexpTokenizer(r'![\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
        word_not_tokenizer = RegexpTokenizer(r'!(\w+[-]*(\w)*)')

        not_queries_set=set(word_not_tokenizer.tokenize(input_query))
        not_queries_set=not_queries_set.union(set(phrasal_not_tokenizer.tokenize(input_query)))
        string_copy=input_query
        string_copy = re.sub(r"\".*?\"", "", string_copy)
        string_copy = re.sub(r"!.*?(\s|$)", "", string_copy)

        modified_not_words=[]
        for words in not_queries_set:
            #removing the not words
            modified_not_words.append(words[1:])
        phrase_tokenizer = RegexpTokenizer(r'[\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
        phrase_queries_set=set(phrase_tokenizer.tokenize(input_query))

        phrase_queries_set=phrase_queries_set.difference(set(modified_not_words))
        query_category["PHRASE"]=phrase_queries_set
        query_category["NOT"]=modified_not_words
        normal_words=string_copy.split()
        normal_word_set=set(normal_words )
        query_category["WORD"]=normal_word_set
        return query_category
Developer: rogersjeffreyl, Project: InvertedIndex, Lines of code: 29, Source file: query_processor.py

Example 8: __init__

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
    def __init__(self, rtepair, stop=True, lemmatize=False):
        """
        :param rtepair: a ``RTEPair`` from which features should be extracted
        :param stop: if ``True``, stopwords are thrown away.
        :type stop: bool
        """
        self.stop = stop
        self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                              'have', 'are', 'were', 'and', 'very', '.', ','])

        self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                             'denied'])
        # Try to tokenize so that abbreviations, monetary amounts, email
        # addresses, URLs are single tokens.
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer('([\w.@:/])+|\w+|\$[\d.]+')

        #Get the set of word types for text and hypothesis
        self.text_tokens = tokenizer.tokenize(rtepair.text)
        self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
        self.text_words = set(self.text_tokens)
        self.hyp_words = set(self.hyp_tokens)

        if lemmatize:
            self.text_words = set(lemmatize(token) for token in self.text_tokens)
            self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)

        if self.stop:
            self.text_words = self.text_words - self.stopwords
            self.hyp_words = self.hyp_words - self.stopwords

        self._overlap = self.hyp_words & self.text_words
        self._hyp_extra = self.hyp_words - self.text_words
        self._txt_extra = self.text_words - self.hyp_words
Developer: esabelhaus, Project: secret-octo-dubstep, Lines of code: 36, Source file: rte_classify.py

Example 9: __init__

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
class HashtagMatch:

    def __init__(self, name_matcher):
        from nltk.tokenize import RegexpTokenizer
        self._name_matcher = name_matcher
        self._hashtag_extract = RegexpTokenizer('(#[A-Za-z][A-Za-z0-9-_]+)')
        self._at_extract = RegexpTokenizer('(@[A-Za-z][A-Za-z0-9-_]+)')

    def extract_hashtag(self, text):
        return self._hashtag_extract.tokenize(text)

    def extract_at(self, text):
        return self._at_extract.tokenize(text)

    def match(self, text):
        segs = [' '.join(seg) for seg in self.segment(text[1:])]
        entities = map(self._name_matcher.exact_match, segs)
        return [e for e in entities if e]

    def segment(self, text):
        n = len(text) - 1
        count = 2 ** n
        sequences = map(lambda x: bin(x)[2:].zfill(n), range(count))
        segmentations = []
        for s in sequences:
            segmentation = []
            begin = 0
            for i in range(n):
                end = i + 1
                if s[i] == '1':
                    segmentation.append(''.join(text[begin:end]))
                    begin = end
            segmentation.append(''.join(text[begin:end + 1]))
            segmentations.append(segmentation)
        return segmentations
Developer: gsi-upm, Project: sematch, Lines of code: 37, Source file: nlp.py

Example 10: get_outbreak_countries

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
 def get_outbreak_countries(disease=all):
     tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
     
     countries = []
     
     if disease == all:
         for location in Location.objects.all():
             country = tokenizer.tokenize(location.name)
             country = country[len(country)-1]
             
             if country not in countries:
                 countries.append(str(country))
     else:
         for tweet in Tweet.objects.filter(disease_type__contains=disease):
             if tweet.location:
                 country = tokenizer.tokenize(tweet.location.name)
                 country = country[len(country)-1]
                 country_disease_count = [str(country), \
                 len(Tweet.objects.filter(disease_type__contains=disease, \
                 location_string__contains=country)), disease]
                 
                 if country_disease_count not in countries:
                     countries.append(country_disease_count)
                 
     return countries
Developer: onebit1984, Project: epidetect, Lines of code: 27, Source file: models.py

Example 11: average_sentence_length

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def average_sentence_length(text):
    tokenizer = RegexpTokenizer(r' ([A-Z][^\.!?]*[\.!?])')
    sentences = tokenizer.tokenize(text)
    s = np.zeros(len(sentences))
    for inds, sentence in enumerate(sentences):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        s[inds] = len(tokens)
    return s, np.mean(s), np.std(s)
Developer: aerows, Project: NLP1-Project, Lines of code: 11, Source file: helper_functions.py

Example 12: stopWordRemoval

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def stopWordRemoval() :


	f = open('repos', 'r')
	strn = f.read()
	lst = strn.split('\n')

	i = 0
	while i < (len(lst) - 1) :
	
		name = lst[i].split("/")

		dummyFile = 'filteredData/' + name[1] + '/dummy.txt';
		dr = os.path.dirname(dummyFile)

		if not os.path.exists(dr) :
			os.makedirs(dr)

		ft = open('data/'+name[1]+'/title.txt')
		st = ft.read().lower()

		fd = open('data/'+name[1]+'/description.txt')
		sd = fd.read().lower()

		fc = open('data/'+name[1]+'/content.txt')
		sc = fc.read().lower()
		

		tokenizer = RegexpTokenizer(r'\w+')

		wordArrTitle = tokenizer.tokenize(st)
		wordArrDesc = tokenizer.tokenize(sd)
		wordArrData = tokenizer.tokenize(sc)

		filteredWordsTitle = [w for w in wordArrTitle if not w in stopwords.words('english')]
		filteredWordsDesc = [w for w in wordArrDesc if not w in stopwords.words('english')]
		filteredWordsData = [w for w in wordArrData if not w in stopwords.words('english')]

		wordnet_lem= WordNetLemmatizer()


		ftf = open('filteredData/'+name[1]+'/title.lst','w')
		for w in filteredWordsTitle:
			#print w
			ftf.write(wordnet_lem.lemmatize(w)+'\n')

		fdf = open('filteredData/'+name[1]+'/description.lst','w')
		for w in filteredWordsDesc:
			#print w
			fdf.write(wordnet_lem.lemmatize(w)+'\n')

		fcf = open('filteredData/'+name[1]+'/content.lst','w')
		for w in filteredWordsData:
			print w+'\n'
			fcf.write(wordnet_lem.lemmatize(w)+'\n')
		
		i=i+2
Developer: g31pranjal, Project: git-analysis, Lines of code: 59, Source file: script.py

Example 13: calculate_freqs

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def calculate_freqs(data, toExclude):
    # lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words("english")
    sents = nltk.tokenize.sent_tokenize(data)
    tokenizer = RegexpTokenizer(r"\w+\'?\w+")

    # tagged_sentences = [w for s in sents for w in nltk.pos_tag(word_tokenize(s))]
    # words = [lemmatizer.lemmatize(w[0].lower(), get_wordnet_pos(w[1])) for w in tagged_sentences] # if w.lower() not in stopwords]
    if toExclude:
        words = [w for s in sents for w in tokenizer.tokenize(s) if w.lower() not in stopwords]
    else:
        words = [w for s in sents for w in tokenizer.tokenize(s)]
    return words
Developer: oowowaee, Project: wwlns, Lines of code: 15, Source file: extract_features.py

Example 14: _generate_answer_question_pair

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
    def _generate_answer_question_pair(self, question, article, X_train_words, Y_train_words, max_seqlen, max_queslen):

        tokenizer = RegexpTokenizer(r'\w+')
        answer =  re.split(r'\t+', question)[1]
        question_txt = tokenizer.tokenize(question)[1:-2]
        ref = int(re.split(r'\t+', question)[-1]) - 1
        seq = tokenizer.tokenize(article[ref])[1:] + question_txt

        if len(seq) > max_seqlen:
            max_seqlen = len(seq)
        X_train_words.append(seq)
        Y_train_words.append(answer)
        return max_seqlen, max_queslen
Developer: danstrawser, Project: Nlp2Commands, Lines of code: 15, Source file: babi_processor.py

Example 15: parse_document

# Required module import: from nltk.tokenize import RegexpTokenizer [as alias]
# Or: from nltk.tokenize.RegexpTokenizer import tokenize [as alias]
def parse_document(filename,query):
    myfile = codecs.open(filename,"r","utf-8")
    raw = myfile.read()
    sentences = sent_tokenize(raw)
    tokenizer = RegexpTokenizer(r'\w+') #tokenizer.tokenize(sentences[0])
    stop = stopwords.words('english')

    sents = [[token.lower() for token in tokenizer.tokenize(sentence) if
               not(token in stop or token.isdigit())] for sentence in sentences]

    query_t = [token for token in tokenizer.tokenize(query) if not(token in stop or token.isdigit())]
    cloud = " ".join(list(itertools.chain(*sents)))
    return cloud,query_t
Developer: cderici, Project: hazircevap, Lines of code: 15, Source file: cloud_stats.py


Note: The nltk.tokenize.RegexpTokenizer.tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their original developers, and copyright in the source code remains with the original authors; for distribution and use, please refer to the License of the corresponding project. Do not reproduce without permission.