This article collects typical usage examples of the Python method nltk.tokenize.RegexpTokenizer.tokenize. If you are wondering exactly what RegexpTokenizer.tokenize does or how to use it, the curated code examples below should help; you can also look further into the usage of the class the method belongs to, nltk.tokenize.RegexpTokenizer.
The sections below show 15 code examples of RegexpTokenizer.tokenize, ordered roughly by popularity.
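Before the examples, here is a minimal, self-contained sketch of the method itself; the pattern and the sample sentence are illustrative only and are not taken from the examples below. RegexpTokenizer is constructed from a regular expression, and tokenize() returns the list of substrings of the input that match it.

from nltk.tokenize import RegexpTokenizer

# Match runs of word characters, monetary amounts such as "$3.88",
# or any other run of non-whitespace characters (e.g. punctuation).
tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

tokens = tokenizer.tokenize("Good muffins cost $3.88 in New York.")
print(tokens)
# Expected output (roughly):
# ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']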
Example 1: __init__
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to',
                          'have', 'is', 'are', 'were', 'and', 'very', '.', ','])
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied'])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as single tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([A-Z]\.)+|\w+|\$[\d\.]+')
    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)
    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])
    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords
    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
Example 2: demo
def demo():
    # from nltk.corpus import brown
    # from nltk.probability import LidstoneProbDist, WittenBellProbDist
    # estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # estimator = lambda fdist, bins: WittenBellProbDist(fdist, 0.2)
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"[\w']+")
    lm = NgramcModel(5)
    print(lm)
    sent = "Like a bridge over troubled water, I will lay it down."
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy:", lm.entropy(words))
    sent = "over twenty year and he"
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy:", lm.entropy(words))
    sent = "over twenty years and he"
    print(sent)
    words = tokenizer.tokenize(sent)
    print("Entropy:", lm.entropy(words))
    print(lm.getBetter(["men", "are", "imporant", "for", "the"],
                       ["men", "are", "important", "for", "the"]))
Example 3: parse_questions
def parse_questions(self):
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    for questions_key in self.rawSamples:
        # Stem the question text
        question_text = self.rawSamples[questions_key][0]
        words_array = tokenizer.tokenize(question_text)
        question_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                question_text += (word + " ")
        self.rawSamples[questions_key][0] = question_text
        # Stem the topic names
        topics_text = self.rawSamples[questions_key][2]
        words_array = tokenizer.tokenize(topics_text)
        topics_text = ""
        for word in words_array:
            if word.isnumeric():
                continue
            if word not in text.ENGLISH_STOP_WORDS:
                word = stemmer.stem(word)
                topics_text += (word + " ")
        self.rawSamples[questions_key][2] = topics_text
Example 4: StringSpellchecksFinder
from difflib import SequenceMatcher
from nltk.tokenize import RegexpTokenizer


class StringSpellchecksFinder(object):
    """
    Compares two strings, finding words that have been corrected.
    """
    def __init__(self, similarity=0.7):
        self.tokenizer = RegexpTokenizer(r'[\w-]+')
        self.similarity = similarity

    def find(self, text_before, text_after):
        """
        Finds all spellcheck tuples (mistake, correction) in the given text
        """
        spellchecks = []
        text_before_tokens = [x.lower() for x in self.tokenizer.tokenize(text_before)]
        text_after_tokens = [x.lower() for x in self.tokenizer.tokenize(text_after)]
        diff_matching = SequenceMatcher(None, text_before_tokens, text_after_tokens)
        for difference in [op for op in diff_matching.get_opcodes() if op[0] == 'replace']:
            sequence_before = text_before_tokens[difference[1]:difference[2]]
            sequence_after = text_after_tokens[difference[3]:difference[4]]
            spellchecks += self.find_best_match(sequence_before, sequence_after)
        return spellchecks

    def find_best_match(self, sequence_before, sequence_after):
        """
        Finds the most probable (mistake, correction) pairing of elements
        """
        pairs = []
        possibilities = [[(element1, element2, SequenceMatcher(None, element1, element2).ratio())
                          for element2 in sequence_after]
                         for element1 in sequence_before]
        for possibility in possibilities:
            possibility = [p for p in possibility if p[2] >= self.similarity]
            if possibility:
                possibility.sort(key=lambda p: p[2], reverse=True)
                pairs.append((possibility[0][0], possibility[0][1]))
        return pairs
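For illustration, here is one possible way to exercise the class above; the sample strings are invented, and the expected result follows from the similarity threshold of 0.7.

finder = StringSpellchecksFinder(similarity=0.7)
print(finder.find("I recieved the package", "I received the package"))
# Expected (roughly): [('recieved', 'received')]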
Example 5: getData
def getData():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open("msr_paraphrase_train.txt", "r")
    f.readline()
    trainInput = []
    trainClass = [0] * 8160
    i = 0
    while i < 8160:
        tokens = f.readline().strip().split('\t')
        trainClass[i] = trainClass[i+1] = int(tokens[0])
        i += 2
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix1 = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix2 = sentenceToMatrix(S)
        trainInput.append([np.transpose(Smatrix1 + Smatrix2)])
        trainInput.append([np.transpose(Smatrix2 + Smatrix1)])
    f.close()
    f = open("msr_paraphrase_test.txt", "r")
    f.readline()
    testInput = []
    testClass = [0] * 1725
    for i in range(0, 1725):
        tokens = f.readline().strip().split('\t')
        testClass[i] = int(tokens[0])
        S = tokenizer.tokenize(tokens[3].lower())
        Smatrix = sentenceToMatrix(S)
        S = tokenizer.tokenize(tokens[4].lower())
        Smatrix.extend(sentenceToMatrix(S))
        testInput.append([np.transpose(Smatrix)])
    f.close()
    return trainInput, trainClass, testInput, testClass
Example 6: __init__
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    @param rtepair: a L{RTEPair} from which features should be extracted
    @param stop: if C{True}, stopwords are thrown away.
    @type stop: C{bool}
    """
    self.stop = stop
    self.stopwords = set(
        ["a", "the", "it", "they", "of", "in", "to", "have", "is", "are", "were", "and", "very", ".", ","]
    )
    self.negwords = set(["no", "not", "never", "failed", "rejected", "denied"])
    # Try to tokenize so that abbreviations like U.S. and monetary amounts
    # like "$23.00" are kept as single tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r"([A-Z]\.)+|\w+|\$[\d\.]+")
    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)
    if lemmatize:
        self.text_words = set([lemmatize(token) for token in self.text_tokens])
        self.hyp_words = set([lemmatize(token) for token in self.hyp_tokens])
    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords
    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
Example 7: categorize_input_query
def categorize_input_query(self, input_query):
    query_category = OrderedDict([])
    input_query = self.replace_punctuation_in_query_string(input_query)
    phrasal_not_tokenizer = RegexpTokenizer(r'![\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
    word_not_tokenizer = RegexpTokenizer(r'!(\w+[-]*(\w)*)')
    not_queries_set = set(word_not_tokenizer.tokenize(input_query))
    not_queries_set = not_queries_set.union(set(phrasal_not_tokenizer.tokenize(input_query)))
    string_copy = input_query
    string_copy = re.sub(r"\".*?\"", "", string_copy)
    string_copy = re.sub(r"!.*?(\s|$)", "", string_copy)
    modified_not_words = []
    for words in not_queries_set:
        # strip the leading '!' from the NOT terms
        modified_not_words.append(words[1:])
    phrase_tokenizer = RegexpTokenizer(r'[\"]+(\w+[-]*(\w+)*(\s*)(\w)*)*[\"]')
    phrase_queries_set = set(phrase_tokenizer.tokenize(input_query))
    phrase_queries_set = phrase_queries_set.difference(set(modified_not_words))
    query_category["PHRASE"] = phrase_queries_set
    query_category["NOT"] = modified_not_words
    normal_words = string_copy.split()
    normal_word_set = set(normal_words)
    query_category["WORD"] = normal_word_set
    return query_category
Example 8: __init__
def __init__(self, rtepair, stop=True, lemmatize=False):
    """
    :param rtepair: a ``RTEPair`` from which features should be extracted
    :param stop: if ``True``, stopwords are thrown away.
    :type stop: bool
    """
    self.stop = stop
    self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
                          'have', 'are', 'were', 'and', 'very', '.', ','])
    self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
                         'denied'])
    # Try to tokenize so that abbreviations, monetary amounts, email
    # addresses, URLs are single tokens.
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'([\w.@:/])+|\w+|\$[\d.]+')
    # Get the set of word types for text and hypothesis
    self.text_tokens = tokenizer.tokenize(rtepair.text)
    self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
    self.text_words = set(self.text_tokens)
    self.hyp_words = set(self.hyp_tokens)
    if lemmatize:
        self.text_words = set(lemmatize(token) for token in self.text_tokens)
        self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)
    if self.stop:
        self.text_words = self.text_words - self.stopwords
        self.hyp_words = self.hyp_words - self.stopwords
    self._overlap = self.hyp_words & self.text_words
    self._hyp_extra = self.hyp_words - self.text_words
    self._txt_extra = self.text_words - self.hyp_words
Example 9: HashtagMatch
class HashtagMatch:
    def __init__(self, name_matcher):
        from nltk.tokenize import RegexpTokenizer
        self._name_matcher = name_matcher
        self._hashtag_extract = RegexpTokenizer('(#[A-Za-z][A-Za-z0-9-_]+)')
        self._at_extract = RegexpTokenizer('(@[A-Za-z][A-Za-z0-9-_]+)')

    def extract_hashtag(self, text):
        return self._hashtag_extract.tokenize(text)

    def extract_at(self, text):
        return self._at_extract.tokenize(text)

    def match(self, text):
        # Drop the leading '#' or '@' and try every segmentation of the rest.
        segs = [' '.join(seg) for seg in self.segment(text[1:])]
        entities = map(self._name_matcher.exact_match, segs)
        return [e for e in entities if e]

    def segment(self, text):
        # Enumerate all 2**(len(text)-1) ways of splitting the string into
        # contiguous pieces, encoding each choice of cut points as a bit string.
        n = len(text) - 1
        count = 2 ** n
        sequences = map(lambda x: bin(x)[2:].zfill(n), range(count))
        segmentations = []
        for s in sequences:
            segmentation = []
            begin = 0
            for i in range(n):
                end = i + 1
                if s[i] == '1':
                    segmentation.append(''.join(text[begin:end]))
                    begin = end
            segmentation.append(''.join(text[begin:end + 1]))
            segmentations.append(segmentation)
        return segmentations
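A small illustration of the segmentation step above: segment() does not touch the name matcher, so a placeholder can be passed to the constructor; the input string here is invented.

hm = HashtagMatch(None)
print(hm.segment('abc'))
# Expected (roughly): [['abc'], ['ab', 'c'], ['a', 'bc'], ['a', 'b', 'c']]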
Example 10: get_outbreak_countries
def get_outbreak_countries(disease=all):
    tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')
    countries = []
    if disease == all:
        for location in Location.objects.all():
            country = tokenizer.tokenize(location.name)
            country = country[len(country)-1]
            if country not in countries:
                countries.append(str(country))
    else:
        for tweet in Tweet.objects.filter(disease_type__contains=disease):
            if tweet.location:
                country = tokenizer.tokenize(tweet.location.name)
                country = country[len(country)-1]
                country_disease_count = [str(country),
                                         len(Tweet.objects.filter(disease_type__contains=disease,
                                                                  location_string__contains=country)),
                                         disease]
                if country_disease_count not in countries:
                    countries.append(country_disease_count)
    return countries
Example 11: average_sentence_length
def average_sentence_length(text):
    tokenizer = RegexpTokenizer(r' ([A-Z][^\.!?]*[\.!?])')
    sentences = tokenizer.tokenize(text)
    s = np.zeros(len(sentences))
    for inds, sentence in enumerate(sentences):
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        s[inds] = len(tokens)
    return s, np.mean(s), np.std(s)
Example 12: stopWordRemoval
def stopWordRemoval():
    f = open('repos', 'r')
    strn = f.read()
    lst = strn.split('\n')
    i = 0
    while i < (len(lst) - 1):
        name = lst[i].split("/")
        dummyFile = 'filteredData/' + name[1] + '/dummy.txt'
        dr = os.path.dirname(dummyFile)
        if not os.path.exists(dr):
            os.makedirs(dr)
        ft = open('data/' + name[1] + '/title.txt')
        st = ft.read().lower()
        fd = open('data/' + name[1] + '/description.txt')
        sd = fd.read().lower()
        fc = open('data/' + name[1] + '/content.txt')
        sc = fc.read().lower()
        tokenizer = RegexpTokenizer(r'\w+')
        wordArrTitle = tokenizer.tokenize(st)
        wordArrDesc = tokenizer.tokenize(sd)
        wordArrData = tokenizer.tokenize(sc)
        filteredWordsTitle = [w for w in wordArrTitle if w not in stopwords.words('english')]
        filteredWordsDesc = [w for w in wordArrDesc if w not in stopwords.words('english')]
        filteredWordsData = [w for w in wordArrData if w not in stopwords.words('english')]
        wordnet_lem = WordNetLemmatizer()
        ftf = open('filteredData/' + name[1] + '/title.lst', 'w')
        for w in filteredWordsTitle:
            ftf.write(wordnet_lem.lemmatize(w) + '\n')
        fdf = open('filteredData/' + name[1] + '/description.lst', 'w')
        for w in filteredWordsDesc:
            fdf.write(wordnet_lem.lemmatize(w) + '\n')
        fcf = open('filteredData/' + name[1] + '/content.lst', 'w')
        for w in filteredWordsData:
            print(w + '\n')
            fcf.write(wordnet_lem.lemmatize(w) + '\n')
        i = i + 2
Example 13: calculate_freqs
def calculate_freqs(data, toExclude):
    # lemmatizer = WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words("english")
    sents = nltk.tokenize.sent_tokenize(data)
    tokenizer = RegexpTokenizer(r"\w+\'?\w+")
    # tagged_sentences = [w for s in sents for w in nltk.pos_tag(word_tokenize(s))]
    # words = [lemmatizer.lemmatize(w[0].lower(), get_wordnet_pos(w[1])) for w in tagged_sentences]  # if w.lower() not in stopwords]
    if toExclude:
        words = [w for s in sents for w in tokenizer.tokenize(s) if w.lower() not in stopwords]
    else:
        words = [w for s in sents for w in tokenizer.tokenize(s)]
    return words
Example 14: _generate_answer_question_pair
def _generate_answer_question_pair(self, question, article, X_train_words, Y_train_words, max_seqlen, max_queslen):
    tokenizer = RegexpTokenizer(r'\w+')
    answer = re.split(r'\t+', question)[1]
    question_txt = tokenizer.tokenize(question)[1:-2]
    ref = int(re.split(r'\t+', question)[-1]) - 1
    seq = tokenizer.tokenize(article[ref])[1:] + question_txt
    if len(seq) > max_seqlen:
        max_seqlen = len(seq)
    X_train_words.append(seq)
    Y_train_words.append(answer)
    return max_seqlen, max_queslen
Example 15: parse_document
def parse_document(filename, query):
    myfile = codecs.open(filename, "r", "utf-8")
    raw = myfile.read()
    sentences = sent_tokenize(raw)
    tokenizer = RegexpTokenizer(r'\w+')  # tokenizer.tokenize(sentences[0])
    stop = stopwords.words('english')
    sents = [[token.lower() for token in tokenizer.tokenize(sentence) if
              not (token in stop or token.isdigit())] for sentence in sentences]
    query_t = [token for token in tokenizer.tokenize(query) if not (token in stop or token.isdigit())]
    cloud = " ".join(list(itertools.chain(*sents)))
    return cloud, query_t