This article collects typical usage examples of the Python function nltk.regexp_tokenize. If you have been wondering what regexp_tokenize does, how to call it, or what it looks like in real code, the curated examples below may help.
The following presents 15 code examples of regexp_tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
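Before the collected examples, here is a minimal, self-contained sketch of the call itself; the sample sentence and pattern are illustrative only and not taken from any of the examples below:

import nltk

# regexp_tokenize(text, pattern) returns every non-overlapping match of
# pattern in text, in order; this pattern keeps numbers (with optional
# decimals and a percent sign) and plain words, and drops punctuation.
tokens = nltk.regexp_tokenize("Prices rose 4.2% in 2024, then fell.",
                              r"\d+(?:\.\d+)?%?|\w+")
# tokens should come out roughly as:
# ['Prices', 'rose', '4.2%', 'in', '2024', 'then', 'fell']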
Example 1: get_freqs
def get_freqs(text):
    stop_words = nltk.corpus.stopwords.words('english')
    frequencies = defaultdict(int)
    pattern = r'''(?x)          # set flag to allow verbose regexps
          ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*            # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    '''
    if type(text) == list:
        print 'number of links: ' + str(len(text))
        for t in text:
            content = t['content']
            tokens = nltk.regexp_tokenize(content, pattern)
            for word in tokens:
                if len(word) > 2 and word.lower() not in stop_words:
                    cap = word[0].upper() + word[1:]
                    frequencies[cap] += 1
    else:
        tokens = nltk.regexp_tokenize(text, pattern)
        for word in tokens:
            if len(word) > 2 and word not in stop_words:
                frequencies[word] += 1
    print "frequency size: " + str(len(frequencies))
    return frequencies
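A hypothetical driver for get_freqs, showing the two input shapes its type check accepts (the link dicts and sentences are invented; the nltk and collections.defaultdict imports from the original module are assumed):

links = [{'content': "The U.S.A. budget grew 3.5% this year."},
         {'content': "Budget growth slowed in the second half."}]
link_freqs = get_freqs(links)      # list input: one dict per link, text under 'content'
text_freqs = get_freqs("Plain string input is tokenized directly.")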
Example 2: bag_of_words
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """"""
    word_dict = Alphabet()
    stopset = set(stopwords.words('english'))
    for key, value in data.items():
        label_codebook.add(key)
        for doc in value:
            doc_tokens = set(nltk.regexp_tokenize(doc, pattern="\w+"))
            for word in doc_tokens:
                if word not in stopset:
                    word_dict.add(word)
    all_words = word_dict._label_to_index.keys()
    fdict = FreqDist([w for w in all_words])
    word_feature = fdict.keys()[theta:]
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)
    instance_list = {}
    for label, document_list in data.items():
        instance_list[label] = []
        for document in document_list:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern="\w+"))
            indice = 0
            for word in tokens:
                if feature_codebook.has_label(word):
                    indice = feature_codebook.get_index(word)
                    vector[indice] = 1.
            instance_list[label].append(vector)
    return instance_list
Example 3: load
def load(f=str):
    import re
    files = open(f)
    raw = files.read()
    pattern = re.compile(r"""\$?\d+(\.\d+)?%?   # currency
                           | \d+/\d+/\d+        # dates""", re.VERBOSE)
    nltk.regexp_tokenize(raw, pattern)
Example 4: nltkTest
def nltkTest():
    s = "russia licenza 8.1.5 U.S."
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
    s = "Saldo vs. Fattura n. 2015/004"
    res = nltk.regexp_tokenize(s, helper.nltkPattern)
    print(res)
Example 5: regularExpressionTokenizer
def regularExpressionTokenizer():
    text = 'That U.S.A. poster-print costs $12.40...'
    pattern = r'''(?x)          # set flag to allow verbose regexps
          ([A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(-\w+)*            # words with optional internal hyphens
        | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():-_`]      # these are separate tokens
    '''
    print nltk.regexp_tokenize(text, pattern)
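A side note that is not part of the original example: in current NLTK releases regexp_tokenize matches with re.findall under the hood, so capturing parentheses such as ([A-Z]\.)+ make it return the captured groups instead of the full tokens. A variant of the same pattern using only non-capturing groups (and with the hyphen moved to the end of the character class so it is taken literally) is sketched below; it should reproduce the token list shown in the NLTK book:

import nltk

text = 'That U.S.A. poster-print costs $12.40...'
pattern = r'''(?x)              # verbose regexp
      (?:[A-Z]\.)+              # abbreviations, e.g. U.S.A.
    | \w+(?:-\w+)*              # words with optional internal hyphens
    | \$?\d+(?:\.\d+)?%?        # currency and percentages, e.g. $12.40, 82%
    | \.\.\.                    # ellipsis
    | [][.,;"'?():_`-]          # these are separate tokens
'''
print(nltk.regexp_tokenize(text, pattern))
# expected output, roughly:
# ['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', '...']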
Example 6: get_links
def get_links(text):
    # checks only for 'http://...' and 'www...'
    text = text + " "
    pat = "http://.*?\s"
    links = nltk.regexp_tokenize(text, pat)
    text = " " + text + " "
    pat = "\swww\..*?\..*?\s"
    links.extend(nltk.regexp_tokenize(text, pat))
    links = map(lambda x: x[:-1], links)
    return links
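A small, made-up driver for get_links; the space padding inside the function is what lets the non-greedy, whitespace-terminated patterns catch a link at the very end of the string:

sample = "docs at http://example.com/guide plus a mirror at www.example.org for reference"
urls = list(get_links(sample))   # list() guards against map() returning an iterator on Python 3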
Example 7: poss_test
def poss_test(test_file, test_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)
    t = open(test_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    stopwords = sw
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a == 0:
            a += 1
            continue
        if a % 1000 == 0:
            print a
        a += 1
        #if a == 8:
        #    sys.exit(1)
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #light stem
        #title = set([stem(word) for word in title])
        #body = set(body)
        #body = set([stem(word) for word in body])
        #remove stopwords
        #body = filter(g,body)
        #title = filter(g,title)
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('%s , %s \n' % (title, body))
Example 8: poss_test
def poss_test(test_file, test_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(test_file)
    reader = csv.reader(f)
    t = open(test_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 10000 == 0:
            print(a)
        a += 1
        #if a == 8:
        #    sys.exit(1)
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"(\.?[a-z][a-z0-9\+\.\#\-]+[a-z0-9\+\#])"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        #light stem
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s"\n' % (row[0], title, body))
Example 9: query_episode
def query_episode(self, show_title,
                  ep_title, se_number, ep_number, runtime):
    """build video list prior to scoring
    """
    qres = {}
    # Query 1
    qlist = (show_title, ep_title)
    # Search YouTube
    tmp = self.search('%s %s' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Query 2
    qlist = (show_title, ep_title,
             se_number, ep_number)
    # Search YouTube
    tmp = self.search('%s %s %s %s' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Query 3
    qlist = (show_title,
             se_number, ep_number)
    # Search YouTube
    tmp = self.search('%s s%02de%02d' % qlist)
    for k, v in tmp.items():
        qres[k] = v
    # Show tokens
    sh_stem = [self._lancaster.stem(t)
               for t in nltk.regexp_tokenize(
                   show_title.encode('utf8'), r"\w+")]
    # Episode stem tokens if exist
    if ep_title:
        ep_stem = [self._lancaster.stem(t)
                   for t in nltk.regexp_tokenize(
                       ep_title.encode('utf8'), r"\w+")]
    else:
        ep_stem = None
    res = {'Output': qres,
           'Input': {},}
    res['Input']['show_title'] = show_title
    res['Input']['ep_title'] = ep_title
    res['Input']['sh_stem'] = sh_stem
    res['Input']['ep_stem'] = ep_stem
    res['Input']['se_number'] = se_number
    res['Input']['ep_number'] = ep_number
    res['Input']['runtime'] = runtime
    return res
Example 10: poss_train
def poss_train(train_file, train_write, sw_file):
    """
    Arguments:
    - `train_file`:
    """
    a = 0
    f = open(train_file)
    reader = csv.reader(f)
    t = open(train_write, "w")
    sw = open(sw_file)
    sw = sw.readlines()
    sw = [word.strip() for word in sw]
    #stopwords = sw # use nltk stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    print "stopword list length", len(stopwords)
    stopwords = set(stopwords)
    g = lambda x: x not in stopwords
    for row in reader:
        if a % 100000 == 0:
            print a
        a += 1
        title = row[1].lower()
        #clean html
        body = nltk.clean_html(row[2].lower())
        #word tokenize
        pattern = r"([a-z])\w+"
        body = nltk.regexp_tokenize(body, pattern)
        title = nltk.regexp_tokenize(title, pattern)
        #remove stopwords
        body = filter(g, body)
        title = filter(g, title)
        #light stem
        #st = LancasterStemmer()
        title = set([stem(word) for word in title])
        body = set(body)
        body = set([stem(word) for word in body])
        # list to string
        body = ' '.join(body)
        title = ' '.join(title)
        t.write('"%s","%s","%s","%s"\n' % (row[0], title, body, row[3]))
Example 11: normalized
def normalized(text, lowercase=True, fix=True, tuples=False):
    """Tokenize, remove capitalization and exclude punctuation
    """
    if fix:
        text = fix_text(unicode(text))
    pattern = r"""(?x)          # verbose regexps
        \w+(-\w+)*              # words with optional internal hyphens
        """
    result = [w for w in nltk.regexp_tokenize(text, pattern)]
    if lowercase:
        result = [w.lower() for w in nltk.regexp_tokenize(text, pattern)]
    if tuples:
        result = tuple(result)
    return result
Example 12: compute_df
def compute_df(self, document_list):
    '''Compute document frequency based on input document list'''
    df_cache = dict()
    df_output = dict()
    d_index = 0
    for document in document_list:
        d_index += 1
        # tokenize each document
        reg_toks = nltk.regexp_tokenize(document, SENTENCE_RE)
        for item in reg_toks:
            # change each word to lower case and lemmatize
            item = normalise(item)
            if item not in df_cache:
                df_cache[item] = set([d_index])
            else:
                df_cache[item].add(d_index)
    for item in df_cache:
        if acceptable_word(item):
            df_output[item] = len(df_cache[item])
    df_output['total_document'] = len(document_list)
    return df_output
Example 13: main
def main(self, text):
    """Breaks a single string into a tree using the grammar and returns
    the specified words as a string."""
    if text is None:
        return None
    try:
        text = text.encode("ascii", "ignore")
    except:
        text = text.decode("utf-8", "ignore").encode("ascii", "ignore")
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    #print postoks
    tree = chunker.parse(postoks)
    terms = self.get_terms(tree)
    words = self.get_words(terms)
    return words
Example 14: generate_vocab
def generate_vocab(papers):
    """Returns the vocabulary used in the papers given in parameters, after cleaning and stopword removal.

    Args:
        papers (list of tuples): the raw list of papers from which to generate the vocabulary
            (each element is a tuple of 3 strings: id, title and abstract)

    Returns:
        list of strings: the list of tokens forming the vocabulary
    """
    sc = StringCleaner()
    # Generate author's vocabulary
    corpus = " ".join(p[1] + " " + p[2] for p in papers)
    # Cleaning
    corpus = sc.clean_string(corpus)
    # Tokenization
    pattern = r"(?:[A-Z]\.)+|\w+(?:-\w+)*|\d+(?:\.\d+)?%?"
    # we keep tokens that are words (with optional internal hyphens), acronyms and percentages
    tokens = set(nltk.regexp_tokenize(corpus, pattern)) - set(nltk.corpus.stopwords.words("english"))
    num_re = re.compile("^\d+$")
    tokens = set([t for t in tokens if not num_re.match(t)])  # we remove only-numeric tokens
    # Stemming
    porter = nltk.stem.PorterStemmer()
    return [porter.stem(t) for t in tokens]
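A hypothetical driver for generate_vocab; the paper tuples are invented, and StringCleaner is assumed to be importable from the same project as the original function:

papers = [
    ("p1", "Neural Topic Models", "We study topic models trained with variational inference."),
    ("p2", "Graph Embeddings", "Low-rank node embeddings recover 85% of the graph structure."),
]
vocab = generate_vocab(papers)   # stemmed, stopword-free, non-numeric tokens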
Example 15: extract
def extract(self, text):
    ''' Extract and freudify noun phrases from text, return all successfully
    freudified noun phrases. '''
    toks = nltk.regexp_tokenize(text, self.sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = self.chunker.parse(postoks)
    terms = self._get_terms(tree)
    phrases = sets.Set()
    # Loop through all the noun phrases and try to freudify them.
    for term in terms:
        if (len(term)) < 2: continue
        changed = False
        context = ""
        phrase = []
        for part in term:
            word, tag = part
            word = word.encode('ascii', 'replace')
            phrase.append(word.lower())
            rpl = self.replace_word(tag[:2], word)
            if len(rpl[2]) > 0:
                context = rpl[2]
                phrase[-1] = rpl[0]
                changed = True
        if changed:
            phrase = " ".join(phrase).strip()
            phrase.encode('ascii', 'replace')
            phrase = str(phrase)
            if phrase not in self.own_phrases[context]:
                phrases.add((str(phrase), context))
    phrases = list(phrases)
    return phrases