This article collects typical usage examples of the Python method util.Util.tokenize. If you have been wondering what Util.tokenize does, how to call it, or where to find usage examples, the curated code samples below should help. You can also read further about its enclosing class, util.Util.
The following presents 10 code examples of Util.tokenize, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
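The implementation of util.Util.tokenize itself is not reproduced on this page; the examples only show how it is called, either with a single string or with a string plus a compiled regular expression such as Summarizer.non_space. As a rough orientation, a hypothetical helper with that calling convention might look like the sketch below (the class body, DEFAULT_PATTERN, and the regex are assumptions, not the library's actual code):
import re

class Util(object):
    # Hypothetical sketch only -- the real util.Util.tokenize may differ.
    DEFAULT_PATTERN = re.compile(r"\S+")  # assumed default: runs of non-whitespace characters

    @staticmethod
    def tokenize(text, pattern=None):
        # Return the tokens of 'text' matched by 'pattern' (a compiled regex).
        if pattern is None:
            pattern = Util.DEFAULT_PATTERN
        return pattern.findall(text)

With this sketch, Util.tokenize("A short sentence.") would return ['A', 'short', 'sentence.'].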
Example 1: main
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def main(argv):
    """Compute the sentence frequency of each term"""
    # How many sentences does each word appear in?
    lexicon = defaultdict(lambda: set())
    for arg in argv:
        with open(arg, 'r') as fin:
            sentences = list()
            for line in fin:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences.append(sent)
                        sentence = []
            for sent in sentences:
                for w in sent.stemmed:
                    lexicon[w].add(sent)  # set() will de-duplicate
    sf = {}
    for w in lexicon:
        sf[w] = len(lexicon[w])
    # print sf
    with open('sf.dat', 'wb') as out:
        pickle.dump(sf, out)
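Because the sentence frequencies are pickled to sf.dat, a later script can reload and inspect them. A minimal sketch, assuming sf.dat was produced by the function above:
import pickle

with open('sf.dat', 'rb') as fin:
    sf = pickle.load(fin)  # dict: stemmed word -> number of sentences containing it
# the ten terms that appear in the most sentences
print(sorted(sf, key=sf.get, reverse=True)[:10])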
Example 2: summarize
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def summarize(self, document_path):
    sentences = {}
    counter = 0
    with open(document_path, 'r') as f:
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences[sent] = (sent.tfidf(self.tf, self.df, Summarizer.NUM_DOCS), counter)
                    sentence = []
                    counter += 1
    totalWords = 0
    selected = []
    already_included = set()
    # Use the tf-idf score to sort the sentences
    for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
        if sent not in already_included:  # no duplicates
            already_included.add(sent)
            selected.append(sent)
            totalWords += sent.getLength()
            if totalWords > 100:
                break
    # return the selected sentences in their order of appearance in the document
    return sorted(selected, key=lambda x: sentences[x][1])
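The selection above follows a common pattern: rank candidates by score, take them greedily until a word budget is exceeded, then re-emit the chosen ones in document order. A self-contained sketch of that pattern on plain tuples (hypothetical toy data rather than the Sentence class used above):
# (position, text, score, word_count) for a few toy sentences
candidates = [
    (0, "First sentence.", 0.40, 2),
    (1, "Second, more informative sentence.", 0.90, 4),
    (2, "Third sentence.", 0.10, 2),
]

budget = 5
total = 0
picked = []
# greedily take the highest-scoring sentences until the budget is exceeded
for pos, text, score, length in sorted(candidates, key=lambda c: c[2], reverse=True):
    picked.append((pos, text))
    total += length
    if total > budget:
        break
# restore original document order before emitting the summary
print([text for pos, text in sorted(picked)])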
Example 3: format
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def format(file):
    # 'file' is the name of the review file that will be cleaned
    # open the review file
    f = open(file, "r")
    # number of reviews written so far
    number = 0
    for line in f:
        # only process the actual review text
        if "review/text" in line:
            number += 1
            # strip the field label, keeping only the review body
            temp = line.replace("review/text:", "")
            # turn the review into a tokenized list
            words = Util.tokenize(temp)
            # create a new file named after the review number
            fi = open(str(number) + ".txt", "w")
            # for each word in the review...
            for word in words:
                # only keep words that match the following pattern
                if (re.compile("^[a-zA-Z'-]+$")).match(word):
                    # add the word to the file
                    fi.write(word + " ")
            # close the file
            fi.close()
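The regular expression ^[a-zA-Z'-]+$ keeps only tokens made entirely of ASCII letters, apostrophes, and hyphens, so tokens containing digits or other punctuation are dropped. A quick check of that behaviour with a few hypothetical tokens:
import re

pattern = re.compile("^[a-zA-Z'-]+$")
tokens = ["well-known", "don't", "2nd", "great!", "readable"]
print([t for t in tokens if pattern.match(t)])  # ['well-known', "don't", 'readable']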
Example 4: summarize
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def summarize(self, document_path):
    allwords = {}
    sentences = []
    with open(document_path, 'r') as f:
        index = 0
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.append(sent)
                    for t in sent.stemmed:
                        if t not in allwords:
                            allwords[t] = index
                            index += 1
                    sentence = []
    matrix = np.zeros((len(sentences), len(allwords)))
    for i, sent in enumerate(sentences):
        for t in sent.stemmed:
            matrix[i, allwords[t]] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)
    # Normalize
    normalizer = np.reshape(np.sum(matrix**2, axis=1)**0.5, (len(matrix), 1))
    matrix /= normalizer
    model = KMeans(n_clusters=Cluster.NUM_CLUSTERS, tol=1e-9)
    model.fit_predict(np.nan_to_num(matrix))
    labels = model.labels_
    totalWords = 0
    selected = []
    # From each cluster, pick the sentence that is nearest to the cluster
    # centroid
    for i in range(Cluster.NUM_CLUSTERS):
        member_indices = np.where(labels == i)
        distances = np.dot(matrix[member_indices], model.cluster_centers_[i])
        closest_index = np.argmin(distances, 0)
        # 'closest_index' is the index into the member_indices array
        member_index = member_indices[0][closest_index]
        selected.append((member_index, sentences[member_index]))  # stash the index of the sentence as well
        totalWords += sentences[member_index].getLength()
        if totalWords > 100:
            break
    # return the selected sentences in their order of appearance in the document
    return [s[1] for s in sorted(selected, key=lambda x: x[0])]
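As an isolated illustration of the pick-a-representative-per-cluster step, here is a small sketch on hypothetical toy data: the rows are unit-normalized, KMeans assigns cluster labels, and for each cluster we take the member with the smallest Euclidean distance to its centroid (for unit vectors this coincides with the largest dot product against the centroid):
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
points = rng.random((12, 4))
points /= np.linalg.norm(points, axis=1, keepdims=True)  # unit-normalize each row

model = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = model.fit_predict(points)

for c in range(3):
    members = np.where(labels == c)[0]
    dists = np.linalg.norm(points[members] - model.cluster_centers_[c], axis=1)
    representative = members[np.argmin(dists)]  # member closest to the centroid
    print("cluster", c, "-> representative row", representative)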
Example 5: summarize
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def summarize(self, document_path):
    allwords = {}
    sentences = []
    with open(document_path, 'r') as f:
        index = 0
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.append(sent)
                    for t in sent.stemmed:
                        if t not in allwords:
                            allwords[t] = index
                            index += 1
                    sentence = []
    matrix = np.zeros((len(allwords), len(sentences)))
    for i, sent in enumerate(sentences):
        for t in sent.stemmed:
            matrix[allwords[t], i] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)
    U, sigma, V_T = np.linalg.svd(matrix, full_matrices=False)  # V is already transposed
    # The rows of V_T correspond to 'independent topics', and the columns are the sentences.
    # For each topic, we pick the sentence that has the highest strength (value) in the row.
    max_cols = V_T.argmax(axis=1)
    already_included = set()
    totalWords = 0
    selected = []
    for i in max_cols:
        if i not in already_included:
            already_included.add(i)
            selected.append((i, sentences[i]))  # stash the index of the sentence as well
            totalWords += sentences[i].getLength()
            if totalWords > 100:
                break
    # return the selected sentences in their order of appearance in the document
    return [s[1] for s in sorted(selected, key=lambda x: x[0])]
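The SVD-based selection treats each row of V_T as a latent topic and each column as a sentence, then takes the strongest sentence per topic. The same pattern on a hypothetical toy term-by-sentence matrix:
import numpy as np

# rows = terms, columns = sentences (toy tf-idf-like weights)
matrix = np.array([
    [0.9, 0.0, 0.1],
    [0.8, 0.1, 0.0],
    [0.0, 0.7, 0.9],
    [0.1, 0.6, 0.8],
])

U, sigma, V_T = np.linalg.svd(matrix, full_matrices=False)
# for each latent topic (row of V_T), the column index of the strongest sentence
print(V_T.argmax(axis=1))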
Example 6: unigrams
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def unigrams(document_path):
    """Break sentences in a document into unigrams"""
    sentences = set()
    with open(document_path, 'r') as f:
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.add(sent)
                    sentence = []
    all_unigrams = set()
    for sentence in sentences:
        stemmed = sentence.stemmed
        for i in range(len(stemmed)):
            all_unigrams.add(stemmed[i])
    return all_unigrams
Example 7: analyze
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def analyze(self, filename=None, text=None):
    # analyze a new document using the stored values
    # if a filename is given, create a new Document object
    if filename is not None:
        doc = Document(None, filename)
        words = doc.tokenize()
    # otherwise, analyze the given text
    elif text is not None:
        words = Util.tokenize(text)
    # if both are None, report the error and return
    else:
        print("Analyzer requires a filename or text to analyze. Please try again.")
        return
    # store a dict of log value sums
    log_sums = {}
    # for every heuristic...
    for key in self.log_values:
        # initialize the running sum to 0
        current_sum = 0.0
        # iterate over the words
        for word in words:
            current_sum += self.log_values[key].get(word)
        # store the new sum
        log_sums[key] = current_sum
    # find the largest log sum; this could be done inside the loop above,
    # but we use an extra loop here for clarity
    # track the largest sum
    largest = -1.0
    # track the key with the largest sum
    largest_heuristic = ""
    # iterate through all the keys
    for key in log_sums:
        # if the new value is larger...
        if log_sums[key] > largest:
            # update the running best
            largest = log_sums[key]
            largest_heuristic = key
    # return the best key
    return largest_heuristic
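The analyze method sums, for each stored heuristic, the log values of the observed words and returns the heuristic with the largest total, which is the usual naive-Bayes-style decision rule. A self-contained sketch with hypothetical log values (the real self.log_values table is built elsewhere and is not shown on this page):
import math

# hypothetical per-heuristic log probabilities for a tiny vocabulary
log_values = {
    "positive": {"good": math.log(0.6), "bad": math.log(0.1), "film": math.log(0.3)},
    "negative": {"good": math.log(0.1), "bad": math.log(0.6), "film": math.log(0.3)},
}

words = ["good", "film", "good"]
log_sums = {}
for key, table in log_values.items():
    # sum the log values, with a tiny floor for words missing from the table
    log_sums[key] = sum(table.get(w, math.log(1e-9)) for w in words)

print(max(log_sums, key=log_sums.get))  # 'positive' for this toy input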
Example 8: format
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def format(file, max_r):
    # 'file' is the name of the review file that will be cleaned
    # open the review file
    f = open(file, "r")
    # number of reviews written to each class
    number1 = 0
    number2 = 0
    # initialize counters
    counter1 = -1
    counter2 = -1
    for line in f:
        # stop once we already have all the reviews we want
        if number1 == max_r and number2 == max_r:
            break
        # the score line tells us which class the upcoming review text belongs to
        if "review/score" in line:
            score = line.replace("review/score: ", "")
            score = float(score)
            score = int(score)
            if score == 1:
                counter1 = 3
            elif score == 5:
                counter2 = 3
        elif counter1 == 0 and number1 < max_r:
            # strip the field label, keeping only the review body
            temp = line.replace("review/text:", "")
            # turn the review into a tokenized list
            words = Util.tokenize(temp)
            # create a new file named after the review number
            fi = open("negative/" + str(number1) + ".txt", "w")
            number1 += 1
            # write out the words
            for word in words:
                # only keep words that match the following pattern
                if (re.compile("^[a-zA-Z'-]+$")).match(word):
                    fi.write(word + " ")
            fi.close()
        elif counter2 == 0 and number2 < max_r:
            # strip the field label, keeping only the review body
            temp = line.replace("review/text:", "")
            # turn the review into a tokenized list
            words = Util.tokenize(temp)
            # create a new file named after the review number
            fi = open("positive/" + str(number2) + ".txt", "w")
            number2 += 1
            # write out the words
            for word in words:
                # only keep words that match the following pattern
                if (re.compile("^[a-zA-Z'-]+$")).match(word):
                    fi.write(word + " ")
            fi.close()
        # if either counter is nonnegative, decrement it
        if counter1 > -1:
            counter1 -= 1
        if counter2 > -1:
            counter2 -= 1
    s = open("seed.txt", "w")
    s.write("positive\n" + str(number1) + "\n")
    s.write("negative\n" + str(number2))
    s.close()
Example 9: summarize
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def summarize(self, document_path):
    """
    Compute the PageRank of each sentence based on edge weights that are
    derived from the cosine similarity between sentences.
    """
    sentences = {}  # key=index, value=sentence
    with open(document_path, 'r') as f:
        index = 0
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences[index] = sent
                    sentence = []
                    index += 1
    # Now that we have the sentences, we need a lexicon so that we can compute
    # the cosine similarity between sentences.
    #
    lexicon = {}
    with open(os.path.join(Summarizer.BASE_DIR, 'LEXICON'), 'r') as f:
        word_counter = 0
        for line in f:
            count, word = line.split()
            lexicon[word] = word_counter
            word_counter += 1
    # Multi-dimensional vectors representing sentences in the space of words
    S = np.zeros((len(sentences), len(lexicon)), dtype=np.float64)
    for i in range(len(sentences)):
        for w in sentences[i].stemmed:
            S[i, lexicon[w]] = 1
    P = np.dot(S, S.transpose())  # Numerator of the cosine similarity expression
    # Now zero out the diagonal elements (corresponds to removing self loops)
    P -= np.diag(np.diag(P))
    # calculate the denominator of the cosine similarity expression (i.e. the
    # normalization factor): this is the product of the magnitudes of the two
    # vectors whose dot product makes up the numerator
    #
    # we start with calculating the square root of the sum of the squares of
    # each vector's components
    D = np.reshape(np.sum(S**2, axis=1)**0.5, (len(S), 1))
    # Now we compute the product of the square roots of the different vectors
    DD = np.dot(D, D.transpose())
    # element-wise division to get the similarity scores
    P = np.divide(P, DD)
    # At this point, P is essentially a weighted adjacency matrix
    G = networkx.DiGraph(np.nan_to_num(P))  # Look up documentation of numpy.nan_to_num()
    pagerank = networkx.pagerank_numpy(G)
    important = heapq.nlargest(100, pagerank, key=lambda x: pagerank[x])
    # print [(x, pagerank[x]) for x in important]
    totalWords = 0
    selected = []
    already_included = set()
    for i in important:
        if sentences[i] not in already_included:  # no duplicates
            already_included.add(sentences[i])
            selected.append((i, sentences[i]))
            totalWords += sentences[i].getLength()
            if totalWords > 100:
                break
    # return the selected sentences in their order of appearance in the document
    return [s[1] for s in sorted(selected, key=lambda x: x[0])]
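The same ranking idea can be reproduced on hypothetical toy vectors: build a graph whose edge weights are pairwise cosine similarities, then rank nodes with PageRank (here via networkx.pagerank on an explicitly built weighted graph):
import numpy as np
import networkx

vectors = np.array([
    [1.0, 0.0, 1.0],
    [1.0, 1.0, 0.0],
    [0.0, 1.0, 1.0],
    [1.0, 1.0, 1.0],
])

norms = np.linalg.norm(vectors, axis=1, keepdims=True)
sim = np.dot(vectors, vectors.T) / (norms * norms.T)  # cosine similarity matrix
np.fill_diagonal(sim, 0.0)                            # no self loops

G = networkx.Graph()
for i in range(len(vectors)):
    for j in range(i + 1, len(vectors)):
        G.add_edge(i, j, weight=sim[i, j])

scores = networkx.pagerank(G, weight="weight")
print(sorted(scores, key=scores.get, reverse=True))  # node indices, most central first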
Example 10: tokenize
# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def tokenize(self, delimiter=" "):
    # tokenize the stored text; 'delimiter' is an optional string (a single space by default)
    return Util.tokenize(self.text)