

Python Util.tokenize method code examples

This article collects typical usage examples of the Python method util.Util.tokenize. If you are wondering how Util.tokenize works, how to call it, or what real code that uses it looks like, the hand-picked examples below should help. You can also explore further usage examples of the containing class, util.Util.


The following shows 10 code examples of the Util.tokenize method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the site recommend better Python code examples.
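The util.Util class itself is not reproduced on this page, only its call sites. The examples below invoke it in two forms, Util.tokenize(text) and Util.tokenize(text, Summarizer.non_space), which suggests a static helper that splits text on a (possibly optional) compiled regular expression. The sketch below is only an assumption for readers who want to run the examples standalone; the project's real implementation may differ.

import re

class Util(object):
    """Hypothetical stand-in for the project's util.Util class."""

    @staticmethod
    def tokenize(text, pattern=re.compile(r"\S+")):
        # Return the list of non-overlapping matches of 'pattern' in 'text';
        # with the default pattern this is plain whitespace tokenization.
        return pattern.findall(text)

For instance, Util.tokenize("The cat sat.") would return ['The', 'cat', 'sat.'].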

Example 1: main

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
from collections import defaultdict
import pickle

def main(argv):
    """Compute the sentence frequency of each term"""

    # How many sentences does each word appear in?
    lexicon = defaultdict(lambda: set())

    for arg in argv:
        with open(arg, 'r') as fin:
            sentences = list()
            for line in fin:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences.append(sent)
                        sentence = []

            for sent in sentences:
                for w in sent.stemmed:
                    lexicon[w].add(sent) # set() will de-duplicate

        sf = {}
        for w in lexicon:
            sf[w] = len(lexicon[w])

        #print sf
        with open('sf.dat', 'wb') as out:
            pickle.dump(sf, out)
Author: ChenluJi, Project: cs224n-project, Lines: 32, Source file: sf.py
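To inspect the sentence-frequency table that sf.py pickles to sf.dat, a few lines of standard-library code are enough; the snippet below is a reader's helper, not part of the original project.

import pickle

with open('sf.dat', 'rb') as fin:
    sf = pickle.load(fin)

# print the ten terms that appear in the most sentences
for term, freq in sorted(sf.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(term, freq)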

Example 2: summarize

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
    def summarize(self, document_path):
        sentences = {}
        counter = 0

        with open(document_path, 'r') as f:
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences[sent] = (sent.tfidf(self.tf, self.df, Summarizer.NUM_DOCS), counter)
                        sentence = []
                        counter += 1

        totalWords = 0
        selected = []
        already_included = set()
        # Use the tf-idf score to sort the sentences
        for sent in sorted(sentences, key=lambda x: sentences[x][0], reverse=True):
            if sent not in already_included: # no duplicates
                already_included.add(sent)
                selected.append(sent)
                totalWords += sent.getLength()
                if totalWords > 100:
                    break

        # return the selected sentences in their order of appearance in the document
        return sorted(selected, key=lambda x: sentences[x][1])
Author: ChenluJi, Project: cs224n-project, Lines: 32, Source file: tfidf.py
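Neither Sentence.tfidf nor the Util.tfidf helper used in later examples is shown on this page. Judging from the arguments passed (a term-frequency map tf, a document-frequency map df, and a document count NUM_DOCS), a conventional definition would look like the sketch below; the exact smoothing and the way scores are aggregated per sentence are assumptions, not the project's code.

import math

def tfidf(term, tf, df, num_docs):
    # term frequency times smoothed inverse document frequency
    return tf.get(term, 0) * math.log(float(num_docs) / (1 + df.get(term, 0)))

def sentence_score(stemmed_terms, tf, df, num_docs):
    # one plausible sentence score: the average tf-idf of its stemmed terms
    if not stemmed_terms:
        return 0.0
    return sum(tfidf(t, tf, df, num_docs) for t in stemmed_terms) / len(stemmed_terms)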

Example 3: format

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
import re

def format(file):
  # 'file' is the path of the review file to be cleaned
  # open the review file
  f = open(file, "r")
  # begin reading
  # number of reviews; tracker
  number = 0
  for line in f:
    # only gets the actual review
    if "review/text" in line:
      # increments
      number += 1
      # strip the "review/text:" label, keeping only the review body
      temp = line.replace("review/text:", "")
      # turns the review into a tokenized list
      words = Util.tokenize(temp)
      # creates new file with incremented number
      fi = open(str(number) + ".txt", "w")
      # for each word in the review...
      for word in words:
        # only add words that match the following pattern (letters, apostrophes, hyphens)
        if (re.compile("^[a-zA-Z'-]+$")).match(word):
          # add the word to the file
          fi.write(word + " ")
      # close the file
      fi.close()
Author: tjkang01, Project: cs51-final, Lines: 28, Source file: cleaner.py

Example 4: summarize

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
    def summarize(self, document_path):
        allwords = {}
        sentences = []

        with open(document_path, 'r') as f:
            index = 0
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences.append(sent)
                        for t in sent.stemmed:
                            if t not in allwords:
                                allwords[t] = index
                                index += 1
                        sentence = []

        matrix = np.zeros((len(sentences), len(allwords)))
        for i, sent in enumerate(sentences):
            for t in sent.stemmed:
                matrix[i, allwords[t]] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)

        # Normalize
        normalizer = np.reshape(np.sum(matrix**2, axis=1)**0.5, (len(matrix), 1))
        matrix /= normalizer

        model = KMeans(n_clusters=Cluster.NUM_CLUSTERS, tol=1e-9)
        model.fit_predict(np.nan_to_num(matrix))
        labels = model.labels_

        totalWords = 0
        selected = []

        # From each cluster, pick the sentence that is nearest to the cluster
        # centroid
        for i in range(Cluster.NUM_CLUSTERS):
            member_indices = np.where(labels == i)
            # rows of 'matrix' are L2-normalized, so dot products with the centroid
            # are cosine similarities: the largest value marks the closest member
            similarities = np.dot(matrix[member_indices], model.cluster_centers_[i])
            closest_index = np.argmax(similarities, 0)
            # 'closest_index' is an index into the member_indices array
            member_index = member_indices[0][closest_index]
            selected.append((member_index, sentences[member_index]))  # stash the index of the sentence as well
            totalWords += sentences[member_index].getLength()
            if totalWords > 100:
                break

        # return the selected sentences in their order of appearance in the document
        return [s[1] for s in sorted(selected, key=lambda x: x[0])]
Author: ChenluJi, Project: cs224n-project, Lines: 53, Source file: cluster.py
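The selection rule in this example relies on the fact that the sentence vectors were L2-normalized, so the dot product with a cluster centroid behaves like a cosine similarity and the largest value marks the nearest member. A small self-contained check, with made-up numbers:

import numpy as np

rows = np.array([[1.0, 0.0],
                 [0.6, 0.8],
                 [0.0, 1.0]])      # three unit-length "sentence" vectors
centroid = np.array([0.8, 0.6])

similarities = rows.dot(centroid)  # [0.8, 0.96, 0.6]
print(np.argmax(similarities))     # -> 1, the row most aligned with the centroid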

Example 5: summarize

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
    def summarize(self, document_path):
        allwords = {}
        sentences = []

        with open(document_path, 'r') as f:
            index = 0
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences.append(sent)
                        for t in sent.stemmed:
                            if t not in allwords:
                                allwords[t] = index
                                index += 1
                        sentence = []

        matrix = np.zeros((len(allwords), len(sentences)))
        for i, sent in enumerate(sentences):
            for t in sent.stemmed:
                matrix[allwords[t], i] = Util.tfidf(t, self.tf, self.df, Summarizer.NUM_DOCS)

        U, sigma, V_T = np.linalg.svd(matrix, full_matrices=False) # V is already transposed

        # The rows of V_T correspond to 'independent topics', and the columns are the sentences.
        # For each topic, we pick the sentence that has the highest strength (value) in the row.
        max_cols = V_T.argmax(axis=1)

        already_included = set()
        totalWords = 0
        selected = []

        for i in max_cols:
            if i not in already_included:
                already_included.add(i)
                selected.append((i, sentences[i])) # stash the index of the sentence as well
                totalWords += sentences[i].getLength()
                if totalWords > 100:
                    break

        # return the selected sentences in their order of appearance in the document
        return [s[1] for s in sorted(selected, key=lambda x: x[0])]
Author: ChenluJi, Project: cs224n-project, Lines: 47, Source file: svd.py
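The topic-selection step can be seen in isolation on a tiny term-by-sentence matrix. The values below are illustrative only; the point is that each row of V_T (a latent topic) is reduced to the column index, i.e. the sentence, where that topic is strongest.

import numpy as np

matrix = np.array([[2.0, 0.0, 1.0],   # rows = terms, columns = sentences
                   [0.0, 3.0, 0.0],
                   [1.0, 0.0, 2.0]])

U, sigma, V_T = np.linalg.svd(matrix, full_matrices=False)
print(V_T.argmax(axis=1))             # one sentence index per topic row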

Example 6: unigrams

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
def unigrams(document_path):
    """Break sentences in a document into unigrams"""
    sentences = set()

    with open(document_path, 'r') as f:
        for line in f:
            s = Util.tokenize(line, Summarizer.non_space)
            sentence = []
            for w in s:
                sentence.append(w)
                if Summarizer.sentence_terminator.search(w):
                    sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                    sentences.add(sent)
                    sentence = []

    all_unigrams = set()
    for sentence in sentences:
        all_unigrams.update(sentence.stemmed)
    return all_unigrams
Author: ChenluJi, Project: cs224n-project, Lines: 23, Source file: evaluation.py
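The unigram sets built here (the source file is evaluation.py) are the usual ingredient of ROUGE-1-style comparisons between a system summary and a reference. The metric actually used in the project is not shown on this page, so the recall sketch below is only an assumption about how such sets might be consumed.

def unigram_recall(system_unigrams, reference_unigrams):
    # fraction of reference unigrams that the system summary also contains
    if not reference_unigrams:
        return 0.0
    return len(system_unigrams & reference_unigrams) / float(len(reference_unigrams))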

Example 7: analyze

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
 def analyze(self, filename=None, text=None):
   # analyze a new document using the stored values
   # if there is a filename given, create a new Document object
   if filename != None:
     doc = Document(None, filename)
     words = doc.tokenize()
   # otherwise, analyze the given text
   elif text != None:
     words = Util.tokenize(text)
   # if both are None, return error
   else:
     print "Analyzer requires a filename or text to analyze.  Please try again."
     return
   # store dict of log value sums
   log_sums = {}
   # for every heuristic...
   for key in self.log_values:
     # initialize a value to 0
     current_sum = 0.0
     # iterate over words
     for word in words:
       # default to 0.0 for words absent from this heuristic's table
       current_sum += self.log_values[key].get(word, 0.0)
     # store new sum
     log_sums[key] = current_sum
   
   # find the heuristic with the largest log sum (this could be folded into the
   # loop above, but a separate loop keeps it readable)
   # track the largest sum; log scores are usually negative, so start at -inf
   largest = float("-inf")
   # track the key of the best heuristic
   largest_heuristic = ""
   # iterate through all the keys
   for key in log_sums:
     # if the new value is larger...
     if log_sums[key] > largest:
       # update values
       largest = log_sums[key]
       largest_heuristic = key
   # return best key
   return largest_heuristic
Author: tjkang01, Project: cs51-final, Lines: 42, Source file: analyzer_d.py
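The scoring loop above sums per-word log values and returns the heuristic with the largest total, which is the standard log-space form of multiplying per-word scores (as in a naive-Bayes classifier). A tiny illustration with made-up log values:

import math

log_values = {
    "positive": {"great": math.log(0.6), "plot": math.log(0.3), "boring": math.log(0.1)},
    "negative": {"great": math.log(0.2), "plot": math.log(0.3), "boring": math.log(0.5)},
}
words = ["great", "plot", "boring"]

# sum the log scores per heuristic, treating unseen words as very unlikely
log_sums = {k: sum(v.get(w, math.log(1e-6)) for w in words) for k, v in log_values.items()}
print(max(log_sums, key=log_sums.get))   # -> "negative" for this made-up data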

Example 8: format

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
import re

def format(file, max_r):
  # review is the name of the file that will be cleaned
  # open the review file
  f = open(file, "r")
  # begin reading
  # number of reviews
  number1 = 0
  number2 = 0
  # initialize counters
  counter1 = -1
  counter2 = -1
  for line in f:
    # break if we already have all the reviews we want
    if number1 == max_r and number2 == max_r:
      break
    # read the review's score to decide whether to keep it
    if "review/score" in line:
      score = line.replace("review/score: ", "")
      score = float(score)
      score = int(score)
      if score == 1 : 
        counter1 = 3
      elif score == 5 : 
        counter2 = 3 
    elif counter1 == 0 and number1 < max_r:
      # strip the "review/text:" label, keeping only the review body
      temp = line.replace("review/text:", "")
      # turn the review into a tokenized list
      words = Util.tokenize(temp)
      # create a new output file named after the running count
      # (use 'fi' so the input handle 'f' is not shadowed)
      fi = open("negative/" + str(number1) + ".txt", "w")
      # increment the count of negative reviews written
      number1 += 1
      # write out the words
      for word in words:
        # only add words that match the following pattern (letters, apostrophes, hyphens)
        if (re.compile("^[a-zA-Z'-]+$")).match(word):
          fi.write(word + " ")
      fi.close()
    elif counter2 == 0 and number2 < max_r:
      # strip the "review/text:" label, keeping only the review body
      temp = line.replace("review/text:", "")
      # turn the review into a tokenized list
      words = Util.tokenize(temp)
      # create a new output file named after the running count
      # (use 'fi' so the input handle 'f' is not shadowed)
      fi = open("positive/" + str(number2) + ".txt", "w")
      # increment the count of positive reviews written
      number2 += 1
      # write out the words
      for word in words:
        # only add words that match the following pattern (letters, apostrophes, hyphens)
        if (re.compile("^[a-zA-Z'-]+$")).match(word):
          fi.write(word + " ")
      fi.close()
    # if either counter is nonnegative, decrement
    if counter1 > -1:
      counter1 -= 1
    if counter2 > -1:
      counter2 -= 1
  s = open("seed.txt", "w")
  s.write("positive\n" + str(number1) + "\n")
  s.write("negative\n" + str(number2))
  s.close()
Author: tjkang01, Project: cs51-final, Lines: 65, Source file: grouper.py

Example 9: summarize

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
    def summarize(self, document_path):
        """
        Compute the PageRank of each sentence based on edge weights that are
        derived from the cosine similarity between sentences.
        """
        sentences = {} # key=index, value=sentence

        with open(document_path, 'r') as f:
            index = 0
            for line in f:
                s = Util.tokenize(line, Summarizer.non_space)
                sentence = []
                for w in s:
                    sentence.append(w)
                    if Summarizer.sentence_terminator.search(w):
                        sent = Sentence(sentence, Summarizer.punctuation, Summarizer.stopwords, Summarizer.stemmer)
                        sentences[index] = sent
                        sentence = []
                        index += 1

        # Now that we have the sentences, we need a lexicon so that we can compute
        # the cosine similarity between sentences.
        #
        lexicon = {}
        with open( os.path.join(Summarizer.BASE_DIR, 'LEXICON'), 'r') as f:
            word_counter = 0
            for line in f:
                count, word = line.split()
                lexicon[word] = word_counter
                word_counter += 1

        # Multi-dimensional vectors representing sentences in the space of words
        S = np.zeros(len(sentences) * len(lexicon), dtype=np.dtype('Float64')).reshape(len(sentences), len(lexicon))
        for i in range(len(sentences)):
            for w in sentences[i].stemmed:
                S[i, lexicon[w]] = 1

        P = np.dot(S, S.transpose()) # Numerator of the cosine similarity expression

        # Now zero out the diagonal elements (corresponds to removing self loops)
        P -= np.diag(np.diag(P))

        # calculate the denominator of the cosine similarity expression (i.e. the
        # normalization factor): this is the product of the magnitudes of the two
        # vectors whose dot product makes up the numerator
        #
        # we start with calculating the square root of the sum of the squares of
        # each vector's components
        D = np.reshape( np.sum(S**2, axis=1)**0.5, (len(S), 1) )
        # Now we compute the product of the square roots of the different vectors
        DD = np.dot(D, D.transpose())
        # element-wise division to get the similarity scores
        P = np.divide(P, DD)

        # At this point, P is essentially a weighted Adjacency matrix

        G = networkx.DiGraph(np.nan_to_num(P)) # nan_to_num() replaces the NaNs produced by zero-magnitude sentence vectors with 0
        pagerank = networkx.pagerank_numpy(G)

        important = heapq.nlargest(100, pagerank, key=lambda x: pagerank[x])
        #print [(x, pagerank[x]) for x in important]
        totalWords = 0
        selected = []
        already_included = set()

        for i in important:
            if sentences[i] not in already_included: # no duplicates
                already_included.add(sentences[i])
                selected.append((i, sentences[i]))
                totalWords += sentences[i].getLength()
                if totalWords > 100:
                    break

        # return the selected sentences in their order of appearance in the document
        return [s[1] for s in sorted(selected, key=lambda x: x[0])]
Author: ChenluJi, Project: cs224n-project, Lines: 77, Source file: pagerank.py
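The same idea can be reproduced compactly on made-up sentence vectors: build a cosine-similarity graph and rank its nodes. networkx.pagerank is used here instead of the pagerank_numpy helper called above, since the latter was removed in newer networkx releases; everything else mirrors the steps in the example.

import numpy as np
import networkx

S = np.array([[1.0, 1.0, 0.0],        # three bag-of-words "sentence" vectors
              [1.0, 0.0, 1.0],
              [0.0, 1.0, 1.0]])
P = S.dot(S.T)
P -= np.diag(np.diag(P))              # remove self loops
norms = np.sqrt((S ** 2).sum(axis=1, keepdims=True))
P = P / norms.dot(norms.T)            # cosine similarities

G = networkx.DiGraph(P)
scores = networkx.pagerank(G)
print(sorted(scores, key=scores.get, reverse=True))   # node indices, most central first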

Example 10: tokenize

# Required import: from util import Util [as alias]
# Or: from util.Util import tokenize [as alias]
 def tokenize(self, delimiter=" "):
   # tokenize the stored text; 'delimiter' is a string but is currently unused,
   # since Util.tokenize is called without it and falls back to its own default
   return Util.tokenize(self.text)
Author: tjkang01, Project: cs51-final, Lines: 6, Source file: document.py


Note: the util.Util.tokenize examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The code snippets are taken from open-source projects contributed by their authors; copyright remains with the original authors, and any distribution or use should follow the license of the corresponding project. Do not reproduce without permission.