当前位置: 首页>>代码示例>>Python>>正文


Python tokenizer.tokenize函数代码示例

本文整理汇总了Python中spambayes.tokenizer.tokenize函数的典型用法代码示例。如果您正苦于以下问题:Python tokenize函数的具体用法?Python tokenize怎么用?Python tokenize使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了tokenize函数的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: test_dbm_export

 def test_dbm_export(self):
     """Exporting a dbm store must produce valid CSV holding all its data.

     Trains a DBDictClassifier on one spam and one ham message, exports
     it to CSV, then re-opens the original store and checks that every
     word (with its ham/spam counts) round-trips through the CSV file.
     """
     # Create a dbm classifier to export.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     # Stuff some messages in it so it's not empty.
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Export.
     sb_dbexpimp.runExport(TEMP_DBM_NAME, "dbm", TEMP_CSV_NAME)
     # Reopen the original.
     bayes = open_storage(TEMP_DBM_NAME, "dbm")
     # Verify that the CSV holds all the original data (and, by using
     # the CSV module to open it, that it is valid CSV data).
     fp = open(TEMP_CSV_NAME, "rb")
     try:
         reader = sb_dbexpimp.csv.reader(fp)
         # The first CSV row carries the global ham/spam message counts.
         (nham, nspam) = reader.next()
         self.assertEqual(int(nham), bayes.nham)
         self.assertEqual(int(nspam), bayes.nspam)
         for (word, hamcount, spamcount) in reader:
             word = sb_dbexpimp.uunquote(word)
             self.assert_(word in bayes._wordinfokeys())
             wi = bayes._wordinfoget(word)
             self.assertEqual(int(hamcount), wi.hamcount)
             self.assertEqual(int(spamcount), wi.spamcount)
     finally:
         # The original version leaked this handle; close it explicitly.
         fp.close()
开发者ID:bloggse,项目名称:spambayes-lite,代码行数:26,代码来源:test_sb_dbexpimp.py

示例2: _update

    def _update(self, folders, is_spam):
        """Learn (or unlearn) messages added to (removed from) folders.

        folders: iterable of folder objects whose read() returns the
            (added, removed) message mappings since the last scan.
        is_spam: True to treat the messages as spam, False as ham.
        Returns True if any folder reported additions or removals.

        NOTE(review): the commit()/commit(1) calls look like ZODB-style
        transactions; the ordering constraint documented inline below is
        deliberate -- do not reorder the commits.
        """
        changed = False
        for f in folders:
            log("update from %s" % f.path)
            added, removed = f.read()
            if added:
                log("added %d" % len(added))
            if removed:
                log("removed %d" % len(removed))
            # Commit the folder-scan bookkeeping itself before training.
            get_transaction().commit()
            if not (added or removed):
                continue
            changed = True

            # It's important not to commit a transaction until
            # after update_probabilities is called in update().
            # Otherwise some new entries will cause scoring to fail.
            for msg in added.keys():
                self.classifier.learn(tokenize(msg), is_spam)
            del added
            # commit(1) -- presumably a subtransaction commit, so the
            # training is flushed without becoming durable yet; confirm
            # against the transaction package in use.
            get_transaction().commit(1)
            log("learned")
            for msg in removed.keys():
                self.classifier.unlearn(tokenize(msg), is_spam)
            if removed:
                log("unlearned")
            del removed
            get_transaction().commit(1)
        return changed
开发者ID:Xodarap,项目名称:Eipi,代码行数:29,代码来源:profile.py

示例3: mapmessages

def _record_token(mapdb, token, mboxtype, f, msgid):
    # Record in the reverse map that `token` occurred in message `msgid`
    # of mailbox file `f`, on the ham or spam side according to mboxtype.
    ham, spam = mapdb.get(token, ({}, {}))
    if mboxtype == "ham":
        msgids = ham.get(f, set())
        msgids.add(msgid)
        ham[f] = msgids
    else:
        msgids = spam.get(f, set())
        msgids.add(msgid)
        spam[f] = msgids
    mapdb[token] = (ham, spam)

def mapmessages(f, mboxtype, mapdb):
    """Index every token of every message in mailbox file `f` into the
    reverse map `mapdb`.

    mboxtype: "ham" or "spam" -- selects which side of the map entry the
        message ids are recorded on.
    mapdb: mapping of token -> (ham, spam), where ham and spam map
        mailbox filename -> set of message ids.

    Messages without a Message-ID header are skipped.  When the
    x-use_bigrams option is on, the bigram-enhanced token stream is
    indexed as well.  Progress is written to stdout on one line.
    """
    i = 0
    for msg in getmbox(f):
        i += 1
        # One-line progress indicator (\r rewinds to column 0).
        sys.stdout.write('\r%s: %d' % (f, i))
        sys.stdout.flush()
        msgid = msg.get("message-id")
        if msgid is None:
            continue
        for t in tokenize(msg):
            _record_token(mapdb, t, mboxtype, f, msgid)
        if options["Classifier", "x-use_bigrams"]:
            for t in Classifier()._enhance_wordstream(tokenize(msg)):
                _record_token(mapdb, t, mboxtype, f, msgid)
    sys.stdout.write("\n")
开发者ID:ehuelsmann,项目名称:spambayes,代码行数:33,代码来源:mkreversemap.py

示例4: train_message

def train_message(msg, is_spam, cdata):
    """Train the classifier on a single message.

    Returns True if the message was newly added (untraining it first if
    it had previously been trained in the other category), or False if
    it was already trained in the requested category.  If the category
    changed and rescore is enabled elsewhere, a new score can then be
    written to the message.  Exceptions are the caller's problem.
    """
    from spambayes.tokenizer import tokenize

    if cdata.message_db.has_key(msg.searchkey):
        was_spam = cdata.message_db[msg.searchkey] == '1'
    else:
        was_spam = None
    if was_spam == is_spam:
        # Nothing to do: already trained in the requested category.
        return False

    # Either brand new (was_spam is None) or trained the wrong way.
    stream = msg.GetEmailPackageObject()
    if was_spam is not None:
        # Back out the old, incorrect training first.
        cdata.bayes.unlearn(tokenize(stream), was_spam)

    # Now learn the correct classification and persist the flag.
    cdata.bayes.learn(tokenize(stream), is_spam)
    cdata.message_db[msg.searchkey] = ['0', '1'][is_spam]
    cdata.dirty = True
    return True
开发者ID:Xodarap,项目名称:Eipi,代码行数:27,代码来源:train.py

示例5: runUIAndProxy

 def runUIAndProxy():
     """Start the web UI and POP3 proxy, pre-train the classifier, and
     run the Dibbler event loop (blocks until the loop is stopped)."""
     httpServer = UserInterfaceServer(8881)
     proxyUI = ProxyUserInterface(state, _recreateState)
     httpServer.register(proxyUI)
     # Proxy wired between local port 8111 and port 8110 -- presumably
     # (server_host, server_port, (iface, listen_port)); confirm against
     # BayesProxyListener's signature.
     BayesProxyListener('localhost', 8110, ('', 8111))
     # Seed the classifier so scoring has some data to work with.
     state.bayes.learn(tokenizer.tokenize(spam1), True)
     state.bayes.learn(tokenizer.tokenize(good1), False)
     # Signal the waiting test thread *before* blocking in the loop.
     proxyReady.set()
     Dibbler.run()
开发者ID:ArildF,项目名称:rogie,代码行数:9,代码来源:test_sb-server.py

示例6: test_untrain_spam

 def test_untrain_spam(self):
     """Untraining a spam message must fully remove its effect."""
     self.h.open('c')
     # Seed the classifier with the message we are about to remove.
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.untrain_spam(spam1)
     # Both message counters must be back to zero ...
     self.assertEqual(self.h.h.bayes.nham, 0)
     self.assertEqual(self.h.h.bayes.nspam, 0)
     # ... and no token of the message may remain in the word database.
     for tok in tokenize(spam1):
         self.assertEqual(self.h.h.bayes._wordinfoget(tok), None)
开发者ID:Xodarap,项目名称:Eipi,代码行数:11,代码来源:test_sb_filter.py

示例7: test_filter

 def test_filter(self):
     """filter() must add the classification header with the right value."""
     self.h.open('c')
     # Train one ham and one spam so scoring has something to go on.
     self.h.h.bayes.learn(tokenize(good1), False)
     self.h.h.bayes.learn(tokenize(spam1), True)
     self.h.h.store()
     header = options["Headers", "classification_header_name"]
     # The spam message must come back tagged with the spam string ...
     result = email.message_from_string(self.h.filter(spam1))
     self.assert_(result[header].startswith(
         options["Headers", "header_spam_string"]))
     # ... and the ham message with the ham string.
     result = email.message_from_string(self.h.filter(good1))
     self.assert_(result[header].startswith(
         options["Headers", "header_ham_string"]))

示例8: print_message_score

def print_message_score(msg_name, msg_fp):
    msg = email.message_from_file(msg_fp)
    bayes = CdbClassifier(open(DB_FILE, 'rb'))
    prob, evidence = bayes.spamprob(tokenize(msg), evidence=True)
    print msg_name, prob
    for word, prob in evidence:
        print '  ', repr(word), prob
开发者ID:ehuelsmann,项目名称:spambayes,代码行数:7,代码来源:sb_mailsort.py

示例9: test_merge_to_dbm

 def test_merge_to_dbm(self):
     """Importing a CSV into an existing dbm store must merge counts."""
     # Build a dbm classifier holding some training data.
     bayes = DBDictClassifier(TEMP_DBM_NAME)
     bayes.learn(tokenize(spam1), True)
     bayes.learn(tokenize(good1), False)
     # Snapshot its totals and per-word info for later comparison.
     original_nham = bayes.nham
     original_nspam = bayes.nspam
     original_data = {}
     for key in bayes._wordinfokeys():
         original_data[key] = bayes._wordinfoget(key)
     # Save & Close.
     bayes.store()
     bayes.close()
     # Write a small CSV database to merge in.
     nham, nspam = 3,4
     csv_data = {"this":(2,1), "is":(0,1), "a":(3,4), 'test':(1,1),
                 "of":(1,0), "the":(1,2), "import":(3,1)}
     temp = open(TEMP_CSV_NAME, "wb")
     temp.write("%d,%d\n" % (nham, nspam))
     for word, (ham, spam) in csv_data.items():
         temp.write("%s,%s,%s\n" % (word, ham, spam))
     temp.close()
     sb_dbexpimp.runImport(TEMP_DBM_NAME, "dbm", False, TEMP_CSV_NAME)
     # Reopen the store (which proves it is still a valid dbm file) and
     # check totals and per-word counts are the sums of both sources.
     bayes2 = open_storage(TEMP_DBM_NAME, "dbm")
     self.assertEqual(bayes2.nham, nham + original_nham)
     self.assertEqual(bayes2.nspam, nspam + original_nspam)
     words = original_data.keys()[:]
     words.extend(csv_data.keys())
     for word in words:
         word = sb_dbexpimp.uquote(word)
         self.assert_(word in bayes2._wordinfokeys())
         expected_ham, expected_spam = csv_data.get(word, (0,0))
         wi = original_data.get(word, None)
         if wi:
             expected_ham += wi.hamcount
             expected_spam += wi.spamcount
         wi2 = bayes2._wordinfoget(word)
         self.assertEqual(expected_ham, wi2.hamcount)
         self.assertEqual(expected_spam, wi2.spamcount)
开发者ID:bloggse,项目名称:spambayes-lite,代码行数:44,代码来源:test_sb_dbexpimp.py

示例10: test_train_spam

 def test_train_spam(self):
     """train_spam() must feed the message into the classifier."""
     self.h.open('c')
     self.h.train_spam(spam1)
     # Exactly one spam (and no ham) has been trained.
     self.assertEqual(self.h.h.bayes.nham, 0)
     self.assertEqual(self.h.h.bayes.nspam, 1)
     # Every token of the message is recorded as spam-only.
     for tok in tokenize(spam1):
         info = self.h.h.bayes._wordinfoget(tok)
         self.assertEqual(info.hamcount, 0)
         self.assertEqual(info.spamcount, 1)
开发者ID:Xodarap,项目名称:Eipi,代码行数:10,代码来源:test_sb_filter.py


注:本文中的spambayes.tokenizer.tokenize函数示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。