本文整理汇总了Python中preprocessor.Preprocessor.preprocessor_main方法的典型用法代码示例。如果您正苦于以下问题：Python Preprocessor.preprocessor_main方法的具体用法？Python Preprocessor.preprocessor_main怎么用？Python Preprocessor.preprocessor_main使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类preprocessor.Preprocessor的用法示例。
在下文中一共展示了Preprocessor.preprocessor_main方法的2个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: classify
# 需要导入模块: from preprocessor import Preprocessor [as 别名]
# 或者: from preprocessor.Preprocessor import preprocessor_main [as 别名]
def classify(self):
    """Run the crawl / Dig-search / TF-IDF / similarity pipeline for
    ``self.search_query`` and return the formatted classification.

    Stages, in order: Google search, crawled-URL extraction, test-data
    preprocessing, Dig search request, Dig extraction, training-data
    preprocessing (plus filtering of the Dig result), TF-IDF
    vectorisation, training similarity, test similarity, evaluation.
    The wall-clock duration of each stage is printed, followed by the
    total.

    Most stages are wrapped in ``try``/``except Exception``: the
    exception and a stage-specific message are printed and execution
    continues. The two Dig calls and the final evaluation are not
    guarded. NOTE(review): when a guarded stage fails before binding its
    result, later stages that read that name will raise ``NameError``.

    Returns:
        Whatever ``self.formatOutput`` returns for the result of
        ``Evaluation.compare_similarity``.
    """
    # Single-argument, parenthesised print is used throughout; it emits the
    # same text as the Python 2 print statement.
    def report_elapsed(prefix, began, ended):
        # Print "<prefix><seconds> secs" for one pipeline stage.
        print(prefix + str(ended - began) + " secs")

    started = time.time()

    # Stage 1: kick off the Google search for the query.
    try:
        web_search = Search(self.search_query)
        web_search.googleSearch()
    except Exception as err:
        print(err)
        print("Error in initializing Google search")
    searched = time.time()
    report_elapsed("Google search done in ", started, searched)

    # Stage 2: collect the URLs produced by the crawl.
    try:
        web_search.get_crawled_urls()
    except Exception as err:
        print(err)
        print("Error in extracting crawl data")
    crawled = time.time()
    report_elapsed("Test data extraction done in ", searched, crawled)

    # Stage 3: preprocess the crawled (test) URLs.
    try:
        test_prep = Preprocessor(web_search.all_urls)
        test_prep.preprocessor_main()
    except Exception as err:
        print(err)
        print("Error in preprocessing crawl data")
    test_ready = time.time()
    report_elapsed("Test data preprocessing done in ", crawled, test_ready)

    # Stage 4: query the Dig server (not guarded; errors propagate).
    dig = Dig_Search(self.search_query)
    dig.search_request()
    dig_searched = time.time()
    report_elapsed("Dig Search done in ", test_ready, dig_searched)

    # Stage 5: pull the results out of the Dig response (not guarded).
    dig.dig_extraction()
    dig_extracted = time.time()
    report_elapsed("Dig extraction done in ", dig_searched, dig_extracted)

    # Stage 6: preprocess the Dig (training) URLs and filter the Dig result.
    try:
        train_prep = Preprocessor(dig.urls_dig)
        train_prep.preprocessor_main()
        dig.filter_dig_result(train_prep.data)
    except Exception as err:
        print(err)
        print("Error in preprocessing training data")
    train_ready = time.time()
    report_elapsed(
        "Training data preprocessing done in ", dig_extracted, train_ready
    )

    # Stage 7: TF-IDF vectors for the training set, then the test set.
    try:
        vectorizer = Tfidf_Vectorize(dig.urls_dig)
        vectorizer.tfidf_vectorize_train()
        vectorizer.tfidf_vectorize_test(test_prep.data)
    except Exception as err:
        print(err)
        print("Error in computing tfidf vectorization")
    vectorized = time.time()
    report_elapsed("Tfidf computation done in ", train_ready, vectorized)

    # Stage 8: similarity of the training vectors against the centroid.
    try:
        train_scores = Similarity(
            vectorizer.tfidf_centroid_train,
            vectorizer.features_train,
            vectorizer.tfidf_train,
        ).similarity_main()
    except Exception as err:
        print(err)
        print("Error in computing cosine similarity")
    train_scored = time.time()
    report_elapsed(
        "Training data similarity computation done in ",
        vectorized,
        train_scored,
    )

    # Stage 9: similarity of the test vectors against the same centroid.
    try:
        test_scores = Similarity(
            vectorizer.tfidf_centroid_train,
            vectorizer.features_train,
            vectorizer.tfidf_test,
        ).similarity_main()
    except Exception as err:
        print(err)
        print("Error in computing cosine similarity")
    test_scored = time.time()
    report_elapsed("Similarity computation done in ", train_scored, test_scored)
    print("Total time = " + str(test_scored - started))

    # Stage 10: compare the two similarity results and format the outcome.
    evaluator = Evaluation(train_scores, test_scores)
    verdicts = evaluator.compare_similarity(test_prep)
    return self.formatOutput(verdicts)
示例2: classify
# 需要导入模块: from preprocessor import Preprocessor [as 别名]
# 或者: from preprocessor.Preprocessor import preprocessor_main [as 别名]
def classify(self) :
t1 = time.time()
# Schedule a crawl job with the query
try :
crawler = Search(self.search_query)
crawler.googleSearch()
except Exception as e :
print "Error in initializing Google search"
t2 = time.time()
print "Google search done in " + str(t2-t1) + " secs"
# Extract data crawled
try :
crawler.get_crawled_urls()
except Exception as e :
print "Error in extracting crawl data"
t3 = time.time()
print "Test data extraction done in " + str(t3-t2) + " secs"
# Preprocess test data
try :
preproc_test = Preprocessor(crawler.all_urls)
preproc_test.preprocessor_main()
except Exception as e :
print e
print "Error in preprocessing crawl data"
t4 = time.time()
print "Test data preprocessing done in " + str(t4-t3) + " secs"
# Send a search request to Dig server with the query
dig_search = Dig_Search(self.search_query)
dig_search.search_request()
t5 = time.time()
print "Dig Search done in " + str(t5-t4) + " secs"
# Extract results returned by search query
dig_search.dig_extraction()
t6 = time.time()
print "Dig extraction done in " + str(t6-t5) + " secs"
# Preprocess the search results
try :
preproc_train = Preprocessor(dig_search.urls_dig)
preproc_train.preprocessor_main()
dig_search.filter_dig_result(preproc_train.data)
except Exception as e :
print e
print "Error in preprocessing training data"
t7 = time.time()
print "Training data preprocessing done in " + str(t7-t6) + " secs"
# Compute tfidf vectors of data
try :
tfidf_train = Tfidf_Vectorize(dig_search.urls_dig)
tfidf_train.tfidf_vectorize_train()
tfidf_train.tfidf_vectorize_test(preproc_test.data)
except Exception as e :
print e
print "Error in computing tfidf vectorization"
t9 = time.time()
print "Tfidf computation done in " + str(t9-t7) + " secs"
# Compute similarity of training data with its centroid vector
try :
sim_train = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_train)
similarity_train = sim_train.similarity_main()
except Exception as e :
print e
print "Error in computing cosine similarity"
t10 = time.time()
print "Training data similarity computation done in " + str(t10-t9) + " secs"
# Compute similarity of test data with training data
try :
sim_test = Similarity(tfidf_train.tfidf_centroid_train, tfidf_train.features_train, tfidf_train.tfidf_test)
similarity_test = sim_test.similarity_main()
except Exception as e :
print e
print "Error in computing cosine similarity"
t11 = time.time()
print "Similarity computation done in " + str(t11-t10) + " secs"
print "Total time = " + str(t11-t1)
evaluator = Evaluation(similarity_train, similarity_test)
similarity_count = evaluator.compare_similarity(preproc_test)
avg_train_similarity = numpy.mean(similarity_train)
epsilon = 0.4 * avg_train_similarity
classifier_output = open("output/" + self.search_query.replace(' ','_') + "2.html","w")
urls_classified = []
#.........这里部分代码省略.........