本文整理汇总了Python中page.Page.read_tf_idf方法的典型用法代码示例。如果您正苦于以下问题：Python Page.read_tf_idf方法的具体用法？Python Page.read_tf_idf怎么用？Python Page.read_tf_idf使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类page.Page的用法示例。
在下文中一共展示了Page.read_tf_idf方法的1个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: __init__
# 需要导入模块: from page import Page [as 别名]
# 或者: from page.Page import read_tf_idf [as 别名]
def __init__(self, folder_path, dataset, date="Mar15", num_samples=None, mode="write", debug=True):
    """Initialise the page collection and load or compute its features.

    The feature folder is ``<root>/<date>/feature/<dataset>`` or, when
    ``num_samples`` is given, ``<root>/<date>/feature/<num_samples>/<dataset>``.
    It is created (including missing parents) if it does not exist yet.

    Args:
        folder_path: Location of the raw pages; handed to
            ``add_page_anchor`` (mode "c_baseline") or ``addPages``
            (default branch).
        dataset: Dataset name; last component of the feature folder and
            the argument passed to ``get_ground_truth``.
        date: Name of the top-level directory holding the feature folder.
        num_samples: When not None, adds an extra directory level to the
            feature folder. Stored unchanged on ``self.num_samples``.
        mode: "read" loads previously written feature files from the
            feature folder; "c_baseline" and "irobot" prepare the
            respective baselines; any other value (e.g. "write", "raw")
            computes the features from ``folder_path``.
        debug: When true, prints diagnostics and roots the feature folder
            at "./"; otherwise it is rooted at "../".

    Raises:
        IOError / OSError: A feature file cannot be opened or the feature
            folder cannot be created.
        IndexError: In "read" mode, when tf_idf.txt or log_tf_idf.txt has
            fewer lines than pages.txt, or an xpaths.txt line has no ":".
    """
    self.folder_path = folder_path
    self.dataset = dataset
    self.date = date
    # Single-argument print(...) yields the same text under the Python 2
    # print statement and also parses on Python 3.
    print("%s %s %s  creating allPages " % (folder_path, dataset, date))
    self.threshold = 4.0
    self.pages = []
    self.path_list = []
    self.category = []  # prediction
    self.xpaths_set = Set()
    self.ground_truth = []  # ground truth list for all pages
    self.idf = {}
    self.selected_df = {}
    self.df = {}
    self.features = []
    self.mode = mode
    self.num_samples = num_samples

    if debug:
        print("debug for pageCluster")
        print("num_samples %s %s" % (num_samples, type(num_samples)))
        root = "."
    else:
        root = ".."

    if num_samples is None:
        feat_folder = "{0}/{1}/feature/".format(root, date) + dataset
    else:
        feat_folder = "{0}/{1}/feature/{2}/".format(root, date, num_samples) + dataset
    print("%s feat folder" % (feat_folder,))
    # One recursive makedirs replaces the former nested os.mkdir calls,
    # which created "./..." while checking "../..." in the non-debug case.
    if not os.path.exists(feat_folder):
        os.makedirs(feat_folder)

    if mode == "read":
        def read_lines(name):
            # Read a whole feature file and close the handle right away.
            with open(feat_folder + "/" + name, "r") as handle:
                return handle.readlines()

        page_list = read_lines("pages.txt")
        tf_idf_lines = read_lines("tf_idf.txt")
        log_tf_idf_lines = read_lines("log_tf_idf.txt")
        features = read_lines("xpaths.txt")

        for i, entry in enumerate(page_list):
            # Each line is "<id>:<path>"; the path itself may contain ":".
            file_path = entry.strip().partition(":")[2]
            file_page = Page(file_path, mode="read")
            self.path_list.append(file_path)
            # The feature payload is whatever follows the last ":".
            file_page.read_tf_idf(tf_idf_lines[i].strip().split(":")[-1])
            file_page.read_log_tf_idf(log_tf_idf_lines[i].strip().split(":")[-1])
            self.pages.append(file_page)

        for line in features:
            # Each line is "<id>:<xpath>"; only the second field is kept.
            self.features.append(line.strip().split(":")[1])

        # NOTE(review): pickle.load must only be used on trusted files.
        # The file is opened in text mode as before; Python 3 would need
        # "rb" here - confirm how idf.txt is written before changing it.
        with open(feat_folder + "/idf.txt", "r") as idf_file:
            self.idf = pickle.load(idf_file)

        self.category = [0] * len(page_list)
        self.get_ground_truth(dataset)
        self.num = len(page_list)
    elif mode == "c_baseline":
        print("it is the baseline of v.crescenzi")
        self.add_page_anchor(folder_path)
        self.get_ground_truth(dataset)
    elif mode == "irobot":
        print("it is for the baseline irobot with partial tree alignment ")
        self.get_ground_truth(dataset)
    else:
        # Build every data structure from the raw pages, then derive the
        # idf / tf-idf statistics and the selected feature set.
        self.addPages(folder_path)
        self.expandXpaths()
        self.updateidf()
        self.num = len(self.pages)
        self.updatetfidf()
        self.selected_tfidf()
        self.get_ground_truth(dataset)
#.........这里部分代码省略.........