当前位置: 首页>>代码示例>>Python>>正文


Python Page.read_tf_idf方法代码示例

本文整理汇总了Python中page.Page.read_tf_idf方法的典型用法代码示例。如果您正苦于以下问题:Python Page.read_tf_idf方法的具体用法?Python Page.read_tf_idf怎么用?Python Page.read_tf_idf使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在page.Page的用法示例。


在下文中一共展示了Page.read_tf_idf方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: __init__

# 需要导入模块: from page import Page [as 别名]
# 或者: from page.Page import read_tf_idf [as 别名]
    def __init__(self, folder_path, dataset, date="Mar15", num_samples=None,
                 mode="write", debug=True):
        """Initialise the page collection for ``dataset``.

        Args:
            folder_path: Folder handed to ``addPages`` / ``add_page_anchor``
                when the collection is built from scratch.
            dataset: Dataset name; used as the last component of the feature
                folder and passed to ``get_ground_truth``.
            date: Name of the run folder that contains ``feature/``.
            num_samples: Optional sample count. When given, features live in
                ``<date>/feature/<num_samples>/<dataset>`` instead of
                ``<date>/feature/<dataset>``.
            mode: ``"read"`` reloads features previously stored in the feature
                folder, ``"c_baseline"`` and ``"irobot"`` set up the two
                baselines, and any other value (e.g. the default ``"write"``)
                computes the features from ``folder_path``.
            debug: When true the feature folder is resolved relative to ``./``
                and extra diagnostics are printed; otherwise it is resolved
                relative to ``../``.
        """
        self.folder_path = folder_path
        self.dataset = dataset
        self.date = date
        # Single-argument parenthesised prints emit the same text as the
        # original print statements while staying valid Python 3 syntax.
        print("%s %s %s  creating allPages " % (folder_path, dataset, date))
        self.threshold = 4.0
        self.pages = []
        self.path_list = []
        self.category = []  # prediction
        self.xpaths_set = Set()
        self.ground_truth = []  # ground truth list for all pages
        self.idf = {}
        self.selected_df = {}
        self.df = {}
        self.features = []
        self.mode = mode
        self.num_samples = num_samples

        if debug:
            print("debug for pageCluster")
            print("num_samples %s %s" % (num_samples, type(num_samples)))

        feat_folder = self._feature_folder(date, dataset, num_samples, debug)
        print("%s feat folder" % (feat_folder,))

        # makedirs creates every missing intermediate directory, so one call
        # covers both layouts and both roots. The previous mkdir ladder
        # created the parent under "./" even when the folder lived under
        # "../", and could create a stray "None" directory.
        if not os.path.exists(feat_folder):
            os.makedirs(feat_folder)

        if mode == "read":
            self._load_cached_features(feat_folder, dataset)
        elif mode == "c_baseline":
            print("it is the baseline of v.crescenzi")
            self.add_page_anchor(folder_path)
            self.get_ground_truth(dataset)
        elif mode == "irobot":
            print("it is for the baseline irobot with partial tree alignment ")
            self.get_ground_truth(dataset)
        else:
            # Build every feature from the raw pages. The calls run in this
            # fixed order: pages, xpaths, idf, tf-idf, selected tf-idf.
            self.addPages(folder_path)
            self.expandXpaths()
            self.updateidf()
            self.num = len(self.pages)
            self.updatetfidf()
            self.selected_tfidf()
            self.get_ground_truth(dataset)
        # NOTE(review): self.num is only assigned in the "read" and default
        # branches -- confirm the baseline modes never read it.

    @staticmethod
    def _feature_folder(date, dataset, num_samples, debug):
        """Return the feature folder path for the given run configuration."""
        root = "./" if debug else "../"
        folder = root + "{0}/feature/".format(date)
        if num_samples is not None:
            folder += "{0}/".format(num_samples)
        return folder + dataset

    def _load_cached_features(self, feat_folder, dataset):
        """Populate pages, features and idf from the files in ``feat_folder``."""

        def read_lines(file_name):
            # Context manager so the handle is closed even if reading fails.
            with open(feat_folder + file_name, "r") as handle:
                return handle.readlines()

        # Read everything up front so a missing file fails before any Page
        # object is created.
        page_list = read_lines("/pages.txt")
        tf_idf_lines = read_lines("/tf_idf.txt")
        log_tf_idf_lines = read_lines("/log_tf_idf.txt")
        feature_lines = read_lines("/xpaths.txt")
        # NOTE(review): pickle must only be used on trusted files. The file is
        # opened in text mode as before; Python 3 would need "rb" -- confirm
        # against the code that writes idf.txt before changing the mode.
        with open(feat_folder + "/idf.txt", "r") as idf_file:
            idf = pickle.load(idf_file)

        for index, page_line in enumerate(page_list):
            # Lines look like "<id>:<path>"; the path may itself contain ":",
            # so keep everything after the first separator.
            file_path = page_line.strip().partition(":")[2]
            file_page = Page(file_path, mode="read")
            self.path_list.append(file_path)

            # Row ``index`` of each feature file is applied to page ``index``;
            # only the text after the last ":" is the feature payload.
            file_page.read_tf_idf(tf_idf_lines[index].strip().split(":")[-1])
            file_page.read_log_tf_idf(
                log_tf_idf_lines[index].strip().split(":")[-1])

            self.pages.append(file_page)

        for feature_line in feature_lines:
            # Second ":"-separated field is stored as the xpath.
            self.features.append(feature_line.strip().split(":")[1])

        page_count = len(page_list)
        self.idf = idf
        self.category = [0] * page_count
        self.get_ground_truth(dataset)
        self.num = page_count
#.........这里部分代码省略.........
开发者ID:rivercold,项目名称:webStructure,代码行数:103,代码来源:pages.py


注:本文中的page.Page.read_tf_idf方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。