当前位置: 首页>>代码示例>>Python>>正文


Python DOM.by_attribute方法代码示例

本文整理汇总了Python中pattern.web.DOM.by_attribute方法的典型用法代码示例。如果您正苦于以下问题:Python DOM.by_attribute方法的具体用法?Python DOM.by_attribute怎么用?Python DOM.by_attribute使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pattern.web.DOM的用法示例。


在下文中一共展示了DOM.by_attribute方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1:

# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 不能单独导入; 用法: DOM(html).by_attribute(属性名="值")
                
                type = re.sub('\n|\d+.*|\(.*\)','', g.content.encode('ascii', 'ignore').strip('\r\n'))
                
                
                if ((type != ' \n') and not (re.match('^\s+', type))):
                    genre.append(type)
            
            
            genresStr = ';'.join(genre)
            
            
        #=======================================================================
        # Get the directors
        #=======================================================================
        directors = []
        for movie in movieDom.by_attribute(itemprop="director"):
            
            # Get rid of the html tags
            dir = re.sub('<[a-zA-Z\/][^>]*>','', movie.content.encode('ascii','ignore').lstrip('\r\n'))
            
            # Get rid of new line
            dirs = re.sub('\n', '', dir)
            
            # Directors for other movies have leading spaces - don't add them
            if not re.match('^\s+', dirs):
                directors.append(dirs)
        
        directorsStr = ';'.join(directors)


        #=======================================================================
开发者ID:goodspeedj,项目名称:csci-e64,代码行数:33,代码来源:complex_HTML.py

示例2: get_info

# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 不能单独导入; 用法: DOM(html).by_attribute(属性名="值")
def _cook_time_text(pdom, span_id):
    """Return the content of the element with id ``span_id``, or "0" if absent.

    ``<em>``/``</em>`` tags in the content are replaced by spaces so the
    leading number can later be isolated with ``split()``.
    """
    try:
        text = pdom.by_attribute(id=span_id)[0].content
        return text.replace("<em>", " ").replace("</em>", " ")
    except (IndexError, AttributeError):
        # No such span on this recipe page: treat the component as zero.
        return "0"


def get_info(baseurl, out_filename, npages=200):
    """Scrape recipe listing pages and write one row per recipe to a file.

    For every listing page, each element of class "rectitlediv" is turned
    into a recipe URL; the recipe page is downloaded and its rating,
    nutrient values, cook time and ingredients are written through
    ``writer.UnicodeWriter``.

    Args:
        baseurl: Listing URL; "?Page=<n>" is appended for each page.
        out_filename: Path of the output file (opened in "w" mode).
        npages: Exclusive upper bound of the page range; pages
            1 .. npages - 1 are fetched.

    Notes:
        * ``base`` and ``end`` (used to build each recipe URL), ``writer``,
          ``URL`` and ``DOM`` are not defined in this function and must be
          provided by the enclosing module.
        * Scraping is best-effort: when anything fails while processing a
          page, the rest of that page is skipped, a warning is written to
          stderr, and scraping continues with the next page.
    """
    # Function-scope imports keep this block self-contained.
    import posixpath
    import sys

    # The context manager guarantees the file is closed even if an
    # exception (e.g. KeyboardInterrupt) escapes the loop below.
    with open(out_filename, "w") as output:
        w = writer.UnicodeWriter(output)
        # TODO: fix this header -- it names 11 columns, but every data row
        # written below has 6 fields (all nutrient values share a single
        # tab-joined field).
        w.writerow(
            [
                "Title",
                "Rating",
                "Calories (kcal)",
                "Cholesterol (mg)",
                "Fat (g)",
                "Protein (g)",
                "Fiber (g)",
                "Sodium (mg)",
                "Cook Time",
                "Ingredients",
                "Full Ingredients",
            ]
        )

        for page in range(1, npages):
            try:
                url = URL(baseurl + "?Page=%d" % page)
                dom = DOM(url.download(cached=True))

                # goes through the recipes listed on a given page
                for link in dom.by_class("rectitlediv"):
                    # get the link name
                    title = link.content.split("/recipe/")[1].split("/detail")[0]
                    # download individual recipe; posixpath always joins with
                    # "/", which is what a URL needs on every platform
                    rpage = URL(posixpath.join(base, title, end))
                    pdom = DOM(rpage.download(cached=True))

                    # average rating value
                    rating = pdom.by_attribute(itemprop="ratingValue")[0].source.split('"')[3]

                    # nutrient values, tab-separated inside one field
                    nut_list = pdom.by_class("nutrSumWrap")[0].by_class("nutrSumList")
                    nuts = "\t".join(
                        nut.by_attribute(id="lblNutrientValue")[0].content
                        for nut in nut_list
                    )

                    # time needed to cook, in minutes
                    cook_hours = _cook_time_text(pdom, "cookHoursSpan")
                    cook_mins = _cook_time_text(pdom, "cookMinsSpan")
                    mins = str(int(cook_hours.split()[0]) * 60 + int(cook_mins.split()[0]))

                    # ingredients: each entry holds both the name and the amount
                    ing_units = []
                    ing_vals = []
                    for ing in pdom.by_attribute(itemprop="ingredients"):
                        # Skip spacer entries before touching their children,
                        # so a spacer without a name element cannot abort the
                        # whole page.
                        if "&nbsp;" in ing.content:
                            continue
                        tmp_ing = ing.by_id("lblIngName").content
                        try:
                            tmp_amount = ing.by_id("lblIngAmount").content
                        except AttributeError:
                            # presumably by_id() returned None (no amount
                            # element) -- verify against pattern.web
                            tmp_amount = ""  # LET THIS BE THE EMPTY CHAR we decide on
                        ing_units.append(tmp_amount)
                        ing_vals.append(tmp_ing)
                    ings = ";".join(ing_vals)

                    # Build "amount| name" pairs by stringifying tuples and
                    # stripping the tuple/repr punctuation afterwards.
                    ing_units = [x + "|" for x in ing_units]
                    str_ings = [str(x) for x in zip(ing_units, ing_vals)]
                    str_ings = [x.replace(",", " ") for x in str_ings]
                    full_ings = ";".join(str_ings)
                    full_ings = (
                        full_ings.replace("u'", "")
                        .replace("'", "")
                        .replace(", u", "")
                        .replace("(", "")
                        .replace(")", "")
                        .replace("  ", " ")
                    )

                    w.writerow([title, rating, nuts, mins, ings, full_ings])

            except Exception as exc:
                # Deliberately best-effort: skip the rest of this page, but
                # say so instead of failing silently.
                sys.stderr.write("get_info: skipping page %d: %r\n" % (page, exc))
开发者ID:dicai,项目名称:datavis,代码行数:101,代码来源:scrape.py

示例3: get_title_attributes

# 需要导入模块: from pattern.web import DOM [as 别名]
# 注意: by_attribute 是 DOM 对象的方法, 不能单独导入; 用法: DOM(html).by_attribute(属性名="值")
def get_title_attributes(title, titleLink):

    url = URL(titleLink)
    dom = DOM(url.download(cached=True))
    titleObj = Title(title.encode('ascii','replace'))

    print "Movie: ", title

    # Get Directors
    print "-> About to print directors... "

    directors = dom.by_attribute(itemprop="director")[0]
    directorNames =  directors.by_tag("a")


    for director in directorNames:
        print director.content

        dirName  = unicodedata.normalize('NFD', director.content).encode('ascii','replace')
        #str(director.content).encode("utf-8")
        print "Director ===> ", dirName

        titleObj.addDirectors( dirName )

    # Get writers
    print "-> About to print writers... "

    try:
        writers = dom.by_attribute(itemprop="writer")
        for writer in writers:
            # print writer[1][1].content
            titleObj.addWriters( str(writer[1][1].content).encode('ascii', 'replace'))
    except:
        pass



    print "--> About to get actors... "
    try:
        actors = dom.by_attribute(itemprop="actors" )
        for actor in actors:
            # print actor[1][1].content
            titleObj.addActors( str(actor[1][1].content).encode('ascii', 'replace'))
    except:
        pass


    print "--> Aboutb to get rating information... "


    try:
        ratingsInfo = dom.by_class("star-box-giga-star")

        for rating in ratingsInfo:
            # print rating.content
            titleObj.addRating(str(rating.content).encode('ascii', 'replace'))
    except:
        pass


    print "--> About to print other stuff...  "



    for item in dom.by_class("infobar"):

        try:
            objMatch = re.search("(\d+)", item.by_tag("time")[0].content )

            if objMatch:
                # print objMatch.group(1)
                titleObj.addRunTime( str(objMatch.group(1)).encode('ascii', 'replace'))
        except:
            pass



        for genreItem in item.by_tag("a"):

            try:
                objMatch = re.search("genre", genreItem.attributes['href'] )

                if objMatch:
                    titleObj.addGenre(str(genreItem.content).encode('ascii', 'replace'))
                    # print genreItem.attributes['href']
                    # print genreItem.content
            except:
                pass


    return  titleObj
开发者ID:aeggermont,项目名称:cs171,代码行数:93,代码来源:complex_HTML.py


注:本文中的pattern.web.DOM.by_attribute方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。