当前位置: 首页>>代码示例>>Python>>正文


Python PyQuery.make_links_absolute方法代码示例

本文整理汇总了Python中pyquery.PyQuery.make_links_absolute方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.make_links_absolute方法的具体用法?Python PyQuery.make_links_absolute怎么用?Python PyQuery.make_links_absolute使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在pyquery.PyQuery的用法示例。


在下文中一共展示了PyQuery.make_links_absolute方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: list_page

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
 def list_page(self, response):
     """Parse a Weibo search list page and schedule one detail crawl per post.

     Extracts the pagelet JSON blobs embedded in the raw HTML as
     ``STK.pageletM.view({...})`` calls, keeps the one whose pid is
     "pl_weibo_direct", then scrapes every feed card and schedules
     ``self.detail_page`` with the scraped fields attached.

     :param response: crawl response; ``.content`` is the raw HTML text and
         ``.url`` the page URL used to absolutize links.
     :return: ``{}`` when no matching pagelet is found, otherwise ``None``.
     """
     result_content = {}

     content_iter = re.finditer(r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)", response.content)
     # Renamed loop variable from `iter` to avoid shadowing the builtin.
     for match in content_iter:
         ok, content = safe_loads(match.groupdict()['content'])
         if ok and "pl_weibo_direct" == content.get("pid"):
             result_content = content
             break
     else:
         # for/else: no pagelet matched -> nothing to parse on this page.
         return {}

     pyquery_doc = PyQuery(result_content["html"])
     pyquery_doc.make_links_absolute(response.url)

     # (Removed unused `items = []` local from the original.)
     for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items():
         weibo_href = item("DIV.content>DIV.feed_from>A").attr.href
         if weibo_href:
             # Collect every image URL attached to the post.
             weibo_pics = [pic.attr.src for pic in item("DIV.feed_content DIV.media_box IMG").items()]

             data = {
                 "content": item("DIV.feed_content P.comment_txt").text(),
                 "nickname": item("DIV.feed_content A.W_texta").attr.title,
                 "href": weibo_href,
                 "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title,
                 "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(),
                 "pics": ''.join(weibo_pics)
             }
             self.crawl("data:,%s" % weibo_href, callback = self.detail_page, data_fetch_content=data)
开发者ID:jttoday,项目名称:spider,代码行数:34,代码来源:weibo_weixin.py

示例2: download

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def download(threadUrl):
    """Download every attachment linked from a forum thread page.

    Fetches *threadUrl*, finds all attachment links
    (``job.php?action=download&aid=...``), extracts the anti-leech
    ``verifyhash`` token from an inline script, and for each attachment not
    already present under ``SAVE_PATH`` resolves a mirror URL and hands it to
    IDM via ``addToIDM``. Finally prints any wefiler URLs found by
    ``checkWefiler``.

    NOTE(review): relies on module-level ``SAVE_PATH``, ``BASE_URL``,
    ``headers``, ``addToIDM`` and ``checkWefiler`` defined elsewhere in
    this file.
    """
    d = PyQuery(url=threadUrl, parser='soup')
    links = d('a[href^="job.php?action=download&aid="]')

    # Extract the value of `verifyhash` from the inline <script>.
    tmp = d('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", tmp).group(1)

    total = len(links)
    d.make_links_absolute()
    for i, e in enumerate(links.items(), start=1):
        filename = e.text()
        print('%s/%s %s' % (i, total, filename))

        if not os.path.exists(os.path.join(SAVE_PATH, filename)):
            params = urlencode(
                {'check': 1, 'verify': verify, 'nowtime': int(time.time() * 1000)})
            url = '%s?%s' % (e.attr['href'], params)

            print('  fetch: ' + url)
            downDoc = PyQuery(url, headers=headers)
            # Index 0 is the China Telecom mirror, index 1 the China Mobile mirror.
            downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
            addToIDM(downUrl, SAVE_PATH, filename)
            time.sleep(1.5)

    wefiler_urls = checkWefiler(d)
    if wefiler_urls:
        print(wefiler_urls)
开发者ID:fishlee,项目名称:DownloadHelpers,代码行数:33,代码来源:weiphone.py

示例3: SegmentfaultTagSpider

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
class SegmentfaultTagSpider(object):
    """Scrapes the newest questions for a single Segmentfault tag, one page
    at a time."""

    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        """Lazily fetched, parsed and cached document for this page."""
        if not self._dom:
            response = requests.get(self.url)
            response.encoding = 'utf-8'
            parsed = PyQuery(response.text)
            # Turn relative links into absolute ones so hrefs are usable as-is.
            parsed.make_links_absolute(base_url="http://segmentfault.com/")
            self._dom = parsed
        return self._dom

    @property
    def questions(self):
        """Hrefs of every question title on the current page."""
        hrefs = []
        for anchor in self.dom('h2.title > a').items():
            hrefs.append(anchor.attr('href'))
        return hrefs

    @property
    def has_next_page(self):
        """Whether the pagination bar shows a "next" entry."""
        next_marker = self.dom('ul.pagination > li.next')
        return bool(next_marker)

    def next_page(self):
        """Re-initialise this spider on the following page, if there is one."""
        if not self.has_next_page:
            return None
        self.__init__(tag_name=self.tag_name, page=self.page + 1)
开发者ID:huaxinscnu,项目名称:django_web,代码行数:33,代码来源:spider.py

示例4: scrape_page

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
async def scrape_page(session, url):
    """Fetch *url* through the aiohttp *session* and scrape the company table.

    :return: list of dicts with name/phone/url/details_url per table row.
    """
    async with session.get(url) as resp:
        content = await resp.text()

    print('parsing url: {}'.format(url))
    doc = PyQuery(content)
    doc.make_links_absolute(base_url=url)

    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    companies = []
    # Skip the header row (tr:gt(0)).
    for row in table.items('tr:gt(0)'):
        cells = row('td')
        companies.append({
            'name': cells.eq(0).text(),
            'phone': cells.eq(1).text(),
            'url': cells.eq(2)('a').attr('href'),
            'details_url': cells.eq(0)('a').attr('href'),
        })

    return companies
开发者ID:DerbyPy,项目名称:2016-screen-scraping,代码行数:29,代码来源:step-6.py

示例5: get_urls

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls(base_url, exclude=frozenset()):
    """Collect crawlable URLs from a peterbecom-style blog.

    Gathers every post link from ``/plog/``, then every same-site link from
    the homepage (skipping ``.html``/``.png`` assets), then the paging URLs
    ``/p2`` .. ``/p9``.

    :param base_url: site root, with or without a trailing slash.
    :param exclude: URLs to skip (read-only membership tests only, so the
        immutable default is safe; was a mutable ``set()`` default).
    :return: list of URLs. NOTE(review): homepage links are appended three
        times each — looks like a deliberate weighting/warm-up trick for the
        downloader; confirm before deduplicating.
    """
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]

    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)

    doc = PyQuery(base_url + "/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("a"):
        try:
            href = a.attrib["href"]
        except KeyError:
            # BUGFIX: was `pass`, which fell through and re-tested the
            # previous iteration's href (NameError on the very first anchor).
            continue
        if not href.startswith(base_url):
            continue
        if href.endswith(".html") or href.endswith(".png"):
            continue
        if href not in urls and href not in exclude:
            urls.append(href)
            urls.append(href)
            urls.append(href)

    url_start = base_url + "/p"
    for i in range(2, 10):
        url = url_start + str(i)
        if url in exclude:
            continue
        urls.append(url)
    return urls
开发者ID:peterbe,项目名称:django-peterbecom,代码行数:37,代码来源:download-all-plogs.py

示例6: doc

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def doc(rsp):
    """Returns a PyQuery object of a request's content"""
    # Parse with an explicit encoding derived from the response headers/body.
    html_parser = lxml.html.HTMLParser(encoding=encoding(rsp))
    root = lxml.html.fromstring(rsp.content, parser=html_parser)
    # fromstring may hand back a whole tree; unwrap it to the root element.
    if isinstance(root, lxml.etree._ElementTree):
        root = root.getroot()
    pq_doc = PyQuery(root)
    pq_doc.make_links_absolute(rsp.url)
    return pq_doc
开发者ID:zymtech,项目名称:parse_newspage,代码行数:11,代码来源:parserstandalone.py

示例7: dom

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
 def dom(self):
     """Lazily fetch ``self.url``, parse it, and cache the resulting DOM."""
     if not self._dom:
         response = requests.get(self.url)
         response.encoding = self.encoding
         parsed = Pq(response.text)
         if self.absolute_link:
             try:
                 parsed.make_links_absolute(base_url=self.base_url)
             except ValueError:
                 # pyquery raises ValueError when base_url is missing/empty.
                 raise ValueError('When absolute_link is enabled, a base_url must be specified')
         self._dom = parsed
     return self._dom
开发者ID:ericls,项目名称:gextractor,代码行数:14,代码来源:base.py

示例8: get_urls

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls(base_url, top_urls, exclude=frozenset()):
    """Return up to *top_urls* blog-post links from ``base_url + "/plog/"``.

    :param base_url: site root; a trailing slash is stripped.
    :param top_urls: maximum number of links to collect.
    :param exclude: links to skip (read-only membership tests only, so the
        immutable default is safe; was a mutable ``set()`` default).
    :return: list of absolute hrefs in page order.
    """
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) >= top_urls:
            break

    return urls
开发者ID:peterbe,项目名称:django-peterbecom,代码行数:17,代码来源:cdn-crawler.py

示例9: collect_variable_listing_sources

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def collect_variable_listing_sources(data_source, output_dir, verbose):
    """Walk every A-Z variable listing for *data_source* and save each page.

    Follows the ``a.next_page`` link until a letter's listing is exhausted,
    writing each fetched page to *output_dir* via ``save_source``.
    """
    for letter in string.ascii_uppercase:
        page_index = 0
        url = variable_listing_url(data_source, letter)
        while url:
            if verbose: print("\tFetching: %s" % url)
            src = requests.get(url).text
            save_source(src, output_dir, letter, page_index)
            doc = PyQuery(src, parser='html')
            doc.make_links_absolute("https://%s.ipums.org/" % data_source)
            next_link = doc('a.next_page')
            if not next_link:
                url = None
            else:
                url = next_link.attr['href']
                page_index += 1
开发者ID:americanist,项目名称:IPUMS_Codebooks,代码行数:17,代码来源:fetch_var_list_pages.py

示例10: get_all_links

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_all_links():
    """Return the href of every speech link, caching the result in '.links'.

    On first run scrapes every listing page and pickles the collected hrefs
    into a local ``.links`` file; subsequent runs load straight from that
    cache.

    NOTE(review): relies on module-level ``pages`` and ``headers`` defined
    elsewhere in this file.
    """
    try:
        # BUGFIX: the original leaked both file handles and used text mode;
        # pickle requires binary files on Python 3.
        with open('.links', 'rb') as cache:
            return pickle.load(cache)
    except IOError:
        URL_BASE = "http://www.casarosada.gob.ar/informacion/discursos?start={}"
        links = []
        for start in pages:
            url = URL_BASE.format(start)
            logging.info('Descargando links desde {}'.format(url))
            pq = PyQuery(url=url, headers=headers)
            pq.make_links_absolute()
            page_links = pq('div.category-item-title a')

            links.extend(list(reversed(page_links)))
        links = [pq(a).attr('href') for a in links]
        with open('.links', 'wb') as cache:
            pickle.dump(links, cache)
        return links
开发者ID:leomartinez,项目名称:discursos_cfk,代码行数:20,代码来源:scrapper.py

示例11: scrape_page

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def scrape_page(url):
    """Yield one company dict per data row of the Rigzone listing at *url*."""
    print('getting url: {}'.format(url))
    doc = PyQuery(url)
    doc.make_links_absolute()

    table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

    # Skip the header row (tr:gt(0)).
    for row in table.items('tr:gt(0)'):
        cells = row('td')
        yield {
            'name': cells.eq(0).text(),
            'phone': cells.eq(1).text(),
            'url': cells.eq(2)('a').attr('href'),
            'details_url': cells.eq(0)('a').attr('href'),
        }
开发者ID:DerbyPy,项目名称:2016-screen-scraping,代码行数:22,代码来源:step-5.py

示例12: PyQuery

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
from pyquery import PyQuery


# Fetch the "A" company listing and make every href absolute.
doc = PyQuery('https://www.rigzone.com/search/alpha/a/')
doc.make_links_absolute()

table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')

# Inspect only the first data row (tr:gt(0) skips the header), then stop.
for row in table.items('tr:gt(0)'):
    cells = row('td')

    details_url = cells.eq(0)('a').attr('href')
    company_name = cells.eq(0).text()
    company_phone = cells.eq(1).text()
    company_url = cells.eq(2)('a').attr('href')

    print(company_name, company_phone, company_url, details_url)
    break
开发者ID:DerbyPy,项目名称:2016-screen-scraping,代码行数:22,代码来源:step-3.py

示例13: get_urls_from_podcast

# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls_from_podcast(url, verbose=False):
    """given the url to a podcast, return the list of urls to each audiocut"""
    page = PyQuery(url)
    page.make_links_absolute()
    hrefs = []
    for anchor in page('.cut_brief h4 a'):
        hrefs.append(PyQuery(anchor).attr('href'))
    return hrefs
开发者ID:mgaitan,项目名称:radiocut_downloader,代码行数:7,代码来源:__init__.py


注:本文中的pyquery.PyQuery.make_links_absolute方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。