本文整理汇总了Python中pyquery.PyQuery.make_links_absolute方法的典型用法代码示例。如果您正苦于以下问题:Python PyQuery.make_links_absolute方法的具体用法?Python PyQuery.make_links_absolute怎么用?Python PyQuery.make_links_absolute使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pyquery.PyQuery
的用法示例。
在下文中一共展示了PyQuery.make_links_absolute方法的13个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: list_page
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def list_page(self, response):
    """Parse a Weibo search result page delivered as STK pagelet JSON.

    Scans the raw response body for ``STK.pageletM.view({...})`` calls,
    picks the pagelet whose ``pid`` is ``"pl_weibo_direct"``, and
    schedules a detail crawl for every feed item that carries a link.
    Returns {} when no usable pagelet is present.
    """
    result_content = {}
    content_iter = re.finditer(
        r"STK && STK.pageletM && STK.pageletM.view\((?P<content>\{.*?\})\)",
        response.content)
    # Renamed loop variable: the original used `iter`, shadowing the builtin.
    for match in content_iter:
        ok, content = safe_loads(match.groupdict()['content'])
        if ok and "pl_weibo_direct" == content.get("pid"):
            result_content = content
            break
    else:
        # for/else: the loop finished without `break`, i.e. no pagelet matched.
        return {}
    pyquery_doc = PyQuery(result_content["html"])
    pyquery_doc.make_links_absolute(response.url)
    # (Removed unused local `items = []` from the original.)
    for item in pyquery_doc("DIV.feed_lists>DIV.WB_cardwrap>DIV").items():
        weibo_href = item("DIV.content>DIV.feed_from>A").attr.href
        if weibo_href:
            weibo_pics = []
            for pic in item("DIV.feed_content DIV.media_box IMG").items():
                weibo_pics.append(pic.attr.src)
            data = {
                "content": item("DIV.feed_content P.comment_txt").text(),
                "nickname": item("DIV.feed_content A.W_texta").attr.title,
                "href": weibo_href,
                "quote_nickname": item("DIV.feed_content DIV.comment DIV.comment_info A.W_texta").attr.title,
                "quote_content": item("DIV.feed_content DIV.comment DIV.comment_info P.comment_txt").text(),
                # NOTE(review): pics are joined with no separator — confirm intended.
                "pics": ''.join(weibo_pics)
            }
            self.crawl("data:,%s" % weibo_href,
                       callback=self.detail_page,
                       data_fetch_content=data)
示例2: download
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def download(threadUrl):
    """Fetch a forum thread page and queue each attachment for download via IDM."""
    page = PyQuery(url=threadUrl, parser='soup')
    attachment_links = page('a[href^="job.php?action=download&aid="]')
    # Pull the anti-leech token out of the inline <script> block.
    script_text = page('script:contains("var verifyhash =")').text()
    verify = re.search(r"var verifyhash = '(.*?)'", script_text).group(1)
    total = len(attachment_links)
    page.make_links_absolute()
    for index, link in enumerate(attachment_links.items(), start=1):
        filename = link.text()
        print('%s/%s %s' % (index, total, filename))
        if os.path.exists(os.path.join(SAVE_PATH, filename)):
            # Already saved locally — nothing to do for this attachment.
            continue
        query = urlencode({
            'check': 1,
            'verify': verify,
            'nowtime': int(time.time() * 1000),
        })
        url = '%s?%s' % (link.attr['href'], query)
        print(' fetch: ' + url)
        downDoc = PyQuery(url, headers=headers)
        # Mirror 0 is the China-Telecom download point, mirror 1 China-Mobile.
        downUrl = BASE_URL + downDoc('a[href^="remotedown.php"]').eq(1).attr('href')
        addToIDM(downUrl, SAVE_PATH, filename)
        time.sleep(1.5)
    wefiler_urls = checkWefiler(page)
    if wefiler_urls:
        print(wefiler_urls)
示例3: SegmentfaultTagSpider
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
class SegmentfaultTagSpider(object):
    """Crawl the newest questions for one SegmentFault tag, page by page."""

    def __init__(self, tag_name, page=1):
        self.url = 'http://segmentfault.com/t/%s?type=newest&page=%s' % (tag_name, page)
        self.tag_name = tag_name
        self.page = page
        self._dom = None

    @property
    def dom(self):
        # Fetch lazily and cache; subsequent reads reuse the parsed document.
        if not self._dom:
            response = requests.get(self.url)
            response.encoding = 'utf-8'
            self._dom = PyQuery(response.text)
            # Rewrite relative hrefs so question links are usable as-is.
            self._dom.make_links_absolute(base_url="http://segmentfault.com/")
        return self._dom

    @property
    def questions(self):
        """Absolute URLs of every question listed on the current page."""
        return [anchor.attr('href') for anchor in self.dom('h2.title > a').items()]

    @property
    def has_next_page(self):
        # The pagination bar renders a "next" item only when more pages exist.
        return bool(self.dom('ul.pagination > li.next'))

    def next_page(self):
        """Re-initialise this spider in place so it points at the next page."""
        if self.has_next_page:
            self.__init__(tag_name=self.tag_name, page=self.page + 1)
        else:
            return None
示例4: scrape_page
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
async def scrape_page(session, url):
    """Fetch one directory page and return the companies listed on it."""
    async with session.get(url) as resp:
        html = await resp.text()
        print('parsing url: {}'.format(url))
        document = PyQuery(html)
        document.make_links_absolute(base_url=url)
        listing = document('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
        companies = []
        # tr:gt(0) skips the header row; each data row has name/phone/site cells.
        for entry in listing.items('tr:gt(0)'):
            cells = entry('td')
            companies.append({
                'name': cells.eq(0).text(),
                'phone': cells.eq(1).text(),
                'url': cells.eq(2)('a').attr('href'),
                'details_url': cells.eq(0)('a').attr('href'),
            })
        return companies
示例5: get_urls
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls(base_url, exclude=None):
    """Collect crawlable URLs from a site's /plog/ index and front page.

    Args:
        base_url: site root, with or without a trailing slash.
        exclude: optional collection of URLs to leave out.

    Returns:
        A list of unique URLs: every /plog/ entry, every same-site link on
        the front page (skipping .html/.png assets), plus the /p2 .. /p9
        pagination URLs.
    """
    # Avoid the mutable-default-argument pitfall of the original `exclude=set()`.
    if exclude is None:
        exclude = set()
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
    doc = PyQuery(base_url + "/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("a"):
        try:
            href = a.attrib["href"]
        except KeyError:
            # BUG FIX: was `pass`, which fell through and re-processed the href
            # from the previous iteration (or raised NameError on the first one).
            continue
        if not href.startswith(base_url):
            continue
        if href.endswith(".html") or href.endswith(".png"):
            continue
        if href not in urls and href not in exclude:
            # BUG FIX: the same href was appended three times here, defeating
            # the `href not in urls` uniqueness check above.
            urls.append(href)
    url_start = base_url + "/p"
    for i in range(2, 10):
        url = url_start + str(i)
        if url in exclude:
            continue
        urls.append(url)
    return urls
示例6: doc
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def doc(rsp):
    """Build a PyQuery document from an HTTP response.

    Parses rsp.content with the response's detected encoding and rewrites
    relative links to absolute ones based on rsp.url.
    """
    html_parser = lxml.html.HTMLParser(encoding=encoding(rsp))
    tree = lxml.html.fromstring(rsp.content, parser=html_parser)
    # fromstring may hand back a whole tree; PyQuery wants the root element.
    if isinstance(tree, lxml.etree._ElementTree):
        tree = tree.getroot()
    document = PyQuery(tree)
    document.make_links_absolute(rsp.url)
    return document
示例7: dom
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def dom(self):
    """Lazily fetch ``self.url`` and cache the parsed PyQuery document."""
    if not self._dom:
        response = requests.get(self.url)
        response.encoding = self.encoding
        fresh_dom = Pq(response.text)
        if self.absolute_link:
            try:
                fresh_dom.make_links_absolute(base_url=self.base_url)
            except ValueError:
                # PyQuery raises ValueError when no base_url is available;
                # re-raise with a message pointing at the configuration flag.
                raise ValueError('When absolute_link is enabled, a base_url must be specified')
        self._dom = fresh_dom
    return self._dom
示例8: get_urls
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls(base_url, top_urls, exclude=None):
    """Return up to ``top_urls`` entry URLs from a site's /plog/ index.

    Args:
        base_url: site root, with or without a trailing slash.
        top_urls: maximum number of URLs to collect.
        exclude: optional collection of URLs to skip.

    Returns:
        A list of at most ``top_urls`` hrefs taken from the /plog/ page.
    """
    # Avoid the mutable-default-argument pitfall of the original `exclude=set()`.
    if exclude is None:
        exclude = set()
    urls = []
    if base_url.endswith("/"):
        base_url = base_url[:-1]
    doc = PyQuery(base_url + "/plog/")
    doc.make_links_absolute(base_url=base_url)
    for a in doc("dd a"):
        href = a.attrib["href"]
        if href in exclude:
            continue
        urls.append(href)
        if len(urls) >= top_urls:
            break
    return urls
示例9: collect_variable_listing_sources
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def collect_variable_listing_sources(data_source, output_dir, verbose):
    """Walk the A-Z variable listings of an IPUMS data source, saving each page."""
    for initial in string.ascii_uppercase:
        page_index = 0
        url = variable_listing_url(data_source, initial)
        while url:
            if verbose:
                print("\tFetching: %s" % url)
            page_source = requests.get(url).text
            save_source(page_source, output_dir, initial, page_index)
            document = PyQuery(page_source, parser='html')
            document.make_links_absolute("https://%s.ipums.org/" % data_source)
            # Follow the "next page" link until the listing runs out.
            next_link = document('a.next_page')
            if next_link:
                url = next_link.attr['href']
                page_index += 1
            else:
                url = None
示例10: get_all_links
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_all_links():
    """Return all speech links, using a '.links' pickle file as a cache.

    On a cache miss, every listing page in the module-level ``pages`` is
    scraped and the collected hrefs are pickled to '.links' for next time.
    """
    try:
        # BUG FIX: pickle files must be opened in binary mode on Python 3,
        # and the handles should be closed deterministically (`with`).
        with open('.links', 'rb') as cache:
            return pickle.load(cache)
    except IOError:
        URL_BASE = "http://www.casarosada.gob.ar/informacion/discursos?start={}"
        links = []
        for start in pages:
            url = URL_BASE.format(start)
            logging.info('Descargando links desde {}'.format(url))
            pq = PyQuery(url=url, headers=headers)
            pq.make_links_absolute()
            page_links = pq('div.category-item-title a')
            # reversed(): presumably to restore chronological order — confirm.
            links.extend(list(reversed(page_links)))
        links = [pq(a).attr('href') for a in links]
        with open('.links', 'wb') as cache:
            pickle.dump(links, cache)
        return links
示例11: scrape_page
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def scrape_page(url):
    """Yield one dict per company row found on a directory listing page."""
    print('getting url: {}'.format(url))
    document = PyQuery(url)
    document.make_links_absolute()
    listing = document('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
    # tr:gt(0) skips the header row.
    for entry in listing.items('tr:gt(0)'):
        cells = entry('td')
        yield {
            'name': cells.eq(0).text(),
            'phone': cells.eq(1).text(),
            'url': cells.eq(2)('a').attr('href'),
            'details_url': cells.eq(0)('a').attr('href'),
        }
示例12: PyQuery
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
# Demo: print the first company entry from the Rigzone "A" directory page.
from pyquery import PyQuery

doc = PyQuery('https://www.rigzone.com/search/alpha/a/')
doc.make_links_absolute()
table = doc('#rz-main-container section:eq(1) .WriteSmallTableTop table:eq(1)')
# tr:gt(0) skips the header row; we stop after the first data row.
for row in table.items('tr:gt(0)'):
    cells = row('td')
    company_col, phone_col, website_col = cells.eq(0), cells.eq(1), cells.eq(2)
    details_url = company_col('a').attr('href')
    company_name = company_col.text()
    company_phone = phone_col.text()
    company_url = website_col('a').attr('href')
    print(company_name, company_phone, company_url, details_url)
    break
示例13: get_urls_from_podcast
# 需要导入模块: from pyquery import PyQuery [as 别名]
# 或者: from pyquery.PyQuery import make_links_absolute [as 别名]
def get_urls_from_podcast(url, verbose=False):
    """Return the href of every audio-cut link found on a podcast page.

    NOTE(review): ``verbose`` is accepted for interface compatibility but
    is currently unused.
    """
    page = PyQuery(url)
    page.make_links_absolute()
    hrefs = []
    for anchor in page('.cut_brief h4 a'):
        hrefs.append(PyQuery(anchor).attr('href'))
    return hrefs