本文整理匯總了Python中bs4.UnicodeDammit.xpath方法的典型用法代碼示例。如果您正苦於以下問題:Python UnicodeDammit.xpath方法的具體用法?Python UnicodeDammit.xpath怎麼用?Python UnicodeDammit.xpath使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類bs4.UnicodeDammit的用法示例。
在下文中一共展示了UnicodeDammit.xpath方法的1個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: _fetch_data
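# Required import: from bs4 import UnicodeDammit [as alias]
# Note: .xpath() in this example is called on the lxml.html document, not on
# UnicodeDammit, so there is no `xpath` name to import from bs4.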
def _fetch_data(self, entry_name, url):
    """Download a page and scrape its display name and activity counters.

    The page body is fetched with ``urllib2``, its encoding is detected
    with ``UnicodeDammit`` and the markup is parsed with ``lxml.html``.
    The title and the visit/post counters are each looked up under a
    primary CSS class first and a fallback class second.

    Args:
        entry_name: Name to report when the page has no title link. It is
            always stored unchanged under ``'original_entry_name'``.
        url: Address of the page; it is UTF-8 encoded before being opened
            and stored as given under ``'url'``.

    Returns:
        A ``(entry_name, data)`` tuple, where ``entry_name`` is the title
        found on the page (or the argument when none is found) and ``data``
        is a dict with the keys ``'num_visits'``, ``'num_posts'``,
        ``'num_groups'``, ``'entry_name'``, ``'original_entry_name'`` and
        ``'url'``. Returns ``(None, None)`` when the download raises
        ``urllib2.HTTPError`` or ``urllib2.URLError``.
    """
    def first_match(tree, *expressions):
        """Return the first node of the earliest XPath that matches, else None."""
        for expression in expressions:
            nodes = tree.xpath(expression)
            if nodes:
                return nodes[0]
        return None

    def read_count(node):
        """Convert a counter node to an int; a missing node counts as 0."""
        if node is None:
            return 0
        # NOTE(review): assumes get_first_number_from_text() returns a value
        # int() accepts for every counter text - confirm in cogtu_misc.
        return int(cogtu_misc.get_first_number_from_text(node.text_content()))

    # Keep the try body limited to the network call the handlers target.
    # HTTPError is a subclass of URLError, so it has to be caught first.
    try:
        with contextlib.closing(urllib2.urlopen(url.encode('utf-8'))) as page_source:
            page_content = page_source.read()
    except urllib2.HTTPError:
        logging.info('urllib2.HTTPError. Skip.')
        return None, None
    except urllib2.URLError:
        logging.info('urllib2.URLError. Skip.')
        return None, None

    # Detect the encoding from the raw bytes and pass it to the lxml parser.
    detected = UnicodeDammit(page_content, is_html=True)
    parser = lxml.html.HTMLParser(encoding=detected.original_encoding)
    tree = lxml.html.document_fromstring(page_content, parser=parser)

    original_entry_name = entry_name
    title_node = first_match(
        tree,
        '//a[contains(@class, "star_title_h3")]',
        '//a[contains(@class, "card_title_fname")]',
    )
    if title_node is not None:
        entry_name = title_node.text_content().strip()

    visits_node = first_match(
        tree,
        '//span[contains(@class, "j_visit_num")]',
        '//span[contains(@class, "card_menNum")]',
    )
    posts_node = first_match(
        tree,
        '//span[contains(@class, "j_post_num")]',
        '//span[contains(@class, "card_infoNum")]',
    )
    # The group counter has no fallback selector.
    groups_node = first_match(
        tree,
        "//a[contains(@class, 'star_nav_ico_group')]/span",
    )

    data = {
        'num_visits': read_count(visits_node),
        'num_posts': read_count(posts_node),
        'num_groups': read_count(groups_node),
        'entry_name': entry_name,
        'original_entry_name': original_entry_name,
        'url': url,
    }
    return entry_name, data