This article compiles typical usage examples of scrapy.linkextractors.LinkExtractor in Python. If you are wondering what linkextractors.LinkExtractor does, how to use it, or what it looks like in practice, the curated code examples below may help. You can also read further about the scrapy.linkextractors module in which it is defined.
The following 15 code examples of linkextractors.LinkExtractor are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
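Before the examples, here is a minimal, self-contained sketch of the pattern most of them share: a LinkExtractor wired into CrawlSpider rules. The spider name, example.com domain, and allow patterns are placeholders for illustration only, not taken from any of the examples below.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleSpider(CrawlSpider):
    # Placeholder spider: example.com and the allow patterns are illustrative only.
    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    rules = (
        # Follow pagination links without a callback.
        Rule(LinkExtractor(allow=r'page=\d+')),
        # Parse every article page found by the extractor.
        Rule(LinkExtractor(allow=r'/article/\d+'), callback='parse_article', follow=True),
    )

    def parse_article(self, response):
        yield {'url': response.url, 'title': response.css('title::text').get()}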
Example 1: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self,
             restrict_xpaths=(),
             restrict_css=(),
             restrict_regex=(),
             allow_domains=(),
             link_extractor_cls=LinkExtractor, **kwargs):
    """
    :param restrict_xpaths: list of XPath expressions for link extraction
    :param restrict_css: list of CSS selectors for link extraction
    :param restrict_regex: list of regex patterns
    :param link_extractor_cls: defaults to the Scrapy link extractor
    :param allow_domains: defaults to the allowed domains of the spider
    """
    self.restrict_xpaths = restrict_xpaths
    self.restrict_css = restrict_css
    self.restrict_regex = restrict_regex
    self.allow_domains = allow_domains
    self.link_extractor_cls = link_extractor_cls
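As a rough illustration of how the attributes stored by this __init__ could be used later, the sketch below builds the configured extractor. The class name ExtractorConfig and the build_extractor helper are hypothetical and not part of the original project.

from scrapy.linkextractors import LinkExtractor

class ExtractorConfig:
    # Hypothetical wrapper mirroring the __init__ above (restrict_regex omitted,
    # since it is not a LinkExtractor argument).
    def __init__(self, restrict_xpaths=(), restrict_css=(), allow_domains=(),
                 link_extractor_cls=LinkExtractor, **kwargs):
        self.restrict_xpaths = restrict_xpaths
        self.restrict_css = restrict_css
        self.allow_domains = allow_domains
        self.link_extractor_cls = link_extractor_cls
        self.kwargs = kwargs

    def build_extractor(self):
        # Forward the stored restrictions to the underlying LinkExtractor.
        return self.link_extractor_cls(
            restrict_xpaths=self.restrict_xpaths,
            restrict_css=self.restrict_css,
            allow_domains=self.allow_domains,
            **self.kwargs,
        )

extractor = ExtractorConfig(restrict_css=('div.content',),
                            allow_domains=('example.com',)).build_extractor()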
Example 2: generate_spider_kwargs
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def generate_spider_kwargs(self):
    extractor = LinkExtractor()
    rules = [
        Rule(extractor, follow=True)  # TODO - add regex types if needed.
    ]
    print(self.manifest)
    spider_kwargs = {
        "start_urls": self.spider_config['start_urls'],
        "allowed_domains": [],
        "rules": rules,
        "spider_config": self.spider_config,
        "manifest": self.manifest,
        "context": self.context,
        # "default_storage":
    }
    spider_kwargs.update(self.extra_arguments)
    return spider_kwargs
Example 3: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Make urls
    self.start_urls = [
        'http://www.takedaclinicaltrials.com/browse/?protocol_id=',
    ]
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'browse/summary/',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'browse',
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
Example 4: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Make urls
    self.start_urls = [
        'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0',
    ]
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'find_a_trial/NCT\d+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
Example 5: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, domains, directory, allow=(), deny=(), unix=False):
    self.directory = directory
    self.unix = unix
    self.rules = (
        Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
    )
    # Parse the allowed domains and start urls
    self.allowed_domains = []
    self.start_urls = []
    for domain in domains:
        url_parts = domain.split('://')
        unqualified_url = url_parts[-1]
        url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
        full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
        bare_domain = unqualified_url.split('/')[0]
        self.allowed_domains.append(bare_domain)
        self.start_urls.append(full_url)
    super().__init__()
Example 6: request_index
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def request_index(self, response):
    categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))
    if self.category is not None:
        if self.category in categories:
            categories = [self.category]
        else:
            raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))
    date_processing = self.start_date
    while date_processing <= self.end_date:
        for category in categories:
            # Redefining the rule according to the specific date url
            SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                      restrict_xpaths=('//div[@class="main-body"]')),
                                        callback="parse_content", follow=True),)
            super(SamakalSpider, self)._compile_rules()
            # http://bangla.samakal.net/-education/2016/06/01
            url = 'http://bangla.samakal.net/{0}/{1}'.format(
                category,
                date_processing.strftime('%Y/%m/%d')
            )
            yield self.make_requests_from_url(url)
        date_processing += datetime.timedelta(days=1)
Example 7: parse
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def parse(self, response):
    link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
    links = link.extract_links(response)
    for _link in links:
        # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
        yield scrapy.Request(_link.url, callback=self.parse_list)
Example 8: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='http://www.isrctn.com/search',
        date_from=date_from, date_to=date_to)
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'ISRCTN\d+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
# Internal
Example 9: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Default values
    if page_from is None:
        page_from = '1'
    if page_to is None:
        page_to = '1'
    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi',
        page_from=page_from)
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'cgi-open-bin/ctr_e/ctr_view.cgi',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
            process_value=partial(_process_url, page_from, page_to),
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
# Internal
Example 10: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, http_user=None, http_pass=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Save credentials
    self.http_user = http_user
    self.http_pass = http_pass
    # Make urls
    self.start_urls = [
        'http://apps.who.int/trialsearch/crawl/crawl0.aspx',
    ]
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+',
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'trialsearch/crawl/crawl\d+\.aspx',
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
Example 11: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):
    # Save conf/conn
    self.conf = conf
    self.conn = conn
    # Make start urls
    self.start_urls = _make_start_urls(
        prefix='http://www.anzctr.org.au/TrialSearch.aspx',
        date_from=date_from, date_to=date_to)
    # Make rules
    self.rules = [
        Rule(LinkExtractor(
            allow=r'Trial/Registration/TrialReview.aspx',
            process_value=lambda value: value.replace('http', 'https', 1),
        ), callback=parse_record),
        Rule(LinkExtractor(
            allow=r'page=\d+',
        )),
    ]
    # Inherit parent
    super(Spider, self).__init__()
# Internal
Example 12: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, book_url=None, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = book_url
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
Example 13: main
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0
    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()
        start = timer()
        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)
        end = timer()
        total = total + len(links)
        time = time + end - start
    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)
    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time))))
Example 14: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, **kw):
    super(BroadBenchSpider, self).__init__(**kw)
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
    self.previtem = 0
    self.items = 0
    self.timesec = datetime.datetime.utcnow()
    self.start_urls = [
        'http://domain{}:{}/index.html'.format(i, self.port) for i in range(1, self.n_domains + 1)]
Example 15: __init__
# Required module: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, url, credentials, *args, **kwargs):
    self.credentials = credentials
    self.start_urls = [url]
    self.link_extractor = LinkExtractor(allow_domains=[get_domain(url)])
    self.found_login = False
    self.found_registration = False
    super(FormSpider, self).__init__(*args, **kwargs)