

Python linkextractors.LinkExtractor Method Code Examples

This article collects typical usage examples of the Python linkextractors.LinkExtractor method from the scrapy.linkextractors module. If you are wondering what linkextractors.LinkExtractor does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also explore further usage examples from the containing module, scrapy.linkextractors.


The following presents 15 code examples of the linkextractors.LinkExtractor method, sorted by popularity by default.
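Before diving into the examples, here is a minimal, self-contained sketch of the basic LinkExtractor workflow: construct an extractor with allow/deny URL patterns, then call extract_links() on a response to get scrapy.link.Link objects. The URL and HTML below are illustrative placeholders, not taken from any of the examples.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# Illustrative page; in a real spider the response comes from the Scrapy engine.
html = b'<a href="/articles/1.html">post</a> <a href="/login">log in</a>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf8')

# Keep URLs matching `allow`, drop URLs matching `deny`.
extractor = LinkExtractor(allow=r'/articles/\d+\.html', deny=r'/login')
for link in extractor.extract_links(response):
    print(link.url, link.text)  # prints: http://example.com/articles/1.html post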

Example 1: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self,
                 restrict_xpaths=(),
                 restrict_css=(),
                 restrict_regex=(),
                 allow_domains=(),
                 link_extractor_cls=LinkExtractor, **kwargs):
        """

        :param restrict_xpaths: list of xpaths for links Extraction.
        :param restrict_css: list of xpath for links extraction
        :param restrict_regex: list of regex patterns
        :param link_extractor_cls: defaults to scrapy link extractor
        :param allow_domains: defaults to the allowed domains of spider
        """
        self.restrict_xpaths = restrict_xpaths
        self.restrict_css = restrict_css
        self.restrict_regex = restrict_regex
        self.allow_domains = allow_domains
        self.link_extractor_cls = link_extractor_cls 
Developer: invanalabs, Project: invana-bot, Lines: 21, Source: generic.py

Example 2: generate_spider_kwargs

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def generate_spider_kwargs(self):
        extractor = LinkExtractor()
        rules = [
            Rule(extractor, follow=True)  # TODO - add regex types if needed.
        ]
        print(self.manifest)
        spider_kwargs = {
            "start_urls": self.spider_config['start_urls'],
            "allowed_domains": [],
            "rules": rules,
            "spider_config": self.spider_config,
            "manifest": self.manifest,
            "context": self.context,
            # "default_storage":
        }
        spider_kwargs.update(self.extra_arguments)
        return spider_kwargs 
Developer: invanalabs, Project: invana-bot, Lines: 19, Source: single.py

Example 3: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.takedaclinicaltrials.com/browse/?protocol_id=',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'browse/summary/',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'browse',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Developer: opentrials, Project: collectors, Lines: 25, Source: spider.py

Example 4: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make urls
        self.start_urls = [
            'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'find_a_trial/NCT\d+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Developer: opentrials, Project: collectors, Lines: 25, Source: spider.py

Example 5: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, domains, directory, allow=(), deny=(), unix=False):
        self.directory = directory
        self.unix = unix
        self.rules = (
            Rule(LinkExtractor(allow=allow, deny=deny), callback='save_page'),
        )

        # parse the allowed domains and start urls
        self.allowed_domains = []
        self.start_urls = []
        for domain in domains:
            url_parts = domain.split('://')
            unqualified_url = url_parts[-1]
            url_scheme = url_parts[0] if len(url_parts) > 1 else 'http'
            full_url = '{0}://{1}'.format(url_scheme, unqualified_url)
            bare_domain = unqualified_url.split('/')[0]
            self.allowed_domains.append(bare_domain)
            self.start_urls.append(full_url)

        super().__init__() 
Developer: sangaline, Project: wayback-machine-scraper, Lines: 22, Source: mirror_spider.py

Example 6: request_index

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def request_index(self, response):
        categories = list(set(response.css('#topMenuItem a::attr("href")').re('/([^\/]+)/$')))

        if self.category is not None:
            if self.category in categories:
                categories = [self.category]
            else:
                raise ValueError('invalid category slug. available slugs: %s' % ", ".join(categories))

        date_processing = self.start_date
        while date_processing <= self.end_date:
            for category in categories:
                # redefining the rule according to the specific date's URL
                SamakalSpider.rules = (Rule(LinkExtractor(allow=('/' + date_processing.strftime('%Y/%m/%d') + '/\d+$',),
                                                          restrict_xpaths=('//div[@class="main-body"]')),
                                            callback="parse_content", follow=True),)
                super(SamakalSpider, self)._compile_rules()
                # http://bangla.samakal.net/-education/2016/06/01 
                url = 'http://bangla.samakal.net/{0}/{1}'.format(
                    category,
                    date_processing.strftime('%Y/%m/%d')
                )
                yield self.make_requests_from_url(url)
            date_processing += datetime.timedelta(days=1) 
Developer: banglakit, Project: corpus-builder, Lines: 26, Source: samakal.py
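A note on the pattern above: CrawlSpider compiles its rules once during __init__, so after reassigning SamakalSpider.rules at runtime the spider must call _compile_rules() again (as this example does) for the new Rule to take effect. Also, make_requests_from_url is deprecated in newer Scrapy releases; yielding scrapy.Request(url) directly is the current equivalent.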

Example 7: parse

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def parse(self, response):
        link = LinkExtractor(restrict_css='ul.textlarge22', allow='areaindex')
        links = link.extract_links(response)
        for _link in links:
            # yield scrapy.Request('http://www.66ip.cn/areaindex_1/1.html', callback=self.parse_list)
            yield scrapy.Request(_link.url, callback=self.parse_list) 
Developer: aox-lei, Project: aox_proxy_pool, Lines: 8, Source: ip66.py
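This example uses LinkExtractor directly in a callback rather than through CrawlSpider rules: restrict_css limits which regions of the page links are collected from, allow filters the extracted URLs by regex, and extract_links() returns the matching Link objects.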

Example 8: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='http://www.isrctn.com/search',
                date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'ISRCTN\d+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Developer: opentrials, Project: collectors, Lines: 28, Source: spider.py

Example 9: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, page_from=None, page_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Default values
        if page_from is None:
            page_from = '1'
        if page_to is None:
            page_to = '1'

        # Make start urls
        self.start_urls = _make_start_urls(
                prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi',
                page_from=page_from)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'cgi-open-bin/ctr_e/ctr_view.cgi',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
                process_value=partial(_process_url, page_from, page_to),
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Developer: opentrials, Project: collectors, Lines: 35, Source: spider.py

Example 10: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, http_user=None, http_pass=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Save credentials
        self.http_user = http_user
        self.http_pass = http_pass

        # Make urls
        self.start_urls = [
            'http://apps.who.int/trialsearch/crawl/crawl0.aspx',
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+',
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'trialsearch/crawl/crawl\d+\.aspx',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__() 
Developer: opentrials, Project: collectors, Lines: 29, Source: spider.py

Example 11: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, conf=None, conn=None, date_from=None, date_to=None):

        # Save conf/conn
        self.conf = conf
        self.conn = conn

        # Make start urls
        self.start_urls = _make_start_urls(
            prefix='http://www.anzctr.org.au/TrialSearch.aspx',
            date_from=date_from, date_to=date_to)

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=r'Trial/Registration/TrialReview.aspx',
                process_value=lambda value: value.replace('http', 'https', 1),
            ), callback=parse_record),
            Rule(LinkExtractor(
                allow=r'page=\d+',
            )),
        ]

        # Inherit parent
        super(Spider, self).__init__()


# Internal 
Developer: opentrials, Project: collectors, Lines: 29, Source: spider.py

Example 12: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, book_url=None, **kw):
        super(FollowAllSpider, self).__init__(**kw)

        url = book_url
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow() 
Developer: scrapy, Project: scrapy-bench, Lines: 15, Source: followall.py

Example 13: main

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def main():
    url = 'http://scrapinghub.com/'
    link_extractor = LinkExtractor()
    total = 0
    time = 0
    tar = tarfile.open("sites.tar.gz")
    for member in tar.getmembers():
        f = tar.extractfile(member)
        html = f.read()

        start = timer()

        response = HtmlResponse(url=url, body=html, encoding='utf8')
        links = link_extractor.extract_links(response)

        end = timer()

        total = total + len(links)
        time = time + end - start

    print("\nTotal number of links extracted = {0}".format(total))
    print("Time taken = {0}".format(time))
    click.secho("Rate of link extraction : {0} links/second\n".format(
        float(total / time)), bold=True)

    with open("Benchmark.txt", 'w') as g:
        g.write(" {0}".format((float(total / time)))) 
Developer: scrapy, Project: scrapy-bench, Lines: 29, Source: link.py

Example 14: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, **kw):
        super(BroadBenchSpider, self).__init__(**kw)

        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()
        self.previtem = 0
        self.items = 0
        self.timesec = datetime.datetime.utcnow()
        self.start_urls = [
            'http://domain{}:{}/index.html'.format(i, self.port) for i in range(1, self.n_domains + 1)] 
Developer: scrapy, Project: scrapy-bench, Lines: 12, Source: broadspider.py

Example 15: __init__

# Required import: from scrapy import linkextractors [as alias]
# Or: from scrapy.linkextractors import LinkExtractor [as alias]
def __init__(self, url, credentials, *args, **kwargs):
        self.credentials = credentials
        self.start_urls = [url]
        self.link_extractor = LinkExtractor(allow_domains=[get_domain(url)])
        self.found_login = False
        self.found_registration = False
        super(FormSpider, self).__init__(*args, **kwargs) 
Developer: TeamHG-Memex, Project: autologin, Lines: 9, Source: spiders.py


Note: The scrapy.linkextractors.LinkExtractor examples in this article were compiled by 純淨天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Consult the corresponding project's license before distributing or using the code; do not reproduce without permission.