

Python user_agent.generate_user_agent Method Code Examples

This article collects and summarizes typical usage examples of the Python method user_agent.generate_user_agent. If you are unsure what user_agent.generate_user_agent does or how to use it, or simply want to see it in context, the curated code examples below should help. You can also explore further usage examples of the user_agent module that this method belongs to.


The following presents 15 code examples of the user_agent.generate_user_agent method, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
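Before the individual examples, here is a minimal self-contained sketch of the basic call. The keyword arguments os and device_type mirror those used in Examples 9 and 10 below; the concrete values shown (for example 'desktop') are assumptions that depend on the installed version of the user_agent package, so treat the constrained call as an illustration rather than a definitive reference.

from user_agent import generate_user_agent

# A fully random user-agent string
print(generate_user_agent())

# A constrained string; the value sets below are assumptions based on
# Examples 9 and 10 -- check the documentation of your installed version
print(generate_user_agent(os=('win', 'linux'), device_type='desktop'))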

Example 1: getBaiduDictCate

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getBaiduDictCate():
    """
    功能:得到百度词库的分类,有三级分类,因为三级分类太细而且较少,所以将三级分类纳入其二级分类
    :return:两个词典,第一个词典记录大类的ID和内容的对应关系,第二个词典记录了第一个词典中每一类大类下的所有分类
    """
    bigCateDict = {}
    smallCateDict ={}
    initPageURL = r'https://shurufa.baidu.com/dict'
    cateBaseURL = r'https://shurufa.baidu.com/dict_list?cid='

    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'  
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # Fetch the top-level categories
    try:
        request = urllib2.Request(url=initPageURL, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        print 'Error while getting the big category, error code:', e.code
        sys.exit() 
Author: WuLC, Project: ThesaurusSpider, Lines of code: 27, Source file: getCategory.py

Example 2: __init__

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def __init__(self, proxy):
        """init the webdriver by setting the proxy and user-agent
        
        Args:
            proxy (str): proxy in the form of ip:port
        """
        # set proxy
        ip, port = proxy.split(':')
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)
        profile.set_preference("network.proxy.http", ip)
        profile.set_preference("network.proxy.http_port", port)
        # set user_agent
        profile.set_preference("general.useragent.override", generate_user_agent())

        profile.update_preferences()
        self.driver = webdriver.Firefox(firefox_profile=profile)
        
        print 'current proxy: %s'%proxy 
Author: WuLC, Project: AmazonRobot, Lines of code: 21, Source file: Robot.py

Example 3: is_valid

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def is_valid(target_url, ip, referer):
    """judge if a proxy ip is valid for target_url
    
    Args:
        target_url (str): url that need to visite with a proxy
        ip (str): the set in redis to get 
        referer (str, optional): referer part of  headers  of the request
    
    Returns:
        boolean
    """
    ignore_warnings()
    proxy = {
    'http': 'http://%s' %ip
    }
    headers = {'user-agent': generate_user_agent(), 'referer': referer}
    try:
        r = requests.get(target_url, headers = headers, proxies = proxy, timeout = 6)
        return True
    except Exception:
        return False 
Author: WuLC, Project: AmazonRobot, Lines of code: 23, Source file: GetProxy.py

Example 4: get_phone_visa

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_phone_visa():
    """fetch phone, visa from http://www.fakeaddressgenerator.com/World/us_address_generator"""
    url = r'http://www.fakeaddressgenerator.com/World/us_address_generator'
    referer = r'http://www.fakeaddressgenerator.com/World'
    header = {'user-agent' : generate_user_agent() , 'referer':referer }
    text = requests.get(url, headers = header).text
    soup = BeautifulSoup(text, 'lxml')
    info = soup.find_all('input')
    """
    print 'name:',info[0]['value']
    print 'phone:',info[9]['value']
    print 'visa:',info[11]['value']
    print 'expires:',info[13]['value']
    """
    name_phone =  info[0]['value']+'#'+info[9]['value']
    name_visa = info[0]['value']+'#'+info[11]['value']+'#'+info[13]['value']
    print name_phone, name_visa
    return name_phone, name_visa 
Author: WuLC, Project: AmazonRobot, Lines of code: 20, Source file: GetUserInfo.py

Example 5: download_page

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def download_page(url):
    """download raw content of the page
    
    Args:
        url (str): url of the page 
    
    Returns:
        raw content of the page
    """
    try:
        headers = {}
        headers['User-Agent'] = generate_user_agent()
        headers['Referer'] = 'https://www.google.com'
        req = urllib.request.Request(url, headers = headers)
        resp = urllib.request.urlopen(req)
        return str(resp.read())
    except Exception as e:
        print('error while downloading page {0}'.format(url))
        logging.error('error while downloading page {0}'.format(url))
        return None 
Author: WuLC, Project: GoogleImagesDownloader, Lines of code: 22, Source file: download_with_urllib.py

Example 6: http_request_get

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def http_request_get(url, session=None, payload=None, parse=True):
    """ Sends a GET HTTP request to a website and returns its HTML content and full url address. """

    if payload is None:
        payload = {}

    try:
        if session:
            content = session.get(url, params=payload, verify_ssl=False, headers={'User-Agent': generate_user_agent()})
        else:
            content = requests.get(url, params=payload, verify=False, headers={'User-Agent': generate_user_agent()})

        content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)

        if parse:
            return html.fromstring(content.text), content.url
        else:
            return content.text, content.url
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url) 
Author: mariostoev, Project: finviz, Lines of code: 22, Source file: request_functions.py

Example 7: downloadSingleCate

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def downloadSingleCate(cateID, dirName, downloadLog, tryBest = True):
    """下载某一类别的词库

    :param cateID: 类别ID
    :param dirName: 下载的目录
    :parm downloadLog: 下载日志,记录下载不成功的文件
    :parm downloadLog: 是否达到最大尝试次数
    :return: None
    """
    pageBaseUrl = r'https://shurufa.baidu.com/dict_list?cid=%s' %cateID
    fileBaseUrl = r'https://shurufa.baidu.com/dict_innerid_download?innerid='

    pagePattern = re.compile(r'page=(\d+)#page')  # find the urls that jump to the other listing pages
    filePattern = re.compile(r'dict-name="(.*?)" dict-innerid="(\d+)"')   # non-greedy match; find the name and inner id of each downloadable file

    visited = set()       # record which urls have already been visited
    downloaded = set()    # record which files have already been downloaded


    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'  
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # Find the largest page number; all pages then range from 1 to that maximum
    try:
        request = urllib2.Request(url=pageBaseUrl, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        if tryBest:
            with io.open(downloadLog.decode('utf8'), mode = 'a', encoding = 'utf8') as f:
                f.write((str(e.code)+' error while parsing url '+pageBaseUrl+'\n').decode('utf8'))
        return False 
Author: WuLC, Project: ThesaurusSpider, Lines of code: 38, Source file: singleThreadDownload.py

Example 8: getCategoryPages

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getCategoryPages(caterotyID,downloadDIR):
    """通过类别的初始页面得到该类别的总页数,并将所有的页数放到 PAGE_QUEUE 中供所有线程下载

    :param caterotyID: 下载的词库类型的 ID,用于找到正确 url
    :param downloadDIR: 下载词库的存放目录
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    DOWNLOAD_DIR = downloadDIR
    PAGE_BASE_URL = 'https://shurufa.baidu.com/dict_list?cid=%s' % CATEID
    pagePattern = re.compile(r'page=(\d+)#page')    # regex pattern for finding the urls of the other pages in the page source
    
    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'  
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # Find the largest page number; all pages then range from 1 to that maximum
    # 502/500 errors may be returned, so retry at most maxTry times
    maxTry = 8
    data = None
    for i in xrange(maxTry):
        try:
            request = urllib2.Request(url=PAGE_BASE_URL, headers=headers)
            response = urllib2.urlopen(request)
            data = response.read()
            break
        except urllib2.HTTPError, e:
            if i == maxTry-1:
                with io.open(DOWNLOAD_LOG.decode('utf8'), mode = 'a', encoding = 'utf8') as f:
                    f.write((str(e.code)+' error while parsing url '+PAGE_BASE_URL+'\n').decode('utf8'))
        except: 
Author: WuLC, Project: ThesaurusSpider, Lines of code: 37, Source file: multiThreadDownload.py

Example 9: generate_profile

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def generate_profile(useragent="(default)"):
    profile = FirefoxProfile()
    if useragent.strip().lower()=="(default)":
        status("Using the default useragent")
        return profile
    elif useragent.strip().lower()=="(random)":
        random_useragent = generate_user_agent(os=('mac', 'linux'))
        profile.set_preference("general.useragent.override", random_useragent) # To make our useragent random
        status("Using random useragent "+random_useragent)
        return profile
    else:
        profile.set_preference("general.useragent.override", useragent)
        status("Using useragent "+useragent)
        return profile 
Author: OWASP, Project: QRLJacking, Lines of code: 16, Source file: browser.py

Example 10: get_user_agent

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_user_agent(os=None, navigator=None, device_type=None):
    try:
        u = generate_user_agent(os=os, navigator=navigator, device_type=device_type)
    except Exception as e:
        u = str(e)
    return u 
Author: Hopetree, Project: izone, Lines of code: 8, Source file: useragent.py

Example 11: get_proxies

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis
    
    Args:
        proxy_type (str): base url for proxy type, like the global variables CHINA and OTHER
        ip_set (str): which set should the ips be stored in redis
        start_page (int):  which page to start crawling
        end_page (int): which page to stop crawling
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    proxies, curr_proxy =[], None
    for page in xrange(start_page, end_page+1):
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {'user-agent': generate_user_agent(), 'referer': 'http://www.xicidaili.com/'}
        text = requests.get(proxy_type+str(page), headers = headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            #if u'美国' in tds[3].text:
            proxy = tds[1].text+':'+tds[2].text               
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' %(proxy, ip_set) 
Author: WuLC, Project: AmazonRobot, Lines of code: 32, Source file: GetProxy.py

Example 12: get_address

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_address(proxy):
    """fetch american address from https://fakena.me/random-real-address/
    
    Args:
        proxy (str): proxy to visit the target site, ip:port
    
    Returns:
        format_addr (str): american address in the form of "address_line # city # state # zip"
    """
    ignore_warnings()
    url = r'https://fakena.me/random-real-address/'
    referer = r'https://fakena.me'
    header = {'user-agent' : generate_user_agent() , 'referer':referer }
    curr_proxy ={
    'http': 'http://%s'%proxy
    }

    text = requests.get(url, headers = header, proxies = curr_proxy).text
    pattern = re.compile('<strong>(.+)<br>(.+)</strong>')
    result = re.findall(pattern, text)
    if result: # sometimes the result is empty
        print result[0][0], result[0][1]
        address_line = result[0][0]
        city, state_zip = result[0][1].split(',')
        state, zip = state_zip.split()
        format_addr = address_line+'#'+city+'#'+state+'#'+zip
        return format_addr
    else:
        return '' 
Author: WuLC, Project: AmazonRobot, Lines of code: 31, Source file: GetUserInfo.py

Example 13: __http_request__async

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def __http_request__async(self, url, session):
        """ Sends asynchronous http request to URL address and scrapes the webpage. """

        try:
            async with session.get(url, headers={'User-Agent': generate_user_agent()}) as response:
                page_html = await response.read()

                if self.cssselect is True:
                    return self.scrape_function(html.fromstring(page_html), url=url, *self.arguments)
                else:
                    return self.scrape_function(page_html, url=url, *self.arguments)
        except (asyncio.TimeoutError, requests.exceptions.Timeout):
            raise ConnectionTimeout(url) 
Author: mariostoev, Project: finviz, Lines of code: 15, Source file: request_functions.py

Example 14: __async_scraper

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def __async_scraper(self):
        """ Adds a URL's into a list of tasks and requests their response asynchronously. """

        async_tasks = []
        conn = aiohttp.TCPConnector(limit_per_host=connection_settings['CONCURRENT_CONNECTIONS'])
        timeout = aiohttp.ClientTimeout(total=connection_settings['CONNECTION_TIMEOUT'])

        async with aiohttp.ClientSession(connector=conn,
                                         timeout=timeout,
                                         headers={'User-Agent': generate_user_agent()}) as session:
            for n in self.tasks:
                async_tasks.append(self.__http_request__async(n, session))

            self.data = await asyncio.gather(*async_tasks) 
Author: mariostoev, Project: finviz, Lines of code: 16, Source file: request_functions.py

Example 15: send_to_proxy_from_definition

# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def send_to_proxy_from_definition(running_config: RunningConfig):
    openapi3_content: dict = await openapi3_from_db(running_config.api_id)

    session_user_agent = generate_user_agent()

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
            verify_ssl=False)) as session:

        raw_endpoints = search(openapi3_content, "paths")
        query = request_generator(openapi3_content)
        resolver = ref_resolver(openapi3_content)
        endpoints = transform_tree(raw_endpoints, resolver)

        http_scheme, netloc, path, *_ = urlparse(
            running_config.api_url
        )

        host, port = split_netloc(netloc, http_scheme)

        for url, endpoint in endpoints.items():

            logger.info(f"Generating data for End Point: {url}")

            try:
                for method in ("get", "put", "post", "delete"):
                    if method in endpoint:
                        gen = query(url, method=method)
                        req: dict = next(gen)
                        break
                else:
                    raise APICheckException("Unknown method in url: ", url)

            except ValueError as ve:
                logger.error(f"cannot generate data: {ve} - {url}")

            url = f"{http_scheme}://{host}:{port}{path}{req['path']}"

            custom_headers = req["headers"]
            custom_headers["user-agent"] = session_user_agent

            fn_params = dict(
                url=url,
                headers=custom_headers,
                proxy=f"http://{running_config.proxy_ip}:"
                      f"{running_config.proxy_port}",
                skip_auto_headers=("content-type", "user-agent")
            )

            try:
                fn_params["data"] = req["body"]
            except KeyError:
                fn_params["data"] = None

            fn_method = getattr(session, req["method"])

            async with fn_method(**fn_params) as response:
                resp = await response.text() 
Author: BBVA, Project: apicheck, Lines of code: 59, Source file: run.py


Note: The user_agent.generate_user_agent examples in this article were collected by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective developers; copyright of the source code remains with the original authors, and distribution and use should follow the license of the corresponding project. Do not repost without permission.