This page collects typical usage examples of the Python method user_agent.generate_user_agent. If you have been wondering what user_agent.generate_user_agent does, how to call it, or where to find working examples, the curated code samples below may help. You can also explore other usage examples from the user_agent module.
The following section presents 15 code examples of user_agent.generate_user_agent, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
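Before the project-specific examples, here is a minimal, self-contained usage sketch (not taken from any of the projects below; the URL is only a placeholder): generate_user_agent() returns a randomized but realistic User-Agent string that can be dropped directly into a request header.

# Minimal sketch: pair a generated User-Agent with a plain requests call.
import requests
from user_agent import generate_user_agent

headers = {'User-Agent': generate_user_agent()}
response = requests.get('https://example.com', headers=headers)  # placeholder URL
print(response.status_code)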
Example 1: getBaiduDictCate
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getBaiduDictCate():
    """
    Purpose: get the categories of the Baidu dictionary library. There are three category levels; because the third level is too fine-grained and sparse, third-level categories are folded into their second-level parents.
    :return: two dicts: the first maps each top-level category ID to its name, the second records all sub-categories under each top-level category of the first dict
    """
    bigCateDict = {}
    smallCateDict = {}
    initPageURL = r'https://shurufa.baidu.com/dict'
    cateBaseURL = r'https://shurufa.baidu.com/dict_list?cid='
    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # Fetch the top-level categories
    try:
        request = urllib2.Request(url=initPageURL, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        print 'Error while getting the big category, error code:', e.code
        sys.exit()
Example 2: __init__
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def __init__(self, proxy):
    """init the webdriver by setting the proxy and user-agent
    Args:
        proxy (str): proxy in the form of ip:port
    """
    # set proxy
    ip, port = proxy.split(':')
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", ip)
    profile.set_preference("network.proxy.http_port", int(port))  # the port preference expects an integer
    # set user_agent
    profile.set_preference("general.useragent.override", generate_user_agent())
    profile.update_preferences()
    self.driver = webdriver.Firefox(firefox_profile=profile)
    print 'current proxy: %s' % proxy
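A side note, not part of the original project: newer Selenium 4 releases deprecate (and eventually remove) the firefox_profile argument, so a rough equivalent using the Options API might look like the sketch below.

# Sketch assuming Selenium 4: set proxy and User-Agent through Options instead of FirefoxProfile.
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from user_agent import generate_user_agent

def make_driver(proxy):
    ip, port = proxy.split(':')
    options = Options()
    options.set_preference("network.proxy.type", 1)
    options.set_preference("network.proxy.http", ip)
    options.set_preference("network.proxy.http_port", int(port))
    options.set_preference("general.useragent.override", generate_user_agent())
    return webdriver.Firefox(options=options)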
Example 3: is_valid
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def is_valid(target_url, ip, referer):
    """judge if a proxy ip is valid for target_url
    Args:
        target_url (str): url that needs to be visited through the proxy
        ip (str): proxy address to test, in the form of ip:port
        referer (str): referer field of the request headers
    Returns:
        boolean
    """
    ignore_warnings()
    proxy = {
        'http': 'http://%s' % ip
    }
    headers = {'user-agent': generate_user_agent(), 'referer': referer}
    try:
        r = requests.get(target_url, headers=headers, proxies=proxy, timeout=6)
        return True
    except Exception:
        return False
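One detail worth flagging: a proxies dict with only an 'http' entry routes just plain-HTTP traffic; for https targets (such as the Amazon URL used elsewhere in this project), requests also needs an 'https' entry. A minimal sketch reusing the names from is_valid above:

# Sketch: route both http and https requests through the same proxy.
proxy = {
    'http': 'http://%s' % ip,
    'https': 'http://%s' % ip,
}
headers = {'user-agent': generate_user_agent(), 'referer': referer}
r = requests.get(target_url, headers=headers, proxies=proxy, timeout=6)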
Example 4: get_phone_visa
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_phone_visa():
    """fetch phone, visa from http://www.fakeaddressgenerator.com/World/us_address_generator"""
    url = r'http://www.fakeaddressgenerator.com/World/us_address_generator'
    referer = r'http://www.fakeaddressgenerator.com/World'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    text = requests.get(url, headers=header).text
    soup = BeautifulSoup(text, 'lxml')
    info = soup.find_all('input')
    """
    print 'name:',info[0]['value']
    print 'phone:',info[9]['value']
    print 'visa:',info[11]['value']
    print 'expires:',info[13]['value']
    """
    name_phone = info[0]['value'] + '#' + info[9]['value']
    name_visa = info[0]['value'] + '#' + info[11]['value'] + '#' + info[13]['value']
    print name_phone, name_visa
    return name_phone, name_visa
Example 5: download_page
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def download_page(url):
    """download raw content of the page
    Args:
        url (str): url of the page
    Returns:
        raw content of the page
    """
    try:
        headers = {}
        headers['User-Agent'] = generate_user_agent()
        headers['Referer'] = 'https://www.google.com'
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        return str(resp.read())
    except Exception as e:
        print('error while downloading page {0}'.format(url))
        logging.error('error while downloading page {0}'.format(url))
        return None
Example 6: http_request_get
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def http_request_get(url, session=None, payload=None, parse=True):
    """ Sends a GET HTTP request to a website and returns its HTML content and full url address. """
    if payload is None:
        payload = {}
    try:
        if session:
            content = session.get(url, params=payload, verify=False, headers={'User-Agent': generate_user_agent()})
        else:
            content = requests.get(url, params=payload, verify=False, headers={'User-Agent': generate_user_agent()})
        content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
        if parse:
            return html.fromstring(content.text), content.url
        else:
            return content.text, content.url
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
Example 7: downloadSingleCate
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def downloadSingleCate(cateID, dirName, downloadLog, tryBest=True):
    """Download the dictionaries of one category.
    :param cateID: category ID
    :param dirName: directory to download into
    :param downloadLog: download log that records files which failed to download
    :param tryBest: whether the maximum number of attempts has been reached
    :return: None
    """
    pageBaseUrl = r'https://shurufa.baidu.com/dict_list?cid=%s' % cateID
    fileBaseUrl = r'https://shurufa.baidu.com/dict_innerid_download?innerid='
    pagePattern = re.compile(r'page=(\d+)#page')  # non-greedy match: find urls that jump to other pages
    filePattern = re.compile(r'dict-name="(.*?)" dict-innerid="(\d+)"')  # non-greedy match: find the name and inner id of each downloadable file
    visited = set()     # records whether a url has already been visited
    downloaded = set()  # records whether a file has already been downloaded
    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # Find the highest page number; the pages to crawl are then 1 through that maximum
    try:
        request = urllib2.Request(url=pageBaseUrl, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        if tryBest:
            with io.open(downloadLog.decode('utf8'), mode='a', encoding='utf8') as f:
                f.write((str(e.code) + ' error while parsing url ' + pageBaseUrl + '\n').decode('utf8'))
        return False
Example 8: getCategoryPages
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getCategoryPages(caterotyID, downloadDIR):
    """Get the total number of pages of a category from its first page, and put every page number into PAGE_QUEUE for all downloader threads to consume
    :param caterotyID: ID of the dictionary category to download, used to build the correct url
    :param downloadDIR: directory in which the downloaded dictionaries are stored
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    DOWNLOAD_DIR = downloadDIR
    PAGE_BASE_URL = 'https://shurufa.baidu.com/dict_list?cid=%s' % CATEID
    pagePattern = re.compile(r'page=(\d+)#page')  # regex that finds the urls of the other pages in the page source
    # Guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # Find the highest page number; the pages to crawl are then 1 through that maximum.
    # The server may answer with 502/500 errors, so retry up to maxTry times.
    maxTry = 8
    data = None
    for i in xrange(maxTry):
        try:
            request = urllib2.Request(url=PAGE_BASE_URL, headers=headers)
            response = urllib2.urlopen(request)
            data = response.read()
            break
        except urllib2.HTTPError, e:
            if i == maxTry-1:
                with io.open(DOWNLOAD_LOG.decode('utf8'), mode='a', encoding='utf8') as f:
                    f.write((str(e.code) + ' error while parsing url ' + PAGE_BASE_URL + '\n').decode('utf8'))
        except:
Example 9: generate_profile
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def generate_profile(useragent="(default)"):
    profile = FirefoxProfile()
    if useragent.strip().lower() == "(default)":
        status("Using the default useragent")
        return profile
    elif useragent.strip().lower() == "(random)":
        random_useragent = generate_user_agent(os=('mac', 'linux'))
        profile.set_preference("general.useragent.override", random_useragent)  # To make our useragent random
        status("Using random useragent " + random_useragent)
        return profile
    else:
        profile.set_preference("general.useragent.override", useragent)
        status("Using useragent " + useragent)
        return profile
Example 10: get_user_agent
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_user_agent(os=None, navigator=None, device_type=None):
    try:
        u = generate_user_agent(os=os, navigator=navigator, device_type=device_type)
    except Exception as e:
        u = str(e)
    return u
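For reference, the os, navigator and device_type arguments accept a limited set of values (for example 'win', 'mac', 'linux' for os, 'chrome' or 'firefox' for navigator, and 'desktop' or 'smartphone' for device_type; the exact lists depend on the installed user_agent version). Invalid values raise an exception, which the wrapper above turns into a plain error string:

# Usage sketch for the wrapper above; the option values are common ones but may vary by version.
print(get_user_agent(os='linux', navigator='chrome', device_type='desktop'))
print(get_user_agent(os='not-an-os'))  # invalid input: the except branch returns the exception text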
Example 11: get_proxies
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis
    Args:
        proxy_type (str): base url for proxy type, like the global variables CHINA and OTHER
        ip_set (str): name of the redis set in which the ips should be stored
        start_page (int): which page to start crawling
        end_page (int): which page to stop crawling
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    proxies, curr_proxy = [], None
    for page in xrange(start_page, end_page+1):
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {'user-agent': generate_user_agent(), 'referer': 'http://www.xicidaili.com/'}
        text = requests.get(proxy_type+str(page), headers=headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # if u'美国' in tds[3].text:  # optionally keep only proxies located in the United States
            proxy = tds[1].text + ':' + tds[2].text
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' % (proxy, ip_set)
Example 12: get_address
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_address(proxy):
    """fetch american address from https://fakena.me/random-real-address/
    Args:
        proxy (str): proxy to visit the target site, ip:port
    Returns:
        format_addr (str): american address in the form of "address_line # city # state # zip"
    """
    ignore_warnings()
    url = r'https://fakena.me/random-real-address/'
    referer = r'https://fakena.me'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    curr_proxy = {
        'http': 'http://%s' % proxy
    }
    text = requests.get(url, headers=header, proxies=curr_proxy).text
    pattern = re.compile('<strong>(.+)<br>(.+)</strong>')
    result = re.findall(pattern, text)
    if result:  # sometimes the result is empty
        print result[0][0], result[0][1]
        address_line = result[0][0]
        city, state_zip = result[0][1].split(',')
        state, zip = state_zip.split()
        format_addr = address_line + '#' + city + '#' + state + '#' + zip
        return format_addr
    else:
        return ''
Example 13: __http_request__async
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def __http_request__async(self, url, session):
    """ Sends asynchronous http request to URL address and scrapes the webpage. """
    try:
        async with session.get(url, headers={'User-Agent': generate_user_agent()}) as response:
            page_html = await response.read()
            if self.cssselect is True:
                return self.scrape_function(html.fromstring(page_html), url=url, *self.arguments)
            else:
                return self.scrape_function(page_html, url=url, *self.arguments)
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
Example 14: __async_scraper
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def __async_scraper(self):
    """ Adds URLs into a list of tasks and requests their responses asynchronously. """
    async_tasks = []
    conn = aiohttp.TCPConnector(limit_per_host=connection_settings['CONCURRENT_CONNECTIONS'])
    timeout = aiohttp.ClientTimeout(total=connection_settings['CONNECTION_TIMEOUT'])
    async with aiohttp.ClientSession(connector=conn,
                                     timeout=timeout,
                                     headers={'User-Agent': generate_user_agent()}) as session:
        for n in self.tasks:
            async_tasks.append(self.__http_request__async(n, session))
        self.data = await asyncio.gather(*async_tasks)
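The two coroutines above assume a surrounding scraper class; as a self-contained illustration of the same pattern (one shared ClientSession, one generated User-Agent, several pages fetched concurrently), a minimal sketch might look like the following, with placeholder URLs and timeout rather than values from the original project:

import asyncio
import aiohttp
from user_agent import generate_user_agent

async def fetch(session, url):
    # Fetch one page and return its body as text.
    async with session.get(url) as response:
        return await response.text()

async def main(urls):
    headers = {'User-Agent': generate_user_agent()}
    timeout = aiohttp.ClientTimeout(total=30)  # placeholder timeout
    async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

pages = asyncio.run(main(['https://example.com', 'https://example.org']))  # placeholder URLs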
Example 15: send_to_proxy_from_definition
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def send_to_proxy_from_definition(running_config: RunningConfig):
    openapi3_content: dict = await openapi3_from_db(running_config.api_id)
    session_user_agent = generate_user_agent()
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
            verify_ssl=False)) as session:
        raw_endpoints = search(openapi3_content, "paths")
        query = request_generator(openapi3_content)
        resolver = ref_resolver(openapi3_content)
        endpoints = transform_tree(raw_endpoints, resolver)
        http_scheme, netloc, path, *_ = urlparse(
            running_config.api_url
        )
        host, port = split_netloc(netloc, http_scheme)
        for url, endpoint in endpoints.items():
            logger.info(f"Generating data for End Point: {url}")
            try:
                for method in ("get", "put", "post", "delete"):
                    if method in endpoint:
                        gen = query(url, method=method)
                        req: dict = next(gen)
                        break
                else:
                    raise APICheckException("Unknown method in url: ", url)
            except ValueError as ve:
                logger.error(f"cannot generate data: {ve} - {url}")
            url = f"{http_scheme}://{host}:{port}{path}{req['path']}"
            custom_headers = req["headers"]
            custom_headers["user-agent"] = session_user_agent
            fn_params = dict(
                url=url,
                headers=custom_headers,
                proxy=f"http://{running_config.proxy_ip}:"
                      f"{running_config.proxy_port}",
                skip_auto_headers=("content-type", "user-agent")
            )
            try:
                fn_params["data"] = req["body"]
            except KeyError:
                fn_params["data"] = None
            fn_method = getattr(session, req["method"])
            async with fn_method(**fn_params) as response:
                resp = await response.text()