This article collects typical usage examples of the Python method user_agent.generate_user_agent. If you are wondering what exactly user_agent.generate_user_agent does, or how to use it, the hand-picked code examples below may help. You can also explore further usage examples of the user_agent module that this method belongs to.
The following shows 15 code examples of user_agent.generate_user_agent, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
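Before the examples, a minimal sketch of the pattern they all share may be useful: generate_user_agent() returns a randomized but realistic User-Agent string, which is then placed into the request headers. The os, navigator and device_type keyword arguments seen in some examples restrict which platforms and browsers the generated string claims to be. This sketch is not taken from any of the projects below, and the httpbin.org URL is only an illustrative target.

from user_agent import generate_user_agent
import requests

# a random desktop User-Agent string, different on every call
ua = generate_user_agent()
# optionally restrict the platform/browser the string claims to be
ua_linux_firefox = generate_user_agent(os='linux', navigator='firefox')

# use it the same way as the examples below: put it into the request headers
resp = requests.get('https://httpbin.org/user-agent', headers={'User-Agent': ua})
print(resp.text)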
Example 1: getBaiduDictCate
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getBaiduDictCate():
    """
    Purpose: get the categories of the Baidu dictionary library. There are three category levels; because the third level is too fine-grained and sparse, it is merged into its second-level parent.
    :return: two dicts; the first maps the IDs of the top-level categories to their names, the second records all sub-categories under each top-level category of the first dict
    """
    bigCateDict = {}
    smallCateDict = {}
    initPageURL = r'https://shurufa.baidu.com/dict'
    cateBaseURL = r'https://shurufa.baidu.com/dict_list?cid='
    # guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # fetch the top-level categories
    try:
        request = urllib2.Request(url=initPageURL, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        print 'Error while getting the big category, error code:', e.code
        sys.exit()
Example 2: __init__
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def __init__(self, proxy):
    """init the webdriver by setting the proxy and user-agent
    Args:
        proxy (str): proxy in the form of ip:port
    """
    # set proxy
    ip, port = proxy.split(':')
    profile = webdriver.FirefoxProfile()
    profile.set_preference("network.proxy.type", 1)
    profile.set_preference("network.proxy.http", ip)
    profile.set_preference("network.proxy.http_port", int(port))  # the port preference expects an integer
    # set user_agent
    profile.set_preference("general.useragent.override", generate_user_agent())
    profile.update_preferences()
    self.driver = webdriver.Firefox(firefox_profile=profile)
    print 'current proxy: %s' % proxy
Example 3: is_valid
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def is_valid(target_url, ip, referer):
    """judge if a proxy ip is valid for target_url
    Args:
        target_url (str): url that needs to be visited through a proxy
        ip (str): proxy ip to test, in the form of ip:port
        referer (str): referer part of the request headers
    Returns:
        boolean
    """
    ignore_warnings()
    proxy = {
        'http': 'http://%s' % ip
    }
    headers = {'user-agent': generate_user_agent(), 'referer': referer}
    try:
        r = requests.get(target_url, headers=headers, proxies=proxy, timeout=6)
        return True
    except Exception:
        return False
Example 4: get_phone_visa
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_phone_visa():
    """fetch phone, visa from http://www.fakeaddressgenerator.com/World/us_address_generator"""
    url = r'http://www.fakeaddressgenerator.com/World/us_address_generator'
    referer = r'http://www.fakeaddressgenerator.com/World'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    text = requests.get(url, headers=header).text
    soup = BeautifulSoup(text, 'lxml')
    info = soup.find_all('input')
    """
    print 'name:', info[0]['value']
    print 'phone:', info[9]['value']
    print 'visa:', info[11]['value']
    print 'expires:', info[13]['value']
    """
    name_phone = info[0]['value'] + '#' + info[9]['value']
    name_visa = info[0]['value'] + '#' + info[11]['value'] + '#' + info[13]['value']
    print name_phone, name_visa
    return name_phone, name_visa
Example 5: download_page
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def download_page(url):
    """download raw content of the page
    Args:
        url (str): url of the page
    Returns:
        raw content of the page
    """
    try:
        headers = {}
        headers['User-Agent'] = generate_user_agent()
        headers['Referer'] = 'https://www.google.com'
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req)
        return str(resp.read())
    except Exception as e:
        print('error while downloading page {0}'.format(url))
        logging.error('error while downloading page {0}'.format(url))
        return None
Example 6: http_request_get
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def http_request_get(url, session=None, payload=None, parse=True):
    """ Sends a GET HTTP request to a website and returns its HTML content and full url address. """
    if payload is None:
        payload = {}
    try:
        if session:
            content = session.get(url, params=payload, verify_ssl=False, headers={'User-Agent': generate_user_agent()})
        else:
            content = requests.get(url, params=payload, verify=False, headers={'User-Agent': generate_user_agent()})
        content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
        if parse:
            return html.fromstring(content.text), content.url
        else:
            return content.text, content.url
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
Example 7: downloadSingleCate
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def downloadSingleCate(cateID, dirName, downloadLog, tryBest=True):
    """Download the dictionary files of a single category
    :param cateID: category ID
    :param dirName: download directory
    :param downloadLog: download log that records the files which failed to download
    :param tryBest: whether the maximum number of attempts has been reached
    :return: None
    """
    pageBaseUrl = r'https://shurufa.baidu.com/dict_list?cid=%s' % cateID
    fileBaseUrl = r'https://shurufa.baidu.com/dict_innerid_download?innerid='
    pagePattern = re.compile(r'page=(\d+)#page')  # non-greedy match for urls that jump to other pages
    filePattern = re.compile(r'dict-name="(.*?)" dict-innerid="(\d+)"')  # non-greedy match for the name and inner id of downloadable files
    visited = set()  # records whether a url has already been visited
    downloaded = set()  # records whether a file has already been downloaded
    # guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # find the largest page number; all pages then run from 1 to that maximum
    try:
        request = urllib2.Request(url=pageBaseUrl, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        if tryBest:
            with io.open(downloadLog.decode('utf8'), mode='a', encoding='utf8') as f:
                f.write((str(e.code) + ' error while parsing url ' + pageBaseUrl + '\n').decode('utf8'))
        return False
Example 8: getCategoryPages
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def getCategoryPages(caterotyID, downloadDIR):
    """Get the total number of pages of a category from its start page, and put all page numbers into PAGE_QUEUE for the worker threads to download
    :param caterotyID: ID of the dictionary category to download, used to build the correct url
    :param downloadDIR: directory in which the downloaded dictionaries are stored
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    DOWNLOAD_DIR = downloadDIR
    PAGE_BASE_URL = 'https://shurufa.baidu.com/dict_list?cid=%s' % CATEID
    pagePattern = re.compile(r'page=(\d+)#page')  # regex for finding the urls of the other pages in the page source
    # guard against 502 errors
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer
    # find the largest page number; all pages then run from 1 to that maximum
    # 502/500 errors may be returned, so retry up to maxTry times
    maxTry = 8
    data = None
    for i in xrange(maxTry):
        try:
            request = urllib2.Request(url=PAGE_BASE_URL, headers=headers)
            response = urllib2.urlopen(request)
            data = response.read()
            break
        except urllib2.HTTPError, e:
            if i == maxTry - 1:
                with io.open(DOWNLOAD_LOG.decode('utf8'), mode='a', encoding='utf8') as f:
                    f.write((str(e.code) + ' error while parsing url ' + PAGE_BASE_URL + '\n').decode('utf8'))
        except:
Example 9: generate_profile
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def generate_profile(useragent="(default)"):
    profile = FirefoxProfile()
    if useragent.strip().lower() == "(default)":
        status("Using the default useragent")
        return profile
    elif useragent.strip().lower() == "(random)":
        random_useragent = generate_user_agent(os=('mac', 'linux'))
        profile.set_preference("general.useragent.override", random_useragent)  # To make our useragent random
        status("Using random useragent " + random_useragent)
        return profile
    else:
        profile.set_preference("general.useragent.override", useragent)
        status("Using useragent " + useragent)
        return profile
Example 10: get_user_agent
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_user_agent(os=None, navigator=None, device_type=None):
    try:
        u = generate_user_agent(os=os, navigator=navigator, device_type=device_type)
    except Exception as e:
        u = str(e)
    return u
Example 11: get_proxies
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis
    Args:
        proxy_type (str): base url for the proxy type, like the global variables CHINA and OTHER
        ip_set (str): which redis set the ips should be stored in
        start_page (int): which page to start crawling
        end_page (int): which page to stop crawling
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    proxies, curr_proxy = [], None
    for page in xrange(start_page, end_page + 1):
        if page % 2 == 0:
            time.sleep(20)
        # get page source code
        headers = {'user-agent': generate_user_agent(), 'referer': 'http://www.xicidaili.com/'}
        text = requests.get(proxy_type + str(page), headers=headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # if u'美國' in tds[3].text:
            proxy = tds[1].text + ':' + tds[2].text
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' % (proxy, ip_set)
Example 12: get_address
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
def get_address(proxy):
    """fetch an american address from https://fakena.me/random-real-address/
    Args:
        proxy (str): proxy used to visit the target site, ip:port
    Returns:
        format_addr (str): american address in the form of "address_line # city # state # zip"
    """
    ignore_warnings()
    url = r'https://fakena.me/random-real-address/'
    referer = r'https://fakena.me'
    header = {'user-agent': generate_user_agent(), 'referer': referer}
    curr_proxy = {
        'http': 'http://%s' % proxy
    }
    text = requests.get(url, headers=header, proxies=curr_proxy).text
    pattern = re.compile('<strong>(.+)<br>(.+)</strong>')
    result = re.findall(pattern, text)
    if result:  # sometimes the result is empty
        print result[0][0], result[0][1]
        address_line = result[0][0]
        city, state_zip = result[0][1].split(',')
        state, zip = state_zip.split()
        format_addr = address_line + '#' + city + '#' + state + '#' + zip
        return format_addr
    else:
        return ''
Example 13: __http_request__async
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def __http_request__async(self, url, session):
    """ Sends an asynchronous http request to the URL address and scrapes the webpage. """
    try:
        async with session.get(url, headers={'User-Agent': generate_user_agent()}) as response:
            page_html = await response.read()
            if self.cssselect is True:
                return self.scrape_function(html.fromstring(page_html), url=url, *self.arguments)
            else:
                return self.scrape_function(page_html, url=url, *self.arguments)
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
Example 14: __async_scraper
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def __async_scraper(self):
    """ Adds the URLs to a list of tasks and requests their responses asynchronously. """
    async_tasks = []
    conn = aiohttp.TCPConnector(limit_per_host=connection_settings['CONCURRENT_CONNECTIONS'])
    timeout = aiohttp.ClientTimeout(total=connection_settings['CONNECTION_TIMEOUT'])
    async with aiohttp.ClientSession(connector=conn,
                                     timeout=timeout,
                                     headers={'User-Agent': generate_user_agent()}) as session:
        for n in self.tasks:
            async_tasks.append(self.__http_request__async(n, session))
        self.data = await asyncio.gather(*async_tasks)
Example 15: send_to_proxy_from_definition
# Required module: import user_agent [as alias]
# Or: from user_agent import generate_user_agent [as alias]
async def send_to_proxy_from_definition(running_config: RunningConfig):
    openapi3_content: dict = await openapi3_from_db(running_config.api_id)
    session_user_agent = generate_user_agent()
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
            verify_ssl=False)) as session:
        raw_endpoints = search(openapi3_content, "paths")
        query = request_generator(openapi3_content)
        resolver = ref_resolver(openapi3_content)
        endpoints = transform_tree(raw_endpoints, resolver)
        http_scheme, netloc, path, *_ = urlparse(
            running_config.api_url
        )
        host, port = split_netloc(netloc, http_scheme)
        for url, endpoint in endpoints.items():
            logger.info(f"Generating data for End Point: {url}")
            try:
                for method in ("get", "put", "post", "delete"):
                    if method in endpoint:
                        gen = query(url, method=method)
                        req: dict = next(gen)
                        break
                else:
                    raise APICheckException("Unknown method in url: ", url)
            except ValueError as ve:
                logger.error(f"cannot generate data: {ve} - {url}")
            url = f"{http_scheme}://{host}:{port}{path}{req['path']}"
            custom_headers = req["headers"]
            custom_headers["user-agent"] = session_user_agent
            fn_params = dict(
                url=url,
                headers=custom_headers,
                proxy=f"http://{running_config.proxy_ip}:"
                      f"{running_config.proxy_port}",
                skip_auto_headers=("content-type", "user-agent")
            )
            try:
                fn_params["data"] = req["body"]
            except KeyError:
                fn_params["data"] = None
            fn_method = getattr(session, req["method"])
            async with fn_method(**fn_params) as response:
                resp = await response.text()