This article collects typical usage examples of mechanize.Browser.addheaders in Python (an attribute of the Browser class that holds the extra headers sent with every request). If you are wondering what Browser.addheaders is for, or how to use it in practice, the curated code samples below may help. You can also read further about the containing class, mechanize.Browser.
The following presents 15 code examples of Browser.addheaders, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
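Before the examples, here is a minimal sketch of the pattern they all share: assign a list of (header-name, value) tuples to the addheaders attribute so that every request the Browser sends carries those headers. The URL and user-agent string below are placeholders chosen for illustration, not taken from any of the examples.

from mechanize import Browser

br = Browser()
# addheaders is an attribute, not a method call: assign a list of (name, value) tuples
br.addheaders = [('User-Agent', 'Mozilla/5.0 (example placeholder)')]
# Most of the examples below also disable robots.txt handling before opening a page
br.set_handle_robots(False)
response = br.open('http://example.com')  # placeholder URL
print(response.read()[:200])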
Example 1: get_data
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def get_data():
    html = scraperwiki.scrape(edd_url)
    process_ex_dividend_data(html)
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(edd_url)
    links = {}
    for link in br.links():
        if link.text in ['2', '3', '4']:
            links[link.text] = link.url
    for k, link in links.items():
        m = re.search(edd_pat, link)
        br = Browser()
        br.set_handle_robots(False)
        br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open(edd_url)
        br.select_form(nr=0)
        br.set_all_readonly(False)
        br["__EVENTTARGET"] = m.group(1)
        br["__EVENTARGUMENT"] = ''
        for c in br.controls:
            if c.type == 'submit':
                c.disabled = True
        response = br.submit()
        process_ex_dividend_data(response.read())
Example 2: get_google_news_by_url
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def get_google_news_by_url(url):
    # Construct browser object
    browser = Browser()
    ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.81 Safari/537.36'
    browser.addheaders = [('User-Agent', ua), ('Accept', '*/*')]
    # Do not observe rules from robots.txt
    browser.set_handle_robots(False)
    # Parse the HTML document into an lxml tree
    html = fromstring(browser.open(url).read())
    # Get the number of result pages
    xpath_pages = '//a[@class="fl"]'
    page_num = len(html.xpath(xpath_pages)) + 1
    # Build the URL of every result page
    urls = generate_url_pages(url, page_num)
    print 'On ' + str(len(urls)) + ' pages:'
    df = [None] * page_num
    # Iterate through all pages of this url
    for index, url in enumerate(urls):
        page_html = fromstring(browser.open(url).read())
        df[index] = get_google_news_in_page(page_html)
    return pd.concat(df, ignore_index=True)
Example 3: getRandomXKCDComic
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def getRandomXKCDComic(urlBase):
    br = Browser()
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6; en-us) AppleWebKit/531.9 (KHTML, like Gecko) Version/4.0.3 Safari/531.9')]
    br.set_handle_robots(False)
    # XKCD comics are numbered sequentially by URL: http://www.xkcd.com/1, http://www.xkcd.com/2, ..., http://www.xkcd.com/n
    upperBound = 1
    lowerBound = 1
    # Double the upper bound until the address no longer exists
    while True:
        link = urlBase + str(upperBound) + "/"
        try:
            response = br.open(link)
        except:
            break
        lowerBound = upperBound
        upperBound = upperBound * 2
    # Binary search for the last comic
    while True:
        pivot = (upperBound + lowerBound) / 2
        link = urlBase + str(pivot) + "/"
        if lowerBound == upperBound or pivot == lowerBound:
            randomComicID = random.randint(1, pivot)
            randPageLink = urlBase + str(randomComicID) + "/"
            return br.open(randPageLink)
        try:
            response = br.open(link)
            lowerBound = pivot
        except:
            upperBound = pivot
Example 4: scrape
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def scrape(self):
    """
    Opens the html page and parses the pdf links.
    """
    browser = Browser()
    # -----------
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    values1 = {'name': 'Michael Foord',
               'location': 'Northampton',
               'language': 'Python'}
    headers = {'User-Agent': user_agent}
    browser.set_handle_redirect(True)
    browser.set_handle_referer(True)
    browser.set_handle_robots(False)
    browser.addheaders = [('User-Agent', 'Firefox')]
    # -------------
    browser.set_handle_robots(False)
    html = browser.open(self.site)
    lines = html.read().splitlines()
    for line in lines:
        urls = re.findall('<a href="?\'?([^"\'>]*)', line)
        for url in urls:
            if '.pdf"' in url:
                self.pdf_urls.append(url)
Example 5: find_first_article
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def find_first_article():
    mech = Browser()
    cj = cookielib.LWPCookieJar()
    mech.set_handle_equiv(True)
    # mech.set_handle_gzip(True)
    mech.set_handle_redirect(True)
    mech.set_handle_referer(True)
    mech.set_handle_robots(False)
    # mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    mech.addheaders = [
        (
            "User-agent",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
        )
    ]
    page = mech.open("https://bitcointalk.org/index.php?board=77.0")
    html = page.read()
    soup = BeautifulSoup(html)
    first_article_tag = soup.find("td", class_="windowbg")
    global startingpost
    startingpost = first_article_tag.span.a.get("href")
    print startingpost
Example 6: extract_article_url
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def extract_article_url(posturl):
    mech = Browser()
    cj = cookielib.LWPCookieJar()
    mech.set_handle_equiv(True)
    # mech.set_handle_gzip(True)
    mech.set_handle_redirect(True)
    mech.set_handle_referer(True)
    mech.set_handle_robots(False)
    # mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    mech.addheaders = [
        (
            "User-agent",
            "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1",
        )
    ]
    page = mech.open(posturl)
    html = page.read()
    global soup
    soup = BeautifulSoup(html)
    global articleURL
    # print soup.prettify()
    for item in soup.find_all("div", class_="post"):
        for link in item.find_all("a"):
            string = link.get("href")
            if prog.match(string):
                # find the link that points to the article (a link outside of the bitcointalk.org forum)
                articleURL = link.get("href")
                return link.get("href")
    return "No article url"
Example 7: login_url
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def login_url(
    url,
    login,
    passwd,
    form_nomber,
    login_name,
    paswd_name,
    submit_nomber
):
    br = Browser(); showMessage('Создаю интерфейс браузера')
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.open(url); showMessage('Загружаю сайт и произвожу вход')
    br.select_form(nr=form_nomber)
    br[login_name] = login
    br[paswd_name] = passwd
    res = br.submit(nr=submit_nomber)
    content = res.read()
    # determine the number of pages
    maxPage = int(max_page(content)); showMessage('Определяю количество страниц и перехожу на последнюю')
    curPage = 84
    while curPage < maxPage:
        res = br.open('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
        curPage = maxPage
        maxPage = int(max_page(content))
        content = res.read()
    # parse the keys
    if get_all_keys(content):
        webbrowser.open_new_tab('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))  # returns True and opens a new tab
Example 8: parseFeeds
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def parseFeeds(self):
    mech = Browser()
    mech.addheaders = [('User-agent', 'Mozilla/5.0 (compatible)')]
    mech.set_handle_robots(False)
    for url in self.feedUrls:
        # url = "http://feeds.feedburner.com/PurdueEngNews?format=xml"
        page = mech.open(url)
        html = page.read()
        soup = BeautifulStoneSoup(html)
        headlines = []
        descriptions = []
        i = 0
        self.newsList = []
        for item in soup.findAll('item'):
            if (i > 20):
                break
            date = item.find('pubdate')
            title = item.find('title')
            link = item.find('link')
            desc = item.find('description')
            if (len(title.contents) > 0):
                title2 = title.contents[0]
            else:
                title2 = 'None'
            self.newsList.append(NewsStory(date.contents[0], title2, link.contents[0],
                                           desc.contents[0]))
            i += 1
        for story in self.newsList:
            headlines.append(story.title)
            descriptions.append(story.link)
            # story.display()
        self.headlineList.append(headlines)
        self.descList.append(descriptions)
    self.populateTopicList()
Example 9: scrap_query
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def scrap_query(query, bang=None):
    r = ddg_query('imbd ' + query, bang=bang)
    if 'redirect' in dir(r) and 'primary' in dir(r.redirect):
        url = r.redirect.primary
    else:
        logger.info('Could not find imdb searchpage from DuckDuckGo bang')
        return None
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2;\
        WOW64) AppleWebKit/537.11 (KHTML, like Gecko)\
        Chrome/23.0.1271.97 Safari/537.11')]
    r = br.open(url)
    soup = BeautifulSoup(r)
    for link in soup.find_all('a'):
        href = link.get('href', '')
        match = re.search(r"imdb\.com/.*tt(?P<number>[^/]*)", href)
        if match:
            imdb_id = check_imdb(match.group('number'))
            return imdb_id
    return None
Example 10: respond
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def respond(permalink, text):
    br = Browser()
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1'
    br.addheaders = [('User-agent', user_agent)]
    soup = BeautifulSoup(br.open(permalink).read())
    urlopen = urllib2.urlopen
    Request = urllib2.Request
    encode = urllib.urlencode
    cj = cookielib.LWPCookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    root_comment = soup.find('form', attrs={'class': 'usertext border'})
    thing_id = root_comment.find('input', attrs={'name': 'thing_id'})['value']
    print 'thing_id', thing_id
    # Log in
    req = Request('http://www.reddit.com/api/login/username', encode({'user': 'acid-trip-bot', 'passwd': 'hackny', 'api_type': 'json'}), {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.loads(req_open.read())
    modhash = read['json']['data']['modhash']
    # Post the comment
    req = Request('http://www.reddit.com/api/comment', encode({'thing_id': thing_id, 'text': text + '\n\n*This is an automated response.*', 'uh': modhash}), {'User-Agent': user_agent})
    req_open = urlopen(req)
    read = json.dumps(req_open.read())
Example 11: scrape_info
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def scrape_info():
    browser = Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    parkIds = []
    for name in config['names']:
        browser.open("https://www.recreation.gov")
        browser.select_form(nr=0)
        browser['locationCriteria'] = name
        response = browser.submit()
        content = response.read()
        soup = BeautifulSoup(content, 'html.parser')
        scripts = soup.select('script')
        for script in scripts:
            if 'SuggestedPlaces' in str(script):
                jsonStr = str(script).strip('<script>var SuggestedPlaces = ').strip(';</script>')
                places = json.loads(jsonStr)
                query = urlparse.parse_qs(places[0]['value'])
                if 'parkId' in query:
                    print('FOUND!: ' + unicode(query['parkId'][0]))
                    parkIds.append(unicode(query['parkId'][0]))
                else:
                    print('No results for ' + name + ': ' + places[0]['value'])
    pprint(parkIds)
Example 12: createbrowser
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def createbrowser(self):
    br = Browser()
    br.set_handle_gzip(True)
    br.set_handle_robots(False)
    br.set_handle_redirect(True)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 5_1 like Mac OS X; en-US) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B179 Safari/7534.48.3')]
    return br
Example 13: searchTitle
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def searchTitle(rawtitle):
    br = Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    # Google demands a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("http://www.google.com")
    br.select_form('f')
    s = 'imdb' + ' + ' + ' '.join(re.compile('[\.]').split(rawtitle))
    br.form['q'] = s
    br.submit()
    resp = None
    for link in br.links():
        siteMatch = re.compile('www.imdb.com/title/tt[0-9]*/$').search(link.url)
        if siteMatch:
            resp = br.follow_link(link)
            print link.url
            break
    soup = BeautifulSoup(resp.get_data())
    title = re.sub(' - IMDb', '', soup.find('title').string)
    title = re.sub('\([0-9]*\)', '', title)
    return title
Example 14: generateSentence
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def generateSentence(var):
    br = Browser()
    br.set_handle_robots(False)
    br.set_handle_equiv(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    url = 'http://www.oxforddictionaries.com/definition/english/' + str(var)
    # url = 'https://www.google.co.in/search?q=define+utilitarian'
    try:
        br.open(url)
    except:
        print "what word is this, man? " + var
        return
    soup = BeautifulSoup(br.response().read())
    sentence = ""
    counter = 0
    for i in soup.find_all('ul', {'class': 'sentence_dictionary'}):
        if i is not None:
            soup2 = BeautifulSoup(str(i))
            for j in soup2.find_all('li', {'class': 'sentence'}):
                if j is not None:
                    sentence = sentence + str(counter + 1) + ") " + j.string.replace(',', ' ').strip() + "\n"
                    counter += 1
                    if counter == 2:
                        return sentence
    return sentence
Example 15: process
# Required import: from mechanize import Browser [as alias]
# Or: from mechanize.Browser import addheaders [as alias]
def process(time):
    br = Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    # Google demands a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    br.open("http://heasarc.gsfc.nasa.gov/cgi-bin/Tools/xTime/xTime.pl")
    br.select_form("form")
    br["time_in_i"] = time  # Enter your time here in the format "2015-06-27 04:23:23.68"
    response = br.submit()
    html = response.read()
    soup = BeautifulSoup(html)
    table = soup.find("table", border=5)
    g = table.findAll('tr')
    row = g[7]  # Select the correct row
    cols = row.findAll('td')
    value = cols[1].string  # This is the MET time
    return value