This article collects typical usage examples of the Python method boilerpipe.extract.Extractor.getHTML. If you have been wondering what exactly Extractor.getHTML does, or how to use it, the curated code examples below should help. You can also explore further usage examples of the containing class, boilerpipe.extract.Extractor.
The following presents 14 code examples of the Extractor.getHTML method, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
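Before working through the examples, here is a minimal, self-contained sketch of the call pattern they all share; the URL below is only a placeholder, not taken from any of the examples.

from boilerpipe.extract import Extractor

# Minimal sketch: run the ArticleExtractor over a page and read back both
# the cleaned HTML fragment and the plain-text version of the main content.
extractor = Extractor(extractor='ArticleExtractor', url='http://example.com/some-article')  # placeholder URL
extracted_html = extractor.getHTML()   # main content as an HTML fragment
extracted_text = extractor.getText()   # same content as plain text
print(len(extracted_html), len(extracted_text))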
Example 1: get_articles
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
Example 2: detag_html_file
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor
    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)
    tempfile = "%s.tmp.html" % (infile,)  # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)
        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()
        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)
Example 3: parse
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()
    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source
    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
    for link in links:
        if linkPattern.match(link) and link not in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)
    yield item
Example 4: get_text
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""
Example 5: test_boilerpipe
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def test_boilerpipe():
    your_url = "http://stackoverflow.com/questions/9352259/trouble-importing-boilerpipe-in-python"
    extractor = Extractor(extractor='ArticleExtractor', url=your_url)
    extracted_html = extractor.getHTML()
    extracted_text = extractor.getText()
    print '\nfunction: %s ' % inspect.stack()[0][3]
    print 'extracted html: %i text: %i' % (len(extracted_html), len(extracted_text))
    print ''
    n.assert_greater(len(extracted_text), min_str_length)
Example 6: Text_extractor
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def Text_extractor(y, page, team, team_i):
    """Extract the text of team pages using BoilerPipe."""
    upage = urllib.quote_plus(page)
    url = "http://" + y + ".igem.org/wiki/index.php?title=" + upage
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    f = open('results/%s/%s/%s_-_-_CONTENT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getHTML())
    f.close()
    f = open('results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#')), 'w')
    f.write(extractor.getText())
    f.close()
    path = 'results/%s/%s/%s_-_-_TEXT.html' % (y, team, page.replace('/', '#'))
    # text = text.replace('\\n', '\\\\n')
    output = '%s\t%s\t%s\t%s\n' % (y, str(teams_id[team_i]), page, path)
    teams_pages_text_db.write(output)
Example 7: parse
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def parse(self, response):
    for article in response.xpath('//channel/item'):
        item = ArticleItem()
        # Grab the title and the link to the article
        item["title"] = article.xpath("title/text()").extract()
        item["link"] = article.xpath("link/text()").extract()
        item["date"] = article.xpath("pubDate/text()").extract()
        link = item["link"][0]
        extractor = Extractor(extractor='ArticleExtractor',
                              url=link)
        item["text"] = extractor.getText()
        item["html"] = extractor.getHTML()
        # Grab the source of the page by making another Request
        yield Request(link, callback=self.parse_link, meta=dict(item=item))
Example 8: extract_boilerpipe
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def extract_boilerpipe(self, html):
    """
    Extract an article with Boilerpipe.

    NOTE: This is an optional method as
    boilerpipe is dependency-heavy and
    will be potentially cumbersome
    to run on manta.
    """
    try:
        from boilerpipe.extract import Extractor
    except ImportError:
        return
    bp_extract = Extractor(html=html)
    return bp_extract.getHTML()
Example 9: extract_content
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def extract_content(page_id, ext_id, htmlReturn=False):  # htmlReturn=False: by default returns text content
    if page_id in (None, "") or ext_id in (None, ""): return badrequest()
    page = Page.get_page(page_id)
    if page is None: return documentnotfound()
    extraction = Extraction.get_extraction(ext_id)
    if extraction is None: return documentnotfound()
    original_content = page.content
    if original_content is None or original_content == "": return nocontent()
    # boilerpipe runs inside a JVM; make sure this thread is attached before extracting
    if not jpype.isThreadAttachedToJVM():
        jpype.attachThreadToJVM()
    extractor = Extractor(extractor='DefaultExtractor', html=original_content)
    if not htmlReturn:
        bp_content = extractor.getText()
    else:
        bp_content = extractor.getHTML()
    if bp_content is None: return nocontent()
    extraction.update(bp_content=bp_content)
    return success()
Example 10: ExtractPolicyTextWithBoilerpipe
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def ExtractPolicyTextWithBoilerpipe(policyUrl, extractorType='ArticleExtractor', verbose=False, minLinesPerPolicy=30):
    if verbose:
        if policyUrl == '-':
            print 'ExtractPolicyTextWithBoilerpipe called with policyUrl = {0}. do nothing.'.format(policyUrl)
        else:
            print 'extracting policy text from {0} using {1}'.format(policyUrl, extractorType)
    # trivial return
    if policyUrl == '-':
        return (None, None)
    try:
        if policyUrl.startswith('http'):
            extractor = Extractor(extractor=extractorType, url=policyUrl)
        # the policyUrl may also be a local file path
        else:
            contentFile = open(policyUrl, 'r')
            extractor = Extractor(extractor=extractorType, html=contentFile.read().decode('utf8'))
        html = extractor.getHTML()
        text = extractor.getText()
        if len(text.split(u'\n')) > minLinesPerPolicy:
            if verbose:
                print 'OK'
            text = text.replace(u'\n', u' ')
            return (text, html)
        elif len(text) > 0 and len(html) > 0:
            print 'Policy {1} ignored. Number of paragraphs in extracted policy is less than {0}.'.format(minLinesPerPolicy, policyUrl)
            return (None, None)
        else:
            print 'boilerpipe extracted nothing from {0}'.format(policyUrl)
            return (None, None)
    except TypeError as e:
        print 'TypeError thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except socket.error as e:
        print 'socket.error thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except BadStatusLine as e:
        print 'httplib.BadStatusLine thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except IncompleteRead as e:
        print 'httplib.IncompleteRead thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except LookupError as e:
        print 'LookupError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except UnicodeDecodeError as e:
        print 'UnicodeDecodeError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except ValueError as e:
        print 'ValueError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except urllib2.HTTPError as e:
        print 'HTTPError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except urllib2.URLError as e:
        print 'URLError using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
    except socket.timeout as e:
        print 'socket.timeout thrown while using boilerpipe to extract {0}: {1}'.format(policyUrl, e)
        return (None, None)
Example 11: update_content_by_url
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def update_content_by_url(self):
    from boilerpipe.extract import Extractor
    extractor = Extractor(extractor='ArticleExtractor', url=self.url)
    self.content_html = extractor.getHTML()
    self.content_text = extractor.getText()
Example 12: get_basic
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
def get_basic():
    url = request.args.get('url')
    extractor = Extractor(extractor='ArticleExtractor', url=url)
    return extractor.getHTML()
Example 13: Extractor
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
elif obj == 'num' and args.result != "":
    query += source[obj]
    query += args.result
elif obj == 'lang' and args.lang != "":
    query += source[obj]
    query += args.lang
#elif obj == 'sortby' and args.sortby != "":
#    query += source[obj]
#    query += args.sortby
else:
    query += source[obj]
    query += source[obj+'_def']
# retrieve HTML page of the URL source
try:
    extractor = Extractor(extractor='ArticleExtractor', url=query)
    extracted_html = extractor.getHTML()
except:
    e = sys.exc_info()[0]
    print("\n***ERROR (in main.py, extractor 1): " + str(e))
    # sleep for 4 seconds before trying to crawl again, otherwise you will be identified and blocked
    time.sleep(4)
    continue
# retrieve URLs from the HTML page
doc = lxml.html.document_fromstring(extracted_html)
urlList = list()
for url in doc.xpath(XPATH):
    url_tmp = str(url.attrib.get('href'))
    if 'http' not in url_tmp:
        url_tmp = source['url'] + url_tmp
    urlList.append(url_tmp)
Example 14: Document
# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getHTML [as alias]
    html = urllib.urlopen(eachurl).read()
    content = Document(html).summary()
    title = Document(html).short_title()
except:
    print 'Failed URl %s' % eachurl
    content = '_'
    title = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))
############################################################################################
print 'Boilerpipe...'
try:
    article = Extractor(url=eachurl)
    title = '_'
    #title = article.getTitle()
    content = article.getHTML()
except:
    print 'Failed URl %s' % eachurl
    content = '_'
    title = '_'
body_score[-1].append(fscore(word_tokenize(content), data))
title_score[-1].append(fscore(word_tokenize(title), title_true))
######################################################################################
print 'libextract...'
# html = urllib.urlopen(eachurl).read()
textnodes = list(extract(html))
try:
    content = ' '.join(each.text_content() for each in textnodes[:5])
except:
    print 'Not combining unicode %s' % eachurl
    content = '_'