This article collects typical usage examples of the Extractor class from Python's boilerpipe.extract module. If you are unsure what the Extractor class does, how to use it, or what it looks like in practice, the selected examples below should help.
The following presents 15 code examples of the Extractor class, ordered by popularity by default.
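All of the examples follow the same basic pattern, shown here as a minimal sketch (the URL is a placeholder; the extractor name is one of boilerpipe's built-in strategies, such as ArticleExtractor, DefaultExtractor, or KeepEverythingExtractor, all of which appear in the examples below):

from boilerpipe.extract import Extractor

# Build an extractor from a URL; pass html=... instead of url=... to work from an already-fetched page
extractor = Extractor(extractor='ArticleExtractor', url='http://example.com/article')  # placeholder URL
text = extractor.getText()   # main content as plain text, boilerplate removed
html = extractor.getHTML()   # the extracted content as an HTML fragment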
Example 1: extract_blog_posts
def extract_blog_posts(url_string, PAGES=48):
    blog_posts = []
    page_count = 0
    while page_count <= PAGES:
        page_count += 1
        url = url_string.format(page_count)  # create url
        driver.get(url)
        try:
            article = driver.find_elements_by_tag_name('article')
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise  # Not error we are looking for
            continue
        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
            for header in headers:
                article_a = header.find_elements_by_xpath("//h1/a[@title]")
                print 'extracting ...'
                for e in article_a:
                    extractor = Extractor(extractor='ArticleExtractor', url=e.get_attribute('href'))
                    texts = extractor.getText()
                    blog_posts.append({'title': e.text, 'content': clean_html(texts), 'link': e.get_attribute('href')})
    return blog_posts
Example 2: parse
def parse(self, response):
    hxs = Selector(response)
    item = ArticleItem()
    item["title"] = hxs.xpath('//title/text()').extract()
    item["link"] = response.url
    item["source"] = hxs.xpath('//p').extract()
    extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
    source = extractor.getHTML()
    item["text"] = extractor.getText()
    item["html"] = source
    page = html.fromstring(source)
    links = page.xpath("//p//a/@href")
    linkPattern = re.compile(r"^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
    for link in links:
        if linkPattern.match(link) and link not in self.crawled_links:
            self.crawled_links.append(link)
            yield Request(link, self.parse)
    yield item
Example 3: get_text
def get_text(url):
    from boilerpipe.extract import Extractor
    try:
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "", ""
Example 4: GOOGLE_get_data
def GOOGLE_get_data(company):
    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)
    content_list = list()
    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                             "article": content,
                             "link": link,
                             "source": "GOOGLE",
                             "target": company,
                             "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                             "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
    DBOperation.save_db(content_list)
Example 5: get_articles
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
Example 6: detag_html_file
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor
    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)
    tempfile = "%s.tmp.html" % (infile,)  # boilerpipe seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://" + tempfile)
        os.unlink(tempfile)
        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()
        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n")
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")
        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass
        return detag_html_file_bs(infile, outfile, id)
Example 7: parse_item
def parse_item(self, response):
    response_news = NewsItem()
    response_news['url'] = response.url
    response_news['html'] = Binary(zlib.compress(response.body, 9))
    extractor = Extractor(extractor='ArticleExtractor', html=response.body)
    response_news['content'] = extractor.getText()
    return response_news
Example 8: extract_and_save
def extract_and_save(url, path):
    try:
        handle = urllib2.urlopen(url)
        html_content = handle.read()
        extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
        text = extractor.getText()
        if text:
            if detect_english(text):
                links = get_all_urls(html_content, url)
                for link in links:
                    try:
                        handle = urllib2.urlopen(link)  # fetch each linked page
                        html_content = handle.read()
                        #extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
                        #text_content = extractor.getText()
                        #if text_content:
                        #    if detect_english(text_content):
                        encoded_url = encode(link)
                        f = open(path + "/" + encoded_url, "w")
                        f.write(html_content)
                        f.close()
                    except:
                        print url
                        traceback.print_exc()
                        return None
    except:
        print url
        traceback.print_exc()
        return None
Example 9: download_article_file
def download_article_file(articleURL, articleFileDirectory, code):
    articleFilePath = articleFileDirectory + code
    # Download the article and save as file
    if (articleURL == ""):
        print "ERROR: Empty URL detected! File not created"
        return None
    else:
        # If a directory for files doesn't exist, create it
        dir = os.path.dirname(articleFileDirectory)
        if not os.path.isdir(dir):
            #print "Created directory: " + dir
            os.makedirs(dir)
        try:
            #fullArticle = urllib2.urlopen(articleURL)
            #fullArticleText = fullArticle.read()
            # Use boilerpipe to remove boilerplate and formatting
            extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
            fullArticleText = extractor.getText()
            # Test to see if article is in English. If not, then return None
            top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
            if (top_language != 'ENGLISH'):
                print "SKIPPED: Article is in " + top_language
                return None
            outfile = open(articleFilePath, 'w+')
            outfile.write(fullArticleText.encode('ascii', 'ignore'))
            outfile.close()
            # Use lxml's HTML cleaner to remove markup
            #htmltree = lxml.html.fromstring(fullArticleText)
            #cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
            #cleaned_tree = cleaner.clean_html(htmltree)
            #return cleaned_tree.text_content()
            return fullArticleText
        except urllib2.HTTPError:
            print "ERROR: HTTPError. Article file download skipped: " + articleURL
            return None
        except urllib2.URLError:
            print "ERROR: URLError. Article file download skipped: " + articleURL
            return None
        except LookupError:
            print "ERROR: LookupError. Article file download skipped: " + articleURL
            return None
        except UnicodeDecodeError:
            print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
            return None
        except:
            print "ERROR: ", sys.exc_info()[0]
            return None
Example 10: main
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()
    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')
    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0
        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1
    fout.close()
    print empty_cnt, err_cnt
Example 11: get_text_boilerpipe
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
Example 12: parse_page
def parse_page(self, response):
    if response.meta.has_key('crawldepth'):
        depth = response.meta['crawldepth']
    else:
        # Set search depth here
        depth = 1
    log.msg('Depth = %s' % str(depth), level=log.INFO)
    if not isinstance(response, HtmlResponse):
        log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
        return
    log.msg('Response from: %s' % response.url, level=log.INFO)
    url_bf.add(response.url)
    # TODO: Extract page title
    extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
    cleaned_text = extractor.getText()
    # Eliminate duplicates
    keywordset = set(keywordlist)
    found_list = []
    for keyword in keywordset:  # TODO: Is there a more efficient way to do this?
        # Look at word boundaries to match entire words only
        if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
            found_list.append(keyword)
    # Parse this page
    item = BiffleItem()
    if (len(found_list) > 0):
        item['url'] = response.url
        item['body'] = cleaned_text
        item['keywords'] = ', '.join(found_list)
        item['process_date'] = datetime.today()
        log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
        self.map_keyword_count(found_list)
        yield item
    if (depth > 0):
        # Find the next requests and yield those
        hxs = HtmlXPathSelector(response)
        links = hxs.select('//a/@href').extract()
        log.msg('Links on page: %s' % len(links), level=log.INFO)
        depth -= 1
        log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
        for l in links:
            l = urlparse.urljoin(response.url, l)
            if (l in url_bf):
                pass
                #log.msg('Duplicate URL found: %s' % l, level=log.INFO)
            else:
                url_bf.add(l)
                #log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
                # Decrement depth for next layer of links
                #callback = lambda response, depth = depth: self.parse_page(response, depth)
                callback = lambda response: self.parse_page(response)
                request = Request(l, callback=callback)
                request.meta['crawldepth'] = depth
                yield request
Example 13: extractor
def extractor(URL):
    extractor = Extractor(extractor='ArticleExtractor', url=URL)
    data = extractor.getText()
    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()
    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
        sentences = s.split('.')
    # Empty word list
    w = []
    # Split the sentences into words
    for sentence in sentences:
        w.extend(sentence.split(' '))
    print w
    # Return the word list
    return w
Example 14: run
def run(self):
    count = 0
    docCount = self.doc_cursor.count()
    for doc in self.doc_cursor:
        url = doc['url']
        if (self.keepText(url)):
            try:
                extractor = Extractor(extractor='ArticleExtractor', url=url)
                extracted_text = extractor.getText()
                if (len(extracted_text) > 0):
                    title = extractor.getTitle()
                    if title != None:
                        doc['title'] = title
                        doc['extracted_text'] = title + " " + extracted_text
                    else:
                        doc['extracted_text'] = extracted_text
                    self.db_collection.save(doc)
                    print 'OK -' + url
            except IOError, err:
                print "IOError with url " + url
                print str(err)
            except (LookupError):
                print "LookupError - Maybe not text or weird encoding " + url
            except (UnicodeDecodeError, UnicodeEncodeError):
                print "UnicodeDecodeError or UnicodeEncodeError- " + url
Example 15: process_text
def process_text(self, text):
    if text == "":
        return text
    extractor = Extractor(extractor='ArticleExtractor',
                          html=text)
    new_val = extractor.getText()
    return new_val