

Python Extractor.getText Method Code Examples

This article collects typical usage examples of the Python method boilerpipe.extract.Extractor.getText. If you have been wondering exactly how Extractor.getText is used, how to call it, or what real-world examples look like, the curated code samples here may help. You can also explore further usage examples of the containing class, boilerpipe.extract.Extractor.


The sections below present 15 code examples of the Extractor.getText method, sorted by popularity by default.
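
Before the individual examples, here is a minimal sketch of the pattern they all share, included for orientation only: construct an Extractor with an extractor strategy name (for example 'ArticleExtractor' or 'KeepEverythingExtractor') and either a url= or an html= argument, then call getText() to obtain the extracted plain text. The URL and HTML string in this sketch are hypothetical placeholders; only the API calls themselves are taken from the examples below.

# A minimal usage sketch, assuming the python-boilerpipe package is installed.
from boilerpipe.extract import Extractor

# Extract the main article text directly from a URL (placeholder URL).
extractor = Extractor(extractor='ArticleExtractor', url='http://example.com/some-article')
text = extractor.getText()   # plain text with boilerplate (navigation, ads, footers) removed
print(text)

# The same call also works on HTML that has already been downloaded.
html = u'<html><body><h1>Title</h1><p>Body text.</p></body></html>'
extractor = Extractor(extractor='KeepEverythingExtractor', html=html)
print(extractor.getText())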

Example 1: build

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
    def build(self, urls, docs=None, extracted_cache=False):
        """ """
        indexed_docs = []
        doc_id = 0
        # ---------------------------------
        if docs and len(urls) != len(docs):
            raise ValueError

        for i,URL in enumerate(urls):
            doc = StrictIndexDocument()
            doc.id, doc.url = doc_id, URL
            doc_id += 1

            if os.path.exists('./data/extracted-'+ str(doc_id) +'-cached.txt'):
                with codecs.open('./data/extracted-'+ str(doc_id) +'-cached.txt', 'r', encoding='utf-8') as f:
                    extr_text = f.read()
            else:
                if docs:
                    extractor = Extractor(extractor='ArticleExtractor', html=docs[i])
                else:
                    extractor = Extractor(extractor='ArticleExtractor', url=URL)
                extr_text = extractor.getText()
                # ---------------------------------
                if len(extr_text) < len(docs[i]) * self.threshold_extractor_fails:
                    if docs:
                        exKeepAll = Extractor(extractor='KeepEverythingExtractor', html=docs[i])
                    else:
                        exKeepAll = Extractor(extractor='KeepEverythingExtractor', url=URL)
                    extr_text = exKeepAll.getText()
                # ---------------------------------
                if extracted_cache:
                    with codecs.open('./data/extracted-'+ str(doc_id) +'-cached.txt', 'w', encoding='utf-8') as f:
                        print >>f, extr_text

            doc.words = self.extract_words(extr_text)
            doc.n_words = len(doc.words)

            paragraph = self.re_set_paragraph.sub(u' ', extr_text)
            sentences, poss = self.SS.predict(paragraph)

            # with codecs.open(u'data/outs.txt', 'a',encoding='utf-8') as f_out:
            #      print >>f_out, u'\n\n-----\n', u','.join( [ str(p) for p in poss ] )
            #      # print >>f_out, u','.join( [ str(p) for p in poss2 ] )
            #      for s in sentences:
            #          print >>f_out, s, '\n'

            poss = [0] + list(poss)
            doc.sentences = [ (poss[i], poss[i+1], sentences[i]) for i in xrange(len(poss)-1) ]
            doc.n_sentences = len(doc.sentences)

            indexed_docs.append(doc)
        return indexed_docs
Author: favorart, Project: InfoSearch3, Lines: 54, Source: index_builder.py

Example 2: get_articles

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def get_articles(url):
    doc = urllib.request.urlopen(url)
    docContent = BeautifulSoup(doc, 'html.parser')
    articles = []
    for element in docContent.find_all('div'):
        try:
            if element.attrs['style'] == 'width:550px':
                article = defaultdict(str)
                article_link = 'http://www.moneycontrol.com' + element.a['href']
                for p in element.find_all('p'):
                    if 'a_10dgry' in p.attrs['class']:
                        article_time = p.contents[0].split('|')[0]
                        article_date = p.contents[0].split('|')[1][:-1]
                        article['link'] = article_link
                        article['time'] = article_time
                        article['date'] = article_date
                        extractor = Extractor(extractor='ArticleExtractor',
                                              url=article_link)
                        article['content'] = extractor.getText()
                        article['title'] = BeautifulSoup(extractor.getHTML(),
                                                         'html.parser').find_all('h1')[0].contents[0]
                        articles.append(article)
                        break
        except:
            logging.debug('div has no width attribute')
    return articles
Author: aknirala, Project: crawler, Lines: 28, Source: crawler.py

Example 3: parse

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
    def parse(self, response):
        hxs = Selector(response)
        
        item = ArticleItem()
        item["title"] = hxs.xpath('//title/text()').extract()
        item["link"] = response.url
        item["source"] = hxs.xpath('//p').extract()
        
        extractor = Extractor(extractor='ArticleExtractor', url=item["link"])
        
        source = extractor.getHTML()
        item["text"] = extractor.getText()
        item["html"] = source
        
        page = html.fromstring(source)
        links = page.xpath("//p//a/@href")

        linkPattern = re.compile("^(?:ftp|http|https):\/\/(?:[\w\.\-\+]+:{0,1}[\w\.\-\+]*@)?(?:[a-z0-9\-\.]+)(?::[0-9]+)?(?:\/|\/(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+)|\?(?:[\w#!:\.\?\+=&%@!\-\/\(\)]+))?$")
        
        for link in links:
            if linkPattern.match(link) and not link in self.crawled_links:
                self.crawled_links.append(link)
                yield Request(link, self.parse)
        

        yield item
Author: KanwarGill, Project: NewsInvestigator, Lines: 28, Source: article_spider.py

Example 4: extract_blog_posts

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def extract_blog_posts(url_string, PAGES = 48):
    blog_posts = []
    page_count = 0
    
    while(page_count<=PAGES):
        page_count+=1
        url = url_string.format(page_count) # create url
        driver.get(url)
        
        try:        
            article = driver.find_elements_by_tag_name('article')        
            articles_size = len(article)
            print 'processing ', url
        except SocketError as e:
            if e.errno != errno.ECONNRESET:
                raise # Not error we are looking for
            continue
            
        for i in xrange(articles_size):
            headers = article[i].find_elements_by_tag_name("header")
        for header in headers:
            article_a = header.find_elements_by_xpath("//h1/a[@title]")
        print 'extracting ...'             
        for e in article_a:
            extractor = Extractor(extractor = 'ArticleExtractor', url = e.get_attribute('href'))
            texts = extractor.getText()    
            
            blog_posts.append({'title': e.text, 'content': clean_html(texts), 'link': e.get_attribute('href')})
    return blog_posts
Author: ehnaton, Project: piton, Lines: 31, Source: processing+grammarly+webblog.py

Example 5: GOOGLE_get_data

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def GOOGLE_get_data(company):

    google_news_rss_url = "https://news.google.com/news/?q=%s&output=rss" % company
    rss_feed = feedparser.parse(google_news_rss_url)

    content_list = list()

    for entry in rss_feed['entries']:
        title = entry['title']
        link = entry['link']
        try:
            news_page = urllib2.urlopen(link).read()
            extractor = Extractor(extractor='ArticleExtractor', html=news_page)
        except:
            continue
        content = extractor.getText()
        now = datetime.datetime.now()
        content_list.append({"title": title,
                            "article": content,
                            "link": link,
                            "source": "GOOGLE",
                            "target": company,
                            "date": "%04d%02d%02d" % (now.year, now.month, now.day),
                            "hash": hashlib.sha224(title.encode("UTF-8")).hexdigest()})
                            

    DBOperation.save_db(content_list)
Author: happinesstaker, Project: FYP, Lines: 29, Source: crawler_google.py

Example 6: run

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
	def run(self):
		count = 0
		docCount = self.doc_cursor.count()
		for doc in self.doc_cursor:
			url = doc['url']
			if (self.keepText(url)):
				try:
					extractor = Extractor(extractor='ArticleExtractor', url=url)
					extracted_text = extractor.getText()
				
					if (len(extracted_text) > 0):
						title = extractor.getTitle()
						
						if title != None:
							doc['title'] = title
							doc['extracted_text'] = title + " " + extracted_text
						else:
							doc['extracted_text'] = extracted_text
						self.db_collection.save(doc)
						print 'OK -' + url
				except IOError, err:
					print "IOError with url " + url
					print str(err)
				except (LookupError):
					print "LookupError - Maybe not text or weird encoding " + url
				except (UnicodeDecodeError, UnicodeEncodeError):
					print "UnicodeDecodeError or UnicodeEncodeError- " + url
Author: c4fcm, Project: Terra-Incognita, Lines: 29, Source: batch_extractor.py

Example 7: get_text

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def get_text(url):
    from boilerpipe.extract import Extractor
    try :
        extractor = Extractor(extractor='DefaultExtractor', url=url)
        return extractor.getText(), extractor.getHTML()
    except:
        return "",""
Author: PhilippeNguyen, Project: COMP598Ass1, Lines: 9, Source: data_extractor.py

Example 8: get_text_boilerpipe

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def get_text_boilerpipe(html_text):
    try:
        extractor = Extractor(extractor='ArticleExtractor', html=html_text)
        return extractor.getText()
    except:
        print "Exception"
        return None
Author: ViDA-NYU, Project: memex, Lines: 9, Source: extract.py

Example 9: extract_and_save

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def extract_and_save(url, path):
	try:
		handle = urllib2.urlopen(url)
		html_content = handle.read()
		extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)
		text = extractor.getText()
		if text:
			if detect_english(text):
				links = get_all_urls(html_content, url)
				for link in links:
					try:
						handle = urllib2.urlopen(link)
						html_content = handle.read()
						#extractor = Extractor(extractor='KeepEverythingExtractor', html=html_content)		
						#text_content = extractor.getText()
						#if text_content:
						#	if detect_english(text_content):
						encoded_url = encode(link)
						f = open(path + "/" + encoded_url, "w")
						f.write(html_content)
						f.close()
					except:
						print url
						traceback.print_exc()
						return None
	except:
		print url
		traceback.print_exc()
		return None
Author: ViDA-NYU, Project: memex, Lines: 31, Source: expand_urls.py

Example 10: detag_html_file

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def detag_html_file(infile, outfile, id):
    from boilerpipe.extract import Extractor

    if not USE_BOILERPLATE:
        return detag_html_file_bs(infile, outfile, id)

    tempfile = "%s.tmp.html" % (infile,) # boilerplate seems to need an html extension
    try:
        copyfile(infile, tempfile)
        extractor = Extractor(extractor='ArticleExtractor', url="file://"+tempfile)
        os.unlink(tempfile)

        extracted_text = extractor.getText()
        extracted_html = extractor.getHTML()

        soup = BeautifulSoup(extracted_html)
        output = codecs.open(outfile, encoding='utf-8', mode='w')
        output.write(u"<DOC>\n<DOCNO>" + unicode(id) + u"</DOCNO>\n<DOCHDR>\n</DOCHDR>\n");
        head = soup.find('head')
        if head:
            title_tag = head.find('title')
            if title_tag and title_tag.string:
                output.write(u"<TITLE>" + title_tag.string.replace('\n', ' ') + u"</TITLE>\n")

        extract_para(soup, output)
        output.write(u"</DOC>\n")
        output.close()
    except Exception, exc:
        try:
            os.unlink(tempfile)
        except:
            pass

        return detag_html_file_bs(infile, outfile, id)
Author: Big-Data, Project: hunter-gatherer, Lines: 36, Source: html_to_trec.py

Example 11: download_article_file

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def download_article_file(articleURL, articleFileDirectory, code):
	articleFilePath = articleFileDirectory + code
				
	# Download the article and save as file
	if (articleURL == ""):
		print "ERROR: Empty URL detected! File not created"
		return None
	else:
		# If a directory for files doesn't exist, create it
		dir = os.path.dirname(articleFileDirectory)

		if not os.path.isdir(dir):
			#print "Created directory: " + dir
			os.makedirs(dir)
		
		try:
			#fullArticle = urllib2.urlopen(articleURL)
			#fullArticleText = fullArticle.read()

			# Use boilerpipe to remove boilerplate and formatting
			extractor = Extractor(extractor='ArticleExtractor', url=articleURL)
			fullArticleText = extractor.getText()

			# Test to see if article is in English. If not, then return None
			top_language = cld.detect(fullArticleText.encode('utf-8'))[0]
			if (top_language != 'ENGLISH'):
				print "SKIPPED: Article is in " + top_language
				return None

			outfile = open(articleFilePath, 'w+')			
			outfile.write(fullArticleText.encode('ascii', 'ignore'))
			outfile.close()

			# Use lxml's HTML cleaner to remove markup
			#htmltree = lxml.html.fromstring(fullArticleText)		
			#cleaner = lxml.html.clean.Cleaner(remove_unknown_tags=True)
			#cleaned_tree = cleaner.clean_html(htmltree)
			#return cleaned_tree.text_content()
			return fullArticleText
	

		except urllib2.HTTPError:
			print "ERROR: HTTPError. Article file download skipped: " + articleURL	
			return None

		except urllib2.URLError:
			print "ERROR: URLError. Article file download skipped: " + articleURL	
			return None

		except LookupError:
			print "ERROR: LookupError. Article file download skipped: " + articleURL	
			return None
		
		except UnicodeDecodeError:
			print "ERROR: UnicodeDecodeError. Article file download skipped: " + articleURL
			return None

		except:
	                print "ERROR: ", sys.exc_info()[0]
        	        return None
Author: Akibalogh, Project: biffle-prototype, Lines: 62, Source: parse-and-download.py

Example 12: main

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("raw_dir_path")
    parser.add_argument("out_file_path")
    args = parser.parse_args()

    f_names = [(int(f), f) for f in listdir(args.raw_dir_path)]
    f_names = sorted(f_names)
    fout = open(args.out_file_path, 'w')

    for int_f_name, f_name in f_names:
        trec_reader = TrecReader(join(args.raw_dir_path, f_name))
        empty_cnt = 0
        err_cnt = 0

        for docno, html_text in trec_reader:
            if not html_text:
                empty_cnt += 1
            try:
                extractor = Extractor(extractor='ArticleExtractor', html=html_text)
                text = extractor.getText()
                text = text.replace('\n', ' ').replace('\t', ' ')
                text = text.encode('ascii', 'ignore')
                text = text_clean(text)
                if text:
                    fout.write(docno + '\t' + text + '\n')
                else:
                    empty_cnt += 1
            except Exception as e:
                err_cnt += 1

    fout.close()
    print empty_cnt, err_cnt
Author: AdeDZY, Project: clean-gov2, Lines: 35, Source: clean_gov2.py

Example 13: parse_page

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
	def parse_page(self, response):
		if response.meta.has_key('crawldepth'):
			depth = response.meta['crawldepth']
		else:
		#       Set search depth here
			depth = 1
		log.msg('Depth = %s' % str(depth), level=log.INFO)
		if not isinstance(response, HtmlResponse):
		    log.msg('Not an HTML file: %s' % response.url, level=log.WARNING)
		    return

		log.msg('Response from: %s' % response.url, level=log.INFO)
		url_bf.add(response.url)
	
		# TODO: Extract page title
	
		extractor = Extractor(extractor='ArticleExtractor', html=response.body_as_unicode())
		cleaned_text = extractor.getText()

		# Eliminate duplicates
		keywordset = set(keywordlist)

		found_list = []
		for keyword in keywordset: # TODO: Is there a more efficient way to do this?
			# Look at word boundaries to match entire words only
			if (re.search(r'\b' + re.escape(keyword) + r'\b', cleaned_text)):
				found_list.append(keyword)

		# Parse this page		
		item = BiffleItem()
		if (len(found_list) > 0):
			item['url'] = response.url
			item['body'] = cleaned_text
			item['keywords'] = ', '.join(found_list)
			item['process_date'] = datetime.today()
			log.msg("Keyword(s) found: %s" % ', '.join(found_list), level=log.INFO)
			self.map_keyword_count(found_list)
			yield item

		if (depth > 0):	
			# Find the next requests and yield those
			hxs = HtmlXPathSelector(response)
			links = hxs.select('//a/@href').extract()
			log.msg('Links on page: %s' % len(links), level=log.INFO)
			depth -= 1
			log.msg('Depth has been decremented, new value = %s' % str(depth), level=log.INFO)
			for l in links:
				l = urlparse.urljoin(response.url, l)
				if (l in url_bf):
					pass
					#log.msg('Duplicate URL found: %s' % l, level=log.INFO)
				else:
					url_bf.add(l)
					#log.msg('Found link: %s | From URL: %s' % (l, response.url), level=log.INFO)
					# Decrement depth for next layer of links
					#callback = lambda response, depth = depth: self.parse_page(response, depth)			
					callback = lambda response: self.parse_page(response)
					request = Request(l, callback=callback)
					request.meta['crawldepth'] = depth
					yield request
Author: Akibalogh, Project: biffle-prototype, Lines: 62, Source: biffle_spider.py

Example 14: parse_item

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
 def parse_item(self, response):
     response_news = NewsItem()
     response_news['url'] = response.url
     response_news['html'] = Binary(zlib.compress(response.body, 9))
     extractor = Extractor(extractor='ArticleExtractor', html=response.body)
     response_news['content'] = extractor.getText()
     return response_news
Author: ciheul, Project: bigcrawler, Lines: 9, Source: kompas.py

Example 15: extractor

# Required import: from boilerpipe.extract import Extractor [as alias]
# Or: from boilerpipe.extract.Extractor import getText [as alias]
def extractor(URL):

    extractor = Extractor(extractor='ArticleExtractor', url=URL)

    data = extractor.getText()

    file = open("data.txt", "w")
    file.write(data.encode('UTF-8'))
    file.close()

    # Split the content into sentences
    with open('data.txt', 'r') as f:
        s = f.read()
        sentences = s.split('.')

    # Empty list of words
    w=[]

    # Split the sentences into words
    for sentence in sentences :
        w.extend(sentence.split(' '))

    print w

    # Return the list of words
    return w
Author: arnalex, Project: Python_Nltk, Lines: 28, Source: webSemantic.py


Note: The boilerpipe.extract.Extractor.getText examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other open-source code and documentation platforms. The code snippets were selected from open-source projects contributed by their respective authors; copyright of the source code remains with the original authors, and redistribution and use are governed by each project's license. Please do not reproduce without permission.