本文整理汇总了 Python 中 amcat.models.Article.set_property 方法的典型用法代码示例。如果您正苦于以下问题:Python Article.set_property 方法的具体用法?Python Article.set_property 怎么用?Python Article.set_property 使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 amcat.models.Article 的用法示例。
在下文中一共展示了Article.set_property方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: scrape_unit
# Required import: from amcat.models import Article [as alias]
# Note: set_property is a method called on an Article instance; it is not imported separately.
def scrape_unit(self, unit):
    """Build an Article from one scraped record.

    ``unit`` is indexed with the keys "datum", "url", "titel",
    "bericht tekst" and "auteur". The publisher property is made from the
    last two dot-separated labels of the URL's hostname, and an empty
    (after stripping) title is replaced by "[No title]".
    """
    published = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)

    # Keep only the last two hostname labels, e.g. "www.example.com" -> "example.com".
    host_labels = urlparse(unit["url"]).hostname.split(".")
    publisher = ".".join(host_labels[-2:])

    headline = unit["titel"].strip()
    if not headline:
        headline = "[No title]"

    article = Article(
        title=headline,
        text=unit["bericht tekst"],
        url=unit["url"],
        date=published,
    )
    article.set_property("author", unit["auteur"])
    article.set_property("publisher", publisher)
    return article
示例2: _parse_comment
# Required import: from amcat.models import Article [as alias]
# Note: set_property is a method called on an Article instance; it is not imported separately.
def _parse_comment(self, comment, base_title, base_url):
    """Convert a single comment element into an Article.

    The comment's ``id`` attribute is appended (after a ``#``) to both
    ``base_title`` and ``base_url`` to form the article's title and URL.
    Author and timestamp are parsed from the text of the comment's first
    ``footer`` element. A blank body is stored as ".".
    """
    text = html2text(comment.cssselect("p"))
    article_id = comment.get("id")

    # Same templates as before, but with the substituted names spelled out.
    title = "{base_title}#{article_id}".format(
        base_title=base_title, article_id=article_id
    )
    url = "{base_url}#{article_id}".format(
        base_url=base_url, article_id=article_id
    )

    footer_text = comment.cssselect("footer")[0].text_content()
    author, timestamp = _parse_comment_footer(footer_text)

    body = text.strip() or "."
    article = Article(date=timestamp, title=title, text=body, url=url)
    article.set_property("author", author.strip())
    article.set_property("medium", "GeenStijl Comments")
    return article
示例3: scrape_unit_meta
# Required import: from amcat.models import Article [as alias]
# Note: set_property is a method called on an Article instance; it is not imported separately.
def scrape_unit_meta(self, article_element):
    """Read an article's metadata from its search-result element.

    The element's ``outerHTML`` is re-parsed with lxml. The headline link
    supplies the title and the text URL; the byline supplies the publisher
    and the harvest/publication timestamps. Author, word count and country
    are set only when the byline provides them.

    Returns a ``NewsdeskUnit`` wrapping ``article_element`` and the Article.
    """
    article_html = article_element.get_attribute("outerHTML")
    article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)

    def get_byline_prop(prop):
        """Return the first non-empty byline value for ``prop``.

        Raises ValueError when no matching element has any text.
        """
        selector = ".article_byline__element.{}".format(prop)
        for meta_element in article_doc.cssselect(selector):
            prop_value = meta_element.text_content().strip()
            if prop_value:
                return prop_value
        # The loop never breaks, so reaching this point means nothing was found.
        raise ValueError("Article {} has no property '{}'.".format(title, prop))

    headline_link = article_doc.cssselect("a.article_headline")[0]
    text_url = headline_link.get("href")
    url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))
    title = headline_link.text_content().strip()

    publisher = get_byline_prop("source")

    # The harvest_date value holds two timestamps separated by "(gepubliceerd: ";
    # the second one ends with one extra character that is cut off.
    harvest_part, published_part = get_byline_prop("harvest_date").split("(gepubliceerd: ")
    date = dutch_strptime(harvest_part.strip(), "%d %b %Y %H:%M")
    pub_date = dutch_strptime(published_part.strip()[:-1], "%d %b %Y %H:%M")

    article = Article(url=url, title=title, date=date)
    article.set_property("publisher", publisher)
    article.set_property("text_url", text_url)

    # Crashes AmCAT API:
    #article.set_property("pubdate_date", pub_date)

    # Optional properties: (article property, byline class, converter).
    # A ValueError (missing byline value or failed conversion) skips the property.
    optional_properties = (
        ("author", "author", lambda value: value),
        ("wordcount_int", "word_count", lambda value: int(value.split()[0])),
        ("country", "source_country", lambda value: value),
    )
    for property_name, byline_class, convert in optional_properties:
        try:
            article.set_property(property_name, convert(get_byline_prop(byline_class)))
        except ValueError:
            continue

    return NewsdeskUnit(article_element, article)
示例4: scrape_unit
# Required import: from amcat.models import Article [as alias]
# Note: set_property is a method called on an Article instance; it is not imported separately.
def scrape_unit(self, date_and_article_url):
    """Fetch one article page and return it, with its comments, as an ArticleTree.

    ``date_and_article_url`` unpacks to ``(date, article_url)``; only the
    URL is used here. Returns None when the page has no
    ``#content > article`` element, when that element has no ``footer``,
    or when the ``h1`` has no text.
    """
    _listed_date, article_url = date_and_article_url
    log.info("Fetching {}".format(article_url))

    page = self.session.get_html(article_url)
    matches = page.cssselect("#content > article")
    if not matches:
        log.error("Could not find article on {article_url}".format(article_url=article_url))
        return None

    root = matches[0]
    title = root.cssselect("h1")[0].text
    # A blank body is stored as "." instead of an empty string.
    text = html2text(root.cssselect("p")).strip() or "."

    try:
        footer = root.cssselect("footer")[0]
    except IndexError as e:
        # Contains <embed> tag which is not closed gracefully :-(
        log.exception(e)
        return None

    # The author is whatever precedes the last "|" in the footer text.
    author = footer.text.rsplit("|", 1)[0].strip()
    time_element = root.cssselect("footer > time")[0]
    timestamp = parse_date(time_element.get("datetime"))

    if not title:
        return None

    comments = self._get_comments(title, article_url, page)

    article = Article(date=timestamp, title=title, text=text)
    article.set_property("author", author)
    article.set_property("url", article_url)
    article.set_property("medium", "GeenStijl")

    comment_trees = [ArticleTree(comment, []) for comment in comments]
    return ArticleTree(article, comment_trees)
示例5: scrape_unit
# Required import: from amcat.models import Article [as alias]
# Note: set_property is a method called on an Article instance; it is not imported separately.
def scrape_unit(self, article_info: ArticleTuple):
    """Scrape one article page and return it as an Article, or None to skip it.

    ``article_info`` unpacks to ``(date, page_num, url)``. The URL is first
    resolved through the session's redirect lookup; the resolved page is
    parsed for title, body text, author and section.

    Returns:
        The populated Article, or None when the redirect lookup reports a
        404, when no title can be read from ``article > h1``, or when the
        extracted body text is blank.

    Raises:
        RedirectError: re-raised when its status code is anything but 404.
    """
    date, page_num, url = article_info

    try:
        text_url = strip_query(self.session.get_redirected_url(url))
    except RedirectError as e:
        if e.status_code == 404:
            return None
        raise

    text_doc = self.session.get_html(text_url)

    # Strip image elements before the text is extracted.
    # NOTE(review): if text_doc is an lxml tree, remove() also discards the
    # removed element's tail text -- confirm that is acceptable here.
    for image in text_doc.cssselect(".image"):
        image.getparent().remove(image)

    # Rebuild the date from year/month/day only, as a datetime.
    date = datetime.datetime(date.year, date.month, date.day)

    try:
        title = text_doc.cssselect("article > h1")[0].text.strip()
    except (IndexError, AttributeError):
        # IndexError: no <h1> matched; AttributeError: the <h1> has no
        # direct text (``.text`` is None). Anything else (for example
        # KeyboardInterrupt) is no longer swallowed by a bare ``except``.
        return None

    text = html2text(text_doc.cssselect("main > article > .body"))
    if not text.strip():
        return None

    article = Article(title=title, date=date, text=text, url=url)

    if text_doc.cssselect("article > header.themed"):
        # New headers style
        author = text_doc.cssselect("article > header .author")[0].text
        section = text_doc.cssselect("article > header .title")[0].text
        article.set_property("author", author)
    else:
        # Old header style
        section = text_doc.cssselect("article > header > .title")
        section = section[0].text if section else "NOSECTION"
        author_a = text_doc.cssselect("article .author a")
        if author_a:
            author = author_a[0].text.strip()
            article.set_property("author", author)
            # When the header title merely repeats the author's name, the
            # section is recorded as "Opinie" instead.
            if author == section:
                section = "Opinie"

    download = text_doc.cssselect('form[name="download"]')
    if download:
        pdf_url = download[0].get("action")
        article.set_property("pdf_url", pdf_url)

    article.set_property("text_url", text_url)
    article.set_property("image_url", text_url + "?view=img")

    # section can be None (element without text) or empty; skip it then.
    if section:
        article.set_property("section", section.strip())

    return article