

Python Article.set_property method code examples

This article collects typical usage examples of the Python method amcat.models.Article.set_property, drawn from open-source projects. If you are wondering what Article.set_property does or how to call it, the examples below show the method in real code; for broader context, see the other usage examples of the amcat.models.Article class.


Five code examples of the Article.set_property method are shown below, ordered by popularity by default.
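
Before diving into the examples, here is a minimal sketch of the pattern they all share: core fields (title, text, url, date) go into the Article constructor, and any additional metadata is attached afterwards with set_property. The field values below are invented purely for illustration.

from datetime import datetime
from amcat.models import Article

article = Article(
    title="Example headline",
    text="Example body text.",
    url="http://www.example.com/article",
    date=datetime(2017, 1, 1),
)
article.set_property("author", "Jane Doe")        # free-form string property
article.set_property("publisher", "example.com")  # another custom property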

Example 1: scrape_unit

# Required module: from amcat.models import Article
# Or: from amcat.models.Article import set_property
    def scrape_unit(self, unit):
        date = iso8601.iso8601.parse_date(unit["datum"], default_timezone=None)
        hostname = urlparse(unit["url"]).hostname
        publisher = ".".join(hostname.split(".")[-2:])
        title = unit["titel"].strip() or "[No title]"
        article = Article(title=title, text=unit["bericht tekst"], url=unit["url"], date=date)
        article.set_property("author", unit["auteur"])
        article.set_property("publisher", publisher)
        return article
Developer: amcat, Project: amcat-scraping, Lines of code: 11, Source file: coosto.py
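
The unit argument here is a dictionary with Dutch keys, presumably produced by the surrounding Coosto scraper. A hypothetical input illustrating the expected shape (all values invented, and `scraper` stands in for an instance of the scraper class above):

unit = {
    "datum": "2017-05-01T12:00:00",             # ISO 8601 timestamp, parsed by iso8601
    "url": "http://www.voorbeeld.nl/bericht/1",  # hostname is reduced to "voorbeeld.nl" as publisher
    "titel": "Voorbeeldtitel",
    "bericht tekst": "De tekst van het bericht.",
    "auteur": "J. Jansen",
}
article = scraper.scrape_unit(unit)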

Example 2: _parse_comment

# Required module: from amcat.models import Article
# Or: from amcat.models.Article import set_property
    def _parse_comment(self, comment, base_title, base_url):
        text = html2text(comment.cssselect("p"))
        article_id = comment.get("id")
        title = "{base_title}#{article_id}".format(**locals())
        url = "{base_url}#{article_id}".format(**locals())
        author, timestamp = _parse_comment_footer(comment.cssselect("footer")[0].text_content())

        article = Article(date=timestamp, title=title, text=text.strip() or ".", url=url)
        article.set_property("author", author.strip())
        article.set_property("medium", "GeenStijl Comments")
        return article
Developer: amcat, Project: amcat-scraping, Lines of code: 13, Source file: geenstijl.py

Example 3: scrape_unit_meta

# Required module: from amcat.models import Article
# Or: from amcat.models.Article import set_property
    def scrape_unit_meta(self, article_element):
        article_html = article_element.get_attribute("outerHTML")
        article_doc = lxml.html.fromstring(article_html, base_url=SEARCH_URL)

        def get_byline_prop(prop):
            for meta_element in article_doc.cssselect(".article_byline__element.{}".format(prop)):
                prop_value = meta_element.text_content().strip()
                if prop_value:
                    return prop_value
            else:
                raise ValueError("Article {} has no property '{}'.".format(title, prop))

        text_url = article_doc.cssselect("a.article_headline")[0].get("href")
        url = "newsdesk://{}".format(get_newsdesk_article_id(text_url))

        title = article_doc.cssselect("a.article_headline")[0].text_content().strip()
        publisher = get_byline_prop("source")

        date = get_byline_prop("harvest_date")
        date, pub_date = date.split("(gepubliceerd: ")
        date = dutch_strptime(date.strip(), "%d %b %Y %H:%M")
        pub_date = dutch_strptime(pub_date.strip()[:-1], "%d %b %Y %H:%M")

        article = Article(url=url, title=title, date=date)
        article.set_property("publisher", publisher)
        article.set_property("text_url", text_url)

        # Crashes AmCAT API:
        #article.set_property("pubdate_date", pub_date)

        try:
            article.set_property("author", get_byline_prop("author"))
        except ValueError:
            pass

        try:
            article.set_property("wordcount_int", int(get_byline_prop("word_count").split()[0]))
        except ValueError:
            pass

        try:
            article.set_property("country", get_byline_prop("source_country"))
        except ValueError:
            pass

        return NewsdeskUnit(article_element, article)
Developer: amcat, Project: amcat-scraping, Lines of code: 48, Source file: newsdesk.py
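
Note the for ... else construct inside get_byline_prop: the else branch runs only if the loop completes without hitting the return, i.e. when no matching element contains a non-empty value. A standalone illustration of the same control flow, with made-up names and data:

def first_nonempty(values):
    for value in values:
        if value.strip():
            return value.strip()
    else:
        # Reached only when the loop finished without returning.
        raise ValueError("no non-empty value found")

first_nonempty(["", "  ", "NRC Handelsblad"])   # -> "NRC Handelsblad"
# first_nonempty(["", "  "]) would raise ValueError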

Example 4: scrape_unit

# Required module: from amcat.models import Article
# Or: from amcat.models.Article import set_property
    def scrape_unit(self, date_and_article_url):
        date, article_url = date_and_article_url
        log.info("Fetching {}".format(article_url))
        article_doc = self.session.get_html(article_url)

        article_el = article_doc.cssselect("#content > article")

        if not article_el:
            log.error("Could not find article on {article_url}".format(**locals()))
            return None

        title = article_el[0].cssselect("h1")[0].text
        text = html2text(article_el[0].cssselect("p"))
        text = text.strip() or "."

        try:
            footer = article_el[0].cssselect("footer")[0]
        except IndexError as e:
            # Contains <embed> tag which is not closed gracefully :-(
            log.exception(e)
            return None

        author = footer.text.rsplit("|", 1)[0].strip()
        timestamp = parse_date(article_el[0].cssselect("footer > time")[0].get("datetime"))
        if not title:
            return None

        children = self._get_comments(title, article_url, article_doc)

        article = Article(date=timestamp, title=title, text=text)
        article.set_property("author", author)
        article.set_property("url", article_url)
        article.set_property("medium", "GeenStijl")

        return ArticleTree(article, [ArticleTree(c, []) for c in children])
Developer: amcat, Project: amcat-scraping, Lines of code: 37, Source file: geenstijl.py
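
Returning an ArticleTree instead of a bare Article lets this scraper keep the parent post and its comments together as one hierarchical unit: the comments gathered by _get_comments (presumably built with the _parse_comment method from example 2, which lives in the same geenstijl.py) become leaf nodes with no children of their own.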

Example 5: scrape_unit

# Required module: from amcat.models import Article
# Or: from amcat.models.Article import set_property
    def scrape_unit(self, article_info: ArticleTuple):
        date, page_num, url = article_info

        try:
            text_url = strip_query(self.session.get_redirected_url(url))
        except RedirectError as e:
            if e.status_code == 404:
                return None
            raise

        text_doc = self.session.get_html(text_url)

        for image in text_doc.cssselect(".image"):
            image.getparent().remove(image)

        date = datetime.datetime(date.year, date.month, date.day)
        try:
            title = text_doc.cssselect("article > h1")[0].text.strip()
        except (IndexError, AttributeError):
            # No <h1> headline found, or it has no text node
            return None

        text = html2text(text_doc.cssselect("main > article > .body"))
        if not text.strip():
            return None

        article = Article(title=title, date=date, text=text, url=url)

        if text_doc.cssselect("article > header.themed"):
            # New headers style
            author = text_doc.cssselect("article > header .author")[0].text
            section = text_doc.cssselect("article > header .title")[0].text
            article.set_property("author", author)
        else:
            # Old header style
            section = text_doc.cssselect("article > header > .title")
            section = section[0].text if section else "NOSECTION"
            author_a = text_doc.cssselect("article .author a")
            if author_a:
                author = author_a[0].text.strip()
                article.set_property("author", author)
                if author == section:
                    section = "Opinie"

        download = text_doc.cssselect('form[name="download"]')
        if download:
            pdf_url = download[0].get("action")
            article.set_property("pdf_url", pdf_url)

        article.set_property("text_url", text_url)
        article.set_property("image_url", text_url + "?view=img")

        if section:
            article.set_property("section", section.strip())

        return article
Developer: amcat, Project: amcat-scraping, Lines of code: 57, Source file: fd.py
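
A pattern worth noting across these examples is the suffix on some property names: wordcount_int, pubdate_date, text_url, pdf_url, image_url. Judging from this usage, the suffix appears to tell AmCAT how to type the property (integer, date, URL), while unsuffixed names such as "author" or "publisher" are stored as plain text. A sketch of that apparent convention, with invented values:

article.set_property("wordcount_int", 523)                          # integer-typed property
article.set_property("text_url", "http://www.example.com/article")  # URL-typed property
article.set_property("author", "Jane Doe")                          # plain text property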


Note: The amcat.models.Article.set_property examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are taken from open-source projects and copyright remains with their original authors; consult each project's license before distributing or reusing the code, and do not republish without permission.