当前位置: 首页>>代码示例>>Python>>正文


Python ParserError.anpaParseFileError方法代码示例

本文整理汇总了Python中superdesk.errors.ParserError.anpaParseFileError方法的典型用法代码示例。如果您正苦于以下问题:Python ParserError.anpaParseFileError方法的具体用法?Python ParserError.anpaParseFileError怎么用?Python ParserError.anpaParseFileError使用的例子?那么恭喜您, 这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在superdesk.errors.ParserError的用法示例。


在下文中一共展示了ParserError.anpaParseFileError方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: parse

# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
    def parse(self, file_path, provider=None):
        """Read an ANPA file and build an item dict from it.

        The file is read as binary lines. The first two lines are matched as
        headers, the fourth line from the end is searched for the creation
        timestamp, and everything from the third line on is searched for the
        ``\\x02 ... \\x03`` delimited content.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict with the parsed item fields; a field is only set when
            its pattern matched
        :raises: the error returned by ``ParserError.anpaParseFileError`` for
            any exception raised while reading or parsing the file
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT}

            with open(file_path, 'rb') as anpa_file:
                lines = list(anpa_file)

            # first header line -> provider sequence (second regex group)
            first_header = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if first_header:
                item['provider_sequence'] = first_header.group(2).decode()

            # second header line -> priority, category, word count and type
            second_header = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1], flags=re.I)
            if second_header:
                priority_code = second_header.group(1).decode()
                item['priority'] = self.map_priority(priority_code)
                category_code = second_header.group(2).decode()
                item['anpa_category'] = [{'qcode': category_code}]
                item['word_count'] = int(second_header.group(10).decode())
                # a \x12 marker in the fourth group switches the item type
                if second_header.group(4) == b'\x12':
                    item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

            # creation timestamp is looked up in the fourth line from the end
            trailer = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4],
                flags=re.I)
            if trailer:
                stamp = trailer.group(3).decode()
                item['firstcreated'] = datetime.strptime(stamp, '%m-%d-%y %H%M').replace(tzinfo=utc)

            # ANPA content sits between \x02 and \x03 after the two header lines
            payload = b''.join(lines[2:])
            content = re.match(b'\x02(.*)\x03', payload, flags=re.M | re.S)
            if content is None:
                # nothing more to extract without a content section
                return item

            text = content.group(1).decode().split('\n')

            # body text: the tab-prefixed lines, stripped and re-joined
            body_lines = [row.strip() for row in text if row.startswith('\t')]
            item['body_text'] = '\n'.join(body_lines)

            # content metadata: the caret-prefixed lines
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            header_count = len(header_lines)
            if header_count > 3:
                item['headline'] = header_lines[1]
                item['byline'] = header_lines[-2]

            # slugline comes from the first metadata line
            if header_count > 1:
                slug = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
                if slug:
                    item['slugline'] = slug.group(1)

            # ednote: when several lines match, the last one wins
            for header in header_lines:
                note = re.search("EDITOR'S NOTE _(.*)", header)
                if note:
                    item['ednote'] = note.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
开发者ID:ahilles107,项目名称:superdesk-core,代码行数:62,代码来源:anpa.py

示例2: parse

# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
    def parse(self, file_path, provider=None):
        """Read an ANPA file and build an item dict with an HTML body.

        The file is read as binary lines. The first two lines are matched as
        headers, the fourth line from the end is searched for the creation
        timestamp, and everything from the third line on is searched for the
        ``\\x02 ... \\x03`` delimited content, which is decoded as latin-1.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict with the parsed item fields; a field is only set when
            its pattern matched
        :raises: the error returned by ``ParserError.anpaParseFileError`` for
            any exception raised while reading or parsing the file
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, GUID_FIELD: generate_guid(type=GUID_TAG), FORMAT: FORMATS.HTML}

            with open(file_path, 'rb') as anpa_file:
                lines = list(anpa_file)

            # first header line -> provider sequence (second regex group)
            first_header = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
            if first_header:
                item['provider_sequence'] = first_header.group(2).decode()

            # second header line -> priority, category, slugline, take key,
            # word count and body format
            second_header = re.match(
                b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
                b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
                lines[1], flags=re.I)
            if second_header:
                priority_code = second_header.group(1).decode()
                item['priority'] = self.map_priority(priority_code)
                category_code = second_header.group(2).decode()
                item['anpa_category'] = [{'qcode': category_code}]
                item['slugline'] = second_header.group(6).decode('latin-1', 'replace')
                take_key = second_header.group(7).decode('latin-1', 'replace')
                item['anpa_take_key'] = take_key.strip()
                item['word_count'] = int(second_header.group(10).decode())
                # a \x12 marker in the fourth group selects the preserved format
                if second_header.group(4) == b'\x12':
                    item[FORMAT] = FORMATS.PRESERVED

            # creation timestamp is looked up in the fourth line from the end
            trailer = re.search(
                b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT',
                lines[-4],
                flags=re.I)
            if trailer:
                stamp = trailer.group(3).decode()
                created = datetime.strptime(stamp, '%m-%d-%y %H%M').replace(tzinfo=utc)
                item['firstcreated'] = created
                item['versioncreated'] = item['firstcreated']

            # ANPA content sits between \x02 and \x03 after the two header lines
            payload = b''.join(lines[2:])
            content = re.match(b'\x02(.*)\x03', payload, flags=re.M | re.S)
            if content is None:
                # nothing more to extract without a content section
                return item

            text = content.group(1).decode('latin-1', 'replace').split('\n')

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters e.g. TLI (Tab Line Indicator) Hex x08 and
                # TTS Space Band Hex x10. These are replaced here; there will likely be others.
                body_lines = []
                for row in text:
                    if row.startswith(('\t', '^', '\b')):
                        body_lines.append(row.strip('^').replace('\b', '%08').replace('\x10', '%10'))
                item['body_html'] = '<pre>' + '\n'.join(body_lines) + '</pre>'
            else:
                # one paragraph per tab-prefixed line
                body_lines = [row.strip() for row in text if row.startswith('\t')]
                item['body_html'] = '<p>' + '</p><p>'.join(body_lines) + '</p>'

            # content metadata: the caret-prefixed lines
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            header_count = len(header_lines)
            if header_count > 1:
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
            if header_count > 3:
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

                # if there is no body use header lines
                if len(body_lines) == 1 and not body_lines[0]:
                    item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

            # slugline from the first metadata line overrides the header one
            if header_count > 1:
                slug = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if slug:
                    item['slugline'] = slug.group(1)

            # ednote
            self._parse_ednote(header_lines, item)

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
开发者ID:jerome-poisson,项目名称:superdesk-core,代码行数:73,代码来源:anpa.py

示例3: parse

# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
    def parse(self, file_path, provider=None):
        """Read an ANPA file and build an item dict with an HTML body.

        The file is read as binary lines. The first two lines are matched as
        headers, the fourth line from the end is searched for the creation
        timestamp, and everything from the third line on is searched for the
        ``\\x02 ... \\x03`` delimited content, which is decoded as latin-1.

        :param file_path: path of the file to parse
        :param provider: not used by this method
        :return: dict with the parsed item fields; a field is only set when
            its pattern matched
        :raises: the error returned by ``ParserError.anpaParseFileError`` for
            any exception raised while reading or parsing the file
        """
        try:
            item = {ITEM_TYPE: CONTENT_TYPE.TEXT, GUID_FIELD: generate_guid(type=GUID_TAG), FORMAT: FORMATS.HTML}

            with open(file_path, "rb") as anpa_file:
                lines = list(anpa_file)

            # first header line -> provider sequence (second regex group)
            first_header = re.match(b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
            if first_header:
                item["provider_sequence"] = first_header.group(2).decode()

            # second header line -> priority, category, slugline, take key,
            # word count and body format
            second_header = re.match(
                b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) "
                b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})",
                lines[1],
                flags=re.I,
            )
            if second_header:
                priority_code = second_header.group(1).decode()
                item["priority"] = self.map_priority(priority_code)
                category_code = second_header.group(2).decode()
                item["anpa_category"] = [{"qcode": category_code}]
                item["slugline"] = second_header.group(6).decode("latin-1", "replace")
                take_key = second_header.group(7).decode("latin-1", "replace")
                item["anpa_take_key"] = take_key.strip()
                item["word_count"] = int(second_header.group(10).decode())
                # a \x12 marker in the fourth group selects the preserved format
                if second_header.group(4) == b"\x12":
                    item[FORMAT] = FORMATS.PRESERVED

            # creation timestamp is looked up in the fourth line from the end
            trailer = re.search(
                b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT",
                lines[-4],
                flags=re.I,
            )
            if trailer:
                stamp = trailer.group(3).decode()
                created = datetime.strptime(stamp, "%m-%d-%y %H%M").replace(tzinfo=utc)
                item["firstcreated"] = created
                item["versioncreated"] = item["firstcreated"]

            # ANPA content sits between \x02 and \x03 after the two header lines
            payload = b"".join(lines[2:])
            content = re.match(b"\x02(.*)\x03", payload, flags=re.M | re.S)
            if content is None:
                # nothing more to extract without a content section
                return item

            text = content.group(1).decode("latin-1", "replace").split("\n")

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters e.g. TLI (Tab Line Indicator) Hex x08 and
                # TTS Space Band Hex x10. These are replaced here; there will likely be others.
                kept_rows = (row for row in text if row.startswith(("\t", "^", "\b")))
                body_lines = [row.strip("^").replace("\b", "%08").replace("\x10", "%10") for row in kept_rows]
                item["body_html"] = "<pre>" + "\n".join(body_lines) + "</pre>"
            else:
                # one paragraph per tab-prefixed line
                body_lines = [row.strip() for row in text if row.startswith("\t")]
                item["body_html"] = "<p>" + "</p><p>".join(body_lines) + "</p>"

            # content metadata: the caret-prefixed lines
            header_lines = [row.strip("^<= ") for row in text if row.startswith("^")]
            header_count = len(header_lines)
            if header_count > 1:
                item["headline"] = header_lines[1].rstrip("\r\n^<= ")
            if header_count > 3:
                item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

                # if there is no body use header lines
                if len(body_lines) == 1 and not body_lines[0]:
                    item["body_html"] = "<p>" + "</p><p>".join(header_lines[2:]) + "</p>"

            # slugline from the first metadata line overrides the header one
            if header_count > 1:
                slug = re.match("[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
                if slug:
                    item["slugline"] = slug.group(1)

            # ednote: when several lines match, the last one wins
            for header in header_lines:
                note = re.search("EDITOR'S NOTE _(.*)", header)
                if note:
                    item["ednote"] = note.group(1).strip()

            return item
        except Exception as ex:
            raise ParserError.anpaParseFileError(file_path, ex)
开发者ID:hlmnrmr,项目名称:superdesk-core,代码行数:81,代码来源:anpa.py


注:本文中的superdesk.errors.ParserError.anpaParseFileError方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。