本文整理汇总了Python中superdesk.errors.ParserError.anpaParseFileError方法的典型用法代码示例。如果您正苦于以下问题:Python ParserError.anpaParseFileError方法的具体用法?Python ParserError.anpaParseFileError怎么用?Python ParserError.anpaParseFileError使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类superdesk.errors.ParserError的用法示例。
在下文中一共展示了ParserError.anpaParseFileError方法的3个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: parse
# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
def parse(self, file_path, provider=None):
    """Parse an ANPA wire file into an item dictionary.

    The file is read as raw bytes. Metadata is taken from the first two
    lines, the creation timestamp from the fourth-from-last line, and the
    story from the text enclosed by the 0x02 (STX) and 0x03 (ETX) bytes.
    Every section is optional: a regex that does not match simply leaves
    the corresponding keys out of the result.

    :param file_path: path of the file to parse.
    :param provider: not used by this implementation.
    :return: dict that always holds ``ITEM_TYPE`` and, when found,
        ``provider_sequence``, ``priority``, ``anpa_category``,
        ``word_count``, ``firstcreated``, ``body_text``, ``headline``,
        ``byline``, ``slugline`` and ``ednote``.
    :raises ParserError: built by ``ParserError.anpaParseFileError`` for any
        exception raised while reading or parsing (including a file with
        fewer than four lines or a body that is not valid UTF-8).
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT}

        with open(file_path, 'rb') as f:
            lines = f.readlines()

        # parse first header line
        m = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if m:
            item['provider_sequence'] = m.group(2).decode()

        # parse second header line
        m = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-)([a-z-]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1], flags=re.I)
        if m:
            item['priority'] = self.map_priority(m.group(1).decode())
            item['anpa_category'] = [{'qcode': m.group(2).decode()}]
            # group 10 is the trailing four-digit number of the header line
            item['word_count'] = int(m.group(10).decode())
            # a 0x12 marker in the fourth group switches the item type
            if m.group(4) == b'\x12':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED

        # parse created date at the end of file
        m = re.search(b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT', lines[-4], flags=re.I)
        if m:
            item['firstcreated'] = datetime.strptime(m.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)

        # parse anpa content
        body = b''.join(lines[2:])
        m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
        if m:
            # bytes.decode() defaults to strict UTF-8
            text = m.group(1).decode().split('\n')

            # text: only tab-indented lines belong to the story body
            body_lines = [row.strip() for row in text if row.startswith('\t')]
            item['body_text'] = '\n'.join(body_lines)

            # content metadata: lines introduced by '^'
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            if len(header_lines) > 3:
                # Splitting on '\n' leaves a trailing '\r' on CRLF files, which
                # shields the closing '<' from the strip() above; remove both.
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

            # slugline
            if len(header_lines) > 1:
                m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9]+)', header_lines[0], flags=re.I)
                if m:
                    item['slugline'] = m.group(1)

            # ednote: when several header lines match, the last one wins
            for line in header_lines:
                m = re.search("EDITOR'S NOTE _(.*)", line)
                if m:
                    item['ednote'] = m.group(1).strip()

        return item
    except Exception as ex:
        # chain explicitly so the original traceback stays attached
        raise ParserError.anpaParseFileError(file_path, ex) from ex
示例2: parse
# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
def parse(self, file_path, provider=None):
    """Parse an ANPA wire file into an item dictionary with an HTML body.

    The file is read as raw bytes. Metadata is taken from the first two
    lines, the creation timestamp from the fourth-from-last line, and the
    story from the text enclosed by the 0x02 (STX) and 0x03 (ETX) bytes.
    Every section is optional: a regex that does not match simply leaves
    the corresponding keys out of the result.

    :param file_path: path of the file to parse.
    :param provider: not used by this implementation.
    :return: dict that always holds ``ITEM_TYPE``, ``GUID_FIELD`` and
        ``FORMAT`` and, when found, ``provider_sequence``, ``priority``,
        ``anpa_category``, ``slugline``, ``anpa_take_key``, ``word_count``,
        ``firstcreated``, ``versioncreated``, ``body_html``, ``headline``
        and ``byline``, plus whatever ``self._parse_ednote`` adds.
    :raises ParserError: built by ``ParserError.anpaParseFileError`` for any
        exception raised while reading or parsing (including a file with
        fewer than four lines).
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT, GUID_FIELD: generate_guid(type=GUID_TAG), FORMAT: FORMATS.HTML}

        with open(file_path, 'rb') as f:
            lines = f.readlines()

        # parse first header line
        m = re.match(b'\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)', lines[0], flags=re.I)
        if m:
            item['provider_sequence'] = m.group(2).decode()

        # parse second header line
        m = re.match(
            b'([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) '
            b'([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})',
            lines[1], flags=re.I)
        if m:
            item['priority'] = self.map_priority(m.group(1).decode())
            item['anpa_category'] = [{'qcode': m.group(2).decode()}]
            # provisional slugline; replaced below if the body headers carry one
            item['slugline'] = m.group(6).decode('latin-1', 'replace')
            item['anpa_take_key'] = m.group(7).decode('latin-1', 'replace').strip()
            # group 10 is the trailing four-digit number of the header line
            item['word_count'] = int(m.group(10).decode())
            # a 0x12 marker in the fourth group selects the preserved format
            if m.group(4) == b'\x12':
                item[FORMAT] = FORMATS.PRESERVED

        # parse created date at the end of file
        m = re.search(b'\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT', lines[-4], flags=re.I)
        if m:
            item['firstcreated'] = datetime.strptime(m.group(3).decode(), '%m-%d-%y %H%M').replace(tzinfo=utc)
            item['versioncreated'] = item['firstcreated']

        # parse anpa content
        body = b''.join(lines[2:])
        m = re.match(b'\x02(.*)\x03', body, flags=re.M | re.S)
        if m:
            text = m.group(1).decode('latin-1', 'replace').split('\n')

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters e.g. TLI (Tab Line Indicator) Hex x08 and
                # TTS Space Band Hex x10. These will be replaced, there will likely be others
                body_lines = [
                    row.strip('^').replace('\b', '%08').replace('\x10', '%10')
                    for row in text
                    if row.startswith(('\t', '^', '\b'))
                ]
                item['body_html'] = '<pre>' + '\n'.join(body_lines) + '</pre>'
            else:
                # only tab-indented lines belong to the story body
                body_lines = [row.strip() for row in text if row.startswith('\t')]
                item['body_html'] = '<p>' + '</p><p>'.join(body_lines) + '</p>'

            # content metadata: lines introduced by '^'
            header_lines = [row.strip('^<= ') for row in text if row.startswith('^')]
            # rstrip again: a trailing '\r' shields the closing '<' from strip() above
            if len(header_lines) > 1:
                item['headline'] = header_lines[1].rstrip('\r\n^<= ')
            if len(header_lines) > 3:
                item['byline'] = header_lines[-2].rstrip('\r\n^<= ')

            # if there is no body use header lines
            if len(body_lines) == 1 and not body_lines[0]:
                item['body_html'] = '<p>' + '</p><p>'.join(header_lines[2:]) + '</p>'

            # slugline
            if len(header_lines) > 1:
                m = re.match('[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)', header_lines[0], flags=re.I)
                if m:
                    item['slugline'] = m.group(1)

            # ednote (handled by a helper defined outside this block)
            self._parse_ednote(header_lines, item)

        return item
    except Exception as ex:
        # chain explicitly so the original traceback stays attached
        raise ParserError.anpaParseFileError(file_path, ex) from ex
示例3: parse
# 需要导入模块: from superdesk.errors import ParserError [as 别名]
# 或者: from superdesk.errors.ParserError import anpaParseFileError [as 别名]
def parse(self, file_path, provider=None):
    """Parse an ANPA wire file into an item dictionary with an HTML body.

    The file is read as raw bytes. Metadata is taken from the first two
    lines, the creation timestamp from the fourth-from-last line, and the
    story from the text enclosed by the 0x02 (STX) and 0x03 (ETX) bytes.
    Every section is optional: a regex that does not match simply leaves
    the corresponding keys out of the result.

    :param file_path: path of the file to parse.
    :param provider: not used by this implementation.
    :return: dict that always holds ``ITEM_TYPE``, ``GUID_FIELD`` and
        ``FORMAT`` and, when found, ``provider_sequence``, ``priority``,
        ``anpa_category``, ``slugline``, ``anpa_take_key``, ``word_count``,
        ``firstcreated``, ``versioncreated``, ``body_html``, ``headline``,
        ``byline`` and ``ednote``.
    :raises ParserError: built by ``ParserError.anpaParseFileError`` for any
        exception raised while reading or parsing (including a file with
        fewer than four lines).
    """
    try:
        item = {ITEM_TYPE: CONTENT_TYPE.TEXT, GUID_FIELD: generate_guid(type=GUID_TAG), FORMAT: FORMATS.HTML}

        with open(file_path, "rb") as f:
            lines = f.readlines()

        # parse first header line
        m = re.match(b"\x16\x16\x01([a-z])([0-9]{4})\x1f([a-z-]+)", lines[0], flags=re.I)
        if m:
            item["provider_sequence"] = m.group(2).decode()

        # parse second header line
        m = re.match(
            b"([a-z]) ([a-z])(\x13|\x14)(\x11|\x12) (am-|pm-|bc-|ap-)([a-z-.]+)(.*) "
            b"([0-9]{1,2})-([0-9]{1,2}) ([0-9]{4})",
            lines[1],
            flags=re.I,
        )
        if m:
            item["priority"] = self.map_priority(m.group(1).decode())
            item["anpa_category"] = [{"qcode": m.group(2).decode()}]
            # provisional slugline; replaced below if the body headers carry one
            item["slugline"] = m.group(6).decode("latin-1", "replace")
            item["anpa_take_key"] = m.group(7).decode("latin-1", "replace").strip()
            # group 10 is the trailing four-digit number of the header line
            item["word_count"] = int(m.group(10).decode())
            # a 0x12 marker in the fourth group selects the preserved format
            if m.group(4) == b"\x12":
                item[FORMAT] = FORMATS.PRESERVED

        # parse created date at the end of file
        m = re.search(b"\x03([a-z]+)-([a-z]+)-([0-9]+-[0-9]+-[0-9]+ [0-9]{2}[0-9]{2})GMT", lines[-4], flags=re.I)
        if m:
            item["firstcreated"] = datetime.strptime(m.group(3).decode(), "%m-%d-%y %H%M").replace(tzinfo=utc)
            item["versioncreated"] = item["firstcreated"]

        # parse anpa content
        body = b"".join(lines[2:])
        m = re.match(b"\x02(.*)\x03", body, flags=re.M | re.S)
        if m:
            text = m.group(1).decode("latin-1", "replace").split("\n")

            if item.get(FORMAT) == FORMATS.PRESERVED:
                # ANPA defines a number of special characters e.g. TLI (Tab Line Indicator) Hex x08 and
                # TTS Space Band Hex x10. These will be replaced, there will likely be others
                body_lines = [
                    row.strip("^").replace("\b", "%08").replace("\x10", "%10")
                    for row in text
                    if row.startswith(("\t", "^", "\b"))
                ]
                item["body_html"] = "<pre>" + "\n".join(body_lines) + "</pre>"
            else:
                # only tab-indented lines belong to the story body
                body_lines = [row.strip() for row in text if row.startswith("\t")]
                item["body_html"] = "<p>" + "</p><p>".join(body_lines) + "</p>"

            # content metadata: lines introduced by '^'
            header_lines = [row.strip("^<= ") for row in text if row.startswith("^")]
            # rstrip again: a trailing '\r' shields the closing '<' from strip() above
            if len(header_lines) > 1:
                item["headline"] = header_lines[1].rstrip("\r\n^<= ")
            if len(header_lines) > 3:
                item["byline"] = header_lines[-2].rstrip("\r\n^<= ")

            # if there is no body use header lines
            if len(body_lines) == 1 and not body_lines[0]:
                item["body_html"] = "<p>" + "</p><p>".join(header_lines[2:]) + "</p>"

            # slugline
            if len(header_lines) > 1:
                m = re.match("[A-Z]{2}-[A-Z]{2}--([a-z-0-9.]+)", header_lines[0], flags=re.I)
                if m:
                    item["slugline"] = m.group(1)

            # ednote: when several header lines match, the last one wins
            for line in header_lines:
                m = re.search("EDITOR'S NOTE _(.*)", line)
                if m:
                    item["ednote"] = m.group(1).strip()

        return item
    except Exception as ex:
        # chain explicitly so the original traceback stays attached
        raise ParserError.anpaParseFileError(file_path, ex) from ex