This article collects typical usage examples of the Python class nltk.corpus.reader.xmldocs.XMLCorpusView. If you are wondering what the XMLCorpusView class does, how to use it, or what real code that uses it looks like, the hand-picked examples below should help.
The following section presents 10 code examples of the XMLCorpusView class, listed by popularity.
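Before the examples, here is a minimal, self-contained sketch of how XMLCorpusView itself is typically constructed and iterated. The XML content, file path, and tag names are made up for illustration; the only facts relied on are the constructor signature XMLCorpusView(fileid, tagspec, elt_handler=None) and that, by default, each matching element is yielded as an ElementTree element.

import tempfile

from nltk.corpus.reader.xmldocs import XMLCorpusView

# hypothetical toy document; any well-formed XML file works the same way
xml = b"""<?xml version="1.0" encoding="utf-8"?>
<library>
  <book genre="fiction"><title>A</title></book>
  <book genre="poetry"><title>B</title></book>
</library>
"""

with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as f:
    f.write(xml)
    path = f.name

# tagspec is a regex matched against each element's path from the root,
# e.g. "library/book" here, so '.*/book' selects every <book> element
view = XMLCorpusView(path, '.*/book')
for elt in view:  # by default each item is an ElementTree element
    print(elt.get('genre'), elt.find('title').text)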
Example 1: __init__
def __init__(self, filename, **kwargs):
    self.tags = kwargs.pop('tags', None)
    self.tagspec = '.*/seg/fs'
    self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml')
    XMLCorpusView.__init__(
        self, self.xml_tool.build_preprocessed_file(), self.tagspec
    )
Example 2: __init__
def __init__(self, fileid, sent, tag, strip_space, stem):
    """
    :param fileid: The name of the underlying file.
    :param sent: If true, include sentence bracketing.
    :param tag: The name of the tagset to use, or None for no tags.
    :param strip_space: If true, strip spaces from word tokens.
    :param stem: If true, then substitute stems for words.
    """
    if sent:
        tagspec = '.*/s'
    else:
        tagspec = '.*/s/(.*/)?(c|w)'
    self._sent = sent
    self._tag = tag
    self._strip_space = strip_space
    self._stem = stem

    self.title = None    #: Title of the document.
    self.author = None   #: Author of the document.
    self.editor = None   #: Editor
    self.resps = None    #: Statement of responsibility

    XMLCorpusView.__init__(self, fileid, tagspec)

    # Read in a tasty header.
    self._open()
    self.read_block(self._stream, '.*/teiHeader$', self.handle_header)
    self.close()

    # Reset tag context.
    self._tag_context = {0: ()}
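As a quick illustration of the effect of the two tagspecs above, the hypothetical BNC-style fragment below is viewed once at sentence level and once at token level; the file content and path are assumptions made only for this sketch.

import tempfile

from nltk.corpus.reader.xmldocs import XMLCorpusView

xml = b"""<?xml version="1.0" encoding="utf-8"?>
<bncDoc>
  <stext>
    <s n="1"><w c5="PNP">We </w><w c5="VVB">agree</w><c c5="PUN">.</c></s>
  </stext>
</bncDoc>
"""

with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as f:
    f.write(xml)
    path = f.name

sentences = XMLCorpusView(path, '.*/s')             # one element per <s>
tokens = XMLCorpusView(path, '.*/s/(.*/)?(c|w)')    # one element per <w> or <c>
print(len(list(sentences)), len(list(tokens)))      # prints: 1 3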
Example 3: __init__
def __init__(self, filename, **kwargs):
    self.mode = kwargs.pop('mode', 0)
    self.tagspec = '.*/div/ab'
    self.segm_dict = dict()
    # XML preprocessing
    self.xml_tool = XML_Tool(filename, 'text.xml')
    # base class init
    XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec)
Example 4: read_block
def read_block(self, stream, tagspec=None, elt_handler=None):
    return list(
        filter(
            lambda x: x is not None,
            XMLCorpusView.read_block(self, stream, tagspec, elt_handler),
        )
    )
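Example 4 is normally paired with an element handler that returns None for elements it wants to discard; the base class's read_block then contains None entries, and the override above filters them out. A hedged sketch of that pairing, with a hypothetical subclass and handler:

from nltk.corpus.reader.xmldocs import XMLCorpusView

class NonEmptyTextView(XMLCorpusView):
    """Hypothetical view that yields element text and drops empty elements."""

    def handle_elt(self, elt, context):
        text = (elt.text or '').strip()
        return text if text else None  # None marks an element to be dropped

    def read_block(self, stream, tagspec=None, elt_handler=None):
        # same idea as Example 4: strip the None results
        return list(
            filter(
                lambda x: x is not None,
                XMLCorpusView.read_block(self, stream, tagspec, elt_handler),
            )
        )

# usage (placeholder path and tagspec): NonEmptyTextView('corpus.xml', '.*/seg')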
Example 5: __init__
def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag):
    """
    :param fileid: The name of the underlying file.
    :param unit: One of `'token'`, `'word'`, or `'chunk'`.
    :param bracket_sent: If true, include sentence bracketing.
    :param pos_tag: Whether to include part-of-speech tags.
    :param sem_tag: Whether to include semantic tags, namely WordNet lemma
        and OOV named entity status.
    """
    if bracket_sent:
        tagspec = '.*/s'
    else:
        tagspec = '.*/s/(punc|wf)'
    self._unit = unit
    self._sent = bracket_sent
    self._pos_tag = pos_tag
    self._sem_tag = sem_tag
    XMLCorpusView.__init__(self, fileid, tagspec)
Example 6: handle_query
def handle_query(self):
    self._open()
    header = []
    while True:
        segm = XMLCorpusView.read_block(self, self._stream)
        if len(segm) == 0:
            break
        header.extend(segm)
    self.close()
    return header
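The loop above works because XMLCorpusView.read_block returns an empty list once the underlying stream is exhausted. The same pattern applied directly to a view looks like the sketch below (the path and tagspec are placeholders, and _open/_stream/close are the protected StreamBackedCorpusView helpers the example itself relies on); in most cases simply calling list(view) gives the same result without the manual loop.

from nltk.corpus.reader.xmldocs import XMLCorpusView

view = XMLCorpusView('ann_header.xml', '.*/fileDesc')  # placeholder path and tagspec

view._open()
elements = []
while True:
    block = view.read_block(view._stream)
    if len(block) == 0:  # read_block returns [] at end of file
        break
    elements.extend(block)
view.close()

# equivalent without the manual loop:
# elements = list(view)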
Example 7: _detect_encoding
def _detect_encoding(self, fileid):
    if isinstance(fileid, PathPointer):
        s = fileid.open().readline()
    else:
        s = open(fileid, 'rb').readline()
    if isinstance(s, bytes):
        # the first line is bytes when read in binary mode; decode before matching
        s = s.decode('ascii', 'ignore')
    m = re.search(r'encoding="([^"]+)"', s)
    if m:
        return m.group(1)
    m = re.search(r"encoding='([^']+)'", s)
    if m:
        return m.group(1)
    return XMLCorpusView._detect_encoding(self, fileid)
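The two regular expressions above only inspect the XML declaration at the top of the file. A tiny standalone illustration of what they extract (the sample declaration strings are made up):

import re

for line in ('<?xml version="1.0" encoding="UTF-8"?>',
             "<?xml version='1.0' encoding='iso-8859-2'?>"):
    m = re.search(r'encoding="([^"]+)"', line) or re.search(r"encoding='([^']+)'", line)
    print(m.group(1) if m else None)  # prints UTF-8, then iso-8859-2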
Example 8: read_block
def read_block(self, stream, tagspec=None, elt_handler=None):
    """
    Returns text as a list of sentences.
    """
    txt = []
    while True:
        segm = XMLCorpusView.read_block(self, stream)
        if len(segm) == 0:
            break
        txt.extend(segm)
    return [' '.join(txt)]
Example 9: __init__
def __init__(self, fileid, tagspec, elt_handler=None):
    XMLCorpusView.__init__(self, fileid, tagspec, elt_handler)
Example 10: zip
np.savetxt("lsa_model.csv", matrix, delimiter="\t") # raw output
doc_2d = []
for doc, file in zip(matrix, filenames): # reduce the data to 2 dimensions
# print(file, "\n", doc, "\n\n") # debug msg
doc_2d.append(TSNE().fit_transform(doc).tolist()[0])
matrix = np.asarray(doc_2d) # update matrix array
# raw output
np.savetxt("lsa_reduced.csv", matrix, delimiter="\t") # raw output
# build list of tags from the metadata
metadata = pd.DataFrame(index=filenames, columns=["Tags"])
view = XMLCorpusView("txt/export-abstracts.xml", ".*/article")
iter = view.iterate_from(0)
for entry in iter:
metadata.loc[entry.attrib["{http://www.w3.org/XML/1998/namespace}id"] + ".txt", "Tags"] = entry.attrib["type"]
metadata.to_csv("lsa_metadata.csv")
##############################################################################
# CLUSTERING
print("clustering ...\n")
# af = AffinityPropagation(damping=0.9, affinity="euclidean", preference=-50).fit(matrix)
af = AffinityPropagation().fit(matrix) # default
cluster_centers_indices = af.cluster_centers_indices_
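The script stops right after fitting the model. A hypothetical continuation (not part of the original example) could summarize the result using scikit-learn's standard AffinityPropagation attributes:

labels = af.labels_                        # cluster index assigned to each document
n_clusters = len(cluster_centers_indices)  # one exemplar per cluster
print("found %d clusters for %d documents" % (n_clusters, len(labels)))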