This page collects representative usage examples of the nltk.compat.text_type method in Python: what compat.text_type does, how to call it, and how real projects use it. For more context, see the containing module, nltk.compat.
Thirteen code examples of compat.text_type are shown below.
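Background: nltk.compat.text_type was NLTK's Python 2/3 compatibility alias, mirroring six.text_type; it was removed once NLTK dropped Python 2 support, so recent NLTK versions no longer provide it. It is the built-in str on Python 3 and unicode on Python 2. A minimal sketch of the shim:

import sys

# Sketch of the compat alias: text_type is the unicode string type of the
# running interpreter. On modern Python, simply use str instead.
if sys.version_info[0] >= 3:
    text_type = str
    binary_type = bytes
else:
    text_type = unicode  # Python 2 only; undefined on Python 3
    binary_type = str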
Example 1: __init__
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def __init__(self, tokens, name=None):
"""
        Create a Text object.

:param tokens: The source text.
:type tokens: sequence of str
"""
if self._COPY_TOKENS:
tokens = list(tokens)
self.tokens = tokens
if name:
self.name = name
elif ']' in tokens[:20]:
end = tokens[:20].index(']')
self.name = " ".join(text_type(tok) for tok in tokens[1:end])
else:
self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..."
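Example 1 appears to excerpt nltk.text.Text.__init__. A hedged usage sketch (assuming the punkt tokenizer data is installed for word_tokenize):

from nltk.text import Text
from nltk.tokenize import word_tokenize

# With no explicit name, Text derives one from the first eight tokens.
tokens = word_tokenize("The quick brown fox jumps over the lazy dog.")
text = Text(tokens)
print(text.name)  # "The quick brown fox jumps over the lazy..."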
Example 2: tgrep_positions
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def tgrep_positions(pattern, trees, search_leaves=True):
"""
Return the tree positions in the trees which match the given pattern.
:param pattern: a tgrep search pattern
:type pattern: str or output of tgrep_compile()
:param trees: a sequence of NLTK trees (usually ParentedTrees)
:type trees: iter(ParentedTree) or iter(Tree)
    :param search_leaves: whether to return matching leaf nodes
:type search_leaves: bool
:rtype: iter(tree positions)
"""
if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
if search_leaves:
positions = tree.treepositions()
else:
positions = treepositions_no_leaves(tree)
yield [position for position in positions
if pattern(tree[position])]
except AttributeError:
yield []
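Assuming this is nltk.tgrep.tgrep_positions (available in NLTK 3.x), a short usage sketch:

from nltk.tgrep import tgrep_positions
from nltk.tree import ParentedTree

# Yields, for each input tree, the positions of subtrees matching the pattern.
tree = ParentedTree.fromstring(
    '(S (NP (DT the) (NN dog)) (VP (VBD chased) (NP (DT the) (NN cat))))')
for positions in tgrep_positions('NP', [tree]):
    print(positions)  # [(0,), (1, 1)]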
Example 3: tag
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def tag(self, tokens):
"""Tags a single sentence: a list of words.
The tokens should not contain any newline characters.
"""
for token in tokens:
assert "\n" not in token, "Tokens should not contain newlines"
if isinstance(token, compat.text_type):
token = token.encode(self._encoding)
self._hunpos.stdin.write(token + b"\n")
# We write a final empty line to tell hunpos that the sentence is finished:
self._hunpos.stdin.write(b"\n")
self._hunpos.stdin.flush()
tagged_tokens = []
for token in tokens:
tagged = self._hunpos.stdout.readline().strip().split(b"\t")
tag = (tagged[1] if len(tagged) > 1 else None)
tagged_tokens.append((token, tag))
# We have to read (and dismiss) the final empty line:
self._hunpos.stdout.readline()
return tagged_tokens
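Example 3 matches HunposTagger.tag from nltk.tag.hunpos. A hedged sketch; the hunpos binary and model are external dependencies, and the paths below are illustrative:

from nltk.tag import HunposTagger

# Both paths are placeholders for a local hunpos installation.
ht = HunposTagger('/usr/local/share/hunpos/en_wsj.model',
                  path_to_bin='/usr/local/bin/hunpos-tag')
print(ht.tag(['The', 'dog', 'barked', '.']))
ht.close()  # terminates the background hunpos process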
Example 4: tag_sents
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def tag_sents(self, sentences):
encoding = self._encoding
default_options = ' '.join(_java_options)
config_java(options=self.java_options, verbose=False)
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
cmd = list(self._cmd)
cmd.extend(['-encoding', encoding])
# Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
if isinstance(_input, compat.text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
# Run the tagger and get the output
stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar,
stdout=PIPE, stderr=PIPE)
stanpos_output = stanpos_output.decode(encoding)
# Delete the temporary file
os.unlink(self._input_file_path)
# Return java configurations to their default values
config_java(options=default_options, verbose=False)
return self.parse_output(stanpos_output, sentences)
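Example 4 matches StanfordPOSTagger.tag_sents from nltk.tag.stanford (newer NLTK releases recommend the CoreNLP server API instead). A hedged sketch with illustrative local paths:

from nltk.tag import StanfordPOSTagger

# Model and jar paths are placeholders for a local Stanford POS Tagger install.
st = StanfordPOSTagger(
    '/opt/stanford-postagger/models/english-bidirectional-distsim.tagger',
    path_to_jar='/opt/stanford-postagger/stanford-postagger.jar')
print(st.tag_sents([['The', 'dog', 'barked', '.'],
                    ['The', 'cat', 'meowed', '.']]))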
Example 5: __repr__
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def __repr__(self):
"""
Return a string representation for this corpus view that is
similar to a list's representation; but if it would be more
than 60 characters long, it is truncated.
"""
pieces = []
length = 5
for elt in self:
pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr()
length += len(pieces[-1]) + 2
if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2:
return "[%s, ...]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces[:-1])
return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces)
Example 6: _tgrep_node_literal_value
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def _tgrep_node_literal_value(node):
'''
Gets the string value of a given parse tree node, for comparison
using the tgrep node literal predicates.
'''
return (node.label() if _istree(node) else text_type(node))
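This helper lets tgrep's literal predicates compare internal nodes and leaves uniformly: a tree node contributes its label, while a leaf contributes itself as a plain string. A quick illustration:

from nltk.tree import ParentedTree

tree = ParentedTree.fromstring('(NP (DT the) (NN dog))')
print(tree.label())  # internal node -> compared by label: 'NP'
print(tree[0, 0])    # leaf -> already a plain string: 'the'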
Example 7: _execute
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
cmd.extend(['-encoding', encoding])
if self.corenlp_options:
cmd.append(self.corenlp_options)
default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
if isinstance(input_, compat.text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
# Run the tagger and get the output.
if self._USE_STDIN:
input_file.seek(0)
stdout, stderr = java(cmd, classpath=self._classpath,
stdin=input_file, stdout=PIPE, stderr=PIPE)
else:
cmd.append(input_file.name)
stdout, stderr = java(cmd, classpath=self._classpath,
stdout=PIPE, stderr=PIPE)
stdout = stdout.replace(b'\xc2\xa0',b' ')
stdout = stdout.replace(b'\xa0',b' ')
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
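The two byte-level replace calls above guard against non-breaking spaces in CoreNLP output: b'\xc2\xa0' is U+00A0 encoded as UTF-8 and b'\xa0' is the same character in Latin-1; left in place, they would survive decoding and confuse whitespace-based splitting downstream. A quick demonstration:

raw = b'New\xc2\xa0York'
cleaned = raw.replace(b'\xc2\xa0', b' ').replace(b'\xa0', b' ')
print(cleaned.decode('utf-8'))  # 'New York' with an ordinary space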
Example 8: _execute
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
cmd.extend(['-charset', encoding])
_options_cmd = self._options_cmd
if _options_cmd:
cmd.extend(['-options', self._options_cmd])
default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
if isinstance(input_, compat.text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
cmd.append(input_file.name)
# Run the tagger and get the output.
stdout, stderr = java(cmd, classpath=self._stanford_jar,
stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
Example 9: fromxml
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def fromxml(xml):
if isinstance(xml, compat.string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
xml.attrib[key] = compat.text_type(xml.attrib[key])
return Package(**xml.attrib)
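Example 9 appears to be the static Package.fromxml from nltk.downloader, which builds a Package record from an index entry. A hedged sketch using an in-memory element; the attribute values are illustrative, modeled loosely on the NLTK data index:

from xml.etree import ElementTree
from nltk.downloader import Package

elt = ElementTree.fromstring(
    '<package id="punkt" name="Punkt Tokenizer Models" '
    'url="https://example.org/packages/punkt.zip" />')
pkg = Package.fromxml(elt)
print(pkg.id, pkg.name)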
Example 10: segment_sents
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def segment_sents(self, sentences):
"""
"""
encoding = self._encoding
# Create a temporary input file
_input_fh, self._input_file_path = tempfile.mkstemp(text=True)
        # Write the actual sentences to the temporary input file
_input_fh = os.fdopen(_input_fh, 'wb')
_input = '\n'.join((' '.join(x) for x in sentences))
if isinstance(_input, compat.text_type) and encoding:
_input = _input.encode(encoding)
_input_fh.write(_input)
_input_fh.close()
cmd = [
'edu.stanford.nlp.ie.crf.CRFClassifier',
'-sighanCorporaDict', self._sihan_corpora_dict,
'-textFile', self._input_file_path,
'-sighanPostProcessing', 'true',
'-keepAllWhitespaces', 'false',
'-loadClassifier', self._model,
'-serDictionary', self._dict
]
stdout = self._execute(cmd)
# Delete the temporary file
os.unlink(self._input_file_path)
return stdout
Author: SignalMedia, Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, Lines: 33, Source file: stanford_segmenter.py
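Example 10 is from nltk.tokenize.stanford_segmenter.StanfordSegmenter. A hedged sketch; every path below is a placeholder pointing into an unpacked Stanford Segmenter distribution, and segment() delegates to segment_sents():

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

seg = StanfordSegmenter(
    path_to_jar='/opt/stanford-segmenter/stanford-segmenter.jar',
    path_to_sihan_corpora_dict='/opt/stanford-segmenter/data',
    path_to_model='/opt/stanford-segmenter/data/pku.gz',
    path_to_dict='/opt/stanford-segmenter/data/dict-chris6.ser.gz')
print(seg.segment(u'这是斯坦福中文分词器测试'))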
Example 11: fromxml
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def fromxml(xml):
if isinstance(xml, compat.string_types):
xml = ElementTree.parse(xml)
for key in xml.attrib:
xml.attrib[key] = compat.text_type(xml.attrib[key])
children = [child.get('ref') for child in xml.findall('item')]
return Collection(children=children, **xml.attrib)
Author: SignalMedia, Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, Lines: 9, Source file: downloader.py
Example 12: _execute
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def _execute(self, cmd, input_, verbose=False):
encoding = self._encoding
cmd.extend(['-encoding', encoding])
if self.corenlp_options:
cmd.append(self.corenlp_options)
default_options = ' '.join(_java_options)
# Configure java.
config_java(options=self.java_options, verbose=verbose)
# Windows is incompatible with NamedTemporaryFile() without passing in delete=False.
with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file:
# Write the actual sentences to the temporary input file
if isinstance(input_, compat.text_type) and encoding:
input_ = input_.encode(encoding)
input_file.write(input_)
input_file.flush()
# Run the tagger and get the output.
if self._USE_STDIN:
input_file.seek(0)
stdout, stderr = java(cmd, classpath=self._classpath,
stdin=input_file, stdout=PIPE, stderr=PIPE)
else:
cmd.append(input_file.name)
stdout, stderr = java(cmd, classpath=self._classpath,
stdout=PIPE, stderr=PIPE)
stdout = stdout.decode(encoding)
os.unlink(input_file.name)
# Return java configurations to their default values.
config_java(options=default_options, verbose=False)
return stdout
Example 13: tgrep_nodes
# Required import: from nltk import compat [as alias]
# Or: from nltk.compat import text_type [as alias]
def tgrep_nodes(pattern, trees, search_leaves=True):
"""
Return the tree nodes in the trees which match the given pattern.
:param pattern: a tgrep search pattern
:type pattern: str or output of tgrep_compile()
:param trees: a sequence of NLTK trees (usually ParentedTrees)
:type trees: iter(ParentedTree) or iter(Tree)
    :param search_leaves: whether to return matching leaf nodes
:type search_leaves: bool
:rtype: iter(tree nodes)
"""
if isinstance(pattern, (binary_type, text_type)):
pattern = tgrep_compile(pattern)
for tree in trees:
try:
if search_leaves:
positions = tree.treepositions()
else:
positions = treepositions_no_leaves(tree)
yield [tree[position] for position in positions
if pattern(tree[position])]
except AttributeError:
yield []
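tgrep_nodes mirrors tgrep_positions from Example 2 but yields the matching subtrees (or leaf strings) themselves rather than their tree positions:

from nltk.tgrep import tgrep_nodes
from nltk.tree import ParentedTree

tree = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP (VBD barked)))')
for nodes in tgrep_nodes('NN', [tree]):
    for subtree in nodes:
        print(subtree)  # (NN dog)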