This page collects typical usage examples of the Python method nltk.tree.ParentedTree.convert. If you have been wondering how exactly to use ParentedTree.convert, what it is good for, or where to find examples of it, the curated code samples below may help. You can also read more about the containing class, nltk.tree.ParentedTree.
The following shows 15 code examples of ParentedTree.convert, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
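Before the examples: ParentedTree.convert copies an ordinary nltk.tree.Tree (or any Tree subclass) into a ParentedTree, whose subtrees know their parent, siblings and tree position. A minimal demonstration using only nltk:

from nltk.tree import Tree, ParentedTree

tree = Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
ptree = ParentedTree.convert(tree)         # same shape, now with parent pointers

np_subtree = ptree[0]
print(np_subtree.parent().label())         # 'S'
print(np_subtree.right_sibling().label())  # 'VP'
print(np_subtree.treeposition())           # (0,)

plain = Tree.convert(ptree)                # and back to an ordinary Tree (cf. Example 5)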
Example 1: assign_slots
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def assign_slots(tokens, tag_tree, word_tree):
    stopword_list = stopwords.words('english')
    tokens_with_slot_tags = []
    word_tree = ParentedTree.convert(word_tree)
    tag_tree = ParentedTree.convert(tag_tree)
    word_tree_with_cats = tag_words_with_categories(word_tree)
    tag_tree_with_cats = tag_words_with_categories(tag_tree)
    for i, word in enumerate(tokens):
        tag = finalize_tags(i, word, tag_tree_with_cats, word_tree_with_cats)
        tokens_with_slot_tags.append((word, tag))
    found_query_focus = False
    for i, item in enumerate(tokens_with_slot_tags):
        word, tag = item
        if tag in ['USER', 'MEDIA', 'NETWORK'] and not found_query_focus:
            tokens_with_slot_tags[i] = (word, 'SEARCH')
            found_query_focus = True
        elif tag == UNK:
            tokens_with_slot_tags[i] = (word, 'KEYWORD')
    slots = {}
    for word, tag in tokens_with_slot_tags:
        if tag == 'SKIP':
            continue
        elif tag == 'KEYWORD':
            if 'KEYWORDS' not in slots:
                slots['KEYWORDS'] = []
            if word not in stopword_list and word not in PUNCTUATION:
                slots['KEYWORDS'].append(word)
        else:
            if tag not in slots:
                slots[tag] = word
            else:
                previous_words = slots[tag]
                slots[tag] = ' '.join([previous_words, word])
    return slots
Example 2: syntax_similarity_two_documents
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def syntax_similarity_two_documents(self, doc1, doc2, average=False):  # syntax similarity of two single documents
    global numnodes
    doc1sents = self.sent_detector.tokenize(doc1.strip())
    doc2sents = self.sent_detector.tokenize(doc2.strip())
    for s in doc1sents:  # to handle unusually long sentences
        if len(s.split()) > 100:
            return "NA"
    for s in doc2sents:
        if len(s.split()) > 100:
            return "NA"
    try:  # to handle parse errors; these can happen when there is an unusually long word in the sentence
        doc1parsed = self.parser.raw_parse_sents(doc1sents)
        doc2parsed = self.parser.raw_parse_sents(doc2sents)
    except Exception as e:
        sys.stderr.write(str(e))
        return "NA"
    costMatrix = []
    doc1parsed = list(doc1parsed)
    for i in range(len(doc1parsed)):
        doc1parsed[i] = list(doc1parsed[i])[0]
    doc2parsed = list(doc2parsed)
    for i in range(len(doc2parsed)):
        doc2parsed[i] = list(doc2parsed[i])[0]
    for i in range(len(doc1parsed)):
        numnodes = 0
        sentencedoc1 = ParentedTree.convert(doc1parsed[i])
        tempnode = Node(sentencedoc1.root().label())
        new_sentencedoc1 = self.convert_mytree(sentencedoc1, tempnode)
        temp_costMatrix = []
        sen1nodes = numnodes
        for j in range(len(doc2parsed)):
            numnodes = 0.0
            sentencedoc2 = ParentedTree.convert(doc2parsed[j])
            tempnode = Node(sentencedoc2.root().label())
            new_sentencedoc2 = self.convert_mytree(sentencedoc2, tempnode)
            ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
            ED = ED / (numnodes + sen1nodes)  # normalize the edit distance by the total node count
            temp_costMatrix.append(ED)
        costMatrix.append(temp_costMatrix)
    costMatrix = np.array(costMatrix)
    if average:
        return 1 - np.mean(costMatrix)
    else:
        indexes = su.linear_assignment(costMatrix)
        total = 0
        rowMarked = [0] * len(doc1parsed)
        colMarked = [0] * len(doc2parsed)
        for row, column in indexes:
            total += costMatrix[row][column]
            rowMarked[row] = 1
            colMarked[column] = 1
        for k in range(len(rowMarked)):
            if rowMarked[k] == 0:
                total += np.min(costMatrix[k])
        for c in range(len(colMarked)):
            if colMarked[c] == 0:
                total += np.min(costMatrix[:, c])
        maxlengraph = max(len(doc1parsed), len(doc2parsed))
        return 1 - (total / maxlengraph)
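Note: Node and simple_distance above match the API of the zss tree-edit-distance package, and convert_mytree (not shown in the excerpt) evidently copies an NLTK ParentedTree into zss nodes while counting them in the global numnodes. A minimal self-contained sketch of that conversion pattern, under those assumptions:

from nltk.tree import Tree, ParentedTree
from zss import Node, simple_distance

def to_zss(t):
    # copy an NLTK tree into a zss.Node tree; returns (root, node_count)
    root = Node(t.label() if isinstance(t, Tree) else str(t))
    count = 1
    if isinstance(t, Tree):
        for child in t:
            kid, c = to_zss(child)
            root.addkid(kid)
            count += c
    return root, count

a = ParentedTree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBD sat)))")
b = ParentedTree.fromstring("(S (NP (DT the) (NN dog)) (VP (VBD ran) (ADVP (RB fast))))")
za, na = to_zss(a)
zb, nb = to_zss(b)
print(simple_distance(za, zb) / (na + nb))  # normalized edit distance, as in the example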
Example 3: add_tree
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def add_tree(self, datum):
    # parse tree and binarize
    tree = Tree.fromstring(datum["raw_tree"])
    tree.chomsky_normal_form()
    tree.collapse_unary(collapsePOS=True)
    tree = ParentedTree.convert(tree)
    # assign indices to subtrees
    indices = {}
    counter = 0
    for t in tree.subtrees():
        indices[t.treeposition()] = counter
        counter += 1
    # generate parent pointers and labels
    # (labels = one instance of sent in sents, in treelstm terminology)
    parents = [0] * (counter - 1)
    labels = []
    counter = 0
    for t in tree.subtrees():
        parent = t.parent()
        if parent is not None:
            parents[counter] = indices[parent.treeposition()]
            counter += 1
        if isinstance(t[0], str):  # the original also checked Python 2's `unicode`
            labels.append(t[0])
    self.parents_file.write(" ".join(map(str, parents)) + "\n")
    self.sents_file.write(" ".join(labels) + "\n")
    self.trees.append(datum)
    return len(self.trees) - 1  # ID
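To see what add_tree writes out, here is the same indexing logic on a tiny tree (a sketch using only nltk): each non-root subtree, in preorder, gets the preorder index of its parent.

from nltk.tree import Tree, ParentedTree

tree = Tree.fromstring("(S (NP (NN dogs)) (VP (VBP bark)))")
tree.chomsky_normal_form()
tree.collapse_unary(collapsePOS=True)
tree = ParentedTree.convert(tree)

indices = {t.treeposition(): i for i, t in enumerate(tree.subtrees())}
parents = [indices[t.parent().treeposition()]
           for t in tree.subtrees() if t.parent() is not None]
print(parents)  # parent index of each non-root subtree, in preorder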
Example 4: syntax_similarity_conversation
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def syntax_similarity_conversation(self, documents1, average=False):  # syntax similarity of each document with the one that follows it
    global numnodes
    documents1parsed = []
    for d1 in range(len(documents1)):
        sys.stderr.write(str(d1) + "\n")
        # print(documents1[d1])
        tempsents = self.sent_detector.tokenize(documents1[d1].strip())
        for s in tempsents:
            if len(s.split()) > 100:
                documents1parsed.append("NA")
                break
        else:
            temp = list(self.parser.raw_parse_sents(tempsents))
            for i in range(len(temp)):
                temp[i] = list(temp[i])[0]
                temp[i] = ParentedTree.convert(temp[i])
            documents1parsed.append(list(temp))
    results = OrderedDict()
    for d1 in range(len(documents1parsed)):
        d2 = d1 + 1
        if d2 == len(documents1parsed):
            break
        if documents1parsed[d1] == "NA" or documents1parsed[d2] == "NA":
            continue
        costMatrix = []
        for i in range(len(documents1parsed[d1])):
            numnodes = 0
            tempnode = Node(documents1parsed[d1][i].root().label())
            new_sentencedoc1 = self.convert_mytree(documents1parsed[d1][i], tempnode)
            temp_costMatrix = []
            sen1nodes = numnodes
            for j in range(len(documents1parsed[d2])):
                numnodes = 0.0
                tempnode = Node(documents1parsed[d2][j].root().label())
                new_sentencedoc2 = self.convert_mytree(documents1parsed[d2][j], tempnode)
                ED = simple_distance(new_sentencedoc1, new_sentencedoc2)
                ED = ED / (numnodes + sen1nodes)
                temp_costMatrix.append(ED)
            costMatrix.append(temp_costMatrix)
        costMatrix = np.array(costMatrix)
        if average:
            # note: the original returned 1 - np.mean(costMatrix) here, which cut the
            # loop short after the first document pair; storing it per pair instead
            results[(d1, d2)] = 1 - np.mean(costMatrix)
            continue
        indexes = su.linear_assignment(costMatrix)
        total = 0
        rowMarked = [0] * len(documents1parsed[d1])
        colMarked = [0] * len(documents1parsed[d2])
        for row, column in indexes:
            total += costMatrix[row][column]
            rowMarked[row] = 1
            colMarked[column] = 1
        for k in range(len(rowMarked)):
            if rowMarked[k] == 0:
                total += np.min(costMatrix[k])
        for c in range(len(colMarked)):
            if colMarked[c] == 0:
                total += np.min(costMatrix[:, c])
        maxlengraph = max(len(documents1parsed[d1]), len(documents1parsed[d2]))
        results[(d1, d2)] = 1 - total / maxlengraph  # (alternatives once tried: minWeight/minlengraph, randtotal/lengraph)
    return results
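su.linear_assignment in Examples 2 and 4 (its import alias is not shown) looks like the long-deprecated sklearn.utils.linear_assignment_.linear_assignment; that is an assumption. The same Hungarian matching over the cost matrix is available today from SciPy, as in this sketch:

import numpy as np
from scipy.optimize import linear_sum_assignment

costMatrix = np.array([[0.2, 0.7],
                       [0.5, 0.1],
                       [0.4, 0.9]])
rows, cols = linear_sum_assignment(costMatrix)
indexes = list(zip(rows, cols))       # the (row, column) pairs the examples iterate over
total = costMatrix[rows, cols].sum()  # cost of the optimal matching
print(indexes, total)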
Example 5: merge_tree_nnps
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def merge_tree_nnps(tree):
    """
    Takes a parse tree and merges any consecutive leaf nodes that come from NNPs.
    For example, a segment like:
        (NP
            (JJ old)
            (NNP Pierre)
            (NNP Vinken)
        )
    becomes:
        (NP
            (JJ old)
            (NNP PierreVinken)
        )
    """
    # require a parented tree to get a subtree's tree position
    p = ParentedTree.convert(tree)
    # iterate subtrees of height 3: this is where NPs leading to NNPs leading to lexicalizations will be
    for s in p.subtrees(filter=lambda s: s.height() == 3):
        # merge NNPs in the list representation of this tree's children: [(POS, word), ...]
        new_noun_phrase = merge_tagged_nnps([(c.label(), c[0]) for c in s])
        child_str = " ".join("(%s %s)" % (pos, word) for pos, word in new_noun_phrase)
        # create a new subtree with the merged NNPs
        new_s = ParentedTree.fromstring("(%s %s)" % (s.label(), child_str))
        # replace the old subtree with the new one
        p[s.treeposition()] = new_s
    return Tree.convert(p)
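merge_tagged_nnps is not shown in the excerpt; below is a plausible stand-in (an assumption, for illustration) together with a round trip through merge_tree_nnps:

from nltk.tree import Tree

def merge_tagged_nnps(tagged):
    # collapse runs of consecutive (NNP, word) pairs into one (NNP, word1word2) pair (hypothetical helper)
    merged = []
    for pos, word in tagged:
        if pos == "NNP" and merged and merged[-1][0] == "NNP":
            merged[-1] = ("NNP", merged[-1][1] + word)
        else:
            merged.append((pos, word))
    return merged

t = Tree.fromstring("(S (NP (JJ old) (NNP Pierre) (NNP Vinken)) (VP (VBD retired)))")
print(merge_tree_nnps(t))
# expected: (S (NP (JJ old) (NNP PierreVinken)) (VP (VBD retired)))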
Example 6: is_pred_nominal
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def is_pred_nominal(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "is_pred_nominal={}".format(False)
    else:
        s_tree = ParentedTree.convert(TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        NP_i = __get_parent_tree__(feats.token, s_tree)
        NP_j = __get_parent_tree__(feats.token_ref, s_tree)
        nominal = __get_max_projection__(s_tree, NP_j)
        copula_verbs = ["is", "are", "were", "was", "am"]

        def check_nominal_construction(tree):
            found = False
            for t in tree:
                if found:
                    break
                elif isinstance(t, ParentedTree):
                    if t == NP_i:
                        brother = t.right_sibling()
                        if isinstance(brother, ParentedTree) and brother.label() == "VP":  # .node in NLTK 2.x
                            verb = brother.leaves()[0]
                            if verb in copula_verbs:
                                for subtree in brother:
                                    if subtree == nominal:
                                        found = True
                                        break
                    else:
                        found = check_nominal_construction(t)
            return found

        return "is_pred_nominal={}".format(check_nominal_construction(s_tree))
Example 7: j_is_subject
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def j_is_subject(feats):
    """WORKS"""
    sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence_ref)]
    ptree = ParentedTree.convert(sentence_tree)
    parent = __get_parent_tree__(feats.token_ref, ptree)
    j_subject = __is_subject__(ptree, feats.token_ref, parent, ptree)
    return "j_is_subject={}".format(j_subject)
Example 8: parse_sentences
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def parse_sentences(self, filename, num_sentences):
    """Parses each one-line sentence into a syntax tree"""
    # Open the file and parse a given number of sentences
    f = open(filename, 'r')
    if num_sentences == 'all':
        num_sentences = None  # slice to the end (the original used -1, which silently drops the last line)
    count = 0
    for sentence in f.readlines()[:num_sentences]:
        if count % 10 == 0:
            print("Number of sentences trained: ", count)
        # Get possible parse trees
        trees = self.parser.raw_parse(sentence.lower())
        for tree in trees:
            self.nonterminal_counts['ROOT'] += 1
            tokenized_sentence = self.tokenize_sentence(sentence)
            # Only extract rules from sentences with more than 8 tokens,
            # to avoid adding rules that generate short, ungrammatical sentences
            if len(tokenized_sentence) > 8:
                self.extract_rules(tree)
            # Convert the tree into a ParentedTree,
            # an NLTK tree that keeps a pointer to each node's parent
            ptree = ParentedTree.convert(tree)
            # Calculate the bigram counts for this sentence
            self.get_bigram(ptree, tokenized_sentence)
        count += 1
Example 9: apposition
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def apposition(feats):  # this was driving me MAD... I should correct the style... aarrrrggghhshs
    """WORKS WITH THE EXAMPLES IN UNITTEST, HOPE THEY WERE NOT A COINCIDENCE"""
    if feats.sentence != feats.sentence_ref:
        return "apposition={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence_ref)]
        ptree = ParentedTree.convert(sentence_tree)
        token_ref = set(feats.token_ref.split("_"))
        token = set(feats.token.split("_"))

        def is_j_apposition(curr_tree):
            found = False
            for child in curr_tree:
                if found:
                    break
                elif isinstance(child, ParentedTree):
                    child_leaves = set(child.leaves())
                    conditions = len(token_ref.intersection(child_leaves)) > 0 and curr_tree.label() == "NP"  # .node in NLTK 2.x
                    if conditions:
                        brother = child.left_sibling()
                        if isinstance(brother, ParentedTree) and brother.label() == ",":
                            antecedent = brother.left_sibling()
                            if isinstance(antecedent, ParentedTree):
                                previous_words = set(antecedent.leaves())
                                if len(token.intersection(previous_words)) > 0:
                                    found = True
                    else:
                        found = is_j_apposition(child)
            return found

        return "apposition={}".format(is_j_apposition(ptree))
Example 10: span
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def span(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "span={}".format(False)
    else:
        s_tree = ParentedTree.convert(TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)])
        i_parent = __get_parent_tree__(feats.token, s_tree)
        j_parent = __get_parent_tree__(feats.token_ref, s_tree)
        return "span={}".format(i_parent == j_parent)
Example 11: update
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def update(self, syntax_tree):
    ptree = ParentedTree.convert(syntax_tree)
    bad_words = [":", ",", ".", "?", ";"]
    for leaf in get_leaves(ptree):
        word = leaf[0]
        if word in bad_words:
            continue
        if word not in self:
            self.add_node(word, num=0, pos=leaf.pos()[0][1])
        self.node[word]["num"] += 1  # networkx 1.x attribute access (self.nodes[...] in 2.x)
    central_leaf = None
    for leaf in get_leaves(ptree):
        if leaf[0] == self.target_word:
            central_leaf = leaf
            break
    if not central_leaf:
        print("Error: target word not in sentence")
    for leaf in get_leaves(ptree):
        word = leaf[0]
        if word in bad_words:
            continue
        if word == self.target_word:
            for other_leaf in get_leaves(ptree):
                other_word = other_leaf[0]
                if word == other_word:
                    continue
                if other_word in bad_words:
                    continue
                if (word, other_word) not in self.edges():
                    self.add_edge(word, other_word, weight=0)
                # edges touching the target word: weight grows with 1/sqrt(distance)
                self.edge[word][other_word]["weight"] += 1.0 / math.sqrt(get_distance(leaf, other_leaf))
        else:
            for other_leaf in get_leaves(ptree):
                other_word = other_leaf[0]
                if word == other_word:
                    continue
                if other_word == self.target_word:
                    continue
                if other_word in bad_words:
                    continue
                if (word, other_word) not in self.edges():
                    self.add_edge(word, other_word, weight=0)
                # other edges: weight grows with the inverse geometric mean of the three
                # pairwise distances between the two words and the target word
                self.edge[word][other_word]["weight"] += 1.0 / math.pow(
                    get_distance(leaf, other_leaf) *
                    get_distance(leaf, central_leaf) *
                    get_distance(other_leaf, central_leaf), 1 / 3.0)
    self.invalidate_cache()
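get_leaves and get_distance are helpers the excerpt does not include. From the way they are used (leaf[0] is a word, leaf.pos()[0][1] is its POS tag), get_leaves evidently yields preterminal subtrees; the stand-ins below are assumptions for illustration, taking distance as linear word-index distance:

from nltk.tree import ParentedTree

def get_leaves(ptree):
    # yield preterminal subtrees such as (NN dog): leaf[0] is the word
    # and leaf.pos() == [('dog', 'NN')] (assumed contract)
    return ptree.subtrees(lambda t: t.height() == 2)

def get_distance(a, b):
    # word-index distance between two preterminals (assumed metric)
    positions = a.root().treepositions('leaves')
    ia = positions.index(a.treeposition() + (0,))
    ib = positions.index(b.treeposition() + (0,))
    return abs(ia - ib)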
Example 12: same_max_NP
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def same_max_NP(feats):
    """WORKS"""
    if feats.sentence != feats.sentence_ref:
        return "same_max_NP={}".format(False)
    else:
        sentence_tree = TREES_DICTIONARY[feats.article + ".raw"][int(feats.sentence)]
        ptree = ParentedTree.convert(sentence_tree)
        parent1 = __get_parent_tree__(feats.token, ptree)
        parent2 = __get_parent_tree__(feats.token_ref, ptree)
        # print("parent of:", feats.token, ":", parent1)
        # print("parent of:", feats.token_ref, ":", parent2)
        max_p_i = __get_max_projection__(ptree, parent1)
        max_p_j = __get_max_projection__(ptree, parent2)
        if max_p_i is not None and max_p_j is not None:
            both_NPs = max_p_i.label() == "NP" and max_p_j.label() == "NP"  # .node in NLTK 2.x
        else:
            both_NPs = False
        return "same_max_NP={}".format(max_p_i == max_p_j and both_NPs)
Example 13: read_treebank_files
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def read_treebank_files(files, extractor, fe):
    """Read the listed treebank files and collect function tagging examples
    from each tree.
    The user-provided feature extractor is applied to each phrase in each
    tree. The extracted feature dicts and the true function tags for each
    phrase are stored in two separate lists, which are returned.
    """
    X = []
    Y = []
    for filename in files:
        scount = 0
        for tree in treebank.parsed_sents(filename):
            tree = ParentedTree.convert(tree)
            treebank_helper.postprocess(tree)
            find_examples_in_tree(tree, X, Y, extractor, fe, filename, scount, 0)
            scount += 1
    return X, Y
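Here treebank is the nltk.corpus.treebank reader. A minimal sketch of the same convert-then-walk pattern on the bundled WSJ sample (assumes the corpus has been downloaded):

import nltk
from nltk.corpus import treebank
from nltk.tree import ParentedTree

nltk.download('treebank', quiet=True)
tree = ParentedTree.convert(treebank.parsed_sents('wsj_0001.mrg')[0])
for phrase in tree.subtrees(lambda t: t.label() == 'NP'):
    parent = phrase.parent()  # available only because of ParentedTree.convert
    print(phrase.treeposition(), parent.label() if parent else None)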
Example 14: parse_sentences
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def parse_sentences(self, filename, num_sentences):
    """Parse each sentence into a tree"""
    f = open(filename, 'r')
    if num_sentences == 'all':
        num_sentences = None  # slice to the end (the original used -1, which drops the last line)
    count = 0
    for sentence in f.readlines()[:num_sentences]:
        if count % 10 == 0:
            print(count)
        trees = self.parser.raw_parse(sentence.lower())
        for tree in trees:
            self.nonterminal_counts['ROOT'] += 1
            tokenized_sentence = self.tokenize_sentence(sentence)
            if len(tokenized_sentence) > 5:
                self.extract_rules(tree)
            ptree = ParentedTree.convert(tree)
            # print(type(ptree))
            self.get_bigram(ptree, tokenized_sentence)
        count += 1
Example 15: get_arginstances
# Required import: from nltk.tree import ParentedTree [as alias]
# Or alternatively: from nltk.tree.ParentedTree import convert [as alias]
def get_arginstances(self, _pbi):
    '''
    Returns a list of ARGInstances, given a PropbankInstance and according to self.features.
    Each feature is normalized according to the rules in its if-block.
    parameters:
        _pbi    PropbankInstance
                a PropbankInstance that contains the arguments to be extracted
    return value:
        list of ARGInstances
    '''
    res = []
    for arg in _pbi.arguments:  # iterate through all arguments in _pbi
        argfeatures = {}
        # predicate feature
        if 'predicate' in self.features:
            argfeatures['predicate'] = re.sub(r'(\w+)\..+', r'\1', _pbi.roleset)  # lemmatize the predicate, then set
            # argfeatures['predicate'] = self.wnl.lemmatize(_pbi.predicate.select(_pbi.tree).leaves()[0], "v")
            # argfeatures['predicate'] = _pbi.predicate.select(_pbi.tree).leaves()[0]
        # path feature
        if 'path' in self.features:
            senTree = ParentedTree.convert(_pbi.tree)
            argTree = arg[0].select(senTree)
            predTree = _pbi.predicate.select(senTree)
            while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*":
                argTree = argTree[0]
            while predTree.label() == "*CHAIN*" or predTree.label() == "*SPLIT*":
                predTree = predTree[0]
            argParents = []
            predParents = []
            while predTree is not None:
                predParents.append(predTree)
                predTree = predTree.parent()
            while argTree is not None:
                argParents.append(argTree)
                argTree = argTree.parent()
            jointNode = None
            for node in argParents:
                if node in predParents:
                    jointNode = node
                    break  # stop at the lowest common ancestor (the original kept scanning and ended at the root)
            stringPath = ""
            for i in range(0, argParents.index(jointNode), 1):
                node = argParents[i]
                stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "^"
            for i in range(predParents.index(jointNode), 0, -1):
                node = predParents[i]
                stringPath += re.sub(r"(\w+)-.+", r"\1", node.label()) + "!"
            argfeatures['path'] = stringPath[:-1]
        # phraseType feature
        if 'phraseType' in self.features:
            argTree = arg[0].select(_pbi.tree)
            while argTree.label() == "*CHAIN*" or argTree.label() == "*SPLIT*":  # traverse the tree until a real constituent is found
                argTree = argTree[0]
            argfeatures['phraseType'] = re.sub(r"(\w+)[-=$\|].+", r"\1", argTree.label())  # normalize (e.g. NP-SUBJ -> NP) and set
        # position feature
        if 'position' in self.features:
            predTreePointer = _pbi.predicate
            while not type(predTreePointer) is PropbankTreePointer:  # traverse while the pointer is not a real constituent
                predTreePointer = predTreePointer.pieces[0]
            pred_wordnum = predTreePointer.wordnum  # set the predicate word number
            arg_wordnum = None
            if type(arg[0]) is PropbankTreePointer:
                arg_wordnum = arg[0].wordnum
            # PropbankChainTreePointer and PropbankSplitTreePointer don't have wordnums and must be traversed
            elif (type(arg[0]) is PropbankChainTreePointer) or (type(arg[0]) is PropbankSplitTreePointer):
                arg_pieces = arg[0].pieces
                # traverse the tree (always take the left-most subtree) until a PropbankTreePointer is found
                while type(arg_pieces[0]) is not PropbankTreePointer:
                    arg_pieces = arg_pieces[0].pieces
                # then get the wordnum
                arg_wordnum = arg_pieces[0].wordnum
            # compare word numbers and normalize to 'before' or 'after'
            if arg_wordnum < pred_wordnum:
                argfeatures['position'] = 'before'
            else:
                argfeatures['position'] = 'after'
        # voice feature
        if 'voice' in self.features:
            # extract voice from the PropbankInstance inflection and normalize to 'active', 'passive' or 'NONE'
            if _pbi.inflection.voice == 'a':
                argfeatures['voice'] = 'active'
            elif _pbi.inflection.voice == 'p':
                argfeatures['voice'] = 'passive'
            else:
                argfeatures['voice'] = 'NONE'
        # class feature
        if 'class' in self.features:
            argfeatures['class'] = arg[1].split("-")[0]
# ......... remainder of the code omitted .........