本文整理汇总了Python中detector.Detector.canonicalise_submission方法的典型用法代码示例。如果您正苦于以下问题：Python Detector.canonicalise_submission方法的具体用法？Python Detector.canonicalise_submission怎么用？Python Detector.canonicalise_submission使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类detector.Detector的用法示例。
在下文中一共展示了Detector.canonicalise_submission方法的1个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: calculate_document_similarity
# 需要导入模块: from detector import Detector [as 别名]
# 或者: from detector.Detector import canonicalise_submission [as 别名]
def calculate_document_similarity(self, *submissions):
    '''
    Find matching regions of source code between two submissions.

    Each submission is canonicalised and whitespace-stripped, a suffix tree
    is built over the two resulting strings, and every common substring
    yielded by ``common_substrings_longer_than(20)`` is mapped back to a
    line range in each original source.  A candidate range pair is kept
    only when ``self._variable_match`` accepts the two original snippets.

    :param submissions: exactly two submissions in the same language; each
        is read for its ``language``, ``program_source`` and ``id``
        attributes.
    :returns: a two-element list; element ``i`` holds the ``Match`` objects
        built for ``submissions[i]``.
    :raises AssertionError: if there are not exactly two submissions or
        their languages differ.
    '''
    assert len(submissions) == 2
    assert submissions[0].language == submissions[1].language

    line_numbers = []
    canonicalised = []
    # strip whitespace, new lines and remove variable names etc
    for submission in submissions:
        document = Detector.canonicalise_submission(submission)
        stripped, numbers = self._whitespaced_stripped_with_line_numbers(document)
        line_numbers.append(numbers)
        canonicalised.append(stripped)

    # generate generalised suffix tree and collect the long common
    # substrings, longest first
    suffix_tree = SuffixTree(canonicalised)
    common_substrings = sorted(
        suffix_tree.common_substrings_longer_than(20), key=len, reverse=True)

    # Must be a real list: it is indexed per document below, and on
    # Python 3 ``map`` would return an iterator that cannot be subscripted.
    source_lines = [s.program_source.split('\n') for s in submissions]

    # match substrings back into line numbers
    string_indexes = [[], []]
    for substring in common_substrings:
        start_at = [0, 0]
        # the substring may occur multiple times in each string, so keep
        # searching until neither document yields a new occurrence
        while True:
            indexes = []
            substring_originals = []
            empty = False
            new_start_at = start_at[:]
            for i in (0, 1):
                index = canonicalised[i].find(substring, start_at[i])
                if index != -1:
                    # resume just past this hit on the next pass
                    new_start_at[i] = index + 1
                else:
                    # no further occurrence; fall back to the first one
                    index = canonicalised[i].find(substring)
                wrapped = self._wrap_substring_to_lines(
                    canonicalised[i], line_numbers[i],
                    index, index + len(substring))
                # it wrapped to 0 lines so we ignore it
                if wrapped[0] >= wrapped[1]:
                    empty = True
                    continue
                line_index = (line_numbers[i][wrapped[0]],
                              line_numbers[i][wrapped[1]])
                indexes.append(line_index)
                substring_originals.append(
                    '\n'.join(source_lines[i][line_index[0]:line_index[1]]))
            # neither search position advanced: every occurrence was seen
            if start_at == new_start_at:
                break
            start_at = new_start_at
            # only add this index if we can match variables
            if not empty and self._variable_match(*substring_originals):
                for i in (0, 1):
                    string_indexes[i].append(indexes[i])

    document_matches = [[], []]
    for i in (0, 1):
        # separate name: ``line_numbers`` still refers to the per-document
        # offset tables built above
        ranges = self._remove_overlapping_ranges(string_indexes[i])
        for begin, end in ranges:
            # ranges with end - begin <= 1 are not reported
            if end - begin > 1:
                document_matches[i].append(
                    Match(submissions[i].id, begin, end - begin, 0))
    return document_matches