本文整理汇总了 Python 中 ngram.NGram.add 方法的典型用法代码示例。如果您想了解 NGram.add 方法的具体用法、调用方式或使用示例,那么这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 ngram.NGram 的用法示例。
在下文中一共展示了NGram.add方法的7个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: build_multiclusters
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def build_multiclusters(inlines, threshold=0.05, N=4):
    """Greedily group lines whose leading fields score as similar.

    Each line is split on " :::: " and only its first three fields, joined
    by single spaces, are compared.  Lines are visited in order: the first
    unclaimed line seeds a cluster and an n-gram model, and every later
    unclaimed line whose mean search score against that model is strictly
    greater than ``threshold`` is added to both the cluster and the model.

    Args:
        inlines: Sequence of strings to cluster.
        threshold: Minimum (exclusive) mean score for a line to join the
            current cluster.
        N: Accepted for backward compatibility; not used by this function.

    Returns:
        A list with one dict per cluster, mapping each member line to its
        score.  The seed line of a cluster maps to -1.
    """
    clusters = []
    # A set, not a list: membership is tested once per (i, j) pair.
    claimed = set()
    for i, iline in enumerate(inlines):
        if i in claimed:
            continue
        claimed.add(i)
        seed_text = " ".join(iline.split(" :::: ")[:3])
        cluster = {iline: -1}
        # NOTE(review): the seed string is passed as the constructor's first
        # positional argument.  If this is python-ngram's NGram(items=...),
        # the string is consumed as an iterable of single characters --
        # confirm whether NGram([seed_text], N=N) was intended.
        model = NGram(seed_text)
        # Every index <= i is already claimed, so start scanning at i + 1.
        for j in range(i + 1, len(inlines)):
            if j in claimed:
                continue
            jline = inlines[j]
            candidate = " ".join(jline.split(" :::: ")[:3])
            results = model.search(candidate)
            if results:
                score = sum(s for _, s in results) / len(results)
            else:
                score = 0.0
            # Parenthesised form is valid (and prints identically) on both
            # Python 2 and Python 3.
            print(score)
            if score > threshold:
                cluster[jline] = score
                model.add(candidate)
                claimed.add(j)
        clusters.append(cluster)
    return clusters
示例2: simtitle
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def simtitle( request ):
    """Flag near-duplicate article titles and render the articles kept as unique.

    The first 1000 live articles, ordered by ``date_published``, are each
    reset to "not a duplicate", saved, and then searched for in an n-gram
    index of the articles processed so far.  An article with no hit scoring
    at least 0.4 is kept as unique; otherwise the first hit decides whether
    it is marked as a duplicate.  Every article is added to the index
    afterwards.  The unique articles are rendered with "dump.html".
    """
    # NOTE(review): this listing lost its indentation.  The nesting of the
    # 0.7 re-check under ``is_duplicate`` and the for/else pairing below are
    # reconstructed -- confirm against the original project.
    index = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    live = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    unique = []
    for article in live:
        # Start from a clean slate before re-evaluating this article.
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        candidates = [ hit for hit in index.search( article.title ) if hit[1] >= 0.4 ]
        for hit in candidates:
            original = hit[0]
            if original.is_duplicate:
                # Follow the link to the article the hit duplicates, then
                # re-check the title similarity against that one.
                original = original.duplicate_of
                if NGram.compare( article.title, original.title ) < 0.7:
                    unique.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = original
            article.save()
            break
        else:
            # No candidate reached the 0.4 threshold.
            unique.append( article )
        index.add( article )
    return render( request, "dump.html", dictionary = { "article_list": unique, } )
示例3: __init__
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
class Plagiarism:
    """Compare the lines of a text file against an n-gram index of another.

    Attributes:
        ng: NGram index holding every non-blank line of the reference file.
        lsn: List snapshot of the items in ``ng``, taken after indexing.
    """

    def __init__(self, text):
        """Index every non-blank line of the file at path ``text``."""
        self.ng = NGram()
        # ``with`` closes the handle even if indexing raises.
        with open(text, "r") as source:
            for linea in source:
                if linea != '\n':
                    self.ng.add(linea)
        self.lsn = list(self.ng)

    def verify(self, text_compare):
        """Print similarity statistics for the file at path ``text_compare``.

        For each non-blank line of the file, records the result of
        ``items_sharing_ngrams`` on the index and the highest
        ``NGram.compare`` score against any indexed line.  Prints, per line,
        the largest shared-n-gram count, followed by the mean of the best
        scores.  The mean is reported as 0.0 when the file contains no
        non-blank lines (previously this raised ZeroDivisionError).

        Returns:
            None; results are written to standard output.
        """
        results = []
        dictio = []
        with open(text_compare, "r") as candidate:
            for linea2 in candidate:
                if linea2 == '\n':
                    continue
                dictio.append(self.ng.items_sharing_ngrams(linea2))
                best = 0.0
                for parrafo in self.lsn:
                    best = max(best, NGram.compare(parrafo, linea2))
                results.append(best)
        # The leading 0 keeps the original floor for lines sharing nothing.
        major_ocurrences = [max([0] + list(d.values())) for d in dictio]
        avg_perc = sum(results, 0.0) / len(results) if results else 0.0
        print("Mayor numero de ocurrencias por parrafo del texto copia: "+repr(major_ocurrences))
        print("Porcentaje Similitud: "+repr(avg_perc))
示例4: handle
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def handle( self, *args, **options ):
    """Import feed entries as articles, then flag cross-source duplicates.

    Phase 1: for every 'feedparser' source with status 'silent' or 'live',
    parse its feed and get-or-create one Article per entry, refreshing the
    body (when the entry has content) and the tags.

    Phase 2: walk live articles ordered by ``date_published`` (at most
    ``new_count * 4`` of them) and mark an article as a duplicate of the
    first sufficiently similar (score > 0.4) previously seen article that
    comes from a different source.  When "simonly" is in ``args``, each
    article's duplicate flags are reset and saved instead.

    NOTE(review): this listing lost its indentation; the nesting below
    (in particular what sits under ``if "content" in article`` and where
    ``new_count`` is incremented) is reconstructed -- confirm against the
    original project.
    """
    if "simonly" in args:
        new_count = 100000
    else:
        new_count = 0

    for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
        feed = feedparser.parse( source.scraper_config )
        # A "bozo" feed is still usable when its exception is of the
        # ThingsNobodyCaresAboutButMe type.
        ok = feed[ "bozo" ] != 1 or isinstance(
            feed[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe )
        if not ok:
            logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, feed[ "bozo_exception" ] ) )
            logging.error( "Skipping '%s': %s" % ( source.scraper_config, feed[ "bozo_exception" ] ) )
            # NOTE(review): this stops processing ALL remaining sources even
            # though the message says "Skipping" -- confirm whether
            # ``continue`` was intended.
            break

        for article in feed[ "entries" ]:
            record, _created = Article.objects.get_or_create(
                source = source,
                # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                source_reference = article[ "id" ],
                defaults = {
                    'date_created' : datetime.now(),
                    'source_url' : article[ "link" ],
                    'title' : self.normalise( article[ "title" ] ),
                    'num_comments' : article.get( "slash_comments", 0 ),
                    'summary' : article[ "summary" ],
                    'author' : article.get( "author", "" ),
                    'date_published' : datetime(*(article[ "updated_parsed" ][:6])),
                    'status' : "live"
                }
            )
            # Counted for every entry, whether it was created or updated.
            new_count += 1
            # ``in`` replaces the Python-2-only ``has_key``.
            if "content" in article:
                # TODO test for multiple content blocks and pick most appropriate
                record.body = article[ "content" ][0][ "value" ]
            record.tags.clear()
            for tag in article.get( "tags", () ):
                record.tags.add( tag[ "term" ] )
            record.save()

    # Compiled with explicit flags: the fourth positional argument of
    # re.sub() is ``count``, so the previous re.sub(..., s, re.IGNORECASE)
    # matched case-sensitively and replaced at most two occurrences.
    stop_words = re.compile( r"\b(the|of|in|a)\b", re.IGNORECASE )
    type_prefix = re.compile( r"^(trailer|review|report|screenshots|video):\s*", re.IGNORECASE )

    def enrich( obj ):
        """Return the text of ``obj`` without stop words or a type prefix."""
        s = unicode( obj )
        # simple stop words
        s = stop_words.sub( "", s )
        # type prefixes
        s = type_prefix.sub( "", s )
        return s

    # Calculate similarities against an index of the articles seen so far.
    n = NGram( warp=2.5, iconv=enrich )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
    for article in articles:
        if "simonly" in args:
            article.is_duplicate = False
            article.duplicate_of = None
            article.save()
            continue
        sim = [ match for match in n.search( article.title ) if match[1] > 0.4 ]
        for match in sim:
            nearest = match[0]
            if nearest.source == article.source:
                continue
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                # The resolved original may itself be from the same source.
                if nearest.source == article.source:
                    continue
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        n.add( article )
示例5: test_unigram
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def test_unigram(self):
    """After adding one word to NGram(0), next_word() returns that word."""
    n = NGram(0)
    n.add('after')
    # This comparison used to be a bare expression, so the test could never
    # fail; it must be asserted.
    assert n.next_word() == 'after'
示例6: test_trigram
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def test_trigram(self):
    """A word added under a two-word context is returned for that context."""
    context = ('before', 'other')
    model = NGram(2)
    model.add('after', context)
    assert model.next_word(context) == 'after'
示例7: test_bigram
# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def test_bigram(self):
    """A word added under a one-word context is returned for that context."""
    # NOTE(review): the context is a plain string -- ('before') without a
    # trailing comma is not a tuple.  Confirm whether ('before',) was meant.
    context = 'before'
    model = NGram(1)
    model.add('after', context)
    assert model.next_word(context) == 'after'