当前位置: 首页>>代码示例>>Python>>正文


Python NGram.add方法代码示例

本文整理汇总了Python中ngram.NGram.add方法的典型用法代码示例。如果您正苦于以下问题：Python NGram.add方法的具体用法？Python NGram.add怎么用？Python NGram.add使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类ngram.NGram的用法示例。


在下文中一共展示了NGram.add方法的7个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: build_multiclusters

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def build_multiclusters(inlines, threshold=0.05, N=4):
    """Greedily group lines whose leading fields score alike in an n-gram index.

    Each line is split on " :::: " and only its first three fields, re-joined
    with single spaces, take part in the comparison. Walking the input in
    order, every line that has not been assigned yet seeds a new cluster and
    a fresh ``NGram`` index. Each later unassigned line is searched against
    that index; when the mean of the returned scores exceeds ``threshold``
    the line joins the cluster and its comparison string is added to the
    index, so the index grows as members are accepted.

    Every computed score is printed to stdout.

    Args:
        inlines: Sequence of raw lines to cluster.
        threshold: Mean search score a line must exceed to join a cluster.
        N: Accepted for interface compatibility; not used by this function.

    Returns:
        A list of dicts, one per cluster, mapping each member line to the
        score it joined with. The seed line of every cluster maps to -1.
    """
    clusters = []
    # A set keeps the "already clustered?" test O(1) per lookup.
    assigned = set()

    for i, iline in enumerate(inlines):
        if i in assigned:
            continue
        assigned.add(i)

        seed_key = " ".join(iline.split(" :::: ")[:3])
        cluster = {iline: -1}
        # NOTE(review): the comparison string is passed as the constructor's
        # first argument. If that parameter is an iterable of items, the index
        # is seeded with single characters rather than the whole string --
        # confirm against the ngram package before relying on the scores.
        model = NGram(seed_key)

        # Index i was just marked as assigned, so scanning starts right after it.
        for j in range(i + 1, len(inlines)):
            if j in assigned:
                continue

            jline = inlines[j]
            candidate_key = " ".join(jline.split(" :::: ")[:3])

            results = model.search(candidate_key)
            if results:
                score = sum(y for _, y in results) / len(results)
            else:
                score = 0.0
            # Parenthesised single-argument form prints the same on Python 2 and 3.
            print(score)

            if score > threshold:
                cluster[jline] = score
                model.add(candidate_key)
                assigned.add(j)

        clusters.append(cluster)
    return clusters
开发者ID:mayhewsw,项目名称:HMMClustering,代码行数:37,代码来源:clusteralgorithm.py

示例2: simtitle

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
def simtitle( request ):
    """Render the live articles that are not flagged as title duplicates.

    Up to 1000 live articles are walked in ``date_published`` order. Each one
    has its duplicate flags reset and saved, then its title is searched in an
    NGram index of the articles seen so far. Only the first hit scoring at
    least 0.4 is considered:

    * no such hit: the article is kept in the result list;
    * the hit is itself a duplicate and the title compares below 0.7 against
      the article that hit points to: the article is kept in the result list;
    * otherwise: the article is saved as a duplicate of the hit (or of the
      article the hit points to).

    Every article is added to the index afterwards, whatever the outcome.
    """
    index = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    live_articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in live_articles:
        # Start from a clean slate before looking for a match.
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()

        close_hits = ( hit for hit in index.search( article.title ) if hit[1] >= 0.4 )
        first_hit = next( close_hits, None )

        if first_hit is None:
            results.append( article )
        else:
            nearest = first_hit[0]
            distinct_enough = False
            if nearest.is_duplicate:
                # Compare against the article the hit was filed under.
                nearest = nearest.duplicate_of
                distinct_enough = NGram.compare( article.title, nearest.title ) < 0.7
            if distinct_enough:
                results.append( article )
            else:
                article.is_duplicate = True
                article.duplicate_of = nearest
                article.save()

        index.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
开发者ID:mrmonkington,项目名称:channelfunnel,代码行数:27,代码来源:views.py

示例3: __init__

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
class Plagiarism:
    """Compare a text file against a reference file, line by line, with n-grams.

    The reference file is indexed once at construction time; ``verify`` then
    scores another file against that index and prints a short report.
    """

    def __init__(self, text):
        """Index every line of the reference file that is not a bare newline.

        Args:
            text: Path of the reference file.
        """
        self.ng = NGram()
        # ``with`` closes the file even if indexing a line raises.
        with open(text, "r") as reference:
            for line in reference:
                if line != '\n':
                    self.ng.add(line)
        # Snapshot of the indexed lines, reused by verify() for pairwise scoring.
        self.lsn = list(self.ng)

    def verify(self, text_compare):
        """Print how closely the lines of ``text_compare`` match the reference.

        For every line that is not a bare newline, two things are collected:
        the mapping returned by ``self.ng.items_sharing_ngrams(line)`` and the
        highest ``NGram.compare`` score of that line against any indexed
        reference line (0.0 when nothing scores higher). The report shows, per
        line, the largest value of that mapping, followed by the mean of the
        best scores. When the file contains no usable line the mean is
        reported as 0.0.

        Args:
            text_compare: Path of the file to check.

        Returns:
            None. The report is written to stdout.
        """
        best_scores = []
        shared_counts = []
        with open(text_compare, "r") as candidate:
            for line in candidate:
                if line == '\n':
                    continue
                shared_counts.append(self.ng.items_sharing_ngrams(line))
                # The leading 0.0 is the floor the score starts from.
                best_scores.append(
                    max([0.0] + [NGram.compare(ref, line) for ref in self.lsn]))

        # The leading 0 is the floor, which also covers an empty mapping.
        major_ocurrences = [max([0] + list(shared.values()))
                            for shared in shared_counts]

        # Guard against a file without usable lines instead of dividing by zero.
        if best_scores:
            avg_perc = sum(best_scores, 0.0) / len(best_scores)
        else:
            avg_perc = 0.0

        print("Mayor numero de ocurrencias por parrafo del texto copia: "+repr(major_ocurrences))
        print("Porcentaje Similitud: "+repr(avg_perc))
开发者ID:elard28,项目名称:plagiarism-ngram,代码行数:47,代码来源:init.py

示例4: handle

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
    def handle( self, *args, **options ):
        """Pull the configured feeds into Article rows, then flag cross-source duplicates.

        Scraping (skipped when "simonly" is among ``args``): every Source with
        scraper 'feedparser' and status 'silent' or 'live' is parsed. A feed
        flagged as bozo whose exception is not a
        ``feedparser.ThingsNobodyCaresAboutButMe`` is logged and skipped. Each
        entry of a readable feed is stored through
        ``Article.objects.get_or_create`` (looked up by source and entry id);
        its body is refreshed when the entry carries content, its tags are
        replaced, and the row is saved.

        Similarity: live articles are walked in ``date_published`` order, up
        to four times the number of entries seen. For each one, title matches
        scoring above 0.4 in an NGram index of the previously walked articles
        are examined; the first match that belongs to a different source, also
        after following its own ``duplicate_of`` link, becomes the article's
        ``duplicate_of``. Every examined article is then added to the index.

        In "simonly" mode the entry count is fixed at 100000 and the walk only
        resets ``is_duplicate`` / ``duplicate_of`` on each article; no
        matching takes place.

        Args:
            *args: Positional command arguments; only "simonly" is inspected.
            **options: Not used by this method.
        """
        sim_only = "simonly" in args

        if sim_only:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
                feed = feedparser.parse( source.scraper_config )
                unreadable = (
                    feed[ "bozo" ] == 1
                    and not isinstance( feed[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe )
                )
                if unreadable:
                    logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, feed[ "bozo_exception" ] ) )
                    logging.error( "Skipping '%s': %s" % ( source.scraper_config, feed[ "bozo_exception" ] ) )
                    # Skip only this source, as the message says; the
                    # remaining sources are still scraped.
                    continue

                for entry in feed[ "entries" ]:
                    a, _created = Article.objects.get_or_create(
                        source = source,
                        # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                        source_reference = entry[ "id" ],
                        defaults = {
                            'date_created' : datetime.now(),
                            'source_url' : entry[ "link" ],
                            'title' : self.normalise( entry[ "title" ] ),
                            'num_comments' : entry.get( "slash_comments", 0 ),
                            'summary' : entry[ "summary" ],
                            'author' : entry.get( "author", "" ),
                            'date_published' : datetime(*(entry[ "updated_parsed" ][:6])),
                            'status' : "live"
                        }
                    )
                    # Counts every entry seen, whether its row was created or already existed.
                    new_count += 1
                    if "content" in entry:
                        # TODO test for multiple content blocks and pick most appropriate
                        a.body = entry[ "content" ][0][ "value" ]
                    a.tags.clear()
                    for tag in entry.get( "tags", () ):
                        a.tags.add( tag[ "term" ] )
                    a.save()

        def enrich( obj ):
            """Normalise the text form of ``obj`` before it is indexed or searched."""
            s = unicode( obj )
            # simple stop words
            # ``flags`` must be passed by keyword: the fourth positional
            # parameter of re.sub is ``count``, not ``flags``.
            s = re.sub( r"\b(the|of|in|a)\b", "", s, flags=re.IGNORECASE )
            # type prefixes
            s = re.sub( r"^(trailer|review|report|screenshots|video):\s*", "", s, flags=re.IGNORECASE )
            return s

        n = NGram( warp=2.5, iconv=enrich )
        articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
        for article in articles:
            if sim_only:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue

            sim = filter( lambda a: a[1] > 0.4, n.search( article.title ) )
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # The source may be the same again after following the
                # duplicate link, so it is checked a second time.
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                article.save()
                break
            n.add( article )
开发者ID:ntas,项目名称:channelfunnel,代码行数:88,代码来源:scrape.py

示例5: test_unigram

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
 def test_unigram(self):
     """A model built with NGram(0) returns the one word it was given, without context."""
     n = NGram(0)
     n.add('after')
     # Assert the prediction; a bare comparison would be evaluated and
     # discarded, so the test could never fail.
     assert n.next_word() == 'after'
开发者ID:alexlafroscia,项目名称:class-projects,代码行数:6,代码来源:test_ngram.py

示例6: test_trigram

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
 def test_trigram(self):
     """A model built with NGram(2) returns the word stored under a two-word context."""
     context = ('before', 'other')
     model = NGram(2)
     model.add('after', context)
     predicted = model.next_word(context)
     assert predicted == 'after'
开发者ID:alexlafroscia,项目名称:class-projects,代码行数:6,代码来源:test_ngram.py

示例7: test_bigram

# 需要导入模块: from ngram import NGram [as 别名]
# 或者: from ngram.NGram import add [as 别名]
 def test_bigram(self):
     """A model built with NGram(1) returns the word stored under a one-word context."""
     # The context is a plain string: parentheses without a trailing comma
     # do not create a tuple.
     context = 'before'
     model = NGram(1)
     model.add('after', context)
     predicted = model.next_word(context)
     assert predicted == 'after'
开发者ID:alexlafroscia,项目名称:class-projects,代码行数:6,代码来源:test_ngram.py


注:本文中的ngram.NGram.add方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。