

Python NGram.search Method Code Examples

This article collects typical usage examples of the Python method ngram.NGram.search, gathered from open-source projects. If you are wondering what NGram.search does, how to call it, or what real-world uses look like, the curated examples below should help. You can also browse further usage examples for the containing class, ngram.NGram.


A total of 13 code examples of the NGram.search method are shown below, ordered by popularity by default; a short orientation sketch comes first.
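All of the examples build on the same basic pattern: construct an NGram index from a collection of strings, then call search to get (item, similarity) pairs sorted by decreasing similarity. Here is a minimal, self-contained sketch of that pattern (assuming the python-ngram package is installed and importable as ngram):

from ngram import NGram

# Build a trigram index (N defaults to 3) over a few strings.
index = NGram(["askfjwehiuasdfji", "asdfawe", "asfwef", "adfwe"])

# search() returns (item, similarity) tuples, best match first;
# the optional threshold argument drops weak matches.
for item, score in index.search("afadfwe", threshold=0.1):
    print(item, score)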

Example 1: main

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Author: gpoulter, Project: python-ngram, Lines: 27, Source file: csvjoin.py
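Both this example and Example 7 call a lowstrip helper that is defined elsewhere in csvjoin.py and therefore does not appear in the excerpt. A plausible stand-in (an assumption, not the project's exact code) lowercases the field and collapses whitespace so near-identical names produce the same n-grams:

import re

def lowstrip(term):
    # Lowercase and normalise whitespace before n-gram indexing.
    return re.sub(r"\s+", " ", term.strip().lower())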

Example 2: simtitle

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
def simtitle( request ):
    """calculate similarity based on title and naive threshold"""
    n = NGram( warp=WARP, iconv=enrich, key=lambda x: x.title )
    articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter( lambda a: a[1] >= 0.4, n.search( article.title ) )
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
                if NGram.compare( article.title, nearest.title ) < 0.7:
                    results.append( article )
                    break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append( article )
        n.add( article )
    return render( request, "dump.html", dictionary = { "article_list": results, } )
Author: mrmonkington, Project: channelfunnel, Lines: 27, Source file: views.py

Example 3: map

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
	def map(self,phrase):
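		# Note: this snippet assumes module-level imports from the original project
		# that are not shown here: `from sets import Set` (Python 2 only; the built-in
		# `set` replaces it on Python 3) and `from fuzzywuzzy import fuzz`.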
		for term in phrase:
			if len(term) > 4:
				continue
			for word in self.corpus:
				z = Set(term) & Set(word)
				
				matches = []
				if len(z) > 0 and len(z) < len(term):
					#
					#
					g=NGram(z - Set(term))
					#matches = g.search(term)
				else:
					#
					# At this point we assume context is not informative
					# In the advent of context not being informative, we resort to fuzzy lookup
					#		
					g = NGram(word)
					#matches = g.search(term)
				g.remove(term)
				matches = g.search(term)
				key = None
				value = None					
				if len(matches) > 0:
					matches = list(matches[0])
					Pz_ = len(matches) / self.size
					Px_ = fuzz.ratio(term,matches[0]) / 100
					if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
						key = term
						value= {}
						value= [matches[0],Pz_,Px_,1]
						self.emit (key,value)
Author: weiyixia, Project: CSV-file-repair, Lines: 35, Source file: context.py

Example 4: build_multiclusters

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
def build_multiclusters(inlines, threshold=0.05, N=4):
	clusters = []
	ignoreus = []

	for i, iline in enumerate(inlines):
		if i in ignoreus:
			continue

		iString = " ".join(iline.split(" :::: ")[:3])

		ignoreus.append(i)

		icluster = {}
		icluster[iline] = -1
		iModel = NGram(iString)

		for j in range(i, len(inlines)):
			if j in ignoreus:
				continue
		
			jline = inlines[j]
			jString = " ".join(jline.split(" :::: ")[:3])
		
			results = iModel.search(jString)
			score = sum([y for x,y in results]) / len(results) \
					if len(results) > 0 else 0.0
			print score

			if score > threshold:
				icluster[jline] = score
				iModel.add(jString)
				ignoreus.append(j)

		clusters.append(icluster)
	return clusters
Author: mayhewsw, Project: HMMClustering, Lines: 37, Source file: clusteralgorithm.py

Example 5: test_set_operations

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    def test_set_operations(self):
        """Test advanced set operations"""
        items1 = set(["abcde", "cdefg", "fghijk", "ijklm"])
        items2 = set(["cdefg", "lmnop"])
        idx1 = NGram(items1)
        idx2 = NGram(items2)
        results = lambda L: sorted(x[0] for x in L)
        # Item removal
        self.assertEqual(results(idx1.search('cde')), ["abcde", "cdefg"])
        idx1.remove('abcde')
        self.assertEqual(results(idx1.search('cde')), ["cdefg"])
        # Set intersection operation
        items1.remove('abcde')
        idx1.intersection_update(idx2)
        self.assertEqual(idx1, items1.intersection(items2))
        self.assertEqual(results(idx1.search('lmn')), [])
        self.assertEqual(results(idx1.search('ijk')), [])
        self.assertEqual(results(idx1.search('def')), ['cdefg'])
Author: esbullington, Project: python-ngram, Lines: 20, Source file: test_ngram.py
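The test above works because NGram extends the built-in set, as its documentation describes: ordinary set mutations (add, remove, update, intersection_update) keep the underlying n-gram index in sync, so searches stay correct after the container changes. A minimal sketch of the same idea:

from ngram import NGram

idx = NGram(["abcde", "cdefg"])
idx.add("fghijk")        # plain set mutations also maintain the n-gram index
idx.update(["ijklm"])
idx.remove("abcde")
print([item for item, score in idx.search("cde")])  # only 'cdefg' should remain as a match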

Example 6: test_ngram_search

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    def test_ngram_search(self):
        """Tests from the original ngram.py, to check that the
        rewrite still uses the same underlying algorithm"""

        # Basic searching of the index
        idx = NGram(self.items)
        self.assertEqual(idx.search('askfjwehiuasdfji'), [
            ('askfjwehiuasdfji', 1.0),
            ('asdfawe', 0.17391304347826086),
            ('asfwef', 0.083333333333333329),
            ('adfwe', 0.041666666666666664)])
        self.assertEqual(idx.search('afadfwe')[:2],
                [('adfwe', 0.59999999999999998),
                 ('asdfawe', 0.20000000000000001)])

        # Pairwise comparison of strings
        self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
        self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)
Author: DavidBrear, Project: python-ngram, Lines: 20, Source file: test_ngram.py
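As the last two assertions show, NGram.compare is a class-level helper for one-off pairwise scoring. Keyword arguments such as N and warp appear to be forwarded to the underlying index, matching the constructor parameters used elsewhere on this page; treat that forwarding as an assumption rather than a guarantee:

from ngram import NGram

print(NGram.compare("sdfeff", "sdfeff"))       # identical strings score 1.0
print(NGram.compare("sdfeff", "zzzzzz"))       # no shared n-grams scores 0.0
print(NGram.compare("Ham", "Spam", N=2))       # assumed kwarg: compare with bigrams
print(NGram.compare("Ham", "Spam", warp=2.5))  # assumed kwarg: warp > 1 lifts partial matches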

Example 7: main

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')

    >>> open('right.csv', 'w').write('''ID,NAME
    ... ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ... right_path='right.csv', right_column=1, outfile='out.csv',
    ... titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print open('out.csv').read()  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = right_file.next()
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'))
    if titles:
        left_header = left_file.next()
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Author: Rafiot, Project: python-ngram, Lines: 51, Source file: csvjoin.py

Example 8: wordsoccurrences

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    def wordsoccurrences(self, words_list, option='ortony'):
        frequencies = FreqDist(words_list)
        ordered_unigrams = frequencies.most_common()
        if option == 'ortony':
            lexicon = self.ortony_list
        else:
            lexicon = self.profane_words
        count = 0
        for t_word, count_w in ordered_unigrams:
            lower_word = t_word.lower()
            three_grams = NGram(lexicon)
            likely_words = three_grams.search(lower_word)
            if len(likely_words) > 0:
                # if lower_word in lexicon:
                count += 1 * count_w

            if lower_word in lexicon:
                count += 1
        return count
Author: ARGHZ, Project: ClassifTweets, Lines: 21, Source file: execute_xperiment.py
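Note that the loop above rebuilds the NGram index over the whole lexicon once per token, even though the lexicon never changes. A hedged sketch of the same logic with the index hoisted out of the loop (FreqDist is assumed to come from NLTK, as in the original project):

from nltk import FreqDist
from ngram import NGram

def wordsoccurrences(words_list, lexicon):
    three_grams = NGram(lexicon)               # build the index once, not per word
    count = 0
    for word, word_freq in FreqDist(words_list).most_common():
        lower_word = word.lower()
        if three_grams.search(lower_word):     # any fuzzy match counts the word's frequency
            count += word_freq
        if lower_word in lexicon:              # exact membership adds one more, as in the original
            count += 1
    return count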

Example 9: verify

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
	def verify(self,text_compare):
		results = []
		texto = []
		'''
		file2 = open(text_compare,"r")
		for linea2 in file2.readlines():
			texto+=linea2.split(" ")
		tng=NGram(texto)
		file2.close()
		'''
		file2 = open(text_compare,"r")
		linea2 = file2.readline()
		while linea2 != '':
			texto+=linea2.split(" ")
			linea2 = file2.readline()
		tng=NGram(texto)
		file2.close()

		for ngs in self.ng:
			count=0
			for word in list(ngs):
				for porc in tng.search(word):
					if porc[1]>0.3:
						count+=1
			results+=[count]

		print list(results)

		pos=0
		count=0
		i=0
		for res in results:
			if count<res:
				count=res
				pos=i
			i+=1

		if results[pos]>2:
			print("Tema mas preciso del texto: "+repr(self.topic[pos]))
		else:
			print("No se ha podido precisar de que trata")
		print ""			
Author: elard28, Project: TopicDetector, Lines: 44, Source file: topic.py

Example 10: handle

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    def handle( self, *args, **options ):
        if "simonly" in args:
            new_count = 100000
        else:
            new_count = 0
            for source in Source.objects.filter( scraper = 'feedparser', status__in = ( 'silent', 'live' ) ):
                l = feedparser.parse( source.scraper_config )
                ok = True
                if l[ "bozo" ] == 1:
                    if not isinstance( l[ "bozo_exception" ], feedparser.ThingsNobodyCaresAboutButMe ):
                        ok = False
                if ok:
                    for article in l[ "entries" ]:
                        #print "Reading feed entry %s: '%s'" % ( article[ "id" ], article[ "title" ] )
                        a, created = Article.objects.get_or_create(
                            source = source,
                            # Wordpress RSS IDs are unique internet-wide, and are immutable (unlike URLs)
                            source_reference = article[ "id" ],
                            defaults = {
                                'date_created' : datetime.now(),
                                'source_url' : article[ "link" ],
                                'title' : self.normalise( article[ "title" ] ),
                                'num_comments' : article.get( "slash_comments", 0 ),
                                'summary' : article[ "summary" ],
                                'author' : article.get( "author", "" ),
                                'date_published' : datetime(*(article[ "updated_parsed" ][:6])),
                                'status' : "live"
                            }
                        )
                        if created:
                            #print "Creating new article."
                            pass
                        else:
                            #print "Updating article."
                            pass
                        new_count += 1
                        if article.has_key( "content" ):
                            # TODO test for multiple content blocks and pick most appropriate
                            a.body = article[ "content" ][0][ "value" ]
                        a.tags.clear()
                        for tag in article.get( "tags", () ):
                            a.tags.add( tag[ "term" ] )
                        a.save()

                else:
                    logging.error( "Could not read feed for file '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    logging.error( "Skipping '%s': %s" % ( source.scraper_config, l[ "bozo_exception" ] ) ) 
                    break

        #calculate similarities
        #create a similarity corpus of last 200 docs

        def enrich( obj ):
            s = unicode( obj )
            # simple stop words
            s = re.sub( r"\b(the|of|in|a)\b", "", s, re.IGNORECASE )
            # type prefixes
            s = re.sub( r"^(trailer|review|report|screenshots|video):\s*", "", s, re.IGNORECASE )
            return s
        n = NGram( warp=2.5, iconv=enrich )
        articles = Article.objects.filter( status = "live" ).order_by( "date_published" )[:(new_count*4)]
        for article in articles:
            if "simonly" in args:
                article.is_duplicate = False
                article.duplicate_of = None
                article.save()
                continue
        #articles = Article.objects.filter( status = "live", is_duplicate = False ).order_by( "-date_published" )[:new_count]
        #for article in articles:
            #print( u"similarity for %s" % ( article.title, ) )
            sim = filter( lambda a: a[1] > 0.4, n.search( article.title ) )
            for match in sim:
                nearest = match[0]
                if nearest.source == article.source:
                    continue
                if nearest.is_duplicate:
                    nearest = nearest.duplicate_of
                # do it again!
                if nearest.source == article.source:
                    continue
                article.is_duplicate = True
                article.duplicate_of = nearest
                #print u" is duplicate of %s" % ( nearest.title, )
                article.save()
                break
            n.add( article )
Author: ntas, Project: channelfunnel, Lines: 88, Source file: scrape.py

Example 11: _location_choices

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    def _location_choices(self, search):
        ngram_index = NGram(key=self._location_to_name)
        ngram_index.update(Ward.objects.all())
        ngram_index.update(District.objects.all())
        locations = ngram_index.search(search)[:self.num_choices]
        return [self._location_to_choice(l) for l, _score in locations]
Author: BongoHive, Project: magriculture, Lines: 8, Source file: forms.py
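Examples 2, 10, and 11 all index arbitrary objects rather than plain strings: the key argument tells NGram which string to extract from each item, and search then returns the original objects alongside their scores. A minimal sketch of the pattern, with a hypothetical Place type standing in for the Django models used above:

from collections import namedtuple
from ngram import NGram

Place = namedtuple("Place", ["name"])  # hypothetical stand-in for Ward/District

index = NGram(key=lambda p: p.name.lower())
index.update([Place("North Ward"), Place("South Ward"), Place("Central District")])

# search() yields the original Place objects, not the extracted key strings.
for place, score in index.search("north"):
    print(place.name, score)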

Example 12: run

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
	def run(self):
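		# Note: as in Example 3, this snippet assumes module-level imports that are not
		# shown here: `from sets import Set` (Python 2 only) and `from fuzzywuzzy import fuzz`.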
		N = len(self.context)
		
		imatches = []
		found = {}
		Y = range(0,len(self.bag))
		for i in range(0,N):
			Xo_ = list(self.bag[i])	# skip_gram
			#Y = (Set(range(0,N)) - (Set([i]) | Set(imatches)))
			for ii in Y:
				if self.bag[i] == self.bag[ii] :
					imatches.append(ii) ;
					continue
				#
				# We are sure we are not comparing the identical phrase
				# NOTE: Repetition doesn't yield learning, rather context does.
				# Lets determine if there are common terms
				#
				Z = Set(self.bag[i]) & Set(self.bag[ii])
				
				if len(Z) > 0 and len(Xo_) > 0:

					Xo_ 	= Set(Xo_) - Z # - list(Set(bag[i]) - Set(bag[ii]))
					Yo_ 	= Set(self.bag[ii]) - Z #list(Set(bag[ii]) - Set(bag[i]))
					size 	= len(Xo_)
					g = NGram(Yo_)	
					for term in Xo_:
						
						xo = g.search(term)
						if len(xo) > 0 and len(term) < 4:
							xo = xo[0]
						else:
							continue;
						xo = list(xo)
						xo_i = self.bag[i].index(term) 
						yo_i = self.bag[ii].index(xo[0])
						#
						# We have the pair, and we will compute the distance
						#
						ratio = fuzz.ratio(term,xo[0])/100
						is_subset = len(Set(term) & Set(xo[0])) == len(term)
						if is_subset and len(term) < len(xo[0]) and ratio > 0.5 and xo_i ==yo_i:
							
							xo[1] = [ratio,xo_i]
							if (term not in self.info):
								#xo[1] = ratio
								self.info[term] = [term,xo[0]]+xo[1]
							elif term in self.info and ratio > self.info[term][1] :							
								self.info[term] = [term,xo[0]]+xo[1]
							
							
							imatches.append(ii)
							break;
		#
		# At this point we consolidate all that has been learnt
		# And make it available to the outside word, otherwise client should retrieve it
		#
		self.lock.acquire()
		if self.queue is not None:
			
			for term in self.info:	
				value = ['thread # ',self.name]+list(self.info[term])							
				self.queue.put(value)
		self.lock.release()
Author: weiyixia, Project: CSV-file-repair, Lines: 66, Source file: context.py

Example 13: open

# Required import: from ngram import NGram [as alias]
# Or: from ngram.NGram import search [as alias]
    """
    address_longlat = []      
    for address in location:
        g = geocoder.google(address)
        list_longlat = g.latlnga
        list_longlat.insert(0,address)
        address_longlat.append(list_longlat)
    print address_longlat

    """
    get long lat from data POI using Ngram
    """

    with open("D:/tasya/python/code/Geo-Tag/corpus/sample-poi1.csv") as file:
        reader = csv.reader(file)
        #reader.next()
        corpus = []
        for row in reader:
            corpus.append(row[0])
            
    corpus_name = []
    for word in corpus:
        corpus_name.append(word.split(';')[0])
    address = []    
    G = NGram(corpus_name)
    G_latlng = NGram(corpus)
    for word in location:       
        out = G.search(word)
        out1 = G_latlng.append(out[0][0])
        address.append(out1[0][0])
Author: asyafiq, Project: Geolocation, Lines: 32, Source file: extraction_compile_edit.py
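The final example resolves free-text location names against a POI corpus whose rows appear to follow a name;lat;lng layout (inferred from the split(';') call). A minimal sketch of that lookup with dummy data, independent of the original file paths:

from ngram import NGram

corpus = ["Alpha Park;0.10;0.20", "Beta Square;0.30;0.40"]  # assumed "name;lat;lng" rows

names = [row.split(";")[0] for row in corpus]
row_by_name = dict(zip(names, corpus))
index = NGram(names)

def resolve(query):
    # Return the best-matching corpus row for a free-text place name, or None.
    matches = index.search(query)
    return row_by_name[matches[0][0]] if matches else None

print(resolve("Alpha Prak"))  # a typo still resolves to the "Alpha Park" row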


Note: the ngram.NGram.search examples in this article were compiled by 纯净天空 from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective developers; copyright in the source code remains with the original authors, and any redistribution or use should follow the corresponding project's license. Please do not republish this compilation without permission.