This article collects typical usage examples of the ngram.NGram class in Python. If you are wondering what the Python NGram class does, how to use it, or want to see it in action, the curated examples here should help.
Below are 15 code examples of the NGram class, ordered by popularity.
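Most of the examples use NGram from the `ngram` package (python-ngram), which indexes a collection of items by their character n-grams and supports fuzzy search against that index; a couple of the later examples (12 and 13) use project-specific NGram language-model classes that happen to share the name. As a minimal sketch of the core API, assuming the package is installed (pip install ngram):

from ngram import NGram

# Build an index over a few strings (character trigrams by default, N=3).
index = NGram(['spam', 'spain', 'eggs'])

# Fuzzy search returns (item, similarity) pairs, best match first.
print(index.search('spa'))

# Static pairwise similarity between two strings, no index needed.
print(NGram.compare('spam', 'spain'))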
Example 1: build_multiclusters

def build_multiclusters(inlines, threshold=0.05, N=4):
    clusters = []
    ignoreus = []
    for i, iline in enumerate(inlines):
        if i in ignoreus:
            continue
        iString = " ".join(iline.split(" :::: ")[:3])
        ignoreus.append(i)
        icluster = {iline: -1}
        iModel = NGram(iString)
        for j in range(i, len(inlines)):
            if j in ignoreus:
                continue
            jline = inlines[j]
            jString = " ".join(jline.split(" :::: ")[:3])
            results = iModel.search(jString)
            score = sum(y for x, y in results) / len(results) \
                if len(results) > 0 else 0.0
            print(score)
            if score > threshold:
                icluster[jline] = score
                iModel.add(jString)
                ignoreus.append(j)
        clusters.append(icluster)
    return clusters
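The function expects each input line to hold fields separated by " :::: " and groups lines whose first three fields score above the threshold. A hypothetical invocation (the field values are made up for illustration; the clustering you get depends on the threshold):

lines = [
    "Smith, J. :::: 2001 :::: Deep parsing :::: rest-of-record",
    "Smith, John :::: 2001 :::: Deep parsing :::: rest-of-record",
]
for cluster in build_multiclusters(lines, threshold=0.05):
    print(list(cluster))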
Example 2: filterByOp

def filterByOp(self, clone):
    opStr1 = ""
    opStr2 = ""
    indx1, start1, end1 = clone[1]
    indx2, start2, end2 = clone[2]
    for i in range(start1, end1 + 1):
        opStr1 += str(self.op1_hash.get(i, -1))
    for i in range(start2, end2 + 1):
        opStr2 += str(self.op2_hash.get(i, -1))
    if config.DEBUG:
        print("start1 = %d, end1 = %d, ops = %s" % (start1, end1, opStr1))
        print("start2 = %d, end2 = %d, ops = %s" % (start2, end2, opStr2))
    # Keep the clone pair only if both operation strings have changed.
    if not (self.hasChanged(opStr1) and self.hasChanged(opStr2)):
        return None
    idx = NGram(N=config.NGRAM)
    ngram1 = list(idx.ngrams(opStr1))
    ngram2 = list(idx.ngrams(opStr2))
    metric = self.compareList(ngram1, ngram2)
    return metric
Example 3: map

def map(self, phrase):
    for term in phrase:
        if len(term) > 4:
            continue
        for word in self.corpus:
            z = set(term) & set(word)
            if len(z) > 0 and len(z) < len(term):
                g = NGram(z - set(term))
            else:
                # At this point we assume context is not informative,
                # so we resort to a fuzzy lookup.
                g = NGram(word)
            # Drop the term itself so it cannot match itself.
            g.remove(term)
            matches = g.search(term)
            key = None
            value = None
            if len(matches) > 0:
                matches = list(matches[0])
                Pz_ = len(matches) / self.size
                Px_ = fuzz.ratio(term, matches[0]) / 100
                if Px_ > 0.5 and len(term) < len(matches[0]) and len(matches[0]) >= 4:
                    key = term
                    value = [matches[0], Pz_, Px_, 1]
            self.emit(key, value)
Example 4: select_translation

def select_translation(sentence, idx, word, translations):
    # Make sure a subject pronoun is rendered in subject form.
    # Heuristic: if it is the first word, or the previous word is
    # punctuation or a conjunction, treat it as a subject.
    if word[1] == 'r' and word[0] in subject_pronoun:
        if idx == 0 or sentence[idx-1][1] in ['x', 'c']:
            return (subject_pronoun[word[0]], 'pron')
    # Handle the special case: <digits>/m 日/m
    if word[1] == 'm':
        if DIGITS_PATTERN.match(word[0]):
            if idx + 1 < len(sentence) and sentence[idx+1][0] == u'日':
                # Return a proper date string.
                return (translate_date(int(word[0])), 'n')
            else:
                # Return the digits directly.
                return (word[0], 'n')
        elif word[0] == u'日':
            # Symmetric case: the date was handled with the preceding digits.
            if idx > 0 and DIGITS_PATTERN.match(sentence[idx-1][0]):
                return ('', '')
    # Construct a list of translations with the same POS as word.
    same_pos_translations = [t for t in translations
                             if match_pos(word[1], t[1])]
    ng = NGram()
    if len(same_pos_translations) > 0:
        # ng.get(t[0]) presumably returns a unigram score for the candidate.
        max_unigram_trans = max(same_pos_translations, key=lambda t: ng.get(t[0]))
        return max_unigram_trans
    return translations[0]
Example 5: main

def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join"""
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
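This example (and Example 9 below) relies on a lowstrip key function that is not shown here. A plausible definition, in the spirit of the python-ngram tutorial, lowercases the field and normalises its whitespace before indexing:

import re

def lowstrip(term):
    # Assumed helper: collapse whitespace and lowercase for matching.
    term = re.sub(r'\s+', ' ', term)
    return term.lower()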
Example 6: simtitle

def simtitle(request):
    """calculate similarity based on title and naive threshold"""
    n = NGram(warp=WARP, iconv=enrich, key=lambda x: x.title)
    articles = Article.objects.filter(status="live").order_by("date_published")[:1000]
    results = []
    for article in articles:
        article.is_duplicate = False
        article.duplicate_of = None
        article.save()
        sim = filter(lambda a: a[1] >= 0.4, n.search(article.title))
        for match in sim:
            nearest = match[0]
            if nearest.is_duplicate:
                nearest = nearest.duplicate_of
            if NGram.compare(article.title, nearest.title) < 0.7:
                results.append(article)
                break
            article.is_duplicate = True
            article.duplicate_of = nearest
            article.save()
            break
        else:
            results.append(article)
        n.add(article)
    return render(request, "dump.html", dictionary={"article_list": results})
Example 7: test

def test():
    op_filter = opFilter()
    opStr1 = "nnn+"
    opStr2 = "nn+"
    idx = NGram(N=config.NGRAM)
    l1 = list(idx.ngrams(opStr1))
    l2 = list(idx.ngrams(opStr2))
    print(op_filter.compareList(l1, l2))
Example 8: ngram_similarity

def ngram_similarity(univ_name):
    out = {}
    with open("static/UniqueFBUnivNames.csv", 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row = re.sub('[^A-Za-z0-9 ]+', ' ', str(row))
            row = re.sub(' +', ' ', str(row))  # collapse repeated spaces
            out['score'] = NGram.compare(str(row).lower(), univ_name, N=1)
            if NGram.compare(str(row).lower(), str(univ_name).lower()) > 0.5:
                out['score_used'] = NGram.compare(str(row).lower(), univ_name)
                out['univ'] = str(row)
                return out
    return out
Example 9: main

def main(left_path, left_column, right_path, right_column,
         outfile, titles, join, minscore, count, warp):
    """Perform the similarity join

    >>> _ = open('left.csv', 'w').write('''ID,NAME
    ... 1,Joe
    ... 2,Kin
    ... 3,ZAS''')
    >>> _ = open('right.csv', 'w').write('''ID,NAME
    ... A,Joe
    ... B,Jon
    ... C,Job
    ... D,Kim''')
    >>> main(left_path='left.csv', left_column=1,
    ...      right_path='right.csv', right_column=1, outfile='out.csv',
    ...      titles=True, join='outer', minscore=0.24, count=5, warp=1.0)
    >>> print(open('out.csv').read())  #doctest: +NORMALIZE_WHITESPACE
    ID,NAME,Rank,Similarity,ID,NAME
    1,Joe,1,1.0,A,Joe
    1,Joe,2,0.25,B,Jon
    1,Joe,3,0.25,C,Job
    2,Kin,1,0.25,D,Kim
    3,ZAS
    <BLANKLINE>
    """
    right_file = csv.reader(open(right_path, 'r'))
    if titles:
        right_header = next(right_file)
    index = NGram((tuple(r) for r in right_file),
                  threshold=minscore,
                  warp=warp, key=lambda x: lowstrip(x[right_column]))
    left_file = csv.reader(open(left_path, 'r'))
    out = csv.writer(open(outfile, 'w'), lineterminator='\n')
    if titles:
        left_header = next(left_file)
        out.writerow(left_header + ["Rank", "Similarity"] + right_header)
    for row in left_file:
        if not row: continue  # skip blank lines
        row = tuple(row)
        results = index.search(lowstrip(row[left_column]), threshold=minscore)
        if results:
            if count > 0:
                results = results[:count]
            for rank, result in enumerate(results, 1):
                out.writerow(row + (rank, result[1]) + result[0])
        elif join == "outer":
            out.writerow(row)
Example 10: get_distr

def get_distr(strlist, n_len):
    alphabet = ['A', 'C', 'G', 'T', 'N']
    n = NGram(N=n_len, pad_len=0)
    all_ngrams = 0
    grams = init_grams_dict(n_len, alphabet)
    for item in strlist:
        if item == '':
            continue
        # _split is python-ngram's private generator of a string's n-grams.
        ngram_list = list(n._split(item))
        for ng in ngram_list:
            if ng in grams:
                grams[ng] += 1.0
            all_ngrams += 1
    for item in grams:
        grams[item] /= all_ngrams
    return grams
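get_distr depends on an init_grams_dict helper that is not shown. A minimal sketch consistent with its use above, assuming it maps every possible n-gram over the alphabet to a zero count:

from itertools import product

def init_grams_dict(n_len, alphabet):
    # Assumed behaviour: one zero-count entry per possible n-gram.
    return {''.join(gram): 0.0 for gram in product(alphabet, repeat=n_len)}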
Example 11: verify

def verify(self, text_compare):
    results = []
    dictio = []
    file2 = open(text_compare, "r")
    linea2 = file2.readline()
    while linea2 != '':
        if linea2 != '\n':
            dictio += [self.ng.items_sharing_ngrams(linea2)]
            compares = 0.0
            for parrafo in self.lsn:
                comp = NGram.compare(parrafo, linea2)
                if compares < comp:
                    compares = comp
            results += [compares]
        linea2 = file2.readline()
    file2.close()
    major_ocurrences = []
    for d in dictio:
        major = 0
        for val in d.values():
            if major < val:
                major = val
        major_ocurrences += [major]
    avg_perc = 0.0
    for r in results:
        avg_perc += r
    avg_perc = avg_perc / len(results)
    print("Highest number of shared n-grams per paragraph of the copied text: " + repr(major_ocurrences))
    print("Similarity percentage: " + repr(avg_perc))
Example 12: main

def main():
    questions_path, answers_path = sys.argv[1:]
    print("Reading Corpus:")
    train_sentences = read_corpus('train_data', disp=True)
    print('\nTraining on Corpus')
    # Note: this NGram is a project-specific n-gram language model
    # (train_model, sentences_probabilities, perplexity), not the
    # similarity index from the `ngram` package.
    model = NGram.train_model(train_sentences, disp=True)
    with open(answers_path, 'r') as answer_file:
        answers = get_sentences(untokenized_text=answer_file.read(),
                                is_tokenized=True,
                                token_start_end=('<s>', '</s>'))
    dev_sentences = answers[:520]
    print('Calculating Probabilities for Dev Sentences:')
    model.sentences_probabilities(dev_sentences, disp=True)
    lambdas = optimize_lambdas(model)
    with open(questions_path, 'r') as question_file:
        questions = get_sentences(untokenized_text=question_file.read(),
                                  is_tokenized=True,
                                  token_start_end=('<s>', '</s>'))
    print('Calculating Probabilities for Test Sentences:')
    model.sentences_probabilities(sentences=questions, disp=True)
    _, sentences_perplexity = model.perplexity(lambdas=lambdas)
    print('Writing sentences and perplexities to file')
    with open('output.txt', 'w') as out_file:
        for i, perplexity in enumerate(sentences_perplexity):
            out_file.write('{}\t{}\n'.format(
                ' '.join(questions[i]).replace('<s0> <s1>', '<s>'), perplexity))
Example 13: test_count_1gram

def test_count_1gram(self):
    ngram = NGram(1, self.sents)
    counts = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
    }
    for gram, c in counts.items():
        self.assertEqual(ngram.count(gram), c)
Example 14: compare_ngrams

def compare_ngrams(left, right, N=2, pad_len=0):
    # `ascii` here is assumed to be a local transliteration helper,
    # not the Python builtin of the same name.
    left = ascii(left)
    right = ascii(right)
    if len(left) == 1 and len(right) == 1:
        # NGram.compare returns 0.0 for a single-letter comparison,
        # even if the letters are equal.
        return 1.0 if left == right else 0.0
    return NGram.compare(left, right, N=N, pad_len=pad_len)
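A quick illustration of the single-character guard (assuming, as noted in the comment above, that ascii is the module's own transliteration helper):

print(compare_ngrams('a', 'a'))           # 1.0, where NGram.compare alone gives 0.0
print(compare_ngrams('color', 'colour'))  # bigram similarity strictly between 0 and 1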
Example 15: test_ngram_search

def test_ngram_search(self):
    """Tests from the original ngram.py, to check that the
    rewrite still uses the same underlying algorithm"""
    # Basic searching of the index
    idx = NGram(self.items)
    self.assertEqual(idx.search('askfjwehiuasdfji'), [
        ('askfjwehiuasdfji', 1.0),
        ('asdfawe', 0.17391304347826086),
        ('asfwef', 0.083333333333333329),
        ('adfwe', 0.041666666666666664)])
    self.assertEqual(idx.search('afadfwe')[:2],
                     [('adfwe', 0.59999999999999998),
                      ('asdfawe', 0.20000000000000001)])
    # Pairwise comparison of strings
    self.assertEqual(NGram.compare('sdfeff', 'sdfeff'), 1.0)
    self.assertEqual(NGram.compare('sdfeff', 'zzzzzz'), 0.0)