This page collects typical usage examples of the Python method lshash.LSHash.arpoxNN. If you are wondering what LSHash.arpoxNN does, how to call it, or want to see it used in context, the hand-picked code examples below may help. You can also read more about the class it belongs to, lshash.LSHash.
The following shows 3 code examples of the LSHash.arpoxNN method, sorted by popularity by default. You can vote up the examples you like or find useful; your votes help the system recommend better Python code examples.
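Before the full examples, here is a minimal sketch of the call pattern they all share. Note that arpoxNN is not part of the standard lshash package on PyPI (which only exposes query), so the constructor arguments, the extra_data tuple, and the ((point, extra_data), distance) return shape below are assumptions inferred from the examples on this page rather than documented API; the toy vectors are hypothetical stand-ins for the tf-idf rows used later.

import scipy.sparse as sp
from lshash import LSHash  # assumes the modified/forked lshash that provides arpoxNN

dimension = 1000
lsh = LSHash(hash_size=13, input_dim=dimension, num_hashtables=5, max_queue_size=100)

# two toy "tf-idf" rows; in the real examples these come from onlineTfidfVectorizer
doc_a = sp.csr_matrix(([1.0, 2.0], ([0, 0], [3, 17])), shape=(1, dimension))
doc_b = sp.csr_matrix(([1.0, 2.1], ([0, 0], [3, 17])), shape=(1, dimension))

# index doc_a together with (tweet_id, cluster_id) as extra data
lsh.index(input_point=doc_a, extra_data=(111, 0))

# approximate nearest-neighbour query; L caps the number of dot-product comparisons
nn = lsh.arpoxNN(doc_b, L=50)
if nn is not None:
    ((point, (tweet_id, cluster_id)), distance) = nn  # return shape inferred from the examples
    if distance <= 0.5:  # cosine-distance threshold, as in Example 1 below
        print 'doc_b joins cluster', cluster_id  # Python 2 print, matching the examples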
Example 1: run
# Required import: from lshash import LSHash  [as alias]
# Or: from lshash.LSHash import arpoxNN  [as alias]
# Note: this example is Python 2 code and also uses os, json and time; onlineTfidfVectorizer
# is a project-local online tf-idf vectorizer whose import is not shown in the source.
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 5  ## recompute the random vectors if this is changed
    dimension = 5000000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot products) used to find the nearest neighbor
    cos_threshold = .5  ## threshold on the cosine distance between two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables; specify the hash size, the number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)
    clusters = {}  ## maintain the clusters
    num_clusters = 0
    Y = None
    Y1 = None
    f_d = open("output.txt", 'w')
    loc = "processed_tweets/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc + f) as infile:
                for line in infile:
                    ## load 2000 tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter % size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf scores of the tweets as a sparse row matrix
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)
                        ## if the total number of keywords exceeds the pre-specified dimension, raise an error
                        ## (the keyword count is the number of columns of X)
                        if X.get_shape()[1] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise ValueError("dimension exceeded")
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query the LSHash tables for the approximate nearest neighbor
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False
                            ## if a nearest neighbor exists and the cosine distance is below the threshold,
                            ## add the tweet to that neighbor's cluster
                            cluster_id = -1
                            if nn is not None:
                                ## nn = ((point, (tweet_id, cluster_id)), cosine distance)
                                ((a, (b, d)), c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d, []).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1
                            ### index the tweet into the hash tables
                            lsh.index(input_point=temp_tweet, extra_data=tuple([tweet_ids[i], cluster_id]))
                        initial = False
                        Y = X
                        Y1 = tweet_ids[:]
                        tweet_ids = []
                        tweet_text = []
                        print counter
                        print time.clock() - t1
                        f2 = open('time.txt', 'a')
                        f2.write(str(time.clock() - t1) + '\n')
                        f2.close()
                        if counter % 100000 == 0:
                            f2 = open('result.txt', 'a')
                            f2.write(json.dumps(clusters) + "\n")
                            f3 = open('vocab.txt', 'a')
                            f4 = open('vectorizer.txt', 'a')
                            f3.write(json.dumps(vectorizer.vocabulary_) + "\n")
                            f4.write(json.dumps(vectorizer.idf_) + "\n")
                            #print clusters
                            #print vectorizer.vocabulary_
                            f2.close()
#......... part of the code omitted here .........
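The tuple unpacking ((a, (b, d)), c) in Example 1 mirrors what was stored at index time: extra_data is (tweet_id, cluster_id), and arpoxNN returns ((point, extra_data), distance). The helper below is a sketch of that assign-or-create decision in isolation; assign_cluster is a hypothetical name, and the return shape of arpoxNN is inferred from the example above rather than from documented API.

def assign_cluster(lsh, tweet_vec, tweet_id, clusters, num_clusters, cos_threshold=0.5, comparisons=50):
    """Append tweet_id to the cluster of its approximate nearest neighbour, or
    open a new cluster when no neighbour is close enough; returns the updated
    cluster counter."""
    nn = lsh.arpoxNN(tweet_vec, L=comparisons)
    if nn is not None:
        ((point, (nn_tweet_id, nn_cluster_id)), distance) = nn
        if distance <= cos_threshold:
            clusters.setdefault(nn_cluster_id, []).append(tweet_id)
            lsh.index(input_point=tweet_vec, extra_data=(tweet_id, nn_cluster_id))
            return num_clusters
    # no neighbour, or the closest one is too far away: start a new cluster
    clusters.setdefault(num_clusters, []).append(tweet_id)
    lsh.index(input_point=tweet_vec, extra_data=(tweet_id, num_clusters))
    return num_clusters + 1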
Example 2: run
# Required import: from lshash import LSHash  [as alias]
# Or: from lshash.LSHash import arpoxNN  [as alias]
# Note: this example is Python 2 code and also uses os, json, time and scipy.sparse (imported as sp);
# onlineTfidfVectorizer is a project-local online tf-idf vectorizer whose import is not shown in the source.
def run():
    initial = True
    size = 2000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 13  ## recompute the random vectors if this is changed
    dimension = 50000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot products) used to find the nearest neighbor
    cos_threshold = .7  ## threshold on the cosine distance between two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables; specify the hash size, the number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)
    clusters = {}  ## maintain the clusters
    num_clusters = 0
    inv_index = {}  ## inverse mapping from tweet_id to clusters
    Y = None
    Y1 = None
    f_d = open("output.txt", 'w')
    loc = "/Users/dilpreet/Documents/mtp_documents/markedData/data/"
    for root, dirs, filenames in os.walk(loc):
        for f in filenames:
            with open(loc + f) as infile:
                for line in infile:
                    ## load 2000 tweets at a time
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['text'])
                    counter = counter + 1
                    t2 = 0
                    if counter % size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf scores of the tweets as a sparse row matrix
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        print X.get_shape()
                        print len(vectorizer.vocabulary_)
                        ## if the total number of keywords exceeds the pre-specified dimension, raise an error
                        ## (the keyword count is the number of columns of X)
                        if X.get_shape()[1] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise ValueError("dimension exceeded")
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query the LSHash tables for the approximate nearest neighbor
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False
                            ## if a nearest neighbor exists and the cosine distance is below the threshold,
                            ## add the tweet to that neighbor's cluster
                            if nn is not None:
                                ## nn = ((point, tweet_id), cosine distance)
                                ((a, b), c) = nn
                                if c <= cos_threshold:
                                    inv_index[tweet_ids[i]] = inv_index[b]
                                    clusters.setdefault(inv_index[b], []).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous 2000 + i tweets to find the nearest neighbor
                            """ code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                searchY = False
                                if (i == 0 and not initial):
                                    searchY = True
                                if (i == 0 and initial):
                                    inv_index[tweet_ids[i]] = num_clusters
                                    clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                    num_clusters = num_clusters + 1
                                if (i != 0):
                                    Z = X[:i]
                                    #print temp_tweet.shape
                                    t2 = temp_tweet.transpose()
                                    #print i
                                    ## dot products and squared norms needed for the cosine distance
                                    a1 = Z.dot(t2).toarray()
                                    a2 = Z.multiply(Z).sum(axis=1)
                                    a3 = sp.csr_matrix(t2.multiply(t2).sum()).toarray()
                                    a2 = sp.csc_matrix(a2).toarray()
                                    b = [j for j in range(Z.shape[0])]
                                    ## cosine distance = 1 - <z, t> / (||z|| * ||t||); the norms multiply in the denominator
                                    a = min(b, key=lambda x: 1 - float(a1[x][0]) / ((a2[x][0] * a3[0][0]) ** .5))
                                    #a = min(Z, key=lambda x: cosine_dist(x[0], temp_tweet))
                                    #print a
                                    t3 = tweet_ids[a]
                                    if (1 - float(a1[a][0]) / ((a2[a][0] * a3[0][0]) ** .5)) > cos_threshold:
                                        if not initial and i != size - 1:
                                            searchY = True
#......... part of the code omitted here .........
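The a1/a2/a3 arithmetic in the linear-search fallback of Example 2 is a vectorised cosine distance between one sparse tf-idf row and every earlier row of the batch (Z = X[:i], query_row = X.getrow(i)). A cleaner standalone sketch of that computation, written directly against numpy and scipy.sparse rather than copied from the source, looks like this:

import numpy as np

def nearest_by_cosine(Z, query_row):
    """Return (index, cosine distance) of the row of the sparse matrix Z that is
    closest to the 1 x d sparse row query_row, where
    cosine distance = 1 - <z, q> / (||z|| * ||q||)."""
    dots = Z.dot(query_row.T).toarray().ravel()             # <z_j, q> for every row j
    z_norms = np.sqrt(Z.multiply(Z).sum(axis=1)).A.ravel()  # ||z_j||
    q_norm = np.sqrt(query_row.multiply(query_row).sum())   # ||q||
    denom = np.maximum(z_norms * q_norm, 1e-12)             # guard against zero rows
    distances = 1.0 - dots / denom
    best = int(np.argmin(distances))
    return best, float(distances[best])

The distance returned here plays the same role as c in the examples and is compared against cos_threshold in exactly the same way as the hashed lookup.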
Example 3: run
# Required import: from lshash import LSHash  [as alias]
# Or: from lshash.LSHash import arpoxNN  [as alias]
# Note: this example is Python 2 code and also uses os, json and time; onlineTfidfVectorizer
# is a project-local online tf-idf vectorizer whose import is not shown in the source.
def run():
    initial = True
    size = 200000
    tweet_ids = []
    tweet_text = []
    counter = 0
    num_hashtables = 4  ## recompute the random vectors if this is changed
    dimension = 5000000  ## recompute the random vectors if this is changed
    hash_size = 13  ## length of the LSHash of the tweets
    bucket_size = 100  ## size of the queue for each hash in the hash tables
    comparisons = 50  ## upper bound on the number of comparisons (dot products) used to find the nearest neighbor
    cos_threshold = .7  ## threshold on the cosine distance between two tweets
    ## initialize the tf-idf vectorizer
    vectorizer = onlineTfidfVectorizer(min_df=1, smooth_idf=True, stop_words='english', min_dict_size=dimension)
    ## initialize the hash tables; specify the hash size, the number of hash tables and the queue size
    lsh = LSHash(hash_size=hash_size, input_dim=dimension, num_hashtables=num_hashtables, max_queue_size=bucket_size)
    clusters = {}  ## maintain the clusters
    num_clusters = 0
    completed = open('/tmp/completed_tmp.txt')
    completed = completed.readlines()
    completed = set([x.replace('\n', '') for x in completed])
    while True:
        clusters_size_prev = {}
        files = []
        for root, dirs, filenames in os.walk('/tmp/tweets_tmp/'):
            for fname in filenames:
                if fname != '.DS_Store':
                    files.append(fname)
        files = set(files)
        files = files - completed
        ## nothing new to process: wait and poll again
        if len(files) == 0:
            print 'sleeping'
            time.sleep(3000)
            print 'checking'
            continue
        #print files
        tweets_dump = {}
        tweet_ids = []
        tweet_text = []
        time_sleep = time.time()
        for fn in files:
            print fn
            time_tmp2 = time.time()
            with open('/tmp/tweets_tmp/' + fn) as infile:
                for line in infile:
                    ## load tweets in batches of `size` (200000 here)
                    tweet = json.loads(line)
                    tweet_ids.append(tweet['id'])
                    tweet_text.append(tweet['filtered_text'])
                    tweets_dump[str(tweet['id'])] = tweet['text']
                    counter = counter + 1
                    t2 = 0
                    if counter % size == 0:
                        t1 = time.clock()
                        ## X contains the tf-idf scores of the tweets as a sparse row matrix
                        if initial:
                            X = vectorizer.fit_transform(tweet_text)
                        else:
                            X = vectorizer.transform(tweet_text)
                        #print X.get_shape()
                        #print len(vectorizer.vocabulary_)
                        ## if the total number of keywords exceeds the pre-specified dimension, raise an error
                        ## (the keyword count is the number of columns of X)
                        if X.get_shape()[1] > dimension:
                            print X.get_shape()
                            print "dimension exceeded"
                            raise ValueError("dimension exceeded")
                        for i in range(X.get_shape()[0]):
                            temp_tweet = X.getrow(i)
                            ## query the LSHash tables for the approximate nearest neighbor
                            nn = lsh.arpoxNN(temp_tweet, L=comparisons)
                            c = 2
                            scase = False
                            ## if a nearest neighbor exists and the cosine distance is below the threshold,
                            ## add the tweet to that neighbor's cluster
                            cluster_id = -1
                            if nn is not None:
                                ## nn = ((point, (tweet_id, cluster_id)), cosine distance)
                                ((a, (b, d)), c) = nn
                                if c <= cos_threshold:
                                    cluster_id = d
                                    clusters.setdefault(d, []).append(tweet_ids[i])
                                #else:
                                #    scase = True
                            ## else, linearly search through the previous tweets to find the nearest neighbor
                            """ code to linearly search through the tweets """
                            if (c > cos_threshold or nn is None or scase):
                                cluster_id = num_clusters
                                clusters.setdefault(num_clusters, []).append(tweet_ids[i])
                                num_clusters = num_clusters + 1
#......... part of the code omitted here .........
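Example 3 wraps the same clustering logic in a simple directory-polling loop: it diffs the contents of /tmp/tweets_tmp/ against the set of already-processed file names and sleeps when nothing new has arrived. A condensed sketch of just that polling pattern follows; the process callback and the function name are hypothetical placeholders, while the .DS_Store filter and the 3000-second sleep come from the example.

import os
import time

def poll_for_new_files(watch_dir, completed, process, sleep_seconds=3000):
    """Repeatedly hand files in watch_dir that are not yet in `completed` to process()."""
    while True:
        files = set(f for f in os.listdir(watch_dir) if f != '.DS_Store')
        new_files = files - completed
        if not new_files:
            time.sleep(sleep_seconds)  # nothing new yet; wait and poll again
            continue
        for fname in new_files:
            process(os.path.join(watch_dir, fname))
            completed.add(fname)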