

Python Dataset.open Method Code Examples

This article collects typical usage examples of the Dataset.Dataset.open method in Python: what exactly Dataset.open does, and how to use it. The curated examples below should answer those questions; you can also explore other usage examples of the containing Dataset.Dataset class.


The following presents 6 code examples of the Dataset.open method, sorted by popularity by default.
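Every example below assumes a small Dataset wrapper whose open method returns an object carrying two parallel lists: X (answer texts) and Y (the corresponding authors), as example 6 shows. A minimal sketch of that assumed interface follows; it is a hypothetical reconstruction, not the project's actual Dataset.py:

import json

class Dataset(object):
	def __init__(self, X, Y):
		self.X = X   # list of answer texts
		self.Y = Y   # list of corresponding author labels

	@classmethod
	def open(cls, name):
		# Hypothetical on-disk layout: one JSON list of {"answer", "author"} records.
		with open(name + '.json') as f:
			records = json.load(f)
		return cls([r['answer'] for r in records],
				   [r['author'] for r in records])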

Example 1: main

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# This excerpt also assumes: import nltk; from nltk.tokenize import RegexpTokenizer; from random import shuffle
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	# Z = [re.findall(r"[\w']+", x) for x in X]
	# Z = [filter(None, x.split('.')) for x in X]
	# Z = ["".join(s) for s in Z]
	# Z = [z.split(' ') for z in Z]
	# Z = [[len(s) for s in z] for z in Z]

	# feature = []
	# for a in Z:
	# 	wordLenDist = [0]*100
	# 	for ln in a:
	# 			wordLenDist[ln]+=1
	# 	feature.append(wordLenDist)

	feature = []
	tokenizer = RegexpTokenizer(r'\w+')   # matches word tokens only, no punctuation
	for x in X:
		All = len(nltk.word_tokenize(x))                   # total tokens, punctuation included
		numPunctuation = All - len(tokenizer.tokenize(x))  # punctuation token count
		numWords = All - numPunctuation                    # word token count
		ff = [numPunctuation, numWords]                    # two-dimensional feature per answer
		feature.append(ff)


	X = feature
	# shuffle feature/label pairs together so folds are randomly mixed
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)


	# leave-50-out cross-validation: each pass holds out one block of 50 answers
	si = 0
	acc = 0.0
	cnt = 0
	while si < len(X):
		Xe = X[si:si+50]            # held-out evaluation block
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]     # everything else is training data
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Developer: BigBull90 · Project: anon · Lines: 52 · Source file: punctuation.py
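Examples 1, 3 and 5 all call a train_chunk helper that is not part of these excerpts. A minimal sketch of what such a helper might look like, assuming a scikit-learn linear SVM (the project's actual implementation may differ):

from sklearn.svm import LinearSVC

def train_chunk(X_train, Y_train, X_test, Y_test):
	# fit on the training fold, return accuracy on the held-out fold
	clf = LinearSVC().fit(X_train, Y_train)
	return clf.score(X_test, Y_test)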

Example 2: main

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# getTagsforAll, pred_shuffle, gen_feature_vector and train_chunk are helpers defined elsewhere in second.py
def main():
	global X
	global Y
	global auth_to_id

	ds = Dataset.open('quora')
	# (X, Y) = ([x.split('.') for x in ds.X], ds.Y)
	#X = ([sum([len( filter(None, y.split(' ')) ) for y in x])/len(x) for x in X])
	#X = zip(X, [len( filter(None, x.split('\n')) ) for x in ds.X])
	(X, Y) = (ds.X, ds.Y)

	# assign each author a unique integer id
	mx = 0
	for auth in Y:
		if auth not in auth_to_id:
			auth_to_id[auth] = mx
			mx += 1

	getTagsforAll(X)   # precompute tags for every answer (helper defined elsewhere in second.py)
	# print [x for x in ds.X if len( filter(None, x.split('\n')) ) > 1]

	# print [(x, y) for (x, y) in X if y > 1]
	#X = [[x, y] for (x, y) in X]



	# shuffle feature/label pairs together, then run leave-50-out cross-validation
	Z = zip(X, Y)
	Z = pred_shuffle(Z)
	(X, Y) = zip(*Z)
	si = 0
	acc = 0.0
	cnt = 0
	while si < len(X):
		print "doing iteration ", cnt
		Xe = X[si:si+50]            # held-out evaluation block
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		train, pred = gen_feature_vector(X1, Y1, Xe)  # build feature vectors for both folds
		acc += train_chunk(train, Y1, pred, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Developer: BigBull90 · Project: anon · Lines: 45 · Source file: second.py

Example 3: main

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# This excerpt also assumes: from random import shuffle; train_chunk is defined elsewhere in wordLen.py
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	# Z = [re.findall(r"[\w']+", x) for x in X]
	# split each answer into non-empty sentences, re-join them, then take per-word lengths
	Z = [filter(None, x.split('.')) for x in X]
	Z = ["".join(s) for s in Z]
	Z = [z.split(' ') for z in Z]
	Z = [[len(s) for s in z] for z in Z]

	# one 100-bin word-length histogram per answer
	feature = []
	for a in Z:
		wordLenDist = [0]*100
		for ln in a:
			wordLenDist[min(ln, 99)] += 1  # clamp: joined sentences can occasionally produce 100+ character tokens
		feature.append(wordLenDist)

	X = feature
	# shuffle feature/label pairs together
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)

	# X = [i for i in range(len(X))]

	# leave-50-out cross-validation, as in example 1
	si = 0
	acc = 0.0
	cnt = 0
	while si < len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Developer: BigBull90 · Project: anon · Lines: 43 · Source file: wordLen.py
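As an aside, the histogram loop above can be collapsed into a numpy one-liner; a sketch assuming the same Z list of per-answer word lengths:

import numpy as np

# same fixed-width word-length histogram, with the 99-clamp applied via np.minimum
feature = [np.bincount(np.minimum(a, 99), minlength=100).tolist() for a in Z]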

Example 4: main

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# This excerpt also assumes: import nltk; from random import shuffle; the scikit-learn classes used below;
# get_tagged_text, istats and stats are helpers defined elsewhere in first.py
def main():
	global X
	global Y
	ds = Dataset.open('quora')
	(X, Y) = (get_tagged_text(ds.X), ds.Y)

	# keep only answers longer than 200 tokens
	XX = []
	YY = []
	for (auth, ans) in zip(Y, X):
		if len(nltk.word_tokenize(ans)) > 200:
			XX.append(ans)
			YY.append(auth)
	(X, Y) = (XX, YY)

	# shuffle, then hold out the last 50 answers for evaluation
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)
	Xe = X[-50:]
	Ye = Y[-50:]
	X = X[:-50]
	Y = Y[:-50]

	# 2- and 3-gram counts, then term-frequency weighting (no idf)
	count_vect = CountVectorizer(input='content', ngram_range=(2, 3), min_df=0.2, max_df=1.0)
	X_train_counts = count_vect.fit_transform(X)
	tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
	X_train_tf = tf_transformer.transform(X_train_counts)

	clf = MultinomialNB().fit(X_train_tf, Y)
	clf2 = KNeighborsClassifier(n_neighbors=5).fit(X_train_tf, Y)  # note: n_neighbors, not n_neighbours; clf2 is fitted but unused below

	# run the held-out answers through the same vectorizer/transformer, then predict
	X_new_counts = count_vect.transform(Xe)
	X_new_tfidf = tf_transformer.transform(X_new_counts)
	Yd = clf.predict(X_new_tfidf)

	# istats/stats are reporting helpers defined elsewhere in first.py
	istats(Y)
	print ''
	stats(Ye, Yd)
Developer: BigBull90 · Project: anon · Lines: 39 · Source file: first.py
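The vectorize/transform/fit sequence in example 4 can also be written as a scikit-learn Pipeline, which guarantees the held-out answers pass through exactly the same transformations; a sketch of the equivalent flow, assuming the X, Y, Xe prepared above:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
	('counts', CountVectorizer(ngram_range=(2, 3), min_df=0.2, max_df=1.0)),
	('tf', TfidfTransformer(use_idf=False)),
	('nb', MultinomialNB()),
])
text_clf.fit(X, Y)          # X, Y as prepared in example 4
Yd = text_clf.predict(Xe)   # predictions for the 50 held-out answers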

Example 5: main

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# This excerpt also assumes: import nltk; from nltk.corpus import stopwords;
# from gensim import corpora, models, similarities; from random import shuffle
def main():
	global X
	global Y
	global SIM

	ds = Dataset.open('quora')
	(X, Y) = (ds.X, ds.Y)

	# examine the first 200 answers only
	for i in range(0, 200):
		ans = filter(None, X[i].split('.'))   # non-empty sentences of this answer
		# texts = [filter(None, sentence.split(' ')) for sentence in ans]

		if len(nltk.word_tokenize(X[i])) < 100:
			continue   # skip short answers

		documents = ans

		# remove common words and tokenize
		stoplist = stopwords.words('english')
		stoplist.append('')
		# cleanword is a normalisation helper defined elsewhere in third.py
		texts = [[cleanword(word) for word in document.lower().split() if cleanword(word) not in stoplist]
				 for document in documents]

		# remove words that appear only once
		all_tokens = sum(texts, [])
		tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)

		texts = [[word for word in text if word not in tokens_once] for text in texts]

		dictionary = corpora.Dictionary(texts)
		corp = [dictionary.doc2bow(text) for text in texts]
		
		lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dictionary, num_topics=2)
		
		# print lsi.print_topics(2)

		# index all sentences once in LSI space (hoisted out of the loop: the
		# index does not change between iterations), then score each sentence
		# against the one immediately before it
		index = similarities.MatrixSimilarity(lsi[corp])
		sim = []
		for j in range(1, len(ans)):
			s = ans[j]
			vec_bow = dictionary.doc2bow(s.lower().split())
			vec_lsi = lsi[vec_bow]   # convert the query to LSI space
			sims = index[vec_lsi]    # similarity of sentence j to every sentence
			sim.append(sims[j-1])    # keep only the similarity to the preceding sentence

		SIM.append(sim)

	# one coherence feature per answer: average adjacent-sentence similarity
	# (the +1 in the denominator guards against an empty sim list)
	X = [[sum(sim)/(1 + len(sim))] for sim in SIM]
	print X
	Z = zip(X, Y)
	print Z
	shuffle(Z)
	(X, Y) = zip(*Z)
	si = 0
	acc = 0.0
	cnt = 0
	print X, Y
	print len(X), len(Y)
	# leave-50-out cross-validation, as in example 1
	while si < len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		print len(X1), len(Xe), len(Y1), len(Ye)   # fold sizes (debug output)
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
Developer: BigBull90 · Project: anon · Lines: 71 · Source file: third.py
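In effect, example 5 reduces each answer to a single "coherence" score: the average LSI similarity between each sentence and the one before it. Answers whose consecutive sentences stay on-topic score high, answers that jump between topics score low, and that one-dimensional feature is what train_chunk classifies.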

Example 6:

# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
from Dataset import Dataset

ds = Dataset.open('blahblah')
print ds.X	# list of answers
print ds.Y	# list of corresponding authors
Developer: BigBull90 · Project: anon · Lines: 7 · Source file: example.py


Note: The Dataset.Dataset.open examples in this article were compiled by 純淨天空 (vimsky) from open-source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Refer to each project's license before distributing or using the code, and please do not reproduce this article without permission.