当前位置: 首页>>代码示例>>Python>>正文


Python Dataset.open方法代码示例

本文整理汇总了Python中Dataset.Dataset.open方法的典型用法代码示例。如果您正苦于以下问题：Python Dataset.open方法的具体用法？Python Dataset.open怎么用？Python Dataset.open使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在Dataset.Dataset的用法示例。


在下文中一共展示了Dataset.open方法的6个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: main

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	# Z = [re.findall(r"[\w']+", x) for x in X]
	# Z = [filter(None, x.split('.')) for x in X]
	# Z = ["".join(s) for s in Z]
	# Z = [z.split(' ') for z in Z]
	# Z = [[len(s) for s in z] for z in Z]

	# feature = []
	# for a in Z:
	# 	wordLenDist = [0]*100
	# 	for ln in a:
	# 			wordLenDist[ln]+=1
	# 	feature.append(wordLenDist)

	feature = []
	tokenizer = RegexpTokenizer(r'\w+')
	for x in X:
		All = len(nltk.word_tokenize(x))
		numPunctuation = All - len(tokenizer.tokenize(x))
		numWords = All - numPunctuation
		ff = [numPunctuation, numWords]
		feature.append(ff)


	X = feature
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)


	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
开发者ID:BigBull90,项目名称:anon,代码行数:52,代码来源:punctuation.py

示例2: main

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
def main():
	global X
	global Y
	global auth_to_id

	ds = Dataset.open('quora')
	# (X, Y) = ([x.split('.') for x in ds.X], ds.Y)
	#X = ([sum([len( filter(None, y.split(' ')) ) for y in x])/len(x) for x in X])
	#X = zip(X, [len( filter(None, x.split('\n')) ) for x in ds.X])
	(X, Y) = (ds.X, ds.Y)

	mx = 0
	for auth in Y:
		if auth not in auth_to_id:
			auth_to_id[auth] = mx
			mx+=1

	getTagsforAll(X)
	# print [x for x in ds.X if len( filter(None, x.split('\n')) ) > 1]

	# print [(x, y) for (x, y) in X if y > 1]
	#X = [[x, y] for (x, y) in X]



	Z = zip(X, Y)
	Z = pred_shuffle(Z)
	(X, Y) = zip(*Z)
	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		print "doing iteration ", cnt
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		train, pred  = gen_feature_vector(X1, Y1, Xe)
		acc += train_chunk(train, Y1, pred, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
开发者ID:BigBull90,项目名称:anon,代码行数:45,代码来源:second.py

示例3: main

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
def main():
	global X
	global Y

	ds = Dataset.open('quora')
	X,Y = ds.X,ds.Y


	#Z = [re.findall(r"[\w']+", x) for x in X]
	Z = [filter(None, x.split('.')) for x in X]
	Z = ["".join(s) for s in Z]
	Z = [z.split(' ') for z in Z]
	Z = [[len(s) for s in z] for z in Z]

	feature = []
	for a in Z:
		wordLenDist = [0]*100
		for ln in a:
				wordLenDist[ln]+=1
		feature.append(wordLenDist)

	X = feature
	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)

	# X = [i for i in range(len(X))]

	si=0
	acc = 0.0
	cnt = 0
	while si<len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
开发者ID:BigBull90,项目名称:anon,代码行数:43,代码来源:wordLen.py

示例4: main

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
def main():
	global X
	global Y
	ds = Dataset.open('quora')
	(X, Y) = (get_tagged_text(ds.X), ds.Y)

	XX=[]
	YY=[]
	for (auth, ans) in zip(Y, X):
	 	if len(nltk.word_tokenize(ans)) > 200:
	 		XX.append(ans)
	 		YY.append(auth)
	(X, Y) = (XX, YY)

	Z = zip(X, Y)
	shuffle(Z)
	(X, Y) = zip(*Z)
	Xe = X[-50:]
	Ye = Y[-50:]
	X = X[:-50]
	Y = Y[:-50]

	count_vect = CountVectorizer(input='content',ngram_range=(2,3), min_df=0.2, max_df=1.0)
	X_train_counts = count_vect.fit_transform(X)
	tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
	X_train_tf = tf_transformer.transform(X_train_counts)

	clf = MultinomialNB().fit(X_train_tf, Y)
	clf2 = KNeighborsClassifier(n_neighbours=5).fit(X_train_tf, Y)

	X_new_counts = count_vect.transform(Xe)
	X_new_tfidf = tf_transformer.transform(X_new_counts)
	Yd = clf.predict(X_new_tfidf)

	istats(Y)
	print ''
	stats(Ye, Yd)
开发者ID:BigBull90,项目名称:anon,代码行数:39,代码来源:first.py

示例5: main

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
def main():
	global X
	global Y
	global SIM

	ds = Dataset.open('quora')
	(X, Y) = (ds.X, ds.Y)

	for i in range(0, 200):
		ans = filter(None, X[i].split('.'))
		# texts = [filter(None, sentence.split(' ')) for sentence in ans]

		if len(nltk.word_tokenize(X[i])) < 100:
			continue

		documents = ans

		# remove common words and tokenize
		stoplist = stopwords.words('english')
		stoplist.append('')
		texts = [[cleanword(word) for word in document.lower().split() if cleanword(word) not in stoplist]
				 for document in documents]

		# remove words that appear only once
		all_tokens = sum(texts, [])
		tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)

		texts = [[word for word in text if word not in tokens_once] for text in texts]

		dictionary = corpora.Dictionary(texts)
		corp = [dictionary.doc2bow(text) for text in texts]
		
		lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dictionary, num_topics=2)
		
		# print lsi.print_topics(2)

		sim = []
		for j in range(1, len(ans)):
			s = ans[j]
			vec_bow = dictionary.doc2bow(s.lower().split())
			vec_lsi = lsi[vec_bow] # convert the query to LSI space
			index = similarities.MatrixSimilarity(lsi[corp]) # transform corpus to LSI space and index it
			sims = index[vec_lsi] # perform a similarity query against the corpussims = index[vec_lsi] # perform a similarity query against the corpus
			sim.append( list(enumerate(sims))[j-1][1] ) # print (document_number, document_similarity) 2-tuples

		SIM.append(sim)

	X = [[sum(sim)/(1 + len(sim))] for sim in SIM]
	print X
	Z = zip(X, Y)
	print Z
	shuffle(Z)
	(X, Y) = zip(*Z)
	si=0
	acc = 0.0
	cnt = 0
	print X,Y
	print len(X),len(Y)
	while si<len(X):
		Xe = X[si:si+50]
		Ye = Y[si:si+50]
		X1 = X[:si] + X[si+50:]
		Y1 = Y[:si] + Y[si+50:]
		print len(X1),len(Xe), len(Y1), len(Ye)
		acc += train_chunk(X1, Y1, Xe, Ye)
		cnt += 1
		si += 50

	print 'Accuracy: %f' % (acc/cnt)
开发者ID:BigBull90,项目名称:anon,代码行数:71,代码来源:third.py

示例6:

# 需要导入模块: from Dataset import Dataset [as 别名]
# 或者: from Dataset.Dataset import open [as 别名]
from Dataset import Dataset

# Minimal usage example: open the dataset named 'blahblah' and print its
# two lists.
ds = Dataset.open('blahblah')
print ds.X	# list of answers
print ds.Y	# list of corresponding authors (same order as ds.X)
开发者ID:BigBull90,项目名称:anon,代码行数:7,代码来源:example.py


注：本文中的Dataset.Dataset.open方法示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。