This article collects typical usage examples of the Python method Dataset.Dataset.open: what Dataset.open does, how to call it, and what working code that uses it looks like. The curated examples below should help, and you can also read further about the enclosing class Dataset.Dataset.
The following presents 6 code examples of Dataset.open, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python examples.
Example 1: main
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# Additional imports used below (besides Dataset):
import nltk
from nltk.tokenize import RegexpTokenizer
from random import shuffle

def main():
    global X
    global Y
    ds = Dataset.open('quora')
    X, Y = ds.X, ds.Y
    # Alternative feature (left commented out): word-length distribution per answer
    # Z = [re.findall(r"[\w']+", x) for x in X]
    # Z = [filter(None, x.split('.')) for x in X]
    # Z = ["".join(s) for s in Z]
    # Z = [z.split(' ') for z in Z]
    # Z = [[len(s) for s in z] for z in Z]
    # feature = []
    # for a in Z:
    #     wordLenDist = [0]*100
    #     for ln in a:
    #         wordLenDist[ln] += 1
    #     feature.append(wordLenDist)
    # Feature vector: punctuation-token count and word-token count per answer
    feature = []
    tokenizer = RegexpTokenizer(r'\w+')
    for x in X:
        All = len(nltk.word_tokenize(x))
        numPunctuation = All - len(tokenizer.tokenize(x))
        numWords = All - numPunctuation
        ff = [numPunctuation, numWords]
        feature.append(ff)
    X = feature
    # Shuffle (X, Y) pairs together, then evaluate in held-out chunks of 50
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        acc += train_chunk(X1, Y1, Xe, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
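For comparison, the manual leave-50-out loop above can also be expressed with scikit-learn's KFold splitter. This is only a minimal sketch, assuming numeric feature rows like the [numPunctuation, numWords] pairs built above, and substituting a LinearSVC for the train_chunk helper that is not shown on this page:

import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC

def kfold_accuracy(X, Y, chunk=50):
    # Evaluate in roughly chunk-sized held-out folds, mirroring the loop above
    X, Y = np.asarray(X), np.asarray(Y)
    kf = KFold(n_splits=max(2, len(X) // chunk), shuffle=True)
    scores = []
    for train_idx, test_idx in kf.split(X):
        clf = LinearSVC().fit(X[train_idx], Y[train_idx])
        scores.append(clf.score(X[test_idx], Y[test_idx]))
    return sum(scores) / len(scores)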
Example 2: main
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
def main():
    global X
    global Y
    global auth_to_id   # module-level dict mapping author name -> integer id
    ds = Dataset.open('quora')
    # (X, Y) = ([x.split('.') for x in ds.X], ds.Y)
    # X = ([sum([len(filter(None, y.split(' '))) for y in x]) / len(x) for x in X])
    # X = zip(X, [len(filter(None, x.split('\n'))) for x in ds.X])
    (X, Y) = (ds.X, ds.Y)
    # Assign each author a consecutive integer id
    mx = 0
    for auth in Y:
        if auth not in auth_to_id:
            auth_to_id[auth] = mx
            mx += 1
    getTagsforAll(X)
    # print [x for x in ds.X if len(filter(None, x.split('\n'))) > 1]
    # print [(x, y) for (x, y) in X if y > 1]
    # X = [[x, y] for (x, y) in X]
    Z = zip(X, Y)
    Z = pred_shuffle(Z)
    (X, Y) = zip(*Z)
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        print "doing iteration ", cnt
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        train, pred = gen_feature_vector(X1, Y1, Xe)
        acc += train_chunk(train, Y1, pred, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
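getTagsforAll, pred_shuffle and gen_feature_vector are project-specific helpers that are not reproduced on this page. As a rough, hypothetical illustration of the kind of feature such helpers might produce, here is a small sketch that turns each answer into a normalized part-of-speech tag count vector with NLTK (the chosen tag list and normalization are assumptions, not the page's actual implementation; the NLTK tagger models must be downloaded beforehand):

import nltk
from collections import Counter

# A small, fixed subset of Penn Treebank tags used as feature dimensions (assumption)
PTB_TAGS = ['NN', 'NNS', 'NNP', 'VB', 'VBD', 'VBG', 'VBN', 'JJ', 'RB', 'IN', 'DT', 'PRP']

def pos_tag_features(text):
    # Tag the tokens and return the relative frequency of each tag in PTB_TAGS
    tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))]
    counts = Counter(tags)
    total = float(len(tags)) if tags else 1.0
    return [counts[t] / total for t in PTB_TAGS]

# feature_matrix = [pos_tag_features(x) for x in X]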
Example 3: main
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# Additional import used below (besides Dataset):
from random import shuffle

def main():
    global X
    global Y
    ds = Dataset.open('quora')
    X, Y = ds.X, ds.Y
    # Z = [re.findall(r"[\w']+", x) for x in X]
    # Split each answer into sentences, rejoin, and compute the length of every word
    Z = [filter(None, x.split('.')) for x in X]
    Z = ["".join(s) for s in Z]
    Z = [z.split(' ') for z in Z]
    Z = [[len(s) for s in z] for z in Z]
    # Feature vector: histogram of word lengths (bins 0..99) per answer
    feature = []
    for a in Z:
        wordLenDist = [0] * 100
        for ln in a:
            wordLenDist[ln] += 1
        feature.append(wordLenDist)
    X = feature
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    # X = [i for i in range(len(X))]
    si = 0
    acc = 0.0
    cnt = 0
    while si < len(X):
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        acc += train_chunk(X1, Y1, Xe, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
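One detail worth noting in this example: wordLenDist has only 100 bins, so a single token of 100 or more characters (for instance a long URL) would raise an IndexError. A small defensive sketch of the same histogram, assuming the 100-bin layout and using numpy:

import numpy as np

def word_length_histogram(word_lengths, bins=100):
    # Clip overly long tokens into the last bin instead of indexing out of range
    clipped = np.clip(np.asarray(word_lengths, dtype=np.int64), 0, bins - 1)
    return np.bincount(clipped, minlength=bins).tolist()

# feature = [word_length_histogram(lengths) for lengths in Z]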
Example 4: main
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# Additional imports used below (besides Dataset):
import nltk
from random import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

def main():
    global X
    global Y
    ds = Dataset.open('quora')
    (X, Y) = (get_tagged_text(ds.X), ds.Y)
    # Keep only answers longer than 200 tokens
    XX = []
    YY = []
    for (auth, ans) in zip(Y, X):
        if len(nltk.word_tokenize(ans)) > 200:
            XX.append(ans)
            YY.append(auth)
    (X, Y) = (XX, YY)
    Z = zip(X, Y)
    shuffle(Z)
    (X, Y) = zip(*Z)
    # Hold out the last 50 samples for testing
    Xe = X[-50:]
    Ye = Y[-50:]
    X = X[:-50]
    Y = Y[:-50]
    # Bag of 2- and 3-grams, term-frequency weighted (no idf)
    count_vect = CountVectorizer(input='content', ngram_range=(2, 3), min_df=0.2, max_df=1.0)
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    clf = MultinomialNB().fit(X_train_tf, Y)
    clf2 = KNeighborsClassifier(n_neighbors=5).fit(X_train_tf, Y)  # note: keyword is n_neighbors; clf2 is fitted but unused below
    X_new_counts = count_vect.transform(Xe)
    X_new_tfidf = tf_transformer.transform(X_new_counts)
    Yd = clf.predict(X_new_tfidf)
    # istats and stats are project-specific reporting helpers
    istats(Y)
    print ''
    stats(Ye, Yd)
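The CountVectorizer -> TfidfTransformer -> MultinomialNB chain above is commonly packaged as a scikit-learn Pipeline, which keeps the fit/transform bookkeeping in a single object. A minimal sketch with the same parameters (the Xe/Ye hold-out split is assumed to be prepared exactly as in the example):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('counts', CountVectorizer(ngram_range=(2, 3), min_df=0.2, max_df=1.0)),
    ('tf', TfidfTransformer(use_idf=False)),   # term frequency only, no idf
    ('nb', MultinomialNB()),
])
# text_clf.fit(X, Y)
# Yd = text_clf.predict(Xe)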
Example 5: main
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
# Additional imports used below (besides Dataset):
import nltk
from random import shuffle
from nltk.corpus import stopwords
from gensim import corpora, models, similarities

def main():
    global X
    global Y
    global SIM   # module-level list collecting per-answer sentence similarities
    ds = Dataset.open('quora')
    (X, Y) = (ds.X, ds.Y)
    for i in range(0, 200):
        ans = filter(None, X[i].split('.'))
        # texts = [filter(None, sentence.split(' ')) for sentence in ans]
        if len(nltk.word_tokenize(X[i])) < 100:
            continue
        documents = ans
        # remove common words and tokenize
        stoplist = stopwords.words('english')
        stoplist.append('')
        texts = [[cleanword(word) for word in document.lower().split() if cleanword(word) not in stoplist]
                 for document in documents]
        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] for text in texts]
        dictionary = corpora.Dictionary(texts)
        corp = [dictionary.doc2bow(text) for text in texts]
        lsi = models.lsimodel.LsiModel(corpus=corp, id2word=dictionary, num_topics=2)
        # print lsi.print_topics(2)
        sim = []
        for j in range(1, len(ans)):
            s = ans[j]
            vec_bow = dictionary.doc2bow(s.lower().split())
            vec_lsi = lsi[vec_bow]  # convert the query to LSI space
            index = similarities.MatrixSimilarity(lsi[corp])  # transform corpus to LSI space and index it
            sims = index[vec_lsi]  # perform a similarity query against the corpus
            sim.append(list(enumerate(sims))[j-1][1])  # similarity of sentence j to the previous sentence
        SIM.append(sim)
    # Feature: average adjacent-sentence similarity per answer (denominator padded by 1)
    X = [[sum(sim) / (1 + len(sim))] for sim in SIM]
    print X
    Z = zip(X, Y)
    print Z
    shuffle(Z)
    (X, Y) = zip(*Z)
    si = 0
    acc = 0.0
    cnt = 0
    print X, Y
    print len(X), len(Y)
    while si < len(X):
        Xe = X[si:si+50]
        Ye = Y[si:si+50]
        X1 = X[:si] + X[si+50:]
        Y1 = Y[:si] + Y[si+50:]
        print len(X1), len(Xe), len(Y1), len(Ye)
        acc += train_chunk(X1, Y1, Xe, Ye)
        cnt += 1
        si += 50
    print 'Accuracy: %f' % (acc/cnt)
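A performance note on this example: similarities.MatrixSimilarity(lsi[corp]) rebuilds the full similarity index for every sentence j, even though the corpus does not change inside the inner loop. A sketch of the same inner loop with the index built once per answer, assuming the dictionary, corp, lsi and ans variables from the example:

index = similarities.MatrixSimilarity(lsi[corp])  # build the LSI index once per answer
sim = []
for j in range(1, len(ans)):
    vec_lsi = lsi[dictionary.doc2bow(ans[j].lower().split())]  # query sentence in LSI space
    sims = index[vec_lsi]                # cosine similarities against every sentence
    sim.append(float(sims[j - 1]))       # similarity of sentence j to the previous sentence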
Example 6:
# Required import: from Dataset import Dataset [as alias]
# Or: from Dataset.Dataset import open [as alias]
from Dataset import Dataset
ds = Dataset.open('blahblah')
print ds.X # list of answers
print ds.Y # list of corresponding authors