This page collects typical usage examples of the getfilenames_of_dir method from the Python module sentimentfinding.IOtools. If you are wondering exactly how to use IOtools.getfilenames_of_dir in Python, or what real-world calls to it look like, the curated code examples below should help. You can also browse the containing module, sentimentfinding.IOtools, for further usage examples.
The following 15 code examples of IOtools.getfilenames_of_dir are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
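Before the examples, a note on the method's apparent behavior: every call below passes a directory path, and most pass a removeextension keyword, so the method evidently returns the names of the files in a directory, optionally with their extensions stripped. The sketch below is only a guess inferred from those call sites, not the actual sentimentfinding.IOtools source:

import os

def getfilenames_of_dir(path, removeextension=True):
    # Hypothetical reimplementation inferred from the call sites below;
    # the real sentimentfinding.IOtools.getfilenames_of_dir may differ.
    fnames = [f for f in os.listdir(path)
              if os.path.isfile(os.path.join(path, f))]
    if removeextension:
        # e.g. "news1.txt" -> "news1"
        fnames = [os.path.splitext(f)[0] for f in fnames]
    return fnames

Example 1, for instance, calls the method with removeextension=False so that the returned names can be joined directly back into full file paths, while Example 5 relies on the extension being stripped before re-appending ".csv".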
Example 1: add_resource_label
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def add_resource_label(matrixpath, datasetname, replacelabel=False, headers=True):
    matrixlines = IOtools.readtextlines(matrixpath)   # 1st item = fileid, last item = filecat.
    newmatrix = []
    if headers:
        matrixlines = matrixlines[2:]
    for instance in matrixlines:
        items = instance.split()
        fileid = items[0]
        print instance,
        path = datapath + os.sep + datasetname
        foldernames = IOtools.getfoldernames_of_dir(datapath + os.sep + datasetname)
        #print foldernames
        for folder in foldernames:
            allfileids = IOtools.getfilenames_of_dir(path + os.sep + folder, removeextension=False)
            #print allfileids
            if fileid in allfileids:
                newspath = path + os.sep + folder + os.sep + fileid
                resourcename = texter.getnewsmetadata(newspath, ["resource"])["resource"]
                #print "## ", resourcename, " ", type(instance), " ~~ ", instance
                if replacelabel:
                    items = items[:-1]
                newmatrix.append(items + [resourcename])
                break
    return newmatrix
Example 2: buildcorpus
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def buildcorpus(nfile, ncat, resourcename, path):
    resourcepath = path + os.sep + resourcename
    catnames = IOtools.getfoldernames_of_dir(resourcepath)[:ncat]

    featurematrix = []
    doctermmatrix = []
    cfdTermDoc = nltk.ConditionalFreqDist()

    for catname in catnames:
        fileids = []
        p = resourcepath + os.sep + catname + os.sep
        fileids.extend(IOtools.getfilenames_of_dir(p, removeextension=False)[:nfile])

        corpus = CorpusFeatures(fileids, resourcename + os.sep + catname, p)
        corpus.getfeatures()
        datapoints = corpus.build_featurematrix()
        for k, v in datapoints.iteritems():
            featurematrix.append([k] + v + [resourcename])
        corpus.plot_features()

        # doc term matrix
        cfd = corpus.build_termmatrix()
        for fileid in cfd.conditions():
            for term in list(cfd[fileid]):
                cfdTermDoc[fileid].inc(term)

    IOtools.todisc_matrix(featurematrix, IOtools.results_rootpath + os.sep + "MATRIX" + str(nfile * ncat) + "texts.txt", mode="a")
Example 3: corpus_construction
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def corpus_construction():
    start = datetime.now()

    corpus = Corpus("test")
    rootpath = "/home/dicle/Dicle/Tez/geziyakurdiproject/corpus/"
    labels = ["pos", "neg"]
    labelwisepathlist = {}
    for label in labels:
        labelwisepathlist[label] = []
    for label in labels:
        labelwisepathlist[label] = IOtools.getfilenames_of_dir(rootpath + os.sep + label, removeextension=False)

    corpus.read_corpus(rootpath, labelwisepathlist)

    end = datetime.now()
    print "Reading takes: ", str(end - start)

    print corpus.cfd_RootDoc["alevi"].N()
    print corpus.cfd_RootDoc.N()
    print len(corpus.cfd_RootDoc.conditions())
    print corpus.cfd_DocRoot.N()
    print len(corpus.cfd_DocRoot.conditions())

    df = corpus.compute_tfidf()

    end2 = datetime.now()
    print "tfidf matrix takes: ", str(end2 - end)
Example 4: getwordsandlemmasfromfile
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def getwordsandlemmasfromfile():
    rootpath = "/home/dicle/Dicle/Tez/geziyakurdiproject/"
    corpuspath = rootpath + os.sep + "texts/"
    outwordspath = rootpath + os.sep + "weightedwords/"

    fileids = IOtools.getfilenames_of_dir(corpuspath, removeextension=False)
    for fileid in fileids:
        txt = texter.readtxtfile(corpuspath + os.sep + fileid)
        marker = "Haziran 2013"
        mark = txt.find(marker)   # skip metadata
        txt = txt[mark + len(marker):]

        words = texter.getwords(txt)
        lemmatuples = SAKsParser.findrootsinlexicon(words)
        roots = [root for _, root, _ in lemmatuples]

        fdwords = nltk.FreqDist(words)
        fdroots = nltk.FreqDist(roots)

        weightedwords = [word + "\t" + str(fdwords[word]) for word in list(fdwords)]
        weightedroots = [root + "\t" + str(fdroots[root]) for root in list(fdroots)]

        IOtools.todisc_list(outwordspath + os.sep + "lemma" + os.sep + fileid, weightedwords)
        IOtools.todisc_list(outwordspath + os.sep + "root" + os.sep + fileid, weightedroots)
Example 5: conduct_experiments
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def conduct_experiments(inrootpath=metacorpus.learningdatapath, outrootpath=metaexperimentation.expscorepath, normalize=False):
    annottypes = ["double"]
    setsizes = ["150"]
    taggertypes = ["user"]
    numofcombs = 5
    #nclasses = arrange_N_classes.nclasses   # [4,5]

    #models = []
    svmclassifier = SVM("")
    clusterer = Clustering("")
    nbclassifier = NaiveBayes("")
    #nbclassifier = MultinomialNB(outrootpath)
    models = [svmclassifier, nbclassifier, clusterer]

    for annotationtype in annottypes:
        sp1 = IOtools.ensure_dir(os.path.join(outrootpath, annotationtype))
        for setsize in setsizes:
            sp2 = IOtools.ensure_dir(os.path.join(sp1, setsize))
            datasetspath = metacorpus.get_datasets_path(annotationtype, setsize)   # finaldatasets
            labelspath = metacorpus.get_labels_path(annotationtype, setsize)
            nclasses = IOtools.getfoldernames_of_dir(labelspath)

            combfilenames = IOtools.getfilenames_of_dir(datasetspath)
            combfilenames = combfilenames[:numofcombs]

            for combfile in combfilenames:
                Xpath = os.path.join(datasetspath, combfile + ".csv")
                sp3 = IOtools.ensure_dir(os.path.join(sp2, combfile))

                for nclass in nclasses:   # count it on labelspath, not nclasses
                    #nclabelspath = arrange_N_classes.nclass_label_folder(labelspath, nc)   # get folder path containing nc-grouped labels
                    nclabelspath = os.path.join(labelspath, nclass)
                    nc = nclass.split(metaexperimentation.intrafeatsep)[-1]
                    nc = int(nc)
                    sp4 = IOtools.ensure_dir(os.path.join(sp3, nclass))   #"NC-"+str(nc)))

                    for taggertype in taggertypes:
                        rootscorespath = IOtools.ensure_dir(os.path.join(sp4, taggertype))
                        metaexperimentation.initialize_score_file(rootscorespath)
                        ylabelspath = os.path.join(nclabelspath, taggertype + ".csv")

                        for model in models:
                            #labelnames = metacorpus.get_label_names()
                            model.prepare_experiment(Xpath, ylabelspath, rootscorespath, labelnames=None, normalize=normalize)
                            model.apply_algorithms(nc)
Example 6: get_fileids_infolder
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def get_fileids_infolder(path, numofpoints):
    fileids = IOtools.getfilenames_of_dir(path, removeextension=False)
    if numofpoints > 0:
        # sample numofpoints file ids at random (with replacement)
        selected = np.random.randint(0, len(fileids), numofpoints)
        fileids = np.array(fileids)[selected].tolist()
    for fileid in fileids[:10]:
        print fileid, " ", texter.getnewsmetadata(path + os.sep + fileid, ["resource"])
    return fileids
Example 7: clustering
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def clustering(corpuspath, resultspath, numofclusters):
    trainpath = corpuspath + os.sep + "train" + os.sep
    testpath = corpuspath + os.sep + "test" + os.sep

    # feature hold out!
    featurespaces = IOtools.getfilenames_of_dir(trainpath, removeextension=True)
    for featurespace in featurespaces:
        inpath = trainpath
        procedurename = "kmeans#" + str(numofclusters) + "_" + featurespace
        recordpath = resultspath
        classify.perform_clustering(featurespace, inpath, procedurename, recordpath, numofclusters)
Example 8: merge_word_lists
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def merge_word_lists(indirectory, outdirectory, outfilename):
    fileids = IOtools.getfilenames_of_dir(indirectory, removeextension=False)
    allwords = []
    for fileid in fileids:
        words = IOtools.readtextlines(indirectory + os.sep + fileid)
        allwords.extend(words)

    IOtools.todisc_list(outdirectory + os.sep + outfilename + ".txt", allwords)

    fdist = nltk.FreqDist(allwords)
    IOtools.todisc_freqdist(outdirectory + os.sep + "weighted-" + outfilename + ".txt", fdist)
Example 9: conduct_experiments2
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def conduct_experiments2(resultspath):
    datafolder = "/home/dicle/Dicle/Tez/corpusstats/learning/data/random-single-N5/finaldatasets_test/"
    datasetname = "feat-00111110000"

    datasetnames = IOtools.getfilenames_of_dir(datafolder)
    for datasetname in datasetnames:
        epath = IOtools.ensure_dir(resultspath + os.sep + datasetname)
        experiment = Experimentation(experimentrootpath=epath, datasetfolder=datafolder, datasetname=datasetname)
        datamatrixcsvpath, ylabels = experiment.prepare_data()

        #clusterer = Clustering(erootpath=epath, datamatrixpath=datamatrixcsvpath, yvector=ylabels)
        #clusterer.apply_algorithms(scorefilepath=experiment.scorefilepath)

        svmclassifier = SVM(erootpath=epath, datamatrixpath=datamatrixcsvpath, yvector=ylabels)
        svmclassifier.apply_algorithms(scorefilepath=experiment.scorefilepath)
Example 10: corpus_construction_fromwords
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def corpus_construction_fromwords():
    recordpath = "/home/dicle/Dicle/Tez/geziyakurdiproject/corpus2/ldatests22Temmuz/edit/wordletest/matrix/"
    inputpath = "/home/dicle/Dicle/Tez/geziyakurdiproject/corpus2/ldatests22Temmuz/edit/wordletest/words/"
    labels = ["inlier", "outlier"]
    labelwisepathlist = {}
    for label in labels:
        labelwisepathlist[label] = []
    for label in labels:
        labelwisepathlist[label] = IOtools.getfilenames_of_dir(inputpath + os.sep + label, removeextension=False)

    corpus = Corpus("wordletest")
    corpus.read_wordlists(inputpath, recordpath, labelwisepathlist)
    doctermfreqdf = corpus.get_docterm_matrix()
    corpus.compute_tfidf2(doctermfreqdf)
    return corpus
Example 11: crawlandmakexmlcorpus
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def crawlandmakexmlcorpus():
    for resource in resourcefolders:
        p1 = os.path.join(rawcorpuspath, resource)
        xp1 = IOtools.ensure_dir(os.path.join(xmlcorpuspath, resource))   # replicate the folder hierarchy in the xml folder as well
        categories = IOtools.getfoldernames_of_dir(p1)
        for cat in categories:
            p2 = os.path.join(p1, cat)
            xp2 = IOtools.ensure_dir(os.path.join(xp1, cat))
            txtfiles = IOtools.getfilenames_of_dir(p2, removeextension=True)
            for filename in txtfiles:
                txtpath = p2 + os.sep + filename + fromextension
                xmlpath = xp2 + os.sep + filename + toextension
                txtcontent = IOtools.readtxtfile(txtpath)
                xmlcontent = headxml + "\n" + txtcontent + "\n" + footxml
                IOtools.todisc_txt(xmlcontent, xmlpath)
Example 12: recordnewsmetadata_crawltxt
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def recordnewsmetadata_crawltxt(corpuspath=metacorpus.rawcorpuspath, resourcefolders=metacorpus.resources, csvfilepath=_metafilepath):
    for resource in resourcefolders:
        xp1 = IOtools.ensure_dir(os.path.join(corpuspath, resource))   # replicate the folder hierarchy in the xml folder as well
        categories = IOtools.getfoldernames_of_dir(xp1)
        for cat in categories:
            xp2 = IOtools.ensure_dir(os.path.join(xp1, cat))
            filenames = IOtools.getfilenames_of_dir(xp2, removeextension=False)
            for filename in filenames:
                filepath = xp2 + os.sep + filename
                metadataline = getmetadata_fromtxt(filepath)   #metadataline = getmetadata_fromtxt(filepath+".txt")
                #print csvfilepath
                IOtools.todisc_txt(metadataline, csvfilepath, mode="a")
            print "finished " + resource + "/" + cat
Example 13: parseXML_phraseslexicon
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def parseXML_phraseslexicon(xmlinfilepath, txtoutfilepath):
    xmlfnames = IOtools.getfilenames_of_dir(xmlinfilepath, removeextension=False)
    statsstr = "letter numofphrases" + "\n"

    for fname in xmlfnames:
        letter = fname.split("_")[-1][:-4]   # each fname is of the form "ADB_letter.xml"
        print fname
        path = xmlinfilepath + os.sep + fname
        tree = ET.parse(path)
        lexiconroot = tree.getroot()
        names = lexiconroot.findall(deyimDOMpath)

        phrases = []
        for name in names:
            phrase = name.text
            phrases.append(phrase.strip().lower())

        outpath = txtoutfilepath + os.sep + letter + ".txt"
        IOtools.todisc_list(outpath, phrases)
        statsstr += letter + "\t" + str(len(phrases)) + "\n"

    IOtools.todisc_txt(statsstr, txtoutfilepath + os.sep + "originalstats.txt")
Example 14: classification
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def classification(corpuspath, resultspath):
    trainpath = corpuspath + os.sep + "train" + os.sep
    testpath = corpuspath + os.sep + "test" + os.sep

    classifiers = ["naivebayes", "ldac"]
    learner = {}
    learner["naivebayes"] = classify.NBclassifier()
    learner["ldac"] = classify.LDACclassifier()

    # feature hold out!
    featurespaces = IOtools.getfilenames_of_dir(trainpath, removeextension=True)
    for featurespace in featurespaces:
        trainset = pd.read_csv(trainpath + os.sep + featurespace + ".csv", index_col=0)
        testset = pd.read_csv(testpath + os.sep + featurespace + ".csv", index_col=0)

        for clsfalg in classifiers:
            procedurename = clsfalg + "#_" + featurespace
            recordpath = IOtools.ensure_dir(resultspath + os.sep + procedurename)
            learner[clsfalg].setname(procedurename)
            learner[clsfalg].run(trainset, testset, recordpath)
Example 15: corpus_construction
# Required import: from sentimentfinding import IOtools [as alias]
# Or: from sentimentfinding.IOtools import getfilenames_of_dir [as alias]
def corpus_construction():
    start = datetime.now()

    rootpath = "/home/dicle/Dicle/Tez/tests/test30-sept13/"
    inputpath = rootpath + os.sep + "dataset/"
    recordpath = rootpath + os.sep + "results/"

    corpus = Corpus("test30", recordpath)
    labels = ["cumhuriyet", "radikal", "vakit"]
    labelwisepathlist = {}
    for label in labels:
        labelwisepathlist[label] = []
    for label in labels:
        labelwisepathlist[label] = IOtools.getfilenames_of_dir(inputpath + os.sep + label, removeextension=False)

    corpus.read_corpus(inputpath, recordpath, labelwisepathlist)

    end = datetime.now()
    print "Reading takes: ", str(end - start)

    print corpus.cfd_RootDoc["alevi"].N()
    print corpus.cfd_RootDoc.N()
    print len(corpus.cfd_RootDoc.conditions())
    print corpus.cfd_DocRoot.N()
    print len(corpus.cfd_DocRoot.conditions())

    freqdf = corpus.get_docterm_matrix()
    tfidfdf = corpus.compute_tfidf2(freqdf)

    end2 = datetime.now()
    print "tfidf matrix takes: ", str(end2 - end)

    corpus.extract_features()
    end3 = datetime.now()
    print "features takes: ", str(end3 - end2)