本文整理汇总了 Python 中 pynlpl.statistics.FrequencyList.append 方法的典型用法代码示例。如果您正苦于以下问题：Python FrequencyList.append 方法的具体用法？Python FrequencyList.append 怎么用？Python FrequencyList.append 使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类 pynlpl.statistics.FrequencyList 的用法示例。
在下文中一共展示了FrequencyList.append方法的12个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: test_freqlist_caseinsens
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)

    Feeds every entry of the module-level ``sentences`` fixture through a
    window of size 2 into a FrequencyList constructed with ``False`` as its
    second argument, then checks the counts of two bigram keys.
    """
    bigram_counts = FrequencyList(None, False)
    for tokens in sentences:
        bigram_counts.append(Windower(tokens, 2))
    # Same short-circuit conjunction as a single truth value: the second
    # lookup only happens when the first count matches.
    is_a_seen_twice = bigram_counts[('is', 'a')] == 2
    self.assertTrue(is_a_seen_twice and bigram_counts[('this', 'is')] == 1)
示例2: test_freqlist_tokencount
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def test_freqlist_tokencount(self):
    """Frequency List (count tokens)

    Appends every entry of the module-level ``sentences`` fixture to a
    default-constructed FrequencyList and checks its ``total`` attribute.
    """
    token_counts = FrequencyList()
    for tokens in sentences:
        token_counts.append(tokens)
    expected_total = 13
    self.assertEqual(token_counts.total, expected_total)
示例3: test_freqlist_typecount
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def test_freqlist_typecount(self):
    """Frequency List (count types)

    Appends every entry of the module-level ``sentences`` fixture to a
    default-constructed FrequencyList and checks ``len()`` of the result.
    """
    type_counts = FrequencyList()
    for tokens in sentences:
        type_counts.append(tokens)
    expected_types = 9
    self.assertEqual(len(type_counts), expected_types)
示例4: buildclasser
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def buildclasser(file):
    """Build a Classer from the tokens found in a text file.

    Each line is stripped of surrounding whitespace and split on single
    spaces; the resulting token list is appended to a FrequencyList, which
    is then handed to ``Classer``.  Because the split is on a literal
    space, blank lines and runs of spaces produce empty-string tokens.

    Args:
        file: Path of the text file to read.

    Returns:
        The ``Classer`` constructed from the accumulated frequency list.
    """
    freqlist = FrequencyList()
    # The context manager closes the handle even when reading or counting
    # raises, which the previous open()/close() pair did not guarantee.
    with open(file, 'r') as f:
        for line in f:
            freqlist.append(line.strip().split(' '))
    return Classer(freqlist)
示例5: buildfromtext
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def buildfromtext(self, files, encoding='utf-8'):
    """Count whitespace-separated tokens in text files and build from them.

    Args:
        files: A single filename (``str``) or an iterable of filenames.
        encoding: Text encoding used to open every file.

    The accumulated FrequencyList is passed to ``self.buildfromfreqlist``.
    """
    counts = FrequencyList()
    # A lone filename is treated as a one-element list.
    paths = [files] if isinstance(files, str) else files
    for path in paths:
        with open(path, 'r', encoding=encoding) as handle:
            for raw_line in handle:
                counts.append(raw_line.strip().split())
    self.buildfromfreqlist(counts)
示例6: buildfromfolia
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def buildfromfolia(self, files, encoding='utf-8'):
    """Count tokens of every sentence in FoLiA documents and build from them.

    Args:
        files: A single filename (``str``) or an iterable of filenames.
        encoding: Accepted for signature parity; not used by this method.

    Each sentence's ``toktext()`` is split on single spaces and appended to
    a FrequencyList, which is passed to ``self.buildfromfreqlist``.
    """
    counts = FrequencyList()
    # A lone filename is treated as a one-element list.
    paths = [files] if isinstance(files, str) else files
    for path in paths:
        document = folia.Document(file=path)
        for sentence in document.sentences():
            counts.append(sentence.toktext().split(' '))
    self.buildfromfreqlist(counts)
示例7: main
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def main():
    """Print an n-gram frequency list for the files given on the command line.

    Options (parsed from ``sys.argv`` with getopt):
        -n N         n-gram size (default 1)
        -i           count case-insensitively
        -e ENCODING  encoding used to read the input files (default utf-8)
        -h, --help   print usage and exit

    For every entry of the frequency list one tab-separated line is written
    to stdout: the entry, its count, its value in the Distribution and its
    information value.  Summary figures (tokens, types, type-token ratio,
    entropy) go to stderr.

    Exits with status 2 on an option-parsing error and status 1 on an
    unknown option or when no files are given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1

    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # Declared in the getopt spec above; previously fell through to
            # the "Unknown option" error.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # Was the Python 2 `print >>sys.stderr, ...` form, which raises
        # TypeError once print is a function (as used everywhere else here).
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # Context manager closes the file even if tokenizing/counting raises.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Only the printed label is joined.  Lookups use the entry exactly
        # as the frequency list yielded it, since the distribution was built
        # from that same list (presumably keyed by the same objects --
        # confirm against Distribution).
        if isinstance(key, (tuple, list)):
            label = " ".join(key)
        else:
            label = key
        s = label + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
示例8: buildclasser
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def buildclasser():
    """Count unigrams in the corpus file, then build and save a Classer.

    Reads the module-level names ``corpusfile`` (input path),
    ``DOTOKENIZE`` (whether to run ``crude_tokenizer`` on each line) and
    ``outputprefix`` (the classer is saved to ``outputprefix + '.cls'``).
    Every line is counted with ``'<begin>'`` and ``'<end>'`` markers added
    around its tokens.  Progress is reported through ``log``.

    Returns:
        The ``Classer`` built from the unigram frequency list.
    """
    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    # Context manager closes the corpus file even if counting raises.
    with open(corpusfile) as f:
        for i, line in enumerate(f):
            if i % 10000 == 0:
                log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
            tokens = crude_tokenizer(line.strip()) if DOTOKENIZE else line
            # Previously .strip().split(' ') was applied unconditionally,
            # which fails when the tokenizer already returned a token
            # sequence (other callers pass its result straight to
            # FrequencyList.append).  Only string-like values are split now.
            if not isinstance(tokens, (list, tuple)):
                tokens = tokens.strip().split(' ')
            freqlist.append(['<begin>'] + list(tokens) + ['<end>'])

    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
示例9: main
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
def main():
    """Print an n-gram frequency list for the files given on the command line.

    Command-line arguments (parsed with argparse):
        -n/--ngramsize        n-gram size (default 1)
        -i/--caseinsensitive  count case-insensitively
        -e/--encoding         encoding of the input files (default utf-8)
        files                 one or more input files

    For every entry of the frequency list one tab-separated line is written
    to stdout: the entry, its count, its value in the Distribution and its
    information value.  Summary figures (tokens, types, type-token ratio,
    entropy) go to stderr.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n','--ngramsize', help="N-gram size", type=int, action='store',default=1)
    parser.add_argument('-i','--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e','--encoding', help="Character encoding", type=str, action='store',default='utf-8')
    parser.add_argument('files', type=str, nargs='+', help="The data sets to sample from, must be of equal size (i.e., same number of lines)")
    # nargs='+' makes argparse itself reject an empty file list (exit
    # status 2), so no separate "No files specified" check is needed.
    args = parser.parse_args()

    # The second positional argument is used elsewhere as the
    # case-*sensitive* flag (False requests case-insensitive counting), so
    # the -i switch has to be inverted; passing it through unchanged made
    # -i do the opposite of what it says.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # Context manager closes the file even if tokenizing/counting raises.
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                if args.ngramsize > 1:
                    freqlist.append(Windower(crude_tokenizer(line), args.ngramsize))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Only the printed label is joined.  Lookups use the entry exactly
        # as the frequency list yielded it, since the distribution was built
        # from that same list (presumably keyed by the same objects --
        # confirm against Distribution).
        if isinstance(key, (tuple, list)):
            label = " ".join(key)
        else:
            label = key
        s = label + "\t" + str(count) + "\t" + str(dist[key]) + "\t" + str(dist.information(key))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
示例10: open
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.textprocessors import Classer
from pynlpl.statistics import FrequencyList
import sys
# Build a Classer for the corpus named by the first command-line argument,
# save it next to the corpus as <filename>.cls and write the class-encoded
# corpus to <filename>.clsenc.
filename = sys.argv[1]

# Progress messages go through sys.stderr.write with an explicit newline.
# This emits the same text as the former `print >>sys.stderr, ...`
# statements but is valid syntax on both Python 2 and Python 3.
sys.stderr.write("Counting tokens" + "\n")
freqlist = FrequencyList()
# The context manager closes the corpus even if counting raises.
with open(filename) as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            sys.stderr.write("\tLine " + str(i+1) + "\n")
        # Sentence boundary markers are counted like ordinary tokens.
        line = ['<s>'] + line.strip().split(' ') + ['</s>']
        freqlist.append(line)

sys.stderr.write("Building classer" + "\n")
classer = Classer(freqlist, filesupport=True )
classer.save(filename + '.cls')

sys.stderr.write("Encoding data" + "\n")
classer.encodefile(filename, filename + '.clsenc')
示例11: FrequencyList
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
elif o == "-e":
encoding = a
else:
print >>sys.stderr, "ERROR: Unknown option:",o
sys.exit(1)
if not files:
print >>sys.stderr, "No files specified"
sys.exit(1)
freqlist = FrequencyList(None, casesensitive)
for filename in files:
f = codecs.open(filename,'r',encoding)
for line in f:
if n > 1:
freqlist.append(Windower(crude_tokenizer(line),n))
else:
freqlist.append(crude_tokenizer(line))
f.close()
dist = Distribution(freqlist)
for type, count in freqlist:
if isinstance(type,tuple) or isinstance(type,list):
type = " ".join(type)
s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
print s.encode('utf-8')
print >>sys.stderr, "Tokens: ", freqlist.tokens()
print >>sys.stderr, "Types: ", len(freqlist)
print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
示例12: FrequencyList
# 需要导入模块: from pynlpl.statistics import FrequencyList [as 别名]
# 或者: from pynlpl.statistics.FrequencyList import append [as 别名]
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.statistics import FrequencyList
from pynlpl.textprocessors import crude_tokenizer, Classer
import sys
import codecs
import asizeof
freqlist = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f:
line = crude_tokenizer(line.strip())
freqlist.append(line)
f.close()
print "FREQLIST: " ,asizeof.asizeof(freqlist)
classer = Classer(freqlist)
print "CLASSER: " ,asizeof.asizeof(classer)
classer2 = Classer(freqlist, False,True)
print "CLASSER (ONLY DECODER): " ,asizeof.asizeof(classer2)
freqlist2 = FrequencyList()
f = codecs.open(sys.argv[1], 'r','utf-8')
for line in f: