本文整理匯總了Python中pynlpl.statistics.FrequencyList.typetokenratio方法的典型用法代碼示例。如果您正苦於以下問題:Python FrequencyList.typetokenratio方法的具體用法?Python FrequencyList.typetokenratio怎麼用?Python FrequencyList.typetokenratio使用的例子?那麼,這裡精選的方法代碼示例或許可以為您提供幫助。您也可以進一步了解該方法所在類pynlpl.statistics.FrequencyList的用法示例。
在下文中一共展示了FrequencyList.typetokenratio方法的5個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Python代碼示例。
示例1: main
# 需要導入模塊: from pynlpl.statistics import FrequencyList [as 別名]
# 或者: from pynlpl.statistics.FrequencyList import typetokenratio [as 別名]
def main():
    """Print an n-gram frequency list for the files named on the command line.

    Options (parsed with getopt from ``sys.argv``):
        -n SIZE      n-gram size (default 1, i.e. single tokens)
        -i           count case-insensitively
        -e ENCODING  encoding of the input files (default utf-8)
        -h, --help   show usage and exit

    For every type in the frequency list one tab-separated line is written
    to standard output: the type (n-grams joined by spaces), its count,
    ``dist[type]`` and ``dist.information(type)``. The totals (tokens, types,
    type-token ratio, entropy) are written to standard error.

    Exits with status 2 when getopt rejects the command line, and with
    status 1 on an unhandled option or when no input file is given.
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # Report the parse error, show the usage text and stop.
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        elif o in ("-h", "--help"):
            # These options are accepted by the getopt spec above, so they
            # must be handled instead of being reported as unknown.
            usage()
            sys.exit(0)
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        # Use the print function here as well: the Python 2 form
        # ``print >>sys.stderr, ...`` raises TypeError with print().
        print("No files specified", file=sys.stderr)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # The context manager closes the file even if tokenizing fails.
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                tokens = crude_tokenizer(line)
                if n > 1:
                    freqlist.append(Windower(tokens, n))
                else:
                    freqlist.append(tokens)

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Join n-gram keys for display only. The statistics are looked up
        # under the key exactly as the frequency list yielded it, since the
        # distribution is built from that same frequency list.
        if isinstance(key, (tuple, list)):
            label = " ".join(key)
        else:
            label = key
        print("\t".join((label, str(count), str(dist[key]),
                         str(dist.information(key)))))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
示例2: main
# 需要導入模塊: from pynlpl.statistics import FrequencyList [as 別名]
# 或者: from pynlpl.statistics.FrequencyList import typetokenratio [as 別名]
def main():
    """Print an n-gram frequency list for the files named on the command line.

    Command-line arguments (parsed with argparse):
        -n / --ngramsize        n-gram size (default 1)
        -i / --caseinsensitive  count case-insensitively
        -e / --encoding         encoding of the input files (default utf-8)
        files                   one or more input text files

    For every type in the frequency list one tab-separated line is written
    to standard output: the type (n-grams joined by spaces), its count,
    ``dist[type]`` and ``dist.information(type)``. The totals (tokens, types,
    type-token ratio, entropy) are written to standard error.
    """
    parser = argparse.ArgumentParser(description="Generate an n-gram frequency list", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-n', '--ngramsize', help="N-gram size", type=int, action='store', default=1)
    parser.add_argument('-i', '--caseinsensitive', help="Case insensitive", action="store_true")
    parser.add_argument('-e', '--encoding', help="Character encoding", type=str, action='store', default='utf-8')
    # nargs='+' makes argparse itself reject an empty file list, so no
    # separate "no files specified" check is needed afterwards.
    parser.add_argument('files', type=str, nargs='+', help="Input text file(s) to build the frequency list from")
    args = parser.parse_args()

    # The second positional argument is the case-SENSITIVE flag, so the
    # command-line switch has to be negated before it is passed on.
    freqlist = FrequencyList(None, not args.caseinsensitive)
    for filename in args.files:
        # The context manager closes the file even if tokenizing fails.
        with io.open(filename, 'r', encoding=args.encoding) as f:
            for line in f:
                tokens = crude_tokenizer(line)
                if args.ngramsize > 1:
                    freqlist.append(Windower(tokens, args.ngramsize))
                else:
                    freqlist.append(tokens)

    dist = Distribution(freqlist)
    for key, count in freqlist:
        # Join n-gram keys for display only. The statistics are looked up
        # under the key exactly as the frequency list yielded it, since the
        # distribution is built from that same frequency list.
        if isinstance(key, (tuple, list)):
            label = " ".join(key)
        else:
            label = key
        print("\t".join((label, str(count), str(dist[key]),
                         str(dist.information(key)))))

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
示例3: FrequencyList
# 需要導入模塊: from pynlpl.statistics import FrequencyList [as 別名]
# 或者: from pynlpl.statistics.FrequencyList import typetokenratio [as 別名]
sys.exit(1)
if not files:
print >>sys.stderr, "No files specified"
sys.exit(1)
freqlist = FrequencyList(None, casesensitive)
for filename in files:
f = codecs.open(filename,'r',encoding)
for line in f:
if n > 1:
freqlist.append(Windower(crude_tokenizer(line),n))
else:
freqlist.append(crude_tokenizer(line))
f.close()
dist = Distribution(freqlist)
for type, count in freqlist:
if isinstance(type,tuple) or isinstance(type,list):
type = " ".join(type)
s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
print s.encode('utf-8')
print >>sys.stderr, "Tokens: ", freqlist.tokens()
print >>sys.stderr, "Types: ", len(freqlist)
print >>sys.stderr, "Type-token ratio: ", freqlist.typetokenratio()
print >>sys.stderr, "Entropy: ", dist.entropy()
示例4: FrequencyList
# 需要導入模塊: from pynlpl.statistics import FrequencyList [as 別名]
# 或者: from pynlpl.statistics.FrequencyList import typetokenratio [as 別名]
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from pynlpl.textprocessors import Windower, crude_tokenizer
from pynlpl.statistics import FrequencyList, Distribution
import sys
import codecs
# Build a bigram frequency list from the UTF-8 text file named by the first
# command-line argument, then print its type/token ratio followed by the
# lines produced by the derived distribution.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.stderr.write("Usage: " + sys.argv[0] + " FILE\n")
    sys.exit(2)

with codecs.open(sys.argv[1], 'r', 'utf-8') as infile:
    freqlist = FrequencyList()
    for line in infile:
        # A window size of 2 turns each tokenized line into bigrams.
        freqlist.append(Windower(crude_tokenizer(line), 2))

# print() calls replace the Python 2 print statements, which are a syntax
# error on Python 3.
print("Type/Token Ratio: ", freqlist.typetokenratio())

### uncomment if you want to output the full frequency list:
#for line in freqlist.output():
#    print(line)

dist = Distribution(freqlist)
for line in dist.output():
    # Print the text itself; encoding to bytes first would print a bytes
    # repr on Python 3.
    print(line)
示例5: FrequencyList
# 需要導入模塊: from pynlpl.statistics import FrequencyList [as 別名]
# 或者: from pynlpl.statistics.FrequencyList import typetokenratio [as 別名]
sys.exit(1)
if not files:
print >>sys.stderr, "No files specified"
sys.exit(1)
freqlist = FrequencyList(None, casesensitive)
for filename in files:
f = codecs.open(filename,'r',encoding)
for line in f:
if n > 1:
freqlist.append(Windower(crude_tokenizer(line),n))
else:
freqlist.append(crude_tokenizer(line))
f.close()
dist = Distribution(freqlist)
for type, count in freqlist:
if isinstance(type,tuple) or isinstance(type,list):
type = " ".join(type)
s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str(dist.information(type))
print(s)
print("Tokens: ", freqlist.tokens(),file=sys.stderr)
print("Types: ", len(freqlist),file=sys.stderr)
print("Type-token ratio: ", freqlist.typetokenratio(),file=sys.stderr)
print("Entropy: ", dist.entropy(),file=sys.stderr)