This article collects typical usage examples of the raw method of nltk.corpus.PlaintextCorpusReader in Python. If you have been wondering what exactly PlaintextCorpusReader.raw does, how to use it, or what real code that calls it looks like, the curated examples below should help. You can also read further about the class it belongs to, nltk.corpus.PlaintextCorpusReader.
The following shows 15 code examples of PlaintextCorpusReader.raw, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
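Before the examples, a minimal self-contained sketch may help orient the reader. It assumes a hypothetical directory my_corpus/ of plain-text files (the path and file name below are placeholders, not taken from the examples): raw(fileid) returns the contents of one file as a single string, while raw() with no argument returns all matched files concatenated.
from nltk.corpus import PlaintextCorpusReader

# Point the reader at a directory and a regex selecting the files to load.
# 'my_corpus/' and 'doc1.txt' are hypothetical placeholders.
corpus = PlaintextCorpusReader('my_corpus/', r'.*\.txt')
print(corpus.fileids())            # files matched by the regex
one_file = corpus.raw('doc1.txt')  # contents of a single file as one string
everything = corpus.raw()          # all matched files concatenated into one string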
Example 1: train_computer_science
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def train_computer_science(save):
    comp_sci_corpus = PlaintextCorpusReader(
        '{}/corpus/computerscience/'.format(os.path.dirname(os.path.abspath(__file__))), '.*')
    comp_sci_chunker = Chunker('computerscience', comp_sci_corpus.raw('train.txt'))
    chunk_score = comp_sci_chunker.evaluate(comp_sci_corpus.raw('test.txt'))
    print_chunk_score(chunk_score)
    if save:
        comp_sci_chunker.save_chunker()
Example 2: tokenize_report_sents
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def tokenize_report_sents(self, report_of_the_time):
    re = ReportEnviroments()
    new_corpus_reports_fileids_list = PlaintextCorpusReader(re.original_reports_corpus_path, '.*')
    raw_text = new_corpus_reports_fileids_list.raw(report_of_the_time)
    sentencas_raw = sent_tokenize(raw_text)
    original_report_path = str(new_corpus_reports_fileids_list.abspath(report_of_the_time))
    return sentencas_raw, original_report_path, report_of_the_time
Example 3: compare
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def compare(request):
    errors = []
    statistics = []
    stats = []
    for x in range(1, 3):
        cantoname = "canto" + str(x) + ".txt"
        w = PlaintextCorpusReader("./", cantoname)
        t = nltk.text.Text(w.words())
        l_lines = len(line_tokenize(w.raw()))
        l_uwords = len(set(w.words()))
        l_words = len(w.words())
        l_sents = len(w.sents())
        l_paras = len(w.paras())
        l_linperpara = l_lines / l_paras
        statistics.append(x)
        statistics.append("Number of Words - " + str(l_words))
        statistics.append("Number of Unique Words - " + str(l_uwords))
        statistics.append("Number of Sentences - " + str(l_sents))
        statistics.append("Number of Lines - " + str(l_lines))
        statistics.append("Number of Paras - " + str(l_paras))
        statistics.append("Number of Lines/Paras - " + str(l_linperpara))
        lexical_density = l_words / l_uwords
        l_wordpersent = l_words / l_sents
        statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
        statistics.append("Words per sentence - " + str(l_wordpersent))
        stats.append(statistics)
    return render_to_response('compare.html', {'stats': statistics})
Example 4: stats
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def stats(request):
    errors = []
    statistics = []
    if 'q' in request.GET:
        q = request.GET['q']
        if not q:
            errors.append('Enter a Canto Number')
        else:
            cantoname = "canto" + q + ".txt"
            w = PlaintextCorpusReader("./", cantoname)
            t = nltk.text.Text(w.words())
            l_lines = len(line_tokenize(w.raw()))
            l_uwords = len(set(w.words()))
            l_words = len(w.words())
            l_sents = len(w.sents())
            l_paras = len(w.paras())
            l_linperpara = l_lines / l_paras
            statistics.append("Number of Words - " + str(l_words))
            statistics.append("Number of Unique Words - " + str(l_uwords))
            statistics.append("Number of Sentences - " + str(l_sents))
            statistics.append("Number of Lines - " + str(l_lines))
            statistics.append("Number of Paras - " + str(l_paras))
            statistics.append("Number of Lines/Paras - " + str(l_linperpara))
            lexical_density = l_words / l_uwords
            l_wordpersent = l_words / l_sents
            statistics.append("Lexical Density (Total/Uniq) words - " + str(lexical_density))
            statistics.append("Words per sentence - " + str(l_wordpersent))
            return render_to_response('stats.html', {'statistics': statistics})
    return render_to_response('stats.html', {'errors': errors})
Example 5: extract_related_terms
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extract_related_terms(self):
    re = ReportEnviroments()
    new_corpus_clusters_fileids_list = PlaintextCorpusReader(re.cluster_corpus_path, '.*')
    raw_text_list = []
    for fileid in new_corpus_clusters_fileids_list.fileids():
        raw_text_list.append([new_corpus_clusters_fileids_list.raw(fileids=fileid)])
    return raw_text_list
Example 6: extractWordsOnly
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extractWordsOnly(self, article):
    articlename = article + '.txt'
    #corpus_root = '/home/jesal/onedump/'
    wl = PlaintextCorpusReader(corpus_root, '.*')
    exturllist = self.extractexternalURL(article)
    textstring = wl.raw(articlename)
    for item in exturllist:
        textstring = textstring.replace(item, ' ')
    #templist = re.sub(r'[.!,;?]', ' ', textstring).split()
    templist = nltk.word_tokenize(textstring)
    listtemp = []
    for i in templist:
        j = re.sub('[^A-Za-z]+', '', i)
        listtemp.append(str(j))
    templistfinal = self.removeEmpty(listtemp)
    return templistfinal
Example 7: raw
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def raw(self, fileid):
    """
    Returns the raw text of fileid.

    >>> hr.raw('1996/HAM2-960622.xml')[:38]
    '<?xml version="1.0" encoding="UTF-8"?>'
    """
    wordlists = PlaintextCorpusReader(self.hamshahri_root, fileid)
    return wordlists.raw(fileid)
Example 8: GetTweets
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def GetTweets():
    corpusdir = 'DB/'
    newCorpus = PlaintextCorpusReader(corpusdir, r'.*\.txt$')  # the regex skips non-text files such as .DS_Store
    pattern = '\r\n'  # each 'tweet' in the database is terminated by a \r\n line break
    tweets = nltk.regexp_tokenize(newCorpus.raw(), pattern, gaps=True)  # split the raw text into individual tweets
    tweets = [x.lower() for x in tweets]  # lowercase all strings to make matching easier
    return tweets
Example 9: raw
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def raw():
    """
    Returns the raw text of the corpus.

    >>> raw()[:54]
    '# DELM'
    """
    wordlists = PlaintextCorpusReader(bijankhan_root, bijankhan_fileid)
    return wordlists.raw(bijankhan_fileid)
Example 10: process_nan
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def process_nan():
    corpus_root = '../nan_samples/'
    library = PlaintextCorpusReader(corpus_root, '.*', encoding='utf-8')
    tokens = nltk.word_tokenize(library.raw())
    tokens = [process_element(x) for x in tokens]
    nan_tokens = []
    for i in tokens:
        nan_tokens += i.split(' ')
    return nan_tokens
Example 11: small_event_sentences
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def small_event_sentences():
    corpus_root = '../Texas_Wild_Fire/'
    wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    SmallEvent = wordlists.raw()
    sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')
    SmallEventSentences = sent_tokenizer.tokenize(SmallEvent)
    return SmallEventSentences
Example 12: class_event_sentences
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def class_event_sentences():
    corpus_root = '../Islip13Rain/'
    wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    ClassEvent = wordlists.raw()
    sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')
    ClassEventSentences = sent_tokenizer.tokenize(ClassEvent)
    return ClassEventSentences
Example 13: big_event_sentences
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def big_event_sentences():
    corpus_root = '../Brazil_NightClub_Fire/'
    wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    BigEvent = wordlists.raw()
    sent_tokenizer = nltk.data.load('../nltkData/tokenizers/punkt/english.pickle')
    BigEventSentences = sent_tokenizer.tokenize(BigEvent)
    return BigEventSentences
Example 14: carga
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def carga():
    client = pymongo.MongoClient(MONGODB_URI)
    db = client.docs
    docs = db.SIMILITUD
    newcorpus = PlaintextCorpusReader(corpus_root, '.*')
    # Compare every file against every other file and store each pairwise score.
    for fileid in newcorpus.fileids():
        for file2 in newcorpus.fileids():
            result = {"f1": fileid, "f2": file2,
                      "value": compare_texts(newcorpus.raw(fileid), newcorpus.raw(file2))}
            docs.insert_one(result)
Example 15: extractexternalURL
# Required import: from nltk.corpus import PlaintextCorpusReader [as alias]
# Or: from nltk.corpus.PlaintextCorpusReader import raw [as alias]
def extractexternalURL(self, article):
    #corpus_root = '/home/jesal/onedump/'
    wl = PlaintextCorpusReader(corpus_root, '.*')
    #tempww = wl.words(fileids = article)
    articlename = article + '.txt'
    rawopen = wl.raw(articlename)
    # The pattern must be defined before it is used below.
    rg = re.compile('http..(?:\\/[\\w\\.\\-]+)+', re.IGNORECASE | re.DOTALL)
    listfinal = re.findall(rg, rawopen)
    return listfinal