This article collects typical usage examples of the Python method gensim.corpora.WikiCorpus.get_texts. If you are wondering what WikiCorpus.get_texts does, how to call it, or what real-world usage looks like, the curated code examples below may help. You can also read more about the containing class, gensim.corpora.WikiCorpus.
The following shows 15 code examples of WikiCorpus.get_texts, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
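Most of the examples below share the same streaming pattern: open the compressed Wikipedia dump with WikiCorpus, iterate over get_texts(), and write one article per line. A minimal sketch of that pattern follows (file names are placeholders; the lemmatize argument exists only in gensim versions before 4.0, and on older versions the yielded tokens may be bytes rather than str):
from gensim.corpora import WikiCorpus

# Passing dictionary={} skips building a vocabulary, which is much faster
# when only the raw token streams are needed.
wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2',
                  lemmatize=False, dictionary={})
with open('wiki_texts.txt', 'w', encoding='utf-8') as out:
    # get_texts() yields each article as a list of tokens
    for i, tokens in enumerate(wiki.get_texts(), start=1):
        line = ' '.join(t.decode('utf-8') if isinstance(t, bytes) else t
                        for t in tokens)
        out.write(line + '\n')
        if i % 10000 == 0:
            print('Saved %d articles' % i)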
Example 1: main
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating simple wiki serialized corpus')
    # Download the raw dump file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
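Not part of the original example, but a hedged sketch of how the artifacts saved by main() might be queried afterwards; it assumes the same file-name constants (DICTFILE, TDIFMODEL, LSIMODEL, SIMMATRIX, ARTICLEDICT) defined elsewhere in that script:
import json
from gensim import similarities
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel

def query_similar(query, topn=10):
    # Load the artifacts produced by main()
    dictionary = Dictionary.load_from_text(DICTFILE)
    tfidf = TfidfModel.load(TDIFMODEL)
    lsi = LsiModel.load(LSIMODEL)
    index = similarities.MatrixSimilarity.load(SIMMATRIX)
    with open(ARTICLEDICT) as f:
        article_dict = json.load(f)
    # bag-of-words -> tf-idf -> LSI space, then cosine similarity against the index
    bow = dictionary.doc2bow(query.lower().split())
    sims = index[lsi[tfidf[bow]]]
    best = sorted(enumerate(sims), key=lambda x: -x[1])[:topn]
    # article_dict was written by json.dump, so its keys are strings
    return [(article_dict[str(i)], float(score)) for i, score in best]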
Example 2: extract_wiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def extract_wiki(thresh, env_path, vec_file):
    program = os.path.basename(env_path[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    print('--- load ck12 word2vec')
    model = gensim.models.Word2Vec.load_word2vec_format(vec_file, binary=False)
    print('--- filtering keywords based on sim to ck12 keyword science')
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        # use the first 20 non-stopword tokens as a rough topic proxy
        topic = [w for w in text[:20] if w not in stopwords.words('english')]
        sim = np.mean([model[w].dot(model['science']) if w in model else 0 for w in topic])
        # sim = model['science'].dot(topic_vec)
        if sim > thresh:
            output.write(space.join(text) + "\n")
            i = i + 1
            if i % 100 == 0:
                logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished. Saved " + str(i) + " articles")
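The score above is a raw dot product between unnormalised word vectors, so longer vectors dominate the mean. A hedged variant using cosine similarity instead (the helper name is an illustration, not from the original code; model loading stays the same as in the example):
import numpy as np

def mean_science_similarity(model, tokens):
    # Mean cosine similarity between the given tokens and the word 'science';
    # tokens missing from the vocabulary contribute 0.
    ref = model['science']
    ref = ref / np.linalg.norm(ref)
    sims = []
    for w in tokens:
        if w in model:
            v = model[w]
            sims.append(float(v.dot(ref) / np.linalg.norm(v)))
        else:
            sims.append(0.0)
    return float(np.mean(sims)) if sims else 0.0
In the loop above it would replace the np.mean(...) line, e.g. sim = mean_science_similarity(model, topic).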
Example 3: process_enwiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def process_enwiki(input_file, output_file):
    space = ' '
    i = 0
    output = open(output_file, 'w')
    wiki = WikiCorpus(input_file, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
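A common next step, not shown in the example, is to train word vectors on the generated file. A minimal sketch under the pre-4.0 gensim API used throughout these examples (the size parameter was later renamed vector_size; file names are placeholders):
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# LineSentence streams one whitespace-tokenised article per line,
# matching the format written by process_enwiki above.
sentences = LineSentence('wiki.en.text')
model = Word2Vec(sentences, size=200, window=5, min_count=5, workers=4)
model.save('wiki.en.word2vec.model')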
Example 4: parse_wiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def parse_wiki(filename):
    # use open() rather than the Python 2-only file() built-in
    fout = open('../../paper/data/wiki/wiki_corpus', 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write('%s\n' % ' '.join(text))
        if count % 10000 == 0:
            logging.info(count)
        count += 1
    fout.close()
    logging.info('Finish %d' % count)
Example 5: parse
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def parse(filename):
    OUTPATH = '../gen_data/wikicorpus'
    fout = open(OUTPATH, 'w')
    wiki = WikiCorpus(filename, lemmatize=False, dictionary={}, processes=5)
    count = 0
    for text in wiki.get_texts():
        fout.write(" ".join(text) + "\n")
        count = count + 1
        if count % 10000 == 0:
            logging.info("Saved " + str(count) + " articles")
    fout.close()
    logging.info("Finished: saved " + str(count) + " articles")
Example 6: process_wiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def process_wiki(infile, outfile):
    from gensim.corpora import WikiCorpus
    wiki = WikiCorpus(infile, lemmatize=False, dictionary={})
    i = 0
    with open(outfile, 'w') as fw:
        for text in wiki.get_texts():
            text = ' '.join(text)
            # cut() is an external word-segmentation helper (not shown here)
            cut_text = cut(text)
            # collapse any runs of whitespace introduced by the segmenter
            fw.write(re.sub(r' {1,}', ' ', ' '.join(cut_text)) + '\n')
            i += 1
            if i % 1000 == 0:
                logger.info('Saved ' + str(i) + ' texts')
    logger.info('Finished ' + str(i) + ' texts')
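The cut() helper called above is not part of the snippet; for Chinese Wikipedia text it is typically a word segmenter. A hedged stand-in based on jieba (an assumption, not the original helper):
import jieba

def cut(text):
    # Segment a Chinese string into words; jieba.cut returns a generator,
    # so materialise it as a list and drop whitespace-only pieces.
    return [w for w in jieba.cut(text) if w.strip()]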
Example 7: enwiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def enwiki(srcPath, tarPath):
    index = 0
    space = " "
    output = open(tarPath, 'w')
    wiki = WikiCorpus(srcPath, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(' '.join(text) + '\n')
        index += 1
        if index % 10000 == 0:
            print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\tSaved " + str(index) + " articles.")
    output.close()
    print("Finished saved " + str(index) + " articles.")
Example 8: save_to_batches
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def save_to_batches(input, doc_set=set(), batch_path='.', batch_size=1000, lang='@body'):
    # Note: this example uses Python 2 syntax (print statements, unicode)
    # and the old protobuf-based BigARTM interface.
    if not doc_set:  # doc_set is empty
        return
    # pass a dummy dictionary so WikiCorpus does not scan the dump to build a vocabulary
    wiki = WikiCorpus(input, lemmatize=False, dictionary='empty dictionary')
    wiki.metadata = True  # request to extract page_id and title
    num_docs_found = 0
    batch_dict = {}
    NNZ = 0
    batch = artm.messages_pb2.Batch()
    for (text, page_id_and_title) in wiki.get_texts():
        page_id = page_id_and_title[0]
        title = page_id_and_title[1]
        if page_id in doc_set:
            num_docs_found += 1
            print num_docs_found, page_id, title
            # get token term frequencies for the text
            text_tf = Counter(text)
            for token in text:
                # update batch dictionary
                if token not in batch_dict:
                    batch.token.append(unicode(token, 'utf-8'))
                    batch_dict[token] = len(batch.token) - 1
            # add item to batch
            item = batch.item.add()
            item.id = int(page_id)
            item.title = title
            field = item.field.add()
            field.name = lang
            for token in text_tf:
                field.token_id.append(batch_dict[token])
                field.token_count.append(text_tf[token])
                NNZ += text_tf[token]
            if len(batch.item) == batch_size:
                artm.library.Library().SaveBatch(batch, batch_path)
                print 'Batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
                batch = artm.messages_pb2.Batch()
                batch_dict = {}
                NNZ = 0
    if len(batch.item) > 0:
        artm.library.Library().SaveBatch(batch, batch_path)
        print 'Last batch done, |W| = ' + str(len(batch.token)) + ", NNZ = " + str(NNZ)
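A hedged invocation sketch for save_to_batches(); the page ids and paths are placeholders, and the old protobuf-based BigARTM interface assumed by the example must be installed:
# Page ids (as strings, matching the metadata yielded by get_texts) of the
# articles to keep; the values here are placeholders.
wanted_ids = {'12', '25', '39'}
save_to_batches('enwiki-latest-pages-articles.xml.bz2',
                doc_set=wanted_ids,
                batch_path='./wiki_batches',
                batch_size=1000,
                lang='@body')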
Example 9: main
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
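For Chinese Wikipedia the lines written by this script are not word-segmented (and mix Traditional and Simplified characters). A hedged follow-up sketch that segments the output with jieba; conversion to Simplified Chinese, e.g. with OpenCC, would be a separate step not shown here:
import jieba

with open('wiki_texts.txt', encoding='utf-8') as fin, \
        open('wiki_seg.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        # jieba.cut yields words; drop whitespace-only pieces and re-join
        words = [w for w in jieba.cut(line.strip()) if w.strip()]
        fout.write(' '.join(words) + '\n')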
Example 10: dataprocess
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def dataprocess(_config):
    i = 0
    output = None
    if six.PY3:
        output = open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    else:
        output = codecs.open(os.path.join(_config.data_path, _config.zhwiki_raw), 'w')
    wiki = WikiCorpus(os.path.join(_config.data_path, _config.zhwiki_bz2), lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        if six.PY3:
            output.write(b' '.join(text).decode('utf-8', 'ignore') + '\n')
        else:
            output.write(' '.join(text) + '\n')
        i += 1
        if i % 10000 == 0:
            print('Saved ' + str(i) + ' articles')
    output.close()
    print('Finished Saved ' + str(i) + ' articles')
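The Python 2/3 branching above can be avoided: io.open accepts an encoding argument on both interpreters. A hedged rewrite under the same assumptions about the _config attributes and the pre-4.0 gensim API (the function name is an illustration, not from the original code):
import io
import os

from gensim.corpora import WikiCorpus

def dataprocess_simplified(_config):
    src = os.path.join(_config.data_path, _config.zhwiki_bz2)
    dst = os.path.join(_config.data_path, _config.zhwiki_raw)
    wiki = WikiCorpus(src, lemmatize=False, dictionary={})
    with io.open(dst, 'w', encoding='utf-8') as output:
        for i, text in enumerate(wiki.get_texts(), start=1):
            # tokens may be bytes on older gensim versions, str on newer ones
            tokens = [t.decode('utf-8', 'ignore') if isinstance(t, bytes) else t
                      for t in text]
            output.write(u' '.join(tokens) + u'\n')
            if i % 10000 == 0:
                print('Saved %d articles' % i)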
Example 11: process_wiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(b' '.join(text).decode('utf-8') + '\n')
        i = i + 1
        if i % 10000 == 0:
            logger.info('Saved ' + str(i) + ' articles')
    output.close()
    logger.info('Finished ' + str(i) + ' articles')
Example 12: my_function
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def my_function():
    space = ' '
    i = 0
    l = []
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    f = open('./data/reduce_zhiwiki.txt', 'w')
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        for temp_sentence in text:
            # convert Traditional Chinese to Simplified Chinese, then segment with jieba
            temp_sentence = Converter('zh-hans').convert(temp_sentence)
            seg_list = list(jieba.cut(temp_sentence))
            for temp_term in seg_list:
                l.append(temp_term)
        f.write(space.join(l) + '\n')
        l = []
        i = i + 1
        if i % 200 == 0:
            print('Saved ' + str(i) + ' articles')
    f.close()
Example 13: process_wiki
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def process_wiki(inp, outp):
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    space = " "
    i = 0
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")
Example 14: make_wiki_corpus
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def make_wiki_corpus(inp, outp, logger):
    '''
    Preprocess a Wikipedia dump.
    :param inp: path to the dump file, e.g. enwiki-20150304-pages-articles.xml.bz2
    :param outp: output text file with the preprocessed corpus,
                 e.g. wiki.en.text
    :param logger: logger used to report preprocessing progress
    '''
    output = open(outp, 'w')
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    i = 0
    space = " "
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i += 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished: saved " + str(i) + " articles")
Example 15: zhwiki2chars
# Required import: from gensim.corpora import WikiCorpus [as alias]
# Or: from gensim.corpora.WikiCorpus import get_texts [as alias]
def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # divided by character
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("process %d articles" % i)
    out.close()
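A minimal invocation sketch for zhwiki2chars(); the dump path and output file are placeholders, and the token.decode call above assumes an older gensim version that yields bytes tokens:
# Produces one article per line with Chinese characters separated by spaces,
# suitable for character-level embedding training (e.g. via LineSentence).
zhwiki2chars('zhwiki-latest-pages-articles.xml.bz2', 'zhwiki_chars.txt')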