This article collects typical usage examples of the Python method extractor.Extractor.extract. If you have been wondering what exactly Extractor.extract does, how to call it, or simply want to see it used in real code, the curated examples below should help. You can also read further into the containing class, extractor.Extractor.
The following 12 code examples of the Extractor.extract method are shown, sorted by popularity by default.
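Before the examples, here is a self-contained sketch (not taken from any of the projects below) of the call pattern they all share: construct an Extractor, then hand the raw input to extract(). The Extractor class defined here is a hypothetical stand-in; each project's real constructor options, input type, and return value differ, as the examples show.

# Hypothetical stand-in for the Extractor classes used in the examples below;
# real constructors take project-specific options, and extract() may receive
# HTML, an image, a topic name, or a directory and return text, features, or links.
class Extractor(object):
    def __init__(self, **options):
        self.options = options

    def extract(self, data):
        # A real implementation would parse "data" and return the extracted
        # result; this stub just echoes its input to illustrate the call shape.
        return data

extractor = Extractor()
result = extractor.extract("<html><p>sample document</p></html>")
print(result)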
Example 1: install
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def install(self):
    # Check if package installed
    db = hpakDB(self.pkg_name)
    if db.get_value("status") == "installed":
        misc.print_error("%s - already installed!" % (self.pkg_name), False)
        return

    self.prepare_install()
    dl = download(self.options['source'], self.pkg_path, self.pkg_name)
    dl.get()

    # Extracting the file.
    e = Extractor(self.options)
    e.extract()

    # Install depends
    self.install_dep()
    Cmds = self.options['install'].split(',')
    for cmd in Cmds:
        subprocess.Popen(cmd, shell=True).wait()

    # Verify package installed.
    if os.path.exists("%s/%s" % (HPAK_ROOT, self.options['dir'])):
        db = hpakDB(self.pkg_name)
        db.set_value("status", "installed")
        misc.print_success("%s installed." % (self.pkg_name))
    else:
        misc.print_error("%s-%s NOT installed, please try again." % (self.pkg_name, self.options['version']), True)
Example 2: __init__
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def __init__(self, filename):
    super(Database, self).__init__()
    Extractor.extract(filename)
    lines = ''
    with open(filename, 'r') as f:
        lines = f.readlines()
    self.courses = list()
    for i in lines:
        x = i.split(',')
        x = [y.strip('()"') for y in x]
        self.courses.append(Course(x[0], x[1], x[2], x[3], x[4], x[5], x[6]))
Example 3: IndexTrainer
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class IndexTrainer(object):
    def __init__(self):
        self.index = InvertedIndex()
        self.bow = Bow()
        self.extractor = Extractor('surf')
        print self.index.author
        print self.index.description

    def load_feature(self, path='../models/feature.npy'):
        self.features = np.load(path)
        if len(self.features) > 200000:
            self.features = self.features[:200000]
        print "feature shape: ", self.features.shape
        return self.features

    def run(self, path):
        self.bow.load()
        self.index.reset(self.bow.centers)
        images = imutil.get_list_image(path)
        t = imutil.Timer(1)
        t.tic()
        for i, image in enumerate(images):
            descriptors = self.extractor.extract(image)
            self.index.append(image, descriptors)
            if (i + 1) % 1000 == 0:
                t.toc('finish 1000 images: ')
                t.tic()
Example 4: test_cond
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def test_cond(self):
    from masks import mask
    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(mask)
    res = e.extract(self.data)
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example 5: test_monotony
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def test_monotony(self):
    from masks import absolute_monotony as monotony
    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(monotony.Raising)
    e.add_feature_condition(monotony.Falling)
    res = e.extract(self.data)
    logging.debug("res: \n%s", pprint.pformat(res))
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example 6: Extraktor
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class Extraktor(object):
    def __init__(self):
        self.extractor = Extractor()
        self.sqs = boto3.client('sqs')
        self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
        self.s3 = boto3.client('s3')
        self.dynamodb = boto3.resource('dynamodb')
        self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))

    def process(self):
        while True:
            ret = self.sqs.receive_message(
                QueueUrl=self.queue_url,
                MaxNumberOfMessages=10,
                WaitTimeSeconds=1
            )
            if 'Messages' not in ret:
                continue
            for msg in ret['Messages']:
                key = msg['Body']
                record = self.s3.get_object(Bucket='samuel-html', Key=key)
                # record['Body'] is a botocore.response.StreamingBody
                pack = json.loads(lzo.decompress(record['Body'].read()).decode('utf-8'))
                # response = self.client.delete_message(
                #     QueueUrl=self.queue_url,
                #     ReceiptHandle=msg['ReceiptHandle']
                # )
                # print(response)
                self.bloom_filter.add(pack['url'])
                if pack.get('code') == 200:
                    url = pack['url']
                    ret = self.extractor.extract(pack)
                    for link in ret['links']:
                        if not self.bloom_filter.add(link['url']):
                            seed(link)
                        else:
                            # print 'already crawled', link['url']
                            pass
                    # Save pack to tbl_link.
                    self.dynamodb.Table('link').put_item(
                        Item={
                            'url': url,
                            'ctime': Decimal(str(time.time())),
                            'utime': Decimal(str(time.time()))
                        }
                    )
                    logger.info("%s ok" % (pack['url']))
                else:
                    logger.warn("%s not ok code:%d" % (pack['url'], pack.get('code')))
                response = self.sqs.delete_message(
                    QueueUrl=self.queue_url,
                    ReceiptHandle=msg['ReceiptHandle']
                )
Example 7: crawl_school_programs
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def crawl_school_programs(data):
    programs = []
    for program in data:
        pprint(program)
        if program.has_key('text'):
            programs.append(program)
            continue
        url = program['url']
        print 'requesting url %s ...' % url
        r = requests.get(url, verify=False)
        if r.status_code == 200:
            html = r.text
            extractor = Extractor()
            text = extractor.extract(html)
            if len(text.strip()) != 0:
                program['text'] = text
        else:
            print 'Error code'
        programs.append(program)
    return programs
Example 8: spider
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def spider(self, root, pages=True, subcategories=True, action="traverse", preclean=False, depth=1):
    if preclean: self.graphdb.clear()
    seen_key = "URL_SEEN"
    queue_key = "URL_QUEUE"
    ex = Extractor()
    batch = neo4j.WriteBatch(self.graphdb)
    queue_empty = lambda: self.fdb.scard(queue_key) == 0
    seen = lambda x: self.fdb.sismember(seen_key, x)
    visit = lambda x: self.fdb.sadd(seen_key, x)
    dequeue = lambda: self.fdb.spop(queue_key)
    enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x))

    if action == "traverse":
        enqueue(root)
        while not queue_empty():
            current = dequeue()
            print current
            if current and current.strip() and not seen(current):
                visit(current)
                result = ex.getAllFromCategory(current)
                self.updateBatch(batch, type=neo4j.Node, node={'name': current, 'class': self.CATEGORY})
                if pages:
                    for page in result['pages']:
                        print "{0}\tp:{1}".format(current[:15], page)
                        self.incr_rel(page, current, self.CATEGORY_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': page, 'class': self.ARTICLE})
                        links = ex.getWikiLinks(page)
                        for a in links:
                            print "{0}\tp:{1}\t{2}".format(current[:15], page, a)
                            self.incr_rel(a, page, self.SIBLING_REL)
                            self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
                if subcategories:
                    for subcat in result['categories']:
                        print "{0}\tc:{1}".format(current, subcat)
                        self.incr_rel(subcat, current, self.SUBCAT_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': subcat, 'class': self.CATEGORY})
                        enqueue(subcat)
    elif action == "crawl":
        enqueue(root)
        while not queue_empty():
            topic = dequeue()
            if topic and topic.strip() and not seen(topic):
                visit(topic)
                result = ex.extract(topic)
                depth -= 1
                self.updateBatch(batch, type=neo4j.Node, node={'name': topic, 'class': result['type']})
                if result['type'] == self.CATEGORY:
                    pass
                elif result['type'] == self.ARTICLE:
                    for a in result['links']:
                        self.incr_rel(a, topic, self.SIBLING_REL)
                        print "adding: ", a
                        self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
                        if depth > 0: enqueue(a)
                    for c in result['categories']:
                        self.incr_rel(a, topic, self.CATEGORY_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': c, 'class': self.CATEGORY})
                elif result['type'] == self.DISAMBIGUATION:
                    for a in result['links']:
                        self.incr_rel(a, topic, self.DISAMB_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.DISAMBIGUATION})

    print "FINISHED WITH THE NODES..."
    for k in self.fdb.smembers(self.rel_key):
        print "REL:", k
        try:
            nodes = k.split(":", 2)
            rel = nodes[0]
            n1 = self.node_index.get('name', nodes[1])[0]
            n2 = self.node_index.get('name', nodes[2])[0]
            self.updateBatch(batch, type=neo4j.Relationship, rel={'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2})
        except Exception as e:
            print "REL EXCEPTION: ", e
    print "DONE>>>>>>>>>>>>>>>"
Example 9: extract
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def extract():
    extractor = Extractor()
    extractor.extract()
Example 10: MainFrame
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class MainFrame(Frame):
    def __init__(self, parent):
        Frame.__init__(self, parent)
        self.parent = parent
        self.music_root = ''
        self.query_path = ''
        self.extractor = Extractor(n_frames=40,
                                   n_blocks=100,
                                   learning_rate=0.00053,
                                   verbose=True)
        self.style = Style()
        self.style.theme_use("default")

        padx = 2
        pady = 2
        root_select_button = Button(self, text="Select a directory")
        root_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        root_select_button.bind("<Button-1>", self.set_music_root)
        analyze_button = Button(self, text="Analyze")
        analyze_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        analyze_button.bind("<Button-1>", self.analyze)
        query_select_button = Button(self, text="Select a file")
        query_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        query_select_button.bind("<Button-1>", self.set_query_path)
        search_button = Button(self, text="Search similar songs")
        search_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        search_button.bind("<Button-1>", self.search_music)
        self.pack(fill=BOTH, expand=1)

    def set_music_root(self, event):
        self.music_root = filedialog.askdirectory()

    def analyze(self, event):
        if (self.music_root == ''):
            # TODO show error dialog
            print("Set a music directory first")
            return
        print("Analyzing")
        path_feature_map, error = self.extractor.extract(self.music_root)
        print("Saving")
        filename = os.path.basename(self.music_root)
        jsonpath = os.path.join(jsondir, '{}.json'.format(filename))
        dump_json(path_feature_map, jsonpath)

    def set_query_path(self, event):
        self.query_path = filedialog.askopenfilename(initialdir=self.music_root)

    def search_music(self, event):
        if (self.query_path == ''):
            # TODO show error dialog
            print("Set a music file first")
            return
        k_nearest = search(self.query_path)
        music_list = MusicList(self)
        for path, vector in k_nearest:
            music_list.append(path)
Example 11: tqdm
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
pbar = tqdm(total=len(data.data))
for video in data.data:
    # Get the path to the sequence for this video.
    path = os.path.join('data', 'sequences', video[2] + '-' + str(seq_length) + \
        '-features')  # numpy will auto-append .npy

    # Check if we already have it.
    if os.path.isfile(path + '.npy'):
        pbar.update(1)
        continue

    # Get the frames for this video.
    frames = data.get_frames_for_sample(video)

    # Now downsample to just the ones we need.
    frames = data.rescale_list(frames, seq_length)

    # Now loop through and extract features to build the sequence.
    sequence = []
    for image in frames:
        features = model.extract(image)
        sequence.append(features)

    # Save the sequence.
    np.save(path, sequence)
    pbar.update(1)

pbar.close()
Example 12: __init__
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class Crawler:
    """
    Main class for this dummy crawler
    """
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.data = None
        self.school_collection = None
        self.extractor = Extractor()

    def load(self):
        if self.data != None:
            print 'You have unsaved in-memory data, cannot load new data'
            exit(1)
        with open(self.dbfile, 'r') as f:
            self.data = json.load(f)
        self.school_collection = SchoolCollection(self.data['schools'])
        print 'Loaded %s json file, got %d schools' % (self.dbfile,
            self.school_collection.get_num_schools())

    def dump(self):
        if self.data == None:
            print 'Nothing to dump'
            exit(1)
        self.data = self.school_collection.toJSON()
        with open(self.dbfile, 'w') as f:
            json.dump(self.data, f)
        print 'Dumped %s json file' % self.dbfile

    def fetch(self, url):
        """
        Entrance for all kinds of HTTP requests
        """
        is_ok, html = False, None
        try:
            response = requests.get(url, verify=False)
            if response.status_code == 200:
                html = response.text
                is_ok = True
            else:
                print >>sys.stderr, 'Error fetch'
        finally:
            return is_ok, html

    def fetch_program_text(self, url):
        """
        Just read the content from the url, keeping <p> text only.
        I think this is the best heuristic method.
        """
        is_ok, html = self.fetch(url)
        html = html.strip()
        text = self.extractor.extract(html)
        return is_ok, text

    # important public API
    def add_program(self, school_name, data, fetch_text=True, override_program=False):
        """
        Try to add a program to the program list.
        Currently I don't care about the return value.
        """
        if self.school_collection.is_school_exists(school_name) == False:
            print >>sys.stderr, "Should add school '%s' first" % school_name
            return None
        school = self.school_collection.find_school(school_name)
        if school.is_program_exists(data['name']):
            if override_program == False:
                return None
        prog = Program(data)
        if fetch_text:
            is_ok, text = self.fetch_program_text(prog.url)
            if is_ok:
                prog.text = text
        pprint(prog.toJSON())
        school.insert_program(prog)
        return None