This article collects typical usage examples of the Python method extractor.Extractor.extract. If you have been wondering what exactly Extractor.extract does, how to call it, or simply want to see it used in real code, the curated examples below should help. You can also read further into the containing class, extractor.Extractor.
The following 12 code examples of the Extractor.extract method are shown, sorted by popularity by default.
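Before the examples, here is a self-contained sketch (not taken from any of the projects below) of the call pattern they all share: construct an Extractor, then hand the raw input to extract(). The Extractor class defined here is a hypothetical stand-in; each project's real constructor options, input type, and return value differ, as the examples show.

# Hypothetical stand-in for the Extractor classes used in the examples below;
# real constructors take project-specific options, and extract() may receive
# HTML, an image, a topic name, or a directory and return text, features, or links.
class Extractor(object):
    def __init__(self, **options):
        self.options = options

    def extract(self, data):
        # A real implementation would parse "data" and return the extracted
        # result; this stub just echoes its input to illustrate the call shape.
        return data

extractor = Extractor()
result = extractor.extract("<html><p>sample document</p></html>")
print(result)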
Example 1: install
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def install(self):
    # Check if package installed
    db = hpakDB(self.pkg_name)
    if db.get_value("status") == "installed":
        misc.print_error("%s - already installed!" % (self.pkg_name), False)
        return

    self.prepare_install()
    dl = download(self.options['source'], self.pkg_path, self.pkg_name)
    dl.get()

    # Extracting the file.
    e = Extractor(self.options)
    e.extract()

    # Install depends
    self.install_dep()
    Cmds = self.options['install'].split(',')
    for cmd in Cmds:
        subprocess.Popen(cmd, shell=True).wait()

    # Verify package installed.
    if os.path.exists("%s/%s" % (HPAK_ROOT, self.options['dir'])):
        db = hpakDB(self.pkg_name)
        db.set_value("status", "installed")
        misc.print_success("%s installed." % (self.pkg_name))
    else:
        misc.print_error("%s-%s NOT installed, please try again." % (self.pkg_name, self.options['version']), True)
Example 2: __init__
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def __init__(self, filename):
    super(Database, self).__init__()
    Extractor.extract(filename)
    lines = ''
    with open(filename, 'r') as f:
        lines = f.readlines()
    self.courses = list()
    for i in lines:
        x = i.split(',')
        x = [y.strip('()"') for y in x]
        self.courses.append(Course(x[0], x[1], x[2], x[3], x[4], x[5], x[6]))
Example 3: IndexTrainer
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class IndexTrainer(object):
    def __init__(self):
        self.index = InvertedIndex()
        self.bow = Bow()
        self.extractor = Extractor('surf')
        print self.index.author
        print self.index.description

    def load_feature(self, path='../models/feature.npy'):
        self.features = np.load(path)
        if len(self.features) > 200000:
            self.features = self.features[:200000]
        print "feature shape: ", self.features.shape
        return self.features

    def run(self, path):
        self.bow.load()
        self.index.reset(self.bow.centers)
        images = imutil.get_list_image(path)
        t = imutil.Timer(1)
        t.tic()
        for i, image in enumerate(images):
            descriptors = self.extractor.extract(image)
            self.index.append(image, descriptors)
            if (i + 1) % 1000 == 0:
                t.toc('finish 1000 images: ')
                t.tic()
Example 4: test_cond
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def test_cond(self):
    from masks import mask
    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(mask)
    res = e.extract(self.data)
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example 5: test_monotony
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def test_monotony(self):
    from masks import absolute_monotony as monotony
    e = Extractor()
    logging.debug(e)
    e.add_feature_condition(monotony.Raising)
    e.add_feature_condition(monotony.Falling)
    res = e.extract(self.data)
    logging.debug("res: \n%s", pprint.pformat(res))
    self.assertTrue(len(res[self.data.keys()[0]]) > 0)
Example 6: Extraktor
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class Extraktor(object):
    def __init__(self):
        self.extractor = Extractor()
        self.sqs = boto3.client('sqs')
        self.queue_url = 'https://sqs.ap-southeast-1.amazonaws.com/841662669278/crawler'
        self.s3 = boto3.client('s3')
        self.dynamodb = boto3.resource('dynamodb')
        self.bloom_filter = MyBloomFilter(self.dynamodb.Table('link'))

    def process(self):
        while True:
            ret = self.sqs.receive_message(
                QueueUrl=self.queue_url,
                MaxNumberOfMessages=10,
                WaitTimeSeconds=1
            )
            if 'Messages' not in ret:
                continue
            for msg in ret['Messages']:
                key = msg['Body']
                record = self.s3.get_object(Bucket='samuel-html', Key=key)
                # record['Body'] is a botocore.response.StreamingBody
                pack = json.loads(lzo.decompress(record['Body'].read()).decode('utf-8'))
                # response = self.client.delete_message(
                #     QueueUrl=self.queue_url,
                #     ReceiptHandle=msg['ReceiptHandle']
                # )
                # print(response)
                self.bloom_filter.add(pack['url'])
                if pack.get('code') == 200:
                    url = pack['url']
                    ret = self.extractor.extract(pack)
                    for link in ret['links']:
                        if not self.bloom_filter.add(link['url']):
                            seed(link)
                        else:
                            # print 'already crawled', link['url']
                            pass
                    # Save pack to tbl_link.
                    self.dynamodb.Table('link').put_item(
                        Item={
                            'url': url,
                            'ctime': Decimal(str(time.time())),
                            'utime': Decimal(str(time.time()))
                        }
                    )
                    logger.info("%s ok" % (pack['url']))
                else:
                    logger.warn("%s not ok code:%d" % (pack['url'], pack.get('code')))
                response = self.sqs.delete_message(
                    QueueUrl=self.queue_url,
                    ReceiptHandle=msg['ReceiptHandle']
                )
Example 7: crawl_school_programs
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def crawl_school_programs(data):
    programs = []
    for program in data:
        pprint(program)
        if program.has_key('text'):
            programs.append(program)
            continue
        url = program['url']
        print 'requesting url %s ...' % url
        r = requests.get(url, verify=False)
        if r.status_code == 200:
            html = r.text
            extractor = Extractor()
            text = extractor.extract(html)
            if len(text.strip()) != 0:
                program['text'] = text
        else:
            print 'Error code'
        programs.append(program)
    return programs
Example 8: spider
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def spider(self, root, pages=True, subcategories=True, action="traverse", preclean=False, depth=1):
    if preclean: self.graphdb.clear()
    seen_key = "URL_SEEN"
    queue_key = "URL_QUEUE"
    ex = Extractor()
    batch = neo4j.WriteBatch(self.graphdb)
    queue_empty = lambda: self.fdb.scard(queue_key) == 0
    seen = lambda x: self.fdb.sismember(seen_key, x)
    visit = lambda x: self.fdb.sadd(seen_key, x)
    dequeue = lambda: self.fdb.spop(queue_key)
    enqueue = lambda x: self.fdb.sadd(queue_key, self._encode_str(x))

    if action == "traverse":
        enqueue(root)
        while not queue_empty():
            current = dequeue()
            print current
            if current and current.strip() and not seen(current):
                visit(current)
                result = ex.getAllFromCategory(current)
                self.updateBatch(batch, type=neo4j.Node, node={'name': current, 'class': self.CATEGORY})
                if pages:
                    for page in result['pages']:
                        print "{0}\tp:{1}".format(current[:15], page)
                        self.incr_rel(page, current, self.CATEGORY_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': page, 'class': self.ARTICLE})
                        links = ex.getWikiLinks(page)
                        for a in links:
                            print "{0}\tp:{1}\t{2}".format(current[:15], page, a)
                            self.incr_rel(a, page, self.SIBLING_REL)
                            self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
                if subcategories:
                    for subcat in result['categories']:
                        print "{0}\tc:{1}".format(current, subcat)
                        self.incr_rel(subcat, current, self.SUBCAT_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': subcat, 'class': self.CATEGORY})
                        enqueue(subcat)
    elif action == "crawl":
        enqueue(root)
        while not queue_empty():
            topic = dequeue()
            if topic and topic.strip() and not seen(topic):
                visit(topic)
                result = ex.extract(topic)
                depth -= 1
                self.updateBatch(batch, type=neo4j.Node, node={'name': topic, 'class': result['type']})
                if result['type'] == self.CATEGORY:
                    pass
                elif result['type'] == self.ARTICLE:
                    for a in result['links']:
                        self.incr_rel(a, topic, self.SIBLING_REL)
                        print "adding: ", a
                        self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.ARTICLE})
                        if depth > 0: enqueue(a)
                    for c in result['categories']:
                        self.incr_rel(a, topic, self.CATEGORY_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': c, 'class': self.CATEGORY})
                elif result['type'] == self.DISAMBIGUATION:
                    for a in result['links']:
                        self.incr_rel(a, topic, self.DISAMB_REL)
                        self.updateBatch(batch, type=neo4j.Node, node={'name': a, 'class': self.DISAMBIGUATION})

    print "FINISHED WITH THE NODES..."
    for k in self.fdb.smembers(self.rel_key):
        print "REL:", k
        try:
            nodes = k.split(":", 2)
            rel = nodes[0]
            n1 = self.node_index.get('name', nodes[1])[0]
            n2 = self.node_index.get('name', nodes[2])[0]
            self.updateBatch(batch, type=neo4j.Relationship, rel={'node1': n1, 'rel': rel, 'weight': 1, 'node2': n2})
        except Exception as e:
            print "REL EXCEPTION: ", e
    print "DONE>>>>>>>>>>>>>>>"
Example 9: extract
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
def extract():
    extractor = Extractor()
    extractor.extract()
Example 10: MainFrame
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class MainFrame(Frame):
    def __init__(self, parent):
        Frame.__init__(self, parent)
        self.parent = parent
        self.music_root = ''
        self.query_path = ''
        self.extractor = Extractor(n_frames=40,
                                   n_blocks=100,
                                   learning_rate=0.00053,
                                   verbose=True)
        self.style = Style()
        self.style.theme_use("default")

        padx = 2
        pady = 2
        root_select_button = Button(self, text="Select a directory")
        root_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        root_select_button.bind("<Button-1>", self.set_music_root)
        analyze_button = Button(self, text="Analyze")
        analyze_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        analyze_button.bind("<Button-1>", self.analyze)
        query_select_button = Button(self, text="Select a file")
        query_select_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        query_select_button.bind("<Button-1>", self.set_query_path)
        search_button = Button(self, text="Search similar songs")
        search_button.pack(fill=tkinter.X, padx=padx, pady=pady)
        search_button.bind("<Button-1>", self.search_music)
        self.pack(fill=BOTH, expand=1)

    def set_music_root(self, event):
        self.music_root = filedialog.askdirectory()

    def analyze(self, event):
        if (self.music_root == ''):
            # TODO show error dialog
            print("Set a music directory first")
            return
        print("Analyzing")
        path_feature_map, error = self.extractor.extract(self.music_root)
        print("Saving")
        filename = os.path.basename(self.music_root)
        jsonpath = os.path.join(jsondir, '{}.json'.format(filename))
        dump_json(path_feature_map, jsonpath)

    def set_query_path(self, event):
        self.query_path = filedialog.askopenfilename(initialdir=self.music_root)

    def search_music(self, event):
        if (self.query_path == ''):
            # TODO show error dialog
            print("Set a music file first")
            return
        k_nearest = search(self.query_path)
        music_list = MusicList(self)
        for path, vector in k_nearest:
            music_list.append(path)
Example 11: tqdm
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
pbar = tqdm(total=len(data.data))
for video in data.data:
    # Get the path to the sequence for this video.
    path = os.path.join('data', 'sequences', video[2] + '-' + str(seq_length) + \
        '-features')  # numpy will auto-append .npy

    # Check if we already have it.
    if os.path.isfile(path + '.npy'):
        pbar.update(1)
        continue

    # Get the frames for this video.
    frames = data.get_frames_for_sample(video)

    # Now downsample to just the ones we need.
    frames = data.rescale_list(frames, seq_length)

    # Now loop through and extract features to build the sequence.
    sequence = []
    for image in frames:
        features = model.extract(image)
        sequence.append(features)

    # Save the sequence.
    np.save(path, sequence)
    pbar.update(1)

pbar.close()
Example 12: __init__
# Required import: from extractor import Extractor [as alias]
# Or: from extractor.Extractor import extract [as alias]
class Crawler:
    """
    Main class for this dummy crawler
    """
    def __init__(self, dbfile):
        self.dbfile = dbfile
        self.data = None
        self.school_collection = None
        self.extractor = Extractor()

    def load(self):
        if self.data != None:
            print 'You have unsaved in-memory data, cannot load new data'
            exit(1)
        with open(self.dbfile, 'r') as f:
            self.data = json.load(f)
        self.school_collection = SchoolCollection(self.data['schools'])
        print 'Loaded %s json file, got %d schools' % (self.dbfile,
            self.school_collection.get_num_schools())

    def dump(self):
        if self.data == None:
            print 'Nothing to dump'
            exit(1)
        self.data = self.school_collection.toJSON()
        with open(self.dbfile, 'w') as f:
            json.dump(self.data, f)
        print 'Dumped %s json file' % self.dbfile

    def fetch(self, url):
        """
        Entrance for all kinds of HTTP requests
        """
        is_ok, html = False, None
        try:
            response = requests.get(url, verify=False)
            if response.status_code == 200:
                html = response.text
                is_ok = True
            else:
                print >>sys.stderr, 'Error fetch'
        finally:
            return is_ok, html

    def fetch_program_text(self, url):
        """
        Just read the content from the url, keeping <p> text only.
        I think this is the best heuristic method.
        """
        is_ok, html = self.fetch(url)
        html = html.strip()
        text = self.extractor.extract(html)
        return is_ok, text

    # important public API
    def add_program(self, school_name, data, fetch_text=True, override_program=False):
        """
        Try to add a program to the program list.
        Currently I don't care about the return value.
        """
        if self.school_collection.is_school_exists(school_name) == False:
            print >>sys.stderr, "Should add school '%s' first" % school_name
            return None
        school = self.school_collection.find_school(school_name)
        if school.is_program_exists(data['name']):
            if override_program == False:
                return None
        prog = Program(data)
        if fetch_text:
            is_ok, text = self.fetch_program_text(prog.url)
            if is_ok:
                prog.text = text
        pprint(prog.toJSON())
        school.insert_program(prog)
        return None