

Python pyquery.pyq Function Code Examples

This article collects typical usage examples of the pyquery.pyq function in Python. If you are wondering what pyq does, how to call it, or what real-world pyq code looks like, the curated function examples below should help.


Fifteen pyq code examples are shown below, sorted by popularity by default. You can upvote the ones you like or find useful; your ratings help the system recommend better Python code examples.
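
For context, `pyq` in these examples is the conventional import alias for the `PyQuery` class (`from pyquery import PyQuery as pyq`, as Example 8 spells out). A minimal sketch of the three construction styles the examples rely on, using throwaway HTML:

from pyquery import PyQuery as pyq

# from an HTML string (Examples 1, 3, 5, ...)
doc = pyq("<div><a href='/p/1' title='demo'>link</a></div>")
print(doc('a').attr('href'))      # /p/1

# from a URL -- pyquery fetches the page itself (Examples 4 and 10)
# doc = pyq(url='http://jandan.net/ooxx/page-1')

# from a local file (Example 2)
# doc = pyq(filename='html.txt')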

Example 1: _fetch_user

    def _fetch_user(self, url):
        try:
            html = urllib2.urlopen(url + "?tab=repositories").read()
        except urllib2.HTTPError as e:
            if e.code == 429:
                print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
                eventlet.sleep(self._too_many_request_sleep)
                return self._fetch_user(url)
            raise e
        jq = pyq(html)

        data = {}
        data['url'] = url
        data['name'] = jq(".vcard-fullname").text()
        data['avatar'] = jq(".avatar").attr("src")
        data['location'] = jq("[aria-label='Home location']").attr("title")
        data['email'] = jq("[aria-label=Email] a").text()
        data['website'] = jq("[aria-label='Blog or website'] a").text()
        data['join'] = jq(".join-date").attr("datetime")
        data['followers'] = jq(".vcard-stat-count:eq(0)").text()
        data['starred'] = jq(".vcard-stat-count:eq(1)").text()
        data['following'] = jq(".vcard-stat-count:eq(2)").text()

        data['repositories'] = {}
        sources = jq(".repo-list-item.source")
        data['repositories']['source_count'] = len(sources)
        data['repositories']["source_lang"] = {}
        for i in sources:
            lang = pyq(i).find("[itemprop='programmingLanguage']").text()
            data['repositories']["source_lang"].setdefault(lang, 0)
            data['repositories']["source_lang"][lang] += 1

        return data
Developer: kinsen, Project: found_giter, Lines: 33, Source: startup.py
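
Note the `pyq(i)` re-wrapping inside the loop above: iterating over a PyQuery result yields raw lxml elements, which lack the jQuery-style `.find()`/`.text()`/`.attr()` API until wrapped again. A minimal sketch of the idiom, with `collections.Counter` shown as an alternative to the `setdefault` counting (illustrative markup, not GitHub's):

from collections import Counter
from pyquery import PyQuery as pyq

doc = pyq("<ul><li lang='Python'>a</li><li lang='Python'>b</li><li lang='C'>c</li></ul>")
langs = Counter(pyq(li).attr('lang') for li in doc('li'))  # re-wrap each lxml element
print(langs)  # Counter({'Python': 2, 'C': 1})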

Example 2: main

def main():
    doc = pyq(filename='html.txt')
    doc1 = doc('div')
    doc2 = doc1('a')
    # print(doc2)
    TieBaDate = {}

    try:
        f = open('source.txt', 'w')
    except IOError:
        print("Error: open file failed.")
        return  # without this, f would be unbound below
    iSum = 0
    for i in doc2:
        tmphref = pyq(i).attr('href')
        tmptitle = pyq(i).attr('title')
        strhref = repr(tmphref)
        strtitle = repr(tmptitle)
        aryhref = re.findall(r'/p/(\d+)', strhref)

        if aryhref and re.findall(r'(.*?)魔枪(.*?)', strtitle):
            # print(strtitle)
            # print(strhref)
            strsource = 'http://tieba.baidu.com/p/%s' % aryhref[0]
            f.write(strsource)
            f.write("\n")
            iSum += 1
            AnalyHtml(url=strsource, filePath='')
            break

    print('sum :', iSum)
    f.close()
Developer: wangmzuser, Project: Python_PyQuery_, Lines: 31, Source: PyQuery.py

Example 3: _fetch_query

    def _fetch_query(self, url, page=0):
        print "-" * 10, " Fetch Page %s " % (page + 1), "-" * 10
        print url

        try:
            html = urllib2.urlopen(url).read()
        except urllib2.HTTPError as e:
            if e.code == 429:
                print "#" * 10, " 429 Too many request.Sleep %s seconds. " % self._too_many_request_sleep, "#" * 10
                eventlet.sleep(self._too_many_request_sleep)
                return self._fetch_query(url, page)
            raise e
        jq = pyq(html)

        urls = []

        user_list = jq(".user-list-item")
        for i in user_list:
            name = pyq(i).find(".user-list-info a")
            href = self._domain + name.attr("href")

            urls.append(href)

        users = []
        for user in pool.imap(self._fetch_user, urls):
            users.append(user)

        if page == 0:
            max_page_index = jq(".next_page").prev("a").text()
            users.extend(self._fetch_query_by_page(url, int(max_page_index)))

        return users
Developer: kinsen, Project: found_giter, Lines: 32, Source: startup.py

Example 4: get_jiandan_mm_pic

def get_jiandan_mm_pic(page_num):
    url = 'http://jandan.net/ooxx/page-' + str(page_num)
    html = pyq(url)
    print('reading ...  http://jandan.net/ooxx/page-{0}\n'.format(page_num))
    sys.stdout.flush()
    #print(html)

    hash_pic_message = {}
    # collect the image URLs
    for element in html('li div div.row div.text'):
        img = pyq(element).find('img')
        #img = pyq(element)('img')
        if img:  # find() returns a (possibly empty) PyQuery, never None
            id = pyq(element)('span a').text()
            #id = id.replace("vote-","")
            hash_pic_message[id]={}
            hash_pic_message[id]['ID']=id
            hash_pic_message[id]['URL']=[]
            hash_pic_message[id]['FileName']=[]

            if img.attr('org_src') is None:
                for t in img:
                    url = img(t).attr('src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))
            else:
                for t in img:
                    url = img(t).attr('org_src')
                    hash_pic_message[id]['URL'].append(url)
                    hash_pic_message[id]['FileName'].append(get_file_name2(url))

    # collect the image IDs and vote counts
    for element in html('li div div.row div.jandan-vote'):
        id = pyq(element)('a').attr('data-id')
        #id = id.replace("vote-","")

        vote = pyq(element).text()

        reg_vote = r'OO \[ (\d.*) \] XX \[ (\d.*) \]'
        pattern = re.compile(reg_vote)
        result = pattern.findall(vote)
        if result:  # findall() returns a list, never None
            support = result[0][0]
            unsupport = result[0][1]
            hash_pic_message[id]["Support"] = support
            hash_pic_message[id]["UnSupport"] = unsupport

            if unsupport != "0":
                scale = float(support) / float(unsupport)
            else:
                scale = 0.0
            rank = get_scale(scale)
            hash_pic_message[id]["Scale"] = scale
            hash_pic_message[id]["Rank"] = rank


    return hash_pic_message.values()
Developer: nivrrex, Project: JianDanMM, Lines: 60, Source: jiandan.mm.py
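
Example 4 leans on the fact that `.attr()` returns `None` when an attribute is absent, which is how it tells lazy-loaded images (`org_src`) from plain ones (`src`). A minimal sketch of that behavior on made-up markup:

from pyquery import PyQuery as pyq

img = pyq('<div><img src="a.jpg"/></div>')('img')
print(img.attr('src'))      # a.jpg
print(img.attr('org_src'))  # None -- absent attributes yield None, not ''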

Example 5: run

	def run(self):
		headers = {'connection': 'close'}
		response = requests.get(self.url, headers=headers)
		response.encoding = 'utf-8'
		column_jq = pyq(response.text)
		column = column_jq('title').text()

		parsed_body = html.fromstring(response.text)
		song_urls = parsed_body.xpath('//a[contains(@href, "/play/")]/@href')
		new_lyrics = []

		for song_url in song_urls:
			full_url = urlparse.urljoin("http://www.9ku.com", song_url)   # base_url ahead
			r = requests.get(full_url, headers=headers)
			r.encoding = 'utf-8'   # refer to test/get_chinese.py
			jq = pyq(r.text)
			# get title, author in song page
			brief = jq('h2#play_title').text()
			title = brief.split(' ')[1]
			author = brief.split(' ')[3]
			# two types of song pages
			if jq('div.lrcBox').text():
				content = jq('div.lrcBox').text()
			else:
				out_url = jq('h2#play_title').find('a').eq(2).attr('href')
				r_out = requests.get(out_url, headers=headers)
				r_out.encoding = 'utf-8'   # maybe don't need
				jq_out = pyq(r_out.text)
				content = jq_out('div.ciInfo').eq(0).text()

			new_lyric = Lyric2(column=column, title=title, author=author,
				content=content)
			new_lyric.save()

			print 'get data from %s at %s' % (full_url, time.ctime())
Developer: tuner24, Project: practice, Lines: 35, Source: crawler21.py

Example 6: dir_parse

	def dir_parse(self, page, spider_list, result_list):
		print page
		doc = pyq(page)
		tmp = doc('table[class=tableList]')
		trl = tmp('tr')
		for v in trl:
			td = pyq(v)('td[class=title]')
			a = td('a')
			name = a.text()
			ename = ""
			print name
			if len(name) > 1:
				for uchar in name:
					if is_alphabet(uchar):
						ename += uchar
					#elif uchar == '.' or uchar == ' ' or uchar == '&':
						#ename += uchar
					elif (uchar == '(' or is_number(uchar)) and len(ename) > 2:
						break
				print "ename:", ename

				link = "http://banyungong.net/" + a.attr('href')
				result_list.append((ename.lower() + "," + link).encode("UTF-8"))

		return ""
Developer: zeuswang, Project: code, Lines: 27, Source: banyungong.py

Example 7: getPages

	def getPages(self):
		dirs=[]
		#doc=pyq(self.url)
		#while doc("div:contains('Browse Problems')+div+table img").attr('alt')=="FOLDER" and (None in dirs[p].values()):
			#dirs[p].update(dict.fromkeys([self.root+'/'+a.attr('href') for a in doc("div:contains('Browse Problems')+div+table a")]))
			#for d,c in dirs[p].items():
		dirs.append(self.url)
		while dirs:
			curdir=dirs.pop()
			try:
				doc=pyq(curdir)
			except (httplib.IncompleteRead, urllib2.URLError):
				print "read failed, retrying with HTTP/1.0"
				httplib.HTTPConnection._http_vsn = 10
				httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
				doc = pyq(curdir)
				#httplib.HTTPConnection._http_vsn = 11
				#httplib.HTTPConnection._http_vsn_str = 'HTTP/1.1'
			if doc("div:contains('Browse Problems')+div+table img").attr('alt')=="FOLDER":
				print "[folder]",curdir
				links=doc("div:contains('Browse Problems')+div+table a")
				for a in links:
					dirs.append(self.root+'/'+pyq(a).attr('href'))
			else:
				print "[file]",curdir
				self.pages.append(curdir)
Developer: jiffies, Project: AcmSpider, Lines: 26, Source: uva.py

Example 8: getHtmlByPyquery

def getHtmlByPyquery(tUrl):
    posts = []
    from pyquery import PyQuery as pyq
    r = requests.get(tUrl)
    doc=pyq(r.text)
    lis = doc(".car-monthlisting li a")
    lis = lis[0:100]
    lis.reverse()
    i = 1
    for li in lis:
        link = pyq(li).attr("href")
        title = pyq(li).text()
        print "fetching article %s (%s, link: %s)" % (i, title, link)
        ir = requests.get(link)
        idoc = pyq(ir.text)
        content = idoc("#content .entrybody").remove(".wumii-hook").remove("script").remove("ins").remove(".ds-thread").remove("#ds-ssr").remove("div").remove("#comments").html()
        content = content.replace('"', '""')  # double embedded quotes, CSV-style
        #print content
        post = Post()
        post.category = urllib.quote("notes") + ":段子"
        post.post_author = "geekzone"
        post.post_title = title
        post.post_content = '"' + content + '"'
        posts.append(post)
        i += 1
    return posts
Developer: geekzone, Project: joylife, Lines: 26, Source: jiongma_spider.py
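
The long `.remove(...)` chain above works because `PyQuery.remove(selector)` deletes matching descendants and returns the selection, so the calls compose before the final `.html()`. A minimal sketch on made-up markup:

from pyquery import PyQuery as pyq

doc = pyq('<div><p>keep</p><script>x()</script><ins>ad</ins></div>')
print(doc.remove('script').remove('ins').html())  # <p>keep</p>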

Example 9: exportText

def exportText(section, idx, link):
    #    url = "http://book.kanunu.org/book3/6630/115916.html"
    #    req = urllib2.Request(url)
    #    response = urllib2.urlopen(req).read()
    fileName = section + "/" + idx + ".html"
    textFile = open(fileName)
    mainHtml = textFile.read()
    textFile.close()
    html = unicode(mainHtml, "GBK")
    doc = pyq(html)
    tables = doc.find("table")
    a = []
    for table in tables:
        a.append(len(pyq(table).text()))
    mx = max(a)
    textIdx = a.index(mx)
    titleIdx = textIdx - 1
    mainText = pyq(tables[textIdx]).find("p").html()
    #    afterTitle = mainText.index(r"<br/>")
    #    mainTitle = mainText[0:afterTitle].replace(u" ", "").replace(u"】", "").replace(u"【", "").strip().encode("UTF-8")
    #    mainTitle = pyq(tables[titleIdx]).text().replace(u"上部 ", "").replace(u"中部 ", "").replace(u"下部 ", "").encode("UTF-8")
    mainTitle = pyq(tables[titleIdx]).text().encode("UTF-8")
    outFile = open("Text/" + section + "/" + idx + ".xhtml", "w")
    outFile.write("<h1>" + mainTitle + "<h1/>")
    # outFile.write("<p>")
    outFile.write(mainText.encode("UTF-8"))
    # outFile.write("<p/>")
    outFile.write("<p><br/>" + link + "<br/><p/>")
    outFile.close()
    titleList.append(mainTitle)
    return mainTitle
Developer: richard-liang, Project: threebody, Lines: 31, Source: book.py

Example 10: main

def main():
    url = 'http://taiwan.huanqiu.com/news/'
    #url = 'http://world.huanqiu.com/observation/'
    #url = 'http://china.huanqiu.com/politics/'
    doc = pyq(url=url)
    alist = doc('.pad20 li a')
    for a in alist:
        link = pyq(a).attr('href')
        get_info(link)
Developer: FashtimeDotCom, Project: migrant, Lines: 9, Source: importnews.py

Example 11: get_proxy_list

    def get_proxy_list(self, page_range=15):
        __all_proxy_list = []
        for __page in range(page_range):
            __url = 'http://proxylist.hidemyass.com/%s#listable' % __page
            __request = urllib2.Request(__url, headers=self.__headers)
            __response = urllib2.urlopen(__request)
            __the_page = __response.read()
            doc = pyq(__the_page)

            for __list_idx in doc('#listable tbody>tr')[:]:
                __tmp = doc(__list_idx).outerHtml()
                p = pyq(__tmp)
                for __j in p('style').text().split('\n'):
                    if __j.find('display:none') > 0:
                        p.remove(__j.split('{')[0])

                p.remove('style')

                for __j in p('span,div'):
                    if p(__j).attr('style') == 'display:none':
                        p(__j).remove()

                __proxy = {'last_update' : p('td').eq(0).text(),
                           'ip_address' : p('td').eq(1).text().replace(' ',''),
                           'port' : p('td').eq(2).text(),
                           'country' : p('td').eq(3).text(),
                           'countryIsoCode' : p('td').eq(3).attr('rel'),
                           'type': p('td').eq(6).text(),
                           'anon' : p('td').eq(7).text(),
                           'speed': ''.join( re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]) ),
                           'connection_time': ''.join( re.findall(u'\d', p('td').eq(4)('.indicator').attr('style').split(';')[0]) )
                           }
                print __proxy
                __all_proxy_list.append(__proxy)

        pickle.dump(__all_proxy_list, open('free_proxy_list', 'wb'))
        __all_proxy_list = pickle.load(open('free_proxy_list', 'rb'))

        # tally proxies per country
        all_count_cnt = {}
        for __i in __all_proxy_list:
            if all_count_cnt.has_key(__i['country']):
                all_count_cnt[__i['country']] = all_count_cnt[__i['country']] + 1
            else:
                all_count_cnt[__i['country']] = 1

        print all_count_cnt

        # tally proxies per ISO country code
        all_count_cnt = {}
        for __i in __all_proxy_list:
            if all_count_cnt.has_key(__i['countryIsoCode']):
                all_count_cnt[__i['countryIsoCode']] = all_count_cnt[__i['countryIsoCode']] + 1
            else:
                all_count_cnt[__i['countryIsoCode']] = 1

        print all_count_cnt
        return __all_proxy_list
Developer: ShaneKao, Project: NBA_bet_odds, Lines: 56, Source: get_proxy.py

Example 12: _parse_data

    def _parse_data(self,pyq_node,k,data,debug):
        keymap =[]
        path = data['path']
        pathlist = path.split(',')
        node = pyq_node
        for p in pathlist:
            if 'attr@' in p:
                attr = p[5:]  # everything after the 'attr@' prefix
                value = node.attr(attr)
                return value
            elif 'text' == p:
                if node.text() != None:
                    value = node.text().encode("utf-8")
                else:
                    value = None
                return value
            elif '#' in p:
                pl = p.split('#')
                #print pl[0], pl[1]
                node = node(pl[0].encode("utf-8")).eq(int(pl[1]))
                if node is not None:
                    node = pyq(node)
                else:
                    return None
            else:
                node = node(p.encode("utf-8"))
                if node is not None:
                    #node = pyq(node)(p)
                    node = pyq(node)
                else:
                    return None
            if debug:
                print "DEBUG, p:", p
                print node

        
#        for key in data:
#            if key != 'path':
#                keymap[k]=[]
#                break;
        if len(node) > 0:
            if debug:
                print "DEBUG",k
                print node
            for d in node:
                
                submap ={}
                for key in data:
                    if key != 'path':
                        res = self._parse_data(pyq(d),key,data[key],debug)
                        submap[key] = res
                keymap.append(submap)
            
        return keymap
Developer: ylbupt, Project: movie, Lines: 55, Source: parse.py
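
Example 12 implements a tiny path mini-language: comma-separated steps, where `sel` selects descendants, `sel#n` takes match `n`, `text` extracts text content, and `attr@name` (reconstructed above from `p[5:]`) extracts an attribute. A hypothetical spec it could consume (key names are illustrative only):

# each non-'path' key recurses with its own sub-path, one dict per matched node
spec = {
    'path': 'div.item#0,li',          # first div.item, then its <li> children
    'title': {'path': 'a,text'},      # per <li>: text of its <a>
    'link': {'path': 'a,attr@href'},  # per <li>: href of its <a>
}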

Example 13: parse

    def parse(self, url):
        # parse the first page of the product list
        res = requests.get(url)
        assert res.status_code == 200
        jq = pyq(res.content)

        goods_list = jq('.list-container>ul>li>a')
        for r in goods_list:
            goods_url = r.get('href')
            if not goods_url:
                continue
            goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
            goods_name = r.get('title')
            #  print goods_url, goods_name

            goods_item = {
                'url' : goods_url,
                'name' : goods_name,
            }
            self.goods_list.append(goods_item)

        # parse the AJAX-loaded product list pages (pages 2 through n)
        next_page = jq('#infiload_nav>a')
        if next_page:
            next_page = next_page[0]
            max_page = int(next_page.get('data-maxpage'))
            next_url = next_page.get('href')
            np = re.findall(r'page=(\d+)', next_url)
            if not np:
                return
            np = int(np[0])
            while np <= max_page:
                next_url = re.sub(r'page=(\d+)', 'page=%s' % np, next_url)
                np += 1
                res = requests.get('%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, next_url))
                assert res.status_code == 200
                jq_page = pyq(res.content)
                goods_list = jq_page('li>a')
                if not goods_list:
                    # done parsing
                    break
                for r in goods_list:
                    goods_url = r.get('href')
                    if not goods_url:
                        continue
                    goods_url = '%s%s' % (CosstoresGoodsListPrase.COSSTORES_HOST, goods_url)
                    goods_name = r.get('title')
                    goods_item = {
                        'url' : goods_url,
                        'name' : goods_name,
                    }
                    self.goods_list.append(goods_item)
Developer: Finalcheat, Project: Cosstores, Lines: 52, Source: cosstores.py

Example 14: get_betting_odds_info_list

    def get_betting_odds_info_list(self):
        h = requests.get(self.url, timeout = self.timeout) #, proxies = self.proxies
        text = h.content
        pq = pyq(text)
        betting_odds_info_list = []
        startdate_html = pq('.event-holder.holder-scheduled>.eventLine.status-scheduled')
        url_html = pyq(startdate_html)('meta[itemprop=\'url\']')
        matchup_html = pyq(startdate_html)('meta[itemprop=\'name\']')
        for i in range(len(startdate_html)):
            betting_odds_info_list.append({'start_time': startdate_html.eq(i).attr('rel'),
                                           'url': url_html.eq(i).attr('content'),
                                           'away_team': matchup_html.eq(i).attr('content').split(' vs ')[0],
                                           'home_team': matchup_html.eq(i).attr('content').split(' vs ')[1]})

        return betting_odds_info_list
Developer: ShaneKao, Project: NBA_bet_odds, Lines: 15, Source: nba.py

Example 15: parseThread

	def parseThread(self, response):
		url = response.url.replace('http://bbs', 'http://www')
		reply = []
		for floor in response.css('div.tpc_content').extract():
			reply.append(pyq(floor).text())

		self.collection.update({"url": response.url}, {'$set': {"reply": reply}}, True)
Developer: PillowSky, Project: scrapy-bots, Lines: 7, Source: bbs.py
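
Example 15 is the smallest useful pattern here: wrap an extracted HTML fragment just to flatten it to plain text. A minimal sketch:

from pyquery import PyQuery as pyq

fragment = '<div class="tpc_content">quoted <b>reply</b> text</div>'
print(pyq(fragment).text())  # quoted reply text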


Note: The pyquery.pyq function examples in this article were compiled from open-source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by various developers. Copyright of the source code remains with the original authors; please consult each project's license before distributing or reusing the code, and do not reproduce this compilation without permission.