本文整理匯總了Golang中git/oschina/net/ciweilao/game_spider/git/page.Page類的典型用法代碼示例。如果您正苦於以下問題:Golang Page類的具體用法?Golang Page怎麼用?Golang Page使用的例子?那麼,這裏精選的類代碼示例或許可以為您提供幫助。
在下文中一共展示了Page類的7個代碼示例,這些例子默認根據受歡迎程度排序。您可以為喜歡或者感覺有用的代碼點讚,您的評價將有助於系統推薦出更棒的Golang代碼示例。
示例1: downloadHtml
// downloadHtml downloads the resource described by req through downloadFile
// and, if the returned page reports IsSucc, stores the downloaded text as the
// page body. The page returned by downloadFile is handed back either way.
func (this *HttpDownLoader) downloadHtml(p *page.Page, req *page.Request) *page.Page {
	result, html := this.downloadFile(p, req)
	if result.IsSucc() {
		result.SetBody(html)
	}
	return result
}
示例2: downloadFile
// downloadFile performs the HTTP request described by req and returns p
// together with the response body as a string.
//
// A request whose method is "POST" is sent with http.Post as
// application/x-www-form-urlencoded using req.GetPostData(); any other method
// is sent with http.Get. On success the response headers and cookies are
// copied onto p.
//
// On failure (empty URL, transport error, or an error while reading the body)
// the error is logged, p.SetStatus(true, msg) is called and an empty body is
// returned. NOTE(review): SetStatus(true, ...) is presumably the "failed"
// flag that IsSucc inspects; confirm against the page package.
//
// The body bytes are converted to a string as-is; no character-set conversion
// is performed here.
func (this *HttpDownLoader) downloadFile(p *page.Page, req *page.Request) (*page.Page, string) {
	urlStr := req.GetUrl()
	if len(urlStr) == 0 {
		logs.GetFirstLogger().Error("url is empty")
		p.SetStatus(true, "url is empty")
		return p, ""
	}

	var (
		httpResp *http.Response
		err      error
	)
	if req.GetMethod() == "POST" {
		httpResp, err = http.Post(urlStr, "application/x-www-form-urlencoded", strings.NewReader(req.GetPostData()))
	} else {
		httpResp, err = http.Get(urlStr)
	}
	if err != nil {
		logs.GetFirstLogger().Error("http visit error :" + err.Error())
		p.SetStatus(true, err.Error())
		// httpResp is not usable when err is non-nil; stop here instead of
		// dereferencing it below.
		return p, ""
	}
	// Register the close before reading so the body is released on every
	// return path.
	defer httpResp.Body.Close()

	p.SetHeader(httpResp.Header)
	p.SetCookies(httpResp.Cookies())

	body, err := ioutil.ReadAll(httpResp.Body)
	if err != nil {
		// A truncated body would otherwise be parsed as if it were complete.
		logs.GetFirstLogger().Error("http read body error :" + err.Error())
		p.SetStatus(true, err.Error())
		return p, ""
	}
	return p, string(body)
}
示例3: parseNewsLinkListInfo
// parseNewsLinkListInfo scans content for anchor tags and registers each
// link target on p as a new URL tagged "list".
//
// Nothing is done when p.IsBreak() is already true. For every regex match the
// text between `href=` (plus the opening quote) and the character before the
// first `>` is taken as the link; links containing "http://" are skipped, the
// rest are resolved against the current request URL with util.GetRealUrl.
func (this *YouxiduoProcesser) parseNewsLinkListInfo(content string, p *page.Page) *page.Page {
	if p.IsBreak() {
		return p
	}

	anchorRe := regexp.MustCompile(`<a href(.)*<\/a>`)
	for _, anchor := range anchorRe.FindAllString(content, -1) {
		// Skip `href=` and the opening quote (6 bytes); stop before the
		// closing quote that precedes the first '>'.
		hrefStart := strings.Index(anchor, "href=") + 6
		hrefEnd := strings.Index(anchor, ">") - 1
		if hrefEnd <= hrefStart {
			continue
		}

		link := anchor[hrefStart:hrefEnd]
		if strings.Contains(link, "http://") {
			continue
		}

		resolved := util.GetRealUrl(p.GetRequest().GetUrl(), link)
		p.AddNewUrl(resolved, "list")
	}
	return p
}
示例4: parseNewsBreifInfo
// firstQuotedValue returns the text between the first pair of double quotes
// in s, or "" when s contains fewer than two double quotes.
func firstQuotedValue(s string) string {
	open := strings.Index(s, "\"")
	if open < 0 {
		return ""
	}
	rest := s[open+1:]
	end := strings.Index(rest, "\"")
	if end < 0 {
		return ""
	}
	return rest[:end]
}

// parseNewsBreifInfo walks the <li>...</li> entries found in content and, for
// each one, builds a page item holding "time", "title", "pic", "info" and
// "detailurl", keyed by the detail URL. The detail URL is also registered on
// p as a new URL tagged "content".
//
// An entry whose date string compares lower than this.exitDate sets
// p.SetBreak(true) and is skipped; the remaining entries are still examined.
// An entry without a detail URL is skipped with a warning, because that URL
// serves as both the item key and the follow-up URL. A missing title or
// picture is stored as an empty string instead of panicking on the slice
// expression.
func (this *YouxiduoProcesser) parseNewsBreifInfo(content string, p *page.Page) *page.Page {
	logs.GetFirstLogger().Trace("B TEST LIST ITEMS")

	// The patterns are constant, so compile them once per call rather than
	// once per list entry.
	var (
		datedSpanRe = regexp.MustCompile(`<span>(.)*[\d]{4}-[\d]{2}-[\d]{2}`)
		dateRe      = regexp.MustCompile(`[\d]{4}-[\d]{2}-[\d]{2}`)
		titleRe     = regexp.MustCompile("title=\"(.)*\"")
		picRe       = regexp.MustCompile("<img src=(.)*alt")
		infoRe      = regexp.MustCompile("<p>(.)*</p>")
		linkSpanRe  = regexp.MustCompile("<span(.)*<a(.)*</span>")
		hrefRe      = regexp.MustCompile("href(.)*\">")
	)

	count := 1
	for {
		liStart := strings.Index(content, "<li>")
		liEnd := strings.Index(content, "</li>")
		if liStart < 0 || liEnd <= liStart {
			break
		}

		item := page.NewPageItems("")
		entry := content[liStart : liEnd+len("</li>")]
		// Consume the entry up front so every `continue` below advances.
		content = content[liEnd+len("</li>"):]

		logs.GetFirstLogger().Trace("B================>")

		timeStr := dateRe.FindString(datedSpanRe.FindString(entry))
		if this.exitDate > timeStr {
			p.SetBreak(true)
			continue
		}
		item.AddItem("time", timeStr)

		title := firstQuotedValue(titleRe.FindString(entry))
		logs.GetFirstLogger().Trace("title = " + title)
		item.AddItem("title", title)

		pic := firstQuotedValue(picRe.FindString(entry))
		if util.IsRelativePath(pic) {
			pic = util.GetRealUrl(p.GetRequest().GetUrl(), pic)
		}
		logs.GetFirstLogger().Trace("pic = " + pic)
		item.AddItem("pic", pic)

		info := infoRe.FindString(entry)
		logs.GetFirstLogger().Trace("info = " + info)
		info = strings.Replace(info, "'", "\"", -1)
		item.AddItem("info", info)

		detailurl := firstQuotedValue(hrefRe.FindString(linkSpanRe.FindString(entry)))
		logs.GetFirstLogger().Trace("detailurl = " + detailurl)
		if detailurl == "" {
			logs.GetFirstLogger().Warn("list item has no detail url, skipped")
			continue
		}
		item.AddItem("detailurl", detailurl)
		item.SetKey(detailurl)
		p.AddNewUrl(detailurl, "content")

		logs.GetFirstLogger().Trace("E================>")
		logs.GetFirstLogger().Tracef("count = %d", count)
		count++
		logs.GetFirstLogger().Warn(title)

		p.AddPageItems(item)
	}
	return p
}
示例5: Process
// Process analyses a downloaded page according to the URL tag of its request.
//
// A page for which IsSucc is false only has its error message printed. For a
// request tagged "list" the news-brief block (from `<div class="infroList">`
// through the following `<div class="pagebreak">`) is passed to
// parseNewsBreifInfo; then, unless p.IsBreak() is set, the pagination block
// (`<div class="pagebreak">` ... `<li class="lastPage">`) is passed to
// parseNewsLinkListInfo to collect further list pages. Any other tag is
// treated as an article page and passed to parseNewsDetail.
//
// If either regular expression fails to compile, the error is logged and
// processing stops instead of using the nil *regexp.Regexp.
func (this *YouxiduoProcesser) Process(p *page.Page) {
	if !p.IsSucc() {
		println(p.GetErrMsg())
		return
	}

	body := p.GetBody()
	urlTag := p.GetRequest().GetUrlTag()
	p.SetUrlTag(urlTag)

	// Anything that is not a list page is a content (article) page.
	if urlTag != "list" {
		this.parseNewsDetail(body, p)
		return
	}

	// 1. Locate the news-brief block of the list page.
	regList, err := regexp.Compile(`<div class=\"infroList\">(\s|.)*<\/ul>(\s|.)*<div class=\"pagebreak\">`)
	if err != nil {
		logs.GetFirstLogger().Error("分析頁麵出錯,正則表達式錯誤了,url = " + p.GetRequest().GetUrl())
		return
	}
	// The pattern contains literal text, so an empty result means "no match".
	if briefs := regList.FindString(body); briefs != "" {
		this.parseNewsBreifInfo(briefs, p)
	} else {
		logs.GetFirstLogger().Info("No more list items")
	}

	// 2. Look for further list pages unless the brief parser asked to stop.
	if p.IsBreak() {
		return
	}
	regPageBreak, err := regexp.Compile(`<div class=\"pagebreak\">(\s|.)+<li class=\"lastPage\">`)
	if err != nil {
		logs.GetFirstLogger().Error("分析頁麵出錯,翻頁正則表達式錯誤,url = " + p.GetRequest().GetUrl())
		return
	}
	if pager := regPageBreak.FindString(body); pager != "" {
		this.parseNewsLinkListInfo(pager, p)
	} else {
		logs.GetFirstLogger().Info("No more links")
	}
}
示例6: parseNewsDetail
func (this *YouxiduoProcesser) parseNewsDetail(content string, p *page.Page) *page.Page {
logs.GetFirstLogger().Trace("B TEST ARTICLE")
//println(content)
//tile , 不用考慮,在前麵已經獲取過了
item := page.NewPageItems(p.GetRequest().GetUrl())
reg, _ := regexp.Compile(`<div><span><em(.)*<\/span></div>`)
newssrc := reg.FindString(content)
//news_src,新聞來源
reg, _ = regexp.Compile(`<a(.)*<\/a>`)
newssrc = reg.FindString(newssrc)
newssrc = newssrc[strings.Index(newssrc, ">")+1 : len(newssrc)]
if strings.Index(newssrc, "<") >= 0 {
newssrc = newssrc[0:strings.Index(newssrc, "<")]
}
logs.GetFirstLogger().Trace("newssrc = " + newssrc)
//p.AddResultItem("news_src", newssrc)
item.AddItem("news_src", newssrc)
//news_content,新聞內容
reg, _ = regexp.Compile(`<div class=\"artCon\">(.|\s)*<\/div>(\s)*<div class=\"pagebreak\"`)
news := reg.FindString(content)
if len(news) > 0 {
pbIndex := strings.Index(news, "<div class=\"pagebreak\"")
if pbIndex > 0 {
news = news[0:pbIndex]
}
}
newsIndex1 := strings.Index(news, ">")
newsIndex2 := strings.Index(news, "</div>")
if newsIndex1 >= 0 && newsIndex2 >= 0 {
news = news[newsIndex1+1 : newsIndex2]
}
//p.AddResultItem("news_content", news)
news = strings.Replace(news, "'", "\"", -1)
news = strings.Replace(news, "'", "\"", -1)
// imgSrcIndex := strings.Index(news, "<img src=\"/")
// if imgSrcIndex >= 0 {
// news = strings.Replace(news, "<img src=\"/", "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/", -1)
// }
////////////////////
imgSrcIndex := strings.Index(news, "<img ")
if imgSrcIndex >= 0 {
news = strings.Replace(news, "<img src=\"/", "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/", -1)
news = strings.Replace(news, "<img alt=\"[^\"]\" src=\"/", "<img src=\""+util.GetUrlDomain(p.GetRequest().GetUrl())+"/", -1)
//println(news_content)
// println("===============")
reg, _ = regexp.Compile(`<img[^>]*>`)
imgList := reg.FindAllString(news, -1)
for _, img := range imgList {
//strings.Replace(news_content, img)
//println("old img ==>" + img)
newImg := img
styleIndex := strings.Index(newImg, "style=\"")
if styleIndex >= 0 {
styleStr := newImg[styleIndex+len("style=\""):]
endIndex := strings.Index(styleStr, "\"")
if endIndex > 0 {
styleStr = styleStr[0:endIndex]
}
newstyleStr := changeImgSize(styleStr)
newImg = strings.Replace(img, styleStr, newstyleStr, -1)
} else {
//找width,找height
reg2, _ := regexp.Compile(`width=\"[0-9]+\"`)
tmpWidthStr := reg2.FindString(img)
reg2, _ = regexp.Compile(`height=\"[0-9]+\"`)
tmpHeightStr := reg2.FindString(img)
//println("tmp height str = " + tmpHeightStr)
var f float32 = 1.0
if len(tmpWidthStr) > 0 {
tmpStr1 := tmpWidthStr[strings.Index(tmpWidthStr, "\"")+1:]
tmpStr1 = tmpStr1[0:strings.Index(tmpStr1, "\"")]
tmpWidth, _ := strconv.Atoi(tmpStr1)
if tmpWidth > 360 {
f = float32(tmpWidth) / 360.0
if len(tmpHeightStr) > 0 {
tmpStr2 := tmpHeightStr[strings.Index(tmpHeightStr, "\"")+1:]
tmpStr2 = tmpStr2[0:strings.Index(tmpStr2, "\"")]
tmpHeight, _ := strconv.Atoi(tmpStr2)
newImg = strings.Replace(img, tmpWidthStr, "width=\"360\"", -1)
tmpHeight = int(float32(tmpHeight) / f)
newImg = strings.Replace(newImg, tmpHeightStr, "height=\""+strconv.Itoa(tmpHeight)+"\"", -1)
} else {
newImg = strings.Replace(img, tmpWidthStr, "width=\"360\"", -1)
}
}
}
}
//.........這裏部分代碼省略.........
示例7: pageProcess
// pageProcess runs one request through the download, parse and output stages.
//
// The download is attempted up to three times, pausing one millisecond after
// each failed attempt. If the page still reports failure, the request is
// marked finished and nothing else happens. Otherwise the page processor runs,
// every new URL it collected is queued with type "html" and its tag, the
// request is marked finished, and each configured output receives the page
// items together with the request URL.
func (this *Spider) pageProcess(req *page.Request) {
	const maxAttempts = 3

	// Download stage.
	var p *page.Page
	for attempt := 0; attempt < maxAttempts; attempt++ {
		p = this.m_downLoader.DownLoad(req)
		if p.IsSucc() {
			break
		}
		time.Sleep(time.Millisecond)
	}
	if !p.IsSucc() {
		this.finishForReqProcesser(req.GetUrl())
		return
	}

	// Parse stage.
	this.m_pageProcesser.Process(p)

	// Queue the links discovered while parsing.
	if p.CountNewUrls() > 0 {
		for link, tag := range p.GetNewUrls() {
			this.AddUrl(link, "html", tag)
		}
	}
	this.finishForReqProcesser(req.GetUrl())

	// Output stage.
	for _, out := range this.m_outputs {
		out.Process(p.GetPageItemsList(), p.GetRequest().GetUrl())
	}
}