This page collects typical usage examples of the Go method github.com/yahoo/gryffin.Scan.IsDuplicatedPage. If you are unsure what Scan.IsDuplicatedPage does or how to call it, the selected examples here should help. You can also look further into the usage examples for the enclosing type, github.com/yahoo/gryffin.Scan.
The following shows 4 code examples of the Scan.IsDuplicatedPage method, sorted by popularity by default.
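Before the examples, it helps to see the pattern they all share: IsDuplicatedPage is called right after a response body has been captured, and a true result means an equivalent page was already crawled, so the current page is dropped. Below is a minimal sketch of that pattern; the process function is hypothetical, while ReadResponseBody and IsDuplicatedPage are the gryffin.Scan methods used in the examples that follow.
// Hypothetical helper illustrating the usual call pattern.
func process(s *gryffin.Scan) {
    // Capture the response body so it can be fingerprinted.
    s.ReadResponseBody()
    // Drop pages that duplicate ones already crawled in this scan.
    if s.IsDuplicatedPage() {
        return
    }
    // ... the page is unique: extract links, fuzz, etc. ...
}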
Example 1: extract
func (r *PhantomJSRenderer) extract(stdout io.ReadCloser, s *gryffin.Scan) {
    defer close(r.done)
    dec := json.NewDecoder(stdout)
    for {
        var m message
        if err := dec.Decode(&m); err != nil {
            // io.EOF means PhantomJS closed stdout; any other decode
            // error leaves the decoder unusable, so stop either way.
            return
        }
        if m.responseMessage != nil {
            // A response message carries the rendered page; stop
            // crawling if an equivalent page was already seen.
            m.Response.fill(s)
            if s.IsDuplicatedPage() {
                return
            }
            r.chanResponse <- s
            for _, link := range m.Response.Details.Links {
                if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
                    r.chanLinks <- newScan
                }
            }
        } else if m.domMessage != nil {
            // DOM messages only carry newly discovered links.
            for _, link := range m.domMessage.Links {
                if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
                    r.chanLinks <- newScan
                }
            }
        }
    }
}
Example 2: extract
func (r *PhantomJSRenderer) extract(stdout io.ReadCloser, s *gryffin.Scan) {
    defer close(r.done)
    dec := json.NewDecoder(stdout)
    for {
        var m message
        if err := dec.Decode(&m); err != nil {
            // Stop on io.EOF or on any other decode error.
            return
        }
        if m.responseMessage != nil {
            m.Response.fill(s)
            if s.IsDuplicatedPage() {
                return
            }
            r.chanResponse <- s
            r.parseDetails(&m.Response.Details, s)
        }
        if m.details != nil {
            r.parseDetails(m.details, s)
        }
    }
}
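Examples 1 and 2 differ only in how discovered links are handled: the first extracts them inline from the response and DOM messages, while the second factors that work into a parseDetails helper and also handles standalone details messages.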
Example 3: Do
func (r *PhantomJSRenderer) Do(s *gryffin.Scan) {
    r.chanResponse = make(chan *gryffin.Scan, 10)
    r.chanLinks = make(chan *gryffin.Scan, 10)

    // Construct the command:
    // render.js http(s)://<host>[:port][/path] [{"method":"post", "data":"a=1&b=2"}]
    url := s.Request.URL.String()

    cookies := make([]string, 0)
    for _, c := range s.Cookies {
        cookies = append(cookies, c.String())
    }

    // A fixed Chrome user agent is used instead of s.Request.UserAgent().
    ua := "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36"

    arg := input{
        Method: s.Request.Method,
        Headers: inputHeaders{
            UserAgent: ua,
            Cookie:    strings.Join(cookies, ";"),
        },
    }
    opt, err := json.Marshal(arg)
    if err != nil {
        s.Error("PhantomjsRenderer.Do", err)
        return
    }

    s.Logmf("PhantomjsRenderer.Do", "Running: render.js")
    cmd := exec.Command(
        os.Getenv("GOPATH")+"/src/github.com/yahoo/gryffin/renderer/resource/render.js",
        url,
        string(opt))

    stdout, err := cmd.StdoutPipe()
    if err != nil {
        s.Error("PhantomjsRenderer.Do", err)
        return
    }
    if err := cmd.Start(); err != nil {
        s.Error("PhantomjsRenderer.Do", err)
        return
    }

    kill := func(reason string) {
        if err := cmd.Process.Kill(); err != nil {
            // TODO - forgive "os: process already finished"
            s.Error("PhantomjsRenderer.Do", err)
        } else {
            s.Logmf("PhantomjsRenderer.Do", "[%s] Terminating the crawl process.", reason)
        }
    }

    // Kill the crawl process when it runs past the configured timeout.
    if r.Timeout != 0 {
        timeout := func() {
            <-time.After(time.Duration(r.Timeout) * time.Second)
            kill("Timeout")
        }
        go timeout()
    }

    crawl := func() {
        defer close(r.chanResponse)
        defer close(r.chanLinks)
        dec := json.NewDecoder(stdout)
        for {
            var m message
            if err := dec.Decode(&m); err != nil {
                // io.EOF means render.js exited; stop on any decode error.
                return
            }
            if m.responseMessage != nil {
                m.Response.fill(s)
                if s.IsDuplicatedPage() {
                    kill("Duplicated")
                    return
                }
                s.Logm("PhantomjsRenderer.Do.UniqueCrawl", m.MsgType)
                r.chanResponse <- s
                for _, link := range m.Response.Details.Links {
                    if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
                        r.chanLinks <- newScan
                    }
                }
            } else if m.domMessage != nil {
                for _, link := range m.domMessage.Links {
                    if newScan := link.toScan(s); newScan != nil && newScan.IsScanAllowed() {
                        r.chanLinks <- newScan
                    }
                }
// ... (the rest of this example is omitted) ...
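Note the stronger reaction to a duplicate here than in the earlier examples: a true IsDuplicatedPage result does not merely skip the response, it calls kill("Duplicated") to terminate the whole PhantomJS process, presumably because nothing new can come from continuing to render a page already seen.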
Example 4: Do
func (r *NoScriptRenderer) Do(s *gryffin.Scan) {
    r.chanResponse = make(chan *gryffin.Scan, 10)
    r.chanLinks = make(chan *gryffin.Scan, 10)

    crawl := func() {
        defer close(r.chanResponse)
        defer close(r.chanLinks)

        client := &http.Client{Timeout: 3 * time.Second}
        if response, err := client.Do(s.Request); err == nil {
            s.Response = response
        } else {
            s.Logm("NoScriptRenderer", fmt.Sprintf("error in issuing request: %s", err))
            return
        }

        // Skip pages whose body duplicates one already seen in this scan.
        s.ReadResponseBody()
        if s.IsDuplicatedPage() {
            return
        }

        tokenizer := html.NewTokenizer(strings.NewReader(s.ResponseBody))
        r.chanResponse <- s

        // Parse the HTML and extract links from <a href="..."> tags.
        for {
            t := tokenizer.Next()
            switch t {
            case html.ErrorToken:
                return
            case html.StartTagToken:
                token := tokenizer.Token()
                if token.DataAtom.String() == "a" {
                    for _, attr := range token.Attr {
                        if attr.Key == "href" {
                            link := s.Spawn()
                            // TODO - relative URLs are currently followed as-is;
                            // restricting to req.URL.IsAbs() would drop "#" links
                            // but would also drop real relative URLs.
                            if req, err := http.NewRequest("GET", attr.Val, nil); err == nil {
                                link.MergeRequest(req)
                                if link.IsScanAllowed() {
                                    r.chanLinks <- link
                                }
                            } else {
                                log.Printf("error in building request: %s", err)
                            }
                        }
                    }
                }
            }
        }
    }

    go crawl()
}
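Taken together, the examples show the contract these renderers follow: exactly one scan per unique page is sent on chanResponse, every allowed link is sent on chanLinks, and both channels are closed when crawling stops (EOF, timeout, or a duplicated page). A consumer therefore just ranges over the two channels until they close. The sketch below assumes accessor methods named GetRequestBody and GetLinks exposing those channels, as in gryffin's Renderer interface; treat the exact names, and the renderer package path, as assumptions.
// Consumer sketch, assuming GetRequestBody/GetLinks expose the
// renderer's chanResponse and chanLinks channels.
r := &renderer.NoScriptRenderer{}
r.Do(scan) // Do returns immediately; crawling continues in a goroutine.

// Drain links concurrently so the producer never blocks on a full
// chanLinks buffer while we wait on chanResponse.
go func() {
    for link := range r.GetLinks() {
        _ = link // e.g. enqueue the new scan for crawling
    }
}()
for s := range r.GetRequestBody() {
    _ = s // e.g. hand the unique page to the fuzzer
}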