当前位置: 首页>>代码示例>>Golang>>正文


Golang html.NewTokenizer函数代码示例

本文整理汇总了Golang中golang.org/x/net/html.NewTokenizer函数的典型用法代码示例。如果您正苦于以下问题：Golang NewTokenizer函数的具体用法？Golang NewTokenizer怎么用？Golang NewTokenizer使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。


在下文中一共展示了NewTokenizer函数的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Golang代码示例。

示例1: ProcessHTML

// ProcessHTML tokenizes the HTML read from buffer and passes the attributes of
// each <meta> tag that has attributes to og.ProcessMeta as a key/value map.
//
// Scanning stops as soon as a body tag (start, self-closing or end) is seen,
// because OpenGraph data is only expected in the document head. It returns nil
// when a body tag or the end of input is reached, and the tokenizer's error
// for any failure other than io.EOF.
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
	tokenizer := html.NewTokenizer(buffer)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			if err := tokenizer.Err(); err != io.EOF {
				return err
			}
			return nil
		case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
			name, more := tokenizer.TagName()
			tag := atom.Lookup(name)
			if tag == atom.Body {
				// OpenGraph is only in head, so the body is never scanned.
				return nil
			}
			if tag != atom.Meta || !more {
				continue
			}
			attrs := make(map[string]string)
			for more {
				var k, v []byte
				k, v, more = tokenizer.TagAttr()
				attrs[atom.String(k)] = string(v)
			}
			og.ProcessMeta(attrs)
		}
	}
}
开发者ID:dyatlov,项目名称:go-opengraph,代码行数:30,代码来源:opengraph.go

示例2: htmlToText

// htmlToText extracts text from the HTML document read from r.
//
// Each text token is trimmed of surrounding whitespace and, when non-empty,
// written to the result on its own line. Text that appears while an element
// whose tag name is a key of ignoretag is open is skipped; comments are never
// emitted. Tokenizing stops at the first error token (which includes the end
// of input) and whatever has been collected so far is returned.
func htmlToText(r io.Reader) []byte {
	z := html.NewTokenizer(r)

	var out bytes.Buffer

	// depth counts the ignored elements that are currently open.
	depth := 0
	for {
		switch z.Next() {
		case html.ErrorToken:
			return out.Bytes()
		case html.StartTagToken:
			if _, ok := ignoretag[z.Token().Data]; ok {
				depth++
			}
		case html.EndTagToken:
			// Never go below zero: a stray end tag without a matching start
			// tag would otherwise leave the counter negative, which hides all
			// following text and later lets ignored content through.
			if _, ok := ignoretag[z.Token().Data]; ok && depth > 0 {
				depth--
			}
		case html.TextToken:
			if depth != 0 {
				continue
			}
			if text := strings.TrimSpace(z.Token().Data); text != "" {
				fmt.Fprintln(&out, text)
			}
		}
	}
}
开发者ID:husio,项目名称:apps,代码行数:30,代码来源:scrap.go

示例3: rewriteHTML

// rewriteHTML copies the HTML read from reader to writer. For start and
// self-closing tags whose atom has an entry in atomsToAttrs, every attribute
// selected by that entry has its value replaced by urlRewriter(value); such
// tags are re-serialized, all other tokens are written through unchanged.
//
// It returns nil once the input is exhausted, otherwise the first tokenizer
// or write error encountered.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
	// Note: This assumes the content is UTF-8.
	z := html.NewTokenizer(reader)

	for {
		var err error
		switch z.Next() {
		case html.ErrorToken:
			err = z.Err()
		case html.StartTagToken, html.SelfClosingTagToken:
			tag := z.Token()
			urlAttrs, rewritable := atomsToAttrs[tag.DataAtom]
			if rewritable {
				for i := range tag.Attr {
					if urlAttrs.Has(tag.Attr[i].Key) {
						tag.Attr[i].Val = urlRewriter(tag.Attr[i].Val)
					}
				}
			}
			_, err = writer.Write([]byte(tag.String()))
		default:
			_, err = writer.Write(z.Raw())
		}

		switch err {
		case nil:
			// Keep going with the next token.
		case io.EOF:
			return nil
		default:
			return err
		}
	}
}
开发者ID:johndmulhausen,项目名称:kubernetes,代码行数:32,代码来源:transport.go

示例4: isHTML

// isHTML reports whether an <html> start tag is found when tokenizing at most
// the first 1024 bytes of content. Empty content yields false.
func isHTML(content []byte) bool {
	const sniffLen = 1024

	if len(content) == 0 {
		return false
	}
	if len(content) > sniffLen {
		content = content[:sniffLen]
	}

	tokenizer := html.NewTokenizer(bytes.NewReader(content))
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// End of the inspected prefix (or a tokenizer error) without a match.
			return false
		case html.StartTagToken:
			if name, _ := tokenizer.TagName(); string(name) == "html" {
				return true
			}
		}
	}
}
开发者ID:ReanGD,项目名称:go-web-search,代码行数:26,代码来源:body_parser.go

示例5: scrape

// scrape tokenizes the HTML read from r until it sees the closing </head> tag
// or the first error token, whichever comes first.
//
// On </head> the bytes the tokenizer has buffered but not yet tokenized are
// copied into a local buffer. The function returns nothing and does not use
// that buffer afterwards.
func scrape(r io.Reader) {
	tokenizer := html.NewTokenizer(r)
	var rest bytes.Buffer

	for {
		tokenType := tokenizer.Next()
		token := tokenizer.Token()

		switch tokenType {
		case html.ErrorToken:
			// Stop quietly: inputs such as tracking pixels are served with an
			// HTML content type, and they must not break the caller.
			return
		case html.StartTagToken:
			if token.DataAtom == atom.Tr {
				// Placeholder: check for the correct class attr and then
				// switch to html.NewTokenizerFragment.
			}
		case html.EndTagToken:
			// At the end of the head, keep everything still buffered in the
			// tokenizer and stop.
			if token.String() == "</head>" {
				rest.Write(tokenizer.Buffered())
				return
			}
		}
	}
}
开发者ID:bentranter,项目名称:bookstore,代码行数:31,代码来源:main.go

示例6: Autodiscover

// Autodiscover scans the HTML document in b for a feed link and returns its
// href attribute.
//
// A match is a <link> start or self-closing tag whose rel is "alternate",
// whose href is non-empty and whose type is "application/rss+xml" or
// "application/atom+xml". ErrNoRssLink is returned when tokenizing stops
// (end of input or tokenizer error) before such a tag is found.
func Autodiscover(b []byte) (string, error) {
	tokenizer := html.NewTokenizer(bytes.NewReader(b))

	for tokenizer.Next() != html.ErrorToken {
		token := tokenizer.Token()
		if token.DataAtom != atom.Link {
			continue
		}
		if token.Type != html.StartTagToken && token.Type != html.SelfClosingTagToken {
			continue
		}

		attrs := make(map[string]string)
		for _, attr := range token.Attr {
			attrs[attr.Key] = attr.Val
		}
		if attrs["rel"] != "alternate" || attrs["href"] == "" {
			continue
		}
		switch attrs["type"] {
		case "application/rss+xml", "application/atom+xml":
			return attrs["href"], nil
		}
	}

	// End of input and tokenizer failures are reported the same way.
	return "", ErrNoRssLink
}
开发者ID:kissthink,项目名称:goread,代码行数:28,代码来源:autodiscover.go

示例7: ParseHtml

// ParseHtml reads HTML from r and returns a rewritten copy of it.
//
// Doctype, comment and end-tag tokens are copied through verbatim. Start and
// self-closing tags are emitted by flushTagToken, which receives url. A text
// token that directly follows a start tag for which flushTagToken returned
// "style" is trimmed and, if non-empty, replaced by the result of
// ParseCss(text, url); every other text token is copied verbatim.
//
// On end of input the accumulated bytes are returned with a nil error. On any
// other tokenizer error an empty slice and that error are returned.
func ParseHtml(r io.Reader, url string) ([]byte, error) {
	z := html.NewTokenizer(r)
	var newHtml []byte
	// lastTag holds the result of flushTagToken for the immediately preceding
	// start tag, or "" when the previous token was anything else.
	lastTag := ""
	for {
		tt := z.Next()
		raw := z.Raw()
		switch tt {
		case html.ErrorToken:
			// Compare against the sentinel instead of the message text.
			if err := z.Err(); err != io.EOF {
				return []byte{}, err
			}
			return newHtml, nil
		case html.TextToken:
			text := strings.TrimSpace(string(raw))
			if len(text) > 0 && lastTag == "style" {
				newCss := ParseCss(text, url)
				newHtml = append(newHtml, []byte(newCss)...)
			} else {
				newHtml = append(newHtml, raw...)
			}
		case html.DoctypeToken, html.CommentToken, html.EndTagToken:
			newHtml = append(newHtml, raw...)
		case html.StartTagToken:
			lastTag = flushTagToken(&newHtml, z, url)
		case html.SelfClosingTagToken:
			flushTagToken(&newHtml, z, url)
		}
		if tt != html.StartTagToken {
			lastTag = ""
		}
	}
}
开发者ID:gongshw,项目名称:lighthouse,代码行数:35,代码来源:html.go

示例8: GetPriceForBestBuy

// GetPriceForBestBuy downloads the page at url and looks for a <meta> start
// tag whose id attribute contains "schemaorg-offer-price". If the token right
// after such a tag is text, that text is handed to parseCurrency and the
// result is returned.
//
// It returns 0.0 when tokenizing ends (end of input or tokenizer error)
// without a match. A failed HTTP request terminates the program via log.Fatal.
func GetPriceForBestBuy(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			return 0.0
		}
		if tokenType != html.StartTagToken {
			continue
		}

		tag := tokenizer.Token()
		if tag.Data != "meta" {
			continue
		}
		for _, attr := range tag.Attr {
			if attr.Key != "id" || !strings.Contains(attr.Val, "schemaorg-offer-price") {
				continue
			}
			// The price is expected as the text immediately after the tag.
			if tokenizer.Next() == html.TextToken {
				return parseCurrency(tokenizer.Token().Data)
			}
		}
	}
}
开发者ID:vinaygaba,项目名称:pricetell,代码行数:30,代码来源:bestbuy.go

示例9: getLinks

// getLinks fetches the page at u and returns every URL that getURL extracts
// from the page's start and self-closing tags, in document order.
//
// If the page cannot be fetched, the failure is logged and an empty slice is
// returned. Tokenizing stops at the first error token (which includes the end
// of input); the links collected up to that point are returned.
func getLinks(u *url.URL) []*url.URL {
	links := make([]*url.URL, 0)

	resp, err := http.Get(u.String())
	if err != nil {
		logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
		// There is no usable response after a failed request; continuing
		// would dereference a nil resp when closing the body.
		return links
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			return links
		case html.StartTagToken, html.SelfClosingTagToken:
			if link, ok := getURL(u, tokenizer.Token()); ok {
				links = append(links, link)
			}
		}
	}
}
开发者ID:fueledbymarvin,项目名称:gocardless,代码行数:25,代码来源:crawler.go

示例10: parseTitle

// parseTitle returns the text that immediately follows the first <title>
// start tag found within the first 8192 bytes of resp.
//
// fallback is returned when the input (or the byte limit) ends without such a
// text token. If tokenizing fails right after a <title> tag, the literal
// "Failed to parse title" is returned instead. A <title> tag followed by a
// non-text token is skipped and scanning continues.
func parseTitle(resp io.Reader, fallback string) string {
	const maxBytes = 8192

	tokenizer := html.NewTokenizer(io.LimitReader(resp, maxBytes))
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			return fallback
		}
		if tokenType != html.StartTagToken {
			continue
		}
		if name, _ := tokenizer.TagName(); string(name) != "title" {
			continue
		}

		switch tokenizer.Next() {
		case html.ErrorToken:
			return "Failed to parse title"
		case html.TextToken:
			return tokenizer.Token().Data
		}
	}
}
开发者ID:velour,项目名称:holdmypage,代码行数:28,代码来源:main.go

示例11: obtainCsrf

// obtainCsrf scans the HTML read from b for a CSRF token and stores it in
// w.csrf.
//
// The token is taken from a self-closing <meta> tag in which one of the first
// two attributes is name="csrf-token"; the value of the other of those two
// attributes is stored. Tags with fewer than two attributes are skipped.
//
// While scanning, a <div> start tag whose first attribute is id="flash_alert"
// makes the string form of the token that follows it the returned error. The
// function returns once tokenizing stops (end of input or tokenizer error);
// the result is nil when no such div was seen.
func (w *WebClient) obtainCsrf(b io.Reader) error {
	var errorMessage error
	z := html.NewTokenizer(b)

	for {
		switch z.Next() {
		case html.ErrorToken:
			return errorMessage
		case html.SelfClosingTagToken:
			t := z.Token()
			// Both of the first two attributes are inspected below, so a tag
			// with a single attribute must be skipped to avoid indexing past
			// the end of t.Attr.
			if t.Data != "meta" || len(t.Attr) < 2 {
				continue
			}
			first, second := t.Attr[0], t.Attr[1]
			switch {
			case second.Key == "name" && second.Val == "csrf-token":
				w.csrf = first.Val
			case first.Key == "name" && first.Val == "csrf-token":
				w.csrf = second.Val
			default:
				continue
			}
			log.Debugf("Csrf Token: %s", w.csrf)
		case html.StartTagToken:
			t := z.Token()
			if t.Data == "div" && len(t.Attr) > 0 && t.Attr[0].Key == "id" && t.Attr[0].Val == "flash_alert" {
				// The alert message is the token right after the div tag.
				z.Next()
				errorMessage = errors.New(z.Token().String())
			}
		}
	}
}
开发者ID:odacremolbap,项目名称:concerto,代码行数:31,代码来源:setup.go

示例12: Crawl

// Crawl fetches url_prefix+url and tokenizes the response. For every <div>
// start tag accepted by isSummary it calls getQ with a copy of the tokenizer
// and ch. When tokenizing stops (end of page or tokenizer error) it sends
// "END!" on ch and returns.
//
// If the page cannot be fetched, "END!" is sent right away so that a receiver
// waiting for the end marker is not left blocked.
func Crawl(url string, ch chan string) {
	resp, err := http.Get(url_prefix + url)
	if err != nil {
		// No usable response: previously the error was ignored and the nil
		// resp was dereferenced below.
		ch <- "END!"
		return
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		switch tokenizer.Next() {
		case html.ErrorToken:
			// End of page
			ch <- "END!"
			return
		case html.StartTagToken:
			if tok := tokenizer.Token(); tok.Data == "div" && isSummary(tok) {
				// NOTE(review): getQ receives the tokenizer by value, so it
				// works on a copy of the tokenizer state — confirm that this
				// is intended.
				getQ(*tokenizer, ch)
			}
		}
	}
}
开发者ID:carol-hsu,项目名称:go-study,代码行数:26,代码来源:multiple-web-crawlers.go

示例13: avanza_get_sellprice

/*
 * avanza_get_sellprice
 *
 * Site: Avanza
 * Gets the current sellprice from a given httpResponse.
 *
 * The body of resp is tokenized until a <span> start tag with an attribute
 * value containing "sellPrice" is found. The string form of the token that
 * follows is trimmed of surrounding whitespace, commas are replaced by dots,
 * and the result is parsed as a float64.
 *
 * Returns 0.0 when no such span is found before tokenizing stops, or when the
 * text cannot be parsed as a number. The response body is not closed here.
 */
func (p *Parse) avanza_get_sellprice(resp *http.Response) float64 {
	z := html.NewTokenizer(resp.Body)

	for {
		switch z.Next() {
		case html.ErrorToken:
			return 0.0
		case html.StartTagToken:
			t := z.Token()
			if t.Data != "span" {
				continue
			}
			for _, attr := range t.Attr {
				if !strings.Contains(attr.Val, "sellPrice") {
					continue
				}
				z.Next()
				// Surrounding whitespace would make ParseFloat fail, so it is
				// stripped before the decimal comma is converted to a dot.
				text := strings.TrimSpace(z.Token().String())
				price, err := strconv.ParseFloat(strings.Replace(text, ",", ".", -1), 64)
				if err != nil {
					return 0.0
				}
				return price
			}
		}
	}
}
开发者ID:Balzzanar,项目名称:golang,代码行数:32,代码来源:parse.go

示例14: GetPriceForWalmart

// GetPriceForWalmart downloads the page at url and looks for a <script> start
// tag whose id attribute contains "tb-djs-wml-base". If the token right after
// such a tag is text, that text is handed to parseJson and the result is
// returned.
//
// It returns 0.0 when tokenizing ends (end of input or tokenizer error)
// without a match. A failed HTTP request terminates the program via log.Fatal.
func GetPriceForWalmart(url string) float64 {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	tokenizer := html.NewTokenizer(resp.Body)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			return 0.0
		}
		if tokenType != html.StartTagToken {
			continue
		}

		tag := tokenizer.Token()
		if tag.Data != "script" {
			continue
		}
		for _, attr := range tag.Attr {
			if attr.Key != "id" || !strings.Contains(attr.Val, "tb-djs-wml-base") {
				continue
			}
			// The payload is expected as the text immediately after the tag.
			if tokenizer.Next() == html.TextToken {
				return parseJson(tokenizer.Token().Data)
			}
		}
	}
}
开发者ID:vinaygaba,项目名称:pricetell,代码行数:29,代码来源:walmart.go

示例15: scrapePageWorker

// scrapePageWorker tokenizes the HTML read from *page and reports anchors on
// out as [2]string pairs: index 0 holds the anchor's id attribute, index 1 its
// data-href attribute.
//
// A pair is sent each time a data-href attribute is encountered on an <a>
// start tag, so an id that appears after data-href in the same tag is not
// part of the pair already sent. When tokenizing stops (end of input or
// tokenizer error) the worker sends true on chFinished and returns.
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
	defer func() {
		chFinished <- true
	}()

	tokenizer := html.NewTokenizer(*page)
	for {
		tokenType := tokenizer.Next()
		if tokenType == html.ErrorToken {
			return
		}
		if tokenType != html.StartTagToken {
			continue
		}

		tag := tokenizer.Token()
		if tag.Data != "a" {
			continue
		}

		var pair [2]string
		for _, attr := range tag.Attr {
			switch attr.Key {
			case "id":
				pair[0] = attr.Val
			case "data-href":
				pair[1] = attr.Val
				out <- pair
			}
		}
	}
}
开发者ID:ohrodr,项目名称:2kcookies,代码行数:34,代码来源:2kcookies.go


注：本文中的golang.org/x/net/html.NewTokenizer函数示例由纯净天空整理自GitHub/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。