This article collects typical usage examples of the NewTokenizer function from the Golang package golang.org/x/net/html. If you have been wondering what exactly NewTokenizer does and how to use it, the curated examples below should help.
The following presents 15 code examples of the NewTokenizer function, sorted by popularity.
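All of the examples share the same skeleton: NewTokenizer wraps an io.Reader, Next() advances to the next token, and the loop ends when ErrorToken is returned (with z.Err() == io.EOF signalling normal end of input). Before the individual examples, here is a minimal, self-contained sketch of that loop; it is not taken from any of the projects below:

package main

import (
    "fmt"
    "io"
    "strings"

    "golang.org/x/net/html"
)

func main() {
    z := html.NewTokenizer(strings.NewReader("<html><body><p>hello</p></body></html>"))
    for {
        switch z.Next() {
        case html.ErrorToken:
            // Next returns ErrorToken both on real errors and at end of input;
            // z.Err() == io.EOF distinguishes normal termination.
            if z.Err() != io.EOF {
                fmt.Println("parse error:", z.Err())
            }
            return
        case html.StartTagToken:
            name, _ := z.TagName()
            fmt.Println("start tag:", string(name))
        case html.TextToken:
            fmt.Printf("text: %q\n", z.Token().Data)
        }
    }
}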
Example 1: ProcessHTML
// ProcessHTML parses given html from Reader interface and fills up OpenGraph structure
func (og *OpenGraph) ProcessHTML(buffer io.Reader) error {
    z := html.NewTokenizer(buffer)
    for {
        tt := z.Next()
        switch tt {
        case html.ErrorToken:
            if z.Err() == io.EOF {
                return nil
            }
            return z.Err()
        case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
            name, hasAttr := z.TagName()
            if atom.Lookup(name) == atom.Body {
                return nil // OpenGraph is only in head, so we don't need body
            }
            if atom.Lookup(name) != atom.Meta || !hasAttr {
                continue
            }
            m := make(map[string]string)
            var key, val []byte
            for hasAttr {
                key, val, hasAttr = z.TagAttr()
                m[atom.String(key)] = string(val)
            }
            og.ProcessMeta(m)
        }
    }
}
Example 2: htmlToText
func htmlToText(r io.Reader) []byte {
    t := html.NewTokenizer(r)
    var out bytes.Buffer
    var ignorescore int
    for {
        switch token := t.Next(); token {
        case html.StartTagToken:
            if _, ok := ignoretag[t.Token().Data]; ok {
                ignorescore++
            }
        case html.EndTagToken:
            if _, ok := ignoretag[t.Token().Data]; ok {
                ignorescore--
            }
        case html.ErrorToken:
            return out.Bytes()
        case html.CommentToken:
            continue
        case html.TextToken:
            if ignorescore == 0 {
                text := strings.TrimSpace(t.Token().Data)
                if len(text) > 0 {
                    fmt.Fprintln(&out, text)
                }
            }
        }
    }
}
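The ignoretag set is defined elsewhere in the original project. A plausible minimal definition (the exact tag list is an assumption) would be:

// Assumed definition: tags whose text content should not appear in the output.
var ignoretag = map[string]bool{
    "script": true,
    "style":  true,
    "head":   true,
}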
Example 3: rewriteHTML
// rewriteHTML scans the HTML for tags with url-valued attributes, and updates
// those values with the urlRewriter function. The updated HTML is output to the
// writer.
func rewriteHTML(reader io.Reader, writer io.Writer, urlRewriter func(string) string) error {
    // Note: This assumes the content is UTF-8.
    tokenizer := html.NewTokenizer(reader)
    var err error
    for err == nil {
        tokenType := tokenizer.Next()
        switch tokenType {
        case html.ErrorToken:
            err = tokenizer.Err()
        case html.StartTagToken, html.SelfClosingTagToken:
            token := tokenizer.Token()
            if urlAttrs, ok := atomsToAttrs[token.DataAtom]; ok {
                for i, attr := range token.Attr {
                    if urlAttrs.Has(attr.Key) {
                        token.Attr[i].Val = urlRewriter(attr.Val)
                    }
                }
            }
            _, err = writer.Write([]byte(token.String()))
        default:
            _, err = writer.Write(tokenizer.Raw())
        }
    }
    if err != io.EOF {
        return err
    }
    return nil
}
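atomsToAttrs comes from the surrounding package: it maps tag atoms to the set of attribute names that hold URLs, and its value type must provide a Has method. A hedged sketch of the supporting definitions (the tag/attribute table here is an assumption; the atom package is golang.org/x/net/html/atom):

// Assumed supporting definitions: a tiny string-set type with Has, and the
// tag-to-URL-attribute table the example indexes into.
type stringSet map[string]struct{}

func (s stringSet) Has(k string) bool {
    _, ok := s[k]
    return ok
}

var atomsToAttrs = map[atom.Atom]stringSet{
    atom.A:      {"href": {}},
    atom.Img:    {"src": {}},
    atom.Link:   {"href": {}},
    atom.Script: {"src": {}},
    atom.Form:   {"action": {}},
}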
Example 4: isHTML
func isHTML(content []byte) bool {
    isHTML := false
    if len(content) == 0 {
        return isHTML
    }
    if len(content) > 1024 {
        content = content[:1024]
    }
    z := html.NewTokenizer(bytes.NewReader(content))
    isFinish := false
    for !isFinish {
        switch z.Next() {
        case html.ErrorToken:
            isFinish = true
        case html.StartTagToken:
            tagName, _ := z.TagName()
            if bytes.Equal(tagName, []byte("html")) {
                isHTML = true
                isFinish = true
            }
        }
    }
    return isHTML
}
Example 5: scrape
func scrape(r io.Reader) {
    z := html.NewTokenizer(r)
    buf := &bytes.Buffer{}
L:
    for {
        tt := z.Next()
        tok := z.Token()
        switch tt {
        case html.StartTagToken:
            // if you find a link, replace it with our stylesheet
            if tok.DataAtom == atom.Tr {
                // check for correct class attr and then switch to
                // html.NewTokenizerFragment
            }
        case html.EndTagToken:
            // once you reach the end of the head, flush everything left in
            // the tokenizer to the buffer
            if tok.String() == "</head>" {
                buf.Write(z.Buffered())
                break L
            }
        case html.ErrorToken:
            // this is left in here for things like tracking pixels that have
            // the HTML content type, so our code doesn't break
            break L
        }
    }
}
Example 6: Autodiscover
func Autodiscover(b []byte) (string, error) {
    r := bytes.NewReader(b)
    z := html.NewTokenizer(r)
    for {
        if z.Next() == html.ErrorToken {
            if err := z.Err(); err == io.EOF {
                break
            }
            return "", ErrNoRssLink
        }
        t := z.Token()
        switch t.DataAtom {
        case atom.Link:
            if t.Type == html.StartTagToken || t.Type == html.SelfClosingTagToken {
                attrs := make(map[string]string)
                for _, a := range t.Attr {
                    attrs[a.Key] = a.Val
                }
                if attrs["rel"] == "alternate" && attrs["href"] != "" &&
                    (attrs["type"] == "application/rss+xml" || attrs["type"] == "application/atom+xml") {
                    return attrs["href"], nil
                }
            }
        }
    }
    return "", ErrNoRssLink
}
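ErrNoRssLink is a package-level sentinel error defined elsewhere in the original source; a minimal assumed definition:

// Assumed definition of the sentinel error returned when no feed link is found.
var ErrNoRssLink = errors.New("no RSS or Atom link found in document")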
Example 7: ParseHtml
func ParseHtml(r io.Reader, url string) ([]byte, error) {
    z := html.NewTokenizer(r)
    var newHtml []byte
    lastTag := ""
    for {
        tt := z.Next()
        rawHtmlBytes := z.Raw()
        switch tt {
        case html.ErrorToken:
            if z.Err() == io.EOF {
                return newHtml, nil
            }
            return nil, z.Err()
        case html.TextToken:
            rawHtml := strings.TrimSpace(string(rawHtmlBytes))
            if len(rawHtml) > 0 && lastTag == "style" {
                newCss := ParseCss(rawHtml, url)
                newHtml = append(newHtml, []byte(newCss)...)
            } else {
                newHtml = append(newHtml, rawHtmlBytes...)
            }
        case html.DoctypeToken, html.CommentToken, html.EndTagToken:
            newHtml = append(newHtml, rawHtmlBytes...)
        case html.StartTagToken:
            lastTag = flushTagToken(&newHtml, z, url)
        case html.SelfClosingTagToken:
            flushTagToken(&newHtml, z, url)
        }
        // lastTag only survives until the token after the opening tag,
        // so <style> text is caught but nothing later is.
        if tt != html.StartTagToken {
            lastTag = ""
        }
    }
}
Example 8: GetPriceForBestBuy
func GetPriceForBestBuy(url string) float64 {
    resp, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    z := html.NewTokenizer(resp.Body)
    for {
        tt := z.Next()
        switch {
        case tt == html.ErrorToken:
            return 0.0
        case tt == html.StartTagToken:
            t := z.Token()
            isMeta := t.Data == "meta"
            if isMeta {
                for _, attr := range t.Attr {
                    if attr.Key == "id" && strings.Contains(attr.Val, "schemaorg-offer-price") {
                        nxt := z.Next()
                        if nxt == html.TextToken {
                            t = z.Token()
                            return parseCurrency(t.Data)
                        }
                    }
                }
            }
        }
    }
}
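parseCurrency is a helper from the same project whose implementation is not shown; this sketch of what it might look like is purely an assumption:

// Hypothetical helper: strips currency symbols and separators, then parses
// the remainder as a float, returning 0.0 when parsing fails.
func parseCurrency(s string) float64 {
    s = strings.Trim(strings.TrimSpace(s), "$")
    s = strings.Replace(s, ",", "", -1)
    v, err := strconv.ParseFloat(s, 64)
    if err != nil {
        return 0.0
    }
    return v
}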
Example 9: getLinks
func getLinks(u *url.URL) []*url.URL {
    links := make([]*url.URL, 0)
    resp, err := http.Get(u.String())
    if err != nil {
        logs.Log(fmt.Sprintf("Couldn't crawl %s", u))
        return links // resp is nil on error, so bail out before touching it
    }
    defer resp.Body.Close()
    tokenizer := html.NewTokenizer(resp.Body)
    for {
        tokenType := tokenizer.Next()
        switch tokenType {
        case html.ErrorToken:
            return links
        case html.StartTagToken, html.SelfClosingTagToken:
            token := tokenizer.Token()
            if link, ok := getURL(u, token); ok {
                links = append(links, link)
            }
        }
    }
}
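getURL is a helper from the original project; a sketch of a plausible implementation (the exact behavior is an assumption; it uses net/url and golang.org/x/net/html/atom) that extracts an href and resolves it against the page URL:

// Hypothetical helper: pulls href out of an anchor token and resolves it
// against the base URL, reporting whether a usable link was found.
func getURL(base *url.URL, token html.Token) (*url.URL, bool) {
    if token.DataAtom != atom.A {
        return nil, false
    }
    for _, attr := range token.Attr {
        if attr.Key == "href" {
            ref, err := url.Parse(attr.Val)
            if err != nil {
                return nil, false
            }
            return base.ResolveReference(ref), true
        }
    }
    return nil, false
}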
Example 10: parseTitle
func parseTitle(resp io.Reader, fallback string) string {
    r := io.LimitedReader{
        R: resp,
        N: 8192,
    }
    h := html.NewTokenizer(&r)
    for {
        tt := h.Next()
        switch tt {
        case html.ErrorToken:
            return fallback
        case html.StartTagToken:
            tag, _ := h.TagName()
            if string(tag) == "title" {
                nt := h.Next()
                switch nt {
                case html.ErrorToken:
                    return "Failed to parse title"
                case html.TextToken:
                    return h.Token().Data
                }
            }
        }
    }
}
Example 11: obtainCsrf
func (w *WebClient) obtainCsrf(b io.Reader) error {
    var errorMessage error
    z := html.NewTokenizer(b)
    for {
        tt := z.Next()
        switch {
        case tt == html.ErrorToken:
            return errorMessage
        case tt == html.SelfClosingTagToken:
            t := z.Token()
            isMeta := t.Data == "meta"
            // both branches read Attr[0] and Attr[1], so guard for two attributes
            if isMeta && len(t.Attr) > 1 {
                if (t.Attr[1].Key == "name") && (t.Attr[1].Val == "csrf-token") {
                    w.csrf = t.Attr[0].Val
                    log.Debugf("Csrf Token: %s", w.csrf)
                } else if (t.Attr[0].Key == "name") && (t.Attr[0].Val == "csrf-token") {
                    w.csrf = t.Attr[1].Val
                    log.Debugf("Csrf Token: %s", w.csrf)
                }
            }
        case tt == html.StartTagToken:
            t := z.Token()
            if (t.Data == "div") && len(t.Attr) > 0 && (t.Attr[0].Key == "id") && (t.Attr[0].Val == "flash_alert") {
                z.Next()
                errorMessage = errors.New(z.Token().String())
            }
        }
    }
}
Example 12: Crawl
// crawl the page
func Crawl(url string, ch chan string) {
    resp, err := http.Get(url_prefix + url)
    if err != nil {
        ch <- "END!" // keep the consumer's termination protocol on fetch failure
        return
    }
    defer resp.Body.Close()
    tokenizer := html.NewTokenizer(resp.Body)
    for {
        token := tokenizer.Next()
        switch {
        case token == html.ErrorToken:
            // End of page
            ch <- "END!"
            return
        case token == html.StartTagToken:
            start_tt := tokenizer.Token()
            if start_tt.Data == "div" {
                if isSummary(start_tt) {
                    getQ(*tokenizer, ch)
                }
            }
        }
    }
}
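isSummary, getQ, and url_prefix are project-specific; a plausible isSummary, with the class name checked being an assumption:

// Hypothetical helper: reports whether a div token carries the summary class
// the crawler is looking for. The exact class name is an assumption.
func isSummary(t html.Token) bool {
    for _, attr := range t.Attr {
        if attr.Key == "class" && strings.Contains(attr.Val, "summary") {
            return true
        }
    }
    return false
}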
Example 13: avanza_get_sellprice
/*
 * avanza_get_sellprice
 *
 * Site: Avanza
 * Gets the current sellprice from a given httpResponse
 */
func (this *Parse) avanza_get_sellprice(resp *http.Response) float64 {
    z := html.NewTokenizer(resp.Body)
    for {
        tt := z.Next()
        switch {
        case tt == html.ErrorToken:
            return 0.0
        case tt == html.StartTagToken:
            t := z.Token()
            if isCatch := t.Data == "span"; isCatch {
                for _, attr := range t.Attr {
                    if strings.Contains(attr.Val, "sellPrice") {
                        z.Next()
                        tt := z.Token()
                        strval := strings.Replace(tt.String(), ",", ".", -1)
                        value, _ := strconv.ParseFloat(strval, 64)
                        return value
                    }
                }
            }
        }
    }
}
Example 14: GetPriceForWalmart
func GetPriceForWalmart(url string) float64 {
    resp, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer resp.Body.Close()
    z := html.NewTokenizer(resp.Body)
    for {
        tt := z.Next()
        switch {
        case tt == html.ErrorToken:
            return 0.0
        case tt == html.StartTagToken:
            t := z.Token()
            isScript := t.Data == "script"
            if isScript {
                for _, attr := range t.Attr {
                    if attr.Key == "id" && strings.Contains(attr.Val, "tb-djs-wml-base") {
                        nxt := z.Next()
                        if nxt == html.TextToken {
                            return parseJson(z.Token().Data)
                        }
                    }
                }
            }
        }
    }
}
Example 15: scrapePageWorker
// scrapePageWorker -- this is the function that does most of the work in parsing the HTML
func scrapePageWorker(page *io.ReadCloser, out chan [2]string, chFinished chan bool) {
    defer func() {
        chFinished <- true
    }()
    z := html.NewTokenizer(*page)
    // loop until the tokenizer runs out, sending id/data-href pairs to out
    for {
        var result [2]string
        tt := z.Next()
        switch {
        case tt == html.ErrorToken:
            return
        case tt == html.StartTagToken:
            t := z.Token()
            if t.Data != "a" {
                continue
            }
            for _, attr := range t.Attr {
                if attr.Key == "id" {
                    result[0] = attr.Val
                }
                if attr.Key == "data-href" {
                    result[1] = attr.Val
                    out <- result
                }
            }
        }
    }
}