当前位置: 首页>>代码示例>>Scala>>正文


Scala Document类代码示例

本文整理汇总了Scala中org.jsoup.nodes.Document的典型用法代码示例。如果您正苦于以下问题:Scala Document类的具体用法?Scala Document怎么用?Scala Document使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了Document类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: NoticeServiceObjects

//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice

import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/**
 * Registry of the notice services.
 *
 * NOTE(review): the runs of `?` inside the string literals below are non-ASCII text
 * that was lost to a bad encoding round-trip. They cannot be reconstructed from this
 * file and are kept byte-for-byte; restore them from the original repository.
 */
object NoticeServiceObjects {

  /**
   * Service whose four configuration pieces are supplied together through `initVal`:
   * content extractor, date extractor, URL pattern and index-page template (in that order).
   */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: (Document => String, Document => String, String, String)
    // Lazy so that `initVal`, defined by the concrete subclass, is initialised before it is unpacked.
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /**
   * Notice service for law.scu.edu.cn, whose listing endpoint answers with JSON.
   *
   * @param title  suffix appended to the service name
   * @param listId value placed in the `nextcid` query parameter of every detail URL
   */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    /** Builds the detail-page URL for the article with the given id. */
    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    /**
     * Downloads the listing at `url`, parses the body as JSON and turns every element of its
     * `data` array into a [[NoticeEntry]] built from the element's `id` and `subject` fields.
     */
    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      // The cast fails with a ClassCastException when `data` is missing or is not an array.
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  /**
   * All configured services: the template-only ones built from (name -> index URL template)
   * pairs, followed by the services that need custom extraction.
   */
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    // `tp._1` / `tp._2` are used instead of destructured names so that nothing inside the
    // anonymous class body can be shadowed by an inherited member.
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal = (
        selectorF("input[name=news.content]")(_.first.attr("value")),
        dateF("table[width=900] td:contains(????)"),
        "newsShow.*",
        "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")
    },
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}
开发者ID:zhranklin,项目名称:Private_Blog,代码行数:54,代码来源:NoticeServiceObjects.scala

示例2: IsapReader

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers

import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader

import org.jsoup.Jsoup

import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._

/** Addresses used to reach the ISAP service. */
object IsapReader {
  /** Scheme and host of the ISAP site. */
  val BASE_URL = "http://isap.sejm.gov.pl"

  /** Details-servlet address; the document id is appended directly after it. */
  val URL      = s"$BASE_URL/DetailsServlet?id="
}

/**
 * Single-shot reader: the first call to `read` fetches the details page for `id`,
 * every later call yields `null`.
 *
 * @param id identifier appended to [[IsapReader.URL]]
 */
class IsapReader(val id: String) extends ItemReader[Document] {

  val logger = LoggerFactory.getLogger(this.getClass())

  /** Set to true once `read` has been attempted. */
  var last = false

  /**
   * Fetches the document on the first invocation and returns `null` afterwards.
   *
   * Throws `NoSuchDocumentException` when the fetched page contains the service's
   * "no such act" message.
   */
  def read : Document = {
    logger.trace("read")

    if (last) {
      null
    } else {
      // Flag is flipped before the request, so a failed fetch is not retried by a later call.
      last = true
      val document = Jsoup.connect(IsapReader.URL + id).get
      val notFound = document.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (notFound) throw new NoSuchDocumentException
      document
    }
  }
}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:36,代码来源:IsapReader.scala

示例3: get

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services

import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup

/** Abstraction over fetching a page as a jsoup [[Document]] and storing a file locally. */
trait Scraper {
  /** Returns the document obtained for the given URL string. */
  def get(url: String) : Document
  /**
   * Stores the file identified by `fileUrl` at `filePath`.
   * NOTE(review): the result is presumably the path of the stored file — confirm against implementations.
   * The method name is spelled `dowloadFile` (sic); implementors override it under that name.
   */
  def dowloadFile(fileUrl:String, filePath:String) : String
}

/** [[Scraper]] that loads pages through an HtmlUnit `WebClient` and downloads files over the network. */
class DefaultScraperService extends Scraper {

  val webClient = new WebClient

  /**
   * Loads `url` with the web client and parses the response body with jsoup.
   *
   * @param url address of the page to load
   * @return the parsed response content
   */
  def get(url: String) : Document = {
      webClient.setRefreshHandler(new RefreshHandler {
        // The type argument is pinned on purpose: `getPage` is a Java generic method
        // (`<P extends Page> P`), and with the result discarded Scala would infer `Nothing`
        // and insert a cast that fails at runtime.
        override def handleRefresh(page: Page, url: URL, i: Int): Unit = {
          webClient.getPage[Page](url)
          ()
        }
      })
      val apPage: Page = webClient.getPage[Page](url)
      Jsoup.parse(apPage.getWebResponse.getContentAsString)
  }

  /**
   * Copies the content at `fileUrl` into the file at `filePath`.
   *
   * @param fileUrl  address of the file to download
   * @param filePath destination path on the local file system
   * @return absolute path of the destination file
   */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val url = new URL(fileUrl)
    val tmp = new File(filePath)
    FileUtils.copyURLToFile(url, tmp)
    tmp.getAbsolutePath()
  }

}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:35,代码来源:Scraper.scala

示例4: ResourceScraperService

//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma

import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream

/**
 * [[Scraper]] that serves content from classpath resources under `isap/` instead of the network.
 *
 * Both methods expect a URL containing `id=<id>&type=<digits>`; a URL that does not
 * match raises a `scala.MatchError`.
 */
class ResourceScraperService extends Scraper {

  // Captures the `id` value and the numeric `type` value of the request URL.
  private val requestPattern = ".*id=(.*)&type=([0-9]+).*".r

  /**
   * Parses the resource `isap/<id>/<type>.html` selected by `url`.
   *
   * @param url request URL carrying the `id` and `type` parameters
   * @return the parsed HTML resource
   */
  def get(url: String) : Document = {
    val requestPattern(id, docType) = url
    val source = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // Release the underlying stream even when reading or parsing fails.
    try Jsoup.parse(source.mkString)
    finally source.close()
  }

  /**
   * Copies the resource `/isap/<id>/<type>.pdf` selected by `fileUrl` to `filePath`.
   *
   * @param fileUrl  request URL carrying the `id` and `type` parameters
   * @param filePath destination path on the local file system
   * @return absolute path of the destination file
   * @throws java.io.FileNotFoundException when the resource is not on the classpath
   */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val requestPattern(id, docType) = fileUrl
    val resourcePath = "/isap/" + id + "/" + docType + ".pdf"
    val src = getClass.getResourceAsStream(resourcePath)
    // getResourceAsStream signals a missing resource with null; fail before the destination is created.
    if (src == null)
      throw new java.io.FileNotFoundException("Classpath resource not found: " + resourcePath)
    try {
      val dest = new File(filePath)
      val out = new FileOutputStream(dest)
      try IOUtils.copy(src, out)
      finally out.close()
      dest.getAbsolutePath
    } finally {
      src.close()
    }
  }

}
开发者ID:PrawoPolskie,项目名称:toakoma,代码行数:33,代码来源:ResourceScraperService.scala

示例5: GgleLoginTest

//设置package包名称以及导入依赖的类
package com.szadowsz.tarbh.ggle

import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.MaeveDriver
import com.szadowsz.maeve.core.browser.MaeveConf
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import com.szadowsz.maeve.core.instruction.target.single.SingleTarget
import com.szadowsz.maeve.gglegrp.actions.GgleExecutor
import org.jsoup.nodes.Document


/**
 * Manual entry point that feeds one instruction to a `MaeveDriver` and runs it.
 * Link, credentials and group name are blank placeholders in this file.
 */
object GgleLoginTest {
  private val link = ""
  private val username: String = ""
  private val passwd: String = ""
  private val urlOfGrp = Uri(link)
  private val groupName: String = ""

  /** Driver configuration: JavaScript on, proxy set to ("", 0, Nil), script errors not thrown. */
  private def buildConfig(): MaeveConf =
    MaeveConf()
      .setJavaScriptEnabled(true)
      .setHTTPProxy("", 0, Nil)
      .setThrowExceptionOnScriptError(false)

  /** Extractor that ignores every page and never asks to continue. */
  private class TestExtractor extends JsoupExtractor {
    override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {}
    override def shouldContinue(): Boolean = false
  }

  def main(args: Array[String]): Unit = {
    System.setProperty("webdriver.chrome.driver", ".\\chromedriver_win32\\chromedriver.exe")
    val driver = new MaeveDriver(buildConfig())

    // Pieces of the instruction, created in the same order as before.
    val target = SingleTarget(urlOfGrp)
    val extractor = new TestExtractor()
    val login = new GgleExecutor(username, passwd)
    val instruction = MaeveInstruction(
      groupName, target, login, extractor, "./data/grp/", false, false, false, MaeveConf().setNoProxy())

    driver.feedInstruction(instruction)
    driver.scrapeUsingCurrInstruction()
  }
}
开发者ID:zakski,项目名称:project-disco,代码行数:48,代码来源:GgleLoginTest.scala

示例6: compile

//设置package包名称以及导入依赖的类
package indi.lewis.spider.html

import java.util

import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document


/** A named node that can turn a document into a JSON element. */
private[html] trait ElementType {
  /** Name under which this node's output is stored by a parent. */
  var elementName: String = _

  /** Produces the JSON for the given parsed document. */
  def compile(doc: Document): JsonElement

  /** Parses the HTML text with jsoup and compiles the resulting document. */
  def compile(doc: String): JsonElement = {
    val parsed = Jsoup.parse(doc)
    compile(parsed)
  }
}

/** Object node: compiles each child in `properties` into a field of a JSON object. */
private[html] case class ModelParent() extends ElementType {
  elementName = "root"

  /** Creates a parent named `elementName` instead of the default "root". */
  def this(elementName: String) = {
    this()
    this.elementName = elementName
  }

  /** Children, compiled in list order. */
  val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()

  /** Builds a JSON object holding one member per child, keyed by the child's `elementName`. */
  override def compile(doc: Document): JsonElement = {
    val ret = new JsonObject
    (0 until properties.size()).foreach { idx =>
      val child = properties.get(idx)
      ret.add(child.elementName, child.compile(doc))
    }
    ret
  }
}

/**
 * Node whose actual element is chosen per document.
 *
 * @param elName name assigned to `elementName`
 * @param f      picks the element that compiles the given document
 */
private[html] case class ModelElement(elName: String, f: Document => ElementType) extends ElementType {
  elementName = elName

  /** Applies `f` to the document and delegates compilation to the element it returns. */
  override def compile(doc: Document): JsonElement = {
    val delegate = f(doc)
    delegate.compile(doc)
  }
}

/**
 * Array node: compiles each entry of `array` into an element of a JSON array.
 *
 * @param elName name assigned to `elementName`
 */
private[html] case class ModelArray(elName: String) extends ElementType {
  /** Entries, compiled in list order. */
  val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]()
  elementName = elName

  /** Builds a JSON array with one element per entry. */
  override def compile(doc: Document): JsonElement = {
    val jsonArray = new JsonArray
    (0 until array.size()).foreach { idx =>
      val entry = array.get(idx)
      jsonArray.add(entry.compile(doc))
    }
    jsonArray
  }
}

/**
 * Node with a fixed value that does not depend on the document.
 *
 * @param elName name assigned to `elementName`
 * @param value  constant to emit; may be null
 */
private[html] case class ModelConstant(elName: String, value: Object) extends ElementType {
  elementName = elName

  /** Emits the value's `toString` as a JSON string, or JSON null when the value is null. */
  override def compile(doc: Document): JsonElement = value match {
    case null    => JsonNull.INSTANCE
    case present => new JsonPrimitive(present.toString)
  }
}
开发者ID:TokisakiFun,项目名称:Katipo,代码行数:56,代码来源:ElementType.scala

示例7: LaporBot

//设置package包名称以及导入依赖的类
package io.github.asepsaep.laporcrawler.bot

import scala.collection.JavaConverters._

import org.jsoup.nodes.Document
import org.jsoup.Jsoup

import io.github.asepsaep.laporcrawler.model.Ticket

/**
 * Fetches and parses the complaint page of one ticket.
 *
 * @param ticketId numeric id appended to the complaint URL
 */
case class LaporBot(ticketId: Int) {

  private val url = "http://36.66.86.72/pengaduan/" + ticketId

  //  System.setProperty("socksProxyHost", "127.0.0.1")
  //  System.setProperty("socksProxyPort", "10001")
  //  System.setProperty("socksProxyVersion", "5")

  /**
   * Downloads the ticket page with a 30 s timeout.
   *
   * @return the parsed ticket, or `None` when the page contains a `no-data` element
   */
  def crawl(): Option[Ticket] = {
    val doc = Jsoup.connect(url).timeout(30000).get()
    if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
  }

  /**
   * Builds a [[Ticket]] from a ticket page.
   *
   * The ticket is assembled from a fresh value on every call, so fields from an earlier
   * parse cannot carry over into this one.
   *
   * @param doc the ticket page; it is expected to contain the `row_Subject`, `row_content`,
   *            `feedback-details` and `.administrator .comment-content` elements. A missing
   *            one results in a NullPointerException.
   */
  def parse(doc: Document): Ticket = {
    val title = doc.getElementById("row_Subject").text
    val rawContent = doc.getElementById("row_content").text
    // Everything up to the first ", " is dropped when the text has such a separator.
    val splitContent = rawContent.split(", ")
    val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else rawContent

    val base = new Ticket().copy(id = ticketId, title = title, content = content)

    val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
    // Each <p> carries a label in its first <span> and the value in its last <span>.
    val withDetails = details.foldLeft(base) { (ticket, p) =>
      val span = p.getElementsByTag("span")
      val label = span.first.text
      // Evaluated only for recognised labels.
      def value = Some(span.last.text)
      label match {
        case "USER:"     => ticket.copy(user = value)
        case "PLATFORM:" => ticket.copy(platform = value)
        case "TANGGAL:"  => ticket.copy(date = value)
        case "KATEGORI:" => ticket.copy(category = value)
        case "AREA:"     => ticket.copy(area = value)
        case "STATUS:"   => ticket.copy(status = value)
        case _           => ticket
      }
    }

    val dispatchedTo = doc.select(".administrator .comment-content").first.getElementsByTag("p").first.getElementsByTag("span").first.getElementsByTag("b").first.text
    withDetails.copy(dispatchedTo = Some(dispatchedTo))
  }

}
开发者ID:asepsaep,项目名称:lapor-crawler,代码行数:54,代码来源:LaporBot.scala

示例8: checkElementAndConvert

//设置package包名称以及导入依赖的类
package haishu.crawler.selector

import org.jsoup.nodes.{Document, Element}


  /**
   * Returns `element` itself when it already is a [[Document]]; otherwise returns a new
   * document, created with the base URI of the element's owner document, that holds a
   * clone of the element.
   */
  private def checkElementAndConvert(element: Element): Element = element match {
    case document: Document => document
    case fragment =>
      val wrapper = new Document(fragment.ownerDocument().baseUri())
      wrapper.appendChild(fragment.clone())
      wrapper
  }

  /** Selects with the CSS selector built by `Selectors.css(selector)`. */
  override def css(selector: String): Selectable =
    selectElements(Selectors.css(selector))

  /** Selects with the CSS selector built by `Selectors.css(selector, attrName)`. */
  override def css(selector: String, attrName: String): Selectable =
    selectElements(Selectors.css(selector, attrName))
} 
开发者ID:hualongdata,项目名称:hl-crawler,代码行数:24,代码来源:HtmlNode.scala

示例9: StyleguideSpider

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import org.jsoup.nodes.{ Element, Document }
import scala.concurrent.Future
import com.themillhousegroup.scoup.{ ScoupImplicits, Scoup }
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL


/** Collects the documents reachable from a start page through links whose `href` starts with "/". */
object StyleguideSpider extends ScoupImplicits {

  /**
   * Parses `url` and, unless `thisPageOnly` is set, follows its local links.
   *
   * @param url          page to start from
   * @param thisPageOnly when true only the start page is returned
   * @return the set of documents that were parsed
   */
  def visit(url: URL, thisPageOnly: Boolean = false): Future[Set[Document]] =
    visitLink(url, Set.empty, thisPageOnly)

  /** Parses one page, then either stops there or continues with the page's links. */
  private def visitLink(url: URL, alreadyVisited: Set[URL], thisPageOnly: Boolean): Future[Set[Document]] =
    Scoup.parse(url.toString).flatMap { page =>
      if (!thisPageOnly) visitLinks(url, page, alreadyVisited)
      else Future.successful(Set(page))
    }

  /**
   * Visits, one after another, every local link of `doc` that is not in `alreadyVisited`,
   * accumulating the resulting documents on top of `doc` itself.
   */
  private def visitLinks(url: URL, doc: Document, alreadyVisited: Set[URL]) = {
    val localHrefs = doc.select("a").filter(isLocal).map(_.attr("href"))
    val pending = localHrefs
      .map(createFullLocalUrl(url))
      .filter(candidate => !alreadyVisited.contains(candidate))

    pending.foldLeft(Future.successful(Set(doc))) { (collected, target) =>
      // The next visit starts only after the accumulated future has completed.
      collected.flatMap { docsSoFar =>
        visitLink(target, alreadyVisited + target, false).map(found => docsSoFar ++ found)
      }
    }
  }

  /** True when the element's `href` attribute starts with "/". */
  private def isLocal(link: Element): Boolean =
    link.attr("href").startsWith("/")

  /** Resolves `link` against `base`. */
  def createFullLocalUrl(base: URL)(link: String): URL =
    new java.net.URL(base, link)
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:47,代码来源:StyleguideSpider.scala

示例10: StylesheetFinder

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt

import com.themillhousegroup.scoup.{ Scoup, ScoupImplicits }
import org.jsoup.nodes.Document

import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global


/** Finds the stylesheet links declared in a document's head. */
object StylesheetFinder extends ScoupImplicits {

  /**
   * Returns the `href` of every `<link>` in the document head whose `rel` attribute
   * is exactly "stylesheet".
   */
  def allStylesheetUrls(doc: Document): Seq[String] = {
    doc.head.select("link").filter { elem =>
      elem.attr("rel") == "stylesheet"
    }.map { elem =>
      elem.attr("href")
    }.toSeq
  }

  /**
   * Returns the stylesheet URLs that point at the document's own site: root-relative
   * ("/css/a.css") and path-relative ("css/a.css") references.
   *
   * Protocol-relative URLs ("//host/a.css") are excluded. They used to be accepted
   * because they do not start with a protocol specifier.
   */
  def localStylesheetUrls(doc: Document): Seq[String] =
    allStylesheetUrls(doc).filter(isLocalUrl)

  /** True when `url` is neither protocol-relative nor prefixed with an http(s) protocol specifier. */
  private def isLocalUrl(url: String): Boolean = {
    // A double slash means protocol-relative, i.e. another host.
    val protocolRelative = url.startsWith("//")
    // Traditional protocol specifiers; the prefixes are kept exactly as they were matched before.
    val hasProtocol = url.startsWith("http:") || url.startsWith("https://")
    !protocolRelative && !hasProtocol
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:29,代码来源:StylesheetFinder.scala

示例11: checkSelector

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.witchhunt.{ExcessiveSpecificityViolation, RuleEnumerator, Violation, ViolationType}
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/** Contract for a check run against one CSS selector, plus a helper for reporting a violation. */
trait WitchhuntViolationCheck {
  /**
   * Examines one selector.
   *
   * @return a violation when the check fails, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation]

  /**
   * Wraps a [[Violation]] for the selector currently in implicit scope.
   *
   * @param vt             kind of violation being reported
   * @param thresholdValue optional limit, passed through to the violation
   * @param violationValue optional offending value, passed through to the violation
   * @return always a defined `Option`
   */
  protected def buildViolation[VT <: ViolationType](vt: VT,
                                                    thresholdValue:Option[Int] = None,
                                                    violationValue:Option[Int] = None)(implicit ruleSet: RuleEnumerator,
                                                                                       selector: String,
                                                                                       lineNumber: Int,
                                                                                       applicablePages: Set[Document]):Option[Violation] = {
    val violation = Violation(
      ruleSet.sourceName,
      ruleSet.sourceUrl,
      lineNumber,
      selector,
      applicablePages.map(_.location),
      vt,
      thresholdValue,
      violationValue
    )
    Some(violation)
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:30,代码来源:WitchhuntViolationCheck.scala

示例12: ExcessiveSpecificityCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration

/**
 * Flags selectors whose specificity exceeds `options.specificityLimit`.
 *
 * @param options supplies the specificity limit
 */
class ExcessiveSpecificityCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * Computes the selector's specificity and compares it with the configured limit.
   *
   * @return a violation carrying the limit and the measured value when the limit is
   *         exceeded, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val specificity = Specificity.calculateSingle(selector)

    // A value equal to the limit is still acceptable.
    if (specificity.asInt <= options.specificityLimit) None
    else
      buildViolation(
        ExcessiveSpecificityViolation,
        Some(options.specificityLimit),
        Some(specificity.asInt)
      )
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:26,代码来源:ExcessiveSpecificityCheck.scala

示例13: ExcessiveColorsCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.{ CSSExpression, CSSDeclaration }

/**
 * Flags the point at which the number of distinct `color` expressions seen so far
 * exceeds `options.colorLimit`.
 *
 * @param options supplies the color limit
 */
class ExcessiveColorsCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {

  val CSS_COLOR_PROP = "color"

  /** Every color expression seen across all calls to `checkSelector` on this instance. */
  val knownColors = scala.collection.mutable.Set[CSSExpression]()

  /**
   * Records the expressions of the `color` declarations in `declarationsWithin`, then
   * compares the running total with the limit.
   *
   * @return a violation carrying the limit and the running total when the limit is
   *         exceeded, otherwise `None`
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    val colorsHere = declarationsWithin.collect {
      case declaration if CSS_COLOR_PROP == declaration.getProperty => declaration.getExpression
    }
    knownColors ++= colorsHere

    val seenSoFar = knownColors.size
    if (seenSoFar <= options.colorLimit) None
    else buildViolation(ExcessiveColorsViolation, Some(options.colorLimit), Some(seenSoFar))
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:27,代码来源:ExcessiveColorsCheck.scala

示例14: UnusedSelectorCheck

//设置package包名称以及导入依赖的类
package com.themillhousegroup.witchhunt.checks

import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt.{ RuleEnumerator, UnusedSelectorViolation, Violation, ViolationType }
import org.jsoup.nodes.Document
import scala._
import scala.Some
import com.themillhousegroup.witchhunt.Violation
import com.helger.css.decl.CSSDeclaration

/** Flags selectors that match no element on any of the supplied pages. */
object UnusedSelectorCheck extends WitchhuntViolationCheck with ScoupImplicits {

  /**
   * Looks for a page on which the selector matches at least one element.
   *
   * @return `None` when such a page exists, otherwise an unused-selector violation
   */
  def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
    // The search stops at the first page with a match.
    val usedSomewhere = applicablePages.exists { stylePage =>
      stylePage.select(selector).nonEmpty
    }
    if (usedSomewhere) None else buildViolation(UnusedSelectorViolation)
  }
}
开发者ID:themillhousegroup,项目名称:witchhunt,代码行数:23,代码来源:UnusedSelectorCheck.scala

示例15: Article

//设置package包名称以及导入依赖的类
package gander

import gander.images.Image
import gander.opengraph.OpenGraphData
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}


/**
 * Immutable record of the data held for one article.
 *
 * It carries both jsoup documents (`doc`, `rawDoc`) and the raw HTML string alongside
 * the values derived from them.
 * NOTE(review): the meaning of each field is inferred from its name only (for example
 * that `doc` is a cleaned version of `rawDoc`) — confirm against the code that builds
 * this class before relying on it.
 */
final case class Article(title: String,
                         cleanedArticleText: Option[String],
                         metaDescription: String,
                         metaKeywords: String,
                         canonicalLink: String,
                         domain: String,
                         topNode: Option[Element],
                         topImage: Option[Image],
                         tags: Set[String],
                         movies: List[Element],
                         finalUrl: String,
                         linkHash: String,
                         rawHtml: String,
                         doc: Document,
                         rawDoc: Document,
                         publishDate: Option[DateTime],
                         additionalData: Map[String, String],
                         openGraphData: OpenGraphData) 
开发者ID:lloydmeta,项目名称:gander,代码行数:27,代码来源:Article.scala


注:本文中的org.jsoup.nodes.Document类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。