本文整理汇总了Scala中org.jsoup.nodes.Document类的典型用法代码示例。如果您正苦于以下问题:Scala Document类的具体用法?Scala Document怎么用?Scala Document使用的例子?那么, 这里精选的类代码示例或许可以为您提供帮助。
示例1: NoticeServiceObjects
package com.zhranklin.homepage.notice
import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
object NoticeServiceObjects {
trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
val initVal: ((Document) ? String, (Document) ? String, String, String)
lazy val (getContent, getDateStr, urlPattern, template) = initVal
class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
val getContent = contentF("div.text")
val getDateStr = dateF("span:contains(????)")
val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"
def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"
override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
val jsonStr = Jsoup.connect(url).execute().body()
val json = jackson.parseJson(jsonStr)
jo ? NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
val serviceList = List(
"???? - ???? - test" ?
"???? - ???? - test" ? "http://sesu.scu.edu.cn/news/list_1_<index>.html",
"???? - ????" ? "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
"????? - ???? - test" ? "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
"????? - ???? - test" ? "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
"????? - ???? - test" ? "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
"????? - ??? - test" ? "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
"???? - test" ? "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
"???? - ????" ?"http://math.scu.edu.cn/news.asp?PAGE=<index>",
"?????? - ????" ? "http://seei.scu.edu.cn/student,p<index>,index.jsp",
"????? - ????" ? "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
).map { tp ?
new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
val template = tp._2
} ++ List(
new NoticeService("??? - ??") with ServiceBase {
val initVal =(selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
"newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
new LawService("????", "8572"),
new LawService("????", "8573")
示例2: IsapReader
package pl.mojepanstwo.sap.toakoma.readers
import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._
object IsapReader {
val BASE_URL = "http://isap.sejm.gov.pl"
val URL = BASE_URL + "/DetailsServlet?id="
class IsapReader(val id: String) extends ItemReader[Document] {
val logger = LoggerFactory.getLogger(this.getClass())
var last = false
/**
 * Fetches the page at `IsapReader.URL + id` the first time it is called.
 *
 * The first call flips `last` to true before fetching, so every later call
 * returns `null` without opening a connection.
 * NOTE(review): `null` is presumably the end-of-input signal expected by the
 * `ItemReader` contract — confirm against the Spring Batch documentation.
 *
 * @return the fetched document on the first call, `null` on every later call
 * @throws NoSuchDocumentException if the page body text contains the
 *                                 "Brak aktu prawnego o podanym adresie publikacyjnym !" message
 */
def read : Document = {
// Already served the single document: report nothing further.
if(last) return null
// Mark as consumed before fetching, so a failed fetch is not retried on the next call.
this.last = true
val isapUrl = IsapReader.URL + id
val rsp = Jsoup.connect(isapUrl).get
// The site answers with a normal page carrying this message when the id is unknown.
if(rsp.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !"))
throw new NoSuchDocumentException
return rsp
示例3: get
package pl.mojepanstwo.sap.toakoma.services
import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup
trait Scraper {
def get(url: String) : Document
def dowloadFile(fileUrl:String, filePath:String) : String
class DefaultScraperService extends Scraper {
val webClient = new WebClient
def get(url: String) : Document = {
webClient.setRefreshHandler(new RefreshHandler {
override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
val apPage: Page = webClient.getPage(url)
def dowloadFile(fileUrl:String, filePath:String) : String = {
val url = new URL(fileUrl)
val tmp = new File(filePath)
FileUtils.copyURLToFile(url, tmp)
示例4: ResourceScraperService
package pl.mojepanstwo.sap.toakoma
import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream
class ResourceScraperService extends Scraper {
def get(url: String) : Document = {
val pattern = ".*id=(.*)&type=([0-9]+).*".r
val pattern(id, docType) = url
Jsoup.parse(Source.fromResource("isap/" + id + "/" + docType + ".html").mkString)
def dowloadFile(fileUrl:String, filePath:String) : String = {
val pattern = ".*id=(.*)&type=([0-9]+).*".r
val pattern(id, docType) = fileUrl
val src = getClass.getResourceAsStream("/isap/" + id + "/" + docType + ".pdf")
val dest = new File(filePath)
val out = new FileOutputStream(dest)
IOUtils.copy(src, out)
示例5: GgleLoginTest
package com.szadowsz.tarbh.ggle
import com.szadowsz.common.net.Uri
import com.szadowsz.maeve.core.MaeveDriver
import com.szadowsz.maeve.core.browser.MaeveConf
import com.szadowsz.maeve.core.instruction.MaeveInstruction
import com.szadowsz.maeve.core.instruction.extractor.JsoupExtractor
import com.szadowsz.maeve.core.instruction.target.single.SingleTarget
import com.szadowsz.maeve.gglegrp.actions.GgleExecutor
import org.jsoup.nodes.Document
object GgleLoginTest {
private val link = ""
private val username: String = ""
private val passwd: String = ""
private val urlOfGrp = Uri(link)
private val groupName: String = ""
private def buildConfig(): MaeveConf = {
.setHTTPProxy("", 0, Nil)
def main(args: Array[String]): Unit = {
System.setProperty("webdriver.chrome.driver", ".\\chromedriver_win32\\chromedriver.exe")
val conf = buildConfig()
val scraper = new MaeveDriver(conf)
val rootTarget = SingleTarget(urlOfGrp)
class TestExtractor extends JsoupExtractor {
override def extract(queryUrl: Uri, returnedUrl: Uri, inst: MaeveInstruction[_], page: Document): Unit = {}
override def shouldContinue(): Boolean = false
val rootFilter = new TestExtractor()
val actions = new GgleExecutor(username, passwd)
val rootInstruction = MaeveInstruction(groupName, rootTarget, actions, rootFilter, "./data/grp/", false, false, false, MaeveConf().setNoProxy())
示例6: compile
package indi.lewis.spider.html
import java.util
import com.google.gson._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
private[html] trait ElementType {
var elementName: String = _;
def compile(doc:Document):JsonElement;
def compile(doc:String):JsonElement=compile(Jsoup.parse(doc));
private[html] case class ModelParent() extends ElementType {
def this(elementName: String) {
this.elementName = elementName;
val properties: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]();
override def compile(doc: Document): JsonElement = {
val ret=new JsonObject
for(i <- 0 to properties.size()-1; o= properties.get(i)){
private[html] case class ModelElement(val elName: String, val f: (Document) => ElementType) extends ElementType {
override def compile(doc: Document): JsonElement = f(doc).compile(doc)
private[html] case class ModelArray(val elName: String) extends ElementType {
val array: java.util.ArrayList[ElementType] = new util.ArrayList[ElementType]();
override def compile(doc: Document): JsonElement = {
val jsonArray=new JsonArray
for(i <- 0 to array.size()-1; o= array.get(i)){
private[html] case class ModelConstant(val elName: String, val value: Object) extends ElementType {
override def compile(doc: Document): JsonElement = if(value!=null)new JsonPrimitive(value.toString) else JsonNull.INSTANCE
示例7: LaporBot
package io.github.asepsaep.laporcrawler.bot
import scala.collection.JavaConverters._
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import io.github.asepsaep.laporcrawler.model.Ticket
case class LaporBot(ticketId: Int) {
private var ticket = new Ticket()
private val url = "" + ticketId
// System.setProperty("socksProxyHost", "")
// System.setProperty("socksProxyPort", "10001")
// System.setProperty("socksProxyVersion", "5")
def crawl(): Option[Ticket] = {
val doc = Jsoup.connect(url).timeout(30000).get()
val maybeTicket = if (doc.getElementsByClass("no-data").isEmpty) Option(parse(doc)) else None
def parse(doc: Document): Ticket = {
val id = ticketId
val title = doc.getElementById("row_Subject").text
val splitContent = doc.getElementById("row_content").text.split(", ")
val content = if (splitContent.length > 1) splitContent.tail.mkString(", ") else doc.getElementById("row_content").text
ticket = ticket.copy(id = id, title = title, content = content)
val details = doc.getElementsByClass("feedback-details").first.getElementsByTag("p").asScala
for (p ? details) {
val span = p.getElementsByTag("span")
span.first.text match {
case "USER:" ? ticket = ticket.copy(user = Some(span.last.text))
case "PLATFORM:" ? ticket = ticket.copy(platform = Some(span.last.text))
case "TANGGAL:" ? ticket = ticket.copy(date = Some(span.last.text))
case "KATEGORI:" ? ticket = ticket.copy(category = Some(span.last.text))
case "AREA:" ? ticket = ticket.copy(area = Some(span.last.text))
case "STATUS:" ? ticket = ticket.copy(status = Some(span.last.text))
case _ ? {}
val dispatchedTo = doc.select(".administrator .comment-content").first.getElementsByTag("p").first.getElementsByTag("span").first.getElementsByTag("b").first.text
ticket = ticket.copy(dispatchedTo = Some(dispatchedTo))
示例8: checkElementAndConvert
package haishu.crawler.selector
import org.jsoup.nodes.{Document, Element}
private def checkElementAndConvert(element: Element): Element = element match {
case d: Document => d
case _ =>
val root = new Document(element.ownerDocument().baseUri())
override def css(selector: String): Selectable = {
val cssSelector = Selectors.css(selector)
override def css(selector: String, attrName: String): Selectable = {
val cssSelector = Selectors.css(selector, attrName)
示例9: StyleguideSpider
package com.themillhousegroup.witchhunt
import org.jsoup.nodes.{ Element, Document }
import scala.concurrent.Future
import com.themillhousegroup.scoup.{ ScoupImplicits, Scoup }
import scala.concurrent.ExecutionContext.Implicits.global
import java.net.URL
object StyleguideSpider extends ScoupImplicits {
def visit(url: URL, thisPageOnly: Boolean = false): Future[Set[Document]] = {
visitLink(url, Set.empty, thisPageOnly)
private def visitLink(url: URL, alreadyVisited: Set[URL], thisPageOnly: Boolean): Future[Set[Document]] = {
Scoup.parse(url.toString).flatMap { doc =>
if (thisPageOnly) {
} else {
visitLinks(url, doc, alreadyVisited)
private def visitLinks(url: URL, doc: Document, alreadyVisited: Set[URL]) = {
val links = doc.select("a").filter(isLocal).map(_.attr("href"))
links.map(createFullLocalUrl(url)).filter(!alreadyVisited.contains(_)).foldLeft(Future.successful(Set(doc))) {
case (acc, link) =>
for {
existingDocs <- acc
newDocs <- visitLink(link, alreadyVisited + link, false)
} yield (existingDocs ++ newDocs)
private def isLocal(link: Element): Boolean = {
val href = link.attr("href")
def createFullLocalUrl(base: URL)(link: String): URL = {
(new java.net.URL(base, link))
示例10: StylesheetFinder
package com.themillhousegroup.witchhunt
import com.themillhousegroup.scoup.{ Scoup, ScoupImplicits }
import org.jsoup.nodes.Document
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
object StylesheetFinder extends ScoupImplicits {
def allStylesheetUrls(doc: Document): Seq[String] = {
doc.head.select("link").filter { elem =>
elem.attr("rel") == "stylesheet"
}.map { elem =>
/**
 * Filters the result of `allStylesheetUrls(doc)` down to the URLs that the
 * predicate below treats as local.
 *
 * A URL is kept when either clause holds:
 *  1. it starts with exactly one slash, or
 *  2. it starts with neither "http:" nor "https://".
 *
 * NOTE(review): a protocol-relative URL such as "//host/a.css" fails clause 1
 * but still satisfies clause 2, so it is kept — confirm whether that is intended.
 * NOTE(review): the two prefixes differ in form ("http:" vs "https://") — confirm.
 */
def localStylesheetUrls(doc: Document): Seq[String] = {
allStylesheetUrls(doc).filter { url =>
// It starts with a single slash ONLY (a double slash means protocol-relative)
(url.startsWith("/") && !url.startsWith("//")) ||
// It doesn't start with a traditional protocol specifier
!(url.startsWith("http:") || url.startsWith("https://"))
示例11: checkSelector
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.witchhunt.{ExcessiveSpecificityViolation, RuleEnumerator, Violation, ViolationType}
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration
trait WitchhuntViolationCheck {
def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation]
protected def buildViolation[VT <: ViolationType](vt: VT,
thresholdValue:Option[Int] = None,
violationValue:Option[Int] = None)(implicit ruleSet: RuleEnumerator,
selector: String,
lineNumber: Int,
applicablePages: Set[Document]):Option[Violation] = {
示例12: ExcessiveSpecificityCheck
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.CSSDeclaration
class ExcessiveSpecificityCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {
// Return a violation if the selector is more specific than the configured limit
def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
val result = Specificity.calculateSingle(selector)
if (result.asInt > options.specificityLimit) {
} else {
示例13: ExcessiveColorsCheck
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt._
import org.jsoup.nodes.Document
import com.helger.css.decl.{ CSSExpression, CSSDeclaration }
class ExcessiveColorsCheck(options: WitchhuntOptions) extends WitchhuntViolationCheck with ScoupImplicits {
val CSS_COLOR_PROP = "color"
val knownColors = scala.collection.mutable.Set[CSSExpression]()
// Return a violation if the total number of colors defined exceeds the configured limit
def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
knownColors ++= declarationsWithin.filter(CSS_COLOR_PROP == _.getProperty).map { declaration =>
if (knownColors.size > options.colorLimit) {
buildViolation(ExcessiveColorsViolation, Some(options.colorLimit), Some(knownColors.size))
} else {
示例14: UnusedSelectorCheck
package com.themillhousegroup.witchhunt.checks
import com.themillhousegroup.scoup.ScoupImplicits
import com.themillhousegroup.witchhunt.{ RuleEnumerator, UnusedSelectorViolation, Violation, ViolationType }
import org.jsoup.nodes.Document
import scala._
import scala.Some
import com.themillhousegroup.witchhunt.Violation
import com.helger.css.decl.CSSDeclaration
object UnusedSelectorCheck extends WitchhuntViolationCheck with ScoupImplicits {
// Return a violation if there is no element matching the selector in ANY of the supplied pages
def checkSelector(implicit ruleSet: RuleEnumerator, selector: String, lineNumber: Int, declarationsWithin: Seq[CSSDeclaration], applicablePages: Set[Document]): Option[Violation] = {
// As soon as we find an element that matches the selector, we can stop:
applicablePages.find { stylePage =>
)(_ => None)
示例15: Article
package gander
import gander.images.Image
import gander.opengraph.OpenGraphData
import org.joda.time.DateTime
import org.jsoup.nodes.{Document, Element}
/**
 * Immutable record of the data held for a single article.
 *
 * NOTE(review): the parameter descriptions below are inferred from names and
 * types only; the code that populates them is not visible here — confirm.
 *
 * @param title              article title
 * @param cleanedArticleText article text after cleaning, if any
 * @param metaDescription    presumably the page's meta description
 * @param metaKeywords       presumably the page's meta keywords
 * @param canonicalLink      presumably the page's canonical link
 * @param domain             presumably the domain the article came from
 * @param topNode            optional element, presumably the root of the main content
 * @param topImage           optional image, presumably the main image
 * @param tags               set of tag strings
 * @param movies             list of elements, presumably embedded videos
 * @param finalUrl           presumably the URL after redirects
 * @param linkHash           presumably a hash derived from the link
 * @param rawHtml            presumably the unprocessed HTML source
 * @param doc                a jsoup document, presumably the processed one
 * @param rawDoc             a jsoup document, presumably the unprocessed one
 * @param publishDate        optional publication date
 * @param additionalData     extra string key/value pairs
 * @param openGraphData      Open Graph data for the page
 */
final case class Article(title: String,
cleanedArticleText: Option[String],
metaDescription: String,
metaKeywords: String,
canonicalLink: String,
domain: String,
topNode: Option[Element],
topImage: Option[Image],
tags: Set[String],
movies: List[Element],
finalUrl: String,
linkHash: String,
rawHtml: String,
doc: Document,
rawDoc: Document,
publishDate: Option[DateTime],
additionalData: Map[String, String],
openGraphData: OpenGraphData)