本文整理汇总了Scala中org.jsoup.Jsoup类的典型用法代码示例。如果您正苦于以下问题：Scala Jsoup类的具体用法？Scala Jsoup怎么用？Scala Jsoup使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Jsoup类的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Scala代码示例。
示例1: HtmlConcatCompiler
//设置package包名称以及导入依赖的类
package com.karasiq.scalajsbundler.compilers
import com.karasiq.scalajsbundler.ScalaJSBundler.PageTypedContent
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import scala.collection.JavaConversions._
/**
  * Asset compiler that merges several HTML documents into a single one by
  * concatenating their `<head>` and `<body>` elements.
  */
object HtmlConcatCompiler extends AssetCompiler {
  private implicit class ElementOps(val e: Element) extends AnyVal {
    /**
      * Merges `src` into this element: `class` and `style` attributes are
      * joined with the existing value, every other attribute of `src`
      * overwrites the existing one, and the inner HTML of `src` is appended.
      */
    def concatWith(src: Element): Unit = {
      // Joins two attribute values with `delimiter`. No delimiter is inserted when the
      // existing value is empty (attribute not set yet) or already ends with it, so the
      // merged value never starts with a stray " " or ";".
      @inline
      def delimit(delimiter: String, s1: String, s2: String): String = {
        if (s1.isEmpty || s1.endsWith(delimiter)) s1 + s2
        else s1 + delimiter + s2
      }

      src.attributes().foreach {
        case a if a.getKey == "class" =>
          e.attr(a.getKey, delimit(" ", e.attr(a.getKey), a.getValue))
        case a if a.getKey == "style" =>
          e.attr(a.getKey, delimit(";", e.attr(a.getKey), a.getValue))
        case a => // Replaces attribute value
          e.attr(a.getKey, a.getValue)
      }
      e.append(src.html())
    }
  }

  /**
    * Parses every entry of `htmlList` and folds the documents into the first one.
    *
    * @param htmlList HTML sources; the first one is the base document
    * @return outer HTML of the merged document
    * @throws NoSuchElementException if `htmlList` is empty
    */
  def concat(htmlList: Seq[String]): String = {
    val result = Jsoup.parse(htmlList.head)
    htmlList.tail.foreach { h =>
      val html = Jsoup.parse(h)
      result.head().concatWith(html.head())
      result.body().concatWith(html.body())
    }
    result.outerHtml()
  }

  /** Concatenates the string form of every asset in `contents`. */
  override def compile(contents: Seq[PageTypedContent]): String = {
    concat(contents.map(_.asset.asString))
  }
}
示例2: ScheduleDownloadActor
//设置package包名称以及导入依赖的类
package logic.actors.schedule
import java.nio.charset.StandardCharsets
import javax.inject._
import akka.actor.{Actor, ActorRef}
import helpers.SpiritHelper
import logic.actors.schedule.ScheduleDownloadActor.DownloadSchedule
import logic.actors.schedule.ScheduleParseActor._
import org.fhs.spirit.scheduleparser.enumerations.EScheduleKind
import org.jsoup.Jsoup
import play.api.libs.ws.WSClient
import scala.collection.JavaConversions._
import scala.concurrent.Await
import scala.concurrent.duration._
/**
  * Actor that downloads the regular (per-course) and block schedule pages and
  * hands them to the parse actor as one `ParseSchedule` message.
  *
  * NOTE(review): every download blocks the actor thread via `Await.result`;
  * consider composing the futures and piping the result instead.
  */
@Singleton
class ScheduleDownloadActor @Inject()(ws: WSClient, @Named("parseActor") parseActor: ActorRef) extends Actor with SpiritHelper {

  /** Upper bound for a single HTTP request. */
  private val downloadTimeout = 10.seconds

  override def receive: Receive = {
    case DownloadSchedule =>
      val baseUrl = configuration.underlying.getString("schedule.baseUrl")

      // One page per course, named "s_<course>.html"; pages answering 404 are skipped.
      val lectureResults = uncachedCourseNames
        .map(courseName => downloadNormalizedHtml(baseUrl + "s_" + courseName + ".html").map(html => (html, courseName)))
        .collect { case Some(page) => page }
        .map(rs => (EScheduleKind.REGULAR, rs))

      // The block index lists the block schedule pages as plain links.
      val bindex = Jsoup.parse(decodeLatin1(awaitResponse(baseUrl + "bindex.html")))
      val blockRefs = bindex.select("a").map(_.attr("href")).toSet

      val blockResult = blockRefs
        .map(block => downloadNormalizedHtml(baseUrl + block).map(html => (html, block)))
        .collect { case Some(page) => page }
        .map(rs => (EScheduleKind.BLOCK, rs))

      parseActor ! ParseSchedule(lectureResults ++ blockResult)
  }

  /** Issues a GET request and blocks until the response arrives or the timeout expires. */
  private def awaitResponse(url: String): play.api.libs.ws.WSResponse =
    Await.result(ws.url(url).get(), downloadTimeout)

  /** Decodes the raw response body as ISO-8859-1. */
  private def decodeLatin1(response: play.api.libs.ws.WSResponse): String =
    response.bodyAsBytes.decodeString(StandardCharsets.ISO_8859_1.toString)

  /**
    * Downloads `url` and returns its HTML after a round trip through Jsoup.
    *
    * @return `None` when the server answers 404, otherwise the re-serialized document
    */
  private def downloadNormalizedHtml(url: String): Option[String] = {
    val response = awaitResponse(url)
    if (response.status != 404) Some(Jsoup.parse(decodeLatin1(response)).toString)
    else None
  }
}
示例3: MainPageResponseParser
//设置package包名称以及导入依赖的类
package bridgeapp.crawler.parsers
import java.net.URL
import akka.actor.{Props, ActorSystem, Actor, ActorRef}
import bridgeapp.crawler.Config
import bridgeapp.crawler.execution.{Response, ResponseParser}
import bridgeapp.crawler.storage.{DiskForumsStorage, ForumsStorage}
import com.typesafe.scalalogging.LazyLogging
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
/** [[ResponseParser]] that hands every received response over to a parsing actor. */
class MainPageResponseParser(parser: ActorRef) extends ResponseParser {

  /** Sends `response` to the wrapped actor. */
  override def ->(response: Response): Unit = {
    parser ! response
  }
}

object MainPageResponseParser {

  /**
    * Starts a [[MainPageParser]] actor in `actorSystem` and wraps its reference.
    *
    * The storage is created inside the `Props` creator, exactly as the actor
    * itself is, so it is built whenever the actor instance is created.
    */
  def apply()(implicit actorSystem: ActorSystem): MainPageResponseParser = {
    val parserProps = Props(new MainPageParser(ForumsStorage()))
    val parserRef = actorSystem.actorOf(parserProps)
    new MainPageResponseParser(parserRef)
  }
}
/**
  * Actor that extracts forum ids from the links of a downloaded main page and
  * writes them to the forums storage.
  */
class MainPageParser(forumsListStorage: ForumsStorage) extends Actor with LazyLogging {

  override def receive: Receive = {
    case response: Response =>
      val charset = response.charset.getOrElse("utf-8")
      val body = new String(response.body, charset)
      // The response URI is the base used to resolve relative links ("abs:href").
      val document = Jsoup.parse(body, response.uri.toString)
      val forumLink = document.select("[href^=viewforum.php]").asScala.toArray
      logger.trace(s" Total url: ${forumLink.length}")

      val forumsIds: Array[Int] = forumLink
        .map(link => forumId(link.attr("abs:href")))
        .collect { case Some(id) => id }

      logger.trace(s" Extracted forums ids: ${forumsIds.length}")
      forumsListStorage.write(forumsIds, Config.forumsStorageURI)(context.dispatcher)
  }

  /**
    * Reads the `f` query parameter of `href` as a forum id.
    *
    * @return the id, `Some(0)` when the link has no `f` parameter, or `None`
    *         when the link is not a valid URL or the id is not a number (such
    *         links are logged and skipped instead of failing the whole page)
    */
  private def forumId(href: String): Option[Int] = {
    val parsed = scala.util.Try {
      // getQuery returns null for URLs without a query part.
      val query = Option(new URL(href).getQuery).getOrElse("")
      val params = query.split("&").map(_.split("=")).collect {
        // Parameters without a value ("f" or "f=") carry no id and are ignored.
        case Array(key, value, _*) => key -> value
      }.toMap
      params.getOrElse("f", "0").toInt
    }
    parsed match {
      case scala.util.Success(id) => Some(id)
      case scala.util.Failure(error) =>
        logger.debug(s"Skipping malformed forum link: $href", error)
        None
    }
  }
}
示例4: MALImage
//设置package包名称以及导入依赖的类
package me.abarrow.ScalaSubNet.mal
import java.io.File
import java.io.FileOutputStream
import java.net.URL
import org.jsoup.Jsoup
import org.jsoup.parser.Parser
import java.nio.channels.Channels
/** Downloads the main image of an anime page. */
object MALImage {

  /**
    * Fetches the anime page for `animeID`, locates its `img.ac` element and
    * stores the referenced image in `imagePath`.
    *
    * @param animeID   id appended to `MALURLs.MAL_ANIME_PAGE_PREFIX`
    * @param imagePath destination file, overwritten if it exists
    * @return `false` when the page has no `img.ac` element, `true` once the image is written
    */
  def saveMainImage(animeID: Int, imagePath: File): Boolean = {
    val doc = Jsoup.parse(new URL(MALURLs.MAL_ANIME_PAGE_PREFIX + animeID.toString()), 60000)
    // first() returns null when nothing matches the selector.
    Option(doc.select("img.ac").first()) match {
      case None => false
      case Some(mainImage) =>
        val rbc = Channels.newChannel(new URL(mainImage.attr("src")).openStream())
        // Nested try/finally: the channel is released even when opening or closing
        // the output file fails.
        try {
          val fos = new FileOutputStream(imagePath)
          try {
            fos.getChannel().transferFrom(rbc, 0, Long.MaxValue)
          } finally {
            fos.close()
          }
        } finally {
          rbc.close()
        }
        true
    }
  }
}
示例5: MALList
//设置package包名称以及导入依赖的类
package me.abarrow.ScalaSubNet.mal
import org.jsoup.Jsoup
import org.jsoup.parser.Parser
import collection.JavaConverters._
/** Holder for the entries of one anime list. */
class MALList(val entries: Array[MALEntry])
/** Loader for anime lists served as XML. */
object MALList {
  private val MAL_LIST_SUFFIX = "&status=all&type=anime"
  private val xmlParser = Parser.xmlParser()

  /**
    * Downloads the list of `userId` and converts every `anime` element into a
    * [[MALEntry]].
    *
    * Each entry must contain the `series_animedb_id`, `series_title`,
    * `my_score` and `my_status` tags; the numeric ones are parsed with `toInt`.
    */
  def getListByUser(userId: String): MALList = {
    val requestUrl = MALURLs.MAL_LIST_PREFIX + userId + MAL_LIST_SUFFIX
    val listXML = Jsoup.connect(requestUrl).parser(xmlParser).get()

    val parsedEntries = for (anime <- listXML.getElementsByTag("anime").asScala) yield {
      // Inner markup of the first child element with the given tag name.
      def field(tag: String): String = anime.getElementsByTag(tag).first().html()

      new MALEntry(
        field("series_animedb_id").toInt,
        field("series_title"),
        field("my_score").toInt,
        field("my_status").toInt)
    }
    new MALList(parsedEntries.toArray)
  }
}
示例6: NoticeServiceObjects
//设置package包名称以及导入依赖的类
package com.zhranklin.homepage.notice
import org.json4s._
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
/**
  * Registry of the configured notice services.
  *
  * NOTE(review): the runs of `?` inside the string literals below are
  * non-ASCII text that was lost to a bad encoding before this file was
  * captured. They are kept verbatim and must be restored from the original
  * source. The `?` inside the URLs are genuine query-string separators.
  */
object NoticeServiceObjects {

  /**
    * Base for services configured through a single tuple:
    * (content extractor, date extractor, url pattern, index url template).
    */
  trait ServiceBase extends IndexService with FunNoticeFetcher with SelectorUrlService {
    val initVal: ((Document) => String, (Document) => String, String, String)
    // Lazy so that `initVal`, defined by the concrete service, is initialised first.
    lazy val (getContent, getDateStr, urlPattern, template) = initVal
  }

  /**
    * Service whose index pages are JSON documents instead of HTML.
    *
    * @param title  suffix of the source name
    * @param listId value of the `nextcid` parameter in the generated detail urls
    */
  class LawService(title: String, listId: String) extends NoticeService(s"??? - $title") with UrlService with IndexService with FunNoticeFetcher {
    val getContent = contentF("div.text")
    val getDateStr = dateF("span:contains(????)")
    val template = "http://law.scu.edu.cn/xjax?arg=8573&arg=<index>&arg=20&arg=list&clazz=PortalArticleAction&method=list"

    /** Detail page url of the article with the given id. */
    def getUrl(id: String) = s"http://law.scu.edu.cn/detail.jsp?portalId=725&cid=8385&nextcid=$listId&aid=$id"

    /** Reads the `data` array of the JSON index and maps each item's `id` and `subject` to an entry. */
    override def noticeUrlsFromUrl(url: String): Iterable[NoticeEntry] = {
      val jsonStr = Jsoup.connect(url).execute().body()
      val json = jackson.parseJson(jsonStr)
      json.\("data").asInstanceOf[JArray].arr.map(
        jo => NoticeEntry(getUrl(jo.\("id").values.toString), Some(jo.\("subject").values.toString)))
    }
  }

  /** All services: template-only ones first, then the specially configured ones. */
  val serviceList = List(
    "???? - ???? - test" ->
      "http://www.sculj.cn/Special_News.asp?SpecialID=40&SpecialName=%D1%A7%D4%BA%B6%AF%CC%AC&page=<index>",
    "???? - ???? - test" -> "http://sesu.scu.edu.cn/news/list_1_<index>.html",
    "???? - ????" -> "http://sesu.scu.edu.cn/gonggao/list_2_<index>.html",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xsky/xskb/H951901index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xytz/H9502index_<index>.htm",
    "????? - ???? - test" -> "http://cs.scu.edu.cn/cs/xyxw/H9501index_<index>.htm",
    "????? - ??? - test" -> "http://cs.scu.edu.cn/cs/fwzy/ftl/H951204index_<index>.htm",
    "???? - test" -> "http://news.scu.edu.cn/news2012/cdzx/I0201index_<index>.htm",
    "???? - ????" -> "http://math.scu.edu.cn/news.asp?PAGE=<index>",
    "?????? - ????" -> "http://seei.scu.edu.cn/student,p<index>,index.jsp",
    "????? - ????" -> "http://flc2.scu.edu.cn/foreign/a/xueyuangonggao/list_27_<index>.html"
  ).map { tp =>
    new NoticeService(tp._1) with UniversalUrlService with UniversalNoticeFetcher with IndexService {
      val template = tp._2
    }
  } ++ List(
    new NoticeService("??? - ??") with ServiceBase {
      val initVal = (selectorF("input[name=news.content]")(_.first.attr("value")), dateF("table[width=900] td:contains(????)"),
        "newsShow.*", "http://jwc.scu.edu.cn/jwc/moreNotice.action?url=moreNotice.action&type=2&keyWord=&pager.pageNow=<index>")},
    new LawService("????", "8572"),
    new LawService("????", "8573")
  )
}
示例7: IsapReader
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.readers
import org.slf4j.LoggerFactory
import org.springframework.batch.item.ItemReader
import org.jsoup.Jsoup
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit.WebClient
import pl.mojepanstwo.sap.toakoma._
/** URL constants of the ISAP service. */
object IsapReader {
  /** Scheme and host of the service. */
  val BASE_URL = "http://isap.sejm.gov.pl"

  /** Details servlet address; the document id is appended directly to it. */
  val URL = s"$BASE_URL/DetailsServlet?id="
}
/**
  * Item reader that yields the details page of the act `id` exactly once and
  * `null` on every later call.
  */
class IsapReader(val id: String) extends ItemReader[Document] {
  val logger = LoggerFactory.getLogger(this.getClass())

  /** Becomes `true` once the single document has been requested. */
  var last = false

  /**
    * @return the downloaded page on the first call, `null` afterwards
    * @throws NoSuchDocumentException when the page reports that no act exists for the id
    */
  def read : Document = {
    logger.trace("read")
    if (last) {
      null
    } else {
      last = true
      val page = Jsoup.connect(IsapReader.URL + id).get
      val actMissing = page.body.text.contains("Brak aktu prawnego o podanym adresie publikacyjnym !")
      if (actMissing) throw new NoSuchDocumentException
      page
    }
  }
}
示例8: DefaultScraperService
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma.services
import java.net.URL
import java.io.File
import org.apache.commons.io.FileUtils
import org.jsoup.nodes.Document
import com.gargoylesoftware.htmlunit._
import org.jsoup.Jsoup
/** Access to remote pages and files. */
trait Scraper {

  /** Retrieves the document addressed by `url`. */
  def get(url: String): Document

  /**
    * Stores the file addressed by `fileUrl` at `filePath`.
    *
    * @return a path to the stored file
    */
  def dowloadFile(fileUrl: String, filePath: String): String
}
/** [[Scraper]] that loads pages through an HtmlUnit `WebClient`. */
class DefaultScraperService extends Scraper {
  val webClient = new WebClient

  /**
    * Loads `url` with the web client and parses the response content with
    * Jsoup. A refresh handler that loads the refresh target is installed on
    * the client before every request.
    */
  def get(url: String) : Document = {
    val followRefresh = new RefreshHandler {
      override def handleRefresh(page: Page, url: URL, i: Int): Unit = webClient.getPage(url)
    }
    webClient.setRefreshHandler(followRefresh)

    val loaded: Page = webClient.getPage(url)
    val markup = loaded.getWebResponse.getContentAsString
    Jsoup.parse(markup)
  }

  /**
    * Copies the content behind `fileUrl` into the file `filePath`.
    *
    * @return absolute path of the written file
    */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val source = new URL(fileUrl)
    val target = new File(filePath)
    FileUtils.copyURLToFile(source, target)
    target.getAbsolutePath()
  }
}
示例9: ResourceScraperService
//设置package包名称以及导入依赖的类
package pl.mojepanstwo.sap.toakoma
import pl.mojepanstwo.sap.toakoma.services.Scraper
import org.jsoup.nodes.Document
import org.jsoup.Jsoup
import scala.io.Source
import java.io.File
import java.nio.file.Files
import org.apache.commons.io.IOUtils
import java.io.FileOutputStream
/**
  * [[Scraper]] that serves documents from classpath resources under `isap/`
  * instead of the network. The resource is selected by the `id` and `type`
  * parameters of the requested url.
  */
class ResourceScraperService extends Scraper {

  /** Captures the `id` and numeric `type` parameters of a request url. */
  private val resourceUrl = ".*id=(.*)&type=([0-9]+).*".r

  /**
    * Parses the resource `isap/<id>/<type>.html`.
    *
    * @throws MatchError if `url` does not contain `id=...&type=<digits>`
    */
  def get(url: String) : Document = {
    val resourceUrl(id, docType) = url
    val source = Source.fromResource("isap/" + id + "/" + docType + ".html")
    // Source keeps the underlying stream open until it is closed explicitly.
    try {
      Jsoup.parse(source.mkString)
    } finally {
      source.close()
    }
  }

  /**
    * Copies the resource `/isap/<id>/<type>.pdf` to `filePath`.
    *
    * @return absolute path of the written file
    * @throws MatchError if `fileUrl` does not contain `id=...&type=<digits>`
    * @throws java.io.FileNotFoundException if the resource is not on the classpath
    */
  def dowloadFile(fileUrl:String, filePath:String) : String = {
    val resourceUrl(id, docType) = fileUrl
    val resourcePath = "/isap/" + id + "/" + docType + ".pdf"
    // getResourceAsStream signals a missing resource with null; fail before the
    // destination file is created.
    val src = Option(getClass.getResourceAsStream(resourcePath)).getOrElse(
      throw new java.io.FileNotFoundException(s"Classpath resource not found: $resourcePath"))
    val dest = new File(filePath)
    try {
      val out = new FileOutputStream(dest)
      try {
        IOUtils.copy(src, out)
      } finally {
        out.close()
      }
    } finally {
      src.close()
    }
    dest.getAbsolutePath
  }
}
示例10: NoticeService
//设置package包名称以及导入依赖的类
package com.zhranklin.notice.service
import java.util.Date
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
import scala.util._
/**
  * Generates index page urls by substituting successive values for the
  * `<index>` placeholder of [[template]].
  */
trait IndexService {
  /** Url template containing the literal placeholder `<index>`. */
  val template: String

  /** Url of the first index page. */
  def first: String = rawIndices.head

  /** First value substituted for the placeholder. */
  protected def firstIndex: Int = 1

  /** Infinite, lazily evaluated sequence `i, i + 1, i + 2, ...`. */
  protected def valueStream(i: Int): Stream[Int] = Stream.from(i)

  /** Values substituted for the placeholder, in page order. */
  protected def indexNums: Iterable[Any] = valueStream(firstIndex)

  /**
    * Replaces every `<index>` in the template with `value`.
    *
    * Uses the literal `replace`: with `replaceAll` a `$` or `\` in the value
    * would be read as a group reference or escape and fail or corrupt the url.
    */
  protected def interpolate(value: Any): String = template.replace("<index>", value.toString)

  /** Index urls produced directly from [[indexNums]]. */
  def rawIndices: Iterable[String] = indexNums map interpolate

  /** Index urls with the first page taken from [[first]], which subclasses may override. */
  def indexUrls: Iterable[String] = Stream(first) ++ rawIndices.drop(1)
}
/** A fetched notice together with helpers that post-process its HTML. */
case class Notice(url: String, title: String, html: String, date: Date) {

  /** The notice HTML re-serialized without any `width` or `height` attributes. */
  def widthlessHtml = withoutAttributes("width", "height")

  /** The notice HTML re-serialized without any `width`, `height` or `style` attributes. */
  def stylelessHtml = withoutAttributes("width", "height", "style")

  /** The `src` values of all images that declare one. */
  def imgs = Jsoup.parse(html).select("img[src]").asScala.map(_.attr("src"))

  /** Parses the HTML, drops each named attribute from every element carrying it and serializes the result. */
  private def withoutAttributes(names: String*): String = {
    val doc = Jsoup.parse(html)
    for {
      name <- names
      element <- doc.select(s"*[$name]").asScala
    } element.removeAttr(name)
    doc.toString
  }
}
/**
  * Reference to a notice found on an index page.
  *
  * @param url   address of the notice
  * @param title title taken from the index page, when available
  */
case class NoticeEntry(
  url: String,
  title: Option[String] = None
)
/**
  * A source of notices: walks the index pages, collects the notice entries
  * listed on each and fetches every notice. A failure of a single page or
  * notice is kept as a `Failure` instead of aborting the traversal.
  *
  * @param source name of the notice source
  */
abstract class NoticeService(val source: String) extends UrlService with IndexService with NoticeFetcher {

  /** Entries of all index pages; a page that cannot be read contributes one `Failure`. */
  def getUrls: Iterable[Try[NoticeEntry]] = indexUrls.map(i => Try(noticeUrlsFromUrl(i))).flatMap {
    case Success(urls) => urls map Success.apply
    case Failure(t) => Iterable(Failure(t))
  }

  /** The fetched notice for every entry of [[getUrls]], failures preserved. */
  def notices: Iterable[Try[Notice]] = getUrls.map(_.flatMap(u => Try(fetch(u))))

  /**
    * Evaluates the window `[offset, offset + limit)` of [[notices]].
    *
    * @return the notices fetched successfully and the errors of the failed
    *         ones, each in traversal order; one error per exception class is logged
    */
  def noticesWithErr(limit: Int, offset: Int): (List[Notice], List[Throwable]) = {
    val attempts = notices.slice(offset, offset + limit).toList
    // Pattern matching instead of partition + asInstanceOf keeps the split type-safe.
    val successes = attempts.collect { case Success(notice) => notice }
    val failures = attempts.collect { case Failure(error) => error }
    failures.groupBy(_.getClass.getSimpleName).map(_._2.head).foreach(t => log.i("error when fetching news", t))
    (successes, failures)
  }
}
示例11: SearchControllerTest
//设置package包名称以及导入依赖的类
package controllers
import model.{Runway, Airport, Country, SearchResult}
import org.jsoup.Jsoup
import org.scalatest.concurrent.ScalaFutures
import org.scalatest.mock.MockitoSugar
import org.scalatest.{Matchers, FunSpec}
import play.api.test.FakeRequest
import services.SearchService
import org.mockito.Mockito._
import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global
/** Spec for the search controller, backed by a mocked search service. */
class SearchControllerTest extends FunSpec with Matchers with MockitoSugar with ScalaFutures {

  describe("Search Controller") {
    it("should generate search results page for given search term") {
      new Setup {
        when(mockSearchService.searchCountriesByNameOrCountryCode("aus"))
          .thenReturn(Future(expectedSearchResult))

        val result = searchController.searchByCountry("aus")(FakeRequest()).futureValue

        result.header.status shouldBe 200
        // NOTE(review): the row text comes from rendering the template directly in
        // Setup, not from the body of `result`.
        expectedFirstRow shouldBe "Australia AUS Melbourne Airport small CONCRETE 1"
      }
    }
  }

  /** Fixture: controller wired to a mock service plus one canned search result. */
  trait Setup {
    val mockSearchService = mock[SearchService]
    val searchController = new SearchController(mockSearchService)
    val expectedSearchResult: Vector[SearchResult] = Vector(
      SearchResult(Country("Australia","AUS"), Airport("Melbourne Airport","small"), Runway("CONCRETE",1)))

    /** Text of all cells in the first table row of the rendered results template. */
    val expectedFirstRow = Jsoup
      .parse(views.html.search_results(expectedSearchResult.toList).body)
      .select("table > tbody > tr:nth-child(1) td")
      .text()
  }
}
示例12: ReportsControllerTest
//设置package包名称以及导入依赖的类
package controllers
import model._
import org.jsoup.Jsoup
import org.scalatest.concurrent.ScalaFutures
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FunSpec, Matchers}
import play.api.test.FakeRequest
import services.ReportService
import org.mockito.Mockito._
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.Future
/** Spec for the reports controller, backed by a mocked report service. */
class ReportsControllerTest extends FunSpec with Matchers with MockitoSugar with ScalaFutures {

  describe("Reports Controller") {
    it("should display country name and count of airports in country") {
      new Setup {
        val result = reportController.getCountriesWithHighestNoOfAirports(FakeRequest()).futureValue

        result.header.status shouldBe 200

        // NOTE(review): the row text comes from rendering the template directly,
        // not from the body of `result`.
        val expectedFirstRow = Jsoup
          .parse(views.html.report("Some Title",expectedSearchResult.toList).body)
          .select("table > tbody > tr:nth-child(1) td")
          .text()
        expectedFirstRow shouldBe "Australia 100"
      }
    }
  }

  /** Fixture: controller wired to a mock service that returns one canned report row. */
  trait Setup {
    val mockReportService = mock[ReportService]
    val reportController = new ReportsController(mockReportService)
    val expectedSearchResult: Vector[CountryReport] = Vector(
      CountryReport(Country("Australia","AUS"),100))

    when(mockReportService.findCountriesWithHighestNoOfAirports)
      .thenReturn(Future(expectedSearchResult))
  }
}
示例13: Crawler
//设置package包名称以及导入依赖的类
package pl.krix.scalacrawl
import java.net.URI
import org.jsoup.Jsoup
import scala.collection.JavaConversions._
/** Minimal same-domain crawler that prints every link it follows. */
object Crawler {

  /**
    * Host of `URL` without a leading "www.".
    *
    * @return `None` when `URL` has no host or is not a syntactically valid URI
    *         (malformed links on a crawled page must not abort the crawl)
    */
  def getDomain(URL: String): Option[String] =
    scala.util.Try(new URI(URL)).toOption
      .flatMap(uri => Option(uri.getHost)) // getHost returns null for host-less URIs
      .map(_.stripPrefix("www."))

  /**
    * Downloads `URL`, prints each unvisited same-domain link and crawls it
    * recursively.
    *
    * NOTE(review): `visited` only grows along the current path (`visited + URL`
    * is passed down, never shared between sibling links), so a page reachable
    * through several paths is crawled once per path.
    *
    * @param URL      page to crawl
    * @param visited  urls that must not be followed again
    * @param interval pause before the request, in milliseconds
    */
  def crawl(URL: String, visited: Set[String], interval: Int): Unit = {
    Thread.sleep(interval) // sleep before launching a request
    val ownDomain = getDomain(URL) // invariant for all links of this page
    Jsoup.connect(URL)
      .get()
      .select("a[href]")
      .map(_.attr("abs:href")) // absolute form of the link
      .filter(!_.isEmpty()) // weed out empty ones
      .filter(getDomain(_) == ownDomain) // links from the same domain only
      .filter(!visited.contains(_)) // unvisited links only
      .foreach { link: String =>
        println(URL + " --> " + link)
        crawl(link, visited + URL, interval)
      }
  }

  /** Prints the command line usage. */
  def printHelp(): Unit = {
    println("USAGE: sbt \"run [URL] [TIME INTERVAL BETWEEN REQUESTS]\"")
  }

  /**
    * Entry point: `args(0)` is the start url, `args(1)` the interval between
    * requests in seconds. Prints the usage when an argument is missing or the
    * interval is not an integer.
    */
  def main(args: Array[String]): Unit = {
    val intervalSeconds =
      if (args.length < 2) None
      else scala.util.Try(args(1).toInt).toOption
    intervalSeconds match {
      case Some(seconds) => crawl(args(0), Set[String](args(0)), seconds * 1000)
      case None => printHelp()
    }
  }
}
示例14: LinkExtractor
//设置package包名称以及导入依赖的类
package wipro.crawler.util
import org.jsoup.Jsoup
import scala.collection.JavaConverters._
/** Depth-limited crawler that records every url it has visited. */
class LinkExtractor {
  /** Visited urls, most recent first. */
  var crawledLinks : List[String] = List.empty[String]

  /**
    * Downloads `url` and returns the distinct raw `href` values of its anchors.
    * The request is made with `timeout(0)`.
    */
  def getAllPageLinks(url : String) = {
    val anchors = Jsoup.connect(url).timeout(0).get().select("a[href]")
    anchors.iterator().asScala.map(_.attr("href")).toSeq.distinct
  }

  /** Keeps the non-null, non-empty links that contain `baseUrl`. */
  def filterLinks(links : Seq[String],baseUrl : String) = {
    links
      .filter(candidate => candidate != null && candidate.length > 0)
      .filter(_.contains(baseUrl))
  }

  /**
    * Records `url` as crawled and, when it contains `baseUrl`, recurses into
    * every link of the page. Nothing happens when `url` was already crawled
    * or `depth` has reached `maxDepth`.
    */
  def crawlDomainLinks(url : String,depth : Int,maxDepth : Int,baseUrl : String) : Unit = {
    val unseen = !crawledLinks.contains(url)
    if (unseen && depth < maxDepth) {
      crawledLinks ::= url
      if (url.contains(baseUrl)) {
        getAllPageLinks(url).foreach { next =>
          crawlDomainLinks(next, depth + 1, maxDepth, baseUrl)
        }
      }
    }
  }
}
示例15:
//设置package包名称以及导入依赖的类
import com.mashape.unirest.http.Unirest
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import purecsv.safe._
// NOTE(review): fragment of a larger definition whose header is not visible here;
// `pagesize` and `year` are not defined in this excerpt and must come from the
// enclosing scope.
// POST to the ranking page with the page size and year as query parameters and
// keep the response body as a string.
val results = Unirest.post("http://nturanking.lis.ntu.edu.tw/DataPage/OverallRanking.aspx")
.queryString("pagesize", pagesize)
.queryString("y", year)
.asString.getBody
// Parse the returned markup.
val jsoup = Jsoup.parse(results)
// All rows of the #MainContain_GridView1 table body except the first one
// (presumably the header row — confirm against the page).
val jsoupResults = jsoup.body.select("#MainContain_GridView1 > tbody").select("tr").toArray.tail
// Pair every row with its position; the cast restores the Element type of the array items.
val csvResults = (0 until jsoupResults.size)
.map(idx => (idx, jsoupResults(idx))).map(_.asInstanceOf[(Int, Element)])
// 1-based position plus the inner HTML of the first child of cells 1, 2 and 3.
.map(t =>
(t._1 + 1, t._2.child(1).child(0).html, t._2.child(2).child(0).html, t._2.child(3).child(0).html))
// Print one comma-separated line per row; values are not escaped.
println(csvResults.map(t => s"${t._1},${t._2},${t._3},${t._4}").mkString("\n"))
}
}