This article collects typical usage examples of the Scala class net.ruippeixotog.scalascraper.browser.JsoupBrowser. If you are wondering what the JsoupBrowser class does, or how to use it, the curated class examples below may help.
The following presents 15 code examples of the JsoupBrowser class, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Scala code examples.
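Before diving into the examples, here is a minimal, self-contained sketch of the JsoupBrowser API the snippets below rely on; the object name and URL are placeholders.
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._

object JsoupBrowserQuickStart extends App {
  val browser = JsoupBrowser()                       // Jsoup-backed browser (no JavaScript execution)
  val doc = browser.get("http://example.com")        // fetch and parse a live page
  println(doc >> text("title"))                      // extract content with the scraper DSL
  val parsed = browser.parseString("<p>hello</p>")   // documents can also be parsed from a string
  println(parsed >> text("p"))
}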
Example 1: FlatExtractor
// Set the package name and import the required classes
package org.oginskis.ss.tool
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Element
import org.jsoup.Connection
import org.oginskis.ss.model.Flat
object FlatExtractor {
val SS_LV_BASE = "ss.lv.base.url"
class ExtendedJsoupBrowser extends JsoupBrowser {
override protected[this] def defaultRequestSettings(conn: Connection): Connection = {
super.defaultRequestSettings(conn)
conn.followRedirects(false)
}
}
val browser = new ExtendedJsoupBrowser()
def extractFlats() : List[Flat] = {
def extractFlats(page: Int) : List[Flat] = {
try {
// Properties appears to be an application-specific configuration helper (its definition is not shown in this listing).
val doc = browser.get(Properties.getProperty(SS_LV_BASE) + "/riga/centre/sell/page"
+ page + ".html")
val rawList: Iterable[Element] = doc.body.select("[id^=\"tr_\"]")
rawList.init.toList.map(
entry => {
val attr: List[Element] = entry.select(".msga2-o").toList
val link: String = entry.select(".msg2 .d1 .am").head.attr("href")
new Flat(Option(attr(0).text.replace("\\", "/")),
Option(attr(1).text.trim.replace("\\", "/")),
Option(attr(2).text.trim.toInt),
Option(attr(3).text.replace("\\", "/")),
Option(attr(6).text.replace(",","").replace(" €","").trim.toInt),
Option(link))
}) ::: extractFlats(page + 1)
}
catch {
case unknown: Throwable => {
List[Flat]()
}
}
}
extractFlats(1)
}
}
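A minimal sketch of calling the extractor above; the object name is hypothetical and it assumes the ss.lv.base.url property has been configured for the Properties helper.
import org.oginskis.ss.model.Flat
import org.oginskis.ss.tool.FlatExtractor

object FlatExtractorExample extends App {
  val flats: List[Flat] = FlatExtractor.extractFlats()  // pages are crawled until a request fails
  println(s"Scraped ${flats.size} flats from ss.lv")
  flats.take(5).foreach(println)
}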
Example 2: HtmlParseTest
// Set the package name and import the required classes
package anywhere
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
object HtmlParseTest extends App {
val text =
"""
|<div class="highlight highlight-source-scala"><pre><span class="pl-k">val</span> <span class="pl-en">conflicts</span><span class="pl-k">:</span> <span class="pl-en">Set</span>[<span class="pl-en">Dependency</span>] <span class="pl-k">=</span> resolution.conflicts</pre></div>
""".stripMargin
val browser = JsoupBrowser()
val doc = browser.parseString(text)
val cont = doc >> "span"
cont.foreach(println)
}
Example 3: ScraperApp
// Set the package name and import the required classes
package com.sretsnom.mangareader.scraper
import com.typesafe.scalalogging.LazyLogging
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
object ScraperApp extends App with LazyLogging {
val browser = JsoupBrowser()
val doc = browser.get("http://observador.pt")
println()
println("=== OBSERVADOR ===")
doc >> extractor(".logo img", attr("src")) |> println
doc >> extractorAt[String]("example-extractor") |> println
println("==================")
println()
doc >> ".small-news-list h4 > a" foreach println
}
Example 4: ScrapeActor
// Set the package name and import the required classes
package com.stacktrace.yo.scrapeline.old
import akka.actor.{Actor, ActorLogging, PoisonPill}
import com.stacktrace.yo.scrapeline.old.ScrapeActor.{BeginScrape, ScrapeContent}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document
class ScrapeActor extends Actor with ActorLogging {
lazy val jsoup = JsoupBrowser()
override def receive: Receive = {
case BeginScrape(url: String) =>
val oSender = sender
log.info("Getting {}", url)
val doc = jsoup.get(url)
oSender ! ScrapeContent(doc)
log.info("Response Returned .. Closing")
self ! PoisonPill
}
}
object ScrapeActor {
case class BeginScrape(url: String)
case class ScrapeContent(document: Document)
}
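A sketch of driving the actor above from a classic Akka ActorSystem; the object name, system name and actor name are placeholders, and the URL is reused from Example 3. Note that the ScrapeContent reply goes back to the sender, so in real code the message would normally be sent from another actor or via the ask pattern.
import akka.actor.{ActorSystem, Props}
import com.stacktrace.yo.scrapeline.old.ScrapeActor
import com.stacktrace.yo.scrapeline.old.ScrapeActor.BeginScrape

object ScrapeMain extends App {
  val system = ActorSystem("scrapeline")
  val scraper = system.actorOf(Props[ScrapeActor], "scraper")
  scraper ! BeginScrape("http://observador.pt")  // the actor replies with ScrapeContent and then stops itself
}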
Example 5: Scheduler
// Set the package name and import the required classes
package net.seabears.hockey
import java.time.{LocalDate, LocalTime, ZonedDateTime}
import java.time.format.DateTimeFormatter
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
import net.ruippeixotog.scalascraper.model.Element
import net.seabears.hockey.core._
import net.seabears.hockey.util.DateUtils
object Scheduler {
def apply(adapterFactory: Game => GameAdapter, dateStart: String, dateEnd: String, host: String)(implicit userAgentFactory: () => String, pauseFactory: () => Unit) =
new Scheduler(adapterFactory, host, DateUtils.dates(dateStart, dateEnd), userAgentFactory, pauseFactory)
}
class Scheduler(adapterFactory: Game => GameAdapter, host: String, dates: Seq[LocalDate], userAgentFactory: () => String, pauseFactory: () => Unit) {
private[this] val formatter = DateTimeFormatter.ofPattern("yyyyMMdd")
private[this] val browser = new JsoupBrowser(userAgentFactory())
def run() {
dates.flatMap(getGames)
.map(adapterFactory)
.filter(_.isNew)
.foreach(_.save)
}
private[this] def getGames(date: LocalDate): List[FutureGame] = {
val dayId = date.format(formatter)
val url = host + dayId
println("Searching for games at " + url)
val doc = browser.get(url)
val tables: List[Element] = doc >> elementList("div.game-header")
tables.map(toGame(date))
}
private def toGame(date: LocalDate)(element: Element): FutureGame = {
pauseFactory()
val away = element.select("table.game-header-table tr:nth-child(1) td.team-name").head.text
val home = element.select("table.game-header-table tr:nth-child(3) td.team-name").head.text
val time = element.select("ul.game-info li:nth-child(2) span:first-child").head.text
FutureGame(Team("", home), Team("", away), parseTime(date, time))
}
private def parseTime(date: LocalDate, timeToParse: String) = {
val gameTime = """^\s*(\d+:\d+\s+\w+)\s+(\w+)\s*$""".r
val gameTime(rawTime, rawZone) = timeToParse
val time = LocalTime.parse(rawTime, DateTimeFormatter.ofPattern("h:mm a"))
ZonedDateTime.of(date, time, DateUtils.parseZone(rawZone))
}
}
Example 6: dojo_char
// Set the package name and import the required classes
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
object dojo_char extends App {
def contentByUrl(url: String) = {
JsoupBrowser().get(url) >> texts(".content") mkString
}
def flipLetter(l: String) = l match {
case "á" | "à" | "ã" => "a"
case "é" | "ê" => "e"
case "ó" | "ô" | "õ" => "o"
case "ú" | "ü" => "u"
case "í" => "i"
case "ç" => "c"
case ext: String => ext
}
def normalize(s: String) = {
(s.toLowerCase().split("") map (flipLetter(_)) mkString).replaceAll("(\\W|\\d| |_)+", " ")
}
val link = "http://diversao.terra.com.br/tv/sala-de-tv/blog/2016/07/02/gloria-maria-fica-%E2%80%98tonta%E2%80%99-ao-fumar-maconha-e-a-internet-pira/"
val siteText = contentByUrl(link)
val text = normalize(siteText)
print(text)
}
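The normalize helper above lower-cases the input, replaces the listed Portuguese diacritics and collapses any run of non-word characters into a single space. A quick illustration with a made-up input string (the object name is hypothetical):
object NormalizeExample extends App {
  println(dojo_char.normalize("Glória é ótima!"))  // prints "gloria e otima " (trailing punctuation becomes a space)
}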
Example 7: Main
// Set the package name and import the required classes
package tomblachut.unraghvel
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import tomblachut.unraghvel.origin.FormNames._
import tomblachut.unraghvel.origin.{FormNames, Selectors, Urls}
import scala.io.StdIn
object Main extends App {
val valueAttr = attr("value")
val srcAttr = attr("src")
val browser = new JsoupBrowser("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
println(s"GET ${Urls.main}")
val doc = browser.get(Urls.main)
println("OK")
val dates = doc >> text(Selectors.timeFrame)
val captchaSrc = doc >> srcAttr(Selectors.captcha)
println(dates)
println(Urls.base + captchaSrc)
val captchaResponse = StdIn.readLine().trim
val parsedFields = FormNames.specialFields.map(n => n -> (doc >?> valueAttr(s"[name=$n]")).getOrElse(""))
val request = Map(
faculty -> wimiip,
field -> is,
degree -> firstDegree,
kind -> daily,
form -> all,
semester -> "5",
subject -> all,
teacher -> all,
room -> all,
captcha -> captchaResponse
) ++ parsedFields + filterAction
request.foreach { case (key, value) =>
println(s"$key: ${value.take(100)}")
}
println(s"\nPOST ${Urls.main}")
val timetable = browser.post(Urls.main, request)
val message = timetable >?> text(Selectors.message)
val data = timetable >?> text(Selectors.agenda)
println(message)
println(data)
val content = timetable >> text(Selectors.content)
content.take(100) |> println
}
Example 8: marrickville
// Set the package name and import the required classes
package services
import models.CouncilIn
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.model.Element
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
// The enclosing object declaration is missing from the original listing; a placeholder name is used here so the snippet compiles.
object CouncilScrapers {
def marrickville = {
def oneElement(e:Element) = {
val rows = e.flatMap(_ >> elements("tr"))
val tdrows = rows.flatMap(_ >> elements("td"))
val tds = tdrows.map(_ >> text("td")).toList
CouncilIn(tds(1), tds(5), tds(7), tds(3))
}
val browser = JsoupBrowser()
val urlStr = "https://eproperty.marrickville.nsw.gov.au/eServices/P1/PublicNotices/AllPublicNotices.aspx?r=MC.P1.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
val doc = browser.get(urlStr)
// Extract the elements with name
val grid = doc >> element("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid")
val items = grid >> elements(".grid")
items.map(e => oneElement(e)).toList
}
// scrape blacktown
def blacktown = {
def oneElement(e:Element) = {
val rows = e.flatMap(_ >> elements("tr"))
val tdrows = rows.flatMap(_ >> elements("td"))
val tds = tdrows.map(_ >> text("td")).toList
CouncilIn(tds(1), tds(2), tds(3), tds(4))
}
val browser = JsoupBrowser()
val urlStr = "http://www.blacktown.nsw.gov.au/Planning_and_Development/Development_Assessment/Development_Online/Developments_on_Notification"
val doc = browser.get(urlStr)
// Extract the elements with name
val main = doc >> element("#main-content")
val body = main >> element(".body-content")
val items = body >> elements("table")
items.map(e => oneElement(e)).toList
}
}
Example 9: DropTableParser
// Set the package name and import the required classes
package com.harry0000.kancolle.ac
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import wvlet.log.{LogFormatter, LogSupport, Logger}
object DropTableParser extends LogSupport {
Logger.scheduleLogLevelScan
Logger.setDefaultFormatter(LogFormatter.AppLogFormatter)
def main(args: Array[String]): Unit = {
implicit val browser = JsoupBrowser()
(for {
card <- DropListByCardScraper.scrape().right
area <- DropListByAreaScraper.scrape().right
} yield {
println(Printer.prettyPrint(diff(card, area)))
}).left
.foreach(error(_))
}
def diff(ds1: Seq[ShipDrops], ds2: Seq[ShipDrops]): Seq[ShipDrops] = {
val m1 = ds1.flatMap(ShipDrops.unapply).toMap
val m2 = ds2.flatMap(ShipDrops.unapply).toMap
val areas = (m1.keys.toSet ++ m2.keys).toSeq.sorted
areas.foldLeft(Seq.empty[ShipDrops]) { case (z, area) =>
(m1.get(area), m2.get(area)) match {
case (Some(ships1), None ) => z :+ ShipDrops(area, ships1)
case (None, Some(ships2)) => z :+ ShipDrops(area, ships2)
case (Some(ships1), Some(ships2)) =>
ShipCategory.values.flatMap { c =>
(ships1.get(c), ships2.get(c)) match {
case (Some(names1), None ) => Some(c -> names1)
case (None, Some(names2)) => Some(c -> names2)
case (Some(names1), Some(names2)) => Some(names1 diff names2).filter(_.nonEmpty).map(c -> _)
case _ => None
}
} match {
case Nil => z
case diff => z :+ ShipDrops(area, ShipMap(diff: _*))
}
case _ => z
}
}
}
}
Example 10: ExtractCandidates
// Set the package name and import the required classes
package edu.emory.mathcs.ir.liveqa.tools
import java.io.{PrintWriter, File}
import edu.emory.mathcs.ir.liveqa.verticals.web.ContentExtractor
import edu.emory.mathcs.ir.liveqa.verticals.yahooanswers.YahooAnswersQuestion
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
object ExtractCandidates extends App {
val browser = new JsoupBrowser
scala.io.Source.fromFile(args(0) + "urls.txt").getLines().zipWithIndex.foreach { case (url, index) =>
try {
val content = scala.io.Source.fromFile(args(0) + (index + 5) + ".txt").mkString
if (index % 100 == 0) println("Processed " + index + " documents")
url match {
case u: String if u.contains("answers.yahoo.com") =>
val question = YahooAnswersQuestion.parse(content)
val out = new PrintWriter(new File(args(0) + (index + 5) + "_content.txt"))
out.println(question.qid)
out.println(question.categories.mkString("\t"))
out.println(question.title.replace("\n", " "))
out.println(question.body.replace("\n", " "))
out.close()
case _ =>
val out = new PrintWriter(new File(args(0) + (index + 5) + "_content.txt"))
out.println(browser.parseString(content).title.replace("\n", " "))
ContentExtractor.apply(content).foreach {
block => out.println(block.replace("\n", " "))
}
out.close()
}
} catch {
case exc: Exception =>
System.err.println(exc.getMessage)
}
}
}
Example 11: VirusTotalScanner
// Set the package name and import the required classes
package com.gilazaria.subsearch.discovery
import dispatch.{Http, HttpExecutor, Req, url}
import net.ruippeixotog.scalascraper.browser.{Browser, JsoupBrowser}
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import scala.concurrent.{ExecutionContext, Future}
class VirusTotalScanner private[discovery] (private val browser: Browser = JsoupBrowser(),
private val http: HttpExecutor = Http)
(implicit ec: ExecutionContext)
extends Scanner {
override val name: String = "Virus Total Scanner"
override def scan(hostname: String): Future[Set[String]] = {
retrieveHTML(hostname)
.map(html => extractSubdomains(html, hostname))
}
private[discovery] def extractSubdomains(html: String, hostname: String): Set[String] =
(browser.parseString(html).body >> elementList("a"))
.map(e => e.innerHtml)
.filter(subdomain => subdomain.endsWith(hostname) && subdomain != hostname)
.toSet
private[discovery] def retrieveHTML(hostname: String): Future[String] = {
val request: Req =
url(s"https://www.virustotal.com/en-gb/domain/$hostname/information/")
.GET
.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0")
.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
.setHeader("Accept-Language", "en-US,en;q=0.5")
http(request).map(_.getResponseBody)
}
}
object VirusTotalScanner {
def conditionallyCreate(create: Boolean)(implicit ec: ExecutionContext): Option[VirusTotalScanner] =
if (create) Some(VirusTotalScanner.create())
else None
def create()(implicit ec: ExecutionContext): VirusTotalScanner =
new VirusTotalScanner()
}
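A sketch of running the scanner above; the scan result is a Future, which is awaited here only to keep the example short, and the object name and hostname are placeholders.
import scala.concurrent.{Await, ExecutionContext}
import scala.concurrent.duration._
import com.gilazaria.subsearch.discovery.VirusTotalScanner

object VirusTotalExample extends App {
  implicit val ec: ExecutionContext = ExecutionContext.global
  val scanner = VirusTotalScanner.create()
  val subdomains = Await.result(scanner.scan("example.com"), 30.seconds)
  subdomains.foreach(println)
}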
Example 12: getDocumentFromUrl
// Set the package name and import the required classes
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document
trait CustomBrowser extends JsoupBrowser {
val browser = JsoupBrowser()
def getDocumentFromUrl( url: String ): Either[ErrorMessage, Document] = {
try {
Right(browser.get(url))
} catch {
// https://jsoup.org/apidocs/ => org.jsoup
case e: org.jsoup.HttpStatusException =>
Left(ErrorMessage(e.getStatusCode, e.getMessage, e.getUrl))
case e: org.jsoup.SerializationException =>
//Refer: http://docs.oracle.com/cloud/latest/marketingcs_gs/OMCAB/Developers/GettingStarted/API%20requests/http-status-codes.htm
Left(ErrorMessage(400, e.getMessage, url)) //A SerializationException is raised whenever serialization of a DOM element fails.
case e: org.jsoup.UnsupportedMimeTypeException =>
//Refer: http://stackoverflow.com/questions/11973813/http-status-code-for-unaccepted-content-type-in-request
Left(ErrorMessage(415, e.getMessage, e.getUrl)) //Signals that a HTTP response returned a mime type that is not supported.
}
}
}
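The ErrorMessage type is not part of the listing above; the sketch below assumes a minimal stand-in with the status/message/url fields the trait uses, and the object name and URL are placeholders.
// Assumed shape of ErrorMessage, inferred from how the trait constructs it.
case class ErrorMessage(statusCode: Int, message: String, url: String)

object FetchExample extends CustomBrowser {
  def main(args: Array[String]): Unit =
    getDocumentFromUrl("https://example.com") match {
      case Right(doc) => println(doc.title)
      case Left(err)  => println(s"${err.statusCode}: ${err.message} (${err.url})")
    }
}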
Example 13: visitUrl
// Set the package name and import the required classes
package com.morenware.tvcrawler.crawling
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document
trait CrawlerUtils {
val browser = JsoupBrowser()
def visitUrl(url: String): Document = {
retry(5) {
val doc = browser.get(url)
doc
}
}
def getInitialSectionLink(baseUrl: String, baseSectionLink: String, extraParameters: String): String = {
baseUrl + baseSectionLink + extraParameters
}
def getHtmlFromPage(url: String) = {
visitUrl(url).toHtml
}
def retry[T](n: Int)(op: => T): T = {
try {
op
} catch {
// Catch any exception and retry the operation
case e if n > 1 =>
retry(n - 1)(op)
}
}
}
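Mixing the trait into an object gives retrying page fetches for free; a minimal sketch, with a hypothetical object name and a placeholder URL.
import com.morenware.tvcrawler.crawling.CrawlerUtils

object CrawlerExample extends App with CrawlerUtils {
  val doc = visitUrl("https://example.com")  // retried up to 5 times before the last failure propagates
  println(doc.title)
  println(getHtmlFromPage("https://example.com").take(200))
}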
Example 14: SearchModule
// Set the package name and import the required classes
package modules
import net.ruippeixotog.scalascraper.browser.{Browser, HtmlUnitBrowser, JsoupBrowser}
import play.api.inject._
import play.api.{Configuration, Environment}
import com.google.common.util.concurrent.RateLimiter
class SearchModule extends Module {
override def bindings(environment: Environment, configuration: Configuration): Seq[Binding[_]] = {
val configBlock = configuration.getConfig("search").getOrElse {
throw new RuntimeException("No search block in Play config!")
}
val rateLimiter = RateLimiter.create(
configBlock
.getDouble("rateLimit")
.getOrElse {
throw new RuntimeException("No rateLimit in Play search config block!")
}
)
val browser = configBlock
.getString("browser", Some(Set("jsoup", "htmlunit")))
.getOrElse {
throw new RuntimeException("No browser in Play search config block!")
} match {
case "jsoup" => JsoupBrowser()
case "htmlunit" => HtmlUnitBrowser()
}
Seq(
bind[RateLimiter].toInstance(rateLimiter),
bind[Browser].toInstance(browser)
)
}
}
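The module expects a search block in the Play configuration with a numeric rateLimit (permits per second for Guava's RateLimiter) and a browser of either "jsoup" or "htmlunit". A sketch of an acceptable configuration, built in Scala for illustration; the object name and values are placeholders.
import com.typesafe.config.ConfigFactory
import play.api.Configuration

object SearchConfigExample {
  val searchConfig: Configuration = Configuration(ConfigFactory.parseString(
    """
      |search {
      |  rateLimit = 5.0
      |  browser = "jsoup"
      |}
    """.stripMargin))
}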
Example 15: RoleSBOLExtractor
// Set the package name and import the required classes
package comp.bio.aging.benchling
import cats.Foldable
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
import cats.implicits._
import java.net._
object RoleSBOLExtractor{
def apply(urls: Iterable[String]) ={
new RoleSBOLExtractor(urls.map(u=>new URI(u)))
}
}
class RoleSBOLExtractor(urls: Iterable[URI]) {
val browser = JsoupBrowser()
def extract(url: String): Option[String] = {
val doc = browser.get(url)
doc >?> text("[property=name]")
}
lazy val titleMap: Map[URI, String] = urls.map(u=> u-> extract(u.toString).getOrElse(u.getPath)).toMap
}
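A sketch of using the extractor above; the object name is hypothetical and the URLs are placeholders standing in for pages that expose a [property=name] element.
import comp.bio.aging.benchling.RoleSBOLExtractor

object RoleSBOLExample extends App {
  val extractor = RoleSBOLExtractor(Seq(
    "https://example.org/terms/0000167",
    "https://example.org/terms/0000316"
  ))
  // titleMap falls back to the URI path when no name property can be extracted
  extractor.titleMap.foreach { case (uri, title) => println(s"$uri -> $title") }
}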