当前位置: 首页>>代码示例>>Scala>>正文


Scala JsoupBrowser类代码示例

本文整理汇总了Scala中net.ruippeixotog.scalascraper.browser.JsoupBrowser的典型用法代码示例。如果您正苦于以下问题：Scala JsoupBrowser类的具体用法？Scala JsoupBrowser怎么用？Scala JsoupBrowser使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。


在下文中一共展示了JsoupBrowser类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Scala代码示例。

示例1: FlatExtractor

//设置package包名称以及导入依赖的类
package org.oginskis.ss.tool

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Element
import org.jsoup.Connection
import org.oginskis.ss.model.Flat


/**
  * Scrapes flat listings page by page from the site whose base URL is stored
  * under the `ss.lv.base.url` property.
  */
object FlatExtractor {

  /** Property key under which the base URL of the listings site is looked up. */
  val SS_LV_BASE ="ss.lv.base.url"

  /** A [[JsoupBrowser]] whose connections are configured not to follow redirects. */
  class ExtendedJsoupBrowser extends JsoupBrowser {
    override protected[this] def defaultRequestSettings(conn: Connection): Connection = {
      super.defaultRequestSettings(conn)
      conn.followRedirects(false)
    }
  }

  val browser = new ExtendedJsoupBrowser()

  /**
    * Collects the flats of page 1, 2, 3, ... until a page cannot be fetched or parsed.
    *
    * @return the flats of every successfully parsed page, in page order
    */
  def extractFlats() : List[Flat] = {
    def extractFlats(page: Int) : List[Flat] = {
      try {
        val doc = browser.get(Properties.getProperty(SS_LV_BASE)+"/riga/centre/sell/page"
            + page + ".html")
        val rawList: Iterable[Element] = doc.body.select("[id^=\"tr_\"]")
        // `init` drops the last matched row; it throws when nothing matched,
        // which (like any other failure below) ends the pagination.
        rawList.init.toList.map(
          entry => {
            val attr: List[Element] = entry.select(".msga2-o").toList
            val link: String = entry.select(".msg2 .d1 .am").head.attr("href")
            new Flat(Option(attr(0).text.replace("\\", "/")),
              Option(attr(1).text.trim.replace("\\", "/")),
              Option(attr(2).text.trim.toInt),
              Option(attr(3).text.replace("\\", "/")),
              Option(attr(6).text.replace(",","").replace(" €","").trim.toInt),
              Option(link))
          }) ::: extractFlats(page + 1)
      }
      catch {
        // Only non-fatal failures stop the pagination; fatal JVM errors
        // (OutOfMemoryError, InterruptedException, ...) are no longer swallowed.
        case scala.util.control.NonFatal(_) =>
          List[Flat]()
      }
    }
    extractFlats(1)
  }



} 
开发者ID:oginskis,项目名称:vo-ss-extractor,代码行数:52,代码来源:FlatExtractor.scala

示例2: HtmlParseTest

//设置package包名称以及导入依赖的类
package anywhere

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._


/** Parses a fixed HTML fragment and prints the result of the `"span"` query. */
object HtmlParseTest extends App {
  // HTML fragment fed to the parser.
  val text =
    """
      |<div class="highlight highlight-source-scala"><pre><span class="pl-k">val</span> <span class="pl-en">conflicts</span><span class="pl-k">:</span> <span class="pl-en">Set</span>[<span class="pl-en">Dependency</span>] <span class="pl-k">=</span> resolution.conflicts</pre></div>
    """.stripMargin

  val browser = JsoupBrowser()
  val doc = browser.parseString(text)
  val cont = doc >> "span"

  // Print every item of the query result on its own line.
  for (item <- cont) println(item)
} 
开发者ID:cuzfrog,项目名称:WebDriverServ,代码行数:20,代码来源:HtmlParseTest.scala

示例3: ScraperApp

//设置package包名称以及导入依赖的类
package com.sretsnom.mangareader.scraper

import com.typesafe.scalalogging.LazyLogging
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._

/** Fetches http://observador.pt and prints a few extracted values. */
object ScraperApp extends App with LazyLogging {
    val browser = JsoupBrowser()
    val doc = browser.get("http://observador.pt")

    println()
    println("=== OBSERVADOR ===")

    // Extract the `src` attribute selected by ".logo img", then print it.
    (doc >> extractor(".logo img", attr("src"))) |> println
    // Same, using the extractor registered under the name "example-extractor".
    (doc >> extractorAt[String]("example-extractor")) |> println

    println("==================")
    println()

    // Print each item of the ".small-news-list h4 > a" query result.
    (doc >> ".small-news-list h4 > a").foreach(println)
} 
开发者ID:sretsnom,项目名称:mangareader,代码行数:23,代码来源:ScraperApp.scala

示例4: ScrapeActor

//设置package包名称以及导入依赖的类
package com.stacktrace.yo.scrapeline.old

import akka.actor.{Actor, ActorLogging, PoisonPill}
import com.stacktrace.yo.scrapeline.old.ScrapeActor.{BeginScrape, ScrapeContent}
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document

/**
  * Actor that fetches one URL on request, replies with the downloaded
  * document and then stops itself.
  */
class ScrapeActor extends Actor with ActorLogging {

  lazy val jsoup = JsoupBrowser()

  /**
    * Handles [[ScrapeActor.BeginScrape]]: downloads the URL, replies to the
    * requester with [[ScrapeActor.ScrapeContent]] and sends itself a `PoisonPill`.
    */
  override def receive: Receive = {
    // The original pattern was mangled by e-mail obfuscation ("[email protected](...)");
    // the bound message name was unused, so only the extractor is kept.
    case BeginScrape(url) =>
      // Capture the requester before doing any work so the reply uses a stable reference.
      val oSender = sender()
      log.info("Getting {}", url)
      val doc = jsoup.get(url)
      oSender ! ScrapeContent(doc)
      log.info("Response Returned .. Closing")
      self ! PoisonPill
  }
}

/** Messages understood and emitted by the [[ScrapeActor]] class. */
object ScrapeActor {
  /** Request to download the page at `url`. */
  case class BeginScrape(url: String)

  /** Reply carrying the downloaded `document`. */
  case class ScrapeContent(document: Document)
} 
开发者ID:StackTraceYo,项目名称:scrapeline,代码行数:30,代码来源:ScrapeActor.scala

示例5: Scheduler

//设置package包名称以及导入依赖的类
package net.seabears.hockey

import java.time.{LocalDate, LocalTime, ZonedDateTime}
import java.time.format.DateTimeFormatter

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
import net.ruippeixotog.scalascraper.model.Element

import net.seabears.hockey.core._
import net.seabears.hockey.util.DateUtils

/** Factory for the [[Scheduler]] class. */
object Scheduler {
  /**
    * Builds a scheduler for every date that `DateUtils.dates` yields for
    * `dateStart` and `dateEnd`, taking the user-agent and pause factories
    * from the implicit scope.
    */
  def apply(adapterFactory: Game => GameAdapter, dateStart: String, dateEnd: String, host: String)(implicit userAgentFactory: () => String, pauseFactory: () => Unit) = {
    val scheduledDates = DateUtils.dates(dateStart, dateEnd)
    new Scheduler(adapterFactory, host, scheduledDates, userAgentFactory, pauseFactory)
  }
}

/**
  * Looks up the games listed for each date and saves the ones that are new.
  *
  * @param adapterFactory   wraps a game in the adapter used to test `isNew` and to `save`
  * @param host             URL prefix; the date formatted as `yyyyMMdd` is appended to it
  * @param dates            dates to search
  * @param userAgentFactory called once, at construction, to obtain the browser's user agent
  * @param pauseFactory     called before each game element is parsed
  */
class Scheduler(adapterFactory: Game => GameAdapter, host: String, dates: Seq[LocalDate], userAgentFactory: () => String, pauseFactory: () => Unit) {
  private[this] val formatter = DateTimeFormatter.ofPattern("yyyyMMdd")
  // The AM/PM marker is locale-sensitive; pin the locale so parsing does not
  // depend on the JVM's default locale.
  private[this] val timeFormatter = DateTimeFormatter.ofPattern("h:mm a", java.util.Locale.ENGLISH)
  // Captures "<h:mm> <am/pm marker>" and the trailing zone token.
  private[this] val gameTime = """^\s*(\d+:\d+\s+\w+)\s+(\w+)\s*$""".r
  private[this] val browser = new JsoupBrowser(userAgentFactory())

  /** Fetches the games of every configured date and saves those reported as new. */
  def run(): Unit = {
    dates.flatMap(getGames)
         .map(adapterFactory)
         .filter(_.isNew)
         .foreach(_.save)
  }

  /** Downloads the page for `date` and converts each `div.game-header` element into a game. */
  private[this] def getGames(date: LocalDate): List[FutureGame] = {
    val dayId = date.format(formatter)
    val url = host + dayId
    println("Searching for games at " + url)
    val doc = browser.get(url)
    val tables: List[Element] = doc >> elementList("div.game-header")
    tables.map(toGame(date))
  }

  /** Reads the team names and start time from one game element (pausing first). */
  private def toGame(date: LocalDate)(element: Element): FutureGame = {
    pauseFactory()
    val away = element.select("table.game-header-table tr:nth-child(1) td.team-name").head.text
    val home = element.select("table.game-header-table tr:nth-child(3) td.team-name").head.text
    val time = element.select("ul.game-info li:nth-child(2) span:first-child").head.text
    FutureGame(Team("", home), Team("", away), parseTime(date, time))
  }

  /**
    * Combines `date` with a textual time such as "7:00 PM ET".
    *
    * Throws `scala.MatchError` when `timeToParse` does not have the expected shape.
    */
  private def parseTime(date: LocalDate, timeToParse: String): ZonedDateTime = {
    val gameTime(rawTime, rawZone) = timeToParse
    val time = LocalTime.parse(rawTime, timeFormatter)
    ZonedDateTime.of(date, time, DateUtils.parseZone(rawZone))
  }
} 
开发者ID:cberes,项目名称:hockey-stats-loader,代码行数:55,代码来源:scheduler.scala

示例6: dojo_char

//设置package包名称以及导入依赖的类
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._

/** Downloads a page, strips accents and punctuation from its text and prints the result. */
object dojo_char extends App {
  /** Returns the concatenated text selected by `.content` on the page at `url`. */
  def contentByUrl(url: String) = {
    // Dotted call instead of postfix `mkString`, which needs the postfixOps language feature.
    (JsoupBrowser().get(url) >> texts(".content")).mkString
  }

  /** Maps a single accented lower-case letter to its plain counterpart; other input is returned as is. */
  def flipLetter(l: String) =  l match {
    case "á" | "à" | "ã" => "a"
    case "é" | "ê" => "e"
    case "ó" | "ô" | "õ" => "o"
    case "ú" | "ü" => "u"
    case "í" => "i"
    case "ç" => "c"
    case ext: String => ext
  }

  /**
    * Lower-cases `s`, replaces the accented letters known to [[flipLetter]] and
    * collapses every run of non-word characters, digits, spaces and underscores
    * into a single space.
    */
  def normalize(s: String) = {
    s.toLowerCase().split("").map(flipLetter(_)).mkString.replaceAll("(\\W|\\d| |_)+", " ")
  }

  val link = "http://diversao.terra.com.br/tv/sala-de-tv/blog/2016/07/02/gloria-maria-fica-%E2%80%98tonta%E2%80%99-ao-fumar-maconha-e-a-internet-pira/"
  val siteText = contentByUrl(link)
  val text = normalize(siteText)

  print(text)
} 
开发者ID:0unit,项目名称:seo-dojo,代码行数:30,代码来源:dojo_seo.scala

示例7: Main

//设置package包名称以及导入依赖的类
package tomblachut.unraghvel

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import tomblachut.unraghvel.origin.FormNames._
import tomblachut.unraghvel.origin.{FormNames, Selectors, Urls}

import scala.io.StdIn


/**
  * Interactive script: loads the main page, asks the user for the captcha
  * answer on standard input, posts the filter form and prints parts of the response.
  */
object Main extends App {

  // Attribute extractors reused below.
  val valueAttr = attr("value")
  val srcAttr = attr("src")

  val browser = new JsoupBrowser("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")

  // Step 1: download the page that holds the form.
  println(s"GET ${Urls.main}")
  val doc = browser.get(Urls.main)
  println("OK")

  val dates = doc >> text(Selectors.timeFrame)
  val captchaSrc = doc >> srcAttr(Selectors.captcha)

  // Step 2: show the time frame and captcha address, then read the user's answer.
  println(dates)
  println(Urls.base + captchaSrc)
  val captchaResponse = StdIn.readLine().trim

  // Current value of every special field, or "" when the field has none.
  val parsedFields =
    for (fieldName <- FormNames.specialFields)
      yield fieldName -> (doc >?> valueAttr(s"[name=$fieldName]")).getOrElse("")

  // Step 3: assemble the form data.
  val request = Map(
    faculty -> wimiip,
    field -> is,
    degree -> firstDegree,
    kind -> daily,
    form -> all,
    semester -> "5",
    subject -> all,
    teacher -> all,
    room -> all,
    captcha -> captchaResponse
  ) ++ parsedFields + filterAction

  // Echo the form data, truncating each value to 100 characters.
  for ((key, value) <- request) {
    println(s"$key: ${value.take(100)}")
  }

  // Step 4: submit the form and print what came back.
  println(s"\nPOST ${Urls.main}")
  val timetable = browser.post(Urls.main, request)

  val message = timetable >?> text(Selectors.message)
  val data = timetable >?> text(Selectors.agenda)

  println(message)
  println(data)

  val content = timetable >> text(Selectors.content)
  println(content.take(100))

} 
开发者ID:tomblachut,项目名称:unraghvel,代码行数:62,代码来源:Main.scala

示例8: marrickville

//设置package包名称以及导入依赖的类
package services

import models.CouncilIn
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.model.Element
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._


  /** Downloads the Marrickville public-notices page and builds one `CouncilIn` per `.grid` element. */
  def marrickville = {
    // Collects the text of every td of every tr and picks the cells at indices 1, 5, 7 and 3.
    def oneElement(e:Element) = {
      val cellTexts = e
        .flatMap(_ >> elements("tr"))
        .flatMap(_ >> elements("td"))
        .map(_ >> text("td"))
        .toList
      CouncilIn(cellTexts(1), cellTexts(5), cellTexts(7), cellTexts(3))
    }

    val urlStr = "https://eproperty.marrickville.nsw.gov.au/eServices/P1/PublicNotices/AllPublicNotices.aspx?r=MC.P1.WEBGUEST&f=%24P1.ESB.PUBNOTAL.ENQ"
    val doc = JsoupBrowser().get(urlStr)

    // Narrow down to the results panel, then to each grid inside it.
    val resultsPanel = doc >> element("#ctl00_Content_cusApplicationResultsGrid_pnlCustomisationGrid")
    val grids = resultsPanel >> elements(".grid")
    grids.map(grid => oneElement(grid)).toList

  }

  // scrape blacktown
  /** Downloads the Blacktown notification page and builds one `CouncilIn` per table in the body content. */
  def blacktown = {
    // Collects the text of every td of every tr and picks the cells at indices 1 to 4.
    def oneElement(e:Element) = {
      val cellTexts = e
        .flatMap(_ >> elements("tr"))
        .flatMap(_ >> elements("td"))
        .map(_ >> text("td"))
        .toList
      CouncilIn(cellTexts(1), cellTexts(2), cellTexts(3), cellTexts(4))
    }

    val urlStr = "http://www.blacktown.nsw.gov.au/Planning_and_Development/Development_Assessment/Development_Online/Developments_on_Notification"
    val doc = JsoupBrowser().get(urlStr)

    // Narrow down: main content, then body content, then each table.
    val mainContent = doc >> element("#main-content")
    val bodyContent = mainContent >> element(".body-content")
    val tables = bodyContent >> elements("table")
    tables.map(table => oneElement(table)).toList

  }

} 
开发者ID:olambo,项目名称:flexscraper,代码行数:51,代码来源:Scrape.scala

示例9: DropTableParser

//设置package包名称以及导入依赖的类
package com.harry0000.kancolle.ac

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import wvlet.log.{LogFormatter, LogSupport, Logger}

/** Scrapes two drop lists (by card and by area) and prints how they differ. */
object DropTableParser extends LogSupport {
  Logger.scheduleLogLevelScan
  Logger.setDefaultFormatter(LogFormatter.AppLogFormatter)

  /** Runs both scrapers; prints the diff when both succeed, otherwise logs the failure. */
  def main(args: Array[String]): Unit = {
    implicit val browser = JsoupBrowser()

    val outcome = for {
      card <- DropListByCardScraper.scrape().right
      area <- DropListByAreaScraper.scrape().right
    } yield println(Printer.prettyPrint(diff(card, area)))

    outcome.left.foreach(error(_))
  }

  /**
    * Compares two drop lists area by area, in sorted area order.
    *
    * An area present on one side only is kept whole. For an area present on
    * both sides, a category present on one side only is kept whole, and a
    * category present on both keeps the names of `ds1` that are missing from
    * `ds2`; categories and areas with nothing left are omitted.
    */
  def diff(ds1: Seq[ShipDrops], ds2: Seq[ShipDrops]): Seq[ShipDrops] = {
    val byArea1 = ds1.flatMap(ShipDrops.unapply).toMap
    val byArea2 = ds2.flatMap(ShipDrops.unapply).toMap
    val areas = (byArea1.keys.toSet ++ byArea2.keys).toSeq.sorted

    areas.foldLeft(Seq.empty[ShipDrops]) { (acc, area) =>
      (byArea1.get(area), byArea2.get(area)) match {
        case (Some(ships1), Some(ships2)) =>
          val changed = ShipCategory.values.flatMap { c =>
            (ships1.get(c), ships2.get(c)) match {
              case (Some(names1), Some(names2)) =>
                val remaining = names1 diff names2
                if (remaining.nonEmpty) Some(c -> remaining) else None
              case (Some(names1), None) => Some(c -> names1)
              case (None, Some(names2)) => Some(c -> names2)
              case _ => None
            }
          }
          if (changed.isEmpty) acc
          else acc :+ ShipDrops(area, ShipMap(changed: _*))
        case (Some(ships1), None) => acc :+ ShipDrops(area, ships1)
        case (None, Some(ships2)) => acc :+ ShipDrops(area, ships2)
        case _ => acc
      }
    }
  }

} 
开发者ID:harry0000,项目名称:DropTableParser,代码行数:49,代码来源:DropTableParser.scala

示例10: ExtractCandidates

//设置package包名称以及导入依赖的类
package edu.emory.mathcs.ir.liveqa.tools

import java.io.{PrintWriter, File}

import edu.emory.mathcs.ir.liveqa.verticals.web.ContentExtractor
import edu.emory.mathcs.ir.liveqa.verticals.yahooanswers.YahooAnswersQuestion
import net.ruippeixotog.scalascraper.browser.JsoupBrowser


/**
  * For every line of `<args(0)>urls.txt`, reads the document stored in
  * `<args(0)><index + 5>.txt` and writes its extracted content to
  * `<args(0)><index + 5>_content.txt`.
  */
object ExtractCandidates extends App {
  val browser = new JsoupBrowser

  /** Reads a whole file, always closing the underlying source. */
  private def readFile(path: String): String = {
    val source = scala.io.Source.fromFile(path)
    try source.mkString finally source.close()
  }

  /** Opens `path` for writing, runs `write` and always closes the writer. */
  private def writeTo(path: String)(write: PrintWriter => Unit): Unit = {
    val out = new PrintWriter(new File(path))
    try write(out) finally out.close()
  }

  private val urlSource = scala.io.Source.fromFile(args(0) + "urls.txt")
  try {
    urlSource.getLines().zipWithIndex.foreach { case (url, index) =>
      try {
        val content = readFile(args(0) + (index + 5) + ".txt")
        if (index % 100 == 0) println("Processed " + index + " documents")
        val outPath = args(0) + (index + 5) + "_content.txt"
        if (url.contains("answers.yahoo.com")) {
          // Parse before opening the output so a parse failure creates no file.
          val question = YahooAnswersQuestion.parse(content)
          writeTo(outPath) { out =>
            out.println(question.qid)
            out.println(question.categories.mkString("\t"))
            out.println(question.title.replace("\n", " "))
            out.println(question.body.replace("\n", " "))
          }
        } else {
          writeTo(outPath) { out =>
            out.println(browser.parseString(content).title.replace("\n", " "))
            ContentExtractor.apply(content).foreach {
              block => out.println(block.replace("\n", " "))
            }
          }
        }
      } catch {
        // Best effort: report the failure and continue with the next document.
        case exc: Exception =>
          System.err.println(exc.getMessage)
      }
    }
  } finally {
    urlSource.close()
  }
} 
开发者ID:emory-irlab,项目名称:liveqa,代码行数:40,代码来源:ExtractCandidates.scala

示例11: VirusTotalScanner

//设置package包名称以及导入依赖的类
package com.gilazaria.subsearch.discovery

import dispatch.{Http, HttpExecutor, Req, url}
import net.ruippeixotog.scalascraper.browser.{Browser, JsoupBrowser}
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._

import scala.concurrent.{ExecutionContext, Future}

/**
  * Scanner that collects subdomains of a host from its VirusTotal domain
  * information page.
  *
  * @param browser parses the downloaded HTML
  * @param http    executes the HTTP request
  */
class VirusTotalScanner private[discovery] (private val browser: Browser = JsoupBrowser(),
                                            private val http: HttpExecutor = Http)
                                           (implicit ec: ExecutionContext)
extends Scanner {
  override val name: String = "Virus Total Scanner"

  /** Downloads the information page for `hostname` and extracts its subdomains. */
  override def scan(hostname: String): Future[Set[String]] = {
    retrieveHTML(hostname)
      .map(html => extractSubdomains(html, hostname))
  }

  /**
    * Returns the inner HTML of every `a` element that names a strict
    * subdomain of `hostname`.
    *
    * The match requires a `.` before `hostname`, so unrelated domains that
    * merely end with the same characters (`notexample.com` for `example.com`)
    * and `hostname` itself are both excluded.
    */
  private[discovery] def extractSubdomains(html: String, hostname: String): Set[String] =
    (browser.parseString(html).body >> elementList("a"))
      .map(e => e.innerHtml)
      .filter(subdomain => subdomain.endsWith(s".$hostname"))
      .toSet

  /** Requests the VirusTotal information page for `hostname` and yields the response body. */
  private[discovery] def retrieveHTML(hostname: String): Future[String] = {
    val request: Req =
      url(s"https://www.virustotal.com/en-gb/domain/$hostname/information/")
        .GET
        .setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0")
        .setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
        .setHeader("Accept-Language", "en-US,en;q=0.5")

    http(request).map(_.getResponseBody)
  }
}

/** Factory methods for the [[VirusTotalScanner]] class. */
object VirusTotalScanner {
  /** Returns a new scanner wrapped in `Some` when `create` is true, `None` otherwise. */
  def conditionallyCreate(create: Boolean)(implicit ec: ExecutionContext): Option[VirusTotalScanner] =
    create match {
      case true  => Some(VirusTotalScanner.create())
      case false => None
    }

  /** Builds a scanner with the default browser and HTTP executor. */
  def create()(implicit ec: ExecutionContext): VirusTotalScanner = {
    val scanner = new VirusTotalScanner()
    scanner
  }
} 
开发者ID:gavia,项目名称:subsearch,代码行数:47,代码来源:VirusTotalScanner.scala

示例12: getDocumentFromUrl

//设置package包名称以及导入依赖的类
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document


/** Browser mix-in that reports selected jsoup failures as `ErrorMessage` values instead of exceptions. */
trait CustomBrowser extends JsoupBrowser {

  val browser = JsoupBrowser()

  /**
    * Fetches `url`.
    *
    * @return `Right` with the document, or `Left` with an error message for the
    *         three jsoup exceptions handled below; any other exception propagates
    */
  def getDocumentFromUrl( url: String ): Either[ErrorMessage, Document] =
    try {
      Right(browser.get(url))
    } catch {
      // Exception types are documented at https://jsoup.org/apidocs/ (package org.jsoup).
      // The response had a mime type that is not supported -> 415.
      // See http://stackoverflow.com/questions/11973813/http-status-code-for-unaccepted-content-type-in-request
      case mimeError: org.jsoup.UnsupportedMimeTypeException =>
        Left(ErrorMessage(415, mimeError.getMessage, mimeError.getUrl))
      // Serialization of a DOM element failed -> reported as 400.
      // See http://docs.oracle.com/cloud/latest/marketingcs_gs/OMCAB/Developers/GettingStarted/API%20requests/http-status-codes.htm
      case serializationError: org.jsoup.SerializationException =>
        Left(ErrorMessage(400, serializationError.getMessage, url))
      // The exception carries its own status code, which is passed through.
      case statusError: org.jsoup.HttpStatusException =>
        Left(ErrorMessage(statusError.getStatusCode, statusError.getMessage, statusError.getUrl))
    }
} 
开发者ID:jungbin-kim,项目名称:web-scraping-with-scala,代码行数:25,代码来源:CustomBrowser.scala

示例13: visitUrl

//设置package包名称以及导入依赖的类
package com.morenware.tvcrawler.crawling

import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.model.Document


/** Helpers for downloading pages with automatic retries. */
trait CrawlerUtils {
  val browser = JsoupBrowser()

  /** Downloads `url`, making up to 5 attempts. */
  def visitUrl(url: String): Document =
    retry(5)(browser.get(url))

  /** Concatenates the three parts into the link of the initial section. */
  def getInitialSectionLink(baseUrl: String, baseSectionLink: String, extraParameters: String): String = {
    baseUrl + baseSectionLink + extraParameters
  }

  /** Downloads `url` (with retries) and returns the page's HTML. */
  def getHtmlFromPage(url: String) = {
    visitUrl(url).toHtml
  }

  /**
    * Evaluates `op`, re-evaluating it after a failure until it succeeds or
    * `n` attempts have been made; the last failure is rethrown.
    *
    * Only non-fatal exceptions trigger a retry: fatal JVM errors
    * (OutOfMemoryError, InterruptedException, ...) propagate immediately.
    *
    * @param n  maximum number of attempts; values below 2 mean a single attempt
    * @param op operation to run
    */
  def retry[T](n: Int)(op: => T): T = {
    try {
      op
    } catch {
      case scala.util.control.NonFatal(_) if n > 1 =>
        retry(n - 1)(op)
    }
  }
} 
开发者ID:dfernandezm,项目名称:tv-crawler-scala,代码行数:35,代码来源:CrawlerUtils.scala

示例14: SearchModule

//设置package包名称以及导入依赖的类
package modules

import net.ruippeixotog.scalascraper.browser.{Browser, HtmlUnitBrowser, JsoupBrowser}
import play.api.inject._
import play.api.{Configuration, Environment}

import com.google.common.util.concurrent.RateLimiter


/** Play module that binds a `RateLimiter` and a `Browser` built from the `search` configuration block. */
class SearchModule extends Module {
  /**
    * Reads `search.rateLimit` and `search.browser` and binds the resulting instances.
    *
    * Throws `RuntimeException` when the block or one of the two settings is missing.
    */
  override def bindings(environment: Environment, configuration: Configuration): Seq[Binding[_]] = {
    // Aborts with the given message; only evaluated when a setting is absent.
    def missing(message: String): Nothing = throw new RuntimeException(message)

    val searchConfig = configuration
      .getConfig("search")
      .getOrElse(missing("No search block in Play config!"))

    val permitsPerSecond = searchConfig
      .getDouble("rateLimit")
      .getOrElse(missing("No rateLimit in Play search config block!"))
    val rateLimiter = RateLimiter.create(permitsPerSecond)

    val browserName = searchConfig
      .getString("browser", Some(Set("jsoup", "htmlunit")))
      .getOrElse(missing("No browser in Play search config block!"))
    val browser: Browser = browserName match {
      case "jsoup" => JsoupBrowser()
      case "htmlunit" => HtmlUnitBrowser()
    }

    Seq(
      bind[RateLimiter].toInstance(rateLimiter),
      bind[Browser].toInstance(browser)
    )
  }
} 
开发者ID:haruko-devs,项目名称:haruko,代码行数:39,代码来源:SearchModule.scala

示例15: RoleSBOLExtractor

//设置package包名称以及导入依赖的类
package comp.bio.aging.benchling

import cats.Foldable
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL.Parse._
import cats.implicits._
import java.net._

/** Factory for the [[RoleSBOLExtractor]] class. */
object RoleSBOLExtractor{
  /** Parses every string as a `URI` and builds an extractor over the result. */
  def apply(urls: Iterable[String]) ={
    val parsed = urls.map(new URI(_))
    new RoleSBOLExtractor(parsed)
  }
}

/** Resolves a title for each of the given URIs by downloading the page it points to. */
class RoleSBOLExtractor(urls: Iterable[URI]) {
  val browser = JsoupBrowser()

  /** Downloads `url` and returns the text selected by `[property=name]`, if any. */
  def extract(url: String): Option[String] =
    browser.get(url) >?> text("[property=name]")

  /** Title per URI, computed on first access; falls back to the URI's path when no title is found. */
  lazy val titleMap: Map[URI, String] =
    (for (uri <- urls) yield uri -> extract(uri.toString).getOrElse(uri.getPath)).toMap
} 
开发者ID:antonkulaga,项目名称:benchling-client,代码行数:27,代码来源:RoleSBOLExtractor.scala


注:本文中的net.ruippeixotog.scalascraper.browser.JsoupBrowser类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。