当前位置: 首页>>代码示例>>Java>>正文


Java WARCRecordInfo类代码示例

本文整理汇总了Java中org.archive.io.warc.WARCRecordInfo的典型用法代码示例。如果您正苦于以下问题:Java WARCRecordInfo类的具体用法?Java WARCRecordInfo怎么用?Java WARCRecordInfo使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。


WARCRecordInfo类属于org.archive.io.warc包,在下文中一共展示了WARCRecordInfo类的10个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: acceptContent

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Accepts a record when at least one line of its content contains the text
 * "creativecommons".
 *
 * <p>The content stream is read completely and then rewound with {@code reset()}
 * before the lines are inspected.
 *
 * @param recordInfo WARC record info; its content stream is cast to
 *                   {@code ByteArrayInputStream}
 * @return true if any content line contains "creativecommons", false otherwise
 * @throws RuntimeException wrapping an {@code IOException} raised while reading the lines
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    ByteArrayInputStream stream = (ByteArrayInputStream) recordInfo.getContentStream();
    List<String> contentLines;

    try {
        contentLines = IOUtils.readLines(stream);
        // rewind the stream after it has been consumed
        stream.reset();
    }
    catch (IOException e) {
        throw new RuntimeException(e);
    }

    for (int i = 0; i < contentLines.size(); i++) {
        if (contentLines.get(i).contains("creativecommons")) {
            return true;
        }
    }

    return false;
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:22,代码来源:CreativeCommonsCandidateFilter.java

示例2: insert

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Writes the given page as a WARC record, calling {@code createWarcWriter()} first
 * when {@code writer} is still null.
 *
 * @param target page to store
 * @return true when the record was written; false when an {@code IOException}
 *         occurred (the exception is logged, not rethrown)
 */
@Override
public boolean insert(Page target) {
    boolean stored = false;
    try {
        if (writer == null) {
            createWarcWriter();
        }
        WARCRecordInfo record = getWarcRecordInfo(target, generator.getRecordID());
        // the size check, the write and the stats reset happen under the writer's monitor
        synchronized (writer) {
            writer.checkSize();
            writer.writeRecord(record);
            writer.resetTmpStats();
            writer.resetTmpRecordLog();
        }
        stored = true;
    } catch (IOException e) {
        logger.error("Exception thrown while creating a WARC record.", e);
    }
    return stored;
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:20,代码来源:WarcTargetRepository.java

示例3: writeRequest

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Builds a WARC {@code request} record from the "_request_" metadata entry and
 * writes it with {@code writer}.
 *
 * @param id record ID this request is concurrent to; when non-null it is added
 *           as a {@code WARCConstants.HEADER_KEY_CONCURRENT_TO} extra header
 *           wrapped in angle brackets
 * @return the record ID assigned to the written request record
 * @throws IOException    declared by the signature (propagated from the write)
 * @throws ParseException declared by the signature
 */
protected URI writeRequest(URI id) throws IOException, ParseException {
  WARCRecordInfo request = new WARCRecordInfo();

  request.setType(WARCConstants.WARCRecordType.request);
  request.setUrl(getUrl());

  // the record date is derived from the "nutch.fetch.time" metadata value
  long fetchMillis = Long.parseLong(metadata.get("nutch.fetch.time"));
  request.setCreate14DigitDate(DateUtils.getLog14Date(fetchMillis));

  request.setMimetype(WARCConstants.HTTP_REQUEST_MIMETYPE);
  request.setRecordId(GENERATOR.getRecordID());

  if (id != null) {
    ANVLRecord concurrentTo = new ANVLRecord();
    String bracketedId = "<" + id.toString() + ">";
    concurrentTo.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO, bracketedId);
    request.setExtraHeaders(concurrentTo);
  }

  // the record payload is the raw bytes of the "_request_" metadata value
  byte[] payload = metadata.get("_request_").getBytes();
  request.setContentLength(payload.length);
  request.setContentStream(new ByteArrayInputStream(payload));

  writer.writeRecord(request);

  return request.getRecordId();
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:28,代码来源:CommonCrawlFormatWARC.java

示例4: acceptContent

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Adds the record's content type to {@code frequency} and always accepts the record.
 *
 * @param recordInfo WARC record info
 * @return always true
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    String contentType = getContentType(recordInfo);
    frequency.addValue(contentType);

    return true;
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:8,代码来源:ContentTypeStatistics.java

示例5: getContentType

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Returns the content type stored in the "Nutch_Content-Type" extra header,
 * cut off at the first ';' (so parameters such as a charset are dropped), or an
 * empty string if it is not available.
 *
 * <p>Unlike a plain {@code split(";")[0]}, this also returns an empty string for a
 * header value made up only of ';' characters, where {@code split} yields an
 * empty array, and for a record that carries no extra headers at all.
 *
 * @param recordInfo WARC record info
 * @return string, never null
 */
protected String getContentType(WARCRecordInfo recordInfo)
{
    // a record without any extra headers has no content type to report
    if (recordInfo.getExtraHeaders() == null) {
        return "";
    }

    String contentTypeFull = recordInfo.getExtraHeaders().asMap().get("Nutch_Content-Type");
    if (contentTypeFull == null) {
        return "";
    }

    // keep only the part before the first ";"
    int separator = contentTypeFull.indexOf(';');
    if (separator < 0) {
        return contentTypeFull;
    }

    return contentTypeFull.substring(0, separator);
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:18,代码来源:ContentTypeFilter.java

示例6: acceptContent

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Accepts a record when its content type, as returned by {@code getContentType},
 * is contained in {@code ACCEPTED_CONTENT_TYPE}.
 *
 * @param recordInfo WARC record info
 * @return true if the content type is accepted, false otherwise
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    return ACCEPTED_CONTENT_TYPE.contains(getContentType(recordInfo));
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:8,代码来源:ContentTypeFilter.java

示例7: getWarcRecordInfo

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Builds a WARC {@code response} record describing the given page.
 *
 * <p>The record carries the page's final URL, the supplied record ID, the fetch
 * time, the re-created response bytes and ACHE-specific extension headers.
 *
 * @param page page to convert
 * @param uri  record ID to assign to the new record
 * @return the populated record (it is not written by this method)
 * @throws IOException declared by the signature
 */
public WARCRecordInfo getWarcRecordInfo(Page page, URI uri) throws IOException {
    WARCRecordInfo warcRecord = new WARCRecordInfo();

    warcRecord.setUrl(page.getFinalUrl());
    warcRecord.setRecordId(uri);
    warcRecord.setType(WARCRecordType.response);
    warcRecord.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);

    // Store the fetch time, formatted by dateFormater
    // NOTE(review): the original comment said ISO-8601; the pattern is defined elsewhere - confirm
    Date fetchTime = createFetchTimeDate(page);
    warcRecord.setCreate14DigitDate(dateFormater.get().format(fetchTime));

    // Re-create response body based on content bytes and response headers
    byte[] contentBytes = createContentBytes(page);
    warcRecord.setContentLength(contentBytes.length);
    warcRecord.setContentStream(new ByteArrayInputStream(contentBytes));

    // Store ACHE-specific metadata as non-standard extension header fields
    TargetRelevance targetRelevance = page.getTargetRelevance();
    if (targetRelevance != null) {
        warcRecord.addExtraHeader("ACHE-IsRelevant", String.valueOf(targetRelevance.isRelevant()));
        // Locale.ROOT pins the decimal separator to '.', so the header value does not
        // depend on the JVM's default locale
        warcRecord.addExtraHeader("ACHE-Relevance",
                String.format(java.util.Locale.ROOT, "%.10f", targetRelevance.getRelevance()));
    }
    warcRecord.addExtraHeader("ACHE-Requested-URL", page.getRequestedUrl());

    return warcRecord;
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:28,代码来源:WarcTargetRepository.java

示例8: writeResponse

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Builds a WARC record for the fetched response and writes it with {@code writer}.
 *
 * <p>When the "_response.headers_" metadata entry is non-blank, the record is a
 * {@code response} whose payload is those headers followed by the response
 * content. Otherwise the record is downgraded to a {@code resource} record that
 * carries only the content, with the content's own MIME type.
 *
 * <p>A record with an empty payload is not written, but its record ID is still
 * returned.
 *
 * @return the record ID assigned to the record
 * @throws IOException    declared by the signature (propagated from the write)
 * @throws ParseException declared by the signature
 */
protected URI writeResponse() throws IOException, ParseException {
  WARCRecordInfo record = new WARCRecordInfo();

  record.setType(WARCConstants.WARCRecordType.response);
  record.setUrl(getUrl());

  // the record date is derived from the "nutch.fetch.time" metadata value
  record.setCreate14DigitDate(DateUtils
      .getLog14Date(Long.parseLong(metadata.get("nutch.fetch.time"))));
  record.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
  record.setRecordId(GENERATOR.getRecordID());

  String IP = getResponseAddress();

  if (StringUtils.isNotBlank(IP)) {
    record.addExtraHeader(WARCConstants.HEADER_KEY_IP, IP);
  }

  if (ParseSegment.isTruncated(content)) {
    record.addExtraHeader(WARCConstants.HEADER_KEY_TRUNCATED, "unspecified");
  }

  ByteArrayOutputStream output = new ByteArrayOutputStream();

  String httpHeaders = metadata.get("_response.headers_");

  if (StringUtils.isNotBlank(httpHeaders)) {
    output.write(httpHeaders.getBytes());
  } else {
    // change the record type to resource as we do not have information about
    // the headers
    record.setType(WARCConstants.WARCRecordType.resource);
    record.setMimetype(content.getContentType());
  }

  output.write(getResponseContent().getBytes());

  record.setContentLength(output.size());
  record.setContentStream(new ByteArrayInputStream(output.toByteArray()));

  if (output.size() > 0) {
    // avoid generating a 0 sized record, as the webarchive library will
    // complain about it
    writer.writeRecord(record);
  }

  return record.getRecordId();
}
 
开发者ID:jorcox,项目名称:GeoCrawler,代码行数:48,代码来源:CommonCrawlFormatWARC.java

示例9: write

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Converts one content entry into a WARC {@code response} record and writes it,
 * provided the content is non-empty and every export filter accepts it.
 *
 * <p>Every filter is consulted, even after one has already rejected the record.
 * On a successful write, {@code totalBytesWritten} and {@code entriesCounter}
 * are updated.
 *
 * @param writer  WARC writer that receives the record
 * @param content content entry to export
 * @throws IOException    declared by the signature (propagated from the write)
 * @throws ParseException declared by the signature; a failure to parse the "Date"
 *                        metadata value is handled inside this method
 */
protected void write(final WARCWriter writer, final Content content)
        throws IOException, ParseException
{
    WARCRecordInfo record = new WARCRecordInfo();
    record.setUrl(content.getUrl());

    byte[] payload = content.getContent();
    if (payload.length == 0) {
        // skip empty records
        return;
    }

    record.setContentStream(new ByteArrayInputStream(payload));
    record.setContentLength(payload.length);
    record.setEnforceLength(true);

    // start from the default date; replaced when the "Date" metadata value parses
    String warcDate = DEFAULT_WARC_DATE;
    String httpDate = content.getMetadata().get("Date");
    if (httpDate != null) {
        try {
            SimpleDateFormat httpDateFormat =
                    new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ZZZ", Locale.ENGLISH);
            long epochMillis = httpDateFormat.parse(httpDate).getTime();
            warcDate = String.valueOf(epochMillis);
        }
        catch (ParseException ex) {
            // ignore: the default date stays in place
        }
    }
    record.setCreate14DigitDate(warcDate);

    record.setType(WARCConstants.WARCRecordType.response);
    record.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
    record.setRecordId(generator.getRecordID());

    // copy selected metadata entries into the record as "Nutch_"-prefixed extra headers
    Set<String> forwardedKeys = new HashSet<String>(
            Arrays.asList("nutch.crawl.score", "nutch.segment.name", "Set-Cookie",
                    "Content-Type", "Server", "Pragma", "Cache-Control"));
    for (String key : forwardedKeys) {
        String headerValue = content.getMetadata().get(key);
        if (headerValue == null) {
            continue;
        }
        record.addExtraHeader("Nutch_" + key, headerValue);
    }

    // run all filters; a single rejection vetoes the export
    boolean accepted = true;
    for (ExportContentFilter filter : filters) {
        if (!filter.acceptContent(record)) {
            accepted = false;
        }
    }
    if (!accepted) {
        return;
    }

    writer.writeRecord(record);

    totalBytesWritten += payload.length;
    entriesCounter++;
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:66,代码来源:NutchToWARCConverter.java

示例10: acceptContent

import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Decides whether the given record should be exported to the WARC file.
 *
 * @param recordInfo record info describing the content that is a candidate for export
 * @return {@code true} if the filter accepts this content for exporting to the WARC
 *         file; {@code false} otherwise
 */
boolean acceptContent(WARCRecordInfo recordInfo);
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:9,代码来源:ExportContentFilter.java


注:本文中的org.archive.io.warc.WARCRecordInfo类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。