本文整理汇总了 Java 中 org.archive.io.warc.WARCRecordInfo 类的典型用法代码示例。如果您正苦于以下问题：Java WARCRecordInfo 类的具体用法？Java WARCRecordInfo 怎么用？Java WARCRecordInfo 使用的例子？那么，这里精选的类代码示例或许可以为您提供帮助。
WARCRecordInfo 类属于 org.archive.io.warc 包，在下文中一共展示了 WARCRecordInfo 类的 10 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: acceptContent
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Accepts a record when at least one line of its content contains the
 * substring {@code creativecommons}.
 * <p>
 * The content stream is read completely and then reset, so it can be read
 * again after this method returns.
 *
 * @param recordInfo WARC record info; its content stream is cast to
 *                   {@link ByteArrayInputStream}
 * @return true if any content line contains "creativecommons"; false otherwise
 * @throws RuntimeException wrapping an {@link IOException} raised while reading the content
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    ByteArrayInputStream stream = (ByteArrayInputStream) recordInfo.getContentStream();

    List<String> contentLines;
    try {
        contentLines = IOUtils.readLines(stream);
    }
    catch (IOException ex) {
        throw new RuntimeException(ex);
    }

    // rewind the stream so that the content can be consumed again later
    stream.reset();

    boolean found = false;
    for (int i = 0; i < contentLines.size() && !found; i++) {
        found = contentLines.get(i).contains("creativecommons");
    }
    return found;
}
示例2: insert
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Writes the given page as a single WARC record.
 * <p>
 * The WARC writer is created on first use. An {@link IOException} raised at
 * any step is logged and reported through the return value instead of being
 * propagated.
 *
 * @param target the page to store
 * @return true if the record was written; false if an {@link IOException} occurred
 */
@Override
public boolean insert(Page target) {
    boolean written = false;
    try {
        if (writer == null) {
            createWarcWriter();
        }
        final WARCRecordInfo record = getWarcRecordInfo(target, generator.getRecordID());
        // the writer calls below are performed while holding the writer's monitor
        synchronized (writer) {
            writer.checkSize();
            writer.writeRecord(record);
            writer.resetTmpStats();
            writer.resetTmpRecordLog();
        }
        written = true;
    } catch (IOException ioe) {
        logger.error("Exception thrown while creating a WARC record.", ioe);
    }
    return written;
}
示例3: writeRequest
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Builds a WARC record of type {@code request} from the stored metadata and
 * writes it with the WARC writer.
 * <p>
 * The record date is derived from the {@code nutch.fetch.time} metadata entry
 * and the payload from the {@code _request_} metadata entry.
 *
 * @param id identifier of a related record; when not null it is added,
 *           wrapped in angle brackets, as the concurrent-to extra header
 * @return the record id assigned to the written request record
 * @throws IOException    declared by the signature; presumably raised by the writer
 * @throws ParseException declared by the signature
 */
protected URI writeRequest(URI id) throws IOException, ParseException {
    final WARCRecordInfo request = new WARCRecordInfo();
    request.setType(WARCConstants.WARCRecordType.request);
    request.setUrl(getUrl());

    final long fetchTime = Long.parseLong(metadata.get("nutch.fetch.time"));
    request.setCreate14DigitDate(DateUtils.getLog14Date(fetchTime));
    request.setMimetype(WARCConstants.HTTP_REQUEST_MIMETYPE);
    request.setRecordId(GENERATOR.getRecordID());

    if (id != null) {
        // link this request to the record identified by the given id
        final ANVLRecord concurrentTo = new ANVLRecord();
        concurrentTo.addLabelValue(WARCConstants.HEADER_KEY_CONCURRENT_TO,
                '<' + id.toString() + '>');
        request.setExtraHeaders(concurrentTo);
    }

    final byte[] payload = metadata.get("_request_").getBytes();
    request.setContentLength(payload.length);
    request.setContentStream(new ByteArrayInputStream(payload));

    writer.writeRecord(request);
    return request.getRecordId();
}
示例4: acceptContent
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Records the content type of the given record in the frequency counter and
 * accepts every record.
 *
 * @param recordInfo WARC record info
 * @return always true
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    final String contentType = getContentType(recordInfo);
    frequency.addValue(contentType);
    return true;
}
示例5: getContentType
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Returns the content type stored in the {@code Nutch_Content-Type} extra
 * header, without any parameters that follow the first {@code ';'}
 * (e.g. {@code "text/html; charset=UTF-8"} yields {@code "text/html"}).
 * <p>
 * An empty string is returned when the record has no extra headers, when the
 * header is missing, or when the header value starts with {@code ';'}.
 *
 * @param recordInfo WARC record info
 * @return string, never null
 */
protected String getContentType(WARCRecordInfo recordInfo)
{
    // guard against a record that carries no extra headers at all
    if (recordInfo.getExtraHeaders() == null) {
        return "";
    }

    String contentTypeFull = recordInfo.getExtraHeaders().asMap().get("Nutch_Content-Type");
    if (contentTypeFull == null) {
        return "";
    }

    // cut at the first ';' -- unlike split(";")[0] this cannot fail on values
    // such as ";" for which split() returns an empty array
    int separator = contentTypeFull.indexOf(';');
    if (separator < 0) {
        return contentTypeFull;
    }
    return contentTypeFull.substring(0, separator);
}
示例6: acceptContent
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Accepts a record only if its content type is one of the accepted content types.
 *
 * @param recordInfo WARC record info
 * @return true if {@code ACCEPTED_CONTENT_TYPE} contains the record's content type
 */
@Override
public boolean acceptContent(WARCRecordInfo recordInfo)
{
    return ACCEPTED_CONTENT_TYPE.contains(getContentType(recordInfo));
}
示例7: getWarcRecordInfo
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Creates a WARC {@code response} record describing the given page.
 * <p>
 * The record carries the page's final URL, the supplied record id, the fetch
 * time, and the re-created response bytes. The requested URL and, when
 * available, the target relevance are stored as {@code ACHE-*} extra headers.
 *
 * @param page the crawled page to convert
 * @param uri  the record id to assign to the new record
 * @return a populated record, ready to be handed to a WARC writer
 * @throws IOException declared by the signature; presumably raised while
 *                     re-creating the content bytes
 */
public WARCRecordInfo getWarcRecordInfo(Page page, URI uri) throws IOException {
    WARCRecordInfo warcRecord = new WARCRecordInfo();
    warcRecord.setUrl(page.getFinalUrl());
    warcRecord.setRecordId(uri);
    warcRecord.setType(WARCRecordType.response);
    warcRecord.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);

    // Store the fetch time; the textual format is whatever dateFormater produces
    Date fetchTime = createFetchTimeDate(page);
    warcRecord.setCreate14DigitDate(dateFormater.get().format(fetchTime));

    // Re-create response body based on content bytes and response headers
    byte[] contentBytes = createContentBytes(page);
    warcRecord.setContentLength(contentBytes.length);
    warcRecord.setContentStream(new ByteArrayInputStream(contentBytes));

    // Store ACHE-specific metadata as non-standard extension header fields
    TargetRelevance targetRelevance = page.getTargetRelevance();
    if (targetRelevance != null) {
        warcRecord.addExtraHeader("ACHE-IsRelevant",
                String.valueOf(targetRelevance.isRelevant()));
        // Format with a fixed locale so the decimal separator is always '.',
        // independent of the JVM's default locale
        warcRecord.addExtraHeader("ACHE-Relevance",
                String.format(java.util.Locale.ROOT, "%.10f", targetRelevance.getRelevance()));
    }
    warcRecord.addExtraHeader("ACHE-Requested-URL", page.getRequestedUrl());

    return warcRecord;
}
示例8: writeResponse
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Builds a WARC record for the fetched response and writes it with the WARC
 * writer.
 * <p>
 * When the {@code _response.headers_} metadata entry is not blank, the record
 * is a {@code response} record whose payload is the HTTP headers followed by
 * the response content. Otherwise the record type is switched to
 * {@code resource} and only the content is stored. A record with an empty
 * payload is not written.
 *
 * @return the record id assigned to the record; note that it is returned even
 *         when the record was skipped because its payload was empty
 * @throws IOException    declared by the signature; presumably raised by the writer
 * @throws ParseException declared by the signature
 */
protected URI writeResponse() throws IOException, ParseException {
    WARCRecordInfo record = new WARCRecordInfo();
    record.setType(WARCConstants.WARCRecordType.response);
    record.setUrl(getUrl());
    record.setCreate14DigitDate(DateUtils
            .getLog14Date(Long.parseLong(metadata.get("nutch.fetch.time"))));
    record.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
    record.setRecordId(GENERATOR.getRecordID());

    String ipAddress = getResponseAddress();
    if (StringUtils.isNotBlank(ipAddress)) {
        record.addExtraHeader(WARCConstants.HEADER_KEY_IP, ipAddress);
    }

    if (ParseSegment.isTruncated(content)) {
        record.addExtraHeader(WARCConstants.HEADER_KEY_TRUNCATED, "unspecified");
    }

    // NOTE(review): getBytes() below encodes with the JVM default charset
    ByteArrayOutputStream output = new ByteArrayOutputStream();
    String httpHeaders = metadata.get("_response.headers_");
    if (StringUtils.isNotBlank(httpHeaders)) {
        output.write(httpHeaders.getBytes());
    } else {
        // change the record type to resource as we do not have information
        // about the headers
        record.setType(WARCConstants.WARCRecordType.resource);
        record.setMimetype(content.getContentType());
    }
    output.write(getResponseContent().getBytes());

    record.setContentLength(output.size());
    record.setContentStream(new ByteArrayInputStream(output.toByteArray()));

    if (output.size() > 0) {
        // avoid generating a 0 sized record, as the webarchive library will
        // complain about it
        writer.writeRecord(record);
    }

    return record.getRecordId();
}
示例9: write
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Converts one content entry into a WARC {@code response} record and writes
 * it, provided that every export filter accepts the record.
 * <p>
 * Entries with empty content are skipped. Selected metadata values are copied
 * into extra headers prefixed with {@code Nutch_}. All filters are invoked for
 * each record, even after one of them has already rejected it.
 *
 * @param writer  WARC writer that receives accepted records
 * @param content content entry to export
 * @throws IOException    declared by the signature; presumably raised by the writer
 * @throws ParseException declared by the signature; a date parse failure
 *                        inside this method is caught and not propagated
 */
protected void write(final WARCWriter writer, final Content content)
    throws IOException, ParseException
{
    WARCRecordInfo record = new WARCRecordInfo();
    record.setUrl(content.getUrl());

    byte[] payload = content.getContent();
    if (payload.length == 0) {
        // skip empty records
        return;
    }

    record.setContentStream(new ByteArrayInputStream(payload));
    record.setContentLength(payload.length);
    record.setEnforceLength(true);

    // start with the default date and replace it if the "Date" metadata parses
    String warcDate = DEFAULT_WARC_DATE;
    String httpDate = content.getMetadata().get("Date");
    if (httpDate != null) {
        SimpleDateFormat httpDateFormat =
                new SimpleDateFormat("EEE, dd MMM yyyy kk:mm:ss ZZZ", Locale.ENGLISH);
        try {
            long epochMillis = httpDateFormat.parse(httpDate).getTime();
            warcDate = String.valueOf(epochMillis);
        }
        catch (ParseException ignored) {
            // unparseable date: keep the default value
        }
    }

    record.setCreate14DigitDate(warcDate);
    record.setType(WARCConstants.WARCRecordType.response);
    record.setMimetype(WARCConstants.HTTP_RESPONSE_MIMETYPE);
    record.setRecordId(generator.getRecordID());

    // copy selected metadata entries into extra headers
    Set<String> metadataKeys = new HashSet<String>(
            Arrays.asList("nutch.crawl.score", "nutch.segment.name", "Set-Cookie",
                    "Content-Type", "Server", "Pragma", "Cache-Control"));
    for (String key : metadataKeys) {
        String metadataValue = content.getMetadata().get(key);
        if (metadataValue == null) {
            continue;
        }
        record.addExtraHeader("Nutch_" + key, metadataValue);
    }

    // consult every filter; a single rejection vetoes the export
    boolean accepted = true;
    for (ExportContentFilter filter : filters) {
        if (!filter.acceptContent(record)) {
            accepted = false;
        }
    }
    if (!accepted) {
        return;
    }

    writer.writeRecord(record);
    totalBytesWritten += payload.length;
    entriesCounter++;
}
示例10: acceptContent
import org.archive.io.warc.WARCRecordInfo; //导入依赖的package包/类
/**
 * Decides whether the given record should be exported to the WARC file.
 *
 * @param recordInfo record info describing the content that is about to be exported
 * @return {@code true} if the filter accepts this content for exporting to the
 *         WARC file; {@code false} otherwise
 */
boolean acceptContent(WARCRecordInfo recordInfo);