本文整理汇总了Java中org.archive.io.warc.WARCReaderFactory.get方法的典型用法代码示例。如果您正苦于以下问题：Java WARCReaderFactory.get方法的具体用法？Java WARCReaderFactory.get怎么用？Java WARCReaderFactory.get使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.archive.io.warc.WARCReaderFactory的用法示例。
在下文中一共展示了WARCReaderFactory.get方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: processWarc
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Iterates over every record of one WARC file, passing each record to
 * {@code handleRecord} and updating {@code extractorStats}.
 *
 * <p>Both the reader and each record are switched to non-strict mode before use.
 * The running statistics are printed to stderr for the first record and then
 * after every 1000 records.
 *
 * @param warcFile path of the WARC file to process
 * @throws IOException if the file cannot be opened, read, or closed
 */
private void processWarc(Path warcFile) throws IOException {
    extractorStats.addWarc(warcFile.getFileName().toString());
    // try-with-resources releases the file handle even if a record fails mid-iteration.
    try (InputStream is = Files.newInputStream(warcFile)) {
        ArchiveReader reader = WARCReaderFactory.get(warcFile.toString(), is, true);
        try {
            reader.setStrict(false);
            int i = 0;
            for (ArchiveRecord record : reader) {
                record.setStrict(false);
                extractorStats.visitedRecord();
                handleRecord(record);
                if (i++ % 1000 == 0) {
                    System.err.println(extractorStats);
                }
            }
        } finally {
            // Explicit close() rather than try-with-resources: close() is known to
            // exist on the reader, while Closeable support depends on the library version.
            reader.close();
        }
    }
}
示例2: generate
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads records from the WARC located at {@code sourceURL}, converts each one to a
 * {@code Page}, and writes every non-empty page that has outbound links to
 * {@code path} as one JSON document per line.
 *
 * <p>Writing stops as soon as {@code numPages} pages have been written or the
 * archive is exhausted, whichever comes first. Because the stop condition is an
 * equality check, a {@code numPages} of zero or less never triggers it and the
 * whole archive is written.
 *
 * @param path     output file, created or overwritten
 * @param numPages number of pages to write before stopping
 * @throws Exception if the archive cannot be opened or read, or the output cannot be written
 */
public static void generate(Path path, int numPages) throws Exception {
    Gson gson = new Gson();
    long count = 0;
    try (BufferedWriter writer = Files.newBufferedWriter(path)) {
        ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0);
        try {
            for (ArchiveRecord r : ar) {
                Page p = ArchiveUtil.buildPage(r);
                if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                    log.debug("Skipping {}", p.getUrl());
                    continue;
                }
                log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
                String json = gson.toJson(p);
                writer.write(json);
                writer.newLine();
                count++;
                if (count == numPages) {
                    break;
                } else if ((count % 1000) == 0) {
                    log.info("Wrote {} of {} pages to {}", count, numPages, path);
                }
            }
        } finally {
            // Release the archive reader even when we break out early or a record fails.
            ar.close();
        }
    }
    // Report what was actually written; the archive may hold fewer usable pages than requested.
    log.info("Wrote {} pages to {}", count, path);
}
示例3: readBz2
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads a bz2-compressed WARC file.
 *
 * <p>The file is first decompressed into a temporary {@code .warc} file, which is
 * then read record by record. The header fields of every record are printed to
 * stdout, followed by the total record count. The temporary file is deleted
 * afterwards, including when reading fails.
 *
 * @param file path of the bz2-compressed WARC file
 * @throws IOException if decompression, reading, or deleting the temporary file fails
 */
public static void readBz2(String file)
        throws IOException
{
    // decompress bz2 file to tmp file
    File tmpFile = File.createTempFile("tmp", ".warc");
    try {
        // Each stream is its own resource, so all are closed even if a later constructor throws.
        try (FileInputStream compressedIn = new FileInputStream(file);
             BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(compressedIn);
             FileOutputStream tmpOut = new FileOutputStream(tmpFile)) {
            IOUtils.copy(inputStream, tmpOut);
        }

        WARCReader reader = WARCReaderFactory.get(tmpFile);
        int counter = 0;
        try {
            for (ArchiveRecord record : reader) {
                System.out.println(record.getHeader().getHeaderFields());
                counter++;
            }
        } finally {
            // Close before deleting: an open handle can block deletion on some platforms.
            reader.close();
        }
        System.out.println(counter);
    } finally {
        FileUtils.forceDelete(tmpFile);
    }
}
示例4: testARCReaderClose
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Copies the {@code fyensdk.warc} fixture, opens the copy, wraps its first record
 * in a {@code BitarchiveRecord}, then closes the record and the reader and deletes
 * the copy. Any {@code IOException} along the way fails the test.
 */
public void testARCReaderClose() {
    final File fixture = new File(ARCHIVE_DIR + "fyensdk.warc");
    final File workingCopy = new File(ARCHIVE_DIR + testFileName);
    try {
        FileUtils.copyFile(fixture, workingCopy);

        WARCReader warcReader = WARCReaderFactory.get(workingCopy);
        WARCRecord firstRecord = (WARCRecord) warcReader.get(0);

        // Only the construction matters here; the resulting object is not inspected.
        new BitarchiveRecord(firstRecord, testFileName);

        firstRecord.close();
        warcReader.close();
        workingCopy.delete();
    } catch (IOException ioe) {
        fail("Should not throw IOException " + ioe);
    }
}
示例5: main
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens a gzip-compressed WARC file and prints the header, URL, and the first
 * 500 characters of the content of its first six records.
 *
 * @param args unused
 * @throws IOException if the WARC source cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // Remote compressed WARC file to read. To read a local copy instead, set fn to a path such as
    // "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz".
    String url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-23/segments/1404776400583.60/warc/CC-MAIN-20140707234000-00000-ip-10-180-212-248.ec2.internal.warc.gz";
    String fn = url;
    // FileInputStream cannot open an http(s) URL, so remote sources are fetched
    // through a URL stream and only local paths go through FileInputStream.
    boolean remote = fn.startsWith("http://") || fn.startsWith("https://");
    try (java.io.InputStream is = remote
            ? java.net.URI.create(fn).toURL().openStream()
            : new FileInputStream(fn)) {
        // The file name identifies the ArchiveReader and indicates if it should be decompressed
        ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
        try {
            // Once we have an ArchiveReader, we can work through each of the records it contains
            int i = 0;
            for (ArchiveRecord r : ar) {
                // The header file contains information such as the type of record, size, creation time, and URL
                System.out.println(r.getHeader());
                System.out.println(r.getHeader().getUrl());
                System.out.println();
                // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
                // Create a byte array that is as long as the record's stated length
                byte[] rawData = IOUtils.toByteArray(r, r.available());
                // Why don't we convert it to a string and print the start of it? Let's hope it's text!
                // NOTE(review): decodes with the platform default charset — confirm that is intended.
                String content = new String(rawData);
                System.out.println(content.substring(0, Math.min(500, content.length())));
                System.out.println((content.length() > 500 ? "..." : ""));
                // Pretty printing to make the output more readable
                System.out.println("=-=-=-=-=-=-=-=-=");
                // Post-increment with "> 4" stops after the sixth record.
                if (i++ > 4) break;
            }
        } finally {
            ar.close();
        }
    }
}
示例6: initialize
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens the file behind the given split and creates a WARC reader over it.
 *
 * <p>Stores the open stream in {@code fsin}, the file's name in {@code arPath},
 * and the reader in {@code ar}.
 *
 * @param inputSplit split to read; must be a {@code FileSplit}
 * @param context    task context supplying the configuration used to resolve the file system
 * @throws IOException          if the file cannot be opened or the reader cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final Path warcPath = ((FileSplit) inputSplit).getPath();
    final String warcName = warcPath.getName();

    // Resolve the file system from the job configuration, then open the split's file.
    fsin = warcPath.getFileSystem(context.getConfiguration()).open(warcPath);
    arPath = warcName;
    ar = WARCReaderFactory.get(warcName, fsin, true);
}
示例7: initialize
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Sets up reading for one input split: opens the split's file, records its name,
 * and wraps the stream in a WARC reader.
 *
 * @param inputSplit the split to read, cast to {@code FileSplit}
 * @param context    source of the configuration used to look up the file system
 * @throws IOException          if opening the file or creating the reader fails
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) inputSplit;
    final Path splitPath = fileSplit.getPath();
    final FileSystem fileSystem = splitPath.getFileSystem(context.getConfiguration());

    fsin = fileSystem.open(splitPath);

    // The same file name is kept in arPath and passed to the reader factory.
    final String fileName = splitPath.getName();
    arPath = fileName;
    ar = WARCReaderFactory.get(fileName, fsin, true);
}
示例8: readPages
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads every record of a WARC file into a map of pages keyed by URL.
 *
 * <p>Records whose page is empty or has no outbound links are skipped. If several
 * pages map to the same URL, the last one read wins.
 *
 * @param input WARC file to read
 * @return map from page URL to page, possibly empty
 * @throws Exception if the file cannot be opened or a record cannot be read or converted
 */
public static Map<URL, Page> readPages(File input) throws Exception {
    Map<URL, Page> pageMap = new HashMap<>();
    ArchiveReader ar = WARCReaderFactory.get(input);
    try {
        for (ArchiveRecord r : ar) {
            Page p = ArchiveUtil.buildPage(r);
            if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
                continue;
            }
            pageMap.put(URL.fromUri(p.getUri()), p);
        }
    } finally {
        // Close in finally so a failing record does not leak the file handle.
        ar.close();
    }
    return pageMap;
}
示例9: read
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads a default (gzipped) WARC file, printing the header fields of every record
 * to stdout followed by the total number of records.
 *
 * @param file path of the gz file
 * @throws IOException if the file cannot be opened, read, or closed
 */
public static void read(String file)
        throws IOException
{
    WARCReader reader = WARCReaderFactory.get(new File(file));
    int counter = 0;
    try {
        for (ArchiveRecord record : reader) {
            System.out.println(record.getHeader().getHeaderFields());
            counter++;
        }
    } finally {
        // Release the underlying file handle even if iteration fails.
        reader.close();
    }
    System.out.println(counter);
}
示例10: openFile
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Creates a WARC reader for the file at the given path.
 *
 * @param filePath location of the WARC file
 * @return a reader over that file; closing it is left to the caller
 * @throws IOException if the reader cannot be created
 */
private WARCReader openFile(Path filePath) throws IOException {
    final WARCReader warcReader = WARCReaderFactory.get(filePath.toFile());
    return warcReader;
}
示例11: open
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens an archive reader for the given path.
 *
 * <p>Paths ending in {@code .warc.gz.open} are opened through
 * {@code WARCReaderFactory}; every other path goes through
 * {@code ArchiveReaderFactory}.
 *
 * @param path archive file to open
 * @return a reader over the file
 * @throws IOException if the reader cannot be created
 */
public static ArchiveReader open(Path path) throws IOException {
    final boolean hasOpenSuffix = path.toString().endsWith(".warc.gz.open");
    if (!hasOpenSuffix) {
        return ArchiveReaderFactory.get(path.toFile());
    }
    // ArchiveReaderFactory.get doesn't understand the .open extension,
    // so these files are handed to the WARC factory directly.
    return WARCReaderFactory.get(path.toFile());
}
示例12: testWarcCopy
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Writes a single metadata record to a WARC file, copies it into a second WARC file
 * with {@code WARCUtils.insertWARCFile}, and checks that the header values of the
 * copied record match the original.
 */
public void testWarcCopy() {
    try {
        byte[] warcBytes = (
                "WARC/1.0\r\n"
                + "WARC-Type: metadata\r\n"
                + "WARC-Target-URI: metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86\r\n"
                + "WARC-Date: 2012-08-24T11:42:55Z\r\n"
                + "WARC-Record-ID: <urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>\r\n"
                + "WARC-Payload-Digest: sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U\r\n"
                + "WARC-IP-Address: 207.241.229.39\r\n"
                + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb30>\r\n"
                + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>\r\n"
                + "Content-Type: text/plain\r\n"
                + "Content-Length: 2\r\n"
                + "\r\n"
                + "85"
                + "\r\n"
                + "\r\n").getBytes();
        File orgFile = new File(TestInfo.WORKING_DIR, "original4copy.warc");
        FileUtils.writeBinaryFile(orgFile, warcBytes);
        File copiedFile = new File(TestInfo.WORKING_DIR, "copied.warc");
        WARCWriter writer = WARCUtils.createWARCWriter(copiedFile);
        WARCUtils.insertWARCFile(orgFile, writer);
        writer.close();
        // The copy must be readable as raw bytes and must not be empty.
        Assert.assertTrue(FileUtils.readBinaryFile(copiedFile).length > 0);
        WARCReader reader = WARCReaderFactory.get(copiedFile);
        Assert.assertNotNull(reader);
        try {
            ArchiveRecord record = reader.get();
            Assert.assertNotNull(record);
            ArchiveRecordHeader header = record.getHeader();
            Assert.assertNotNull(header);
            Assert.assertEquals("metadata", header.getHeaderValue("WARC-Type"));
            Assert.assertEquals("metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86", header.getHeaderValue("WARC-Target-URI"));
            Assert.assertEquals("2012-08-24T11:42:55Z", header.getHeaderValue("WARC-Date"));
            Assert.assertEquals("<urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>", header.getHeaderValue("WARC-Record-ID"));
            Assert.assertEquals("sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", header.getHeaderValue("WARC-Payload-Digest"));
            Assert.assertEquals("207.241.229.39", header.getHeaderValue("WARC-IP-Address"));
            // WARC-Concurrent-To is written twice above; the expected value is the second one.
            Assert.assertEquals("<urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>", header.getHeaderValue("WARC-Concurrent-To"));
            Assert.assertEquals("text/plain", header.getHeaderValue("Content-Type"));
            Assert.assertEquals("2", header.getHeaderValue("Content-Length"));
        } finally {
            // Release the file handle whether or not the assertions pass.
            reader.close();
        }
    }
    catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Unexpected exception!");
    }
}
示例13: main
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Fetches one gzip-compressed WARC file from the public {@code aws-publicdatasets}
 * S3 bucket and prints the header, URL, and the first 500 characters of the content
 * of its first six records.
 *
 * @param args unused
 * @throws IOException        if the archive cannot be read
 * @throws S3ServiceException if the S3 object cannot be retrieved
 */
public static void main(String[] args) throws IOException, S3ServiceException {
    // We're accessing a publicly available bucket so don't need to fill in our credentials
    S3Service s3s = new RestS3Service(null);
    // Let's grab a file out of the CommonCrawl S3 bucket
    String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
    // The file name identifies the ArchiveReader and indicates if it should be decompressed
    ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
    try {
        // Once we have an ArchiveReader, we can work through each of the records it contains
        int i = 0;
        for (ArchiveRecord r : ar) {
            // The header file contains information such as the type of record, size, creation time, and URL
            System.out.println("Header: " + r.getHeader());
            System.out.println("URL: " + r.getHeader().getUrl());
            System.out.println();
            // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
            // Create a byte array that is as long as all the record's stated length
            byte[] rawData = new byte[r.available()];
            // A single read() may return fewer bytes than requested, so keep reading
            // until the buffer is full or the record ends.
            int filled = 0;
            while (filled < rawData.length) {
                int n = r.read(rawData, filled, rawData.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            // Note: potential optimization would be to have a large buffer only allocated once
            // Why don't we convert it to a string and print the start of it? Let's hope it's text!
            // Only the bytes actually read are decoded, so a short record adds no NUL padding.
            String content = new String(rawData, 0, filled);
            System.out.println(content.substring(0, Math.min(500, content.length())));
            System.out.println((content.length() > 500 ? "..." : ""));
            // Pretty printing to make the output more readable
            System.out.println("=-=-=-=-=-=-=-=-=");
            // Post-increment with "> 4" stops after the sixth record.
            if (i++ > 4) break;
        }
    } finally {
        ar.close();
    }
}
示例14: main
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens a local gzip-compressed WARC file and prints the header, URL, and the first
 * 500 characters of the content of its first six records.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or read
 */
public static void main(String[] args) throws IOException {
    // Set up a local compressed WARC file for reading
    String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
    // try-with-resources closes the file even when we stop early or a record fails.
    try (FileInputStream is = new FileInputStream(fn)) {
        // The file name identifies the ArchiveReader and indicates if it should be decompressed
        ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
        try {
            // Once we have an ArchiveReader, we can work through each of the records it contains
            int i = 0;
            for (ArchiveRecord r : ar) {
                // The header file contains information such as the type of record, size, creation time, and URL
                System.out.println(r.getHeader());
                System.out.println(r.getHeader().getUrl());
                System.out.println();
                // If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
                // Create a byte array that is as long as the record's stated length
                byte[] rawData = IOUtils.toByteArray(r, r.available());
                // Why don't we convert it to a string and print the start of it? Let's hope it's text!
                // NOTE(review): decodes with the platform default charset — confirm that is intended.
                String content = new String(rawData);
                System.out.println(content.substring(0, Math.min(500, content.length())));
                System.out.println((content.length() > 500 ? "..." : ""));
                // Pretty printing to make the output more readable
                System.out.println("=-=-=-=-=-=-=-=-=");
                // Post-increment with "> 4" stops after the sixth record.
                if (i++ > 4) break;
            }
        } finally {
            ar.close();
        }
    }
}
示例15: getArchiveReader
import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Picks the reader factory that matches the file's suffix and opens the file at
 * the given offset.
 *
 * @param f      archive file to open
 * @param offset offset passed on to the chosen factory
 * @return a reader from {@code ARCReaderFactory} or {@code WARCReaderFactory}
 * @throws IOException if the suffix is neither ARC nor WARC, or the reader cannot be created
 */
protected ArchiveReader getArchiveReader(final File f,
        final long offset)
        throws IOException {
    final String fileName = f.getName();

    if (ARCReaderFactory.isARCSuffix(fileName)) {
        return ARCReaderFactory.get(f, true, offset);
    }
    if (WARCReaderFactory.isWARCSuffix(fileName)) {
        return WARCReaderFactory.get(f, offset);
    }

    final String message = "Unknown file extension (Not ARC nor WARC): " + fileName;
    throw new IOException(message);
}