当前位置: 首页>>代码示例>>Java>>正文


Java WARCReaderFactory.get方法代码示例

本文整理汇总了Java中org.archive.io.warc.WARCReaderFactory.get方法的典型用法代码示例。如果您正苦于以下问题:Java WARCReaderFactory.get方法的具体用法?Java WARCReaderFactory.get怎么用?Java WARCReaderFactory.get使用的例子?那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.archive.io.warc.WARCReaderFactory的用法示例。


在下文中一共展示了WARCReaderFactory.get方法的15个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: processWarc

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Walks every record of a single WARC file, passing each one to
 * {@code handleRecord} and updating {@code extractorStats} along the way.
 * The statistics are printed to stderr on the first record and then once
 * every 1000 records.
 *
 * @param warcFile path of the WARC file to process
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
private void processWarc(Path warcFile) throws IOException {
    extractorStats.addWarc(warcFile.getFileName().toString());
    // try-with-resources releases the file handle even when a record
    // handler throws part-way through the archive.
    try (InputStream is = Files.newInputStream(warcFile)) {
        ArchiveReader reader = WARCReaderFactory.get(warcFile.toString(), is, true);
        try {
            int i = 0;
            // NOTE(review): non-strict mode presumably lets the reader tolerate
            // malformed records instead of aborting -- confirm against the library docs.
            reader.setStrict(false);
            for (ArchiveRecord record : reader) {
                record.setStrict(false);
                extractorStats.visitedRecord();
                handleRecord(record);
                if (i++ % 1000 == 0) {
                    System.err.println(extractorStats);
                }
            }
        } finally {
            reader.close();
        }
    }
}
 
开发者ID:tballison,项目名称:SimpleCommonCrawlExtractor,代码行数:17,代码来源:AbstractExtractor.java

示例2: generate

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads pages from the WARC located at {@code sourceURL} and writes them to
 * {@code path} as JSON, one page per line, stopping once {@code numPages}
 * pages have been written. Pages that are empty or have no outbound links
 * are skipped and do not count towards the limit.
 *
 * @param path     file that receives the JSON lines
 * @param numPages number of pages after which writing stops
 * @throws Exception if the archive cannot be read or the file cannot be written
 */
public static void generate(Path path, int numPages) throws Exception {

  Gson gson = new Gson();
  long count = 0;
  try (BufferedWriter writer = Files.newBufferedWriter(path)) {
    ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0);
    try {
      for (ArchiveRecord r : ar) {
        Page p = ArchiveUtil.buildPage(r);
        if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
          log.debug("Skipping {}", p.getUrl());
          continue;
        }
        log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
        String json = gson.toJson(p);
        writer.write(json);
        writer.newLine();
        count++;
        if (count == numPages) {
          break;
        } else if ((count % 1000) == 0) {
          log.info("Wrote {} of {} pages to {}", count, numPages, path);
        }
      }
    } finally {
      // Release the archive connection whether the loop finished, broke out
      // early, or threw.
      ar.close();
    }
  }
  // Report what was actually written: the archive may hold fewer usable
  // pages than were requested.
  log.info("Wrote {} pages to {}", count, path);
}
 
开发者ID:astralway,项目名称:webindex,代码行数:27,代码来源:SampleData.java

示例3: readBz2

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads a bz2-compressed WARC file: decompresses it into a temporary
 * {@code .warc} file, prints the header fields of every record to stdout
 * and finally prints the number of records. The temporary file is deleted
 * whether or not reading succeeds.
 *
 * @param file path of the bz2-compressed WARC file
 * @throws IOException if decompression, reading, or temp-file cleanup fails
 */
public static void readBz2(String file)
        throws IOException
{
    // decompress bz2 file to tmp file
    File tmpFile = File.createTempFile("tmp", ".warc");
    int counter = 0;
    try {
        // Each stream is its own resource so that all of them are closed
        // even if wrapping or copying fails.
        try (FileInputStream rawInput = new FileInputStream(file);
             BZip2CompressorInputStream inputStream = new BZip2CompressorInputStream(rawInput);
             FileOutputStream outputStream = new FileOutputStream(tmpFile)) {
            IOUtils.copy(inputStream, outputStream);
        }

        WARCReader reader = WARCReaderFactory.get(tmpFile);
        try {
            for (ArchiveRecord record : reader) {
                System.out.println(record.getHeader().getHeaderFields());

                counter++;
            }
        } finally {
            // The reader must be closed before the temp file is deleted;
            // an open handle prevents deletion on some platforms.
            reader.close();
        }
    } finally {
        FileUtils.forceDelete(tmpFile);
    }

    System.out.println(counter);
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:30,代码来源:WARCReaderTest.java

示例4: testARCReaderClose

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Copies the {@code fyensdk.warc} fixture to a working file, opens it with a
 * WARC reader, wraps the record fetched via {@code get(0)} in a
 * {@code BitarchiveRecord}, then closes the record and the reader and
 * deletes the working file. The test fails if any step throws an
 * {@link IOException}.
 */
public void testARCReaderClose() {
    final File workingCopy = new File(ARCHIVE_DIR + testFileName);
    try {
        FileUtils.copyFile(new File(ARCHIVE_DIR + "fyensdk.warc"), workingCopy);

        WARCReader warcReader = WARCReaderFactory.get(workingCopy);
        WARCRecord firstRecord = (WARCRecord) warcReader.get(0);
        // Only the construction matters here; the resulting object is not used.
        new BitarchiveRecord(firstRecord, testFileName);

        firstRecord.close();
        warcReader.close();
        workingCopy.delete();
    } catch (IOException e) {
        fail("Should not throw IOException " + e);
    }
}
 
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:19,代码来源:WARCReaderTester.java

示例5: main

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
	 * Streams a gzip-compressed WARC file from a remote URL and prints the
	 * header, the URL and the first 500 characters of the first few records.
	 *
	 * @param args unused
	 * @throws IOException if the URL cannot be opened or a record cannot be read
	 */
	public static void main(String[] args) throws IOException {
		// Set up a remote compressed WARC file for reading
		String url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-23/segments/1404776400583.60/warc/CC-MAIN-20140707234000-00000-ip-10-180-212-248.ec2.internal.warc.gz";
		String fn = url;
		// fn is an https URL, not a filesystem path, so it has to be opened as a
		// URL stream; FileInputStream would fail with FileNotFoundException.
		// Fully qualified names are used so the snippet needs no extra imports.
		try (java.io.InputStream is = new java.net.URL(fn).openStream()) {
			// The file name identifies the ArchiveReader and indicates if it should be decompressed
			ArchiveReader ar = WARCReaderFactory.get(fn, is, true);

			// Once we have an ArchiveReader, we can work through each of the records it contains
			int i = 0;
			for(ArchiveRecord r : ar) {
				// The header file contains information such as the type of record, size, creation time, and URL
				System.out.println(r.getHeader());
				System.out.println(r.getHeader().getUrl());
				System.out.println();

				// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
				// Create a byte array that is as long as the record's stated length
				byte[] rawData = IOUtils.toByteArray(r, r.available());

				// Why don't we convert it to a string and print the start of it? Let's hope it's text!
				String content = new String(rawData);
				System.out.println(content.substring(0, Math.min(500, content.length())));
				System.out.println((content.length() > 500 ? "..." : ""));

				// Pretty printing to make the output more readable
				System.out.println("=-=-=-=-=-=-=-=-=");
				// Stop after the sixth record (i runs 0..5 before the test succeeds)
				if (i++ > 4) break;
			}
		}
	}
 
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:36,代码来源:WARCReaderTest.java

示例6: initialize

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens the file referenced by the given split and creates a WARC reader on
 * top of the resulting stream. The open stream, the file name and the
 * reader are kept in the {@code fsin}, {@code arPath} and {@code ar} fields.
 *
 * @param inputSplit the split to read; must be a {@code FileSplit}
 * @param context    task context supplying the Hadoop configuration
 * @throws IOException          if the file cannot be opened or the reader cannot be created
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
		throws IOException, InterruptedException {
	final FileSplit fileSplit = (FileSplit) inputSplit;
	final Configuration hadoopConf = context.getConfiguration();
	final Path warcPath = fileSplit.getPath();

	// Open the stream first; the name is only recorded once the open succeeded.
	fsin = warcPath.getFileSystem(hadoopConf).open(warcPath);
	arPath = warcPath.getName();
	ar = WARCReaderFactory.get(warcPath.getName(), fsin, true);
}
 
开发者ID:TeamHG-Memex,项目名称:common-crawl-mapreduce,代码行数:12,代码来源:WARCFileRecordReader.java

示例7: initialize

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Prepares this record reader for one input split: resolves the split's
 * path, opens it through the path's file system and wraps the stream in a
 * WARC reader. Results are stored in {@code fsin}, {@code arPath} and {@code ar}.
 *
 * @param inputSplit split describing the file to read; cast to {@code FileSplit}
 * @param context    source of the job configuration
 * @throws IOException          if opening the file or creating the reader fails
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException,
    InterruptedException {
  FileSplit warcSplit = (FileSplit) inputSplit;
  Configuration jobConf = context.getConfiguration();
  Path file = warcSplit.getPath();
  FileSystem fileSystem = file.getFileSystem(jobConf);

  fsin = fileSystem.open(file);
  arPath = file.getName();
  // The file name is passed as the reader's identifier together with the open stream.
  ar = WARCReaderFactory.get(file.getName(), fsin, true);
}
 
开发者ID:astralway,项目名称:webindex,代码行数:12,代码来源:WARCFileRecordReader.java

示例8: readPages

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Builds a page for every record of the given WARC file and returns the
 * pages keyed by URL. Pages that are empty or have no outbound links are
 * left out.
 *
 * @param input WARC file to read
 * @return map from page URL to page; a later record with the same URL replaces an earlier one
 * @throws Exception if the archive cannot be opened or a record cannot be converted
 */
public static Map<URL, Page> readPages(File input) throws Exception {
  Map<URL, Page> pageMap = new HashMap<>();
  ArchiveReader ar = WARCReaderFactory.get(input);
  try {
    for (ArchiveRecord r : ar) {
      Page p = ArchiveUtil.buildPage(r);
      if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
        continue;
      }
      pageMap.put(URL.fromUri(p.getUri()), p);
    }
  } finally {
    // Close in finally so the file handle is released even when a record
    // cannot be converted.
    ar.close();
  }
  return pageMap;
}
 
开发者ID:astralway,项目名称:webindex,代码行数:14,代码来源:IndexIT.java

示例9: read

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads a WARC file, prints the header fields of every record to stdout
 * and finally prints the number of records read.
 *
 * @param file path of the WARC file (the original example uses a gzipped one)
 * @throws IOException if the file cannot be opened or read
 */
public static void read(String file)
        throws IOException
{
    WARCReader reader = WARCReaderFactory.get(new File(file));

    int counter = 0;
    try {
        for (ArchiveRecord record : reader) {
            System.out.println(record.getHeader().getHeaderFields());

            counter++;
        }
    } finally {
        // Release the underlying file handle even if iteration fails.
        reader.close();
    }

    System.out.println(counter);
}
 
开发者ID:habernal,项目名称:nutch-content-exporter,代码行数:21,代码来源:WARCReaderTest.java

示例10: openFile

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens a WARC reader on the file at the given path. Closing the returned
 * reader is left to the caller.
 *
 * @param filePath location of the WARC file
 * @return a reader positioned on the file
 * @throws IOException if the reader cannot be created
 */
private WARCReader openFile(Path filePath) throws IOException {
    final java.io.File warcFile = filePath.toFile();
    return WARCReaderFactory.get(warcFile);
}
 
开发者ID:ViDA-NYU,项目名称:ache,代码行数:4,代码来源:WarcTargetRepository.java

示例11: open

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Opens an archive reader for the given path. Files ending in
 * {@code .warc.gz.open} go straight to the WARC factory; everything else is
 * delegated to the generic archive factory.
 *
 * @param path location of the archive file
 * @return a reader for the file
 * @throws IOException if the reader cannot be created
 */
public static ArchiveReader open(Path path) throws IOException {
    final boolean isOpenWarc = path.toString().endsWith(".warc.gz.open");
    if (!isOpenWarc) {
        return ArchiveReaderFactory.get(path.toFile());
    }
    // ArchiveReaderFactory.get doesn't understand the .open extension,
    // so such files are handed to the WARC factory directly.
    return WARCReaderFactory.get(path.toFile());
}
 
开发者ID:nla,项目名称:bamboo,代码行数:11,代码来源:WarcUtils.java

示例12: testWarcCopy

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Writes a single hand-built WARC metadata record to a file, copies that
 * file into a new WARC through {@code WARCUtils.insertWARCFile}, and checks
 * that the record read back from the copy carries the original header
 * values. The test fails if any step throws an {@link IOException}.
 */
public void testWarcCopy() {
    try {
        byte[] warcBytes = (
                "WARC/1.0\r\n"
                + "WARC-Type: metadata\r\n"
                + "WARC-Target-URI: metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86\r\n"
                + "WARC-Date: 2012-08-24T11:42:55Z\r\n"
                + "WARC-Record-ID: <urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>\r\n"
                + "WARC-Payload-Digest: sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U\r\n"
                + "WARC-IP-Address: 207.241.229.39\r\n"
                + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb30>\r\n"
                + "WARC-Concurrent-To: <urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>\r\n"
                + "Content-Type: text/plain\r\n"
                + "Content-Length: 2\r\n"
                + "\r\n"
                + "85"
                + "\r\n"
                + "\r\n").getBytes();
        File orgFile = new File(TestInfo.WORKING_DIR, "original4copy.warc");
        FileUtils.writeBinaryFile(orgFile, warcBytes);

        File copiedFile = new File(TestInfo.WORKING_DIR, "copied.warc");
        WARCWriter writer = WARCUtils.createWARCWriter(copiedFile);
        WARCUtils.insertWARCFile(orgFile, writer);
        writer.close();

        WARCReader reader = WARCReaderFactory.get(copiedFile);
        Assert.assertNotNull(reader);
        try {
            ArchiveRecord record = reader.get();
            Assert.assertNotNull(record);
            ArchiveRecordHeader header = record.getHeader();
            Assert.assertNotNull(header);

            Assert.assertEquals("metadata", header.getHeaderValue("WARC-Type"));
            Assert.assertEquals("metadata://netarkivet.dk/crawl/setup/duplicatereductionjobs?majorversion=1&minorversion=0&harvestid=1&harvestnum=59&jobid=86", header.getHeaderValue("WARC-Target-URI"));
            Assert.assertEquals("2012-08-24T11:42:55Z", header.getHeaderValue("WARC-Date"));
            Assert.assertEquals("<urn:uuid:c93099e5-2304-487e-9ff2-41e3c01c2b51>", header.getHeaderValue("WARC-Record-ID"));
            Assert.assertEquals("sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", header.getHeaderValue("WARC-Payload-Digest"));
            Assert.assertEquals("207.241.229.39", header.getHeaderValue("WARC-IP-Address"));
            // WARC-Concurrent-To appears twice in the input; the expected value is the second one.
            Assert.assertEquals("<urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9d3df99afb31>", header.getHeaderValue("WARC-Concurrent-To"));
            Assert.assertEquals("text/plain", header.getHeaderValue("Content-Type"));
            Assert.assertEquals("2", header.getHeaderValue("Content-Length"));
        } finally {
            // Close the reader even when an assertion fails, so the copied
            // file is not left locked for later cleanup.
            reader.close();
        }
    }
    catch (IOException e) {
        e.printStackTrace();
        Assert.fail("Unexpected exception!");
    }

}
 
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:53,代码来源:WARCUtilsTester.java

示例13: main

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Fetches one gzip-compressed WARC file from the public Common Crawl S3
 * bucket and prints the header, the URL and the first 500 characters of the
 * first few records.
 *
 * @param args unused
 * @throws IOException        if a record cannot be read
 * @throws S3ServiceException if the object cannot be retrieved from S3
 */
public static void main(String[] args) throws IOException, S3ServiceException {
	// We're accessing a publicly available bucket so don't need to fill in our credentials
	S3Service s3s = new RestS3Service(null);
	
	// Let's grab a file out of the CommonCrawl S3 bucket
	String fn = "common-crawl/crawl-data/CC-MAIN-2013-48/segments/1386163035819/warc/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	S3Object f = s3s.getObject("aws-publicdatasets", fn, null, null, null, null, null, null);
	
	// The file name identifies the ArchiveReader and indicates if it should be decompressed
	ArchiveReader ar = WARCReaderFactory.get(fn, f.getDataInputStream(), true);
	try {
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println("Header: " + r.getHeader());
			System.out.println("URL: " + r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as all the record's stated length
			byte[] rawData = new byte[r.available()];
			// A single read() may return fewer bytes than requested, so keep
			// reading until the buffer is full or the record ends.
			int filled = 0;
			while (filled < rawData.length) {
				int n = r.read(rawData, filled, rawData.length - filled);
				if (n < 0) {
					break;
				}
				filled += n;
			}
			// Note: potential optimization would be to have a large buffer only allocated once
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			// Only the bytes actually read are decoded, so no trailing zero bytes leak into the text.
			String content = new String(rawData, 0, filled);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Stop after the sixth record (i runs 0..5 before the test succeeds)
			if (i++ > 4) break; 
		}
	} finally {
		// Release the reader (and with it the S3 data stream) on every exit path.
		ar.close();
	}
}
 
开发者ID:Smerity,项目名称:cc-warc-examples,代码行数:36,代码来源:S3ReaderTest.java

示例14: main

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Reads a local gzip-compressed WARC file and prints the header, the URL
 * and the first 500 characters of the first few records.
 *
 * @param args unused
 * @throws IOException if the file cannot be opened or a record cannot be read
 */
public static void main(String[] args) throws IOException {
	// Set up a local compressed WARC file for reading 
	String fn = "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz";
	// try-with-resources closes the file even when reading a record fails
	try (FileInputStream is = new FileInputStream(fn)) {
		// The file name identifies the ArchiveReader and indicates if it should be decompressed
		ArchiveReader ar = WARCReaderFactory.get(fn, is, true);
		
		// Once we have an ArchiveReader, we can work through each of the records it contains
		int i = 0;
		for(ArchiveRecord r : ar) {
			// The header file contains information such as the type of record, size, creation time, and URL
			System.out.println(r.getHeader());
			System.out.println(r.getHeader().getUrl());
			System.out.println();
			
			// If we want to read the contents of the record, we can use the ArchiveRecord as an InputStream
			// Create a byte array that is as long as the record's stated length
			byte[] rawData = IOUtils.toByteArray(r, r.available());
			
			// Why don't we convert it to a string and print the start of it? Let's hope it's text!
			String content = new String(rawData);
			System.out.println(content.substring(0, Math.min(500, content.length())));
			System.out.println((content.length() > 500 ? "..." : ""));
			
			// Pretty printing to make the output more readable 
			System.out.println("=-=-=-=-=-=-=-=-=");
			// Stop after the sixth record (i runs 0..5 before the test succeeds)
			if (i++ > 4) break; 
		}
	}
}
 
开发者ID:Smerity,项目名称:cc-warc-examples,代码行数:34,代码来源:WARCReaderTest.java

示例15: getArchiveReader

import org.archive.io.warc.WARCReaderFactory; //导入方法依赖的package包/类
/**
 * Picks the reader factory that matches the file's suffix and opens the
 * file at the given offset.
 *
 * @param f      archive file to open
 * @param offset offset passed through to the selected factory
 * @return a reader created by the ARC or the WARC factory
 * @throws IOException if the file name has neither an ARC nor a WARC suffix,
 *                     or if the selected factory fails to open the file
 */
protected ArchiveReader getArchiveReader(final File f,
	final long offset)
throws IOException {
	final String fileName = f.getName();
	// ARC is checked first, then WARC.
	if (ARCReaderFactory.isARCSuffix(fileName)) {
		return ARCReaderFactory.get(f, true, offset);
	}
	if (WARCReaderFactory.isWARCSuffix(fileName)) {
		return WARCReaderFactory.get(f, offset);
	}
	throw new IOException("Unknown file extension (Not ARC nor WARC): "
		+ fileName);
}
 
开发者ID:iipc,项目名称:webarchive-commons,代码行数:12,代码来源:ArchiveReaderFactory.java


注:本文中的org.archive.io.warc.WARCReaderFactory.get方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。