Java ArchiveReader类代码示例

本文整理汇总了Java中org.archive.io.ArchiveReader类的典型用法代码示例。如果您正苦于以下问题：Java ArchiveReader类的具体用法？Java ArchiveReader怎么用？Java ArchiveReader使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。

ArchiveReader类属于org.archive.io包，在下文中一共展示了ArchiveReader类的15个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: processWarc

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Reads one WARC file and passes every record in it to {@code handleRecord}.
 *
 * <p>The file name is registered with {@code extractorStats}, every record is
 * counted as visited, and the running statistics are printed to
 * {@code System.err} for the first record and for every 1000th record after it.
 * Strict parsing is disabled on the reader and on each record.
 *
 * @param warcFile path of the WARC file to process
 * @throws IOException propagated from opening the file, creating the reader,
 *         handling a record, or closing the resources
 */
private void processWarc(Path warcFile) throws IOException {
    extractorStats.addWarc(warcFile.getFileName().toString());

    // Both the raw stream and the reader are closed when the block exits,
    // including when a record handler throws; previously neither was closed.
    try (InputStream is = Files.newInputStream(warcFile);
            ArchiveReader reader = WARCReaderFactory.get(warcFile.toString(), is, true)) {
        reader.setStrict(false);

        int i = 0;
        for (ArchiveRecord record : reader) {
            record.setStrict(false);
            extractorStats.visitedRecord();
            handleRecord(record);
            // Progress report: fires for record 0, 1000, 2000, ...
            if (i++ % 1000 == 0) {
                System.err.println(extractorStats);
            }
        }
    }
}

开发者ID:tballison，项目名称:SimpleCommonCrawlExtractor，代码行数:17，代码来源:AbstractExtractor.java

示例2: generate

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Writes pages read from the WARC at {@code sourceURL} to {@code path},
 * one JSON document per line.
 *
 * <p>Pages that are empty or have no outbound links are skipped. Writing stops
 * once {@code numPages} pages have been written or the archive is exhausted,
 * whichever comes first.
 *
 * @param path output file to (over)write
 * @param numPages number of pages at which writing stops
 * @throws Exception propagated from opening the archive, building a page, or writing
 */
public static void generate(Path path, int numPages) throws Exception {

  Gson gson = new Gson();
  long count = 0;
  // The reader is now closed together with the writer; it used to be left open.
  try (BufferedWriter writer = Files.newBufferedWriter(path);
      ArchiveReader ar = WARCReaderFactory.get(new URL(sourceURL), 0)) {
    for (ArchiveRecord r : ar) {
      Page p = ArchiveUtil.buildPage(r);
      if (p.isEmpty() || p.getOutboundLinks().isEmpty()) {
        log.debug("Skipping {}", p.getUrl());
        continue;
      }
      log.debug("Found {} {}", p.getUrl(), p.getNumOutbound());
      String json = gson.toJson(p);
      writer.write(json);
      writer.newLine();
      count++;
      if (count == numPages) {
        break;
      } else if ((count % 1000) == 0) {
        log.info("Wrote {} of {} pages to {}", count, numPages, path);
      }
    }
  }
  // Report what was actually written: the archive may hold fewer usable pages
  // than requested, in which case numPages would overstate the result.
  log.info("Wrote {} pages to {}", count, path);
}

开发者ID:astralway，项目名称:webindex，代码行数:27，代码来源:SampleData.java

示例3: test

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Parses the bundled {@code example.warc.gz} into CDX records and checks the
 * record count plus every field of the first capture.
 */
@Test
public void test() throws IOException {
    final URL warcUrl = getClass().getResource("example.warc.gz");

    final List<Cdx.CdxRecord> parsed;
    try (ArchiveReader reader = ArchiveReaderFactory.get(warcUrl)) {
        // The resource's content length is passed to Cdx.records as its third argument.
        final int warcLength = warcUrl.openConnection().getContentLength();
        parsed = Cdx.records(reader, "example.warc.gz", warcLength)
                .collect(Collectors.toList());
    }

    assertEquals(2, parsed.size());

    final Cdx.Capture first = (Cdx.Capture) parsed.get(0);
    assertEquals("text/html", first.contentType);
    assertEquals(200, first.status);
    assertEquals("20161116220655", first.date);
    assertEquals("http://www-test.nla.gov.au/xinq/presentations/abstract.html", first.url);
    assertEquals(2756, first.compressedLength);
    assertEquals(339, first.offset);
    assertEquals("387f5ef1511fe47bf91ca9fdcf6c41511fc3e480", first.digest);
}

开发者ID:nla，项目名称:bamboo，代码行数:20，代码来源:CdxTest.java

示例4: test2

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Parses the bundled {@code notfound.warc.gz} into CDX records and checks the
 * record count plus every field of the first capture, including its redirect
 * location.
 */
@Test
public void test2() throws IOException {
    final URL warcUrl = getClass().getResource("notfound.warc.gz");

    final List<Cdx.CdxRecord> parsed;
    try (ArchiveReader reader = ArchiveReaderFactory.get(warcUrl)) {
        // The resource's content length is passed to Cdx.records as its third argument.
        final int warcLength = warcUrl.openConnection().getContentLength();
        parsed = Cdx.records(reader, "notfound.warc.gz", warcLength)
                .collect(Collectors.toList());
    }

    assertEquals(3, parsed.size());

    final Cdx.Capture first = (Cdx.Capture) parsed.get(0);
    assertEquals("text/html", first.contentType);
    assertEquals(302, first.status);
    assertEquals("20161128015313", first.date);
    assertEquals("http://nla.gov.au/foobar", first.url);
    assertEquals(665, first.compressedLength);
    assertEquals(830, first.offset);
    assertEquals("WNT4SKWUNA5F4Q3HYKF5AMY2M5ZIPBYW", first.digest);
    assertEquals("http://www.nla.gov.au/foobar", first.location);
}

开发者ID:nla，项目名称:bamboo，代码行数:21，代码来源:CdxTest.java

示例5: findFirstRecordWithUri

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Scans an ARC file for the first record, after the leading header record,
 * whose URL equals {@code uri}.
 *
 * @param f ARC file to scan
 * @param uri URL to look for
 * @return the first matching record, or {@code null} if no record matches
 * @throws IOException if the reader cannot be created for {@code f}
 */
private ArchiveRecord findFirstRecordWithUri(File f, String uri) 
throws IOException {
    final Iterator<ArchiveRecord> records = ARCReaderFactory.get(f).iterator();

    // Step over the ARC file header. Per the original author's note,
    // ARCReaderFactory either yields a leading filedesc record or throws,
    // so this call is expected to succeed.
    records.next();

    while (records.hasNext()) {
        final ArchiveRecord candidate = records.next();
        final String candidateUrl = candidate.getHeader().getUrl();
        if (candidateUrl.equals(uri)) {
            return candidate;
        }
    }
    return null;
}

开发者ID:netarchivesuite，项目名称:netarchivesuite-svngit-migration，代码行数:20，代码来源:ARCUtilsTester.java

示例6: get

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Fetches the record stored at a given offset of an archive file in {@code arcDir}.
 *
 * @param arcfile name of the archive file, resolved against {@code arcDir}
 * @param index offset of the record inside the file; must not be negative
 * @return the record wrapped as a {@code BitarchiveRecord}
 * @throws ArgumentNotValid if {@code arcfile} is null or {@code index} is negative
 * @throws IOFailure if reading the file fails with an {@code IOException}
 */
@Override
public BitarchiveRecord get(String arcfile, long index)
        throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(arcfile, "arcfile");
    ArgumentNotValid.checkNotNegative(index, "index");

    final File archive = new File(arcDir, arcfile);
    try {
        // NOTE(review): the reader is not closed here. Whether it can be closed
        // before returning depends on whether BitarchiveRecord copies the record
        // content in its constructor - confirm before changing.
        final ArchiveRecord recordAtOffset = ArchiveReaderFactory.get(archive).get(index);
        return new BitarchiveRecord(recordAtOffset, arcfile);
    } catch (IOException ioe) {
        throw new IOFailure("Error reading record from " + arcfile + " offset " + index, ioe);
    }
}

开发者ID:netarchivesuite，项目名称:netarchivesuite-svngit-migration，代码行数:17，代码来源:TestArcRepositoryClient.java

示例7: map

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits the URL of every JSON record in the archive as an output key with a
 * {@code NullWritable} value.
 *
 * <p>Records whose mimetype is not {@code application/json}, or whose URL is
 * blank, are skipped. A failure while writing is logged and counted under
 * {@code MAPPERCOUNTER.EXCEPTIONS} rather than aborting the task.
 *
 * @param key input key (not used)
 * @param value archive whose records are scanned
 * @param context Hadoop context receiving the output and counters
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(Text key, ArchiveReader value, Context context)
		throws IOException {
	for (ArchiveRecord r : value) {
		// Skip any records that are not JSON. Constant-first comparison so a
		// record without a mimetype is skipped instead of throwing an uncaught
		// NullPointerException (this check sits outside the try/catch below).
		if (!"application/json".equals(r.getHeader().getMimetype())) {
			continue;
		}
		String sourceURL = r.getHeader().getUrl();
		if (StringUtils.isBlank(sourceURL)) {
			continue;
		}
		outKey.set(sourceURL);
		try {
			context.write(outKey, NullWritable.get());
		} catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		} finally {
			IOUtils.closeQuietly(r);
		}
	}
}

开发者ID:DigitalPebble，项目名称:NutchFight，代码行数:23，代码来源:URLExtractor.java

示例8: initialize

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Opens the archive file behind the given split and prepares record iteration.
 *
 * <p>On success this sets {@code fileLength}, {@code recordIterator},
 * {@code currentKey} and {@code currentArcRecord}. An {@code IOException} is
 * logged at SEVERE level and not rethrown (the original behaviour is kept), so
 * after a failure {@code recordIterator} is left unassigned by this method.
 *
 * @param is input split; cast to {@code FileSplit}
 * @param tac task context supplying the Hadoop configuration
 */
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) is;
    Path path = fileSplit.getPath();

    FSDataInputStream fileInputStream = null;
    try {
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

        fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();

        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        // Say which file failed; the log call previously carried a null message.
        Logger.getLogger(ArcRecordReader.class.getName())
                .log(Level.SEVERE, "Could not open archive " + path, ex);
        // Nothing will consume the stream after a failure, so release it here
        // instead of leaking the open file handle.
        if (fileInputStream != null) {
            try {
                fileInputStream.close();
            } catch (IOException ignored) {
                // Best effort: the primary failure has already been logged.
            }
        }
    }
}

开发者ID:shsdev，项目名称:archiventory，代码行数:26，代码来源:ArcRecordReader.java

示例9: getArchiveReader

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Creates a reader for an ARC file, choosing the compressed or uncompressed
 * implementation based on {@code testCompressedARCFile}.
 *
 * @param arcFile ARC file to open
 * @param skipSuffixTest forwarded to {@code testCompressedARCFile}
 * @param offset forwarded to the reader constructor
 * @return a compressed or uncompressed ARC reader
 * @throws IOException if the file is not compressed and fails the
 *         extension-and-magic check, or if a called helper throws
 */
protected ArchiveReader getArchiveReader(final File arcFile,
        final boolean skipSuffixTest, final long offset)
throws IOException {
    if (testCompressedARCFile(arcFile, skipSuffixTest)) {
        return (ARCReader) ARCReaderFactory.factory.
            new CompressedARCReader(arcFile, offset);
    }

    // Only uncompressed files are validated by extension and magic number.
    final boolean looksLikeArc = FileUtils.isReadableWithExtensionAndMagic(
            arcFile, ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER);
    if (!looksLikeArc) {
        throw new IOException(arcFile.getAbsolutePath() +
            " is not an Internet Archive ARC file.");
    }
    return (ARCReader) ARCReaderFactory.factory.
        new UncompressedARCReader(arcFile, offset);
}

开发者ID:iipc，项目名称:webarchive-commons，代码行数:18，代码来源:ARCReaderFactory.java

示例10: offsetResourceTest

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Opens an ARC file through an input stream already positioned at
 * {@code offset} and checks the record read there.
 *
 * <p>Asserts that the record's URL equals {@code uri} and that the record's
 * reported position does not exceed its header length.
 *
 * @param testfile ARC file under test
 * @param offset byte offset to seek to before reading
 * @param uri URL expected in the record header
 * @throws Exception on any I/O failure
 */
private void offsetResourceTest( File testfile, long offset, String uri ) throws Exception {
    // try-with-resources releases the file even when an assertion below fails;
    // the previous explicit close only ran on the success path.
    try (RandomAccessFile raf = new RandomAccessFile(testfile, "r")) {
        raf.seek(offset);
        InputStream is = new FileInputStream(raf.getFD());
        String fPath = testfile.getAbsolutePath();
        ArchiveReader reader = ARCReaderFactory.get(fPath, is, false);
        // Original author's note: ARCReaderFactory.get(testfile, offset) "works".
        ArchiveRecord record = reader.get();

        final String url = record.getHeader().getUrl();
        assertEquals("URL of record is not as expected.", uri, url);

        final long position = record.getPosition();
        final long recordLength = record.getHeader().getLength();
        assertTrue("Position " + position + " is after end of record " + recordLength, position <= recordLength);
    }
}

开发者ID:iipc，项目名称:webarchive-commons，代码行数:22，代码来源:ARCReaderFactoryTest.java

示例11: main

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
	 * Prints the header, URL and first 500 characters of the first few records
	 * of a compressed WARC file.
	 *
	 * @param args not used
	 * @throws IOException if the archive cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		// Location of the compressed WARC to read. A local path such as
		// "data/CC-MAIN-20131204131715-00000-ip-10-33-133-15.ec2.internal.warc.gz"
		// can be assigned to fn instead of the remote URL.
		String url = "https://aws-publicdatasets.s3.amazonaws.com/common-crawl/crawl-data/CC-MAIN-2014-23/segments/1404776400583.60/warc/CC-MAIN-20140707234000-00000-ip-10-180-212-248.ec2.internal.warc.gz";
		String fn = url;

		// FileInputStream cannot open an http(s) URL, which made the original
		// fail immediately; remote locations are opened through java.net.URL.
		boolean remote = fn.startsWith("http://") || fn.startsWith("https://");

		// The file name identifies the ArchiveReader and indicates if it should be decompressed.
		// Both the stream and the reader are closed when the block exits.
		try (java.io.InputStream is = remote ? new java.net.URL(fn).openStream() : new FileInputStream(fn);
				ArchiveReader ar = WARCReaderFactory.get(fn, is, true)) {

			// Once we have an ArchiveReader, we can work through each of the records it contains
			int i = 0;
			for (ArchiveRecord r : ar) {
				// The header contains information such as the type of record, size, creation time, and URL
				System.out.println(r.getHeader());
				System.out.println(r.getHeader().getUrl());
				System.out.println();

				// The ArchiveRecord is itself an InputStream; read as many bytes as it reports available
				byte[] rawData = IOUtils.toByteArray(r, r.available());

				// Decode as text (platform default charset) and print only the beginning
				String content = new String(rawData);
				System.out.println(content.substring(0, Math.min(500, content.length())));
				System.out.println((content.length() > 500 ? "..." : ""));

				// Separator to make the output more readable
				System.out.println("=-=-=-=-=-=-=-=-=");
				if (i++ > 4) break;
			}
		}
	}

开发者ID:TeamHG-Memex，项目名称:common-crawl-mapreduce，代码行数:36，代码来源:WARCReaderTest.java

示例12: map

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits one (lower-cased tag name, {@code outVal}) pair for every match of
 * {@code HTML_TAG_PATTERN} in the body of each HTML response in the archive.
 *
 * <p>Only records with mimetype {@code application/http; msgtype=response}
 * whose HTTP header text contains {@code Content-Type: text/html} are
 * processed. Any exception raised for a record is logged and counted under
 * {@code MAPPERCOUNTER.EXCEPTIONS}; iteration then continues with the next record.
 *
 * @param key input key (not used)
 * @param value archive whose records are scanned
 * @param context Hadoop context receiving the output and counters
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	// Built at the start of each map() call and reused for every record below
	patternTag = Pattern.compile(HTML_TAG_PATTERN);

	for (ArchiveRecord r : value) {
		try {
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());

			// Requests and metadata are of no interest; only HTTP responses are
			final String mimetype = r.getHeader().getMimetype();
			if (!mimetype.equals("application/http; msgtype=response")) {
				continue;
			}

			// Read the full message and split it at the blank line ending the HTTP header.
			// A payload without that blank line makes substring throw, which is
			// handled by the catch below like any other failure.
			final String content = new String(IOUtils.toByteArray(r, r.available()));
			final int headerEnd = content.indexOf("\r\n\r\n");
			final String headerText = content.substring(0, headerEnd);

			// Lax content-type detection on the raw header text
			// TODO: Proper HTTP header parsing + don't trust headers
			if (!headerText.contains("Content-Type: text/html")) {
				continue;
			}
			context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);

			// Everything after the blank line is the response body
			final String body = content.substring(headerEnd + 4);
			matcherTag = patternTag.matcher(body);
			while (matcherTag.find()) {
				outKey.set(matcherTag.group(1).toLowerCase());
				context.write(outKey, outVal);
			}
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}

开发者ID:TeamHG-Memex，项目名称:common-crawl-mapreduce，代码行数:40，代码来源:TagCounterMap.java

示例13: map

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Emits one (token, {@code outVal}) pair for every whitespace-separated token
 * in each {@code text/plain} record of the archive.
 *
 * <p>Counters: {@code RECORDS_IN} for each plain-text record,
 * {@code EMPTY_PAGE_TEXT} when such a record has no tokens,
 * {@code NON_PLAIN_TEXT} for every other record, and {@code EXCEPTIONS} for
 * any record whose processing throws (the exception is logged and iteration
 * continues).
 *
 * @param key input key (not used)
 * @param value archive whose records are scanned
 * @param context Hadoop context receiving the output and counters
 * @throws IOException declared by the mapper contract
 */
@Override
public void map(Text key, ArchiveReader value, Context context) throws IOException {
	for (ArchiveRecord r : value) {
		try {
			final String mimetype = r.getHeader().getMimetype();
			if (!mimetype.equals("text/plain")) {
				context.getCounter(MAPPERCOUNTER.NON_PLAIN_TEXT).increment(1);
				continue;
			}

			context.getCounter(MAPPERCOUNTER.RECORDS_IN).increment(1);
			LOG.debug(r.getHeader().getUrl() + " -- " + r.available());

			// Read the full record and decode it with the platform default charset
			final String content = new String(IOUtils.toByteArray(r, r.available()));

			// Split the document into words
			tokenizer = new StringTokenizer(content);
			if (!tokenizer.hasMoreTokens()) {
				context.getCounter(MAPPERCOUNTER.EMPTY_PAGE_TEXT).increment(1);
				continue;
			}
			do {
				outKey.set(tokenizer.nextToken());
				context.write(outKey, outVal);
			} while (tokenizer.hasMoreTokens());
		}
		catch (Exception ex) {
			LOG.error("Caught Exception", ex);
			context.getCounter(MAPPERCOUNTER.EXCEPTIONS).increment(1);
		}
	}
}

开发者ID:TeamHG-Memex，项目名称:common-crawl-mapreduce，代码行数:31，代码来源:WordCounterMap.java

示例14: main

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Entry point: builds the index RDDs from the archives under a data directory,
 * asks {@code IndexUtil.calculateSplits} for splits, and prints each one to
 * standard output.
 *
 * <p>Exits with status 1 after logging a usage message unless exactly one
 * argument is given.
 *
 * @param args a single element, the data directory
 */
public static void main(String[] args) {
  if (args.length != 1) {
    log.error("Usage: CalcSplits <dataDir>");
    System.exit(1);
  }
  final String dataDir = args[0];
  IndexEnv.validateDataDir(dataDir);

  final SparkConf conf = new SparkConf().setAppName("webindex-calcsplits");
  try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    final IndexStats indexStats = new IndexStats(sc);

    // Input: (Text, ArchiveReader) pairs read through WARCFileInputFormat
    final JavaPairRDD<Text, ArchiveReader> warcInputs = sc.newAPIHadoopFile(
        dataDir, WARCFileInputFormat.class, Text.class, ArchiveReader.class, new Configuration());

    // Derived data sets, each built from the previous ones
    final JavaRDD<Page> pageRdd = IndexUtil.createPages(warcInputs);
    final JavaPairRDD<String, UriInfo> uris = IndexUtil.createUriMap(pageRdd);
    final JavaPairRDD<String, Long> domains = IndexUtil.createDomainMap(uris);
    final JavaPairRDD<RowColumn, Bytes> index =
        IndexUtil.createAccumuloIndex(indexStats, pageRdd, uris, domains);

    final SortedSet<Text> splitPoints = IndexUtil.calculateSplits(index, 100);
    log.info("Accumulo splits:");
    for (Text splitPoint : splitPoints) {
      System.out.println(splitPoint);
    }
  }
}

开发者ID:astralway，项目名称:webindex，代码行数:29，代码来源:CalcSplits.java

示例15: main

import org.archive.io.ArchiveReader; //导入依赖的package包/类
/**
 * Entry point: parses every record of a range of remote WARC files through
 * {@code ArchiveUtil.buildPageIgnoreErrors}, distributing the files across
 * Spark partitions.
 *
 * <p>Exits with status 1 after logging an error unless exactly two arguments
 * are given and the selected range is non-empty. A failure while processing
 * one file is logged and does not stop the remaining files.
 *
 * @param args the paths file followed by the range to select from it
 * @throws Exception propagated from configuration loading or Spark setup
 */
public static void main(String[] args) throws Exception {

  if (args.length != 2) {
    log.error("Usage: TestParser <pathsFile> <range>");
    System.exit(1);
  }
  final List<String> loadList = IndexEnv.getPathsRange(args[0], args[1]);
  if (loadList.isEmpty()) {
    log.error("No files to load given {} {}", args[0], args[1]);
    System.exit(1);
  }

  WebIndexConfig.load();

  SparkConf sparkConf = new SparkConf().setAppName("webindex-test-parser");
  try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

    log.info("Parsing {} files (Range {} of paths file {}) from AWS", loadList.size(), args[1],
        args[0]);

    // One partition per file
    JavaRDD<String> loadRDD = ctx.parallelize(loadList, loadList.size());

    final String prefix = WebIndexConfig.CC_URL_PREFIX;

    loadRDD.foreachPartition(iter -> iter.forEachRemaining(path -> {
      String urlToCopy = prefix + path;
      log.info("Parsing {}", urlToCopy);
      // The reader is closed after each file; it was previously left open,
      // leaking one remote connection per processed path.
      try (ArchiveReader reader = WARCReaderFactory.get(new URL(urlToCopy), 0)) {
        for (ArchiveRecord record : reader) {
          ArchiveUtil.buildPageIgnoreErrors(record);
        }
      } catch (Exception e) {
        log.error("Exception while processing {}", path, e);
      }
    }));
  }
}

开发者ID:astralway，项目名称:webindex，代码行数:39，代码来源:TestParser.java

注：本文中的org.archive.io.ArchiveReader类示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。