本文整理汇总了Java中org.archive.io.ArchiveReader.iterator方法的典型用法代码示例。如果您正苦于以下问题：Java ArchiveReader.iterator方法的具体用法？Java ArchiveReader.iterator怎么用？Java ArchiveReader.iterator使用的例子？那么恭喜您，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.archive.io.ArchiveReader的用法示例。
在下文中一共展示了ArchiveReader.iterator方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。
示例1: findFirstRecordWithUri
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Scans an ARC file for the first record whose header URL equals {@code uri}.
 *
 * <p>The very first record of the file is consumed and never compared; the
 * original author notes that ARCReaderFactory guarantees it exists and is the
 * filedesc header record (otherwise the factory would have thrown).
 *
 * <p>NOTE(review): the reader is not closed here; presumably the returned
 * record still reads from it, so the caller owns its lifetime. Confirm.
 *
 * @param f the ARC file to open
 * @param uri the URL to look for, compared with {@code equals}
 * @return the first matching record, or {@code null} when none matches
 * @throws IOException declared for failures while opening the file
 */
private ArchiveRecord findFirstRecordWithUri(File f, String uri)
        throws IOException {
    ArchiveReader reader = ARCReaderFactory.get(f);
    Iterator<ArchiveRecord> records = reader.iterator();
    // Skip the ARC file header record before looking at real entries.
    records.next();
    ArchiveRecord match = null;
    while (match == null && records.hasNext()) {
        ArchiveRecord candidate = records.next();
        if (candidate.getHeader().getUrl().equals(uri)) {
            match = candidate;
        }
    }
    return match;
}
示例2: initialize
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Opens the file behind the given split and prepares record iteration.
 *
 * <p>Stores the file length in {@code fileLength}, the record iterator in
 * {@code recordIterator}, and fresh key/value holders in {@code currentKey}
 * and {@code currentArcRecord}.
 *
 * <p>An {@link IOException} raised during set-up is logged at SEVERE and NOT
 * rethrown, so the method returns normally with those fields possibly unset.
 * If the failure happens after the input stream was opened but before a reader
 * was created on top of it, the stream is closed so the handle does not leak.
 *
 * @param is the input split; cast to {@code FileSplit}
 * @param tac the task attempt context supplying the configuration
 * @throws IOException declared by the overridden method
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) is;
    FSDataInputStream fileInputStream = null;
    ArchiveReader reader = null;
    try {
        Path path = fileSplit.getPath();
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());
        fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();
        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        // No reader took over the stream, so nothing else will ever close it.
        if (reader == null && fileInputStream != null) {
            try {
                fileInputStream.close();
            } catch (IOException closeEx) {
                Logger.getLogger(ArcRecordReader.class.getName()).log(Level.WARNING, null, closeEx);
            }
        }
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
    }
}
示例3: index
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * <p>Opens the archive file named by {@code getName()} under {@code baseDir},
 * records whether it is compressed, and walks every record. WARC records are
 * indexed only when their type is {@code WARCConstants.RESPONSE} and their
 * mimetype is not {@code text/dns}; every non-WARC record is passed to the ARC
 * indexer. The reader is closed even when indexing fails part-way through.
 *
 * @param baseDir the base directory of the arcs
 * @return the map filled in by the record indexing helpers
 * @throws IOException thrown if there is an error
 * @throws ParseException declared for the record indexing helpers;
 *         NOTE(review): confirm which helper raises it
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
    Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
    File theArchiveFile = new File(baseDir, this.getName());
    ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
    try {
        this.compressed = reader.isCompressed();
        Iterator<ArchiveRecord> it = reader.iterator();
        while (it.hasNext()) {
            ArchiveRecord rec = it.next();
            if (rec instanceof WARCRecord) {
                String type = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
                if (type.equals(WARCConstants.RESPONSE)) {
                    String mime = rec.getHeader().getMimetype();
                    // DNS lookup responses are not harvested resources; skip them.
                    if (!mime.equals("text/dns")) {
                        indexWARCResponse(rec, results);
                    }
                }
            } else {
                indexARCRecord(rec, results);
            }
        }
    } finally {
        // Release the file handle even if a record fails to index.
        reader.close();
    }
    return results;
}
示例4: testFtpHarvesting
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Test we can harvest from FTP-sites using the FTP processor.
 * Downloads max 25 files from klid.dk using the seed:
 * ftp://ftp.klid.dk/OpenOffice/haandbog
 *
 * <p>After the harvest, exactly one ARC file is expected in
 * {@code TestInfo.HERITRIX_ARCS_DIR}. Every record in it must be an
 * {@code ARCRecord}, more than 10 distinct URLs must have been harvested, and
 * one specific PDF URL must be among them. The reader over the ARC file is
 * closed once the records have been read, including when an assertion fails.
 *
 * @throws Exception on any failure while running the harvest or reading the ARC file
 */
public void testFtpHarvesting() throws Exception {
    validateOrder(TestInfo.FTPHARVESTING_ORDERXML_FILE);
    File tempDir = mtf.newTmpDir();
    LuceneUtils.makeDummyIndex(tempDir);
    runHeritrix(TestInfo.FTPHARVESTING_ORDERXML_FILE,
            TestInfo.FTP_HARVESTING_SEEDLIST_FILE, tempDir);
    // test that both the heritrix-temp-dir and the bitarchive has at least one file - and has the same file !!
    File[] files = TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER);
    assertNotNull("Files array should be non-null", files);
    assertEquals("Should be exactly one file in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(),
            1, files.length);
    File firstArcFile = files[0];
    assertNotNull("Should be ARC files in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(),
            firstArcFile);
    Set<String> urlSet = new HashSet<String>();
    ArchiveReader reader = ArchiveReaderFactory.get(firstArcFile);
    try {
        Iterator<ArchiveRecord> i = reader.iterator();
        while (i.hasNext()) {
            ArchiveRecord o = i.next();
            if (o instanceof ARCRecord) {
                ARCRecord a = (ARCRecord) o;
                urlSet.add(a.getMetaData().getUrl());
            } else {
                fail("ARCrecords expected, not objects of class "
                        + o.getClass().getName());
            }
        }
    } finally {
        // Do not leave the ARC file open when the test fails mid-iteration.
        reader.close();
    }
    assertTrue("Should have harvested more than 10 objects but only harvested "
            + urlSet.size(), urlSet.size() > 10);
    String searchString = "ftp://ftp.klid.dk/OpenOffice/haandbog/Haandbog-2-2.pdf";
    if (!urlSet.contains(searchString)) {
        fail("Expected to harvest '" + searchString + "' but we only harvested : "
                + StringUtils.conjoin(",", urlSet));
    }
}
示例5: testWarcReading
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Checks that {@code TestInfo.WARC_FILE1} can be opened and iterated.
 *
 * <p>Asserts that the file has at least one record, then prints the mimetype
 * and URL of every record header to standard output. The reader is closed
 * afterwards, including when the assertion fails.
 *
 * @throws Exception on any failure while opening or reading the WARC file
 */
public void testWarcReading() throws Exception {
    ArchiveReader archiveReader = ArchiveReaderFactory.get(TestInfo.WARC_FILE1);
    try {
        Iterator<? extends ArchiveRecord> it = archiveReader.iterator();
        assertTrue("Warc should contains records", it.hasNext());
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            System.out.println("mimetype:" + next.getHeader().getMimetype());
            System.out.println("url:" + next.getHeader().getUrl());
        }
    } finally {
        // Release the WARC file handle regardless of the test outcome.
        archiveReader.close();
    }
}
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:13,代码来源:ExtractCDXFromWarcJobTester.java
示例6: testBasic
import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Exercises {@code ArchiveUtil.buildPage} against two WARC fixtures.
 *
 * <p>First part: builds a page from the record returned by {@code get()} on
 * {@code wat.warc} and checks its URL, URI, crawl date, server, title and
 * outbound link count. Second part: builds a page from every record of
 * {@code wat-18.warc}, counting records that raise {@code ParseException} as
 * invalid, and expects 18 valid and 0 invalid. Each reader is closed when its
 * part finishes, including when an assertion fails.
 *
 * @throws IOException if a fixture cannot be opened, read, or closed
 * @throws ParseException declared for the first {@code buildPage} call
 */
@Test
public void testBasic() throws IOException, ParseException {
    ArchiveReader archiveReader = WARCReaderFactory.get(new File("src/test/resources/wat.warc"));
    try {
        Page page = ArchiveUtil.buildPage(archiveReader.get());
        Assert.assertNotNull(page);
        Assert.assertFalse(page.isEmpty());
        Assert
                .assertEquals(
                        "http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
                        page.getUrl());
        Assert
                .assertEquals(
                        "com.1079ishot>>o>/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
                        page.getUri());
        Assert.assertEquals("2015-04-18T03:35:13Z", page.getCrawlDate());
        Assert.assertEquals("nginx/1.6.2", page.getServer());
        Assert
                .assertEquals(
                        "Presale Password – Trey Songz & Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at ",
                        page.getTitle());
        Assert.assertEquals(0, page.getOutboundLinks().size());
    } finally {
        // Close only after every assertion on the page has run.
        archiveReader.close();
    }
    ArchiveReader ar2 = WARCReaderFactory.get(new File("src/test/resources/wat-18.warc"));
    int valid = 0;
    int invalid = 0;
    try {
        Iterator<ArchiveRecord> records = ar2.iterator();
        while (records.hasNext()) {
            try {
                ArchiveRecord r = records.next();
                ArchiveUtil.buildPage(r);
                valid++;
            } catch (ParseException e) {
                // A record that cannot be parsed is counted, not fatal.
                invalid++;
            }
        }
    } finally {
        ar2.close();
    }
    Assert.assertEquals(18, valid);
    Assert.assertEquals(0, invalid);
}