当前位置: 首页>>代码示例>>Java>>正文


Java ArchiveReader.iterator方法代码示例

本文整理汇总了Java中org.archive.io.ArchiveReader.iterator方法的典型用法代码示例。如果您正苦于以下问题:Java ArchiveReader.iterator方法的具体用法?Java ArchiveReader.iterator怎么用?Java ArchiveReader.iterator使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在org.archive.io.ArchiveReader的用法示例。


在下文中一共展示了ArchiveReader.iterator方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Java代码示例。

示例1: findFirstRecordWithUri

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Looks through an ARC file for the first record (after the leading file
 * header record) whose header URL equals the given URI.
 *
 * @param f the ARC file to scan
 * @param uri the URI to look for
 * @return the first matching record, or null if no record matches
 * @throws IOException if the ARC file cannot be opened
 */
private ArchiveRecord findFirstRecordWithUri(File f, String uri) 
throws IOException {
    ArchiveReader reader = ARCReaderFactory.get(f);
    Iterator<ArchiveRecord> records = reader.iterator();

    // Discard the ARC file header record. ARCReaderFactory guarantees the
    // first record exists and is a filedesc, or it would throw an exception.
    records.next();

    // NOTE(review): the reader is intentionally left open here; presumably
    // the returned record still reads from it -- confirm with the callers.
    ArchiveRecord match = null;
    while (match == null && records.hasNext()) {
        ArchiveRecord candidate = records.next();
        if (candidate.getHeader().getUrl().equals(uri)) {
            match = candidate;
        }
    }
    return match;
}
 
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:20,代码来源:ARCUtilsTester.java

示例2: initialize

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Opens the file behind the given split and prepares this reader for
 * iterating over its archive records.
 *
 * Sets fileLength to the length of the file, recordIterator to an iterator
 * over the archive's records, and allocates fresh currentKey and
 * currentArcRecord holders.
 *
 * @param is the input split; must be a FileSplit
 * @param tac the task attempt context supplying the configuration
 * @throws IOException if the file cannot be opened or read as an archive;
 *         the failure is logged and then propagated so that the reader is
 *         never used in a half-initialized state
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit is, TaskAttemptContext tac) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) is;
    Path path = fileSplit.getPath();

    FSDataInputStream fileInputStream = null;
    try {
        FileSystem fileSystem = path.getFileSystem(tac.getConfiguration());

        fileInputStream = fileSystem.open(path);
        FileStatus fileStatus = fileSystem.getFileStatus(path);
        fileLength = fileStatus.getLen();
        ArchiveReader reader = ArchiveReaderFactory.get(path.getName(), fileInputStream, true);
        recordIterator = reader.iterator();

        currentKey = new Text();
        currentArcRecord = new ArcRecord();
    } catch (IOException ex) {
        Logger.getLogger(ArcRecordReader.class.getName()).log(Level.SEVERE, null, ex);
        // Do not leak the stream when the archive reader could not be set up.
        if (fileInputStream != null) {
            try {
                fileInputStream.close();
            } catch (IOException closeEx) {
                Logger.getLogger(ArcRecordReader.class.getName()).log(Level.WARNING, null, closeEx);
            }
        }
        // Propagate instead of swallowing: otherwise recordIterator stays
        // unset and the failure would only surface later in an unrelated place.
        throw ex;
    }
}
 
开发者ID:shsdev,项目名称:archiventory,代码行数:26,代码来源:ArcRecordReader.java

示例3: index

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Create and return the index of the ArcHarvestFile.
 *
 * WARC records are indexed only when they are response records whose
 * mimetype is not "text/dns"; every non-WARC record is passed to the ARC
 * indexer. As a side effect, the compressed flag of this object is set from
 * the reader.
 *
 * @param baseDir the base directory of the arcs
 * @return the map filled in by indexWARCResponse and indexARCRecord
 * @throws IOException thrown if there is an error reading the archive
 * @throws ParseException propagated from the indexing helpers
 */
public Map<String, HarvestResourceDTO> index(File baseDir) throws IOException, ParseException {
	Map<String, HarvestResourceDTO> results = new HashMap<String, HarvestResourceDTO>();
	
	File theArchiveFile = new File(baseDir, this.getName());
	ArchiveReader reader = ArchiveReaderFactory.get(theArchiveFile);
	try {
		this.compressed = reader.isCompressed();
		
		Iterator<ArchiveRecord> it = reader.iterator();
		while(it.hasNext()) {
			ArchiveRecord rec = it.next();
			
			if(rec instanceof WARCRecord) {
				// A record without a type header cannot be a response; skip it
				// rather than failing with a NullPointerException.
				Object typeValue = rec.getHeader().getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
				if(typeValue != null && typeValue.toString().equals(WARCConstants.RESPONSE)) {
					String mime = rec.getHeader().getMimetype();
					// Constant-first comparison: a missing mimetype is treated as
					// "not DNS" and the record is still indexed.
					if(!"text/dns".equals(mime)) {
						indexWARCResponse(rec, results);
					}
				}
			}
			else {
				indexARCRecord(rec, results);
			}
		}
	}
	finally {
		// Release the underlying file even when indexing fails part-way.
		reader.close();
	}
	
	return results;
}
 
开发者ID:DIA-NZ,项目名称:webcurator,代码行数:35,代码来源:ArcHarvestFileDTO.java

示例4: testFtpHarvesting

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Test we can harvest from FTP-sites using the FTP processor.
 * Downloads max 25 files from klid.dk using the seed:
 * ftp://ftp.klid.dk/OpenOffice/haandbog
 *
 * Verifies that exactly one ARC file was produced, that it contains only
 * ARC records, that more than 10 distinct URLs were harvested, and that a
 * specific known PDF is among them.
 *
 * @throws Exception if the harvest or reading the resulting ARC file fails
 */
public void testFtpHarvesting() throws Exception {
    validateOrder(TestInfo.FTPHARVESTING_ORDERXML_FILE);
    File tempDir = mtf.newTmpDir();
    LuceneUtils.makeDummyIndex(tempDir);
    runHeritrix(TestInfo.FTPHARVESTING_ORDERXML_FILE,
            TestInfo.FTP_HARVESTING_SEEDLIST_FILE, tempDir);

    // test that both the heritrix-temp-dir and the bitarchive has at least one file - and has the same file !!
    File[] files = TestInfo.HERITRIX_ARCS_DIR.listFiles(FileUtils.ARCS_FILTER);
    assertNotNull("Files array should be non-null", files);
    assertEquals("Should be exactly one file in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(),
            1, files.length);
    File first_arcfile = files[0];
    assertNotNull("Should be ARC files in " + TestInfo.HERITRIX_ARCS_DIR.getAbsolutePath(),
            first_arcfile);

    // Collect the URL of every record; close the reader even if an
    // assertion fails so the ARC file handle is not leaked.
    Set<String> urlSet = new HashSet<String>();
    ArchiveReader reader = ArchiveReaderFactory.get(first_arcfile);
    try {
        Iterator<ArchiveRecord> i = reader.iterator();
        while (i.hasNext()) {
            ArchiveRecord o = i.next();
            if (o instanceof ARCRecord) {
                ARCRecord a = (ARCRecord) o;
                urlSet.add(a.getMetaData().getUrl());
            } else {
                fail("ARCrecords expected, not objects of class"
                        + o.getClass().getName());
            }
        }
    } finally {
        reader.close();
    }
    assertTrue("Should have harvested more than 10 objects but only harvested "
            + urlSet.size(), urlSet.size() > 10);
    String searchString = "ftp://ftp.klid.dk/OpenOffice/haandbog/Haandbog-2-2.pdf";
    if (!urlSet.contains(searchString)) {
        fail("Expected to harvest '" + searchString + "' but we only harvested : "
                + StringUtils.conjoin(",", urlSet));
    }
}
 
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:43,代码来源:HeritrixTests.java

示例5: testWarcReading

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Checks that the test WARC file can be opened and contains at least one
 * record, printing the mimetype and URL of every record found.
 *
 * @throws Exception if the WARC file cannot be opened or read
 */
public void testWarcReading() throws Exception{

    ArchiveReader archiveReader = ArchiveReaderFactory.get(TestInfo.WARC_FILE1);
    try {
        Iterator<? extends ArchiveRecord> it = archiveReader.iterator();
        assertTrue("Warc should contains records", it.hasNext());
        while (it.hasNext()) {
            ArchiveRecord next = it.next();
            System.out.println("mimetype:" + next.getHeader().getMimetype());
            System.out.println("url:" + next.getHeader().getUrl());
        }
    } finally {
        // Release the file handle even when the assertion above fails.
        archiveReader.close();
    }
}
 
开发者ID:netarchivesuite,项目名称:netarchivesuite-svngit-migration,代码行数:13,代码来源:ExtractCDXFromWarcJobTester.java

示例6: testBasic

import org.archive.io.ArchiveReader; //导入方法依赖的package包/类
/**
 * Builds a Page from the single-record WAT fixture and checks its fields,
 * then iterates the 18-record fixture and expects every record to build
 * without a ParseException.
 *
 * @throws IOException if a fixture file cannot be opened or read
 * @throws ParseException if the first fixture's record cannot be parsed
 */
@Test
public void testBasic() throws IOException, ParseException {

  ArchiveReader archiveReader = WARCReaderFactory.get(new File("src/test/resources/wat.warc"));
  try {
    Page page = ArchiveUtil.buildPage(archiveReader.get());
    Assert.assertNotNull(page);
    Assert.assertFalse(page.isEmpty());

    Assert
        .assertEquals(
            "http://1079ishot.com/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
            page.getUrl());
    Assert
        .assertEquals(
            "com.1079ishot>>o>/presale-password-trey-songz-young-jeezy-pre-christmas-bash/screen-shot-2011-10-27-at-11-12-06-am/",
            page.getUri());

    Assert.assertEquals("2015-04-18T03:35:13Z", page.getCrawlDate());
    Assert.assertEquals("nginx/1.6.2", page.getServer());
    Assert
        .assertEquals(
            "Presale Password &#8211; Trey Songz &#038; Young Jeezy Pre-Christmas Bash Screen shot 2011-10-27 at ",
            page.getTitle());
    Assert.assertEquals(0, page.getOutboundLinks().size());
  } finally {
    // Close the reader even when an assertion fails.
    archiveReader.close();
  }

  ArchiveReader ar2 = WARCReaderFactory.get(new File("src/test/resources/wat-18.warc"));

  int valid = 0;
  int invalid = 0;
  try {
    Iterator<ArchiveRecord> records = ar2.iterator();
    while (records.hasNext()) {
      try {
        ArchiveRecord r = records.next();
        ArchiveUtil.buildPage(r);
        valid++;
      } catch (ParseException e) {
        // Counted rather than rethrown; the assertion below expects zero.
        invalid++;
      }
    }
  } finally {
    ar2.close();
  }
  Assert.assertEquals(18, valid);
  Assert.assertEquals(0, invalid);
}
 
开发者ID:astralway,项目名称:webindex,代码行数:43,代码来源:ArchiveUtilTest.java


注:本文中的org.archive.io.ArchiveReader.iterator方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。