本文整理汇总了 Java 中 org.apache.lucene.benchmark.byTask.feeds.DocData 类的典型用法代码示例。如果您正苦于以下问题：Java DocData 类的具体用法？Java DocData 怎么用？Java DocData 使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
DocData 类属于 org.apache.lucene.benchmark.byTask.feeds 包，在下文中一共展示了 DocData 类的 9 个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于系统推荐出更棒的 Java 代码示例。
示例1: getNextDocData
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Fills the supplied {@code docData} from the next tuple produced by {@code parser}.
 *
 * <p>The tuple's ID becomes the document name, the title and body joined by a single
 * space become the document body, and the title is additionally stored under the
 * {@code "url"} property.
 *
 * @param docData reusable instance; it is cleared and then repopulated
 * @return the same {@code docData} instance that was passed in
 */
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
  final String[] fields = parser.next();

  docData.clear();
  docData.setName(fields[ID]);

  final String title = fields[TITLE];
  docData.setBody(title + " " + fields[BODY]);
  docData.setDate(fields[DATE]);
  docData.setTitle(title);

  // TODO: @leo The title is used as a stand-in for a URL; a real URL may be needed
  // some day. For sorting purposes (producing sorted document ids from unsorted
  // input) the title is good enough.
  final Properties urlProps = new Properties();
  urlProps.setProperty("url", title);
  docData.setProps(urlProps);

  return docData;
}
示例2: parse
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Parses HTML from a {@link Reader} by wrapping it in an {@link InputSource} and
 * delegating to the {@code InputSource}-based overload.
 *
 * @throws IOException on I/O failure, or wrapping any {@link SAXException} raised by
 *     the delegate (the SAX exception is kept as the cause)
 */
@Override
public DocData parse(DocData docData, String name, Date date, Reader reader,
    ContentSourceDateUtil trecSrc) throws IOException {
  final InputSource source = new InputSource(reader);
  try {
    return parse(docData, name, date, source, trecSrc);
  } catch (SAXException saxe) {
    // Callers only deal with IOException; keep the original failure as the cause.
    throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
  }
}
示例3: parse
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Parses HTML from a {@link Reader} by wrapping it in an {@link InputSource} and
 * delegating to the {@code InputSource}-based overload.
 */
@Override
public DocData parse(DocData docData, String name, Date date, Reader reader,
    ContentSourceDateUtil trecSrc) throws IOException {
  final InputSource source = new InputSource(reader);
  return parse(docData, name, date, source, trecSrc);
}
示例4: getNextDocData
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Pass-through implementation: hands back the supplied {@code docData} without
 * modifying it.
 *
 * @param docData the instance to return
 * @return {@code docData}, unchanged
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
  // Nothing is read or populated here.
  return docData;
}
示例5: getNextDocData
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Reads the next TREC document from the current input and parses it into
 * {@code docData}.
 *
 * <p>Reading from the shared {@code reader} happens under {@code lock}; the parsing
 * step runs outside the lock on the buffer obtained from {@code getDocBuffer()}.
 *
 * @param docData reusable result object handed to {@code trecDocParser}
 * @return the value returned by {@code trecDocParser.parse(...)}
 * @throws NoMoreDataException presumably when no further input is available -
 *     raised by helpers not visible here; confirm against {@code openNextFile()}/{@code read()}
 * @throws IOException on a low-level I/O error
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
String name = null;
// NOTE(review): the comment near the end of this method treats everything after the
// sync block as thread-private, so this buffer is presumably per-thread - confirm.
StringBuilder docBuf = getDocBuffer();
ParsePathType parsedPathType;
// protect reading from the TREC files by multiple threads. The rest of the
// method, i.e., parsing the content and returning the DocData can run unprotected.
synchronized (lock) {
// lazily open the first (or next) input file
if (reader == null) {
openNextFile();
}
// 1. skip until doc start - required for all TREC formats
docBuf.setLength(0);
read(docBuf, DOC, false, false);
// save parsedFile for passing trecDataParser after the sync block, in
// case another thread will open another file in between.
parsedPathType = currPathType;
// 2. name - required for all TREC formats
docBuf.setLength(0);
read(docBuf, DOCNO, true, false);
// the name is the trimmed text between the DOCNO tag and its terminating tag
// NOTE(review): if TERMINATING_DOCNO is absent from docBuf, indexOf returns -1 and
// substring will throw - confirm read() guarantees the terminator is present.
name = docBuf.substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO,
DOCNO.length())).trim();
// unless disabled, suffix the name with the current iteration value
if (!excludeDocnameIteration) {
name = name + "_" + iteration;
}
// 3. read all until end of doc
docBuf.setLength(0);
read(docBuf, TERMINATING_DOC, false, true);
}
// count char length of text to be parsed (may be larger than the resulted plain doc body text).
addBytes(docBuf.length());
// This code segment relies on HtmlParser being thread safe. When we get
// here, everything else is already private to that thread, so we're safe.
docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType);
addItem();
return docData;
}
示例6: getNextDocData
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Reads the next WARC record of type {@code "response"} and, when its target URI uses
 * a supported scheme, parses the HTML payload into {@code docData}.
 *
 * <p>Only the record reading is performed under {@code lock}; the HTML parsing runs
 * outside the lock. Records whose type is not {@code "response"} are skipped. When the
 * current file yields no more records, the next file is opened and reading continues.
 *
 * <p>If the URI is missing, uses an unsupported scheme, or the response has no blank
 * line separating headers from content, a message is written to {@code System.err}
 * and {@code docData} is returned without being parsed (the interface offers no way
 * to signal that an entry should be skipped).
 *
 * @param docData reusable result object handed to the HTML parser
 * @return the parsed doc data, or the supplied {@code docData} if the record could
 *     not be parsed
 * @throws NoMoreDataException presumably raised by {@code openNextFile()} when the
 *     input is exhausted - confirm against that method
 * @throws IOException on a low-level I/O error
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
  WarcRecord currRec = null;
  // Protect reading from the WARC files by multiple threads. The rest of the
  // method, i.e., parsing the content and returning the DocData, can run unprotected.
  synchronized (lock) {
    // Iterate instead of recursing: recursion kept the lock held during parsing and
    // grew the stack by one frame per file without a response record.
    while (currRec == null) {
      if (reader == null) {
        openNextFile();
      }
      final WarcRecord candidate = WarcRecord.readNextWarcRecord(reader);
      if (candidate == null) {
        // Current file is exhausted; move on to the next one.
        openNextFile();
      } else if (candidate.getHeaderRecordType().equals("response")) {
        currRec = candidate;
      }
      // Any other record type is a special auxiliary entry (e.g., at the beginning
      // of the file) and is skipped.
    }
  }

  final Date date = parseDate(currRec.getHeaderMetadataItem("WARC-Date"));
  final String url = currRec.getHeaderMetadataItem("WARC-Target-URI");

  // This code segment relies on HtmlParser being thread safe. When we get
  // here, everything else is already private to that thread, so we're safe.
  final boolean supportedScheme = url != null
      && (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://"));

  if (supportedScheme) {
    final String response = currRec.getContentUTF8();
    // The content starts after the first blank line of the response.
    final int endOfHead = response.indexOf("\n\n");
    if (endOfHead >= 0) {
      final String html = response.substring(endOfHead + 2);
      docData = htmlParser.parse(docData, url, date, new StringReader(html), this);
      // This should be done after parse(), b/c parse() resets properties
      docData.getProps().put("url", url);
    } else {
      /*
       * TODO: @leo What do we do here exactly?
       * The interface doesn't allow us to signal that an entry should be skipped.
       */
      System.err.println("Cannot extract HTML in URI: " + url);
    }
  } else {
    /*
     * TODO: @leo What do we do here exactly?
     * The interface doesn't allow us to signal that an entry should be skipped.
     */
    System.err.println("Ignoring schema in URI: " + url);
  }

  addItem();
  return docData;
}
示例7: parse
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Parses a TREC web document held in {@code docBuf}.
 *
 * <p>The line following the {@code DOCHDR} tag is taken as the document URL
 * (lower-cased and trimmed). For {@code http://}, {@code ftp://} and {@code https://}
 * URLs, the text after {@code TERMINATING_DOCHDR} is handed to the HTML parser of
 * {@code trecSrc} and the URL is stored under the {@code "url"} property. In every
 * other case {@code docData} is returned as received.
 *
 * @param docData reusable result
 * @param name name passed on to the HTML parser
 * @param trecSrc content source providing the HTML parser and date parsing
 * @param docBuf raw document text
 * @param pathType not used by this implementation
 * @return the parsed doc data, or the supplied {@code docData} if nothing was parsed
 * @throws IOException if the HTML parser fails with an I/O error
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc,
    StringBuilder docBuf, ParsePathType pathType) throws IOException {
  /*
   * TODO: @leo What do we do on the early returns exactly?
   * The interface doesn't allow us to signal that an entry should be skipped.
   */
  final int hdrPos = docBuf.indexOf(DOCHDR);
  if (hdrPos < 0) {
    return docData;
  }

  // The URL sits on the line right after the header tag.
  final int urlLineStart = hdrPos + DOCHDR.length() + 1;
  final int urlLineEnd = docBuf.indexOf("\n", urlLineStart);
  if (urlLineEnd < 0) {
    System.err.println("Invalid header: " + docBuf.toString());
    return docData;
  }

  final String url = docBuf.substring(urlLineStart, urlLineEnd).toLowerCase().trim();
  final boolean supportedScheme =
      url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://");
  if (!supportedScheme) {
    System.err.println("Ignoring schema in URI: " + url);
    return docData;
  }

  // Skip the non-html header text, optionally picking up the date on the way.
  final int hdrEnd = docBuf.indexOf(TERMINATING_DOCHDR, hdrPos);
  final String dateStr = extract(docBuf, DATE, DATE_END, hdrEnd, null);
  final Date date = (dateStr != null) ? trecSrc.parseDate(dateStr) : null;

  final String html = docBuf.substring(hdrEnd + TERMINATING_DOCHDR.length());
  docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
  // This should be done after parse(), b/c parse() resets properties
  docData.getProps().put("url", url);
  return docData;
}
示例8: parse
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Parse the text prepared in docBuf into a result DocData;
 * no synchronization is required.
 * @param docData reusable result
 * @param name name that should be set to the result
 * @param trecSrc calling trec content source
 * @param docBuf text to parse
 * @param pathType type of parsed file, or null if unknown - may be used by
 * parsers to alter their behavior according to the file path type.
 * @return the resulting doc data (presumably the populated {@code docData};
 * the exact instance returned is up to the implementation)
 * @throws IOException if the implementation fails with an I/O error
 */
public abstract DocData parse(DocData docData, String name, TrecContentSource trecSrc,
StringBuilder docBuf, ParsePathType pathType) throws IOException;
示例9: parse
import org.apache.lucene.benchmark.byTask.feeds.DocData; //导入依赖的package包/类
/**
 * Parse the input Reader and return DocData.
 * The provided name and date are used for the result; when the date is null,
 * an attempt is made to set it from the parsed data.
 * @param docData reusable result (presumably populated and returned; confirm
 * against the implementation)
 * @param name name of the result doc data.
 * @param date date of the result doc data. If null, attempt to set by parsed data.
 * @param reader reader of the html text to parse.
 * @param trecSrc the {@link ContentSourceDateUtil} used to parse dates.
 * @return Parsed doc data.
 * @throws IOException If there is a low-level I/O error.
 */
public DocData parse(DocData docData, String name, Date date, Reader reader, ContentSourceDateUtil trecSrc) throws IOException;