当前位置: 首页>>代码示例>>Python>>正文


Python FieldType.setStoreTermVectorPositions方法代码示例

本文整理汇总了Python中org.apache.lucene.document.FieldType.setStoreTermVectorPositions方法的典型用法代码示例。如果您正苦于以下问题：Python FieldType.setStoreTermVectorPositions方法的具体用法？Python FieldType.setStoreTermVectorPositions怎么用？Python FieldType.setStoreTermVectorPositions使用的例子？那么，这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类org.apache.lucene.document.FieldType的用法示例。


在下文中一共展示了FieldType.setStoreTermVectorPositions方法的5个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。

示例1: index_docs

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
    def index_docs(self, train_set, writer):
        """Add one Lucene document per training example to ``writer``.

        Each item of ``train_set`` is read by key ('Answer', 'Question ID',
        'category', 'Sentence Position', 'Question Text').  The first four
        values become untokenized keyword fields; the question text and the
        text returned by ``self.wiki_reader.get_text(answer)`` become
        tokenized fields with term vectors.

        Args:
            train_set: iterable of mapping-like examples with the keys above.
            writer: object whose ``addDocument(doc)`` receives each document.
        """
        # Keyword fields: indexed, stored, not tokenized, docs + frequencies.
        keyword_type = FieldType()
        keyword_type.setIndexed(True)
        keyword_type.setStored(True)
        keyword_type.setTokenized(False)
        keyword_type.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # Text fields: indexed, stored, tokenized, with term vectors that
        # carry offsets, payloads and positions.
        text_type = FieldType()
        text_type.setIndexed(True)
        text_type.setStored(True)
        text_type.setTokenized(True)
        text_type.setStoreTermVectorOffsets(True)
        text_type.setStoreTermVectorPayloads(True)
        text_type.setStoreTermVectorPositions(True)
        text_type.setStoreTermVectors(True)
        text_type.setIndexOptions(
            FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        # (Lucene field name, key in the training example), in insertion order.
        keyword_columns = (
            ("answer", 'Answer'),
            ("qid", 'Question ID'),
            ("category", 'category'),
            ("position", 'Sentence Position'),
        )

        for example in train_set:
            document = Document()
            for field_name, example_key in keyword_columns:
                document.add(
                    Field(field_name, example[example_key], keyword_type))
            document.add(
                Field("question", example['Question Text'], text_type))
            wiki_text = self.wiki_reader.get_text(example['Answer'])
            document.add(Field("wiki_plain", wiki_text, text_type))
            writer.addDocument(document)
开发者ID:sangheestyle,项目名称:nlp2014,代码行数:30,代码来源:index.py

示例2: LuceneDocumentField

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
class LuceneDocumentField(object):
    """Holds one pre-configured Lucene FieldType per supported field kind."""

    def __init__(self):
        """Create the five field types handed out by get_field()."""

        def build(tokenized, term_vectors=False, positions=False):
            # Every type is indexed and stored; the flags select the rest.
            # Term-vector setters are only called when requested, so types
            # without them keep the FieldType defaults.
            field_type = FieldType()
            field_type.setIndexed(True)
            field_type.setStored(True)
            field_type.setTokenized(tokenized)
            if term_vectors:
                field_type.setStoreTermVectors(True)
            if positions:
                field_type.setStoreTermVectorPositions(True)
            return field_type

        # FIELD_ID: stored, indexed, not tokenized
        self.field_id = build(False)

        # FIELD_ID_TV: like FIELD_ID, plus term vectors (positions not enabled)
        self.field_id_tv = build(False, term_vectors=True)

        # FIELD_TEXT: stored, indexed, tokenized (no term vectors requested)
        self.field_text = build(True)

        # FIELD_TEXT_TV: like FIELD_TEXT, plus term vectors (positions not
        # enabled)
        self.field_text_tv = build(True, term_vectors=True)

        # FIELD_TEXT_TVP: like FIELD_TEXT, plus term vectors with positions
        # (term-vector offsets are not enabled)
        self.field_text_tvp = build(True, term_vectors=True, positions=True)

    def get_field(self, type):
        """Return the FieldType matching an internal Lucene.FIELDTYPE_* value.

        Raises:
            Exception: if ``type`` equals none of the known FIELDTYPE_ values.
        """
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        if type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        if type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        if type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        if type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        raise Exception("Unknown field type")
开发者ID:renespeck,项目名称:TAGME_Reproducibility,代码行数:58,代码来源:lucene_tools.py

示例3: tweetIndexer

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
    def tweetIndexer(self, writer):
        """Index the tweets stored in ``json/tweets-<i>.json`` into ``writer``.

        Files are read in order for ``i`` = 0..499, stopping at the first
        missing file.  Every line is parsed as one JSON object; objects that
        contain a ``'limit'`` key are ignored.  One document is added to
        ``writer`` for each remaining tweet.

        Tweets that have neither ``coordinates`` nor ``place`` are skipped.
        A tweet whose processing raises is reported on stdout and skipped so
        that the rest of the batch is still indexed.  A line that is not
        valid JSON still aborts the run, but the file is closed first.

        Args:
            writer: object whose ``addDocument(doc)`` receives each document
                (presumably a Lucene IndexWriter -- confirm with the caller).
        """
        # t1: indexed, stored, not tokenized, docs + frequencies.
        t1 = FieldType()
        t1.setIndexed(True)
        t1.setStored(True)
        t1.setTokenized(False)
        t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # t2: indexed, stored, tokenized, with term vectors that carry
        # offsets, payloads and positions.
        t2 = FieldType()
        t2.setIndexed(True)
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setStoreTermVectorOffsets(True)
        t2.setStoreTermVectorPayloads(True)
        t2.setStoreTermVectorPositions(True)
        t2.setStoreTermVectors(True)
        t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        for i in range(0, 500):
            path = "json/tweets-" + str(i) + ".json"
            if not os.path.isfile(path):
                break

            print("adding tweets-" + str(i) + ".json")

            # ``with`` guarantees the file is closed even when a line fails
            # to parse; iterating the file avoids loading it all at once.
            with open(path, "r") as tweets:
                for line in tweets:
                    tweet = json.loads(line)
                    if 'limit' in tweet:
                        continue
                    try:
                        doc = Document()
                        doc.add(Field("file", path, t1))
                        sname = tweet['user']['screen_name']
                        tid = str(tweet['id'])
                        text = tweet['text']
                        uname = tweet['user']['name']
                        created = tweet['created_at']
                        tstamp = tweet['timestamp_ms']
                        place_info = tweet['place']
                        coords = tweet['coordinates']

                        # No location at all: nothing to put in geo_lat /
                        # geo_lng, so the tweet is not indexed.
                        if not coords and not place_info:
                            continue

                        place = ""
                        if place_info:
                            place = (place_info['full_name'] + ", " +
                                     place_info['country'])

                        if coords:
                            # Index 1 is used as latitude, index 0 as
                            # longitude.
                            lat = str(coords['coordinates'][1])
                            lng = str(coords['coordinates'][0])
                        else:
                            # Average points 0 and 2 of the first
                            # bounding-box ring.  2.0 forces true division
                            # under Python 2 even for integer coordinates.
                            ring = place_info['bounding_box']['coordinates'][0]
                            lat = str((ring[0][1] + ring[2][1]) / 2.0)
                            lng = str((ring[0][0] + ring[2][0]) / 2.0)

                        titles = ""
                        urls = ""
                        exist = "false"
                        url_entries = tweet['entities']['urls']
                        if url_entries:
                            exist = "true"
                            for entry in url_entries:
                                title = entry['url_title']
                                if title is None:
                                    # "-" is the placeholder for a missing
                                    # title.
                                    titles += ",-"
                                else:
                                    title = title.encode('ascii', 'ignore')
                                    titles += "," + str(title)
                                urls += " " + str(entry['expanded_url'])

                        # Combined field built from text, URLs, both user
                        # names and the place string.
                        searchable = (text + " " + urls + " " + uname + " " +
                                      sname + " " + place)

                        for field_name, value in (
                                ("lookup", searchable),
                                ("text", text),
                                ("user_name", uname),
                                ("screen_name", sname),
                                ("tweet_id", tid),
                                ("created_at", created),
                                ("geo_lat", lat),
                                ("geo_lng", lng),
                                ("url_exist", exist),
                                ("url_url", urls),
                                ("url_title", titles),
                                ("timestamp", tstamp),
                        ):
                            doc.add(Field(field_name, value, t2))
                        writer.addDocument(doc)
                    except Exception as e:
                        # Best effort: one malformed tweet must not stop the
                        # batch, but the failure is no longer hidden.
                        print("skipping tweet in " + path + ": " + repr(e))
开发者ID:christian91cruz,项目名称:Twitter-Search-Engine-and-Data-Retrieval,代码行数:86,代码来源:indexTweets.py

示例4: RAMDirectory

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

# Start the JVM (headless AWT) only when this file is run as a script.
if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

# NOTE(review): everything below sits outside the __main__ guard, so it also
# runs on import -- confirm that is intended.

# Index writer over a RAMDirectory; the analyzer is a StandardAnalyzer wrapped
# in LimitTokenCountAnalyzer with a limit argument of 100.
directory = RAMDirectory()
iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
iwriter = IndexWriter(directory, iconfig)

# Field type: stored, tokenized, term vectors with offsets and positions,
# indexed with docs, frequencies, positions and offsets.
ft = FieldType()
ft.setStored(True)
ft.setTokenized(True)
ft.setStoreTermVectors(True)
ft.setStoreTermVectorOffsets(True)
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

# Two sample texts, each indexed as its own document under "fieldname".
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)

# Commit and close the writer before opening a reader on the same directory.
iwriter.commit()
iwriter.close()
ireader = DirectoryReader.open(directory)

# Fetch the "fieldname" term vector for each document number 0..len(ts)-1.
for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
开发者ID:svn2github,项目名称:pylucene,代码行数:33,代码来源:TermPositionVector.py

示例5: open_searcher

# 需要导入模块: from org.apache.lucene.document import FieldType [as 别名]
# 或者: from org.apache.lucene.document.FieldType import setStoreTermVectorPositions [as 别名]
    return writer

def open_searcher(writer):
    """Return a ``(reader, searcher)`` pair built from ``writer``.

    The reader comes from ``writer.getReader()``; the searcher is an
    IndexSearcher wrapping that same reader.
    """
    # Imported at call time rather than at module level.
    from org.apache.lucene.search import IndexSearcher
    index_reader = writer.getReader()
    return index_reader, IndexSearcher(index_reader)

from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import Term
# Field type initialised from TextField.TYPE_NOT_STORED, then set to indexed
# and tokenized, with term vectors enabled and term-vector positions disabled.
vectorFieldType = FieldType(TextField.TYPE_NOT_STORED)
vectorFieldType.setIndexed(True)
vectorFieldType.setTokenized(True)
vectorFieldType.setStoreTermVectors(True)
vectorFieldType.setStoreTermVectorPositions(False)

# Module-level writer for 'data/index' (open_writer is not shown in this
# excerpt).
writer = open_writer('data/index')

def addToIndex(lxmlNode):
    uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource')
    print uri
    seen = set()
    doc = Document()
    for fieldName in FIELD_NAMES:
        seen.clear()
        for subpath in [
            '', '/*/rdfs:label', '/*/skos:prefLabel', '/*/skos:altLabel',
            '/*/dcterms:title', '/*/foaf:name']:
            for value in xpath(lxmlNode, '//%(fieldName)s%(subpath)s/text()' % locals()):
                if value in seen:
开发者ID:komax,项目名称:tirza,代码行数:33,代码来源:tirza.py


注:本文中的org.apache.lucene.document.FieldType.setStoreTermVectorPositions方法示例由纯净天空整理自Github/MSDocs等开源代码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。