本文整理汇总了Python中pysolr.Solr.extract方法的典型用法代码示例。如果您正苦于以下问题:Python Solr.extract方法的具体用法?Python Solr.extract怎么用?Python Solr.extract使用的例子?那么恭喜您,这里精选的方法代码示例或许可以为您提供帮助。您也可以进一步了解该方法所在类pysolr.Solr的用法示例。
在下文中一共展示了Solr.extract方法的6个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于系统推荐出更棒的Python代码示例。
示例1: SolrSearchBackend
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
hits -= 1
return {
'results': results,
'hits': hits,
'stats': stats,
'facets': facets,
'spelling_suggestion': spelling_suggestion,
}
def build_schema(self, fields):
    """Describe every index field as a Solr schema entry.

    ``fields`` is a mapping whose values expose ``index_fieldname``,
    ``field_type``, ``document``, ``is_multivalued``, ``stored`` and
    ``indexed`` (and optionally ``facet_for``).

    Returns a ``(content_field_name, schema_fields)`` tuple:
    ``content_field_name`` is the index name of the last field whose
    ``document`` attribute is ``True`` (``''`` when there is none) and
    ``schema_fields`` is a list with one dict per field, holding the keys
    ``name``, ``type``, ``indexed``, ``stored`` and ``multiValued``.
    """
    # Field types that map to something other than the ``text_en`` default.
    # DRL_FIXME: Perhaps move to something where, if none of these
    #            match, call a custom method on the form that
    #            returns, per-backend, the right type of storage?
    solr_types = {
        'date': 'date',
        'datetime': 'date',
        'integer': 'long',
        'float': 'float',
        'boolean': 'boolean',
        'ngram': 'ngram',
        'edge_ngram': 'edge_ngram',
        'location': 'location',
    }
    document_field = ''
    entries = []

    for _unused_key, field in fields.items():
        solr_type = solr_types.get(field.field_type, 'text_en')
        is_indexed = field.indexed is not False
        is_facet = hasattr(field, 'facet_for')

        # Text that is not indexed, or that backs a ``FacetField``, must not
        # go through the lowercase/tokenize/stemming analysis, so it is
        # stored as a plain string instead.
        if solr_type == 'text_en' and (is_facet or not is_indexed):
            solr_type = 'string'

        if field.document is True:
            document_field = field.index_fieldname

        entries.append({
            'name': field.index_fieldname,
            'type': solr_type,
            'indexed': 'true' if is_indexed else 'false',
            'stored': 'false' if field.stored is False else 'true',
            'multiValued': 'true' if field.is_multivalued else 'false',
        })

    return (document_field, entries)
def extract_file_contents(self, file_obj):
    """Pull text and metadata out of a structured file (PDF, MS Word, etc.)

    The work is delegated to Solr's ExtractingRequestHandler, which is built
    on Apache Tika. See the Solr wiki for details:

        http://wiki.apache.org/solr/ExtractingRequestHandler

    That handler normally replaces Haystack's indexing process outright and
    brings unfortunate restrictions with it: a single file per request, no
    way to modify the extracted data before it lands in the index, and so
    on. To keep things simple and leave room for more advanced use, the
    extract-only mode is used here: the data is handed back without being
    indexed, so it can flow through Haystack's normal templating process.

    Returns ``None`` when extraction fails (the failure is logged as a
    warning); otherwise returns a dictionary containing at least two keys:

        :contents:
                    Extracted full-text content, if applicable
        :metadata:
                    key:value pairs of text strings
    """
    try:
        extracted = self.conn.extract(file_obj)
    except Exception as exc:
        # Best effort by design: report the problem and fall back to None.
        log_context = {"data": {"file": file_obj}}
        self.log.warning(u"Unable to extract file contents: %s", exc,
                         exc_info=True, extra=log_context)
        extracted = None
    return extracted
示例2: SolrTestCase
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
def test__select(self):
    """Run ``_select`` with a short query and with a very long one."""
    # Short params: the fixture data is expected to hold three matches.
    short_payload = json.loads(self.solr._select({'q': 'doc'}))
    self.assertEqual(short_payload['response']['numFound'], 3)

    # Long params: a 3 * 1024 character query should match nothing and be
    # echoed back in full in the response header.
    long_query = 'doc' * 1024
    long_payload = json.loads(self.solr._select({'q': long_query}))
    self.assertEqual(long_payload['response']['numFound'], 0)
    echoed_query = long_payload['responseHeader']['params']['q']
    self.assertEqual(len(echoed_query), 3 * 1024)
def test__mlt(self):
    """``_mlt`` for ``id:doc_1`` on the title field should find nothing."""
    mlt_params = {'q': 'id:doc_1', 'mlt.fl': 'title'}
    parsed = json.loads(self.solr._mlt(mlt_params))
    self.assertEqual(parsed['response']['numFound'], 0)
def test__suggest_terms(self):
    """A request carrying only ``terms.fl`` should report zero documents."""
    # NOTE(review): despite the test name, the request goes through
    # ``_select`` -- confirm whether a terms-specific call was intended.
    terms_params = {'terms.fl': 'title'}
    parsed = json.loads(self.solr._select(terms_params))
    self.assertEqual(parsed['response']['numFound'], 0)
def test__update(self):
    """Posting an ``<add>`` document via ``_update`` should yield status 0."""
    add_doc = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    raw_response = self.solr._update(add_doc)
    status_ok = '<int name="status">0</int>' in raw_response
    self.assertTrue(status_ok)
def test__soft_commit(self):
    """``_update`` with ``softCommit=True`` should also yield status 0."""
    add_doc = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    raw_response = self.solr._update(add_doc, softCommit=True)
    status_ok = '<int name="status">0</int>' in raw_response
    self.assertTrue(status_ok)
def test__extract_error(self):
    """Feed ``_extract_error`` hand-built responses and check its message."""

    class RubbishResponse(object):
        """Bare response stand-in with ``content``, ``headers`` and ``json()``."""

        def __init__(self, content, headers=None):
            # Byte payloads are decoded so ``content`` is always text.
            self.content = content.decode('utf-8') if isinstance(content, bytes) else content
            self.headers = {} if headers is None else headers

        def json(self):
            return json.loads(self.content)

    # (payload, headers, expected message), checked in this order.
    scenarios = (
        # Just the reason.
        ("We don't care.", {'reason': 'Something went wrong.'},
         "[Reason: Something went wrong.]"),
        # Empty reason.
        ("We don't care.", {'reason': None},
         "[Reason: None]\nWe don't care."),
        # No reason. Time to scrape.
        ('<html><body><pre>Something is broke.</pre></body></html>', {'server': 'jetty'},
         "[Reason: Something is broke.]"),
        # No reason. JSON response.
        (b'\n {"error": {"msg": "It happens"}}', {'server': 'tomcat'},
         "[Reason: It happens]"),
        # No reason. Weird JSON response.
        (b'{"kinda": "weird"}', {'server': 'jetty'},
         '[Reason: None]\n{"kinda": "weird"}'),
    )
    for payload, headers, expected in scenarios:
        fake_response = RubbishResponse(payload, headers)
        self.assertEqual(self.solr._extract_error(fake_response), expected)
示例3: SolrTestCase
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
# Test a non-existent URL.
old_url = self.solr.url
self.solr.url = 'http://127.0.0.1:567898/wahtever'
self.assertRaises(SolrError, self.solr._send_request, 'get', 'select/?q=doc&wt=json')
self.solr.url = old_url
def test__select(self):
    """Check ``_select`` for both a short and an oversized ``q`` parameter."""
    def run_query(query):
        # Issue the select and decode its JSON body.
        return json.loads(self.solr._select({'q': query}))

    # Short params.
    brief = run_query('doc')
    self.assertEqual(brief['response']['numFound'], 3)

    # Long params: no hits expected, and the whole query must come back in
    # the echoed request parameters.
    lengthy = run_query('doc' * 1024)
    self.assertEqual(lengthy['response']['numFound'], 0)
    self.assertEqual(len(lengthy['responseHeader']['params']['q']), 3 * 1024)
def test__mlt(self):
    """A more-like-this request for ``id:doc_1`` should report no documents."""
    body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
    found = json.loads(body)['response']['numFound']
    self.assertEqual(found, 0)
def test__suggest_terms(self):
    """Sending just ``terms.fl`` should produce a response with zero hits."""
    # NOTE(review): this uses ``_select`` even though the test is named
    # after term suggestion -- verify that is deliberate.
    body = self.solr._select({'terms.fl': 'title'})
    found = json.loads(body)['response']['numFound']
    self.assertEqual(found, 0)
def test__update(self):
    """The reply to an ``<add>`` sent through ``_update`` must carry status 0."""
    expected_fragment = '<int name="status">0</int>'
    payload = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    reply = self.solr._update(payload)
    self.assertTrue(expected_fragment in reply)
def test__extract_error(self):
    """Verify the message ``_extract_error`` builds for three fake responses."""

    class RubbishResponse(object):
        """Response stand-in carrying only ``content`` and ``headers``."""

        def __init__(self, content, headers=None):
            self.content = content
            # A missing header mapping becomes an empty dict.
            self.headers = {} if headers is None else headers

    def error_for(content, headers):
        # Build the fake response and run it through the code under test.
        return self.solr._extract_error(RubbishResponse(content, headers))

    # Just the reason.
    self.assertEqual(
        error_for("We don't care.", {'reason': 'Something went wrong.'}),
        "[Reason: Something went wrong.]",
    )

    # Empty reason.
    self.assertEqual(
        error_for("We don't care.", {'reason': None}),
        "[Reason: None]\nWe don't care.",
    )

    # No reason. Time to scrape.
    self.assertEqual(
        error_for('<html><body><pre>Something is broke.</pre></body></html>', {'server': 'jetty'}),
        "[Reason: Something is broke.]",
    )
def test__scrape_response(self):
    """Check what ``_scrape_response`` returns for three server error pages."""
    # (headers, markup, expected result), checked in this order.
    pages = (
        # Tomcat.
        ({'server': 'coyote'},
         '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>',
         ('messed up.', '')),
        # Jetty.
        ({'server': 'jetty'},
         '<html><body><pre>Something is broke.</pre></body></html>',
         ('Something is broke.', u'')),
        # Broken Tomcat.
        ({'server': 'coyote'},
         '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>',
         (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>')),
    )
    for headers, markup, expected in pages:
        scraped = self.solr._scrape_response(headers, markup)
        self.assertEqual(scraped, expected)
示例4: SolrSearchBackend
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
results = []
hits = raw_results.hits
facets = {}
stats = {}
spelling_suggestion = spelling_suggestions = None
if result_class is None:
result_class = SearchResult
if hasattr(raw_results, "stats"):
stats = raw_results.stats.get("stats_fields", {})
if hasattr(raw_results, "facets"):
facets = {
"fields": raw_results.facets.get("facet_fields", {}),
"dates": raw_results.facets.get("facet_dates", {}),
"queries": raw_results.facets.get("facet_queries", {}),
}
for key in ["fields"]:
for facet_field in facets[key]:
# Convert to a two-tuple, as Solr's json format returns a list of
# pairs.
facets[key][facet_field] = list(
zip(
facets[key][facet_field][::2],
facets[key][facet_field][1::2],
)
)
if self.include_spelling and hasattr(raw_results, "spellcheck"):
try:
spelling_suggestions = self.extract_spelling_suggestions(raw_results)
except Exception as exc:
self.log.error(
"Error extracting spelling suggestions: %s",
exc,
exc_info=True,
extra={"data": {"spellcheck": raw_results.spellcheck}},
)
if not self.silently_fail:
raise
spelling_suggestions = None
if spelling_suggestions:
# Maintain compatibility with older versions of Haystack which returned a single suggestion:
spelling_suggestion = spelling_suggestions[-1]
assert isinstance(spelling_suggestion, six.string_types)
else:
spelling_suggestion = None
unified_index = connections[self.connection_alias].get_unified_index()
indexed_models = unified_index.get_indexed_models()
for raw_result in raw_results.docs:
app_label, model_name = raw_result[DJANGO_CT].split(".")
additional_fields = {}
model = haystack_get_model(app_label, model_name)
if model and model in indexed_models:
index = unified_index.get_index(model)
index_field_map = index.field_map
for key, value in raw_result.items():
示例5: SolrTestCase
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
# Long params.
resp_body = self.solr._select({"q": "doc" * 1024})
resp_data = json.loads(resp_body)
self.assertEqual(resp_data["response"]["numFound"], 0)
self.assertEqual(len(resp_data["responseHeader"]["params"]["q"]), 3 * 1024)
# Test Deep Pagination CursorMark
resp_body = self.solr._select({"q": "*", "cursorMark": "*", "sort": "id desc", "start": 0, "rows": 2})
resp_data = json.loads(resp_body)
self.assertEqual(len(resp_data["response"]["docs"]), 2)
self.assertIn("nextCursorMark", resp_data)
def test__mlt(self):
    """``_mlt`` seeded with ``id:doc_1`` should return an empty result set."""
    request = {"q": "id:doc_1", "mlt.fl": "title"}
    decoded = json.loads(self.solr._mlt(request))
    hit_count = decoded["response"]["numFound"]
    self.assertEqual(hit_count, 0)
def test__suggest_terms(self):
    """A ``terms.fl``-only request should come back with ``numFound`` of 0."""
    # NOTE(review): the call below is ``_select``, not a terms-specific
    # helper -- confirm against the intent of this test.
    request = {"terms.fl": "title"}
    decoded = json.loads(self.solr._select(request))
    hit_count = decoded["response"]["numFound"]
    self.assertEqual(hit_count, 0)
def test__update(self):
    """``_update`` should answer an ``<add>`` message with a status of 0."""
    message = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    answer = self.solr._update(message)
    contains_ok_status = '<int name="status">0</int>' in answer
    self.assertTrue(contains_ok_status)
def test__soft_commit(self):
    """A soft-committed ``_update`` should likewise answer with status 0."""
    message = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    answer = self.solr._update(message, softCommit=True)
    contains_ok_status = '<int name="status">0</int>' in answer
    self.assertTrue(contains_ok_status)
def test__extract_error(self):
    """Exercise ``_extract_error`` with reason headers, HTML and JSON bodies."""

    class RubbishResponse(object):
        """Fake response offering ``content``, ``headers`` and ``json()``."""

        def __init__(self, content, headers=None):
            # Normalise byte payloads to text before storing them.
            text = content.decode("utf-8") if isinstance(content, bytes) else content
            self.content = text
            self.headers = headers if headers is not None else {}

        def json(self):
            return json.loads(self.content)

    def expect_error(content, headers, message):
        # Wrap the payload in a fake response and compare the extracted error.
        response = RubbishResponse(content, headers)
        self.assertEqual(self.solr._extract_error(response), message)

    # Just the reason.
    expect_error("We don't care.", {"reason": "Something went wrong."},
                 "[Reason: Something went wrong.]")

    # Empty reason.
    expect_error("We don't care.", {"reason": None},
                 "[Reason: None]\nWe don't care.")

    # No reason. Time to scrape.
    expect_error("<html><body><pre>Something is broke.</pre></body></html>", {"server": "jetty"},
                 "[Reason: Something is broke.]")

    # No reason. JSON response.
    expect_error(b'\n {"error": {"msg": "It happens"}}', {"server": "tomcat"},
                 "[Reason: It happens]")

    # No reason. Weird JSON response.
    expect_error(b'{"kinda": "weird"}', {"server": "jetty"},
                 '[Reason: None]\n{"kinda": "weird"}')
示例6: SolrTestCase
# 需要导入模块: from pysolr import Solr [as 别名]
# 或者: from pysolr.Solr import extract [as 别名]
#.........这里部分代码省略.........
# Long params.
resp_body = self.solr._select({'q': 'doc' * 1024})
resp_data = json.loads(resp_body)
self.assertEqual(resp_data['response']['numFound'], 0)
self.assertEqual(len(resp_data['responseHeader']['params']['q']), 3 * 1024)
# Test Deep Pagination CursorMark
resp_body = self.solr._select({'q': '*', 'cursorMark':'*', 'sort':'id desc', 'start':0, 'rows': 2})
resp_data = json.loads(resp_body)
self.assertEqual(len(resp_data['response']['docs']), 2)
self.assertIn('nextCursorMark', resp_data)
def test__mlt(self):
    """Expect zero documents from ``_mlt`` when seeded with ``id:doc_1``."""
    raw = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
    response_section = json.loads(raw)['response']
    self.assertEqual(response_section['numFound'], 0)
def test__suggest_terms(self):
    """Expect zero documents when only ``terms.fl`` is supplied."""
    # NOTE(review): ``_select`` is what gets called here, not a
    # terms-specific method -- check that this matches the test's intent.
    raw = self.solr._select({'terms.fl': 'title'})
    response_section = json.loads(raw)['response']
    self.assertEqual(response_section['numFound'], 0)
def test__update(self):
    """Send one ``<add>`` document through ``_update`` and expect status 0."""
    ok_marker = '<int name="status">0</int>'
    document_xml = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    server_reply = self.solr._update(document_xml)
    self.assertTrue(ok_marker in server_reply)
def test__soft_commit(self):
    """Send one ``<add>`` document with ``softCommit=True``; expect status 0."""
    ok_marker = '<int name="status">0</int>'
    document_xml = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
    server_reply = self.solr._update(document_xml, softCommit=True)
    self.assertTrue(ok_marker in server_reply)
def test__extract_error(self):
    """Compare ``_extract_error`` output with the expected text, case by case."""

    class RubbishResponse(object):
        """Throwaway response object: ``content``, ``headers`` and ``json()``."""

        def __init__(self, content, headers=None):
            if headers is None:
                headers = {}
            # Bytes are decoded up front so ``content`` is always text.
            if isinstance(content, bytes):
                content = content.decode('utf-8')
            self.headers = headers
            self.content = content

        def json(self):
            return json.loads(self.content)

    cases = [
        # Just the reason.
        {
            'content': "We don't care.",
            'headers': {'reason': 'Something went wrong.'},
            'expected': "[Reason: Something went wrong.]",
        },
        # Empty reason.
        {
            'content': "We don't care.",
            'headers': {'reason': None},
            'expected': "[Reason: None]\nWe don't care.",
        },
        # No reason. Time to scrape.
        {
            'content': '<html><body><pre>Something is broke.</pre></body></html>',
            'headers': {'server': 'jetty'},
            'expected': "[Reason: Something is broke.]",
        },
        # No reason. JSON response.
        {
            'content': b'\n {"error": {"msg": "It happens"}}',
            'headers': {'server': 'tomcat'},
            'expected': "[Reason: It happens]",
        },
        # No reason. Weird JSON response.
        {
            'content': b'{"kinda": "weird"}',
            'headers': {'server': 'jetty'},
            'expected': '[Reason: None]\n{"kinda": "weird"}',
        },
    ]
    for case in cases:
        stand_in = RubbishResponse(case['content'], case['headers'])
        self.assertEqual(self.solr._extract_error(stand_in), case['expected'])