2019-06-04 09:09:58 +00:00
import tantivy
2019-08-02 21:32:08 +00:00
import pytest
2022-04-27 01:48:19 +00:00
from tantivy import Document , Index , SchemaBuilder
2019-08-02 11:23:10 +00:00
2019-08-29 00:55:36 +00:00
2019-08-02 11:23:10 +00:00
def schema():
    """Build the text-only schema used by the shared test index.

    Returns:
        The result of ``SchemaBuilder.build()`` after registering one stored
        text field and one plain text field.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(" title ", stored=True)
    builder = builder.add_text_field(" body ")
    return builder.build()
2023-01-17 00:43:39 +00:00
def schema_numeric_fields():
    """Build a schema mixing an integer, a float and a text field.

    Returns:
        The result of ``SchemaBuilder.build()`` after registering a stored,
        indexed integer field, a stored, indexed float field and a stored
        text field.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field(" id ", stored=True, indexed=True)
    builder = builder.add_float_field(" rating ", stored=True, indexed=True)
    builder = builder.add_text_field(" body ", stored=True)
    return builder.build()
2019-06-04 09:09:58 +00:00
2019-10-01 16:50:09 +00:00
def create_index(dir=None):
    """Create an index over ``schema()`` and load three documents into it.

    All tests are assumed to share these documents for now; tests needing
    different data set up their own function-local index.

    Args:
        dir: Forwarded unchanged as the second positional argument of
            ``Index``. Defaults to ``None``.

    Returns:
        The ``Index`` after the writer has committed and the index has been
        reloaded.
    """
    index = Index(schema(), dir)
    writer = index.writer()

    # Variant 1: create a Document instance and add field-value pairs to it.
    old_man = Document()
    old_man.add_text(" title ", " The Old Man and the Sea ")
    old_man_body = (
        " He was an old man who fished alone in a skiff in "
        " the Gulf Stream and he had gone eighty-four days "
        " now without taking a fish. "
    )
    old_man.add_text(" body ", old_man_body)
    writer.add_document(old_man)

    # Variant 2: build the Document from a dict; keys need to coincide with
    # the field names.
    mice_and_men_fields = {
        " title ": " Of Mice and Men ",
        " body ": (
            " A few miles south of Soledad, the Salinas River drops "
            " in close to the hillside bank and runs deep and "
            " green. The water is warm too, for it has slipped "
            " twinkling over the yellow sands in the sunlight "
            " before reaching the narrow pool. On one side of the "
            " river the golden foothill slopes curve up to the "
            " strong and rocky Gabilan Mountains, but on the valley "
            " side the water is lined with trees—willows fresh and "
            " green with every spring, carrying in their lower leaf "
            " junctures the debris of the winter’ s flooding; and "
            " sycamores with mottled, white, recumbent limbs and "
            " branches that arch over the pool "
        ),
    }
    writer.add_document(Document.from_dict(mice_and_men_fields))

    # Variant 3: hand a raw JSON string straight to the writer.
    frankenstein_json = (
        ' {\n'
        '" title " : [ " Frankenstein " , " The Modern Prometheus " ] ,\n'
        '" body " : " You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking. "\n'
        '} '
    )
    writer.add_json(frankenstein_json)

    writer.commit()
    index.reload()
    return index
2019-08-03 11:28:12 +00:00
2023-01-17 00:43:39 +00:00
def create_index_with_numeric_fields(dir=None):
    """Create an index over ``schema_numeric_fields()`` holding two documents.

    Args:
        dir: Forwarded unchanged as the second positional argument of
            ``Index``. Defaults to ``None``.

    Returns:
        The ``Index`` after the writer has committed and the index has been
        reloaded.
    """
    index = Index(schema_numeric_fields(), dir)
    writer = index.writer()

    # First document: populated field by field.
    first = Document()
    first.add_integer(" id ", 1)
    first.add_float(" rating ", 3.5)
    first_body = (
        " He was an old man who fished alone in a skiff in "
        " the Gulf Stream and he had gone eighty-four days "
        " now without taking a fish. "
    )
    first.add_text(" body ", first_body)
    writer.add_document(first)

    # Second document: built in one go from a dict keyed by field name.
    second_fields = {
        " id ": 2,
        " rating ": 4.5,
        " body ": (
            " A few miles south of Soledad, the Salinas River drops "
            " in close to the hillside bank and runs deep and "
            " green. The water is warm too, for it has slipped "
            " twinkling over the yellow sands in the sunlight "
            " before reaching the narrow pool. On one side of the "
            " river the golden foothill slopes curve up to the "
            " strong and rocky Gabilan Mountains, but on the valley "
            " side the water is lined with trees—willows fresh and "
            " green with every spring, carrying in their lower leaf "
            " junctures the debris of the winter’ s flooding; and "
            " sycamores with mottled, white, recumbent limbs and "
            " branches that arch over the pool "
        ),
    }
    writer.add_document(Document.from_dict(second_fields))

    writer.commit()
    index.reload()
    return index
2019-06-04 09:09:58 +00:00
2019-10-01 16:50:09 +00:00
@pytest.fixture()
def dir_index(tmpdir):
    """Provide an index created in ``tmpdir`` together with that directory.

    Returns:
        A ``(tmpdir, index)`` tuple, where ``index`` comes from
        ``create_index(str(tmpdir))``.
    """
    index = create_index(str(tmpdir))
    return (tmpdir, index)
@pytest.fixture(scope=" class ")
def ram_index():
    """Provide the index returned by ``create_index()`` called without a directory."""
    index = create_index()
    return index
2023-01-17 00:43:39 +00:00
@pytest.fixture(scope=" class ")
def ram_index_numeric_fields():
    """Provide the index returned by ``create_index_with_numeric_fields()`` called without a directory."""
    index = create_index_with_numeric_fields()
    return index
2019-08-03 11:28:12 +00:00
class TestClass(object):
    """Query parsing and search tests against the shared and ad-hoc indexes."""

    def test_simple_search_in_dir(self, dir_index):
        """A two-word query on the directory-backed index returns one hit."""
        _tmp_path, index = dir_index
        parsed = index.parse_query(" sea whale ", [" title ", " body "])

        outcome = index.searcher().search(parsed, 10)
        hit_count = len(outcome.hits)
        assert hit_count == 1

    def test_simple_search_after_reuse(self, dir_index):
        """A second ``Index`` constructed on the same directory gives the same result."""
        index_dir, _existing = dir_index
        reopened = Index(schema(), str(index_dir))
        parsed = reopened.parse_query(" sea whale ", [" title ", " body "])

        outcome = reopened.searcher().search(parsed, 10)
        hit_count = len(outcome.hits)
        assert hit_count == 1

    def test_simple_search_in_ram(self, ram_index):
        """The single hit resolves to the expected stored title."""
        index = ram_index
        parsed = index.parse_query(" sea whale ", [" title ", " body "])

        outcome = index.searcher().search(parsed, 10)
        assert len(outcome.hits) == 1

        # Each hit is a pair; only the second element (the address) is needed.
        first_hit = outcome.hits[0]
        _score, doc_address = first_hit
        fetched = index.searcher().doc(doc_address)
        assert fetched[" title "] == [" The Old Man and the Sea "]

    def test_and_query(self, ram_index):
        """``AND`` queries only match documents satisfying both clauses."""
        index = ram_index
        missing_term_query = index.parse_query(
            " title:men AND body:summer ",
            default_field_names=[" title ", " body "],
        )
        searcher = index.searcher()

        # Intersection is empty: "summer" isn't present.
        outcome = searcher.search(missing_term_query, 10)
        assert len(outcome.hits) == 0

        present_term_query = index.parse_query(
            " title:men AND body:winter ", [" title ", " body "]
        )
        outcome = searcher.search(present_term_query)
        assert len(outcome.hits) == 1

    def test_and_query_numeric_fields(self, ram_index_numeric_fields):
        """Queries on the float and integer fields match by value."""
        index = ram_index_numeric_fields
        searcher = index.searcher()

        # Float value present in exactly one document.
        by_rating = index.parse_query(" 3.5 ", [" rating "])
        outcome = searcher.search(by_rating)
        assert len(outcome.hits) == 1
        matched_address = outcome.hits[0][1]
        assert searcher.doc(matched_address)[" rating "][0] == 3.5

        # Integer value present in exactly one document.
        by_id = index.parse_query(" 1 ", [" id "])
        outcome = searcher.search(by_id)
        assert len(outcome.hits) == 1

        # Integer value present in no document.
        by_id = index.parse_query(" 10 ", [" id "])
        outcome = searcher.search(by_id)
        assert len(outcome.hits) == 0

    def test_and_query_parser_default_fields(self, ram_index):
        """With one default field, the parsed query's repr is a single term query."""
        parsed = ram_index.parse_query(" winter ", default_field_names=[" title "])
        expected_repr = ' Query(TermQuery(Term(type=Str, field=0, " winter "))) '
        assert repr(parsed) == expected_repr

    def test_and_query_parser_default_fields_undefined(self, ram_index):
        """Without default fields, the parsed query's repr covers both fields."""
        parsed = ram_index.parse_query(" winter ")
        expected_repr = ' Query(BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, " winter "))), (Should, TermQuery(Term(type=Str, field=1, " winter ")))] }) '
        assert repr(parsed) == expected_repr

    def test_query_errors(self, ram_index):
        """Querying a field the schema does not define raises ``ValueError``."""
        index = ram_index
        # There is no "bod" field.
        with pytest.raises(ValueError):
            index.parse_query(" bod:men ", [" title ", " body "])

    def test_order_by_search(self):
        """Results ordered by the ``order`` field come back highest value first."""
        builder = SchemaBuilder()
        builder = builder.add_unsigned_field(" order ", fast=" single ")
        builder = builder.add_text_field(" title ", stored=True)
        index = Index(builder.build())
        writer = index.writer()

        # (order value, title) for each document, in insertion order.
        rows = [
            (0, " Test title "),
            (2, " Final test title "),
            (1, " Another test title "),
        ]
        for order_value, title in rows:
            entry = Document()
            entry.add_unsigned(" order ", order_value)
            entry.add_text(" title ", title)
            writer.add_document(entry)

        writer.commit()
        index.reload()

        parsed = index.parse_query(" test ")
        searcher = index.searcher()

        # Skipping the first two of three matches leaves one hit.
        outcome = searcher.search(parsed, 10, offset=2, order_by_field=" order ")
        assert len(outcome.hits) == 1

        outcome = searcher.search(parsed, 10, order_by_field=" order ")
        assert len(outcome.hits) == 3

        expected_titles = [
            " Final test title ",
            " Another test title ",
            " Test title ",
        ]
        for position, expected_title in enumerate(expected_titles):
            _score, doc_address = outcome.hits[position]
            fetched = index.searcher().doc(doc_address)
            assert fetched[" title "] == [expected_title]

    def test_order_by_search_without_fast_field(self):
        """Ordering by a field declared without ``fast`` yields no hits here.

        NOTE(review): the document built below is never passed to the writer
        and nothing is committed, so the index is empty when it is searched.
        Confirm whether that is intentional.
        """
        builder = SchemaBuilder()
        builder = builder.add_unsigned_field(" order ")
        builder = builder.add_text_field(" title ", stored=True)
        index = Index(builder.build())
        writer = index.writer()

        entry = Document()
        entry.add_unsigned(" order ", 0)
        entry.add_text(" title ", " Test title ")

        parsed = index.parse_query(" test ")
        searcher = index.searcher()
        outcome = searcher.search(parsed, 10, order_by_field=" order ")
        assert len(outcome.hits) == 0
2019-09-09 12:30:50 +00:00
class TestUpdateClass(object):
    """Tests for deleting documents through the index writer."""

    def test_delete_update(self, ram_index):
        """Deleting by term removes the matching document after commit and reload."""
        parsed = ram_index.parse_query(" Frankenstein ", [" title "])

        before = ram_index.searcher().search(parsed, 10)
        assert len(before.hits) == 1

        writer = ram_index.writer()

        # A field name that is not in the schema is rejected.
        with pytest.raises(ValueError):
            writer.delete_documents(" fake_field ", " frankenstein ")

        # A bytes value for the text field is rejected.
        with pytest.raises(ValueError):
            writer.delete_documents(" title ", b" frankenstein ")

        writer.delete_documents(" title ", " frankenstein ")
        writer.commit()
        ram_index.reload()

        after = ram_index.searcher().search(parsed)
        assert len(after.hits) == 0
2019-09-09 12:30:50 +00:00
2019-08-03 11:28:12 +00:00
class TestFromDiskClass(object):
    """Tests for reopening an index from a directory and for reader reloading."""

    def test_opens_from_dir_invalid_schema(self, dir_index):
        """Reusing an index directory with a different schema raises ``ValueError``."""
        invalid_schema = SchemaBuilder().add_text_field(" 🐱 ").build()
        index_dir, _ = dir_index
        with pytest.raises(ValueError):
            Index(invalid_schema, str(index_dir), reuse=True)

    def test_opens_from_dir(self, dir_index):
        """Reusing an index directory with the original schema exposes its three documents."""
        index_dir, _ = dir_index
        index = Index(schema(), str(index_dir), reuse=True)
        assert index.searcher().num_docs == 3

    def test_create_readers(self):
        """Exercise ``config_reader`` with the Manual and OnCommit settings.

        With the Manual setting a commit only becomes visible to new
        searchers after ``reload()``. With the OnCommit setting the second
        commit is expected to become visible without an explicit reload,
        which is polled for up to five seconds.

        Raises:
            AssertionError: If any document count check fails, or the second
                commit is not visible within the polling window.
        """
        import time

        idx = Index(schema())
        idx.config_reader(" Manual ", 4)
        assert idx.searcher().num_docs == 0

        writer = idx.writer(30000000, 1)
        writer.add_document(Document(title=" mytitle ", body=" mybody "))
        writer.commit()
        # The reader was configured as Manual above, so the commit must not
        # be visible yet.
        assert idx.searcher().num_docs == 0

        # Changes are reflected only once the index is manually reloaded.
        idx.reload()
        assert idx.searcher().num_docs == 1

        idx.config_reader(" OnCommit ", 4)
        writer.add_document(Document(title=" mytitle2 ", body=" mybody2 "))
        writer.commit()

        # The index should be reloaded automatically.
        # Wait for at most 5s (50 polls of 0.1s) for that to happen.
        for _ in range(50):
            time.sleep(0.1)
            if idx.searcher().num_docs == 2:
                break
        else:
            # Explicit raise instead of ``assert False``: it survives
            # ``python -O`` and explains the failure.
            raise AssertionError(
                "second commit was not visible within 5 seconds"
            )
2019-08-29 00:55:36 +00:00
2019-08-02 11:23:10 +00:00
class TestSearcher(object):
    """Tests for the searcher's string representation."""

    def test_searcher_repr(self, ram_index, ram_index_numeric_fields):
        """``repr`` of a searcher reports document and segment counts."""
        text_searcher_repr = repr(ram_index.searcher())
        assert text_searcher_repr == " Searcher(num_docs=3, num_segments=1) "

        numeric_searcher_repr = repr(ram_index_numeric_fields.searcher())
        assert numeric_searcher_repr == " Searcher(num_docs=2, num_segments=1) "
2019-08-02 11:23:10 +00:00
class TestDocument(object):
    """Tests for constructing and inspecting ``tantivy.Document`` objects."""

    def test_document(self):
        """Keyword arguments become list-valued fields on the document."""
        bill = tantivy.Document(name=" Bill ", reference=[1, 2])

        assert bill[" reference "] == [1, 2]
        assert bill[" name "] == [" Bill "]

        assert bill.get_first(" name ") == " Bill "
        assert bill.get_first(" reference ") == 1

        expected_dict = {" name ": [" Bill "], " reference ": [1, 2]}
        assert bill.to_dict() == expected_dict

    def test_document_with_date(self):
        """A ``datetime`` value is returned unchanged from the document."""
        import datetime

        moment = datetime.datetime(2019, 8, 12, 13, 0, 0)
        bill = tantivy.Document(name=" Bill ", date=moment)
        stored_dates = bill[" date "]
        assert stored_dates[0] == moment

    def test_document_repr(self):
        """``repr`` lists each field with its values."""
        bill = tantivy.Document(name=" Bill ", reference=[1, 2])
        expected_repr = " Document(name=[Bill],reference=[1,2]) "
        assert repr(bill) == expected_repr

    def test_document_repr_utf8(self):
        """``repr`` handles non-ASCII field values."""
        veggie = tantivy.Document(name=" 野菜食べないとやばい ", reference=[1, 2])
        expected_repr = " Document(name=[野菜食べないとやばい],reference=[1,2]) "
        assert repr(veggie) == expected_repr

    def test_document_with_facet(self):
        """Facets added to a document expose their path segments and string form."""
        # Plain two-segment facet.
        france_doc = tantivy.Document()
        france = tantivy.Facet.from_string(" /europe/france ")
        france_doc.add_facet(" facet ", france)
        assert france_doc[" facet "][0].to_path() == [" europe ", " france "]

        # Facet whose source string contains a backslash before a slash.
        fiji_doc = tantivy.Document()
        fiji = tantivy.Facet.from_string(" /asia \\ /oceania/fiji ")
        fiji_doc.add_facet(" facet ", fiji)
        assert fiji_doc[" facet "][0].to_path() == [" asia/oceania ", " fiji "]
        assert fiji_doc[" facet "][0].to_path_str() == " /asia \\ /oceania/fiji "
        assert repr(fiji_doc[" facet "][0]) == " Facet(/asia \\ /oceania/fiji) "

        # The same facet passed as a constructor keyword argument.
        kwarg_doc = tantivy.Document(facet=fiji)
        assert kwarg_doc[" facet "][0].to_path() == [" asia/oceania ", " fiji "]

    def test_document_error(self):
        """A dict field value is rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            tantivy.Document(name={})
2022-04-15 03:50:37 +00:00
class TestJsonField:
    """Tests for querying documents stored in a JSON field."""

    def test_query_from_json_field(self):
        """Keys inside the JSON field can be addressed in parsed queries."""
        builder = SchemaBuilder()
        builder = builder.add_json_field(
            " attributes ",
            stored=True,
            tokenizer_name=" default ",
            index_option=" position ",
        )
        index = Index(builder.build())
        writer = index.writer()

        # One JSON payload per document, added in this order.
        payloads = (
            (
                ' {\n'
                '" order " : 1.1 ,\n'
                '" target " : " submit-button " ,\n'
                '" cart " : { " product_id " : 103 } ,\n'
                '" description " : " the best vacuum cleaner ever "\n'
                '} '
            ),
            (
                ' {\n'
                '" order " : 1.2 ,\n'
                '" target " : " submit-button " ,\n'
                '" cart " : { " product_id " : 133 } ,\n'
                '" description " : " das keyboard "\n'
                '} '
            ),
        )
        for payload in payloads:
            entry = Document()
            entry.add_json(" attributes ", payload)
            writer.add_document(entry)

        writer.commit()
        index.reload()

        # (query text, expected number of hits)
        expectations = (
            (" target:submit-button ", 2),
            (" target:submit ", 2),
            (" order:1.1 ", 1),
        )
        for query_text, expected_hits in expectations:
            parsed = index.parse_query(query_text, [" attributes "])
            outcome = index.searcher().search(parsed, 2)
            assert len(outcome.hits) == expected_hits

        # Disabled nested-path checks, kept for reference:
        # query = index.parse_query_for_attributes("cart.product_id:103")
        # result = index.searcher().search(query, 1)
        # assert len(result.hits) == 1
        # query = index.parse_query_for_attributes(
        #     "target:submit-button AND cart.product_id:133"
        # )
        # result = index.searcher().search(query, 2)
        # assert len(result.hits) == 1