2023-03-26 13:03:31 +00:00
from io import BytesIO
2023-08-26 12:13:29 +00:00
2023-08-04 07:23:31 +00:00
import copy
2023-08-26 12:13:29 +00:00
import datetime
2019-06-04 09:09:58 +00:00
import tantivy
2023-08-26 12:13:29 +00:00
import pickle
2019-08-02 21:32:08 +00:00
import pytest
2022-04-27 01:48:19 +00:00
from tantivy import Document , Index , SchemaBuilder
2019-08-02 11:23:10 +00:00
2019-08-29 00:55:36 +00:00
2019-08-02 11:23:10 +00:00
def schema():
    """Build the two-field text schema used by most tests in this module.

    The first text field is added with ``stored=True``; the second one uses
    the builder's default options.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(" title ", stored=True)
    builder = builder.add_text_field(" body ")
    return builder.build()
2023-07-21 22:13:03 +00:00
2023-01-17 00:43:39 +00:00
def schema_numeric_fields():
    """Build a schema with an integer, a float, a boolean and a text field.

    All four fields are stored; the three non-text fields are also indexed.
    """
    builder = SchemaBuilder()
    builder = builder.add_integer_field(" id ", stored=True, indexed=True)
    builder = builder.add_float_field(" rating ", stored=True, indexed=True)
    builder = builder.add_boolean_field(" is_good ", stored=True, indexed=True)
    builder = builder.add_text_field(" body ", stored=True)
    return builder.build()
2019-06-04 09:09:58 +00:00
2023-07-21 22:13:03 +00:00
2019-10-01 16:50:09 +00:00
def create_index(dir=None):
    """Create an index over ``schema()`` and fill it with three documents.

    Args:
        dir: Forwarded unchanged as the second argument of ``Index``.

    Returns:
        The index, after the writer committed and the index was reloaded.
    """
    # Every test shares this document set for now; tests needing different
    # data build their own function-local index.
    idx = Index(schema(), dir)
    idx_writer = idx.writer(10_000_000, 1)

    # Variant 1: start from an empty Document and add field/value pairs.
    old_man_body = (
        " He was an old man who fished alone in a skiff in "
        " the Gulf Stream and he had gone eighty-four days "
        " now without taking a fish. "
    )
    old_man = Document()
    old_man.add_text(" title ", " The Old Man and the Sea ")
    old_man.add_text(" body ", old_man_body)
    idx_writer.add_document(old_man)

    # Variant 2: build the Document from a dict; the keys have to coincide
    # with the schema's field names.
    mice_and_men = Document.from_dict(
        {
            " title ": " Of Mice and Men ",
            " body ": (
                " A few miles south of Soledad, the Salinas River drops "
                " in close to the hillside bank and runs deep and "
                " green. The water is warm too, for it has slipped "
                " twinkling over the yellow sands in the sunlight "
                " before reaching the narrow pool. On one side of the "
                " river the golden foothill slopes curve up to the "
                " strong and rocky Gabilan Mountains, but on the valley "
                " side the water is lined with trees—willows fresh and "
                " green with every spring, carrying in their lower leaf "
                " junctures the debris of the winter’ s flooding; and "
                " sycamores with mottled, white, recumbent limbs and "
                " branches that arch over the pool "
            ),
        }
    )
    idx_writer.add_document(mice_and_men)

    # Variant 3: hand a raw JSON string to the writer.
    idx_writer.add_json(
        """ {
" title " : [ " Frankenstein " , " The Modern Prometheus " ] ,
" body " : " You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking. "
} """
    )

    idx_writer.commit()
    idx.reload()
    return idx
2019-08-03 11:28:12 +00:00
2023-07-21 22:13:03 +00:00
2023-01-17 00:43:39 +00:00
def create_index_with_numeric_fields(dir=None):
    """Create an index over ``schema_numeric_fields()`` with two documents.

    Args:
        dir: Forwarded unchanged as the second argument of ``Index``.

    Returns:
        The index, after the writer committed and the index was reloaded.
    """
    idx = Index(schema_numeric_fields(), dir)
    idx_writer = idx.writer(10_000_000, 1)

    # First document: populated field by field through the typed adders.
    first_body = (
        " He was an old man who fished alone in a skiff in "
        " the Gulf Stream and he had gone eighty-four days "
        " now without taking a fish. "
    )
    first = Document()
    first.add_integer(" id ", 1)
    first.add_float(" rating ", 3.5)
    first.add_boolean(" is_good ", True)
    first.add_text(" body ", first_body)
    idx_writer.add_document(first)

    # Second document: built in one go from a dict.
    second = Document.from_dict(
        {
            " id ": 2,
            " rating ": 4.5,
            " is_good ": False,
            " body ": (
                " A few miles south of Soledad, the Salinas River drops "
                " in close to the hillside bank and runs deep and "
                " green. The water is warm too, for it has slipped "
                " twinkling over the yellow sands in the sunlight "
                " before reaching the narrow pool. On one side of the "
                " river the golden foothill slopes curve up to the "
                " strong and rocky Gabilan Mountains, but on the valley "
                " side the water is lined with trees—willows fresh and "
                " green with every spring, carrying in their lower leaf "
                " junctures the debris of the winter’ s flooding; and "
                " sycamores with mottled, white, recumbent limbs and "
                " branches that arch over the pool "
            ),
        },
    )
    idx_writer.add_document(second)

    idx_writer.commit()
    idx.reload()
    return idx
2019-06-04 09:09:58 +00:00
2023-07-21 22:13:03 +00:00
2023-02-14 13:20:59 +00:00
def spanish_schema():
    """Build the two-field text schema with an explicit tokenizer name.

    Both text fields pass the same ``tokenizer_name``; only the first one is
    stored.
    """
    builder = SchemaBuilder()
    builder = builder.add_text_field(
        " title ", stored=True, tokenizer_name=" es_stem "
    )
    builder = builder.add_text_field(" body ", tokenizer_name=" es_stem ")
    return builder.build()
def create_spanish_index():
    """Create an index over ``spanish_schema()`` with three Spanish documents.

    ``Index`` receives ``None`` as its second argument.

    Returns:
        The index, after the writer committed and the index was reloaded.
    """
    # Every test shares this document set for now; tests needing different
    # data build their own function-local index.
    idx = Index(spanish_schema(), None)
    idx_writer = idx.writer()

    # Variant 1: start from an empty Document and add field/value pairs.
    viejo_body = (
        " Era un viejo que pescaba solo en un bote en el Gulf Stream y hacía ochenta y cuatro días que no cogía un pez. "
    )
    viejo = Document()
    viejo.add_text(" title ", " El viejo y el mar ")
    viejo.add_text(" body ", viejo_body)
    idx_writer.add_document(viejo)

    # Variant 2: build the Document from a dict; the keys have to coincide
    # with the schema's field names.
    ratones = Document.from_dict(
        {
            " title ": " De ratones y hombres ",
            " body ": (
                " Unas millas al sur de Soledad, el río Salinas se ahonda junto al margen de la ladera y fluye profundo y verde. Es tibia el agua, porque se ha deslizado chispeante sobre la arena amarilla y al calor del sol antes de llegar a la angosta laguna. A un lado del río, la dorada falda de la ladera se curva hacia arriba trepando hasta las montañas Gabilán, fuertes y rocosas, pero del lado del valle los árboles bordean la orilla: sauces frescos y verdes cada primavera, que en la s junturas más bajas de sus hojas muestran las consecuencias de la crecida invernal; y sicomoros de troncos veteados, blancos, recostados, y ramas quesear quean sobre el estanque "
            ),
        }
    )
    idx_writer.add_document(ratones)

    # Variant 3: hand a raw JSON string to the writer.
    idx_writer.add_json(
        """ {
" title " : [ " Frankenstein " , " El moderno Prometeo " ] ,
" body " : " Te alegrará saber que no ha ocurrido ningún percance al principio de una aventura que siempre consideraste cargada de malos presagios. Llegué aquí ayer, y mi primera tarea es asegurarle a mi querida hermana que me hallo perfectamente y que tengo una gran confianza en el éxito de mi empresa. "
} """
    )

    idx_writer.commit()
    idx.reload()
    return idx
2019-10-01 16:50:09 +00:00
@pytest.fixture()
def dir_index(tmpdir):
    """Yield-free fixture returning ``(tmpdir, index)``.

    The index is created by ``create_index`` with the stringified ``tmpdir``
    as its directory argument.
    """
    on_disk_index = create_index(str(tmpdir))
    return tmpdir, on_disk_index
@pytest.fixture(scope=" class ")
def ram_index():
    """Return the sample index built by ``create_index`` with no directory."""
    sample_index = create_index()
    return sample_index
2023-01-17 00:43:39 +00:00
@pytest.fixture(scope=" class ")
def ram_index_numeric_fields():
    """Return the index built by ``create_index_with_numeric_fields``."""
    numeric_index = create_index_with_numeric_fields()
    return numeric_index
2023-03-18 04:56:50 +00:00
@pytest.fixture(scope=" class ")
def spanish_index():
    """Return the index built by ``create_spanish_index``."""
    es_index = create_spanish_index()
    return es_index
2019-08-03 11:28:12 +00:00
class TestClass(object):
    """Query parsing and search tests against the shared sample indexes."""

    def test_simple_search_in_dir(self, dir_index):
        """A two-term query on the directory-backed index yields one hit."""
        _, index = dir_index
        query = index.parse_query(" sea whale ", [" title ", " body "])

        result = index.searcher().search(query, 10)
        assert len(result.hits) == 1

    def test_simple_search_after_reuse(self, dir_index):
        """Opening a second ``Index`` on the same directory finds the data."""
        index_dir, _ = dir_index
        index = Index(schema(), str(index_dir))
        query = index.parse_query(" sea whale ", [" title ", " body "])

        result = index.searcher().search(query, 10)
        assert len(result.hits) == 1

    def test_simple_search_in_ram(self, ram_index):
        """The single hit resolves to the expected stored title."""
        index = ram_index
        query = index.parse_query(" sea whale ", [" title ", " body "])

        result = index.searcher().search(query, 10)
        assert len(result.hits) == 1

        _, doc_address = result.hits[0]
        searched_doc = index.searcher().doc(doc_address)
        assert searched_doc[" title "] == [" The Old Man and the Sea "]

    def test_simple_search_in_spanish(self, spanish_index):
        """A query term that differs from the indexed word still matches.

        NOTE(review): presumably this works through the tokenizer named in
        ``spanish_schema`` -- confirm against the tantivy tokenizer docs.
        """
        index = spanish_index
        query = index.parse_query(" vieja ", [" title ", " body "])

        result = index.searcher().search(query, 10)
        assert len(result.hits) == 1

        _, doc_address = result.hits[0]
        search_doc = index.searcher().doc(doc_address)
        assert search_doc[" title "] == [" El viejo y el mar "]

    def test_and_query(self, ram_index):
        """``AND`` only returns documents matching both clauses."""
        index = ram_index
        searcher = index.searcher()

        # Look for an intersection of documents; "summer" isn't present in
        # any body, so nothing may match.
        query = index.parse_query(
            " title:men AND body:summer ",
            default_field_names=[" title ", " body "],
        )
        result = searcher.search(query, 10)
        assert len(result.hits) == 0

        # "winter" does occur in the body of the document titled with "men".
        query = index.parse_query(
            " title:men AND body:winter ", [" title ", " body "]
        )
        result = searcher.search(query)
        assert len(result.hits) == 1

    def test_and_query_numeric_fields(self, ram_index_numeric_fields):
        """Numeric fields can be queried by their literal value."""
        index = ram_index_numeric_fields
        searcher = index.searcher()

        # 1 result
        float_query = index.parse_query(" 3.5 ", [" rating "])
        result = searcher.search(float_query)
        assert len(result.hits) == 1
        assert searcher.doc(result.hits[0][1])[" rating "][0] == 3.5

        integer_query = index.parse_query(" 1 ", [" id "])
        result = searcher.search(integer_query)
        assert len(result.hits) == 1

        # 0 result
        integer_query = index.parse_query(" 10 ", [" id "])
        result = searcher.search(integer_query)
        assert len(result.hits) == 0

    def test_and_query_parser_default_fields(self, ram_index):
        """With one default field the parsed query is a single term query."""
        query = ram_index.parse_query(" winter ", default_field_names=[" title "])
        assert repr(query) == """ Query(TermQuery(Term(field=0, type=Str, " winter " ))) """

    def test_and_query_parser_default_fields_undefined(self, ram_index):
        """Without default fields the repr shows one clause per field."""
        query = ram_index.parse_query(" winter ")
        assert (
            repr(query)
            == """ Query(BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, " winter " ))), (Should, TermQuery(Term(field=1, type=Str, " winter " )))] }) """
        )

    def test_query_errors(self, ram_index):
        """Querying a field that is not in the schema raises ``ValueError``."""
        index = ram_index
        # no "bod" field
        with pytest.raises(ValueError):
            index.parse_query(" bod:men ", [" title ", " body "])

    def test_order_by_search(self):
        """``order_by_field`` returns hits ordered by the fast field.

        The assertions expect the document with the largest ``order`` value
        first; ``offset=2`` leaves a single hit out of three.
        """
        schema = (
            SchemaBuilder()
            .add_unsigned_field(" order ", fast=True)
            .add_text_field(" title ", stored=True)
            .build()
        )

        index = Index(schema)
        writer = index.writer()

        # (order value, title) in insertion order.
        rows = (
            (0, " Test title "),
            (2, " Final test title "),
            (1, " Another test title "),
        )
        for order, title in rows:
            doc = Document()
            doc.add_unsigned(" order ", order)
            doc.add_text(" title ", title)
            writer.add_document(doc)

        writer.commit()
        index.reload()

        query = index.parse_query(" test ")
        searcher = index.searcher()

        result = searcher.search(query, 10, offset=2, order_by_field=" order ")
        assert len(result.hits) == 1

        result = searcher.search(query, 10, order_by_field=" order ")
        assert len(result.hits) == 3

        # Resolve the hits with the searcher that produced them.
        titles = [searcher.doc(address)[" title "] for _, address in result.hits]
        assert titles == [
            [" Final test title "],
            [" Another test title "],
            [" Test title "],
        ]

    def test_order_by_search_without_fast_field(self):
        """Ordering by a non-fast field on an empty index yields no hits.

        NOTE(review): the document below is built but never added to the
        writer or committed, so the index is empty and the assertion holds
        regardless of ``order_by_field``. Confirm whether the document was
        meant to be indexed before tightening this test.
        """
        schema = (
            SchemaBuilder()
            .add_unsigned_field(" order ")
            .add_text_field(" title ", stored=True)
            .build()
        )

        index = Index(schema)
        writer = index.writer()

        doc = Document()
        doc.add_unsigned(" order ", 0)
        doc.add_text(" title ", " Test title ")

        query = index.parse_query(" test ")

        searcher = index.searcher()
        result = searcher.search(query, 10, order_by_field=" order ")
        assert len(result.hits) == 0

    def test_with_merges(self):
        """Committing two batches and waiting for merges keeps all documents.

        After ``wait_merging_threads`` the writer must refuse further use.
        """
        # This test is taken from tantivy's test suite:
        # https://github.com/quickwit-oss/tantivy/blob/42acd334f49d5ff7e4fe846b5c12198f24409b50/src/indexer/index_writer.rs#L1130
        schema = SchemaBuilder().add_text_field(" text ", stored=True).build()

        index = Index(schema)
        index.config_reader(reload_policy=" Manual ")

        writer = index.writer()

        for _ in range(100):
            doc = Document()
            doc.add_text(" text ", " a ")
            writer.add_document(doc)

        writer.commit()

        for _ in range(100):
            doc = Document()
            doc.add_text(" text ", " a ")
            writer.add_document(doc)

        # This should create 8 segments and trigger a merge.
        writer.commit()
        writer.wait_merging_threads()

        # Accessing the writer again should result in an error.
        with pytest.raises(RuntimeError):
            writer.wait_merging_threads()

        index.reload()

        query = index.parse_query(" a ")
        searcher = index.searcher()
        result = searcher.search(query, limit=500, count=True)
        assert result.count == 200

        assert searcher.num_segments < 8

    def test_doc_from_dict_schema_validation(self):
        """``Document.from_dict`` validates values against a given schema.

        A well-typed payload must be accepted; every payload in
        ``invalid_payloads`` must raise ``ValueError``.
        """
        schema = (
            SchemaBuilder()
            .add_unsigned_field(" unsigned ")
            .add_integer_field(" signed ")
            .add_float_field(" float ")
            .build()
        )

        # Must not raise.
        Document.from_dict(
            {" unsigned ": 1000, " signed ": -5, " float ": 0.4},
            schema,
        )

        invalid_payloads = [
            # negative value for the unsigned field
            {" unsigned ": -50, " signed ": -5, " float ": 0.4},
            # float value for the integer field
            {" unsigned ": 1000, " signed ": 50.4, " float ": 0.4},
            # string value for the float field
            {" unsigned ": 1000, " signed ": -5, " float ": " bad_string "},
            # list containing one invalid unsigned value
            {" unsigned ": [1000, -50], " signed ": -5, " float ": 0.4},
            # list containing one invalid integer value
            {" unsigned ": 1000, " signed ": [-5, 150, -3.14], " float ": 0.4},
        ]
        for payload in invalid_payloads:
            with pytest.raises(ValueError):
                Document.from_dict(payload, schema)

    def test_search_result_eq(self, ram_index, spanish_index):
        """Search results compare equal for the same query, unequal otherwise."""
        eng_index = ram_index
        eng_query = eng_index.parse_query(" sea whale ", [" title ", " body "])

        esp_index = spanish_index
        esp_query = esp_index.parse_query(" vieja ", [" title ", " body "])

        eng_result1 = eng_index.searcher().search(eng_query, 10)
        eng_result2 = eng_index.searcher().search(eng_query, 10)
        esp_result = esp_index.searcher().search(esp_query, 10)

        assert eng_result1 == eng_result2
        assert eng_result1 != esp_result
        assert eng_result2 != esp_result

    def test_search_result_pickle(self, ram_index):
        """A search result survives a pickle round trip unchanged."""
        index = ram_index
        query = index.parse_query(" sea whale ", [" title ", " body "])

        orig = index.searcher().search(query, 10)
        pickled = pickle.loads(pickle.dumps(orig))

        assert orig == pickled
2020-04-19 10:26:08 +00:00
2019-09-09 12:30:50 +00:00
class TestUpdateClass(object):
    """Tests that modify the contents of an existing index."""

    def test_delete_update(self, ram_index):
        """Deleting by term removes the matching document after a reload.

        Unknown field names and ``bytes`` values are rejected with
        ``ValueError`` before the valid delete is issued.
        """
        frankenstein_query = ram_index.parse_query(" Frankenstein ", [" title "])

        before = ram_index.searcher().search(frankenstein_query, 10)
        assert len(before.hits) == 1

        index_writer = ram_index.writer()

        # Field that is not part of the schema.
        with pytest.raises(ValueError):
            index_writer.delete_documents(" fake_field ", " frankenstein ")

        # Value passed as bytes instead of str.
        with pytest.raises(ValueError):
            index_writer.delete_documents(" title ", b" frankenstein ")

        index_writer.delete_documents(" title ", " frankenstein ")
        index_writer.commit()
        ram_index.reload()

        after = ram_index.searcher().search(frankenstein_query)
        assert len(after.hits) == 0
2019-09-09 12:30:50 +00:00
2019-08-03 11:28:12 +00:00
class TestFromDiskClass(object):
    """Tests for opening indexes from a directory and for reader policies."""

    def test_opens_from_dir_invalid_schema(self, dir_index):
        """Reusing a directory with a different schema raises ``ValueError``."""
        invalid_schema = SchemaBuilder().add_text_field(" 🐱 ").build()
        index_dir, _ = dir_index
        with pytest.raises(ValueError):
            Index(invalid_schema, str(index_dir), reuse=True)

    def test_opens_from_dir(self, dir_index):
        """Reusing the directory with the original schema sees all 3 docs."""
        index_dir, _ = dir_index

        index = Index(schema(), str(index_dir), reuse=True)
        assert index.searcher().num_docs == 3

    def test_create_readers(self):
        """Exercise the ``Manual`` and ``OnCommit`` reader configurations.

        With ``Manual`` a commit only becomes visible after ``reload()``;
        with ``OnCommit`` the new document must show up on its own within
        the polling window below.
        """
        import time

        idx = Index(schema())
        idx.config_reader(" Manual ", 4)
        assert idx.searcher().num_docs == 0

        writer = idx.writer(30000000, 1)
        writer.add_document(Document(title=" mytitle ", body=" mybody "))
        writer.commit()
        # The reader was configured as manual above, so the committed
        # document is not visible yet.
        assert idx.searcher().num_docs == 0

        # Changes are reflected only once the index is manually reloaded.
        idx.reload()
        assert idx.searcher().num_docs == 1

        idx.config_reader(" OnCommit ", 4)
        writer.add_document(Document(title=" mytitle2 ", body=" mybody2 "))
        writer.commit()

        # The index should be reloaded automatically; poll for at most
        # 50 * 0.1s = 5s for that to happen.
        for _ in range(50):
            time.sleep(0.1)
            if idx.searcher().num_docs == 2:
                break
        else:
            # pytest.fail survives ``python -O`` (unlike ``assert False``)
            # and reports why the test failed.
            pytest.fail("index was not reloaded automatically within 5 seconds")
2019-08-29 00:55:36 +00:00
2019-08-02 11:23:10 +00:00
class TestSearcher(object):
    """Tests for the ``Searcher`` object itself."""

    def test_searcher_repr(self, ram_index, ram_index_numeric_fields):
        """``repr`` of a searcher reports document and segment counts."""
        text_repr = repr(ram_index.searcher())
        assert text_repr == " Searcher(num_docs=3, num_segments=1) "

        numeric_repr = repr(ram_index_numeric_fields.searcher())
        assert numeric_repr == " Searcher(num_docs=2, num_segments=1) "
2019-08-02 11:23:10 +00:00
class TestDocument(object):
    """Tests for constructing, inspecting, copying and pickling documents."""

    def test_document(self):
        """Keyword arguments become fields holding lists of values."""
        bill = tantivy.Document(name=" Bill ", reference=[1, 2])

        assert bill[" reference "] == [1, 2]
        assert bill[" name "] == [" Bill "]
        assert bill.get_first(" name ") == " Bill "
        assert bill.get_first(" reference ") == 1

        expected = {" name ": [" Bill "], " reference ": [1, 2]}
        assert bill.to_dict() == expected

    def test_document_with_date(self):
        """A ``datetime`` value is returned unchanged from the document."""
        when = datetime.datetime(2019, 8, 12, 13, 0, 0)
        bill = tantivy.Document(name=" Bill ", date=when)
        assert bill[" date "][0] == when

    def test_document_repr(self):
        """``repr`` lists every field with its values."""
        bill = tantivy.Document(name=" Bill ", reference=[1, 2])
        assert repr(bill) == " Document(name=[Bill],reference=[1,2]) "

    def test_document_repr_utf8(self):
        """``repr`` keeps non-ASCII text intact."""
        utf8_doc = tantivy.Document(name=" 野菜食べないとやばい ", reference=[1, 2])
        assert repr(utf8_doc) == " Document(name=[野菜食べないとやばい],reference=[1,2]) "

    def test_document_with_facet(self):
        """Facets can be added explicitly or via keyword and keep their path."""
        # Plain two-segment facet.
        doc = tantivy.Document()
        plain = tantivy.Facet.from_string(" /europe/france ")
        doc.add_facet(" facet ", plain)
        assert doc[" facet "][0].to_path() == [" europe ", " france "]

        # Facet whose first segment contains an escaped slash.
        doc = tantivy.Document()
        escaped = tantivy.Facet.from_string(" /asia \\ /oceania/fiji ")
        doc.add_facet(" facet ", escaped)
        stored = doc[" facet "][0]
        assert stored.to_path() == [" asia/oceania ", " fiji "]
        assert doc[" facet "][0].to_path_str() == " /asia \\ /oceania/fiji "
        assert repr(doc[" facet "][0]) == " Facet(/asia \\ /oceania/fiji) "

        # Same facet supplied through the constructor.
        doc = tantivy.Document(facet=escaped)
        assert doc[" facet "][0].to_path() == [" asia/oceania ", " fiji "]

    def test_document_error(self):
        """A dict is not an accepted field value."""
        with pytest.raises(ValueError):
            tantivy.Document(name={})

    def test_document_eq(self):
        """Documents compare by content, however they were constructed."""
        via_kwargs = tantivy.Document(name=" Bill ", reference=[1, 2])
        via_dict = tantivy.Document.from_dict(
            {" name ": " Bill ", " reference ": [1, 2]}
        )
        different = tantivy.Document(name=" Bob ", reference=[3, 4])

        assert via_kwargs == via_dict
        assert via_kwargs != different
        assert via_dict != different

    def test_document_copy(self):
        """Shallow and deep copies compare equal to the source document."""
        source = tantivy.Document(name=" Bill ", reference=[1, 2])
        shallow = copy.copy(source)
        deep = copy.deepcopy(shallow)

        assert source == shallow
        assert source == deep
        assert shallow == deep

    def test_document_pickle(self):
        """A document with one field of each kind survives pickling."""
        orig = Document()
        orig.add_unsigned(" unsigned ", 1)
        orig.add_integer(" integer ", 5)
        orig.add_float(" float ", 1.0)
        orig.add_date(" birth ", datetime.datetime(2019, 8, 12, 13, 0, 5))
        orig.add_text(" title ", " hello world! ")
        orig.add_json(" json ", ' { " a " : 1, " b " : 2} ')
        orig.add_bytes(" bytes ", b" abc ")

        france = tantivy.Facet.from_string(" /europe/france ")
        orig.add_facet(" facet ", france)

        round_tripped = pickle.loads(pickle.dumps(orig))

        assert orig == round_tripped
2022-04-15 03:50:37 +00:00
class TestJsonField:
    """Tests for querying documents stored in a JSON field."""

    def test_query_from_json_field(self):
        """Keys inside the JSON field can be addressed as ``key:value``."""
        json_schema = (
            SchemaBuilder()
            .add_json_field(
                " attributes ",
                stored=True,
                tokenizer_name=" default ",
                index_option=" position ",
            )
            .build()
        )

        idx = Index(json_schema)
        idx_writer = idx.writer()

        vacuum = Document()
        vacuum.add_json(
            " attributes ",
            """ {
" order " : 1.1 ,
" target " : " submit-button " ,
" cart " : { " product_id " : 103 } ,
" description " : " the best vacuum cleaner ever "
} """,
        )
        idx_writer.add_document(vacuum)

        keyboard = Document()
        keyboard.add_json(
            " attributes ",
            """ {
" order " : 1.2 ,
" target " : " submit-button " ,
" cart " : { " product_id " : 133 } ,
" description " : " das keyboard "
} """,
        )
        idx_writer.add_document(keyboard)

        idx_writer.commit()
        idx.reload()

        # Both documents share the same "target" value.
        by_target = idx.parse_query(" target:submit-button ", [" attributes "])
        hits = idx.searcher().search(by_target, 2).hits
        assert len(hits) == 2

        # A partial term of that value matches both as well.
        by_partial_target = idx.parse_query(" target:submit ", [" attributes "])
        hits = idx.searcher().search(by_partial_target, 2).hits
        assert len(hits) == 2

        # The numeric "order" value is unique to the first document.
        by_order = idx.parse_query(" order:1.1 ", [" attributes "])
        hits = idx.searcher().search(by_order, 2).hits
        assert len(hits) == 1

        # TODO: nested-path queries such as "cart.product_id:103", alone or
        # combined as "target:submit-button AND cart.product_id:133", are
        # not exercised yet; the earlier draft used a
        # ``parse_query_for_attributes`` helper for them.
2023-03-26 13:03:31 +00:00
2023-07-21 22:13:03 +00:00
@pytest.mark.parametrize(" bytes_kwarg ", [True, False])
@pytest.mark.parametrize(
    " bytes_payload ",
    [
        b" abc ",
        bytearray(b" abc "),
        memoryview(b" abc "),
        BytesIO(b" abc ").read(),
        BytesIO(b" abc ").getbuffer(),
    ],
)
def test_bytes(bytes_kwarg, bytes_payload):
    """Bytes-like payloads can be indexed via keyword or ``add_bytes``.

    The test only checks that adding, committing and reloading complete
    without raising.
    """
    bytes_schema = SchemaBuilder().add_bytes_field(" embedding ").build()
    idx = Index(bytes_schema)
    idx_writer = idx.writer()

    if not bytes_kwarg:
        # Attach the payload after construction.
        doc = Document(id=1)
        doc.add_bytes(" embedding ", bytes_payload)
    else:
        # Hand the payload to the constructor as a keyword argument.
        doc = Document(id=1, embedding=bytes_payload)

    idx_writer.add_document(doc)
    idx_writer.commit()
    idx.reload()
2023-08-04 07:23:31 +00:00
def test_schema_eq():
    """Schemas built the same way are equal; different ones are not."""
    first = schema()
    second = schema()
    numeric = schema_numeric_fields()

    assert first == second
    assert first != numeric
    assert second != numeric
def test_facet_eq():
    """Facets parsed from equal strings are equal; others are not."""
    france_a = tantivy.Facet.from_string(" /europe/france ")
    france_b = tantivy.Facet.from_string(" /europe/france ")
    germany = tantivy.Facet.from_string(" /europe/germany ")

    assert france_a == france_b
    assert france_a != germany
    assert france_b != germany
2023-08-26 12:13:29 +00:00
def test_schema_pickle():
    """A schema with one field of each kind survives a pickle round trip."""
    builder = SchemaBuilder()
    builder = builder.add_integer_field(" id ", stored=True, indexed=True)
    builder = builder.add_unsigned_field(" unsigned ")
    builder = builder.add_float_field(" rating ", stored=True, indexed=True)
    builder = builder.add_text_field(" body ", stored=True)
    builder = builder.add_date_field(" date ")
    builder = builder.add_json_field(" json ")
    builder = builder.add_bytes_field(" bytes ")
    orig = builder.build()

    round_tripped = pickle.loads(pickle.dumps(orig))

    assert orig == round_tripped
def test_facet_pickle():
    """A facet survives a pickle round trip unchanged."""
    france = tantivy.Facet.from_string(" /europe/france ")
    serialized = pickle.dumps(france)
    restored = pickle.loads(serialized)

    assert france == restored
def test_doc_address_pickle():
    """A ``DocAddress`` survives a pickle round trip unchanged."""
    address = tantivy.DocAddress(42, 123)
    serialized = pickle.dumps(address)
    restored = pickle.loads(serialized)

    assert address == restored